qemu/hw/i386/intel_iommu.c
/*
 * QEMU emulation of an Intel IOMMU (VT-d)
 *   (DMA Remapping device)
 *
 * Copyright (C) 2013 Knut Omang, Oracle <knut.omang@oracle.com>
 * Copyright (C) 2014 Le Tan, <tamlokveer@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "hw/sysbus.h"
#include "exec/address-spaces.h"
#include "intel_iommu_internal.h"
#include "hw/pci/pci.h"
#include "hw/pci/pci_bus.h"
#include "hw/i386/pc.h"
#include "hw/i386/apic-msidef.h"
#include "hw/boards.h"
#include "hw/i386/x86-iommu.h"
#include "hw/pci-host/q35.h"
#include "sysemu/kvm.h"
#include "hw/i386/apic_internal.h"
#include "kvm_i386.h"

/*#define DEBUG_INTEL_IOMMU*/
#ifdef DEBUG_INTEL_IOMMU
enum {
    DEBUG_GENERAL, DEBUG_CSR, DEBUG_INV, DEBUG_MMU, DEBUG_FLOG,
    DEBUG_CACHE, DEBUG_IR,
};
#define VTD_DBGBIT(x)   (1 << DEBUG_##x)
static int vtd_dbgflags = VTD_DBGBIT(GENERAL) | VTD_DBGBIT(CSR);

#define VTD_DPRINTF(what, fmt, ...) do { \
    if (vtd_dbgflags & VTD_DBGBIT(what)) { \
        fprintf(stderr, "(vtd)%s: " fmt "\n", __func__, \
                ## __VA_ARGS__); } \
    } while (0)
#else
#define VTD_DPRINTF(what, fmt, ...) do {} while (0)
#endif
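
/*
 * Usage sketch (only compiled in when DEBUG_INTEL_IOMMU is defined):
 *   VTD_DPRINTF(CSR, "value 0x%"PRIx32, val);
 * prints "(vtd)<function>: value 0x..." to stderr, provided the CSR bit
 * is set in vtd_dbgflags above; otherwise the macro expands to nothing.
 */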

static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
                            uint64_t wmask, uint64_t w1cmask)
{
    stq_le_p(&s->csr[addr], val);
    stq_le_p(&s->wmask[addr], wmask);
    stq_le_p(&s->w1cmask[addr], w1cmask);
}

static void vtd_define_quad_wo(IntelIOMMUState *s, hwaddr addr, uint64_t mask)
{
    stq_le_p(&s->womask[addr], mask);
}

static void vtd_define_long(IntelIOMMUState *s, hwaddr addr, uint32_t val,
                            uint32_t wmask, uint32_t w1cmask)
{
    stl_le_p(&s->csr[addr], val);
    stl_le_p(&s->wmask[addr], wmask);
    stl_le_p(&s->w1cmask[addr], w1cmask);
}

static void vtd_define_long_wo(IntelIOMMUState *s, hwaddr addr, uint32_t mask)
{
    stl_le_p(&s->womask[addr], mask);
}

/* "External" get/set operations */
static void vtd_set_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val)
{
    uint64_t oldval = ldq_le_p(&s->csr[addr]);
    uint64_t wmask = ldq_le_p(&s->wmask[addr]);
    uint64_t w1cmask = ldq_le_p(&s->w1cmask[addr]);
    stq_le_p(&s->csr[addr],
             ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val));
}
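
/*
 * Worked example of the update rule above (illustrative values): with
 * oldval = 0xf0, wmask = 0x0f (only the low nibble is writable) and
 * w1cmask = 0x80 (bit 7 is write-1-to-clear), a guest write of val = 0x85
 * stores ((0xf0 & ~0x0f) | (0x85 & 0x0f)) & ~(0x80 & 0x85) = 0x75:
 * writable bits take the new value, read-only bits keep the old one, and
 * the W1C bit is cleared because the guest wrote 1 to it.
 */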

static void vtd_set_long(IntelIOMMUState *s, hwaddr addr, uint32_t val)
{
    uint32_t oldval = ldl_le_p(&s->csr[addr]);
    uint32_t wmask = ldl_le_p(&s->wmask[addr]);
    uint32_t w1cmask = ldl_le_p(&s->w1cmask[addr]);
    stl_le_p(&s->csr[addr],
             ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val));
}

static uint64_t vtd_get_quad(IntelIOMMUState *s, hwaddr addr)
{
    uint64_t val = ldq_le_p(&s->csr[addr]);
    uint64_t womask = ldq_le_p(&s->womask[addr]);
    return val & ~womask;
}

static uint32_t vtd_get_long(IntelIOMMUState *s, hwaddr addr)
{
    uint32_t val = ldl_le_p(&s->csr[addr]);
    uint32_t womask = ldl_le_p(&s->womask[addr]);
    return val & ~womask;
}

/* "Internal" get/set operations */
static uint64_t vtd_get_quad_raw(IntelIOMMUState *s, hwaddr addr)
{
    return ldq_le_p(&s->csr[addr]);
}

static uint32_t vtd_get_long_raw(IntelIOMMUState *s, hwaddr addr)
{
    return ldl_le_p(&s->csr[addr]);
}

static void vtd_set_quad_raw(IntelIOMMUState *s, hwaddr addr, uint64_t val)
{
    stq_le_p(&s->csr[addr], val);
}

static uint32_t vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr,
                                        uint32_t clear, uint32_t mask)
{
    uint32_t new_val = (ldl_le_p(&s->csr[addr]) & ~clear) | mask;
    stl_le_p(&s->csr[addr], new_val);
    return new_val;
}

static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
                                        uint64_t clear, uint64_t mask)
{
    uint64_t new_val = (ldq_le_p(&s->csr[addr]) & ~clear) | mask;
    stq_le_p(&s->csr[addr], new_val);
    return new_val;
}

/* GHashTable functions */
static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2)
{
    return *((const uint64_t *)v1) == *((const uint64_t *)v2);
}

static guint vtd_uint64_hash(gconstpointer v)
{
    return (guint)*(const uint64_t *)v;
}

static gboolean vtd_hash_remove_by_domain(gpointer key, gpointer value,
                                          gpointer user_data)
{
    VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value;
    uint16_t domain_id = *(uint16_t *)user_data;
    return entry->domain_id == domain_id;
}

/* The shift of an addr for a certain level of paging structure */
static inline uint32_t vtd_slpt_level_shift(uint32_t level)
{
    return VTD_PAGE_SHIFT_4K + (level - 1) * VTD_SL_LEVEL_BITS;
}
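
/*
 * With VTD_PAGE_SHIFT_4K = 12 and VTD_SL_LEVEL_BITS = 9, this gives a
 * shift of 12 at level 1 (4KiB pages), 21 at level 2 (2MiB pages) and
 * 30 at level 3 (1GiB pages).
 */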

static inline uint64_t vtd_slpt_level_page_mask(uint32_t level)
{
    return ~((1ULL << vtd_slpt_level_shift(level)) - 1);
}

static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value,
                                        gpointer user_data)
{
    VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value;
    VTDIOTLBPageInvInfo *info = (VTDIOTLBPageInvInfo *)user_data;
    uint64_t gfn = (info->addr >> VTD_PAGE_SHIFT_4K) & info->mask;
    uint64_t gfn_tlb = (info->addr & entry->mask) >> VTD_PAGE_SHIFT_4K;
    return (entry->domain_id == info->domain_id) &&
            (((entry->gfn & info->mask) == gfn) ||
             (entry->gfn == gfn_tlb));
}
/* Reset the context-cache generation of all VTDAddressSpaces to zero and
 * set the generation of IntelIOMMUState to 1.
 */
static void vtd_reset_context_cache(IntelIOMMUState *s)
{
    VTDAddressSpace *vtd_as;
    VTDBus *vtd_bus;
    GHashTableIter bus_it;
    uint32_t devfn_it;

    g_hash_table_iter_init(&bus_it, s->vtd_as_by_busptr);

    VTD_DPRINTF(CACHE, "global context_cache_gen=1");
    while (g_hash_table_iter_next(&bus_it, NULL, (void **)&vtd_bus)) {
        for (devfn_it = 0; devfn_it < X86_IOMMU_PCI_DEVFN_MAX; ++devfn_it) {
            vtd_as = vtd_bus->dev_as[devfn_it];
            if (!vtd_as) {
                continue;
            }
            vtd_as->context_cache_entry.context_cache_gen = 0;
        }
    }
    s->context_cache_gen = 1;
}

static void vtd_reset_iotlb(IntelIOMMUState *s)
{
    assert(s->iotlb);
    g_hash_table_remove_all(s->iotlb);
}

static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id,
                                  uint32_t level)
{
    return gfn | ((uint64_t)(source_id) << VTD_IOTLB_SID_SHIFT) |
           ((uint64_t)(level) << VTD_IOTLB_LVL_SHIFT);
}

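/*
 * The key built by vtd_get_iotlb_key() above packs the guest frame
 * number, the requester's source-id (bus:devfn) and the paging level
 * into a single 64-bit value, so one hash table can cache translations
 * of every page size for all devices at once.
 */
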
static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level)
{
    return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K;
}

static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id,
                                       hwaddr addr)
{
    VTDIOTLBEntry *entry;
    uint64_t key;
    int level;

    for (level = VTD_SL_PT_LEVEL; level < VTD_SL_PML4_LEVEL; level++) {
        key = vtd_get_iotlb_key(vtd_get_iotlb_gfn(addr, level),
                                source_id, level);
        entry = g_hash_table_lookup(s->iotlb, &key);
        if (entry) {
            goto out;
        }
    }

out:
    return entry;
}

static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id,
                             uint16_t domain_id, hwaddr addr, uint64_t slpte,
                             bool read_flags, bool write_flags,
                             uint32_t level)
{
    VTDIOTLBEntry *entry = g_malloc(sizeof(*entry));
    uint64_t *key = g_malloc(sizeof(*key));
    uint64_t gfn = vtd_get_iotlb_gfn(addr, level);

    VTD_DPRINTF(CACHE, "update iotlb sid 0x%"PRIx16 " gpa 0x%"PRIx64
                " slpte 0x%"PRIx64 " did 0x%"PRIx16, source_id, addr, slpte,
                domain_id);
    if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) {
        VTD_DPRINTF(CACHE, "iotlb exceeds size limit, forced to reset");
        vtd_reset_iotlb(s);
    }

    entry->gfn = gfn;
    entry->domain_id = domain_id;
    entry->slpte = slpte;
    entry->read_flags = read_flags;
    entry->write_flags = write_flags;
    entry->mask = vtd_slpt_level_page_mask(level);
    *key = vtd_get_iotlb_key(gfn, source_id, level);
    g_hash_table_replace(s->iotlb, key, entry);
}

/* Given the register addresses of the MSI address and data, generate an
 * interrupt via MSI.
 */
static void vtd_generate_interrupt(IntelIOMMUState *s, hwaddr mesg_addr_reg,
                                   hwaddr mesg_data_reg)
{
    MSIMessage msi;

    assert(mesg_data_reg < DMAR_REG_SIZE);
    assert(mesg_addr_reg < DMAR_REG_SIZE);

    msi.address = vtd_get_long_raw(s, mesg_addr_reg);
    msi.data = vtd_get_long_raw(s, mesg_data_reg);

    VTD_DPRINTF(FLOG, "msi: addr 0x%"PRIx64 " data 0x%"PRIx32,
                msi.address, msi.data);
    apic_get_class()->send_msi(&msi);
}

/* Generate a fault event to software via MSI if conditions are met.
 * Note that the value of FSTS_REG passed in should be the one before any
 * update.
 */
static void vtd_generate_fault_event(IntelIOMMUState *s, uint32_t pre_fsts)
{
    if (pre_fsts & VTD_FSTS_PPF || pre_fsts & VTD_FSTS_PFO ||
        pre_fsts & VTD_FSTS_IQE) {
        VTD_DPRINTF(FLOG, "there are previous interrupt conditions "
                    "to be serviced by software, fault event is not generated "
                    "(FSTS_REG 0x%"PRIx32 ")", pre_fsts);
        return;
    }
    vtd_set_clear_mask_long(s, DMAR_FECTL_REG, 0, VTD_FECTL_IP);
    if (vtd_get_long_raw(s, DMAR_FECTL_REG) & VTD_FECTL_IM) {
        VTD_DPRINTF(FLOG, "Interrupt Mask set, fault event is not generated");
    } else {
        vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG);
        vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0);
    }
}

/* Check if the Fault (F) field of the Fault Recording Register referenced by
 * @index is Set.
 */
static bool vtd_is_frcd_set(IntelIOMMUState *s, uint16_t index)
{
    /* Each reg is 128-bit */
    hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);
    addr += 8; /* Access the high 64-bit half */

    assert(index < DMAR_FRCD_REG_NR);

    return vtd_get_quad_raw(s, addr) & VTD_FRCD_F;
}
/* Update the PPF field of Fault Status Register.
 * Should be called whenever the F field of any fault recording register
 * changes.
 */
static void vtd_update_fsts_ppf(IntelIOMMUState *s)
{
    uint32_t i;
    uint32_t ppf_mask = 0;

    for (i = 0; i < DMAR_FRCD_REG_NR; i++) {
        if (vtd_is_frcd_set(s, i)) {
            ppf_mask = VTD_FSTS_PPF;
            break;
        }
    }
    vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_PPF, ppf_mask);
    VTD_DPRINTF(FLOG, "set PPF of FSTS_REG to %d", ppf_mask ? 1 : 0);
}

static void vtd_set_frcd_and_update_ppf(IntelIOMMUState *s, uint16_t index)
{
    /* Each reg is 128-bit */
    hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);
    addr += 8; /* Access the high 64-bit half */

    assert(index < DMAR_FRCD_REG_NR);

    vtd_set_clear_mask_quad(s, addr, 0, VTD_FRCD_F);
    vtd_update_fsts_ppf(s);
}

/* Must not update the F field now; that should be done later */
static void vtd_record_frcd(IntelIOMMUState *s, uint16_t index,
                            uint16_t source_id, hwaddr addr,
                            VTDFaultReason fault, bool is_write)
{
    uint64_t hi = 0, lo;
    hwaddr frcd_reg_addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);

    assert(index < DMAR_FRCD_REG_NR);

    lo = VTD_FRCD_FI(addr);
    hi = VTD_FRCD_SID(source_id) | VTD_FRCD_FR(fault);
    if (!is_write) {
        hi |= VTD_FRCD_T;
    }
    vtd_set_quad_raw(s, frcd_reg_addr, lo);
    vtd_set_quad_raw(s, frcd_reg_addr + 8, hi);
    VTD_DPRINTF(FLOG, "record to FRCD_REG #%"PRIu16 ": hi 0x%"PRIx64
                ", lo 0x%"PRIx64, index, hi, lo);
}

/* Try to collapse multiple pending faults from the same requester */
static bool vtd_try_collapse_fault(IntelIOMMUState *s, uint16_t source_id)
{
    uint32_t i;
    uint64_t frcd_reg;
    hwaddr addr = DMAR_FRCD_REG_OFFSET + 8; /* The high 64-bit half */

    for (i = 0; i < DMAR_FRCD_REG_NR; i++) {
        frcd_reg = vtd_get_quad_raw(s, addr);
        VTD_DPRINTF(FLOG, "frcd_reg #%d 0x%"PRIx64, i, frcd_reg);
        if ((frcd_reg & VTD_FRCD_F) &&
            ((frcd_reg & VTD_FRCD_SID_MASK) == source_id)) {
            return true;
        }
        addr += 16; /* 128-bit for each */
    }
    return false;
}

/* Log and report a DMAR (address translation) fault to software */
static void vtd_report_dmar_fault(IntelIOMMUState *s, uint16_t source_id,
                                  hwaddr addr, VTDFaultReason fault,
                                  bool is_write)
{
    uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG);

    assert(fault < VTD_FR_MAX);

    if (fault == VTD_FR_RESERVED_ERR) {
        /* This is not a normal fault reason case. Drop it. */
        return;
    }
    VTD_DPRINTF(FLOG, "sid 0x%"PRIx16 ", fault %d, addr 0x%"PRIx64
                ", is_write %d", source_id, fault, addr, is_write);
    if (fsts_reg & VTD_FSTS_PFO) {
        VTD_DPRINTF(FLOG, "new fault is not recorded due to "
                    "Primary Fault Overflow");
        return;
    }
    if (vtd_try_collapse_fault(s, source_id)) {
        VTD_DPRINTF(FLOG, "new fault is not recorded due to "
                    "compression of faults");
        return;
    }
    if (vtd_is_frcd_set(s, s->next_frcd_reg)) {
        VTD_DPRINTF(FLOG, "Primary Fault Overflow and "
                    "new fault is not recorded, set PFO field");
        vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_PFO);
        return;
    }

    vtd_record_frcd(s, s->next_frcd_reg, source_id, addr, fault, is_write);

    if (fsts_reg & VTD_FSTS_PPF) {
        VTD_DPRINTF(FLOG, "there are pending faults already, "
                    "fault event is not generated");
        vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg);
        s->next_frcd_reg++;
        if (s->next_frcd_reg == DMAR_FRCD_REG_NR) {
            s->next_frcd_reg = 0;
        }
    } else {
        vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_FRI_MASK,
                                VTD_FSTS_FRI(s->next_frcd_reg));
        vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg); /* Will set PPF */
        s->next_frcd_reg++;
        if (s->next_frcd_reg == DMAR_FRCD_REG_NR) {
            s->next_frcd_reg = 0;
        }
        /* This case actually causes PPF to be Set, so generate the fault
         * event (interrupt).
         */
        vtd_generate_fault_event(s, fsts_reg);
    }
}

/* Handle Invalidation Queue Errors of the queued invalidation interface */
static void vtd_handle_inv_queue_error(IntelIOMMUState *s)
{
    uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG);

    vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_IQE);
    vtd_generate_fault_event(s, fsts_reg);
}

/* Set the IWC field and try to generate an invalidation completion interrupt */
static void vtd_generate_completion_event(IntelIOMMUState *s)
{
    VTD_DPRINTF(INV, "completes an invalidation wait command with "
                "Interrupt Flag");
    if (vtd_get_long_raw(s, DMAR_ICS_REG) & VTD_ICS_IWC) {
        VTD_DPRINTF(INV, "there is a previous interrupt condition to be "
                    "serviced by software, "
                    "new invalidation event is not generated");
        return;
    }
    vtd_set_clear_mask_long(s, DMAR_ICS_REG, 0, VTD_ICS_IWC);
    vtd_set_clear_mask_long(s, DMAR_IECTL_REG, 0, VTD_IECTL_IP);
    if (vtd_get_long_raw(s, DMAR_IECTL_REG) & VTD_IECTL_IM) {
        VTD_DPRINTF(INV, "IM field in IECTL_REG is set, new invalidation "
                    "event is not generated");
        return;
    } else {
        /* Generate the interrupt event */
        vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG);
        vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0);
    }
}

static inline bool vtd_root_entry_present(VTDRootEntry *root)
{
    return root->val & VTD_ROOT_ENTRY_P;
}

static int vtd_get_root_entry(IntelIOMMUState *s, uint8_t index,
                              VTDRootEntry *re)
{
    dma_addr_t addr;

    addr = s->root + index * sizeof(*re);
    if (dma_memory_read(&address_space_memory, addr, re, sizeof(*re))) {
        VTD_DPRINTF(GENERAL, "error: fail to access root-entry at 0x%"PRIx64
                    " + %"PRIu8, s->root, index);
        re->val = 0;
        return -VTD_FR_ROOT_TABLE_INV;
    }
    re->val = le64_to_cpu(re->val);
    return 0;
}

static inline bool vtd_context_entry_present(VTDContextEntry *context)
{
    return context->lo & VTD_CONTEXT_ENTRY_P;
}

static int vtd_get_context_entry_from_root(VTDRootEntry *root, uint8_t index,
                                           VTDContextEntry *ce)
{
    dma_addr_t addr;

    if (!vtd_root_entry_present(root)) {
        VTD_DPRINTF(GENERAL, "error: root-entry is not present");
        return -VTD_FR_ROOT_ENTRY_P;
    }
    addr = (root->val & VTD_ROOT_ENTRY_CTP) + index * sizeof(*ce);
    if (dma_memory_read(&address_space_memory, addr, ce, sizeof(*ce))) {
        VTD_DPRINTF(GENERAL, "error: fail to access context-entry at 0x%"PRIx64
                    " + %"PRIu8,
                    (uint64_t)(root->val & VTD_ROOT_ENTRY_CTP), index);
        return -VTD_FR_CONTEXT_TABLE_INV;
    }
    ce->lo = le64_to_cpu(ce->lo);
    ce->hi = le64_to_cpu(ce->hi);
    return 0;
}

static inline dma_addr_t vtd_get_slpt_base_from_context(VTDContextEntry *ce)
{
    return ce->lo & VTD_CONTEXT_ENTRY_SLPTPTR;
}

static inline uint64_t vtd_get_slpte_addr(uint64_t slpte)
{
    return slpte & VTD_SL_PT_BASE_ADDR_MASK;
}

/* Whether the pte indicates the address of the page frame */
static inline bool vtd_is_last_slpte(uint64_t slpte, uint32_t level)
{
    return level == VTD_SL_PT_LEVEL || (slpte & VTD_SL_PT_PAGE_SIZE_MASK);
}

/* Get the content of an slpte located at @base_addr[@index] */
static uint64_t vtd_get_slpte(dma_addr_t base_addr, uint32_t index)
{
    uint64_t slpte;

    assert(index < VTD_SL_PT_ENTRY_NR);

    if (dma_memory_read(&address_space_memory,
                        base_addr + index * sizeof(slpte), &slpte,
                        sizeof(slpte))) {
        slpte = (uint64_t)-1;
        return slpte;
    }
    slpte = le64_to_cpu(slpte);
    return slpte;
}

/* Given a gpa and the level of paging structure, return the offset of the
 * current level.
 */
static inline uint32_t vtd_gpa_level_offset(uint64_t gpa, uint32_t level)
{
    return (gpa >> vtd_slpt_level_shift(level)) &
            ((1ULL << VTD_SL_LEVEL_BITS) - 1);
}

/* Check Capability Register to see if the @level of page-table is supported */
static inline bool vtd_is_level_supported(IntelIOMMUState *s, uint32_t level)
{
    return VTD_CAP_SAGAW_MASK & s->cap &
           (1ULL << (level - 2 + VTD_CAP_SAGAW_SHIFT));
}

/* Get the page-table level that hardware should use for the second-level
 * page-table walk from the Address Width field of context-entry.
 */
static inline uint32_t vtd_get_level_from_context_entry(VTDContextEntry *ce)
{
    return 2 + (ce->hi & VTD_CONTEXT_ENTRY_AW);
}

static inline uint32_t vtd_get_agaw_from_context_entry(VTDContextEntry *ce)
{
    return 30 + (ce->hi & VTD_CONTEXT_ENTRY_AW) * 9;
}
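
/*
 * Example: an Address Width (AW) field of 1 selects a 3-level table and
 * a 39-bit AGAW (30 + 9); AW = 2 selects a 4-level table and a 48-bit
 * AGAW (30 + 18).
 */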

static const uint64_t vtd_paging_entry_rsvd_field[] = {
    [0] = ~0ULL,
    /* For non-large pages */
    [1] = 0x800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
    [2] = 0x800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
    [3] = 0x800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
    [4] = 0x880ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
    /* For large pages */
    [5] = 0x800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
    [6] = 0x1ff800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
    [7] = 0x3ffff800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
    [8] = 0x880ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
};
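
/*
 * Indices 1..4 hold the reserved-bit masks for ordinary 4KiB-page
 * entries at each level; indices 5..8 (level + 4, as used by
 * vtd_slpte_nonzero_rsvd() below) hold the masks for entries that map
 * large pages at the same levels.
 */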

static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level)
{
    if (slpte & VTD_SL_PT_PAGE_SIZE_MASK) {
        /* Maybe large page */
        return slpte & vtd_paging_entry_rsvd_field[level + 4];
    } else {
        return slpte & vtd_paging_entry_rsvd_field[level];
    }
}

/* Given the @gpa, get relevant @slptep. @slpte_level will be the last level
 * of the translation, which can be used to determine the size of a large
 * page.
 */
static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, bool is_write,
                            uint64_t *slptep, uint32_t *slpte_level,
                            bool *reads, bool *writes)
{
    dma_addr_t addr = vtd_get_slpt_base_from_context(ce);
    uint32_t level = vtd_get_level_from_context_entry(ce);
    uint32_t offset;
    uint64_t slpte;
    uint32_t ce_agaw = vtd_get_agaw_from_context_entry(ce);
    uint64_t access_right_check;

    /* Check if @gpa is above 2^X-1, where X is the minimum of MGAW in CAP_REG
     * and AW in context-entry.
     */
    if (gpa & ~((1ULL << MIN(ce_agaw, VTD_MGAW)) - 1)) {
        VTD_DPRINTF(GENERAL, "error: gpa 0x%"PRIx64 " exceeds limits", gpa);
        return -VTD_FR_ADDR_BEYOND_MGAW;
    }

    /* FIXME: what is the Atomics request here? */
    access_right_check = is_write ? VTD_SL_W : VTD_SL_R;

    while (true) {
        offset = vtd_gpa_level_offset(gpa, level);
        slpte = vtd_get_slpte(addr, offset);

        if (slpte == (uint64_t)-1) {
            VTD_DPRINTF(GENERAL, "error: fail to access second-level paging "
                        "entry at level %"PRIu32 " for gpa 0x%"PRIx64,
                        level, gpa);
            if (level == vtd_get_level_from_context_entry(ce)) {
                /* Invalid programming of context-entry */
                return -VTD_FR_CONTEXT_ENTRY_INV;
            } else {
                return -VTD_FR_PAGING_ENTRY_INV;
            }
        }
        *reads = (*reads) && (slpte & VTD_SL_R);
        *writes = (*writes) && (slpte & VTD_SL_W);
        if (!(slpte & access_right_check)) {
            VTD_DPRINTF(GENERAL, "error: lack of %s permission for "
                        "gpa 0x%"PRIx64 " slpte 0x%"PRIx64,
                        (is_write ? "write" : "read"), gpa, slpte);
            return is_write ? -VTD_FR_WRITE : -VTD_FR_READ;
        }
        if (vtd_slpte_nonzero_rsvd(slpte, level)) {
            VTD_DPRINTF(GENERAL, "error: non-zero reserved field in second "
                        "level paging entry level %"PRIu32 " slpte 0x%"PRIx64,
                        level, slpte);
            return -VTD_FR_PAGING_ENTRY_RSVD;
        }

        if (vtd_is_last_slpte(slpte, level)) {
            *slptep = slpte;
            *slpte_level = level;
            return 0;
        }
        addr = vtd_get_slpte_addr(slpte);
        level--;
    }
}

/* Map a device to its corresponding domain (context-entry) */
static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
                                    uint8_t devfn, VTDContextEntry *ce)
{
    VTDRootEntry re;
    int ret_fr;

    ret_fr = vtd_get_root_entry(s, bus_num, &re);
    if (ret_fr) {
        return ret_fr;
    }

    if (!vtd_root_entry_present(&re)) {
        VTD_DPRINTF(GENERAL, "error: root-entry #%"PRIu8 " is not present",
                    bus_num);
        return -VTD_FR_ROOT_ENTRY_P;
    } else if (re.rsvd || (re.val & VTD_ROOT_ENTRY_RSVD)) {
        VTD_DPRINTF(GENERAL, "error: non-zero reserved field in root-entry "
                    "hi 0x%"PRIx64 " lo 0x%"PRIx64, re.rsvd, re.val);
        return -VTD_FR_ROOT_ENTRY_RSVD;
    }

    ret_fr = vtd_get_context_entry_from_root(&re, devfn, ce);
    if (ret_fr) {
        return ret_fr;
    }

    if (!vtd_context_entry_present(ce)) {
        VTD_DPRINTF(GENERAL,
                    "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
                    "is not present", devfn, bus_num);
        return -VTD_FR_CONTEXT_ENTRY_P;
    } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
               (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
        VTD_DPRINTF(GENERAL,
                    "error: non-zero reserved field in context-entry "
                    "hi 0x%"PRIx64 " lo 0x%"PRIx64, ce->hi, ce->lo);
        return -VTD_FR_CONTEXT_ENTRY_RSVD;
    }
    /* Check if the programming of context-entry is valid */
    if (!vtd_is_level_supported(s, vtd_get_level_from_context_entry(ce))) {
        VTD_DPRINTF(GENERAL, "error: unsupported Address Width value in "
                    "context-entry hi 0x%"PRIx64 " lo 0x%"PRIx64,
                    ce->hi, ce->lo);
        return -VTD_FR_CONTEXT_ENTRY_INV;
    } else if (ce->lo & VTD_CONTEXT_ENTRY_TT) {
        VTD_DPRINTF(GENERAL, "error: unsupported Translation Type in "
                    "context-entry hi 0x%"PRIx64 " lo 0x%"PRIx64,
                    ce->hi, ce->lo);
        return -VTD_FR_CONTEXT_ENTRY_INV;
    }
    return 0;
}

static inline uint16_t vtd_make_source_id(uint8_t bus_num, uint8_t devfn)
{
    return ((bus_num & 0xffUL) << 8) | (devfn & 0xffUL);
}
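
/*
 * The source-id matches the PCI requester ID layout: bus number in bits
 * 15:8, device in bits 7:3 and function in bits 2:0.
 */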

static const bool vtd_qualified_faults[] = {
    [VTD_FR_RESERVED] = false,
    [VTD_FR_ROOT_ENTRY_P] = false,
    [VTD_FR_CONTEXT_ENTRY_P] = true,
    [VTD_FR_CONTEXT_ENTRY_INV] = true,
    [VTD_FR_ADDR_BEYOND_MGAW] = true,
    [VTD_FR_WRITE] = true,
    [VTD_FR_READ] = true,
    [VTD_FR_PAGING_ENTRY_INV] = true,
    [VTD_FR_ROOT_TABLE_INV] = false,
    [VTD_FR_CONTEXT_TABLE_INV] = false,
    [VTD_FR_ROOT_ENTRY_RSVD] = false,
    [VTD_FR_PAGING_ENTRY_RSVD] = true,
    [VTD_FR_CONTEXT_ENTRY_TT] = true,
    [VTD_FR_RESERVED_ERR] = false,
    [VTD_FR_MAX] = false,
};

/* To see if a fault condition is "qualified", which is reported to software
 * only if the FPD field in the context-entry used to process the faulting
 * request is 0.
 */
static inline bool vtd_is_qualified_fault(VTDFaultReason fault)
{
    return vtd_qualified_faults[fault];
}

static inline bool vtd_is_interrupt_addr(hwaddr addr)
{
    return VTD_INTERRUPT_ADDR_FIRST <= addr && addr <= VTD_INTERRUPT_ADDR_LAST;
}

/* Map dev to context-entry then do a paging-structures walk to do an IOMMU
 * translation.
 *
 * Called from RCU critical section.
 *
 * @bus_num: The bus number
 * @devfn: The devfn, which is the combined device and function number
 * @is_write: The access is a write operation
 * @entry: IOMMUTLBEntry that contains the addr to be translated and the result
 */
static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
                                   uint8_t devfn, hwaddr addr, bool is_write,
                                   IOMMUTLBEntry *entry)
{
    IntelIOMMUState *s = vtd_as->iommu_state;
    VTDContextEntry ce;
    uint8_t bus_num = pci_bus_num(bus);
    VTDContextCacheEntry *cc_entry = &vtd_as->context_cache_entry;
    uint64_t slpte, page_mask;
    uint32_t level;
    uint16_t source_id = vtd_make_source_id(bus_num, devfn);
    int ret_fr;
    bool is_fpd_set = false;
    bool reads = true;
    bool writes = true;
    VTDIOTLBEntry *iotlb_entry;

    /* Check if the request is in interrupt address range */
    if (vtd_is_interrupt_addr(addr)) {
        if (is_write) {
            /* FIXME: since we don't know the length of the access here, we
             * treat Non-DWORD length write requests without PASID as
             * interrupt requests, too. Without interrupt remapping support,
             * we just use 1:1 mapping.
             */
            VTD_DPRINTF(MMU, "write request to interrupt address "
                        "gpa 0x%"PRIx64, addr);
            entry->iova = addr & VTD_PAGE_MASK_4K;
            entry->translated_addr = addr & VTD_PAGE_MASK_4K;
            entry->addr_mask = ~VTD_PAGE_MASK_4K;
            entry->perm = IOMMU_WO;
            return;
        } else {
            VTD_DPRINTF(GENERAL, "error: read request from interrupt address "
                        "gpa 0x%"PRIx64, addr);
            vtd_report_dmar_fault(s, source_id, addr, VTD_FR_READ, is_write);
            return;
        }
    }
    /* Try to fetch slpte from IOTLB */
    iotlb_entry = vtd_lookup_iotlb(s, source_id, addr);
    if (iotlb_entry) {
        VTD_DPRINTF(CACHE, "hit iotlb sid 0x%"PRIx16 " gpa 0x%"PRIx64
                    " slpte 0x%"PRIx64 " did 0x%"PRIx16, source_id, addr,
                    iotlb_entry->slpte, iotlb_entry->domain_id);
        slpte = iotlb_entry->slpte;
        reads = iotlb_entry->read_flags;
        writes = iotlb_entry->write_flags;
        page_mask = iotlb_entry->mask;
        goto out;
    }
    /* Try to fetch context-entry from cache first */
    if (cc_entry->context_cache_gen == s->context_cache_gen) {
        VTD_DPRINTF(CACHE, "hit context-cache bus %d devfn %d "
                    "(hi %"PRIx64 " lo %"PRIx64 " gen %"PRIu32 ")",
                    bus_num, devfn, cc_entry->context_entry.hi,
                    cc_entry->context_entry.lo, cc_entry->context_cache_gen);
        ce = cc_entry->context_entry;
        is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
    } else {
        ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
        is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
        if (ret_fr) {
            ret_fr = -ret_fr;
            if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
                VTD_DPRINTF(FLOG, "fault processing is disabled for DMA "
                            "requests through this context-entry "
                            "(with FPD Set)");
            } else {
                vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
            }
            return;
        }
        /* Update context-cache */
        VTD_DPRINTF(CACHE, "update context-cache bus %d devfn %d "
                    "(hi %"PRIx64 " lo %"PRIx64 " gen %"PRIu32 "->%"PRIu32 ")",
                    bus_num, devfn, ce.hi, ce.lo,
                    cc_entry->context_cache_gen, s->context_cache_gen);
        cc_entry->context_entry = ce;
        cc_entry->context_cache_gen = s->context_cache_gen;
    }

    ret_fr = vtd_gpa_to_slpte(&ce, addr, is_write, &slpte, &level,
                              &reads, &writes);
    if (ret_fr) {
        ret_fr = -ret_fr;
        if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
            VTD_DPRINTF(FLOG, "fault processing is disabled for DMA requests "
                        "through this context-entry (with FPD Set)");
        } else {
            vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
        }
        return;
    }

    page_mask = vtd_slpt_level_page_mask(level);
    vtd_update_iotlb(s, source_id, VTD_CONTEXT_ENTRY_DID(ce.hi), addr, slpte,
                     reads, writes, level);
out:
    entry->iova = addr & page_mask;
    entry->translated_addr = vtd_get_slpte_addr(slpte) & page_mask;
    entry->addr_mask = ~page_mask;
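    /* Compose IOMMUAccessFlags: IOMMU_NONE = 0, IOMMU_RO = 1, IOMMU_WO = 2,
     * IOMMU_RW = 3, so read permission is bit 0 and write permission bit 1.
     */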
    entry->perm = (writes ? 2 : 0) + (reads ? 1 : 0);
}

static void vtd_root_table_setup(IntelIOMMUState *s)
{
    s->root = vtd_get_quad_raw(s, DMAR_RTADDR_REG);
    s->root_extended = s->root & VTD_RTADDR_RTT;
    s->root &= VTD_RTADDR_ADDR_MASK;

    VTD_DPRINTF(CSR, "root_table addr 0x%"PRIx64 " %s", s->root,
                (s->root_extended ? "(extended)" : ""));
}

static void vtd_iec_notify_all(IntelIOMMUState *s, bool global,
                               uint32_t index, uint32_t mask)
{
    x86_iommu_iec_notify_all(X86_IOMMU_DEVICE(s), global, index, mask);
}

static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s)
{
    uint64_t value = 0;
    value = vtd_get_quad_raw(s, DMAR_IRTA_REG);
    s->intr_size = 1UL << ((value & VTD_IRTA_SIZE_MASK) + 1);
    s->intr_root = value & VTD_IRTA_ADDR_MASK;
    s->intr_eime = value & VTD_IRTA_EIME;

    /* Notify global invalidation */
    vtd_iec_notify_all(s, true, 0, 0);

    VTD_DPRINTF(CSR, "int remap table addr 0x%"PRIx64 " size %"PRIu32,
                s->intr_root, s->intr_size);
}

static void vtd_context_global_invalidate(IntelIOMMUState *s)
{
    s->context_cache_gen++;
    if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) {
        vtd_reset_context_cache(s);
    }
}
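
/*
 * Bumping the generation number lazily invalidates every cached
 * context-entry: a cached copy is only considered valid while its
 * context_cache_gen equals the global one (see vtd_do_iommu_translate()).
 * On wrap-around, vtd_reset_context_cache() walks all address spaces and
 * zeroes their generations so stale entries cannot alias a reused value.
 */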

/* Find the VTDBus currently associated with a given bus number */
static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num)
{
    VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num];
    if (!vtd_bus) {
        /* Iterate over the registered buses to find the one which currently
         * holds this bus number, and update the bus_num lookup table.
         */
        GHashTableIter iter;

        g_hash_table_iter_init(&iter, s->vtd_as_by_busptr);
        while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) {
            if (pci_bus_num(vtd_bus->bus) == bus_num) {
                s->vtd_as_by_bus_num[bus_num] = vtd_bus;
                return vtd_bus;
            }
        }
    }
    return vtd_bus;
}

/* Do a context-cache device-selective invalidation.
 * @func_mask: FM field after shifting
 */
static void vtd_context_device_invalidate(IntelIOMMUState *s,
                                          uint16_t source_id,
                                          uint16_t func_mask)
{
    uint16_t mask;
    VTDBus *vtd_bus;
    VTDAddressSpace *vtd_as;
    uint16_t devfn;
    uint16_t devfn_it;

    switch (func_mask & 3) {
    case 0:
        mask = 0;   /* No bits in the SID field masked */
        break;
    case 1:
        mask = 4;   /* Mask bit 2 in the SID field */
        break;
    case 2:
        mask = 6;   /* Mask bits 2:1 in the SID field */
        break;
    case 3:
        mask = 7;   /* Mask bits 2:0 in the SID field */
        break;
    }
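    /*
     * Illustrative example: func_mask = 3 gives mask = ~7 below, so a
     * source-id with devfn 0x10 matches devfns 0x10..0x17, i.e. all eight
     * functions of device 2 on that bus are invalidated together.
     */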
    mask = ~mask;
    VTD_DPRINTF(INV, "device-selective invalidation source 0x%"PRIx16
                    " mask %"PRIu16, source_id, mask);
    vtd_bus = vtd_find_as_from_bus_num(s, VTD_SID_TO_BUS(source_id));
    if (vtd_bus) {
        devfn = VTD_SID_TO_DEVFN(source_id);
        for (devfn_it = 0; devfn_it < X86_IOMMU_PCI_DEVFN_MAX; ++devfn_it) {
            vtd_as = vtd_bus->dev_as[devfn_it];
            if (vtd_as && ((devfn_it & mask) == (devfn & mask))) {
                VTD_DPRINTF(INV, "invalidate context-cache of devfn 0x%"PRIx16,
                            devfn_it);
                vtd_as->context_cache_entry.context_cache_gen = 0;
            }
        }
    }
}

/* Context-cache invalidation
 * Returns the Context Actual Invalidation Granularity.
 * @val: the content of the CCMD_REG
 */
static uint64_t vtd_context_cache_invalidate(IntelIOMMUState *s, uint64_t val)
{
    uint64_t caig;
    uint64_t type = val & VTD_CCMD_CIRG_MASK;

    switch (type) {
    case VTD_CCMD_DOMAIN_INVL:
        VTD_DPRINTF(INV, "domain-selective invalidation domain 0x%"PRIx16,
                    (uint16_t)VTD_CCMD_DID(val));
        /* Fall through */
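        /*
         * Treating a domain-selective invalidation as a global one is
         * spec-permitted: hardware may perform (and report via CAIG) a
         * coarser actual invalidation granularity than requested.
         */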
    case VTD_CCMD_GLOBAL_INVL:
        VTD_DPRINTF(INV, "global invalidation");
        caig = VTD_CCMD_GLOBAL_INVL_A;
        vtd_context_global_invalidate(s);
        break;

    case VTD_CCMD_DEVICE_INVL:
        caig = VTD_CCMD_DEVICE_INVL_A;
        vtd_context_device_invalidate(s, VTD_CCMD_SID(val), VTD_CCMD_FM(val));
        break;

    default:
        VTD_DPRINTF(GENERAL, "error: invalid granularity");
        caig = 0;
    }
    return caig;
}

static void vtd_iotlb_global_invalidate(IntelIOMMUState *s)
{
    vtd_reset_iotlb(s);
}

static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
{
    g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain,
                                &domain_id);
}

static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
                                      hwaddr addr, uint8_t am)
{
    VTDIOTLBPageInvInfo info;

    assert(am <= VTD_MAMV);
    info.domain_id = domain_id;
    info.addr = addr;
    info.mask = ~((1 << am) - 1);
    g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
}
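
/*
 * The address mask @am encodes the size of a page-selective invalidation:
 * it covers 2^am contiguous 4KiB pages, so am = 0 invalidates a single
 * page and am = 9, for example, invalidates a 2MiB-aligned range.
 */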

/* Flush IOTLB
 * Returns the IOTLB Actual Invalidation Granularity.
 * @val: the content of the IOTLB_REG
 */
static uint64_t vtd_iotlb_flush(IntelIOMMUState *s, uint64_t val)
{
    uint64_t iaig;
    uint64_t type = val & VTD_TLB_FLUSH_GRANU_MASK;
    uint16_t domain_id;
    hwaddr addr;
    uint8_t am;

    switch (type) {
    case VTD_TLB_GLOBAL_FLUSH:
        VTD_DPRINTF(INV, "global invalidation");
        iaig = VTD_TLB_GLOBAL_FLUSH_A;
        vtd_iotlb_global_invalidate(s);
        break;

    case VTD_TLB_DSI_FLUSH:
        domain_id = VTD_TLB_DID(val);
        VTD_DPRINTF(INV, "domain-selective invalidation domain 0x%"PRIx16,
                    domain_id);
        iaig = VTD_TLB_DSI_FLUSH_A;
        vtd_iotlb_domain_invalidate(s, domain_id);
        break;

    case VTD_TLB_PSI_FLUSH:
        domain_id = VTD_TLB_DID(val);
        addr = vtd_get_quad_raw(s, DMAR_IVA_REG);
        am = VTD_IVA_AM(addr);
        addr = VTD_IVA_ADDR(addr);
        VTD_DPRINTF(INV, "page-selective invalidation domain 0x%"PRIx16
                    " addr 0x%"PRIx64 " mask %"PRIu8, domain_id, addr, am);
        if (am > VTD_MAMV) {
            VTD_DPRINTF(GENERAL, "error: supported max address mask value is "
                        "%"PRIu8, (uint8_t)VTD_MAMV);
            iaig = 0;
            break;
        }
        iaig = VTD_TLB_PSI_FLUSH_A;
        vtd_iotlb_page_invalidate(s, domain_id, addr, am);
        break;

    default:
        VTD_DPRINTF(GENERAL, "error: invalid granularity");
        iaig = 0;
    }
    return iaig;
}

static inline bool vtd_queued_inv_enable_check(IntelIOMMUState *s)
{
    return s->iq_tail == 0;
}

static inline bool vtd_queued_inv_disable_check(IntelIOMMUState *s)
{
    return s->qi_enabled && (s->iq_tail == s->iq_head) &&
           (s->iq_last_desc_type == VTD_INV_DESC_WAIT);
}
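
/*
 * Preconditions mirrored from the queued-invalidation programming model:
 * enabling expects an empty queue (tail still zero), and disabling is
 * only honoured once the queue has drained (head == tail) with an
 * Invalidation Wait Descriptor as the last descriptor processed.
 */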

static void vtd_handle_gcmd_qie(IntelIOMMUState *s, bool en)
{
    uint64_t iqa_val = vtd_get_quad_raw(s, DMAR_IQA_REG);

    VTD_DPRINTF(INV, "Queued Invalidation Enable %s", (en ? "on" : "off"));
    if (en) {
        if (vtd_queued_inv_enable_check(s)) {
            s->iq = iqa_val & VTD_IQA_IQA_MASK;
            /* 2^(x+8) entries */
            s->iq_size = 1UL << ((iqa_val & VTD_IQA_QS) + 8);
            s->qi_enabled = true;
            VTD_DPRINTF(INV, "DMAR_IQA_REG 0x%"PRIx64, iqa_val);
            VTD_DPRINTF(INV, "Invalidation Queue addr 0x%"PRIx64 " size %d",
                        s->iq, s->iq_size);
            /* Ok - report back to driver */
            vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_QIES);
        } else {
            VTD_DPRINTF(GENERAL, "error: can't enable Queued Invalidation: "
                        "tail %"PRIu16, s->iq_tail);
        }
    } else {
        if (vtd_queued_inv_disable_check(s)) {
            /* disable Queued Invalidation */
            vtd_set_quad_raw(s, DMAR_IQH_REG, 0);
            s->iq_head = 0;
            s->qi_enabled = false;
            /* Ok - report back to driver */
            vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_QIES, 0);
        } else {
            VTD_DPRINTF(GENERAL, "error: can't disable Queued Invalidation: "
                        "head %"PRIu16 ", tail %"PRIu16
                        ", last_descriptor %"PRIu8,
                        s->iq_head, s->iq_tail, s->iq_last_desc_type);
        }
    }
}

/* Set Root Table Pointer */
static void vtd_handle_gcmd_srtp(IntelIOMMUState *s)
{
    VTD_DPRINTF(CSR, "set Root Table Pointer");

    vtd_root_table_setup(s);
    /* Ok - report back to driver */
    vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_RTPS);
}

/* Set Interrupt Remap Table Pointer */
static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s)
{
    VTD_DPRINTF(CSR, "set Interrupt Remap Table Pointer");

    vtd_interrupt_remap_table_setup(s);
    /* Ok - report back to driver */
    vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS);
}

/* Handle Translation Enable/Disable */
static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool en)
{
    VTD_DPRINTF(CSR, "Translation Enable %s", (en ? "on" : "off"));

    if (en) {
        s->dmar_enabled = true;
        /* Ok - report back to driver */
        vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_TES);
    } else {
        s->dmar_enabled = false;

        /* Clear the index of Fault Recording Register */
        s->next_frcd_reg = 0;
        /* Ok - report back to driver */
        vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_TES, 0);
    }
}

/* Handle Interrupt Remap Enable/Disable */
static void vtd_handle_gcmd_ire(IntelIOMMUState *s, bool en)
{
    VTD_DPRINTF(CSR, "Interrupt Remap Enable %s", (en ? "on" : "off"));

    if (en) {
        s->intr_enabled = true;
        /* Ok - report back to driver */
        vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRES);
    } else {
        s->intr_enabled = false;
        /* Ok - report back to driver */
        vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_IRES, 0);
    }
}

/* Handle write to Global Command Register */
static void vtd_handle_gcmd_write(IntelIOMMUState *s)
{
    uint32_t status = vtd_get_long_raw(s, DMAR_GSTS_REG);
    uint32_t val = vtd_get_long_raw(s, DMAR_GCMD_REG);
    uint32_t changed = status ^ val;

    VTD_DPRINTF(CSR, "value 0x%"PRIx32 " status 0x%"PRIx32, val, status);
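    /* TE, QIE and IRE act on a change of their bit relative to GSTS, while
     * SRTP and SIRTP are one-shot commands that take effect on any write of
     * 1, so the latter are tested against val rather than changed.
     */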
    if (changed & VTD_GCMD_TE) {
        /* Translation enable/disable */
        vtd_handle_gcmd_te(s, val & VTD_GCMD_TE);
    }
    if (val & VTD_GCMD_SRTP) {
        /* Set/update the root-table pointer */
        vtd_handle_gcmd_srtp(s);
    }
    if (changed & VTD_GCMD_QIE) {
        /* Queued Invalidation Enable */
        vtd_handle_gcmd_qie(s, val & VTD_GCMD_QIE);
    }
    if (val & VTD_GCMD_SIRTP) {
        /* Set/update the interrupt remapping root-table pointer */
        vtd_handle_gcmd_sirtp(s);
    }
    if (changed & VTD_GCMD_IRE) {
        /* Interrupt remap enable/disable */
        vtd_handle_gcmd_ire(s, val & VTD_GCMD_IRE);
    }
}

/* Handle write to Context Command Register */
static void vtd_handle_ccmd_write(IntelIOMMUState *s)
{
    uint64_t ret;
    uint64_t val = vtd_get_quad_raw(s, DMAR_CCMD_REG);

    /* Context-cache invalidation request */
    if (val & VTD_CCMD_ICC) {
        if (s->qi_enabled) {
            VTD_DPRINTF(GENERAL, "error: Queued Invalidation enabled, "
                        "should not use register-based invalidation");
            return;
        }
        ret = vtd_context_cache_invalidate(s, val);
        /* Invalidation completed. Clear ICC and report the actual
         * granularity back to the driver.
         */
        vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_ICC, 0ULL);
        ret = vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_CAIG_MASK,
                                      ret);
        VTD_DPRINTF(INV, "CCMD_REG write-back val: 0x%"PRIx64, ret);
    }
}

/* Handle write to IOTLB Invalidation Register */
static void vtd_handle_iotlb_write(IntelIOMMUState *s)
{
    uint64_t ret;
    uint64_t val = vtd_get_quad_raw(s, DMAR_IOTLB_REG);

    /* IOTLB invalidation request */
    if (val & VTD_TLB_IVT) {
        if (s->qi_enabled) {
            VTD_DPRINTF(GENERAL, "error: Queued Invalidation enabled, "
                        "should not use register-based invalidation");
            return;
        }
        ret = vtd_iotlb_flush(s, val);
        /* Invalidation completed. Clear IVT and report the actual
         * granularity back to the driver.
         */
        vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG, VTD_TLB_IVT, 0ULL);
        ret = vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG,
                                      VTD_TLB_FLUSH_GRANU_MASK_A, ret);
        VTD_DPRINTF(INV, "IOTLB_REG write-back val: 0x%"PRIx64, ret);
    }
}

/* Fetch an Invalidation Descriptor from the Invalidation Queue */
static bool vtd_get_inv_desc(dma_addr_t base_addr, uint32_t offset,
                             VTDInvDesc *inv_desc)
{
    dma_addr_t addr = base_addr + offset * sizeof(*inv_desc);
    if (dma_memory_read(&address_space_memory, addr, inv_desc,
        sizeof(*inv_desc))) {
        VTD_DPRINTF(GENERAL, "error: fail to fetch Invalidation Descriptor "
                    "base_addr 0x%"PRIx64 " offset %"PRIu32, base_addr, offset);
        inv_desc->lo = 0;
        inv_desc->hi = 0;

        return false;
    }
    inv_desc->lo = le64_to_cpu(inv_desc->lo);
    inv_desc->hi = le64_to_cpu(inv_desc->hi);
    return true;
}

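/*
 * An Invalidation Wait Descriptor lets the driver detect when earlier
 * descriptors have completed: with the SW bit set, a status dword is
 * written to the address held in the upper half of the descriptor; with
 * the IF bit set, a completion interrupt is raised instead.
 */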
static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc)
{
    if ((inv_desc->hi & VTD_INV_DESC_WAIT_RSVD_HI) ||
        (inv_desc->lo & VTD_INV_DESC_WAIT_RSVD_LO)) {
        VTD_DPRINTF(GENERAL, "error: non-zero reserved field in Invalidation "
                    "Wait Descriptor hi 0x%"PRIx64 " lo 0x%"PRIx64,
                    inv_desc->hi, inv_desc->lo);
        return false;
    }
    if (inv_desc->lo & VTD_INV_DESC_WAIT_SW) {
        /* Status Write */
        uint32_t status_data = (uint32_t)(inv_desc->lo >>
                               VTD_INV_DESC_WAIT_DATA_SHIFT);

        assert(!(inv_desc->lo & VTD_INV_DESC_WAIT_IF));

        /* FIXME: need to be masked with HAW? */
        dma_addr_t status_addr = inv_desc->hi;
        VTD_DPRINTF(INV, "status data 0x%x, status addr 0x%"PRIx64,
                    status_data, status_addr);
        status_data = cpu_to_le32(status_data);
        if (dma_memory_write(&address_space_memory, status_addr, &status_data,
                             sizeof(status_data))) {
            VTD_DPRINTF(GENERAL, "error: fail to perform a coherent write");
            return false;
        }
    } else if (inv_desc->lo & VTD_INV_DESC_WAIT_IF) {
        /* Interrupt flag */
        VTD_DPRINTF(INV, "Invalidation Wait Descriptor interrupt completion");
        vtd_generate_completion_event(s);
    } else {
        VTD_DPRINTF(GENERAL, "error: invalid Invalidation Wait Descriptor: "
                    "hi 0x%"PRIx64 " lo 0x%"PRIx64, inv_desc->hi, inv_desc->lo);
        return false;
    }
    return true;
}

static bool vtd_process_context_cache_desc(IntelIOMMUState *s,
                                           VTDInvDesc *inv_desc)
{
    if ((inv_desc->lo & VTD_INV_DESC_CC_RSVD) || inv_desc->hi) {
        VTD_DPRINTF(GENERAL, "error: non-zero reserved field in Context-cache "
                    "Invalidate Descriptor");
        return false;
    }
    switch (inv_desc->lo & VTD_INV_DESC_CC_G) {
    case VTD_INV_DESC_CC_DOMAIN:
        VTD_DPRINTF(INV, "domain-selective invalidation domain 0x%"PRIx16,
                    (uint16_t)VTD_INV_DESC_CC_DID(inv_desc->lo));
        /* Fall through */
    case VTD_INV_DESC_CC_GLOBAL:
        VTD_DPRINTF(INV, "global invalidation");
        vtd_context_global_invalidate(s);
        break;

    case VTD_INV_DESC_CC_DEVICE:
        vtd_context_device_invalidate(s, VTD_INV_DESC_CC_SID(inv_desc->lo),
                                      VTD_INV_DESC_CC_FM(inv_desc->lo));
        break;

    default:
        VTD_DPRINTF(GENERAL, "error: invalid granularity in Context-cache "
                    "Invalidate Descriptor hi 0x%"PRIx64 " lo 0x%"PRIx64,
                    inv_desc->hi, inv_desc->lo);
        return false;
    }
    return true;
}

static bool vtd_process_iotlb_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc)
{
    uint16_t domain_id;
    uint8_t am;
    hwaddr addr;

    if ((inv_desc->lo & VTD_INV_DESC_IOTLB_RSVD_LO) ||
        (inv_desc->hi & VTD_INV_DESC_IOTLB_RSVD_HI)) {
        VTD_DPRINTF(GENERAL, "error: non-zero reserved field in IOTLB "
                    "Invalidate Descriptor hi 0x%"PRIx64 " lo 0x%"PRIx64,
                    inv_desc->hi, inv_desc->lo);
        return false;
    }

    switch (inv_desc->lo & VTD_INV_DESC_IOTLB_G) {
    case VTD_INV_DESC_IOTLB_GLOBAL:
        VTD_DPRINTF(INV, "global invalidation");
        vtd_iotlb_global_invalidate(s);
        break;

    case VTD_INV_DESC_IOTLB_DOMAIN:
        domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo);
        VTD_DPRINTF(INV, "domain-selective invalidation domain 0x%"PRIx16,
                    domain_id);
        vtd_iotlb_domain_invalidate(s, domain_id);
        break;

    case VTD_INV_DESC_IOTLB_PAGE:
        domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo);
        addr = VTD_INV_DESC_IOTLB_ADDR(inv_desc->hi);
        am = VTD_INV_DESC_IOTLB_AM(inv_desc->hi);
        VTD_DPRINTF(INV, "page-selective invalidation domain 0x%"PRIx16
                    " addr 0x%"PRIx64 " mask %"PRIu8, domain_id, addr, am);
        if (am > VTD_MAMV) {
            VTD_DPRINTF(GENERAL, "error: supported max address mask value is "
                        "%"PRIu8, (uint8_t)VTD_MAMV);
            return false;
        }
        vtd_iotlb_page_invalidate(s, domain_id, addr, am);
        break;

    default:
        VTD_DPRINTF(GENERAL, "error: invalid granularity in IOTLB Invalidate "
                    "Descriptor hi 0x%"PRIx64 " lo 0x%"PRIx64,
                    inv_desc->hi, inv_desc->lo);
        return false;
    }
    return true;
}

static bool vtd_process_inv_iec_desc(IntelIOMMUState *s,
                                     VTDInvDesc *inv_desc)
{
    VTD_DPRINTF(INV, "inv ir glob %d index %d mask %d",
                inv_desc->iec.granularity,
                inv_desc->iec.index,
                inv_desc->iec.index_mask);

    vtd_iec_notify_all(s, !inv_desc->iec.granularity,
                       inv_desc->iec.index,
                       inv_desc->iec.index_mask);

    return true;
}
1444
1445static bool vtd_process_inv_desc(IntelIOMMUState *s)
1446{
1447    VTDInvDesc inv_desc;
1448    uint8_t desc_type;
1449
1450    VTD_DPRINTF(INV, "iq head %"PRIu16, s->iq_head);
1451    if (!vtd_get_inv_desc(s->iq, s->iq_head, &inv_desc)) {
1452        s->iq_last_desc_type = VTD_INV_DESC_NONE;
1453        return false;
1454    }
1455    desc_type = inv_desc.lo & VTD_INV_DESC_TYPE;
1456    /* FIXME: should update at first or at last? */
1457    s->iq_last_desc_type = desc_type;
1458
1459    switch (desc_type) {
1460    case VTD_INV_DESC_CC:
1461        VTD_DPRINTF(INV, "Context-cache Invalidate Descriptor hi 0x%"PRIx64
1462                    " lo 0x%"PRIx64, inv_desc.hi, inv_desc.lo);
1463        if (!vtd_process_context_cache_desc(s, &inv_desc)) {
1464            return false;
1465        }
1466        break;
1467
1468    case VTD_INV_DESC_IOTLB:
1469        VTD_DPRINTF(INV, "IOTLB Invalidate Descriptor hi 0x%"PRIx64
1470                    " lo 0x%"PRIx64, inv_desc.hi, inv_desc.lo);
1471        if (!vtd_process_iotlb_desc(s, &inv_desc)) {
1472            return false;
1473        }
1474        break;
1475
1476    case VTD_INV_DESC_WAIT:
1477        VTD_DPRINTF(INV, "Invalidation Wait Descriptor hi 0x%"PRIx64
1478                    " lo 0x%"PRIx64, inv_desc.hi, inv_desc.lo);
1479        if (!vtd_process_wait_desc(s, &inv_desc)) {
1480            return false;
1481        }
1482        break;
1483
1484    case VTD_INV_DESC_IEC:
1485        VTD_DPRINTF(INV, "Invalidation Interrupt Entry Cache "
1486                    "Descriptor hi 0x%"PRIx64 " lo 0x%"PRIx64,
1487                    inv_desc.hi, inv_desc.lo);
1488        if (!vtd_process_inv_iec_desc(s, &inv_desc)) {
1489            return false;
1490        }
1491        break;
1492
1493    default:
1494        VTD_DPRINTF(GENERAL, "error: unknown Invalidation Descriptor type "
1495                    "hi 0x%"PRIx64 " lo 0x%"PRIx64 " type %"PRIu8,
1496                    inv_desc.hi, inv_desc.lo, desc_type);
1497        return false;
1498    }
1499    s->iq_head++;
1500    if (s->iq_head == s->iq_size) {
1501        s->iq_head = 0;
1502    }
1503    return true;
1504}
1505
1506/* Try to fetch and process more Invalidation Descriptors */
1507static void vtd_fetch_inv_desc(IntelIOMMUState *s)
1508{
1509    VTD_DPRINTF(INV, "fetch Invalidation Descriptors");
1510    if (s->iq_tail >= s->iq_size) {
1511        /* Detects an invalid Tail pointer */
1512        VTD_DPRINTF(GENERAL, "error: iq_tail is %"PRIu16
1513                    " while iq_size is %"PRIu16, s->iq_tail, s->iq_size);
1514        vtd_handle_inv_queue_error(s);
1515        return;
1516    }
1517    while (s->iq_head != s->iq_tail) {
1518        if (!vtd_process_inv_desc(s)) {
1519            /* Invalidation Queue Errors */
1520            vtd_handle_inv_queue_error(s);
1521            break;
1522        }
1523        /* Keep IQH_REG up to date after each processed descriptor */
1524        vtd_set_quad_raw(s, DMAR_IQH_REG,
1525                         (((uint64_t)(s->iq_head)) << VTD_IQH_QH_SHIFT) &
1526                         VTD_IQH_QH_MASK);
1527    }
1528}
1529
1530/* Handle write to Invalidation Queue Tail Register */
1531static void vtd_handle_iqt_write(IntelIOMMUState *s)
1532{
1533    uint64_t val = vtd_get_quad_raw(s, DMAR_IQT_REG);
1534
1535    s->iq_tail = VTD_IQT_QT(val);
1536    VTD_DPRINTF(INV, "set iq tail %"PRIu16, s->iq_tail);
1537    if (s->qi_enabled && !(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) {
1538        /* Process Invalidation Queue here */
1539        vtd_fetch_inv_desc(s);
1540    }
1541}
1542
1543static void vtd_handle_fsts_write(IntelIOMMUState *s)
1544{
1545    uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG);
1546    uint32_t fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG);
1547    uint32_t status_fields = VTD_FSTS_PFO | VTD_FSTS_PPF | VTD_FSTS_IQE;
1548
1549    if ((fectl_reg & VTD_FECTL_IP) && !(fsts_reg & status_fields)) {
1550        vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0);
1551        VTD_DPRINTF(FLOG, "all pending interrupt conditions serviced, clear "
1552                    "IP field of FECTL_REG");
1553    }
1554    /* FIXME: when IQE is cleared and Queued Invalidation is enabled, should
1555     * we try to fetch any pending Invalidation Descriptors?
1556     */
1557}
1558
1559static void vtd_handle_fectl_write(IntelIOMMUState *s)
1560{
1561    uint32_t fectl_reg;
1562    /* FIXME: when software clears the IM field, check the IP field. But do
1563     * we need to compare the old and new values to conclude that software
1564     * cleared the IM field, or is it enough to check that IM is zero?
1565     */
1566    fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG);
1567    if ((fectl_reg & VTD_FECTL_IP) && !(fectl_reg & VTD_FECTL_IM)) {
1568        vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG);
1569        vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0);
1570        VTD_DPRINTF(FLOG, "IM field is cleared, generate "
1571                    "fault event interrupt");
1572    }
1573}
1574
1575static void vtd_handle_ics_write(IntelIOMMUState *s)
1576{
1577    uint32_t ics_reg = vtd_get_long_raw(s, DMAR_ICS_REG);
1578    uint32_t iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG);
1579
1580    if ((iectl_reg & VTD_IECTL_IP) && !(ics_reg & VTD_ICS_IWC)) {
1581        vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0);
1582        VTD_DPRINTF(INV, "pending completion interrupt condition serviced, "
1583                    "clear IP field of IECTL_REG");
1584    }
1585}
1586
1587static void vtd_handle_iectl_write(IntelIOMMUState *s)
1588{
1589    uint32_t iectl_reg;
1590    /* FIXME: when software clears the IM field, check the IP field. But do
1591     * we need to compare the old and new values to conclude that software
1592     * cleared the IM field, or is it enough to check that IM is zero?
1593     */
1594    iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG);
1595    if ((iectl_reg & VTD_IECTL_IP) && !(iectl_reg & VTD_IECTL_IM)) {
1596        vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG);
1597        vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0);
1598        VTD_DPRINTF(INV, "IM field is cleared, generate "
1599                    "invalidation event interrupt");
1600    }
1601}
1602
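/* MMIO read handler for the remapping hardware register set. 64-bit
 * registers that software may read with two 32-bit accesses (RTADDR, IQA)
 * are special-cased; everything else is served from the csr array.
 */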
1603static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size)
1604{
1605    IntelIOMMUState *s = opaque;
1606    uint64_t val;
1607
1608    if (addr + size > DMAR_REG_SIZE) {
1609        VTD_DPRINTF(GENERAL, "error: addr outside region: max 0x%"PRIx64
1610                    ", got 0x%"PRIx64 " %d",
1611                    (uint64_t)DMAR_REG_SIZE, addr, size);
1612        return (uint64_t)-1;
1613    }
1614
1615    switch (addr) {
1616    /* Root Table Address Register, 64-bit */
1617    case DMAR_RTADDR_REG:
1618        if (size == 4) {
1619            val = s->root & ((1ULL << 32) - 1);
1620        } else {
1621            val = s->root;
1622        }
1623        break;
1624
1625    case DMAR_RTADDR_REG_HI:
1626        assert(size == 4);
1627        val = s->root >> 32;
1628        break;
1629
1630    /* Invalidation Queue Address Register, 64-bit */
1631    case DMAR_IQA_REG:
1632        val = s->iq | (vtd_get_quad(s, DMAR_IQA_REG) & VTD_IQA_QS);
1633        if (size == 4) {
1634            val = val & ((1ULL << 32) - 1);
1635        }
1636        break;
1637
1638    case DMAR_IQA_REG_HI:
1639        assert(size == 4);
1640        val = s->iq >> 32;
1641        break;
1642
1643    default:
1644        if (size == 4) {
1645            val = vtd_get_long(s, addr);
1646        } else {
1647            val = vtd_get_quad(s, addr);
1648        }
1649    }
1650    VTD_DPRINTF(CSR, "addr 0x%"PRIx64 " size %d val 0x%"PRIx64,
1651                addr, size, val);
1652    return val;
1653}
1654
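/* MMIO write handler: update the register file honoring the RW/W1C masks,
 * then run the side effects of the registers that have any (GCMD, CCMD,
 * IOTLB, FSTS, FECTL, IQT, ICS, IECTL, FRCD, ...).
 */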
1655static void vtd_mem_write(void *opaque, hwaddr addr,
1656                          uint64_t val, unsigned size)
1657{
1658    IntelIOMMUState *s = opaque;
1659
1660    if (addr + size > DMAR_REG_SIZE) {
1661        VTD_DPRINTF(GENERAL, "error: addr outside region: max 0x%"PRIx64
1662                    ", got 0x%"PRIx64 " %d",
1663                    (uint64_t)DMAR_REG_SIZE, addr, size);
1664        return;
1665    }
1666
1667    switch (addr) {
1668    /* Global Command Register, 32-bit */
1669    case DMAR_GCMD_REG:
1670        VTD_DPRINTF(CSR, "DMAR_GCMD_REG write addr 0x%"PRIx64
1671                    ", size %d, val 0x%"PRIx64, addr, size, val);
1672        vtd_set_long(s, addr, val);
1673        vtd_handle_gcmd_write(s);
1674        break;
1675
1676    /* Context Command Register, 64-bit */
1677    case DMAR_CCMD_REG:
1678        VTD_DPRINTF(CSR, "DMAR_CCMD_REG write addr 0x%"PRIx64
1679                    ", size %d, val 0x%"PRIx64, addr, size, val);
1680        if (size == 4) {
1681            vtd_set_long(s, addr, val);
1682        } else {
1683            vtd_set_quad(s, addr, val);
1684            vtd_handle_ccmd_write(s);
1685        }
1686        break;
1687
1688    case DMAR_CCMD_REG_HI:
1689        VTD_DPRINTF(CSR, "DMAR_CCMD_REG_HI write addr 0x%"PRIx64
1690                    ", size %d, val 0x%"PRIx64, addr, size, val);
1691        assert(size == 4);
1692        vtd_set_long(s, addr, val);
1693        vtd_handle_ccmd_write(s);
1694        break;
1695
1696    /* IOTLB Invalidation Register, 64-bit */
1697    case DMAR_IOTLB_REG:
1698        VTD_DPRINTF(INV, "DMAR_IOTLB_REG write addr 0x%"PRIx64
1699                    ", size %d, val 0x%"PRIx64, addr, size, val);
1700        if (size == 4) {
1701            vtd_set_long(s, addr, val);
1702        } else {
1703            vtd_set_quad(s, addr, val);
1704            vtd_handle_iotlb_write(s);
1705        }
1706        break;
1707
1708    case DMAR_IOTLB_REG_HI:
1709        VTD_DPRINTF(INV, "DMAR_IOTLB_REG_HI write addr 0x%"PRIx64
1710                    ", size %d, val 0x%"PRIx64, addr, size, val);
1711        assert(size == 4);
1712        vtd_set_long(s, addr, val);
1713        vtd_handle_iotlb_write(s);
1714        break;
1715
1716    /* Invalidate Address Register, 64-bit */
1717    case DMAR_IVA_REG:
1718        VTD_DPRINTF(INV, "DMAR_IVA_REG write addr 0x%"PRIx64
1719                    ", size %d, val 0x%"PRIx64, addr, size, val);
1720        if (size == 4) {
1721            vtd_set_long(s, addr, val);
1722        } else {
1723            vtd_set_quad(s, addr, val);
1724        }
1725        break;
1726
1727    case DMAR_IVA_REG_HI:
1728        VTD_DPRINTF(INV, "DMAR_IVA_REG_HI write addr 0x%"PRIx64
1729                    ", size %d, val 0x%"PRIx64, addr, size, val);
1730        assert(size == 4);
1731        vtd_set_long(s, addr, val);
1732        break;
1733
1734    /* Fault Status Register, 32-bit */
1735    case DMAR_FSTS_REG:
1736        VTD_DPRINTF(FLOG, "DMAR_FSTS_REG write addr 0x%"PRIx64
1737                    ", size %d, val 0x%"PRIx64, addr, size, val);
1738        assert(size == 4);
1739        vtd_set_long(s, addr, val);
1740        vtd_handle_fsts_write(s);
1741        break;
1742
1743    /* Fault Event Control Register, 32-bit */
1744    case DMAR_FECTL_REG:
1745        VTD_DPRINTF(FLOG, "DMAR_FECTL_REG write addr 0x%"PRIx64
1746                    ", size %d, val 0x%"PRIx64, addr, size, val);
1747        assert(size == 4);
1748        vtd_set_long(s, addr, val);
1749        vtd_handle_fectl_write(s);
1750        break;
1751
1752    /* Fault Event Data Register, 32-bit */
1753    case DMAR_FEDATA_REG:
1754        VTD_DPRINTF(FLOG, "DMAR_FEDATA_REG write addr 0x%"PRIx64
1755                    ", size %d, val 0x%"PRIx64, addr, size, val);
1756        assert(size == 4);
1757        vtd_set_long(s, addr, val);
1758        break;
1759
1760    /* Fault Event Address Register, 32-bit */
1761    case DMAR_FEADDR_REG:
1762        VTD_DPRINTF(FLOG, "DMAR_FEADDR_REG write addr 0x%"PRIx64
1763                    ", size %d, val 0x%"PRIx64, addr, size, val);
1764        assert(size == 4);
1765        vtd_set_long(s, addr, val);
1766        break;
1767
1768    /* Fault Event Upper Address Register, 32-bit */
1769    case DMAR_FEUADDR_REG:
1770        VTD_DPRINTF(FLOG, "DMAR_FEUADDR_REG write addr 0x%"PRIx64
1771                    ", size %d, val 0x%"PRIx64, addr, size, val);
1772        assert(size == 4);
1773        vtd_set_long(s, addr, val);
1774        break;
1775
1776    /* Protected Memory Enable Register, 32-bit */
1777    case DMAR_PMEN_REG:
1778        VTD_DPRINTF(CSR, "DMAR_PMEN_REG write addr 0x%"PRIx64
1779                    ", size %d, val 0x%"PRIx64, addr, size, val);
1780        assert(size == 4);
1781        vtd_set_long(s, addr, val);
1782        break;
1783
1784    /* Root Table Address Register, 64-bit */
1785    case DMAR_RTADDR_REG:
1786        VTD_DPRINTF(CSR, "DMAR_RTADDR_REG write addr 0x%"PRIx64
1787                    ", size %d, val 0x%"PRIx64, addr, size, val);
1788        if (size == 4) {
1789            vtd_set_long(s, addr, val);
1790        } else {
1791            vtd_set_quad(s, addr, val);
1792        }
1793        break;
1794
1795    case DMAR_RTADDR_REG_HI:
1796        VTD_DPRINTF(CSR, "DMAR_RTADDR_REG_HI write addr 0x%"PRIx64
1797                    ", size %d, val 0x%"PRIx64, addr, size, val);
1798        assert(size == 4);
1799        vtd_set_long(s, addr, val);
1800        break;
1801
1802    /* Invalidation Queue Tail Register, 64-bit */
1803    case DMAR_IQT_REG:
1804        VTD_DPRINTF(INV, "DMAR_IQT_REG write addr 0x%"PRIx64
1805                    ", size %d, val 0x%"PRIx64, addr, size, val);
1806        if (size == 4) {
1807            vtd_set_long(s, addr, val);
1808        } else {
1809            vtd_set_quad(s, addr, val);
1810        }
1811        vtd_handle_iqt_write(s);
1812        break;
1813
1814    case DMAR_IQT_REG_HI:
1815        VTD_DPRINTF(INV, "DMAR_IQT_REG_HI write addr 0x%"PRIx64
1816                    ", size %d, val 0x%"PRIx64, addr, size, val);
1817        assert(size == 4);
1818        vtd_set_long(s, addr, val);
1819        /* Bits 63:19 of IQT_REG are RsvdZ, do nothing here */
1820        break;
1821
1822    /* Invalidation Queue Address Register, 64-bit */
1823    case DMAR_IQA_REG:
1824        VTD_DPRINTF(INV, "DMAR_IQA_REG write addr 0x%"PRIx64
1825                    ", size %d, val 0x%"PRIx64, addr, size, val);
1826        if (size == 4) {
1827            vtd_set_long(s, addr, val);
1828        } else {
1829            vtd_set_quad(s, addr, val);
1830        }
1831        break;
1832
1833    case DMAR_IQA_REG_HI:
1834        VTD_DPRINTF(INV, "DMAR_IQA_REG_HI write addr 0x%"PRIx64
1835                    ", size %d, val 0x%"PRIx64, addr, size, val);
1836        assert(size == 4);
1837        vtd_set_long(s, addr, val);
1838        break;
1839
1840    /* Invalidation Completion Status Register, 32-bit */
1841    case DMAR_ICS_REG:
1842        VTD_DPRINTF(INV, "DMAR_ICS_REG write addr 0x%"PRIx64
1843                    ", size %d, val 0x%"PRIx64, addr, size, val);
1844        assert(size == 4);
1845        vtd_set_long(s, addr, val);
1846        vtd_handle_ics_write(s);
1847        break;
1848
1849    /* Invalidation Event Control Register, 32-bit */
1850    case DMAR_IECTL_REG:
1851        VTD_DPRINTF(INV, "DMAR_IECTL_REG write addr 0x%"PRIx64
1852                    ", size %d, val 0x%"PRIx64, addr, size, val);
1853        assert(size == 4);
1854        vtd_set_long(s, addr, val);
1855        vtd_handle_iectl_write(s);
1856        break;
1857
1858    /* Invalidation Event Data Register, 32-bit */
1859    case DMAR_IEDATA_REG:
1860        VTD_DPRINTF(INV, "DMAR_IEDATA_REG write addr 0x%"PRIx64
1861                    ", size %d, val 0x%"PRIx64, addr, size, val);
1862        assert(size == 4);
1863        vtd_set_long(s, addr, val);
1864        break;
1865
1866    /* Invalidation Event Address Register, 32-bit */
1867    case DMAR_IEADDR_REG:
1868        VTD_DPRINTF(INV, "DMAR_IEADDR_REG write addr 0x%"PRIx64
1869                    ", size %d, val 0x%"PRIx64, addr, size, val);
1870        assert(size == 4);
1871        vtd_set_long(s, addr, val);
1872        break;
1873
1874    /* Invalidation Event Upper Address Register, 32-bit */
1875    case DMAR_IEUADDR_REG:
1876        VTD_DPRINTF(INV, "DMAR_IEUADDR_REG write addr 0x%"PRIx64
1877                    ", size %d, val 0x%"PRIx64, addr, size, val);
1878        assert(size == 4);
1879        vtd_set_long(s, addr, val);
1880        break;
1881
1882    /* Fault Recording Registers, 128-bit */
1883    case DMAR_FRCD_REG_0_0:
1884        VTD_DPRINTF(FLOG, "DMAR_FRCD_REG_0_0 write addr 0x%"PRIx64
1885                    ", size %d, val 0x%"PRIx64, addr, size, val);
1886        if (size == 4) {
1887            vtd_set_long(s, addr, val);
1888        } else {
1889            vtd_set_quad(s, addr, val);
1890        }
1891        break;
1892
1893    case DMAR_FRCD_REG_0_1:
1894        VTD_DPRINTF(FLOG, "DMAR_FRCD_REG_0_1 write addr 0x%"PRIx64
1895                    ", size %d, val 0x%"PRIx64, addr, size, val);
1896        assert(size == 4);
1897        vtd_set_long(s, addr, val);
1898        break;
1899
1900    case DMAR_FRCD_REG_0_2:
1901        VTD_DPRINTF(FLOG, "DMAR_FRCD_REG_0_2 write addr 0x%"PRIx64
1902                    ", size %d, val 0x%"PRIx64, addr, size, val);
1903        if (size == 4) {
1904            vtd_set_long(s, addr, val);
1905        } else {
1906            vtd_set_quad(s, addr, val);
1907            /* May clear bit 127 (Fault), update PPF */
1908            vtd_update_fsts_ppf(s);
1909        }
1910        break;
1911
1912    case DMAR_FRCD_REG_0_3:
1913        VTD_DPRINTF(FLOG, "DMAR_FRCD_REG_0_3 write addr 0x%"PRIx64
1914                    ", size %d, val 0x%"PRIx64, addr, size, val);
1915        assert(size == 4);
1916        vtd_set_long(s, addr, val);
1917        /* May clear bit 127 (Fault), update PPF */
1918        vtd_update_fsts_ppf(s);
1919        break;
1920
1921    case DMAR_IRTA_REG:
1922        VTD_DPRINTF(IR, "DMAR_IRTA_REG write addr 0x%"PRIx64
1923                    ", size %d, val 0x%"PRIx64, addr, size, val);
1924        if (size == 4) {
1925            vtd_set_long(s, addr, val);
1926        } else {
1927            vtd_set_quad(s, addr, val);
1928        }
1929        break;
1930
1931    case DMAR_IRTA_REG_HI:
1932        VTD_DPRINTF(IR, "DMAR_IRTA_REG_HI write addr 0x%"PRIx64
1933                    ", size %d, val 0x%"PRIx64, addr, size, val);
1934        assert(size == 4);
1935        vtd_set_long(s, addr, val);
1936        break;
1937
1938    default:
1939        VTD_DPRINTF(GENERAL, "error: unhandled reg write addr 0x%"PRIx64
1940                    ", size %d, val 0x%"PRIx64, addr, size, val);
1941        if (size == 4) {
1942            vtd_set_long(s, addr, val);
1943        } else {
1944            vtd_set_quad(s, addr, val);
1945        }
1946    }
1947}
1948
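/* Entry point of the IOMMU address translation. With DMA remapping
 * disabled this is an identity map at 4K granularity; otherwise the
 * context and page tables are walked in vtd_do_iommu_translate().
 */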
1949static IOMMUTLBEntry vtd_iommu_translate(MemoryRegion *iommu, hwaddr addr,
1950                                         bool is_write)
1951{
1952    VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
1953    IntelIOMMUState *s = vtd_as->iommu_state;
1954    IOMMUTLBEntry ret = {
1955        .target_as = &address_space_memory,
1956        .iova = addr,
1957        .translated_addr = 0,
1958        .addr_mask = ~(hwaddr)0,
1959        .perm = IOMMU_NONE,
1960    };
1961
1962    if (!s->dmar_enabled) {
1963        /* DMAR disabled, passthrough, use 4K pages */
1964        ret.iova = addr & VTD_PAGE_MASK_4K;
1965        ret.translated_addr = addr & VTD_PAGE_MASK_4K;
1966        ret.addr_mask = ~VTD_PAGE_MASK_4K;
1967        ret.perm = IOMMU_RW;
1968        return ret;
1969    }
1970
1971    vtd_do_iommu_translate(vtd_as, vtd_as->bus, vtd_as->devfn, addr,
1972                           is_write, &ret);
1973    VTD_DPRINTF(MMU,
1974                "bus %"PRIu8 " slot %"PRIu8 " func %"PRIu8 " devfn %"PRIu8
1975                " gpa 0x%"PRIx64 " hpa 0x%"PRIx64, pci_bus_num(vtd_as->bus),
1976                VTD_PCI_SLOT(vtd_as->devfn), VTD_PCI_FUNC(vtd_as->devfn),
1977                vtd_as->devfn, addr, ret.translated_addr);
1978    return ret;
1979}
1980
1981static void vtd_iommu_notify_flag_changed(MemoryRegion *iommu,
1982                                          IOMMUNotifierFlag old,
1983                                          IOMMUNotifierFlag new)
1984{
1985    VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
1986
1987    if (new & IOMMU_NOTIFIER_MAP) {
1988        error_report("Device at bus %s addr %02x.%d requires an iommu "
1989                     "notifier, which is currently not supported by "
1990                     "intel-iommu emulation",
1991                     vtd_as->bus->qbus.name, PCI_SLOT(vtd_as->devfn),
1992                     PCI_FUNC(vtd_as->devfn));
1993        exit(1);
1994    }
1995}
1996
1997static const VMStateDescription vtd_vmstate = {
1998    .name = "iommu-intel",
1999    .unmigratable = 1,
2000};
2001
2002static const MemoryRegionOps vtd_mem_ops = {
2003    .read = vtd_mem_read,
2004    .write = vtd_mem_write,
2005    .endianness = DEVICE_LITTLE_ENDIAN,
2006    .impl = {
2007        .min_access_size = 4,
2008        .max_access_size = 8,
2009    },
2010    .valid = {
2011        .min_access_size = 4,
2012        .max_access_size = 8,
2013    },
2014};
2015
2016static Property vtd_properties[] = {
2017    DEFINE_PROP_UINT32("version", IntelIOMMUState, version, 0),
2018    DEFINE_PROP_ON_OFF_AUTO("eim", IntelIOMMUState, intr_eim,
2019                            ON_OFF_AUTO_AUTO),
2020    DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false),
2021    DEFINE_PROP_END_OF_LIST(),
2022};
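
/* An intel-iommu device is typically created on a Q35 machine, e.g.
 * (illustrative command line; the "intremap" property comes from the
 * x86-iommu base class):
 *
 *   qemu-system-x86_64 -machine q35,kernel-irqchip=split \
 *                      -device intel-iommu,intremap=on,eim=on
 */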
2023
2024/* Read the IRTE entry at the given index */
2025static int vtd_irte_get(IntelIOMMUState *iommu, uint16_t index,
2026                        VTD_IR_TableEntry *entry, uint16_t sid)
2027{
2028    static const uint16_t vtd_svt_mask[VTD_SQ_MAX] =
2029        {0xffff, 0xfffb, 0xfff9, 0xfff8};
2030    dma_addr_t addr = 0x00;
2031    uint16_t mask, source_id;
2032    uint8_t bus, bus_max, bus_min;
2033
2034    addr = iommu->intr_root + index * sizeof(*entry);
2035    if (dma_memory_read(&address_space_memory, addr, entry,
2036                        sizeof(*entry))) {
2037        VTD_DPRINTF(GENERAL, "error: failed to access IR root at 0x%"PRIx64
2038                    " + %"PRIu16, iommu->intr_root, index);
2039        return -VTD_FR_IR_ROOT_INVAL;
2040    }
2041
2042    if (!entry->irte.present) {
2043        VTD_DPRINTF(GENERAL, "error: present flag not set in IRTE"
2044                    " entry index %u value 0x%"PRIx64 " 0x%"PRIx64,
2045                    index, le64_to_cpu(entry->data[1]),
2046                    le64_to_cpu(entry->data[0]));
2047        return -VTD_FR_IR_ENTRY_P;
2048    }
2049
2050    if (entry->irte.__reserved_0 || entry->irte.__reserved_1 ||
2051        entry->irte.__reserved_2) {
2052        VTD_DPRINTF(GENERAL, "error: IRTE entry index %"PRIu16
2053                    " reserved fields non-zero: 0x%"PRIx64 " 0x%"PRIx64,
2054                    index, le64_to_cpu(entry->data[1]),
2055                    le64_to_cpu(entry->data[0]));
2056        return -VTD_FR_IR_IRTE_RSVD;
2057    }
2058
2059    if (sid != X86_IOMMU_SID_INVALID) {
2060        /* Validate IRTE SID */
2061        source_id = le32_to_cpu(entry->irte.source_id);
2062        switch (entry->irte.sid_vtype) {
2063        case VTD_SVT_NONE:
2064            VTD_DPRINTF(IR, "No SID validation for IRTE index %d", index);
2065            break;
2066
2067        case VTD_SVT_ALL:
2068            mask = vtd_svt_mask[entry->irte.sid_q];
2069            if ((source_id & mask) != (sid & mask)) {
2070                VTD_DPRINTF(GENERAL, "SID validation for IRTE index "
2071                            "%d failed (reqid 0x%04x sid 0x%04x)", index,
2072                            sid, source_id);
2073                return -VTD_FR_IR_SID_ERR;
2074            }
2075            break;
2076
2077        case VTD_SVT_BUS:
2078            bus_max = source_id >> 8;
2079            bus_min = source_id & 0xff;
2080            bus = sid >> 8;
2081            if (bus > bus_max || bus < bus_min) {
2082                VTD_DPRINTF(GENERAL, "SID validation for IRTE index %d "
2083                            "failed (bus %d outside %d-%d)", index, bus,
2084                            bus_min, bus_max);
2085                return -VTD_FR_IR_SID_ERR;
2086            }
2087            break;
2088
2089        default:
2090            VTD_DPRINTF(GENERAL, "Invalid SVT bits (0x%x) in IRTE index "
2091                        "%d", entry->irte.sid_vtype, index);
2092            /* Take this as verification failure. */
2093            return -VTD_FR_IR_SID_ERR;
2095        }
2096    }
2097
2098    return 0;
2099}
2100
2101/* Fetch IRQ information for the given IR index */
2102static int vtd_remap_irq_get(IntelIOMMUState *iommu, uint16_t index,
2103                             VTDIrq *irq, uint16_t sid)
2104{
2105    VTD_IR_TableEntry irte = {};
2106    int ret = 0;
2107
2108    ret = vtd_irte_get(iommu, index, &irte, sid);
2109    if (ret) {
2110        return ret;
2111    }
2112
2113    irq->trigger_mode = irte.irte.trigger_mode;
2114    irq->vector = irte.irte.vector;
2115    irq->delivery_mode = irte.irte.delivery_mode;
2116    irq->dest = le32_to_cpu(irte.irte.dest_id);
2117    if (!iommu->intr_eime) {
2118#define  VTD_IR_APIC_DEST_MASK         (0xff00ULL)
2119#define  VTD_IR_APIC_DEST_SHIFT        (8)
2120        irq->dest = (irq->dest & VTD_IR_APIC_DEST_MASK) >>
2121            VTD_IR_APIC_DEST_SHIFT;
2122    }
2123    irq->dest_mode = irte.irte.dest_mode;
2124    irq->redir_hint = irte.irte.redir_hint;
2125
2126    VTD_DPRINTF(IR, "remapping interrupt index %d: trig:%u,vec:%u,"
2127                "deliver:%u,dest:%u,dest_mode:%u", index,
2128                irq->trigger_mode, irq->vector, irq->delivery_mode,
2129                irq->dest, irq->dest_mode);
2130
2131    return 0;
2132}
2133
2134/* Generate one MSI message from VTDIrq info */
2135static void vtd_generate_msi_message(VTDIrq *irq, MSIMessage *msg_out)
2136{
2137    VTD_MSIMessage msg = {};
2138
2139    /* Generate address bits */
2140    msg.dest_mode = irq->dest_mode;
2141    msg.redir_hint = irq->redir_hint;
2142    msg.dest = irq->dest;
2143    msg.__addr_hi = irq->dest & 0xffffff00;
2144    msg.__addr_head = cpu_to_le32(0xfee);
2145    /* Keep this from original MSI address bits */
2146    msg.__not_used = irq->msi_addr_last_bits;
2147
2148    /* Generate data bits */
2149    msg.vector = irq->vector;
2150    msg.delivery_mode = irq->delivery_mode;
2151    msg.level = 1;
2152    msg.trigger_mode = irq->trigger_mode;
2153
2154    msg_out->address = msg.msi_addr;
2155    msg_out->data = msg.msi_data;
2156}
2157
2158/* Interrupt remapping for MSI/MSI-X entry */
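/* Compatibility-format requests are passed through unchanged; remappable
 * requests are resolved through the Interrupt Remapping Table, indexed by
 * the handle (plus the subhandle taken from the data word when SHV is
 * set).
 */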
2159static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu,
2160                                   MSIMessage *origin,
2161                                   MSIMessage *translated,
2162                                   uint16_t sid)
2163{
2164    int ret = 0;
2165    VTD_IR_MSIAddress addr;
2166    uint16_t index;
2167    VTDIrq irq = {};
2168
2169    assert(origin && translated);
2170
2171    if (!iommu || !iommu->intr_enabled) {
2172        goto do_not_translate;
2173    }
2174
2175    if (origin->address & VTD_MSI_ADDR_HI_MASK) {
2176        VTD_DPRINTF(GENERAL, "error: MSI addr high 32 bits nonzero"
2177                    " during interrupt remapping: 0x%"PRIx32,
2178                    (uint32_t)((origin->address & VTD_MSI_ADDR_HI_MASK) >>
2179                               VTD_MSI_ADDR_HI_SHIFT));
2180        return -VTD_FR_IR_REQ_RSVD;
2181    }
2182
2183    addr.data = origin->address & VTD_MSI_ADDR_LO_MASK;
2184    if (addr.addr.__head != 0xfee) {
2185        VTD_DPRINTF(GENERAL, "error: MSI addr low 32 bits invalid: "
2186                    "0x%"PRIx32, addr.data);
2187        return -VTD_FR_IR_REQ_RSVD;
2188    }
2189
2190    /* Compatibility-format requests are passed through untranslated. */
2191    if (addr.addr.int_mode != VTD_IR_INT_FORMAT_REMAP) {
2192        goto do_not_translate;
2193    }
2194
2195    index = addr.addr.index_h << 15 | le16_to_cpu(addr.addr.index_l);
2196
2197#define  VTD_IR_MSI_DATA_SUBHANDLE       (0x0000ffff)
2198#define  VTD_IR_MSI_DATA_RESERVED        (0xffff0000)
2199
2200    if (addr.addr.sub_valid) {
2201        /* See VT-d spec 5.1.2.2 and 5.1.3 on subhandle */
2202        index += origin->data & VTD_IR_MSI_DATA_SUBHANDLE;
2203    }
2204
2205    ret = vtd_remap_irq_get(iommu, index, &irq, sid);
2206    if (ret) {
2207        return ret;
2208    }
2209
2210    if (addr.addr.sub_valid) {
2211        VTD_DPRINTF(IR, "received MSI interrupt");
2212        if (origin->data & VTD_IR_MSI_DATA_RESERVED) {
2213            VTD_DPRINTF(GENERAL, "error: MSI data bits non-zero for "
2214                        "interrupt remappable entry: 0x%"PRIx32,
2215                        origin->data);
2216            return -VTD_FR_IR_REQ_RSVD;
2217        }
2218    } else {
2219        uint8_t vector = origin->data & 0xff;
2220        uint8_t trigger_mode = (origin->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1;
2221
2222        VTD_DPRINTF(IR, "received IOAPIC interrupt");
2223        /* The IOAPIC entry vector should match the IRTE vector
2224         * (see VT-d spec 5.1.5.1). */
2225        if (vector != irq.vector) {
2226            VTD_DPRINTF(GENERAL, "IOAPIC vector inconsistent: "
2227                        "entry: %d, IRTE: %d, index: %d",
2228                        vector, irq.vector, index);
2229        }
2230
2231        /* The Trigger Mode field must match the Trigger Mode in the IRTE.
2232         * (see VT-d spec 5.1.5.1). */
2233        if (trigger_mode != irq.trigger_mode) {
2234            VTD_DPRINTF(GENERAL, "IOAPIC trigger mode inconsistent: "
2235                        "entry: %u, IRTE: %u, index: %d",
2236                        trigger_mode, irq.trigger_mode, index);
2237        }
2238
2239    }
2240
2241    /*
2242     * Preserve the last two bits of the original MSI address; the guest OS
2243     * might rely on them, and keeping them does no harm.
2244     */
2245    irq.msi_addr_last_bits = addr.addr.__not_care;
2246
2247    /* Translate VTDIrq to MSI message */
2248    vtd_generate_msi_message(&irq, translated);
2249
2250    VTD_DPRINTF(IR, "mapping MSI 0x%"PRIx64":0x%"PRIx32 " -> "
2251                "0x%"PRIx64":0x%"PRIx32, origin->address, origin->data,
2252                translated->address, translated->data);
2253    return 0;
2254
2255do_not_translate:
2256    memcpy(translated, origin, sizeof(*origin));
2257    return 0;
2258}
2259
2260static int vtd_int_remap(X86IOMMUState *iommu, MSIMessage *src,
2261                         MSIMessage *dst, uint16_t sid)
2262{
2263    return vtd_interrupt_remap_msi(INTEL_IOMMU_DEVICE(iommu),
2264                                   src, dst, sid);
2265}
2266
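/* Reads from the interrupt address range are ignored; report success
 * without returning data.
 */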
2267static MemTxResult vtd_mem_ir_read(void *opaque, hwaddr addr,
2268                                   uint64_t *data, unsigned size,
2269                                   MemTxAttrs attrs)
2270{
2271    return MEMTX_OK;
2272}
2273
2274static MemTxResult vtd_mem_ir_write(void *opaque, hwaddr addr,
2275                                    uint64_t value, unsigned size,
2276                                    MemTxAttrs attrs)
2277{
2278    int ret = 0;
2279    MSIMessage from = {}, to = {};
2280    uint16_t sid = X86_IOMMU_SID_INVALID;
2281
2282    from.address = (uint64_t) addr + VTD_INTERRUPT_ADDR_FIRST;
2283    from.data = (uint32_t) value;
2284
2285    if (!attrs.unspecified) {
2286        /* We have explicit Source ID */
2287        sid = attrs.requester_id;
2288    }
2289
2290    ret = vtd_interrupt_remap_msi(opaque, &from, &to, sid);
2291    if (ret) {
2292        /* TODO: report error */
2293        VTD_DPRINTF(GENERAL, "interrupt remapping failed for addr 0x%"PRIx64
2294                    " data 0x%"PRIx32, from.address, from.data);
2295        /* Drop this interrupt */
2296        return MEMTX_ERROR;
2297    }
2298
2299    VTD_DPRINTF(IR, "delivering MSI 0x%"PRIx64":0x%"PRIx32
2300                " for device sid 0x%04x",
2301                to.address, to.data, sid);
2302
2303    apic_get_class()->send_msi(&to);
2304
2305    return MEMTX_OK;
2306}
2307
2308static const MemoryRegionOps vtd_mem_ir_ops = {
2309    .read_with_attrs = vtd_mem_ir_read,
2310    .write_with_attrs = vtd_mem_ir_write,
2311    .endianness = DEVICE_LITTLE_ENDIAN,
2312    .impl = {
2313        .min_access_size = 4,
2314        .max_access_size = 4,
2315    },
2316    .valid = {
2317        .min_access_size = 4,
2318        .max_access_size = 4,
2319    },
2320};
2321
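/* Look up the VTDAddressSpace for (bus, devfn), creating the per-bus
 * bookkeeping and the device's IOMMU/IR memory regions on first use.
 */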
2322VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
2323{
2324    uintptr_t key = (uintptr_t)bus;
2325    VTDBus *vtd_bus = g_hash_table_lookup(s->vtd_as_by_busptr, &key);
2326    VTDAddressSpace *vtd_dev_as;
2327
2328    if (!vtd_bus) {
2329        /* No corresponding free() */
2330        vtd_bus = g_malloc0(sizeof(VTDBus) + sizeof(VTDAddressSpace *) *
2331                            X86_IOMMU_PCI_DEVFN_MAX);
2332        vtd_bus->bus = bus;
2333        key = (uintptr_t)bus;
2334        g_hash_table_insert(s->vtd_as_by_busptr, &key, vtd_bus);
2335    }
2336
2337    vtd_dev_as = vtd_bus->dev_as[devfn];
2338
2339    if (!vtd_dev_as) {
2340        vtd_bus->dev_as[devfn] = vtd_dev_as = g_malloc0(sizeof(VTDAddressSpace));
2341
2342        vtd_dev_as->bus = bus;
2343        vtd_dev_as->devfn = (uint8_t)devfn;
2344        vtd_dev_as->iommu_state = s;
2345        vtd_dev_as->context_cache_entry.context_cache_gen = 0;
2346        memory_region_init_iommu(&vtd_dev_as->iommu, OBJECT(s),
2347                                 &s->iommu_ops, "intel_iommu", UINT64_MAX);
2348        memory_region_init_io(&vtd_dev_as->iommu_ir, OBJECT(s),
2349                              &vtd_mem_ir_ops, s, "intel_iommu_ir",
2350                              VTD_INTERRUPT_ADDR_SIZE);
2351        memory_region_add_subregion(&vtd_dev_as->iommu, VTD_INTERRUPT_ADDR_FIRST,
2352                                    &vtd_dev_as->iommu_ir);
2353        address_space_init(&vtd_dev_as->as,
2354                           &vtd_dev_as->iommu, "intel_iommu");
2355    }
2356    return vtd_dev_as;
2357}
2358
2359/* Do the initialization. It is also called on reset, so take care when
2360 * adding new initialization code.
2361 */
2362static void vtd_init(IntelIOMMUState *s)
2363{
2364    X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
2365
2366    memset(s->csr, 0, DMAR_REG_SIZE);
2367    memset(s->wmask, 0, DMAR_REG_SIZE);
2368    memset(s->w1cmask, 0, DMAR_REG_SIZE);
2369    memset(s->womask, 0, DMAR_REG_SIZE);
2370
2371    s->iommu_ops.translate = vtd_iommu_translate;
2372    s->iommu_ops.notify_flag_changed = vtd_iommu_notify_flag_changed;
2373    s->root = 0;
2374    s->root_extended = false;
2375    s->dmar_enabled = false;
2376    s->iq_head = 0;
2377    s->iq_tail = 0;
2378    s->iq = 0;
2379    s->iq_size = 0;
2380    s->qi_enabled = false;
2381    s->iq_last_desc_type = VTD_INV_DESC_NONE;
2382    s->next_frcd_reg = 0;
2383    s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MGAW |
2384             VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS;
2385    s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
2386
2387    if (x86_iommu->intr_supported) {
2388        s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV;
2389        if (s->intr_eim == ON_OFF_AUTO_ON) {
2390            s->ecap |= VTD_ECAP_EIM;
2391        }
2392        assert(s->intr_eim != ON_OFF_AUTO_AUTO);
2393    }
2394
2395    vtd_reset_context_cache(s);
2396    vtd_reset_iotlb(s);
2397
2398    /* Define registers with default values and bit semantics */
2399    vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0);
2400    vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0);
2401    vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0);
2402    vtd_define_long(s, DMAR_GCMD_REG, 0, 0xff800000UL, 0);
2403    vtd_define_long_wo(s, DMAR_GCMD_REG, 0xff800000UL);
2404    vtd_define_long(s, DMAR_GSTS_REG, 0, 0, 0);
2405    vtd_define_quad(s, DMAR_RTADDR_REG, 0, 0xfffffffffffff000ULL, 0);
2406    vtd_define_quad(s, DMAR_CCMD_REG, 0, 0xe0000003ffffffffULL, 0);
2407    vtd_define_quad_wo(s, DMAR_CCMD_REG, 0x3ffff0000ULL);
2408
2409    /* Advanced Fault Logging not supported */
2410    vtd_define_long(s, DMAR_FSTS_REG, 0, 0, 0x11UL);
2411    vtd_define_long(s, DMAR_FECTL_REG, 0x80000000UL, 0x80000000UL, 0);
2412    vtd_define_long(s, DMAR_FEDATA_REG, 0, 0x0000ffffUL, 0);
2413    vtd_define_long(s, DMAR_FEADDR_REG, 0, 0xfffffffcUL, 0);
2414
2415    /* Treated as RsvdZ when EIM in ECAP_REG is not supported
2416     * vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0xffffffffUL, 0);
2417     */
2418    vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0, 0);
2419
2420    /* Treated as RO for implementations that report the PLMR and PHMR
2421     * fields as Clear in CAP_REG.
2422     * vtd_define_long(s, DMAR_PMEN_REG, 0, 0x80000000UL, 0);
2423     */
2424    vtd_define_long(s, DMAR_PMEN_REG, 0, 0, 0);
2425
2426    vtd_define_quad(s, DMAR_IQH_REG, 0, 0, 0);
2427    vtd_define_quad(s, DMAR_IQT_REG, 0, 0x7fff0ULL, 0);
2428    vtd_define_quad(s, DMAR_IQA_REG, 0, 0xfffffffffffff007ULL, 0);
2429    vtd_define_long(s, DMAR_ICS_REG, 0, 0, 0x1UL);
2430    vtd_define_long(s, DMAR_IECTL_REG, 0x80000000UL, 0x80000000UL, 0);
2431    vtd_define_long(s, DMAR_IEDATA_REG, 0, 0xffffffffUL, 0);
2432    vtd_define_long(s, DMAR_IEADDR_REG, 0, 0xfffffffcUL, 0);
2433    /* Treated as RsvdZ when EIM in ECAP_REG is not supported */
2434    vtd_define_long(s, DMAR_IEUADDR_REG, 0, 0, 0);
2435
2436    /* IOTLB registers */
2437    vtd_define_quad(s, DMAR_IOTLB_REG, 0, 0xb003ffff00000000ULL, 0);
2438    vtd_define_quad(s, DMAR_IVA_REG, 0, 0xfffffffffffff07fULL, 0);
2439    vtd_define_quad_wo(s, DMAR_IVA_REG, 0xfffffffffffff07fULL);
2440
2441    /* Fault Recording Registers, 128-bit */
2442    vtd_define_quad(s, DMAR_FRCD_REG_0_0, 0, 0, 0);
2443    vtd_define_quad(s, DMAR_FRCD_REG_0_2, 0, 0, 0x8000000000000000ULL);
2444
2445    /*
2446     * Interrupt remapping registers.
2447     */
2448    vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff80fULL, 0);
2449}
2450
2451/* Do not reset the address spaces on reset: devices keep using the address
2452 * space they got at first (they will not ask the bus again).
2453 */
2454static void vtd_reset(DeviceState *dev)
2455{
2456    IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev);
2457
2458    VTD_DPRINTF(GENERAL, "");
2459    vtd_init(s);
2460}
2461
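/* pci_setup_iommu() callback: return the DMA address space that the
 * device at (bus, devfn) should use.
 */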
2462static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
2463{
2464    IntelIOMMUState *s = opaque;
2465    VTDAddressSpace *vtd_as;
2466
2467    assert(0 <= devfn && devfn < X86_IOMMU_PCI_DEVFN_MAX);
2468
2469    vtd_as = vtd_find_add_as(s, bus, devfn);
2470    return &vtd_as->as;
2471}
2472
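/* Validate the interrupt remapping configuration against the irqchip
 * setup and resolve eim=auto to on or off.
 */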
2473static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
2474{
2475    X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
2476
2477    /* Currently Intel IOMMU IR only supports "kernel-irqchip={off|split}" */
2478    if (x86_iommu->intr_supported && kvm_irqchip_in_kernel() &&
2479        !kvm_irqchip_is_split()) {
2480        error_setg(errp, "Intel Interrupt Remapping cannot work with "
2481                         "kernel-irqchip=on, please use 'split|off'.");
2482        return false;
2483    }
2484    if (s->intr_eim == ON_OFF_AUTO_ON && !x86_iommu->intr_supported) {
2485        error_setg(errp, "eim=on cannot be selected without intremap=on");
2486        return false;
2487    }
2488
2489    if (s->intr_eim == ON_OFF_AUTO_AUTO) {
2490        s->intr_eim = (kvm_irqchip_in_kernel() || s->buggy_eim)
2491                      && x86_iommu->intr_supported ?
2492                                              ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
2493    }
2494    if (s->intr_eim == ON_OFF_AUTO_ON && !s->buggy_eim) {
2495        if (!kvm_irqchip_in_kernel()) {
2496            error_setg(errp, "eim=on requires accel=kvm,kernel-irqchip=split");
2497            return false;
2498        }
2499        if (!kvm_enable_x2apic()) {
2500            error_setg(errp, "eim=on requires support on the KVM side "
2501                             "(X2APIC_API, first shipped in v4.7)");
2502            return false;
2503        }
2504    }
2505
2506    return true;
2507}
2508
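/* Realize: map the register set at the Q35-defined address and hook the
 * IOMMU into the DMA path of the root PCI bus.
 */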
2509static void vtd_realize(DeviceState *dev, Error **errp)
2510{
2511    PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
2512    PCIBus *bus = pcms->bus;
2513    IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev);
2514    X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(dev);
2515
2516    VTD_DPRINTF(GENERAL, "");
2517    x86_iommu->type = TYPE_INTEL;
2518
2519    if (!vtd_decide_config(s, errp)) {
2520        return;
2521    }
2522
2523    memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num));
2524    memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s,
2525                          "intel_iommu", DMAR_REG_SIZE);
2526    sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->csrmem);
2527    /* No corresponding destroy */
2528    s->iotlb = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal,
2529                                     g_free, g_free);
2530    s->vtd_as_by_busptr = g_hash_table_new_full(vtd_uint64_hash,
2531                                                vtd_uint64_equal, g_free, g_free);
2532    vtd_init(s);
2533    sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, Q35_HOST_BRIDGE_IOMMU_ADDR);
2534    pci_setup_iommu(bus, vtd_host_dma_iommu, dev);
2535    /* Pseudo address space under root PCI bus. */
2536    pcms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC);
2537}
2538
2539static void vtd_class_init(ObjectClass *klass, void *data)
2540{
2541    DeviceClass *dc = DEVICE_CLASS(klass);
2542    X86IOMMUClass *x86_class = X86_IOMMU_CLASS(klass);
2543
2544    dc->reset = vtd_reset;
2545    dc->vmsd = &vtd_vmstate;
2546    dc->props = vtd_properties;
2547    dc->hotpluggable = false;
2548    x86_class->realize = vtd_realize;
2549    x86_class->int_remap = vtd_int_remap;
2550}
2551
2552static const TypeInfo vtd_info = {
2553    .name          = TYPE_INTEL_IOMMU_DEVICE,
2554    .parent        = TYPE_X86_IOMMU_DEVICE,
2555    .instance_size = sizeof(IntelIOMMUState),
2556    .class_init    = vtd_class_init,
2557};
2558
2559static void vtd_register_types(void)
2560{
2561    VTD_DPRINTF(GENERAL, "");
2562    type_register_static(&vtd_info);
2563}
2564
2565type_init(vtd_register_types)
2566