qemu/hw/i386/amd_iommu.c
<<
>>
Prefs
   1/*
   2 * QEMU emulation of AMD IOMMU (AMD-Vi)
   3 *
   4 * Copyright (C) 2011 Eduard - Gabriel Munteanu
   5 * Copyright (C) 2015, 2016 David Kiarie Kahurani
   6 *
   7 * This program is free software; you can redistribute it and/or modify
   8 * it under the terms of the GNU General Public License as published by
   9 * the Free Software Foundation; either version 2 of the License, or
  10 * (at your option) any later version.
  11
  12 * This program is distributed in the hope that it will be useful,
  13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 * GNU General Public License for more details.
  16
  17 * You should have received a copy of the GNU General Public License along
  18 * with this program; if not, see <http://www.gnu.org/licenses/>.
  19 *
  20 * Cache implementation inspired by hw/i386/intel_iommu.c
  21 */
  22#include "qemu/osdep.h"
  23#include "hw/i386/pc.h"
  24#include "hw/pci/msi.h"
  25#include "hw/pci/pci_bus.h"
  26#include "amd_iommu.h"
  27#include "qapi/error.h"
  28#include "qemu/error-report.h"
  29#include "hw/i386/apic_internal.h"
  30#include "trace.h"
  31#include "hw/i386/apic-msidef.h"
  32
  33/* used AMD-Vi MMIO registers */
/* used AMD-Vi MMIO registers */
/* names of the low MMIO register table, indexed by (offset / 8);
 * out-of-range offsets are clamped to the trailing "UNHANDLED" slot
 * by amdvi_mmio_trace() */
const char *amdvi_mmio_low[] = {
    "AMDVI_MMIO_DEVTAB_BASE",
    "AMDVI_MMIO_CMDBUF_BASE",
    "AMDVI_MMIO_EVTLOG_BASE",
    "AMDVI_MMIO_CONTROL",
    "AMDVI_MMIO_EXCL_BASE",
    "AMDVI_MMIO_EXCL_LIMIT",
    "AMDVI_MMIO_EXT_FEATURES",
    "AMDVI_MMIO_PPR_BASE",
    "UNHANDLED"
};
/* names of the high MMIO register table (offsets with bit 0x2000 set),
 * indexed by (offset & ~0x2000) / 8; clamped to "UNHANDLED" as above */
const char *amdvi_mmio_high[] = {
    "AMDVI_MMIO_COMMAND_HEAD",
    "AMDVI_MMIO_COMMAND_TAIL",
    "AMDVI_MMIO_EVTLOG_HEAD",
    "AMDVI_MMIO_EVTLOG_TAIL",
    "AMDVI_MMIO_STATUS",
    "AMDVI_MMIO_PPR_HEAD",
    "AMDVI_MMIO_PPR_TAIL",
    "UNHANDLED"
};
  55
/*
 * Per-device translation context: identifies the device by bus/devfn and
 * carries its translation and interrupt-remapping memory regions.
 */
struct AMDVIAddressSpace {
    uint8_t bus_num;            /* bus number                           */
    uint8_t devfn;              /* device function                      */
    AMDVIState *iommu_state;    /* AMDVI - one per machine              */
    MemoryRegion root;          /* AMDVI Root memory map region */
    IOMMUMemoryRegion iommu;    /* Device's address translation region  */
    MemoryRegion iommu_ir;      /* Device's interrupt remapping region  */
    AddressSpace as;            /* device's corresponding address space */
};
  65
/* AMDVI cache entry
 * Stored in s->iotlb keyed by (devid << AMDVI_DEVID_SHIFT) | gfn
 * (see amdvi_iotlb_lookup / amdvi_update_iotlb).
 */
typedef struct AMDVIIOTLBEntry {
    uint16_t domid;             /* assigned domain id  */
    uint16_t devid;             /* device owning entry */
    uint64_t perms;             /* access permissions  */
    uint64_t translated_addr;   /* translated address  */
    uint64_t page_mask;         /* physical page size  */
} AMDVIIOTLBEntry;
  74
  75/* configure MMIO registers at startup/reset */
  76static void amdvi_set_quad(AMDVIState *s, hwaddr addr, uint64_t val,
  77                           uint64_t romask, uint64_t w1cmask)
  78{
  79    stq_le_p(&s->mmior[addr], val);
  80    stq_le_p(&s->romask[addr], romask);
  81    stq_le_p(&s->w1cmask[addr], w1cmask);
  82}
  83
/* 16-bit read from the little-endian MMIO backing store */
static uint16_t amdvi_readw(AMDVIState *s, hwaddr addr)
{
    return lduw_le_p(&s->mmior[addr]);
}
  88
/* 32-bit read from the little-endian MMIO backing store */
static uint32_t amdvi_readl(AMDVIState *s, hwaddr addr)
{
    return ldl_le_p(&s->mmior[addr]);
}
  93
/* 64-bit read from the little-endian MMIO backing store */
static uint64_t amdvi_readq(AMDVIState *s, hwaddr addr)
{
    return ldq_le_p(&s->mmior[addr]);
}
  98
/* internal write */
/* store a 64-bit value, bypassing the RO/W1C masks.
 * NOTE(review): the argument order is (value, address) -- the reverse of
 * every amdvi_write{w,l,q} helper -- which makes call sites easy to get
 * wrong; double-check all callers. */
static void amdvi_writeq_raw(AMDVIState *s, uint64_t val, hwaddr addr)
{
    stq_le_p(&s->mmior[addr], val);
}
 104
 105/* external write */
 106static void amdvi_writew(AMDVIState *s, hwaddr addr, uint16_t val)
 107{
 108    uint16_t romask = lduw_le_p(&s->romask[addr]);
 109    uint16_t w1cmask = lduw_le_p(&s->w1cmask[addr]);
 110    uint16_t oldval = lduw_le_p(&s->mmior[addr]);
 111    stw_le_p(&s->mmior[addr],
 112            ((oldval & romask) | (val & ~romask)) & ~(val & w1cmask));
 113}
 114
 115static void amdvi_writel(AMDVIState *s, hwaddr addr, uint32_t val)
 116{
 117    uint32_t romask = ldl_le_p(&s->romask[addr]);
 118    uint32_t w1cmask = ldl_le_p(&s->w1cmask[addr]);
 119    uint32_t oldval = ldl_le_p(&s->mmior[addr]);
 120    stl_le_p(&s->mmior[addr],
 121            ((oldval & romask) | (val & ~romask)) & ~(val & w1cmask));
 122}
 123
 124static void amdvi_writeq(AMDVIState *s, hwaddr addr, uint64_t val)
 125{
 126    uint64_t romask = ldq_le_p(&s->romask[addr]);
 127    uint64_t w1cmask = ldq_le_p(&s->w1cmask[addr]);
 128    uint32_t oldval = ldq_le_p(&s->mmior[addr]);
 129    stq_le_p(&s->mmior[addr],
 130            ((oldval & romask) | (val & ~romask)) & ~(val & w1cmask));
 131}
 132
 133/* OR a 64-bit register with a 64-bit value */
 134static bool amdvi_test_mask(AMDVIState *s, hwaddr addr, uint64_t val)
 135{
 136    return amdvi_readq(s, addr) | val;
 137}
 138
 139/* OR a 64-bit register with a 64-bit value storing result in the register */
 140static void amdvi_assign_orq(AMDVIState *s, hwaddr addr, uint64_t val)
 141{
 142    amdvi_writeq_raw(s, addr, amdvi_readq(s, addr) | val);
 143}
 144
 145/* AND a 64-bit register with a 64-bit value storing result in the register */
 146static void amdvi_assign_andq(AMDVIState *s, hwaddr addr, uint64_t val)
 147{
 148   amdvi_writeq_raw(s, addr, amdvi_readq(s, addr) & val);
 149}
 150
/*
 * Deliver the IOMMU's MSI to the guest, if the guest enabled MSI on the
 * IOMMU's PCI function: the MSI data word is stored at the MSI address
 * with the device's requester ID in the transaction attributes.
 */
static void amdvi_generate_msi_interrupt(AMDVIState *s)
{
    MSIMessage msg = {};
    MemTxAttrs attrs = {
        .requester_id = pci_requester_id(&s->pci.dev)
    };

    if (msi_enabled(&s->pci.dev)) {
        msg = msi_get_message(&s->pci.dev, 0);
        address_space_stl_le(&address_space_memory, msg.address, msg.data,
                             attrs, NULL);
    }
}
 164
 165static void amdvi_log_event(AMDVIState *s, uint64_t *evt)
 166{
 167    /* event logging not enabled */
 168    if (!s->evtlog_enabled || amdvi_test_mask(s, AMDVI_MMIO_STATUS,
 169        AMDVI_MMIO_STATUS_EVT_OVF)) {
 170        return;
 171    }
 172
 173    /* event log buffer full */
 174    if (s->evtlog_tail >= s->evtlog_len) {
 175        amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_EVT_OVF);
 176        /* generate interrupt */
 177        amdvi_generate_msi_interrupt(s);
 178        return;
 179    }
 180
 181    if (dma_memory_write(&address_space_memory, s->evtlog + s->evtlog_tail,
 182        &evt, AMDVI_EVENT_LEN)) {
 183        trace_amdvi_evntlog_fail(s->evtlog, s->evtlog_tail);
 184    }
 185
 186    s->evtlog_tail += AMDVI_EVENT_LEN;
 187    amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_COMP_INT);
 188    amdvi_generate_msi_interrupt(s);
 189}
 190
/*
 * Write @length bits of @value into an event record at absolute bit
 * offset @start.
 * NOTE(review): the mask is built from the absolute start position, so a
 * span crossing a 64-bit word boundary (e.g. start=63, length=64 as used
 * by amdvi_encode_event) is truncated to the bits that fit in
 * buffer[start / 64] -- confirm against the intended event layout.
 */
static void amdvi_setevent_bits(uint64_t *buffer, uint64_t value, int start,
                                int length)
{
    int index = start / 64, bitpos = start % 64;
    uint64_t mask = MAKE_64BIT_MASK(start, length);
    buffer[index] &= ~mask;
    buffer[index] |= (value << bitpos) & mask;
}
/*
 * AMDVi event structure
 *    0:15   -> DeviceID
 *    55:63  -> event type + miscellaneous info
 *    63:127 -> related address
 */
static void amdvi_encode_event(uint64_t *evt, uint16_t devid, uint64_t addr,
                               uint16_t info)
{
    /* @evt must point to a 4-quadword record (callers declare evt[4]);
     * fields not written here retain whatever the caller left in them */
    amdvi_setevent_bits(evt, devid, 0, 16);
    amdvi_setevent_bits(evt, info, 55, 8);
    amdvi_setevent_bits(evt, addr, 63, 64);
}
 212/* log an error encountered during a page walk
 213 *
 214 * @addr: virtual address in translation request
 215 */
 216static void amdvi_page_fault(AMDVIState *s, uint16_t devid,
 217                             hwaddr addr, uint16_t info)
 218{
 219    uint64_t evt[4];
 220
 221    info |= AMDVI_EVENT_IOPF_I | AMDVI_EVENT_IOPF;
 222    amdvi_encode_event(evt, devid, addr, info);
 223    amdvi_log_event(s, evt);
 224    pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS,
 225            PCI_STATUS_SIG_TARGET_ABORT);
 226}
 227/*
 228 * log a master abort accessing device table
 229 *  @devtab : address of device table entry
 230 *  @info : error flags
 231 */
 232static void amdvi_log_devtab_error(AMDVIState *s, uint16_t devid,
 233                                   hwaddr devtab, uint16_t info)
 234{
 235    uint64_t evt[4];
 236
 237    info |= AMDVI_EVENT_DEV_TAB_HW_ERROR;
 238
 239    amdvi_encode_event(evt, devid, devtab, info);
 240    amdvi_log_event(s, evt);
 241    pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS,
 242            PCI_STATUS_SIG_TARGET_ABORT);
 243}
 244/* log an event trying to access command buffer
 245 *   @addr : address that couldn't be accessed
 246 */
 247static void amdvi_log_command_error(AMDVIState *s, hwaddr addr)
 248{
 249    uint64_t evt[4], info = AMDVI_EVENT_COMMAND_HW_ERROR;
 250
 251    amdvi_encode_event(evt, 0, addr, info);
 252    amdvi_log_event(s, evt);
 253    pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS,
 254            PCI_STATUS_SIG_TARGET_ABORT);
 255}
 256/* log an illegal comand event
 257 *   @addr : address of illegal command
 258 */
 259static void amdvi_log_illegalcom_error(AMDVIState *s, uint16_t info,
 260                                       hwaddr addr)
 261{
 262    uint64_t evt[4];
 263
 264    info |= AMDVI_EVENT_ILLEGAL_COMMAND_ERROR;
 265    amdvi_encode_event(evt, 0, addr, info);
 266    amdvi_log_event(s, evt);
 267}
 268/* log an error accessing device table
 269 *
 270 *  @devid : device owning the table entry
 271 *  @devtab : address of device table entry
 272 *  @info : error flags
 273 */
 274static void amdvi_log_illegaldevtab_error(AMDVIState *s, uint16_t devid,
 275                                          hwaddr addr, uint16_t info)
 276{
 277    uint64_t evt[4];
 278
 279    info |= AMDVI_EVENT_ILLEGAL_DEVTAB_ENTRY;
 280    amdvi_encode_event(evt, devid, addr, info);
 281    amdvi_log_event(s, evt);
 282}
 283/* log an error accessing a PTE entry
 284 * @addr : address that couldn't be accessed
 285 */
 286static void amdvi_log_pagetab_error(AMDVIState *s, uint16_t devid,
 287                                    hwaddr addr, uint16_t info)
 288{
 289    uint64_t evt[4];
 290
 291    info |= AMDVI_EVENT_PAGE_TAB_HW_ERROR;
 292    amdvi_encode_event(evt, devid, addr, info);
 293    amdvi_log_event(s, evt);
 294    pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS,
 295             PCI_STATUS_SIG_TARGET_ABORT);
 296}
 297
 298static gboolean amdvi_uint64_equal(gconstpointer v1, gconstpointer v2)
 299{
 300    return *((const uint64_t *)v1) == *((const uint64_t *)v2);
 301}
 302
 303static guint amdvi_uint64_hash(gconstpointer v)
 304{
 305    return (guint)*(const uint64_t *)v;
 306}
 307
 308static AMDVIIOTLBEntry *amdvi_iotlb_lookup(AMDVIState *s, hwaddr addr,
 309                                           uint64_t devid)
 310{
 311    uint64_t key = (addr >> AMDVI_PAGE_SHIFT_4K) |
 312                   ((uint64_t)(devid) << AMDVI_DEVID_SHIFT);
 313    return g_hash_table_lookup(s->iotlb, &key);
 314}
 315
/* drop every cached translation (keys/values are freed by the table's
 * destroy notifiers) */
static void amdvi_iotlb_reset(AMDVIState *s)
{
    assert(s->iotlb);
    trace_amdvi_iotlb_reset();
    g_hash_table_remove_all(s->iotlb);
}
 322
 323static gboolean amdvi_iotlb_remove_by_devid(gpointer key, gpointer value,
 324                                            gpointer user_data)
 325{
 326    AMDVIIOTLBEntry *entry = (AMDVIIOTLBEntry *)value;
 327    uint16_t devid = *(uint16_t *)user_data;
 328    return entry->devid == devid;
 329}
 330
 331static void amdvi_iotlb_remove_page(AMDVIState *s, hwaddr addr,
 332                                    uint64_t devid)
 333{
 334    uint64_t key = (addr >> AMDVI_PAGE_SHIFT_4K) |
 335                   ((uint64_t)(devid) << AMDVI_DEVID_SHIFT);
 336    g_hash_table_remove(s->iotlb, &key);
 337}
 338
 339static void amdvi_update_iotlb(AMDVIState *s, uint16_t devid,
 340                               uint64_t gpa, IOMMUTLBEntry to_cache,
 341                               uint16_t domid)
 342{
 343    AMDVIIOTLBEntry *entry = g_new(AMDVIIOTLBEntry, 1);
 344    uint64_t *key = g_new(uint64_t, 1);
 345    uint64_t gfn = gpa >> AMDVI_PAGE_SHIFT_4K;
 346
 347    /* don't cache erroneous translations */
 348    if (to_cache.perm != IOMMU_NONE) {
 349        trace_amdvi_cache_update(domid, PCI_BUS_NUM(devid), PCI_SLOT(devid),
 350                PCI_FUNC(devid), gpa, to_cache.translated_addr);
 351
 352        if (g_hash_table_size(s->iotlb) >= AMDVI_IOTLB_MAX_SIZE) {
 353            amdvi_iotlb_reset(s);
 354        }
 355
 356        entry->domid = domid;
 357        entry->perms = to_cache.perm;
 358        entry->translated_addr = to_cache.translated_addr;
 359        entry->page_mask = to_cache.addr_mask;
 360        *key = gfn | ((uint64_t)(devid) << AMDVI_DEVID_SHIFT);
 361        g_hash_table_replace(s->iotlb, key, entry);
 362    }
 363}
 364
 365static void amdvi_completion_wait(AMDVIState *s, uint64_t *cmd)
 366{
 367    /* pad the last 3 bits */
 368    hwaddr addr = cpu_to_le64(extract64(cmd[0], 3, 49)) << 3;
 369    uint64_t data = cpu_to_le64(cmd[1]);
 370
 371    if (extract64(cmd[0], 51, 8)) {
 372        amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
 373                                   s->cmdbuf + s->cmdbuf_head);
 374    }
 375    if (extract64(cmd[0], 0, 1)) {
 376        if (dma_memory_write(&address_space_memory, addr, &data,
 377            AMDVI_COMPLETION_DATA_SIZE)) {
 378            trace_amdvi_completion_wait_fail(addr);
 379        }
 380    }
 381    /* set completion interrupt */
 382    if (extract64(cmd[0], 1, 1)) {
 383        amdvi_test_mask(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_COMP_INT);
 384        /* generate interrupt */
 385        amdvi_generate_msi_interrupt(s);
 386    }
 387    trace_amdvi_completion_wait(addr, data);
 388}
 389
/* log error without aborting since linux seems to be using reserved bits */
static void amdvi_inval_devtab_entry(AMDVIState *s, uint64_t *cmd)
{
    /* DeviceID is in cmd[0] bits 15:0 */
    uint16_t devid = cpu_to_le16((uint16_t)extract64(cmd[0], 0, 16));

    /* This command should invalidate internal caches of which there isn't */
    /* NOTE(review): the reserved-bit check starts at bit 15, overlapping
     * the top bit of the DeviceID field just extracted above -- verify
     * against the AMD-Vi spec whether it should start at bit 16. */
    if (extract64(cmd[0], 15, 16) || cmd[1]) {
        amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
                                   s->cmdbuf + s->cmdbuf_head);
    }
    trace_amdvi_devtab_inval(PCI_BUS_NUM(devid), PCI_SLOT(devid),
                             PCI_FUNC(devid));
}
 403
 404static void amdvi_complete_ppr(AMDVIState *s, uint64_t *cmd)
 405{
 406    if (extract64(cmd[0], 15, 16) ||  extract64(cmd[0], 19, 8) ||
 407        extract64(cmd[1], 0, 2) || extract64(cmd[1], 3, 29)
 408        || extract64(cmd[1], 47, 16)) {
 409        amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
 410                                   s->cmdbuf + s->cmdbuf_head);
 411    }
 412    trace_amdvi_ppr_exec();
 413}
 414
 415static void amdvi_inval_all(AMDVIState *s, uint64_t *cmd)
 416{
 417    if (extract64(cmd[0], 0, 60) || cmd[1]) {
 418        amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
 419                                   s->cmdbuf + s->cmdbuf_head);
 420    }
 421
 422    amdvi_iotlb_reset(s);
 423    trace_amdvi_all_inval();
 424}
 425
 426static gboolean amdvi_iotlb_remove_by_domid(gpointer key, gpointer value,
 427                                            gpointer user_data)
 428{
 429    AMDVIIOTLBEntry *entry = (AMDVIIOTLBEntry *)value;
 430    uint16_t domid = *(uint16_t *)user_data;
 431    return entry->domid == domid;
 432}
 433
 434/* we don't have devid - we can't remove pages by address */
 435static void amdvi_inval_pages(AMDVIState *s, uint64_t *cmd)
 436{
 437    uint16_t domid = cpu_to_le16((uint16_t)extract64(cmd[0], 32, 16));
 438
 439    if (extract64(cmd[0], 20, 12) || extract64(cmd[0], 16, 12) ||
 440        extract64(cmd[0], 3, 10)) {
 441        amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
 442                                   s->cmdbuf + s->cmdbuf_head);
 443    }
 444
 445    g_hash_table_foreach_remove(s->iotlb, amdvi_iotlb_remove_by_domid,
 446                                &domid);
 447    trace_amdvi_pages_inval(domid);
 448}
 449
 450static void amdvi_prefetch_pages(AMDVIState *s, uint64_t *cmd)
 451{
 452    if (extract64(cmd[0], 16, 8) || extract64(cmd[0], 20, 8) ||
 453        extract64(cmd[1], 1, 1) || extract64(cmd[1], 3, 1) ||
 454        extract64(cmd[1], 5, 7)) {
 455        amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
 456                                   s->cmdbuf + s->cmdbuf_head);
 457    }
 458
 459    trace_amdvi_prefetch_pages();
 460}
 461
 462static void amdvi_inval_inttable(AMDVIState *s, uint64_t *cmd)
 463{
 464    if (extract64(cmd[0], 16, 16) || cmd[1]) {
 465        amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
 466                                   s->cmdbuf + s->cmdbuf_head);
 467        return;
 468    }
 469
 470    trace_amdvi_intr_inval();
 471}
 472
/* FIXME: Try to work with the specified size instead of all the pages
 * when the S bit is on
 */
static void iommu_inval_iotlb(AMDVIState *s, uint64_t *cmd)
{

    /* DeviceID from cmd[0] bits 15:0 */
    uint16_t devid = extract64(cmd[0], 0, 16);
    /* reserved bits must be clear */
    if (extract64(cmd[1], 1, 1) || extract64(cmd[1], 3, 9)) {
        amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
                                   s->cmdbuf + s->cmdbuf_head);
        return;
    }

    /* S bit set: drop every cached translation for this device */
    if (extract64(cmd[1], 0, 1)) {
        g_hash_table_foreach_remove(s->iotlb, amdvi_iotlb_remove_by_devid,
                                    &devid);
    } else {
        /* single page: address is in cmd[1] bits 63:12
         * NOTE(review): the devid passed here is re-read from cmd[1]
         * bits 15:0 instead of the 'devid' extracted from cmd[0] above --
         * confirm which field the spec mandates. */
        amdvi_iotlb_remove_page(s, cpu_to_le64(extract64(cmd[1], 12, 52)) << 12,
                                cpu_to_le16(extract64(cmd[1], 0, 16)));
    }
    trace_amdvi_iotlb_inval();
}
 495
 496/* not honouring reserved bits is regarded as an illegal command */
 497static void amdvi_cmdbuf_exec(AMDVIState *s)
 498{
 499    uint64_t cmd[2];
 500
 501    if (dma_memory_read(&address_space_memory, s->cmdbuf + s->cmdbuf_head,
 502        cmd, AMDVI_COMMAND_SIZE)) {
 503        trace_amdvi_command_read_fail(s->cmdbuf, s->cmdbuf_head);
 504        amdvi_log_command_error(s, s->cmdbuf + s->cmdbuf_head);
 505        return;
 506    }
 507
 508    switch (extract64(cmd[0], 60, 4)) {
 509    case AMDVI_CMD_COMPLETION_WAIT:
 510        amdvi_completion_wait(s, cmd);
 511        break;
 512    case AMDVI_CMD_INVAL_DEVTAB_ENTRY:
 513        amdvi_inval_devtab_entry(s, cmd);
 514        break;
 515    case AMDVI_CMD_INVAL_AMDVI_PAGES:
 516        amdvi_inval_pages(s, cmd);
 517        break;
 518    case AMDVI_CMD_INVAL_IOTLB_PAGES:
 519        iommu_inval_iotlb(s, cmd);
 520        break;
 521    case AMDVI_CMD_INVAL_INTR_TABLE:
 522        amdvi_inval_inttable(s, cmd);
 523        break;
 524    case AMDVI_CMD_PREFETCH_AMDVI_PAGES:
 525        amdvi_prefetch_pages(s, cmd);
 526        break;
 527    case AMDVI_CMD_COMPLETE_PPR_REQUEST:
 528        amdvi_complete_ppr(s, cmd);
 529        break;
 530    case AMDVI_CMD_INVAL_AMDVI_ALL:
 531        amdvi_inval_all(s, cmd);
 532        break;
 533    default:
 534        trace_amdvi_unhandled_command(extract64(cmd[1], 60, 4));
 535        /* log illegal command */
 536        amdvi_log_illegalcom_error(s, extract64(cmd[1], 60, 4),
 537                                   s->cmdbuf + s->cmdbuf_head);
 538    }
 539}
 540
/* drain the command buffer: execute every command between head and tail,
 * publishing the advancing head pointer to the guest-visible register */
static void amdvi_cmdbuf_run(AMDVIState *s)
{
    if (!s->cmdbuf_enabled) {
        trace_amdvi_command_error(amdvi_readq(s, AMDVI_MMIO_CONTROL));
        return;
    }

    /* check if there is work to do. */
    while (s->cmdbuf_head != s->cmdbuf_tail) {
        trace_amdvi_command_exec(s->cmdbuf_head, s->cmdbuf_tail, s->cmdbuf);
        amdvi_cmdbuf_exec(s);
        s->cmdbuf_head += AMDVI_COMMAND_SIZE;
        /* amdvi_writeq_raw() takes (value, address) in that order */
        amdvi_writeq_raw(s, s->cmdbuf_head, AMDVI_MMIO_COMMAND_HEAD);

        /* wrap head pointer */
        if (s->cmdbuf_head >= s->cmdbuf_len * AMDVI_COMMAND_SIZE) {
            s->cmdbuf_head = 0;
        }
    }
}
 561
 562static void amdvi_mmio_trace(hwaddr addr, unsigned size)
 563{
 564    uint8_t index = (addr & ~0x2000) / 8;
 565
 566    if ((addr & 0x2000)) {
 567        /* high table */
 568        index = index >= AMDVI_MMIO_REGS_HIGH ? AMDVI_MMIO_REGS_HIGH : index;
 569        trace_amdvi_mmio_read(amdvi_mmio_high[index], addr, size, addr & ~0x07);
 570    } else {
 571        index = index >= AMDVI_MMIO_REGS_LOW ? AMDVI_MMIO_REGS_LOW : index;
 572        trace_amdvi_mmio_read(amdvi_mmio_low[index], addr, size, addr & ~0x07);
 573    }
 574}
 575
 576static uint64_t amdvi_mmio_read(void *opaque, hwaddr addr, unsigned size)
 577{
 578    AMDVIState *s = opaque;
 579
 580    uint64_t val = -1;
 581    if (addr + size > AMDVI_MMIO_SIZE) {
 582        trace_amdvi_mmio_read_invalid(AMDVI_MMIO_SIZE, addr, size);
 583        return (uint64_t)-1;
 584    }
 585
 586    if (size == 2) {
 587        val = amdvi_readw(s, addr);
 588    } else if (size == 4) {
 589        val = amdvi_readl(s, addr);
 590    } else if (size == 8) {
 591        val = amdvi_readq(s, addr);
 592    }
 593    amdvi_mmio_trace(addr, size);
 594
 595    return val;
 596}
 597
/* recompute cached enable flags from the CONTROL register and mirror the
 * command-buffer/event-log running state into the STATUS register */
static void amdvi_handle_control_write(AMDVIState *s)
{
    unsigned long control = amdvi_readq(s, AMDVI_MMIO_CONTROL);
    s->enabled = !!(control & AMDVI_MMIO_CONTROL_AMDVIEN);

    s->ats_enabled = !!(control & AMDVI_MMIO_CONTROL_HTTUNEN);
    /* event logging and command processing require the IOMMU itself on */
    s->evtlog_enabled = s->enabled && !!(control &
                        AMDVI_MMIO_CONTROL_EVENTLOGEN);

    s->evtlog_intr = !!(control & AMDVI_MMIO_CONTROL_EVENTINTEN);
    s->completion_wait_intr = !!(control & AMDVI_MMIO_CONTROL_COMWAITINTEN);
    s->cmdbuf_enabled = s->enabled && !!(control &
                        AMDVI_MMIO_CONTROL_CMDBUFLEN);
    s->ga_enabled = !!(control & AMDVI_MMIO_CONTROL_GAEN);

    /* update the flags depending on the control register */
    if (s->cmdbuf_enabled) {
        amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_CMDBUF_RUN);
    } else {
        amdvi_assign_andq(s, AMDVI_MMIO_STATUS, ~AMDVI_MMIO_STATUS_CMDBUF_RUN);
    }
    if (s->evtlog_enabled) {
        amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_EVT_RUN);
    } else {
        amdvi_assign_andq(s, AMDVI_MMIO_STATUS, ~AMDVI_MMIO_STATUS_EVT_RUN);
    }

    trace_amdvi_control_status(control);
    /* commands may have been queued while the buffer was disabled */
    amdvi_cmdbuf_run(s);
}
 628
 629static inline void amdvi_handle_devtab_write(AMDVIState *s)
 630
 631{
 632    uint64_t val = amdvi_readq(s, AMDVI_MMIO_DEVICE_TABLE);
 633    s->devtab = (val & AMDVI_MMIO_DEVTAB_BASE_MASK);
 634
 635    /* set device table length */
 636    s->devtab_len = ((val & AMDVI_MMIO_DEVTAB_SIZE_MASK) + 1 *
 637                    (AMDVI_MMIO_DEVTAB_SIZE_UNIT /
 638                     AMDVI_MMIO_DEVTAB_ENTRY_SIZE));
 639}
 640
 641static inline void amdvi_handle_cmdhead_write(AMDVIState *s)
 642{
 643    s->cmdbuf_head = amdvi_readq(s, AMDVI_MMIO_COMMAND_HEAD)
 644                     & AMDVI_MMIO_CMDBUF_HEAD_MASK;
 645    amdvi_cmdbuf_run(s);
 646}
 647
 648static inline void amdvi_handle_cmdbase_write(AMDVIState *s)
 649{
 650    s->cmdbuf = amdvi_readq(s, AMDVI_MMIO_COMMAND_BASE)
 651                & AMDVI_MMIO_CMDBUF_BASE_MASK;
 652    s->cmdbuf_len = 1UL << (amdvi_readq(s, AMDVI_MMIO_CMDBUF_SIZE_BYTE)
 653                    & AMDVI_MMIO_CMDBUF_SIZE_MASK);
 654    s->cmdbuf_head = s->cmdbuf_tail = 0;
 655}
 656
 657static inline void amdvi_handle_cmdtail_write(AMDVIState *s)
 658{
 659    s->cmdbuf_tail = amdvi_readq(s, AMDVI_MMIO_COMMAND_TAIL)
 660                     & AMDVI_MMIO_CMDBUF_TAIL_MASK;
 661    amdvi_cmdbuf_run(s);
 662}
 663
 664static inline void amdvi_handle_excllim_write(AMDVIState *s)
 665{
 666    uint64_t val = amdvi_readq(s, AMDVI_MMIO_EXCL_LIMIT);
 667    s->excl_limit = (val & AMDVI_MMIO_EXCL_LIMIT_MASK) |
 668                    AMDVI_MMIO_EXCL_LIMIT_LOW;
 669}
 670
 671static inline void amdvi_handle_evtbase_write(AMDVIState *s)
 672{
 673    uint64_t val = amdvi_readq(s, AMDVI_MMIO_EVENT_BASE);
 674    s->evtlog = val & AMDVI_MMIO_EVTLOG_BASE_MASK;
 675    s->evtlog_len = 1UL << (amdvi_readq(s, AMDVI_MMIO_EVTLOG_SIZE_BYTE)
 676                    & AMDVI_MMIO_EVTLOG_SIZE_MASK);
 677}
 678
 679static inline void amdvi_handle_evttail_write(AMDVIState *s)
 680{
 681    uint64_t val = amdvi_readq(s, AMDVI_MMIO_EVENT_TAIL);
 682    s->evtlog_tail = val & AMDVI_MMIO_EVTLOG_TAIL_MASK;
 683}
 684
 685static inline void amdvi_handle_evthead_write(AMDVIState *s)
 686{
 687    uint64_t val = amdvi_readq(s, AMDVI_MMIO_EVENT_HEAD);
 688    s->evtlog_head = val & AMDVI_MMIO_EVTLOG_HEAD_MASK;
 689}
 690
 691static inline void amdvi_handle_pprbase_write(AMDVIState *s)
 692{
 693    uint64_t val = amdvi_readq(s, AMDVI_MMIO_PPR_BASE);
 694    s->ppr_log = val & AMDVI_MMIO_PPRLOG_BASE_MASK;
 695    s->pprlog_len = 1UL << (amdvi_readq(s, AMDVI_MMIO_PPRLOG_SIZE_BYTE)
 696                    & AMDVI_MMIO_PPRLOG_SIZE_MASK);
 697}
 698
 699static inline void amdvi_handle_pprhead_write(AMDVIState *s)
 700{
 701    uint64_t val = amdvi_readq(s, AMDVI_MMIO_PPR_HEAD);
 702    s->pprlog_head = val & AMDVI_MMIO_PPRLOG_HEAD_MASK;
 703}
 704
 705static inline void amdvi_handle_pprtail_write(AMDVIState *s)
 706{
 707    uint64_t val = amdvi_readq(s, AMDVI_MMIO_PPR_TAIL);
 708    s->pprlog_tail = val & AMDVI_MMIO_PPRLOG_TAIL_MASK;
 709}
 710
 711/* FIXME: something might go wrong if System Software writes in chunks
 712 * of one byte but linux writes in chunks of 4 bytes so currently it
 713 * works correctly with linux but will definitely be busted if software
 714 * reads/writes 8 bytes
 715 */
 716static void amdvi_mmio_reg_write(AMDVIState *s, unsigned size, uint64_t val,
 717                                 hwaddr addr)
 718{
 719    if (size == 2) {
 720        amdvi_writew(s, addr, val);
 721    } else if (size == 4) {
 722        amdvi_writel(s, addr, val);
 723    } else if (size == 8) {
 724        amdvi_writeq(s, addr, val);
 725    }
 726}
 727
/* guest write to an MMIO register: store the masked value, then run the
 * register-specific side-effect handler */
static void amdvi_mmio_write(void *opaque, hwaddr addr, uint64_t val,
                             unsigned size)
{
    AMDVIState *s = opaque;
    unsigned long offset = addr & 0x07;

    if (addr + size > AMDVI_MMIO_SIZE) {
        trace_amdvi_mmio_write("error: addr outside region: max ",
                (uint64_t)AMDVI_MMIO_SIZE, size, val, offset);
        return;
    }

    amdvi_mmio_trace(addr, size);
    /* dispatch on the 8-byte-aligned register offset */
    switch (addr & ~0x07) {
    case AMDVI_MMIO_CONTROL:
        amdvi_mmio_reg_write(s, size, val, addr);
        amdvi_handle_control_write(s);
        break;
    case AMDVI_MMIO_DEVICE_TABLE:
        amdvi_mmio_reg_write(s, size, val, addr);
       /*  set device table address
        *   This also suffers from inability to tell whether software
        *   is done writing
        */
        if (offset || (size == 8)) {
            amdvi_handle_devtab_write(s);
        }
        break;
    case AMDVI_MMIO_COMMAND_HEAD:
        amdvi_mmio_reg_write(s, size, val, addr);
        amdvi_handle_cmdhead_write(s);
        break;
    case AMDVI_MMIO_COMMAND_BASE:
        amdvi_mmio_reg_write(s, size, val, addr);
        /* FIXME - make sure System Software has finished writing incase
         * it writes in chucks less than 8 bytes in a robust way.As for
         * now, this hacks works for the linux driver
         */
        if (offset || (size == 8)) {
            amdvi_handle_cmdbase_write(s);
        }
        break;
    case AMDVI_MMIO_COMMAND_TAIL:
        amdvi_mmio_reg_write(s, size, val, addr);
        amdvi_handle_cmdtail_write(s);
        break;
    case AMDVI_MMIO_EVENT_BASE:
        amdvi_mmio_reg_write(s, size, val, addr);
        amdvi_handle_evtbase_write(s);
        break;
    case AMDVI_MMIO_EVENT_HEAD:
        amdvi_mmio_reg_write(s, size, val, addr);
        amdvi_handle_evthead_write(s);
        break;
    case AMDVI_MMIO_EVENT_TAIL:
        amdvi_mmio_reg_write(s, size, val, addr);
        amdvi_handle_evttail_write(s);
        break;
    case AMDVI_MMIO_EXCL_LIMIT:
        amdvi_mmio_reg_write(s, size, val, addr);
        amdvi_handle_excllim_write(s);
        break;
        /* PPR log base - unused for now */
    case AMDVI_MMIO_PPR_BASE:
        amdvi_mmio_reg_write(s, size, val, addr);
        amdvi_handle_pprbase_write(s);
        break;
        /* PPR log head - also unused for now */
    case AMDVI_MMIO_PPR_HEAD:
        amdvi_mmio_reg_write(s, size, val, addr);
        amdvi_handle_pprhead_write(s);
        break;
        /* PPR log tail - unused for now */
    case AMDVI_MMIO_PPR_TAIL:
        amdvi_mmio_reg_write(s, size, val, addr);
        amdvi_handle_pprtail_write(s);
        break;
    }
}
 807
 808static inline uint64_t amdvi_get_perms(uint64_t entry)
 809{
 810    return (entry & (AMDVI_DEV_PERM_READ | AMDVI_DEV_PERM_WRITE)) >>
 811           AMDVI_DEV_PERM_SHIFT;
 812}
 813
 814/* validate that reserved bits are honoured */
 815static bool amdvi_validate_dte(AMDVIState *s, uint16_t devid,
 816                               uint64_t *dte)
 817{
 818    if ((dte[0] & AMDVI_DTE_LOWER_QUAD_RESERVED)
 819        || (dte[1] & AMDVI_DTE_MIDDLE_QUAD_RESERVED)
 820        || (dte[2] & AMDVI_DTE_UPPER_QUAD_RESERVED) || dte[3]) {
 821        amdvi_log_illegaldevtab_error(s, devid,
 822                                      s->devtab +
 823                                      devid * AMDVI_DEVTAB_ENTRY_SIZE, 0);
 824        return false;
 825    }
 826
 827    return true;
 828}
 829
/* get a device table entry given the devid */
static bool amdvi_get_dte(AMDVIState *s, int devid, uint64_t *entry)
{
    uint32_t offset = devid * AMDVI_DEVTAB_ENTRY_SIZE;

    /* read the DTE from the guest device table */
    if (dma_memory_read(&address_space_memory, s->devtab + offset, entry,
        AMDVI_DEVTAB_ENTRY_SIZE)) {
        trace_amdvi_dte_get_fail(s->devtab, offset);
        /* log error accessing dte */
        amdvi_log_devtab_error(s, devid, s->devtab + offset, 0);
        return false;
    }

    /* NOTE(review): only the first of the entry's quadwords is
     * byte-swapped here, yet amdvi_validate_dte() also inspects
     * entry[1..3] -- confirm behaviour on big-endian hosts. */
    *entry = le64_to_cpu(*entry);
    if (!amdvi_validate_dte(s, devid, entry)) {
        trace_amdvi_invalid_dte(entry[0]);
        return false;
    }

    return true;
}
 851
 852/* get pte translation mode */
 853static inline uint8_t get_pte_translation_mode(uint64_t pte)
 854{
 855    return (pte >> AMDVI_DEV_MODE_RSHIFT) & AMDVI_DEV_MODE_MASK;
 856}
 857
 858static inline uint64_t pte_override_page_mask(uint64_t pte)
 859{
 860    uint8_t page_mask = 12;
 861    uint64_t addr = (pte & AMDVI_DEV_PT_ROOT_MASK) ^ AMDVI_DEV_PT_ROOT_MASK;
 862    /* find the first zero bit */
 863    while (addr & 1) {
 864        page_mask++;
 865        addr = addr >> 1;
 866    }
 867
 868    return ~((1ULL << page_mask) - 1);
 869}
 870
 871static inline uint64_t pte_get_page_mask(uint64_t oldlevel)
 872{
 873    return ~((1UL << ((oldlevel * 9) + 3)) - 1);
 874}
 875
/* fetch one page-table entry from guest memory; returns 0 after logging a
 * page-table hardware error when the DMA read fails */
static inline uint64_t amdvi_get_pte_entry(AMDVIState *s, uint64_t pte_addr,
                                          uint16_t devid)
{
    uint64_t pte;

    if (dma_memory_read(&address_space_memory, pte_addr, &pte, sizeof(pte))) {
        trace_amdvi_get_pte_hwerror(pte_addr);
        amdvi_log_pagetab_error(s, devid, pte_addr, 0);
        pte = 0;
        return pte;
    }

    /* guest page tables are little-endian */
    pte = le64_to_cpu(pte);
    return pte;
}
 891
/*
 * Walk the guest I/O page table rooted in DTE @dte and fill @ret with the
 * translation for @addr, checked against requested permissions @perms.
 * On a fault, @ret is left untouched (the caller pre-initialises it to
 * IOMMU_NONE) and a page-fault event is raised.  A DTE with TV=0 (or a
 * zero translation mode) results in an identity 4K mapping.
 */
static void amdvi_page_walk(AMDVIAddressSpace *as, uint64_t *dte,
                            IOMMUTLBEntry *ret, unsigned perms,
                            hwaddr addr)
{
    unsigned level, present, pte_perms, oldlevel;
    uint64_t pte = dte[0], pte_addr, page_mask;

    /* make sure the DTE has TV = 1 */
    if (pte & AMDVI_DEV_TRANSLATION_VALID) {
        level = get_pte_translation_mode(pte);
        /* mode 7 in the DTE itself is reserved */
        if (level >= 7) {
            trace_amdvi_mode_invalid(level, addr);
            return;
        }
        /* mode 0: no paging — pass the address through */
        if (level == 0) {
            goto no_remap;
        }

        /* we are at the leaf page table or page table encodes a huge page */
        while (level > 0) {
            pte_perms = amdvi_get_perms(pte);
            present = pte & 1;
            /* fault if not present or requested access exceeds granted */
            if (!present || perms != (perms & pte_perms)) {
                amdvi_page_fault(as->iommu_state, as->devfn, addr, perms);
                trace_amdvi_page_fault(addr);
                return;
            }

            /* go to the next lower level */
            pte_addr = pte & AMDVI_DEV_PT_ROOT_MASK;
            /* add offset and load pte */
            pte_addr += ((addr >> (3 + 9 * level)) & 0x1FF) << 3;
            pte = amdvi_get_pte_entry(as->iommu_state, pte_addr, as->devfn);
            /* pte == 0 means the fetch failed and was already logged */
            if (!pte) {
                return;
            }
            oldlevel = level;
            level = get_pte_translation_mode(pte);
            /* next-level 7 marks a huge page whose size is in the PTE */
            if (level == 0x7) {
                break;
            }
        }

        if (level == 0x7) {
            page_mask = pte_override_page_mask(pte);
        } else {
            page_mask = pte_get_page_mask(oldlevel);
        }

        /* get access permissions from pte */
        ret->iova = addr & page_mask;
        ret->translated_addr = (pte & AMDVI_DEV_PT_ROOT_MASK) & page_mask;
        ret->addr_mask = ~page_mask;
        ret->perm = amdvi_get_perms(pte);
        return;
    }
no_remap:
    ret->iova = addr & AMDVI_PAGE_MASK_4K;
    ret->translated_addr = addr & AMDVI_PAGE_MASK_4K;
    ret->addr_mask = ~AMDVI_PAGE_MASK_4K;
    ret->perm = amdvi_get_perms(pte);
}
 954
 955static void amdvi_do_translate(AMDVIAddressSpace *as, hwaddr addr,
 956                               bool is_write, IOMMUTLBEntry *ret)
 957{
 958    AMDVIState *s = as->iommu_state;
 959    uint16_t devid = PCI_BUILD_BDF(as->bus_num, as->devfn);
 960    AMDVIIOTLBEntry *iotlb_entry = amdvi_iotlb_lookup(s, addr, devid);
 961    uint64_t entry[4];
 962
 963    if (iotlb_entry) {
 964        trace_amdvi_iotlb_hit(PCI_BUS_NUM(devid), PCI_SLOT(devid),
 965                PCI_FUNC(devid), addr, iotlb_entry->translated_addr);
 966        ret->iova = addr & ~iotlb_entry->page_mask;
 967        ret->translated_addr = iotlb_entry->translated_addr;
 968        ret->addr_mask = iotlb_entry->page_mask;
 969        ret->perm = iotlb_entry->perms;
 970        return;
 971    }
 972
 973    if (!amdvi_get_dte(s, devid, entry)) {
 974        return;
 975    }
 976
 977    /* devices with V = 0 are not translated */
 978    if (!(entry[0] & AMDVI_DEV_VALID)) {
 979        goto out;
 980    }
 981
 982    amdvi_page_walk(as, entry, ret,
 983                    is_write ? AMDVI_PERM_WRITE : AMDVI_PERM_READ, addr);
 984
 985    amdvi_update_iotlb(s, devid, addr, *ret,
 986                       entry[1] & AMDVI_DEV_DOMID_ID_MASK);
 987    return;
 988
 989out:
 990    ret->iova = addr & AMDVI_PAGE_MASK_4K;
 991    ret->translated_addr = addr & AMDVI_PAGE_MASK_4K;
 992    ret->addr_mask = ~AMDVI_PAGE_MASK_4K;
 993    ret->perm = IOMMU_RW;
 994}
 995
 996static inline bool amdvi_is_interrupt_addr(hwaddr addr)
 997{
 998    return addr >= AMDVI_INT_ADDR_FIRST && addr <= AMDVI_INT_ADDR_LAST;
 999}
1000
1001static IOMMUTLBEntry amdvi_translate(IOMMUMemoryRegion *iommu, hwaddr addr,
1002                                     IOMMUAccessFlags flag, int iommu_idx)
1003{
1004    AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu);
1005    AMDVIState *s = as->iommu_state;
1006    IOMMUTLBEntry ret = {
1007        .target_as = &address_space_memory,
1008        .iova = addr,
1009        .translated_addr = 0,
1010        .addr_mask = ~(hwaddr)0,
1011        .perm = IOMMU_NONE
1012    };
1013
1014    if (!s->enabled) {
1015        /* AMDVI disabled - corresponds to iommu=off not
1016         * failure to provide any parameter
1017         */
1018        ret.iova = addr & AMDVI_PAGE_MASK_4K;
1019        ret.translated_addr = addr & AMDVI_PAGE_MASK_4K;
1020        ret.addr_mask = ~AMDVI_PAGE_MASK_4K;
1021        ret.perm = IOMMU_RW;
1022        return ret;
1023    } else if (amdvi_is_interrupt_addr(addr)) {
1024        ret.iova = addr & AMDVI_PAGE_MASK_4K;
1025        ret.translated_addr = addr & AMDVI_PAGE_MASK_4K;
1026        ret.addr_mask = ~AMDVI_PAGE_MASK_4K;
1027        ret.perm = IOMMU_WO;
1028        return ret;
1029    }
1030
1031    amdvi_do_translate(as, addr, flag & IOMMU_WO, &ret);
1032    trace_amdvi_translation_result(as->bus_num, PCI_SLOT(as->devfn),
1033            PCI_FUNC(as->devfn), addr, ret.translated_addr);
1034    return ret;
1035}
1036
1037static int amdvi_get_irte(AMDVIState *s, MSIMessage *origin, uint64_t *dte,
1038                          union irte *irte, uint16_t devid)
1039{
1040    uint64_t irte_root, offset;
1041
1042    irte_root = dte[2] & AMDVI_IR_PHYS_ADDR_MASK;
1043    offset = (origin->data & AMDVI_IRTE_OFFSET) << 2;
1044
1045    trace_amdvi_ir_irte(irte_root, offset);
1046
1047    if (dma_memory_read(&address_space_memory, irte_root + offset,
1048                        irte, sizeof(*irte))) {
1049        trace_amdvi_ir_err("failed to get irte");
1050        return -AMDVI_IR_GET_IRTE;
1051    }
1052
1053    trace_amdvi_ir_irte_val(irte->val);
1054
1055    return 0;
1056}
1057
1058static int amdvi_int_remap_legacy(AMDVIState *iommu,
1059                                  MSIMessage *origin,
1060                                  MSIMessage *translated,
1061                                  uint64_t *dte,
1062                                  X86IOMMUIrq *irq,
1063                                  uint16_t sid)
1064{
1065    int ret;
1066    union irte irte;
1067
1068    /* get interrupt remapping table */
1069    ret = amdvi_get_irte(iommu, origin, dte, &irte, sid);
1070    if (ret < 0) {
1071        return ret;
1072    }
1073
1074    if (!irte.fields.valid) {
1075        trace_amdvi_ir_target_abort("RemapEn is disabled");
1076        return -AMDVI_IR_TARGET_ABORT;
1077    }
1078
1079    if (irte.fields.guest_mode) {
1080        error_report_once("guest mode is not zero");
1081        return -AMDVI_IR_ERR;
1082    }
1083
1084    if (irte.fields.int_type > AMDVI_IOAPIC_INT_TYPE_ARBITRATED) {
1085        error_report_once("reserved int_type");
1086        return -AMDVI_IR_ERR;
1087    }
1088
1089    irq->delivery_mode = irte.fields.int_type;
1090    irq->vector = irte.fields.vector;
1091    irq->dest_mode = irte.fields.dm;
1092    irq->redir_hint = irte.fields.rq_eoi;
1093    irq->dest = irte.fields.destination;
1094
1095    return 0;
1096}
1097
1098static int amdvi_get_irte_ga(AMDVIState *s, MSIMessage *origin, uint64_t *dte,
1099                             struct irte_ga *irte, uint16_t devid)
1100{
1101    uint64_t irte_root, offset;
1102
1103    irte_root = dte[2] & AMDVI_IR_PHYS_ADDR_MASK;
1104    offset = (origin->data & AMDVI_IRTE_OFFSET) << 4;
1105    trace_amdvi_ir_irte(irte_root, offset);
1106
1107    if (dma_memory_read(&address_space_memory, irte_root + offset,
1108                        irte, sizeof(*irte))) {
1109        trace_amdvi_ir_err("failed to get irte_ga");
1110        return -AMDVI_IR_GET_IRTE;
1111    }
1112
1113    trace_amdvi_ir_irte_ga_val(irte->hi.val, irte->lo.val);
1114    return 0;
1115}
1116
1117static int amdvi_int_remap_ga(AMDVIState *iommu,
1118                              MSIMessage *origin,
1119                              MSIMessage *translated,
1120                              uint64_t *dte,
1121                              X86IOMMUIrq *irq,
1122                              uint16_t sid)
1123{
1124    int ret;
1125    struct irte_ga irte;
1126
1127    /* get interrupt remapping table */
1128    ret = amdvi_get_irte_ga(iommu, origin, dte, &irte, sid);
1129    if (ret < 0) {
1130        return ret;
1131    }
1132
1133    if (!irte.lo.fields_remap.valid) {
1134        trace_amdvi_ir_target_abort("RemapEn is disabled");
1135        return -AMDVI_IR_TARGET_ABORT;
1136    }
1137
1138    if (irte.lo.fields_remap.guest_mode) {
1139        error_report_once("guest mode is not zero");
1140        return -AMDVI_IR_ERR;
1141    }
1142
1143    if (irte.lo.fields_remap.int_type > AMDVI_IOAPIC_INT_TYPE_ARBITRATED) {
1144        error_report_once("reserved int_type is set");
1145        return -AMDVI_IR_ERR;
1146    }
1147
1148    irq->delivery_mode = irte.lo.fields_remap.int_type;
1149    irq->vector = irte.hi.fields.vector;
1150    irq->dest_mode = irte.lo.fields_remap.dm;
1151    irq->redir_hint = irte.lo.fields_remap.rq_eoi;
1152    irq->dest = irte.lo.fields_remap.destination;
1153
1154    return 0;
1155}
1156
1157static int __amdvi_int_remap_msi(AMDVIState *iommu,
1158                                 MSIMessage *origin,
1159                                 MSIMessage *translated,
1160                                 uint64_t *dte,
1161                                 X86IOMMUIrq *irq,
1162                                 uint16_t sid)
1163{
1164    int ret;
1165    uint8_t int_ctl;
1166
1167    int_ctl = (dte[2] >> AMDVI_IR_INTCTL_SHIFT) & 3;
1168    trace_amdvi_ir_intctl(int_ctl);
1169
1170    switch (int_ctl) {
1171    case AMDVI_IR_INTCTL_PASS:
1172        memcpy(translated, origin, sizeof(*origin));
1173        return 0;
1174    case AMDVI_IR_INTCTL_REMAP:
1175        break;
1176    case AMDVI_IR_INTCTL_ABORT:
1177        trace_amdvi_ir_target_abort("int_ctl abort");
1178        return -AMDVI_IR_TARGET_ABORT;
1179    default:
1180        trace_amdvi_ir_err("int_ctl reserved");
1181        return -AMDVI_IR_ERR;
1182    }
1183
1184    if (iommu->ga_enabled) {
1185        ret = amdvi_int_remap_ga(iommu, origin, translated, dte, irq, sid);
1186    } else {
1187        ret = amdvi_int_remap_legacy(iommu, origin, translated, dte, irq, sid);
1188    }
1189
1190    return ret;
1191}
1192
/*
 * Interrupt remapping for MSI/MSI-X entry.
 *
 * Translate the MSI message @origin issued by requester @sid into
 * @translated according to the device table entry's interrupt-remapping
 * configuration.  Returns 0 on success (including pass-through) or a
 * negative AMDVI_IR_* error code when the interrupt must be discarded.
 */
static int amdvi_int_remap_msi(AMDVIState *iommu,
                               MSIMessage *origin,
                               MSIMessage *translated,
                               uint16_t sid)
{
    int ret = 0;
    uint64_t pass = 0;          /* non-zero if the DTE allows pass-through */
    uint64_t dte[4] = { 0 };    /* 256-bit device table entry */
    X86IOMMUIrq irq = { 0 };
    uint8_t dest_mode, delivery_mode;

    assert(origin && translated);

    /*
     * When IOMMU is enabled, interrupt remap request will come either from
     * IO-APIC or PCI device. If interrupt is from PCI device then it will
     * have a valid requester id but if the interrupt is from IO-APIC
     * then requester id will be invalid.
     */
    if (sid == X86_IOMMU_SID_INVALID) {
        sid = AMDVI_IOAPIC_SB_DEVID;
    }

    trace_amdvi_ir_remap_msi_req(origin->address, origin->data, sid);

    /* check if device table entry is set before we go further. */
    if (!iommu || !iommu->devtab_len) {
        memcpy(translated, origin, sizeof(*origin));
        goto out;
    }

    if (!amdvi_get_dte(iommu, sid, dte)) {
        return -AMDVI_IR_ERR;
    }

    /* Check if IR is enabled in DTE */
    if (!(dte[2] & AMDVI_IR_REMAP_ENABLE)) {
        memcpy(translated, origin, sizeof(*origin));
        goto out;
    }

    /* validate that we are configured with intremap=on */
    if (!x86_iommu_ir_supported(X86_IOMMU_DEVICE(iommu))) {
        trace_amdvi_err("Interrupt remapping is enabled in the guest but "
                        "not in the host. Use intremap=on to enable interrupt "
                        "remapping in amd-iommu.");
        return -AMDVI_IR_ERR;
    }

    if (origin->address & AMDVI_MSI_ADDR_HI_MASK) {
        trace_amdvi_err("MSI address high 32 bits non-zero when "
                        "Interrupt Remapping enabled.");
        return -AMDVI_IR_ERR;
    }

    if ((origin->address & AMDVI_MSI_ADDR_LO_MASK) != APIC_DEFAULT_ADDRESS) {
        trace_amdvi_err("MSI is not from IOAPIC.");
        return -AMDVI_IR_ERR;
    }

    /*
     * The MSI data register [10:8] are used to get the upstream interrupt type.
     *
     * See MSI/MSI-X format:
     * https://pdfs.semanticscholar.org/presentation/9420/c279e942eca568157711ef5c92b800c40a79.pdf
     * (page 5)
     */
    delivery_mode = (origin->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 7;

    /*
     * Fixed/arbitrated interrupts go through the IRTE; SMI is rejected;
     * NMI/INIT/ExtInt pass through only when the matching DTE pass bit
     * is set (checked further below via 'pass').
     */
    switch (delivery_mode) {
    case AMDVI_IOAPIC_INT_TYPE_FIXED:
    case AMDVI_IOAPIC_INT_TYPE_ARBITRATED:
        trace_amdvi_ir_delivery_mode("fixed/arbitrated");
        ret = __amdvi_int_remap_msi(iommu, origin, translated, dte, &irq, sid);
        if (ret < 0) {
            goto remap_fail;
        } else {
            /* Translate IRQ to MSI messages */
            x86_iommu_irq_to_msi_message(&irq, translated);
            goto out;
        }
        break;
    case AMDVI_IOAPIC_INT_TYPE_SMI:
        error_report("SMI is not supported!");
        ret = -AMDVI_IR_ERR;
        break;
    case AMDVI_IOAPIC_INT_TYPE_NMI:
        pass = dte[3] & AMDVI_DEV_NMI_PASS_MASK;
        trace_amdvi_ir_delivery_mode("nmi");
        break;
    case AMDVI_IOAPIC_INT_TYPE_INIT:
        pass = dte[3] & AMDVI_DEV_INT_PASS_MASK;
        trace_amdvi_ir_delivery_mode("init");
        break;
    case AMDVI_IOAPIC_INT_TYPE_EINT:
        pass = dte[3] & AMDVI_DEV_EINT_PASS_MASK;
        trace_amdvi_ir_delivery_mode("eint");
        break;
    default:
        trace_amdvi_ir_delivery_mode("unsupported delivery_mode");
        ret = -AMDVI_IR_ERR;
        break;
    }

    if (ret < 0) {
        goto remap_fail;
    }

    /*
     * The MSI address register bit[2] is used to get the destination
     * mode. The dest_mode 1 is valid for fixed and arbitrated interrupts
     * only.
     */
    dest_mode = (origin->address >> MSI_ADDR_DEST_MODE_SHIFT) & 1;
    if (dest_mode) {
        trace_amdvi_ir_err("invalid dest_mode");
        ret = -AMDVI_IR_ERR;
        goto remap_fail;
    }

    if (pass) {
        /* pass-through permitted by the DTE: deliver unmodified */
        memcpy(translated, origin, sizeof(*origin));
    } else {
        trace_amdvi_ir_err("passthrough is not enabled");
        ret = -AMDVI_IR_ERR;
        goto remap_fail;
    }

out:
    trace_amdvi_ir_remap_msi(origin->address, origin->data,
                             translated->address, translated->data);
    return 0;

remap_fail:
    return ret;
}
1330
1331static int amdvi_int_remap(X86IOMMUState *iommu,
1332                           MSIMessage *origin,
1333                           MSIMessage *translated,
1334                           uint16_t sid)
1335{
1336    return amdvi_int_remap_msi(AMD_IOMMU_DEVICE(iommu), origin,
1337                               translated, sid);
1338}
1339
1340static MemTxResult amdvi_mem_ir_write(void *opaque, hwaddr addr,
1341                                      uint64_t value, unsigned size,
1342                                      MemTxAttrs attrs)
1343{
1344    int ret;
1345    MSIMessage from = { 0, 0 }, to = { 0, 0 };
1346    uint16_t sid = AMDVI_IOAPIC_SB_DEVID;
1347
1348    from.address = (uint64_t) addr + AMDVI_INT_ADDR_FIRST;
1349    from.data = (uint32_t) value;
1350
1351    trace_amdvi_mem_ir_write_req(addr, value, size);
1352
1353    if (!attrs.unspecified) {
1354        /* We have explicit Source ID */
1355        sid = attrs.requester_id;
1356    }
1357
1358    ret = amdvi_int_remap_msi(opaque, &from, &to, sid);
1359    if (ret < 0) {
1360        /* TODO: log the event using IOMMU log event interface */
1361        error_report_once("failed to remap interrupt from devid 0x%x", sid);
1362        return MEMTX_ERROR;
1363    }
1364
1365    apic_get_class()->send_msi(&to);
1366
1367    trace_amdvi_mem_ir_write(to.address, to.data);
1368    return MEMTX_OK;
1369}
1370
/*
 * Reads from the interrupt-remapping window have no defined semantics;
 * the access succeeds and *data is left unmodified.
 */
static MemTxResult amdvi_mem_ir_read(void *opaque, hwaddr addr,
                                     uint64_t *data, unsigned size,
                                     MemTxAttrs attrs)
{
    return MEMTX_OK;
}
1377
/*
 * MMIO ops for the per-device interrupt-remapping window; accesses are
 * fixed at 4 bytes to match the 32-bit MSI data write.
 */
static const MemoryRegionOps amdvi_ir_ops = {
    .read_with_attrs = amdvi_mem_ir_read,
    .write_with_attrs = amdvi_mem_ir_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .impl = {
        .min_access_size = 4,
        .max_access_size = 4,
    },
    .valid = {
        .min_access_size = 4,
        .max_access_size = 4,
    }
};
1391
/*
 * pci_setup_iommu() callback: return (lazily creating) the per-device
 * AddressSpace for @devfn on @bus.  Each device gets a root container
 * holding the IOMMU translation region plus a higher-priority I/O
 * region overlaying the interrupt window.
 */
static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
{
    char name[128];
    AMDVIState *s = opaque;
    AMDVIAddressSpace **iommu_as, *amdvi_dev_as;
    int bus_num = pci_bus_num(bus);

    iommu_as = s->address_spaces[bus_num];

    /* allocate memory during the first run */
    if (!iommu_as) {
        iommu_as = g_malloc0(sizeof(AMDVIAddressSpace *) * PCI_DEVFN_MAX);
        s->address_spaces[bus_num] = iommu_as;
    }

    /* set up AMD-Vi region */
    if (!iommu_as[devfn]) {
        snprintf(name, sizeof(name), "amd_iommu_devfn_%d", devfn);

        iommu_as[devfn] = g_malloc0(sizeof(AMDVIAddressSpace));
        iommu_as[devfn]->bus_num = (uint8_t)bus_num;
        iommu_as[devfn]->devfn = (uint8_t)devfn;
        iommu_as[devfn]->iommu_state = s;

        amdvi_dev_as = iommu_as[devfn];

        /*
         * Memory region relationships looks like (Address range shows
         * only lower 32 bits to make it short in length...):
         *
         * |-----------------+-------------------+----------|
         * | Name            | Address range     | Priority |
         * |-----------------+-------------------+----------+
         * | amdvi_root      | 00000000-ffffffff |        0 |
         * |  amdvi_iommu    | 00000000-ffffffff |        1 |
         * |  amdvi_iommu_ir | fee00000-feefffff |       64 |
         * |-----------------+-------------------+----------|
         */
        memory_region_init_iommu(&amdvi_dev_as->iommu,
                                 sizeof(amdvi_dev_as->iommu),
                                 TYPE_AMD_IOMMU_MEMORY_REGION,
                                 OBJECT(s),
                                 "amd_iommu", UINT64_MAX);
        memory_region_init(&amdvi_dev_as->root, OBJECT(s),
                           "amdvi_root", UINT64_MAX);
        address_space_init(&amdvi_dev_as->as, &amdvi_dev_as->root, name);
        /* interrupt window must outrank the translation region (prio 64 > 1) */
        memory_region_init_io(&amdvi_dev_as->iommu_ir, OBJECT(s),
                              &amdvi_ir_ops, s, "amd_iommu_ir",
                              AMDVI_INT_ADDR_SIZE);
        memory_region_add_subregion_overlap(&amdvi_dev_as->root,
                                            AMDVI_INT_ADDR_FIRST,
                                            &amdvi_dev_as->iommu_ir,
                                            64);
        memory_region_add_subregion_overlap(&amdvi_dev_as->root, 0,
                                            MEMORY_REGION(&amdvi_dev_as->iommu),
                                            1);
    }
    return &iommu_as[devfn]->as;
}
1451
/* MMIO ops for the AMD-Vi register file: little-endian, 1-8 byte aligned. */
static const MemoryRegionOps mmio_mem_ops = {
    .read = amdvi_mmio_read,
    .write = amdvi_mmio_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .impl = {
        .min_access_size = 1,
        .max_access_size = 8,
        .unaligned = false,
    },
    .valid = {
        .min_access_size = 1,
        .max_access_size = 8,
    }
};
1466
1467static void amdvi_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu,
1468                                            IOMMUNotifierFlag old,
1469                                            IOMMUNotifierFlag new)
1470{
1471    AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu);
1472
1473    if (new & IOMMU_NOTIFIER_MAP) {
1474        error_report("device %02x.%02x.%x requires iommu notifier which is not "
1475                     "currently supported", as->bus_num, PCI_SLOT(as->devfn),
1476                     PCI_FUNC(as->devfn));
1477        exit(1);
1478    }
1479}
1480
1481static void amdvi_init(AMDVIState *s)
1482{
1483    amdvi_iotlb_reset(s);
1484
1485    s->devtab_len = 0;
1486    s->cmdbuf_len = 0;
1487    s->cmdbuf_head = 0;
1488    s->cmdbuf_tail = 0;
1489    s->evtlog_head = 0;
1490    s->evtlog_tail = 0;
1491    s->excl_enabled = false;
1492    s->excl_allow = false;
1493    s->mmio_enabled = false;
1494    s->enabled = false;
1495    s->ats_enabled = false;
1496    s->cmdbuf_enabled = false;
1497
1498    /* reset MMIO */
1499    memset(s->mmior, 0, AMDVI_MMIO_SIZE);
1500    amdvi_set_quad(s, AMDVI_MMIO_EXT_FEATURES, AMDVI_EXT_FEATURES,
1501            0xffffffffffffffef, 0);
1502    amdvi_set_quad(s, AMDVI_MMIO_STATUS, 0, 0x98, 0x67);
1503
1504    /* reset device ident */
1505    pci_config_set_vendor_id(s->pci.dev.config, PCI_VENDOR_ID_AMD);
1506    pci_config_set_prog_interface(s->pci.dev.config, 00);
1507    pci_config_set_device_id(s->pci.dev.config, s->devid);
1508    pci_config_set_class(s->pci.dev.config, 0x0806);
1509
1510    /* reset AMDVI specific capabilities, all r/o */
1511    pci_set_long(s->pci.dev.config + s->capab_offset, AMDVI_CAPAB_FEATURES);
1512    pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_BAR_LOW,
1513                 s->mmio.addr & ~(0xffff0000));
1514    pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_BAR_HIGH,
1515                (s->mmio.addr & ~(0xffff)) >> 16);
1516    pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_RANGE,
1517                 0xff000000);
1518    pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_MISC, 0);
1519    pci_set_long(s->pci.dev.config + s->capab_offset + AMDVI_CAPAB_MISC,
1520            AMDVI_MAX_PH_ADDR | AMDVI_MAX_GVA_ADDR | AMDVI_MAX_VA_ADDR);
1521}
1522
/* Device reset: clear MSI state, then reinitialise registers and caches. */
static void amdvi_reset(DeviceState *dev)
{
    AMDVIState *s = AMD_IOMMU_DEVICE(dev);

    msi_reset(&s->pci.dev);
    amdvi_init(s);
}
1530
1531static void amdvi_realize(DeviceState *dev, Error **err)
1532{
1533    int ret = 0;
1534    AMDVIState *s = AMD_IOMMU_DEVICE(dev);
1535    X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(dev);
1536    MachineState *ms = MACHINE(qdev_get_machine());
1537    PCMachineState *pcms = PC_MACHINE(ms);
1538    PCIBus *bus = pcms->bus;
1539
1540    s->iotlb = g_hash_table_new_full(amdvi_uint64_hash,
1541                                     amdvi_uint64_equal, g_free, g_free);
1542
1543    /* This device should take care of IOMMU PCI properties */
1544    x86_iommu->type = TYPE_AMD;
1545    qdev_set_parent_bus(DEVICE(&s->pci), &bus->qbus);
1546    object_property_set_bool(OBJECT(&s->pci), true, "realized", err);
1547    ret = pci_add_capability(&s->pci.dev, AMDVI_CAPAB_ID_SEC, 0,
1548                                         AMDVI_CAPAB_SIZE, err);
1549    if (ret < 0) {
1550        return;
1551    }
1552    s->capab_offset = ret;
1553
1554    ret = pci_add_capability(&s->pci.dev, PCI_CAP_ID_MSI, 0,
1555                             AMDVI_CAPAB_REG_SIZE, err);
1556    if (ret < 0) {
1557        return;
1558    }
1559    ret = pci_add_capability(&s->pci.dev, PCI_CAP_ID_HT, 0,
1560                             AMDVI_CAPAB_REG_SIZE, err);
1561    if (ret < 0) {
1562        return;
1563    }
1564
1565    /* Pseudo address space under root PCI bus. */
1566    pcms->ioapic_as = amdvi_host_dma_iommu(bus, s, AMDVI_IOAPIC_SB_DEVID);
1567
1568    /* set up MMIO */
1569    memory_region_init_io(&s->mmio, OBJECT(s), &mmio_mem_ops, s, "amdvi-mmio",
1570                          AMDVI_MMIO_SIZE);
1571
1572    sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->mmio);
1573    sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, AMDVI_BASE_ADDR);
1574    pci_setup_iommu(bus, amdvi_host_dma_iommu, s);
1575    s->devid = object_property_get_int(OBJECT(&s->pci), "addr", err);
1576    msi_init(&s->pci.dev, 0, 1, true, false, err);
1577    amdvi_init(s);
1578}
1579
/* Migration of IOMMU state is not implemented yet. */
static const VMStateDescription vmstate_amdvi = {
    .name = "amd-iommu",
    .unmigratable = 1
};
1584
static void amdvi_instance_init(Object *klass)
{
    AMDVIState *s = AMD_IOMMU_DEVICE(klass);

    /* embed the companion AMDVI-PCI function; realized in amdvi_realize() */
    object_initialize(&s->pci, sizeof(s->pci), TYPE_AMD_IOMMU_PCI);
}
1591
1592static void amdvi_class_init(ObjectClass *klass, void* data)
1593{
1594    DeviceClass *dc = DEVICE_CLASS(klass);
1595    X86IOMMUClass *dc_class = X86_IOMMU_CLASS(klass);
1596
1597    dc->reset = amdvi_reset;
1598    dc->vmsd = &vmstate_amdvi;
1599    dc->hotpluggable = false;
1600    dc_class->realize = amdvi_realize;
1601    dc_class->int_remap = amdvi_int_remap;
1602    /* Supported by the pc-q35-* machine types */
1603    dc->user_creatable = true;
1604    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
1605    dc->desc = "AMD IOMMU (AMD-Vi) DMA Remapping device";
1606}
1607
/* QOM type for the AMD-Vi device, derived from the generic x86 IOMMU. */
static const TypeInfo amdvi = {
    .name = TYPE_AMD_IOMMU_DEVICE,
    .parent = TYPE_X86_IOMMU_DEVICE,
    .instance_size = sizeof(AMDVIState),
    .instance_init = amdvi_instance_init,
    .class_init = amdvi_class_init
};
1615
/* QOM type for the companion PCI function embedded in AMDVIState. */
static const TypeInfo amdviPCI = {
    .name = "AMDVI-PCI",
    .parent = TYPE_PCI_DEVICE,
    .instance_size = sizeof(AMDVIPCIState),
    .interfaces = (InterfaceInfo[]) {
        { INTERFACE_CONVENTIONAL_PCI_DEVICE },
        { },
    },
};
1625
static void amdvi_iommu_memory_region_class_init(ObjectClass *klass, void *data)
{
    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);

    /* hook the per-device IOMMU memory region into the translation path */
    imrc->translate = amdvi_translate;
    imrc->notify_flag_changed = amdvi_iommu_notify_flag_changed;
}
1633
/* QOM type for the IOMMU memory region instantiated per device. */
static const TypeInfo amdvi_iommu_memory_region_info = {
    .parent = TYPE_IOMMU_MEMORY_REGION,
    .name = TYPE_AMD_IOMMU_MEMORY_REGION,
    .class_init = amdvi_iommu_memory_region_class_init,
};
1639
/* Register the PCI shim, the IOMMU device and its memory region types. */
static void amdviPCI_register_types(void)
{
    type_register_static(&amdviPCI);
    type_register_static(&amdvi);
    type_register_static(&amdvi_iommu_memory_region_info);
}

type_init(amdviPCI_register_types);
1648