qemu/exec.c
   1/*
   2 *  Virtual page mapping
   3 *
   4 *  Copyright (c) 2003 Fabrice Bellard
   5 *
   6 * This library is free software; you can redistribute it and/or
   7 * modify it under the terms of the GNU Lesser General Public
   8 * License as published by the Free Software Foundation; either
   9 * version 2 of the License, or (at your option) any later version.
  10 *
  11 * This library is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 * Lesser General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU Lesser General Public
  17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20#include "qemu/osdep.h"
  21#include "qemu-common.h"
  22#include "qapi/error.h"
  23
  24#include "qemu/cutils.h"
  25#include "cpu.h"
  26#include "exec/exec-all.h"
  27#include "exec/target_page.h"
  28#include "tcg.h"
  29#include "hw/qdev-core.h"
  30#include "hw/qdev-properties.h"
  31#if !defined(CONFIG_USER_ONLY)
  32#include "hw/boards.h"
  33#include "hw/xen/xen.h"
  34#endif
  35#include "sysemu/kvm.h"
  36#include "sysemu/sysemu.h"
  37#include "sysemu/tcg.h"
  38#include "qemu/timer.h"
  39#include "qemu/config-file.h"
  40#include "qemu/error-report.h"
  41#include "qemu/qemu-print.h"
  42#if defined(CONFIG_USER_ONLY)
  43#include "qemu.h"
  44#else /* !CONFIG_USER_ONLY */
  45#include "exec/memory.h"
  46#include "exec/ioport.h"
  47#include "sysemu/dma.h"
  48#include "sysemu/hostmem.h"
  49#include "sysemu/hw_accel.h"
  50#include "exec/address-spaces.h"
  51#include "sysemu/xen-mapcache.h"
  52#include "trace-root.h"
  53
  54#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
  55#include <linux/falloc.h>
  56#endif
  57
  58#endif
  59#include "qemu/rcu_queue.h"
  60#include "qemu/main-loop.h"
  61#include "translate-all.h"
  62#include "sysemu/replay.h"
  63
  64#include "exec/memory-internal.h"
  65#include "exec/ram_addr.h"
  66#include "exec/log.h"
  67
  68#include "migration/vmstate.h"
  69
  70#include "qemu/range.h"
  71#ifndef _WIN32
  72#include "qemu/mmap-alloc.h"
  73#endif
  74
  75#include "monitor/monitor.h"
  76
  77//#define DEBUG_SUBPAGE
  78
  79#if !defined(CONFIG_USER_ONLY)
  80/* ram_list is read under rcu_read_lock()/rcu_read_unlock().  Writes
  81 * are protected by the ramlist lock.
  82 */
  83RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };
  84
  85static MemoryRegion *system_memory;
  86static MemoryRegion *system_io;
  87
  88AddressSpace address_space_io;
  89AddressSpace address_space_memory;
  90
  91static MemoryRegion io_mem_unassigned;
  92#endif
  93
  94CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);
  95
  96/* current CPU in the current thread. It is only valid inside
  97   cpu_exec() */
  98__thread CPUState *current_cpu;
  99/* 0 = Do not count executed instructions.
 100   1 = Precise instruction counting.
 101   2 = Adaptive rate instruction counting.  */
 102int use_icount;
 103
 104uintptr_t qemu_host_page_size;
 105intptr_t qemu_host_page_mask;
 106
 107#if !defined(CONFIG_USER_ONLY)
 108
 109typedef struct PhysPageEntry PhysPageEntry;
 110
 111struct PhysPageEntry {
  112    /* How many bits to skip to the next level (in units of L2_SIZE). 0 for a leaf. */
  113    uint32_t skip : 6;
  114    /* Index into phys_sections (!skip) or phys_map_nodes (skip). */
 115    uint32_t ptr : 26;
 116};
 117
 118#define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6)
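/* Note: with a 6-bit 'skip' and a 26-bit 'ptr', PHYS_MAP_NODE_NIL evaluates to
 * (1 << 26) - 1, i.e. the largest value representable in 'ptr', reserved as
 * the "no node" marker.
 */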
 119
 120/* Size of the L2 (and L3, etc) page tables.  */
 121#define ADDR_SPACE_BITS 64
 122
 123#define P_L2_BITS 9
 124#define P_L2_SIZE (1 << P_L2_BITS)
 125
 126#define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)
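/* Worked example: for a target with 4 KiB pages (TARGET_PAGE_BITS == 12),
 * P_L2_LEVELS = ((64 - 12 - 1) / 9) + 1 = (51 / 9) + 1 = 5 + 1 = 6,
 * i.e. six 9-bit levels resolve the 52 address bits above the page offset.
 */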
 127
 128typedef PhysPageEntry Node[P_L2_SIZE];
 129
 130typedef struct PhysPageMap {
 131    struct rcu_head rcu;
 132
 133    unsigned sections_nb;
 134    unsigned sections_nb_alloc;
 135    unsigned nodes_nb;
 136    unsigned nodes_nb_alloc;
 137    Node *nodes;
 138    MemoryRegionSection *sections;
 139} PhysPageMap;
 140
 141struct AddressSpaceDispatch {
 142    MemoryRegionSection *mru_section;
 143    /* This is a multi-level map on the physical address space.
 144     * The bottom level has pointers to MemoryRegionSections.
 145     */
 146    PhysPageEntry phys_map;
 147    PhysPageMap map;
 148};
 149
 150#define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK)
 151typedef struct subpage_t {
 152    MemoryRegion iomem;
 153    FlatView *fv;
 154    hwaddr base;
 155    uint16_t sub_section[];
 156} subpage_t;
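/* A subpage_t is used when several MemoryRegionSections share a single target
 * page: sub_section[SUBPAGE_IDX(addr)] holds the phys_sections index for each
 * byte offset within that page, so sub-page-sized regions still dispatch to
 * the right section.
 */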
 157
 158#define PHYS_SECTION_UNASSIGNED 0
 159
 160static void io_mem_init(void);
 161static void memory_map_init(void);
 162static void tcg_log_global_after_sync(MemoryListener *listener);
 163static void tcg_commit(MemoryListener *listener);
 164
 165/**
 166 * CPUAddressSpace: all the information a CPU needs about an AddressSpace
 167 * @cpu: the CPU whose AddressSpace this is
 168 * @as: the AddressSpace itself
 169 * @memory_dispatch: its dispatch pointer (cached, RCU protected)
 170 * @tcg_as_listener: listener for tracking changes to the AddressSpace
 171 */
 172struct CPUAddressSpace {
 173    CPUState *cpu;
 174    AddressSpace *as;
 175    struct AddressSpaceDispatch *memory_dispatch;
 176    MemoryListener tcg_as_listener;
 177};
 178
 179struct DirtyBitmapSnapshot {
 180    ram_addr_t start;
 181    ram_addr_t end;
 182    unsigned long dirty[];
 183};
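/* A DirtyBitmapSnapshot covers the page-aligned ram_addr_t range
 * [start, end) and stores one bit per target page in dirty[]; see
 * cpu_physical_memory_snapshot_and_clear_dirty() below.
 */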
 184
 185#endif
 186
 187#if !defined(CONFIG_USER_ONLY)
 188
 189static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
 190{
 191    static unsigned alloc_hint = 16;
 192    if (map->nodes_nb + nodes > map->nodes_nb_alloc) {
 193        map->nodes_nb_alloc = MAX(alloc_hint, map->nodes_nb + nodes);
 194        map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
 195        alloc_hint = map->nodes_nb_alloc;
 196    }
 197}
 198
 199static uint32_t phys_map_node_alloc(PhysPageMap *map, bool leaf)
 200{
 201    unsigned i;
 202    uint32_t ret;
 203    PhysPageEntry e;
 204    PhysPageEntry *p;
 205
 206    ret = map->nodes_nb++;
 207    p = map->nodes[ret];
 208    assert(ret != PHYS_MAP_NODE_NIL);
 209    assert(ret != map->nodes_nb_alloc);
 210
 211    e.skip = leaf ? 0 : 1;
 212    e.ptr = leaf ? PHYS_SECTION_UNASSIGNED : PHYS_MAP_NODE_NIL;
 213    for (i = 0; i < P_L2_SIZE; ++i) {
 214        memcpy(&p[i], &e, sizeof(e));
 215    }
 216    return ret;
 217}
 218
 219static void phys_page_set_level(PhysPageMap *map, PhysPageEntry *lp,
 220                                hwaddr *index, uint64_t *nb, uint16_t leaf,
 221                                int level)
 222{
 223    PhysPageEntry *p;
 224    hwaddr step = (hwaddr)1 << (level * P_L2_BITS);
 225
 226    if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
 227        lp->ptr = phys_map_node_alloc(map, level == 0);
 228    }
 229    p = map->nodes[lp->ptr];
 230    lp = &p[(*index >> (level * P_L2_BITS)) & (P_L2_SIZE - 1)];
 231
 232    while (*nb && lp < &p[P_L2_SIZE]) {
 233        if ((*index & (step - 1)) == 0 && *nb >= step) {
 234            lp->skip = 0;
 235            lp->ptr = leaf;
 236            *index += step;
 237            *nb -= step;
 238        } else {
 239            phys_page_set_level(map, lp, index, nb, leaf, level - 1);
 240        }
 241        ++lp;
 242    }
 243}
 244
 245static void phys_page_set(AddressSpaceDispatch *d,
 246                          hwaddr index, uint64_t nb,
 247                          uint16_t leaf)
 248{
 249    /* Wildly overreserve - it doesn't matter much. */
 250    phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS);
 251
 252    phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
 253}
 254
  255/* Compact a non-leaf page entry: if the entry has a single child, update our
  256 * entry so that we can skip it and go directly to the destination.
  257 */
 258static void phys_page_compact(PhysPageEntry *lp, Node *nodes)
 259{
 260    unsigned valid_ptr = P_L2_SIZE;
 261    int valid = 0;
 262    PhysPageEntry *p;
 263    int i;
 264
 265    if (lp->ptr == PHYS_MAP_NODE_NIL) {
 266        return;
 267    }
 268
 269    p = nodes[lp->ptr];
 270    for (i = 0; i < P_L2_SIZE; i++) {
 271        if (p[i].ptr == PHYS_MAP_NODE_NIL) {
 272            continue;
 273        }
 274
 275        valid_ptr = i;
 276        valid++;
 277        if (p[i].skip) {
 278            phys_page_compact(&p[i], nodes);
 279        }
 280    }
 281
 282    /* We can only compress if there's only one child. */
 283    if (valid != 1) {
 284        return;
 285    }
 286
 287    assert(valid_ptr < P_L2_SIZE);
 288
 289    /* Don't compress if it won't fit in the # of bits we have. */
 290    if (P_L2_LEVELS >= (1 << 6) &&
 291        lp->skip + p[valid_ptr].skip >= (1 << 6)) {
 292        return;
 293    }
 294
 295    lp->ptr = p[valid_ptr].ptr;
 296    if (!p[valid_ptr].skip) {
 297        /* If our only child is a leaf, make this a leaf. */
 298        /* By design, we should have made this node a leaf to begin with so we
 299         * should never reach here.
 300         * But since it's so simple to handle this, let's do it just in case we
 301         * change this rule.
 302         */
 303        lp->skip = 0;
 304    } else {
 305        lp->skip += p[valid_ptr].skip;
 306    }
 307}
 308
 309void address_space_dispatch_compact(AddressSpaceDispatch *d)
 310{
 311    if (d->phys_map.skip) {
 312        phys_page_compact(&d->phys_map, d->map.nodes);
 313    }
 314}
 315
 316static inline bool section_covers_addr(const MemoryRegionSection *section,
 317                                       hwaddr addr)
 318{
 319    /* Memory topology clips a memory region to [0, 2^64); size.hi > 0 means
 320     * the section must cover the entire address space.
 321     */
 322    return int128_gethi(section->size) ||
 323           range_covers_byte(section->offset_within_address_space,
 324                             int128_getlo(section->size), addr);
 325}
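/* For example, a section with offset_within_address_space == 0x1000 and
 * size == 0x2000 covers 0x2fff but not 0x3000; a section whose size has a
 * non-zero high 64 bits necessarily spans the whole [0, 2^64) space.
 */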
 326
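/* Walk the multi-level map for @addr, consuming 'skip' levels per hop; a
 * compacted path can reach its leaf in a single step.  Returns the matching
 * section, or the PHYS_SECTION_UNASSIGNED sentinel if nothing is mapped there.
 */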
 327static MemoryRegionSection *phys_page_find(AddressSpaceDispatch *d, hwaddr addr)
 328{
 329    PhysPageEntry lp = d->phys_map, *p;
 330    Node *nodes = d->map.nodes;
 331    MemoryRegionSection *sections = d->map.sections;
 332    hwaddr index = addr >> TARGET_PAGE_BITS;
 333    int i;
 334
 335    for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) {
 336        if (lp.ptr == PHYS_MAP_NODE_NIL) {
 337            return &sections[PHYS_SECTION_UNASSIGNED];
 338        }
 339        p = nodes[lp.ptr];
 340        lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
 341    }
 342
 343    if (section_covers_addr(&sections[lp.ptr], addr)) {
 344        return &sections[lp.ptr];
 345    } else {
 346        return &sections[PHYS_SECTION_UNASSIGNED];
 347    }
 348}
 349
 350/* Called from RCU critical section */
 351static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
 352                                                        hwaddr addr,
 353                                                        bool resolve_subpage)
 354{
 355    MemoryRegionSection *section = atomic_read(&d->mru_section);
 356    subpage_t *subpage;
 357
 358    if (!section || section == &d->map.sections[PHYS_SECTION_UNASSIGNED] ||
 359        !section_covers_addr(section, addr)) {
 360        section = phys_page_find(d, addr);
 361        atomic_set(&d->mru_section, section);
 362    }
 363    if (resolve_subpage && section->mr->subpage) {
 364        subpage = container_of(section->mr, subpage_t, iomem);
 365        section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
 366    }
 367    return section;
 368}
 369
 370/* Called from RCU critical section */
 371static MemoryRegionSection *
 372address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *xlat,
 373                                 hwaddr *plen, bool resolve_subpage)
 374{
 375    MemoryRegionSection *section;
 376    MemoryRegion *mr;
 377    Int128 diff;
 378
 379    section = address_space_lookup_region(d, addr, resolve_subpage);
 380    /* Compute offset within MemoryRegionSection */
 381    addr -= section->offset_within_address_space;
 382
 383    /* Compute offset within MemoryRegion */
 384    *xlat = addr + section->offset_within_region;
 385
 386    mr = section->mr;
 387
 388    /* MMIO registers can be expected to perform full-width accesses based only
 389     * on their address, without considering adjacent registers that could
 390     * decode to completely different MemoryRegions.  When such registers
 391     * exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO
 392     * regions overlap wildly.  For this reason we cannot clamp the accesses
 393     * here.
 394     *
 395     * If the length is small (as is the case for address_space_ldl/stl),
 396     * everything works fine.  If the incoming length is large, however,
 397     * the caller really has to do the clamping through memory_access_size.
 398     */
 399    if (memory_region_is_ram(mr)) {
 400        diff = int128_sub(section->size, int128_make64(addr));
 401        *plen = int128_get64(int128_min(diff, int128_make64(*plen)));
 402    }
 403    return section;
 404}
 405
 406/**
 407 * address_space_translate_iommu - translate an address through an IOMMU
 408 * memory region and then through the target address space.
 409 *
 410 * @iommu_mr: the IOMMU memory region that we start the translation from
 411 * @addr: the address to be translated through the MMU
 412 * @xlat: the translated address offset within the destination memory region.
 413 *        It cannot be %NULL.
 414 * @plen_out: valid read/write length of the translated address. It
 415 *            cannot be %NULL.
  416 * @page_mask_out: page mask for the translated address. This is only
  417 *            meaningful for IOMMU-translated addresses, since the IOMMU
  418 *            may map pages larger than TARGET_PAGE_SIZE (e.g. huge
  419 *            pages). It can be %NULL if we don't care about it.
  420 * @is_write: whether the translation operation is for a write
  421 * @is_mmio: whether this access may be MMIO; set to true if it can be
 422 * @target_as: the address space targeted by the IOMMU
 423 * @attrs: transaction attributes
 424 *
 425 * This function is called from RCU critical section.  It is the common
 426 * part of flatview_do_translate and address_space_translate_cached.
 427 */
 428static MemoryRegionSection address_space_translate_iommu(IOMMUMemoryRegion *iommu_mr,
 429                                                         hwaddr *xlat,
 430                                                         hwaddr *plen_out,
 431                                                         hwaddr *page_mask_out,
 432                                                         bool is_write,
 433                                                         bool is_mmio,
 434                                                         AddressSpace **target_as,
 435                                                         MemTxAttrs attrs)
 436{
 437    MemoryRegionSection *section;
 438    hwaddr page_mask = (hwaddr)-1;
 439
 440    do {
 441        hwaddr addr = *xlat;
 442        IOMMUMemoryRegionClass *imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
 443        int iommu_idx = 0;
 444        IOMMUTLBEntry iotlb;
 445
 446        if (imrc->attrs_to_index) {
 447            iommu_idx = imrc->attrs_to_index(iommu_mr, attrs);
 448        }
 449
 450        iotlb = imrc->translate(iommu_mr, addr, is_write ?
 451                                IOMMU_WO : IOMMU_RO, iommu_idx);
 452
 453        if (!(iotlb.perm & (1 << is_write))) {
 454            goto unassigned;
 455        }
 456
 457        addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
 458                | (addr & iotlb.addr_mask));
 459        page_mask &= iotlb.addr_mask;
 460        *plen_out = MIN(*plen_out, (addr | iotlb.addr_mask) - addr + 1);
 461        *target_as = iotlb.target_as;
 462
 463        section = address_space_translate_internal(
 464                address_space_to_dispatch(iotlb.target_as), addr, xlat,
 465                plen_out, is_mmio);
 466
 467        iommu_mr = memory_region_get_iommu(section->mr);
 468    } while (unlikely(iommu_mr));
 469
 470    if (page_mask_out) {
 471        *page_mask_out = page_mask;
 472    }
 473    return *section;
 474
 475unassigned:
 476    return (MemoryRegionSection) { .mr = &io_mem_unassigned };
 477}
 478
 479/**
 480 * flatview_do_translate - translate an address in FlatView
 481 *
 482 * @fv: the flat view that we want to translate on
 483 * @addr: the address to be translated in above address space
  484 * @xlat: the translated address offset within the memory region. It
  485 *        cannot be %NULL.
  486 * @plen_out: valid read/write length of the translated address. It
  487 *            can be %NULL when we don't care about it.
  488 * @page_mask_out: page mask for the translated address. This is only
  489 *            meaningful for IOMMU-translated addresses, since the IOMMU
  490 *            may map pages larger than TARGET_PAGE_SIZE (e.g. huge
  491 *            pages). It can be %NULL if we don't care about it.
  492 * @is_write: whether the translation operation is for a write
  493 * @is_mmio: whether this access may be MMIO; set to true if it can be
 494 * @target_as: the address space targeted by the IOMMU
 495 * @attrs: memory transaction attributes
 496 *
 497 * This function is called from RCU critical section
 498 */
 499static MemoryRegionSection flatview_do_translate(FlatView *fv,
 500                                                 hwaddr addr,
 501                                                 hwaddr *xlat,
 502                                                 hwaddr *plen_out,
 503                                                 hwaddr *page_mask_out,
 504                                                 bool is_write,
 505                                                 bool is_mmio,
 506                                                 AddressSpace **target_as,
 507                                                 MemTxAttrs attrs)
 508{
 509    MemoryRegionSection *section;
 510    IOMMUMemoryRegion *iommu_mr;
 511    hwaddr plen = (hwaddr)(-1);
 512
 513    if (!plen_out) {
 514        plen_out = &plen;
 515    }
 516
 517    section = address_space_translate_internal(
 518            flatview_to_dispatch(fv), addr, xlat,
 519            plen_out, is_mmio);
 520
 521    iommu_mr = memory_region_get_iommu(section->mr);
 522    if (unlikely(iommu_mr)) {
 523        return address_space_translate_iommu(iommu_mr, xlat,
 524                                             plen_out, page_mask_out,
 525                                             is_write, is_mmio,
 526                                             target_as, attrs);
 527    }
 528    if (page_mask_out) {
 529        /* Not behind an IOMMU, use default page size. */
 530        *page_mask_out = ~TARGET_PAGE_MASK;
 531    }
 532
 533    return *section;
 534}
 535
 536/* Called from RCU critical section */
 537IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr,
 538                                            bool is_write, MemTxAttrs attrs)
 539{
 540    MemoryRegionSection section;
 541    hwaddr xlat, page_mask;
 542
  543    /*
  544     * This can never be MMIO; we don't really care about plen,
  545     * only about the page mask.
  546     */
 547    section = flatview_do_translate(address_space_to_flatview(as), addr, &xlat,
 548                                    NULL, &page_mask, is_write, false, &as,
 549                                    attrs);
 550
 551    /* Illegal translation */
 552    if (section.mr == &io_mem_unassigned) {
 553        goto iotlb_fail;
 554    }
 555
 556    /* Convert memory region offset into address space offset */
 557    xlat += section.offset_within_address_space -
 558        section.offset_within_region;
 559
 560    return (IOMMUTLBEntry) {
 561        .target_as = as,
 562        .iova = addr & ~page_mask,
 563        .translated_addr = xlat & ~page_mask,
 564        .addr_mask = page_mask,
  565        /* IOTLBs are for DMA, and DMA is only allowed to RAM. */
 566        .perm = IOMMU_RW,
 567    };
 568
 569iotlb_fail:
 570    return (IOMMUTLBEntry) {0};
 571}
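/* Illustrative use only (not from this file): a DMA helper that needs the
 * guest-IOVA -> address-space mapping, e.g. a vhost-style device, might do
 * something like
 *
 *     IOMMUTLBEntry entry;
 *     entry = address_space_get_iotlb_entry(as, iova, true,
 *                                           MEMTXATTRS_UNSPECIFIED);
 *     if (entry.perm & IOMMU_WO) {
 *         use(entry.translated_addr | (iova & entry.addr_mask));
 *     }
 *
 * where 'as', 'iova' and 'use()' are placeholders for whatever the caller has.
 */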
 572
 573/* Called from RCU critical section */
 574MemoryRegion *flatview_translate(FlatView *fv, hwaddr addr, hwaddr *xlat,
 575                                 hwaddr *plen, bool is_write,
 576                                 MemTxAttrs attrs)
 577{
 578    MemoryRegion *mr;
 579    MemoryRegionSection section;
 580    AddressSpace *as = NULL;
 581
 582    /* This can be MMIO, so setup MMIO bit. */
 583    section = flatview_do_translate(fv, addr, xlat, plen, NULL,
 584                                    is_write, true, &as, attrs);
 585    mr = section.mr;
 586
 587    if (xen_enabled() && memory_access_is_direct(mr, is_write)) {
 588        hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr;
 589        *plen = MIN(page, *plen);
 590    }
 591
 592    return mr;
 593}
 594
 595typedef struct TCGIOMMUNotifier {
 596    IOMMUNotifier n;
 597    MemoryRegion *mr;
 598    CPUState *cpu;
 599    int iommu_idx;
 600    bool active;
 601} TCGIOMMUNotifier;
 602
 603static void tcg_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
 604{
 605    TCGIOMMUNotifier *notifier = container_of(n, TCGIOMMUNotifier, n);
 606
 607    if (!notifier->active) {
 608        return;
 609    }
 610    tlb_flush(notifier->cpu);
 611    notifier->active = false;
 612    /* We leave the notifier struct on the list to avoid reallocating it later.
 613     * Generally the number of IOMMUs a CPU deals with will be small.
 614     * In any case we can't unregister the iommu notifier from a notify
 615     * callback.
 616     */
 617}
 618
 619static void tcg_register_iommu_notifier(CPUState *cpu,
 620                                        IOMMUMemoryRegion *iommu_mr,
 621                                        int iommu_idx)
 622{
 623    /* Make sure this CPU has an IOMMU notifier registered for this
 624     * IOMMU/IOMMU index combination, so that we can flush its TLB
 625     * when the IOMMU tells us the mappings we've cached have changed.
 626     */
 627    MemoryRegion *mr = MEMORY_REGION(iommu_mr);
 628    TCGIOMMUNotifier *notifier;
 629    Error *err = NULL;
 630    int i, ret;
 631
 632    for (i = 0; i < cpu->iommu_notifiers->len; i++) {
 633        notifier = g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i);
 634        if (notifier->mr == mr && notifier->iommu_idx == iommu_idx) {
 635            break;
 636        }
 637    }
 638    if (i == cpu->iommu_notifiers->len) {
 639        /* Not found, add a new entry at the end of the array */
 640        cpu->iommu_notifiers = g_array_set_size(cpu->iommu_notifiers, i + 1);
 641        notifier = g_new0(TCGIOMMUNotifier, 1);
 642        g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i) = notifier;
 643
 644        notifier->mr = mr;
 645        notifier->iommu_idx = iommu_idx;
 646        notifier->cpu = cpu;
 647        /* Rather than trying to register interest in the specific part
 648         * of the iommu's address space that we've accessed and then
 649         * expand it later as subsequent accesses touch more of it, we
 650         * just register interest in the whole thing, on the assumption
 651         * that iommu reconfiguration will be rare.
 652         */
 653        iommu_notifier_init(&notifier->n,
 654                            tcg_iommu_unmap_notify,
 655                            IOMMU_NOTIFIER_UNMAP,
 656                            0,
 657                            HWADDR_MAX,
 658                            iommu_idx);
 659        ret = memory_region_register_iommu_notifier(notifier->mr, &notifier->n,
 660                                                    &err);
 661        if (ret) {
 662            error_report_err(err);
 663            exit(1);
 664        }
 665    }
 666
 667    if (!notifier->active) {
 668        notifier->active = true;
 669    }
 670}
 671
 672static void tcg_iommu_free_notifier_list(CPUState *cpu)
 673{
 674    /* Destroy the CPU's notifier list */
 675    int i;
 676    TCGIOMMUNotifier *notifier;
 677
 678    for (i = 0; i < cpu->iommu_notifiers->len; i++) {
 679        notifier = g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i);
 680        memory_region_unregister_iommu_notifier(notifier->mr, &notifier->n);
 681        g_free(notifier);
 682    }
 683    g_array_free(cpu->iommu_notifiers, true);
 684}
 685
 686/* Called from RCU critical section */
 687MemoryRegionSection *
 688address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr,
 689                                  hwaddr *xlat, hwaddr *plen,
 690                                  MemTxAttrs attrs, int *prot)
 691{
 692    MemoryRegionSection *section;
 693    IOMMUMemoryRegion *iommu_mr;
 694    IOMMUMemoryRegionClass *imrc;
 695    IOMMUTLBEntry iotlb;
 696    int iommu_idx;
 697    AddressSpaceDispatch *d = atomic_rcu_read(&cpu->cpu_ases[asidx].memory_dispatch);
 698
 699    for (;;) {
 700        section = address_space_translate_internal(d, addr, &addr, plen, false);
 701
 702        iommu_mr = memory_region_get_iommu(section->mr);
 703        if (!iommu_mr) {
 704            break;
 705        }
 706
 707        imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
 708
 709        iommu_idx = imrc->attrs_to_index(iommu_mr, attrs);
 710        tcg_register_iommu_notifier(cpu, iommu_mr, iommu_idx);
 711        /* We need all the permissions, so pass IOMMU_NONE so the IOMMU
 712         * doesn't short-cut its translation table walk.
 713         */
 714        iotlb = imrc->translate(iommu_mr, addr, IOMMU_NONE, iommu_idx);
 715        addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
 716                | (addr & iotlb.addr_mask));
 717        /* Update the caller's prot bits to remove permissions the IOMMU
 718         * is giving us a failure response for. If we get down to no
 719         * permissions left at all we can give up now.
 720         */
 721        if (!(iotlb.perm & IOMMU_RO)) {
 722            *prot &= ~(PAGE_READ | PAGE_EXEC);
 723        }
 724        if (!(iotlb.perm & IOMMU_WO)) {
 725            *prot &= ~PAGE_WRITE;
 726        }
 727
 728        if (!*prot) {
 729            goto translate_fail;
 730        }
 731
 732        d = flatview_to_dispatch(address_space_to_flatview(iotlb.target_as));
 733    }
 734
 735    assert(!memory_region_is_iommu(section->mr));
 736    *xlat = addr;
 737    return section;
 738
 739translate_fail:
 740    return &d->map.sections[PHYS_SECTION_UNASSIGNED];
 741}
 742#endif
 743
 744#if !defined(CONFIG_USER_ONLY)
 745
 746static int cpu_common_post_load(void *opaque, int version_id)
 747{
 748    CPUState *cpu = opaque;
 749
 750    /* 0x01 was CPU_INTERRUPT_EXIT. This line can be removed when the
 751       version_id is increased. */
 752    cpu->interrupt_request &= ~0x01;
 753    tlb_flush(cpu);
 754
 755    /* loadvm has just updated the content of RAM, bypassing the
 756     * usual mechanisms that ensure we flush TBs for writes to
 757     * memory we've translated code from. So we must flush all TBs,
 758     * which will now be stale.
 759     */
 760    tb_flush(cpu);
 761
 762    return 0;
 763}
 764
 765static int cpu_common_pre_load(void *opaque)
 766{
 767    CPUState *cpu = opaque;
 768
 769    cpu->exception_index = -1;
 770
 771    return 0;
 772}
 773
 774static bool cpu_common_exception_index_needed(void *opaque)
 775{
 776    CPUState *cpu = opaque;
 777
 778    return tcg_enabled() && cpu->exception_index != -1;
 779}
 780
 781static const VMStateDescription vmstate_cpu_common_exception_index = {
 782    .name = "cpu_common/exception_index",
 783    .version_id = 1,
 784    .minimum_version_id = 1,
 785    .needed = cpu_common_exception_index_needed,
 786    .fields = (VMStateField[]) {
 787        VMSTATE_INT32(exception_index, CPUState),
 788        VMSTATE_END_OF_LIST()
 789    }
 790};
 791
 792static bool cpu_common_crash_occurred_needed(void *opaque)
 793{
 794    CPUState *cpu = opaque;
 795
 796    return cpu->crash_occurred;
 797}
 798
 799static const VMStateDescription vmstate_cpu_common_crash_occurred = {
 800    .name = "cpu_common/crash_occurred",
 801    .version_id = 1,
 802    .minimum_version_id = 1,
 803    .needed = cpu_common_crash_occurred_needed,
 804    .fields = (VMStateField[]) {
 805        VMSTATE_BOOL(crash_occurred, CPUState),
 806        VMSTATE_END_OF_LIST()
 807    }
 808};
 809
 810const VMStateDescription vmstate_cpu_common = {
 811    .name = "cpu_common",
 812    .version_id = 1,
 813    .minimum_version_id = 1,
 814    .pre_load = cpu_common_pre_load,
 815    .post_load = cpu_common_post_load,
 816    .fields = (VMStateField[]) {
 817        VMSTATE_UINT32(halted, CPUState),
 818        VMSTATE_UINT32(interrupt_request, CPUState),
 819        VMSTATE_END_OF_LIST()
 820    },
 821    .subsections = (const VMStateDescription*[]) {
 822        &vmstate_cpu_common_exception_index,
 823        &vmstate_cpu_common_crash_occurred,
 824        NULL
 825    }
 826};
 827
 828#endif
 829
 830CPUState *qemu_get_cpu(int index)
 831{
 832    CPUState *cpu;
 833
 834    CPU_FOREACH(cpu) {
 835        if (cpu->cpu_index == index) {
 836            return cpu;
 837        }
 838    }
 839
 840    return NULL;
 841}
 842
 843#if !defined(CONFIG_USER_ONLY)
 844void cpu_address_space_init(CPUState *cpu, int asidx,
 845                            const char *prefix, MemoryRegion *mr)
 846{
 847    CPUAddressSpace *newas;
 848    AddressSpace *as = g_new0(AddressSpace, 1);
 849    char *as_name;
 850
 851    assert(mr);
 852    as_name = g_strdup_printf("%s-%d", prefix, cpu->cpu_index);
 853    address_space_init(as, mr, as_name);
 854    g_free(as_name);
 855
 856    /* Target code should have set num_ases before calling us */
 857    assert(asidx < cpu->num_ases);
 858
 859    if (asidx == 0) {
 860        /* address space 0 gets the convenience alias */
 861        cpu->as = as;
 862    }
 863
 864    /* KVM cannot currently support multiple address spaces. */
 865    assert(asidx == 0 || !kvm_enabled());
 866
 867    if (!cpu->cpu_ases) {
 868        cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases);
 869    }
 870
 871    newas = &cpu->cpu_ases[asidx];
 872    newas->cpu = cpu;
 873    newas->as = as;
 874    if (tcg_enabled()) {
 875        newas->tcg_as_listener.log_global_after_sync = tcg_log_global_after_sync;
 876        newas->tcg_as_listener.commit = tcg_commit;
 877        memory_listener_register(&newas->tcg_as_listener, as);
 878    }
 879}
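/* Illustrative only: a target with both a normal and a "secure" physical
 * address space might set cpu->num_ases = 2 in its realize function and then
 * call, for example,
 *
 *     cpu_address_space_init(cs, 0, "cpu-memory", get_system_memory());
 *     cpu_address_space_init(cs, 1, "cpu-secure-memory", secure_mr);
 *
 * where 'secure_mr' stands for a MemoryRegion supplied by the target (the
 * names here are hypothetical).
 */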
 880
 881AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx)
 882{
 883    /* Return the AddressSpace corresponding to the specified index */
 884    return cpu->cpu_ases[asidx].as;
 885}
 886#endif
 887
 888void cpu_exec_unrealizefn(CPUState *cpu)
 889{
 890    CPUClass *cc = CPU_GET_CLASS(cpu);
 891
 892    cpu_list_remove(cpu);
 893
 894    if (cc->vmsd != NULL) {
 895        vmstate_unregister(NULL, cc->vmsd, cpu);
 896    }
 897    if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
 898        vmstate_unregister(NULL, &vmstate_cpu_common, cpu);
 899    }
 900#ifndef CONFIG_USER_ONLY
 901    tcg_iommu_free_notifier_list(cpu);
 902#endif
 903}
 904
 905Property cpu_common_props[] = {
 906#ifndef CONFIG_USER_ONLY
  907    /* Create a memory property for the softmmu CPU object,
 908     * so users can wire up its memory. (This can't go in hw/core/cpu.c
 909     * because that file is compiled only once for both user-mode
 910     * and system builds.) The default if no link is set up is to use
 911     * the system address space.
 912     */
 913    DEFINE_PROP_LINK("memory", CPUState, memory, TYPE_MEMORY_REGION,
 914                     MemoryRegion *),
 915#endif
 916    DEFINE_PROP_END_OF_LIST(),
 917};
 918
 919void cpu_exec_initfn(CPUState *cpu)
 920{
 921    cpu->as = NULL;
 922    cpu->num_ases = 0;
 923
 924#ifndef CONFIG_USER_ONLY
 925    cpu->thread_id = qemu_get_thread_id();
 926    cpu->memory = system_memory;
 927    object_ref(OBJECT(cpu->memory));
 928#endif
 929}
 930
 931void cpu_exec_realizefn(CPUState *cpu, Error **errp)
 932{
 933    CPUClass *cc = CPU_GET_CLASS(cpu);
 934    static bool tcg_target_initialized;
 935
 936    cpu_list_add(cpu);
 937
 938    if (tcg_enabled() && !tcg_target_initialized) {
 939        tcg_target_initialized = true;
 940        cc->tcg_initialize();
 941    }
 942    tlb_init(cpu);
 943
 944    qemu_plugin_vcpu_init_hook(cpu);
 945
 946#ifndef CONFIG_USER_ONLY
 947    if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
 948        vmstate_register(NULL, cpu->cpu_index, &vmstate_cpu_common, cpu);
 949    }
 950    if (cc->vmsd != NULL) {
 951        vmstate_register(NULL, cpu->cpu_index, cc->vmsd, cpu);
 952    }
 953
 954    cpu->iommu_notifiers = g_array_new(false, true, sizeof(TCGIOMMUNotifier *));
 955#endif
 956}
 957
 958const char *parse_cpu_option(const char *cpu_option)
 959{
 960    ObjectClass *oc;
 961    CPUClass *cc;
 962    gchar **model_pieces;
 963    const char *cpu_type;
 964
 965    model_pieces = g_strsplit(cpu_option, ",", 2);
 966    if (!model_pieces[0]) {
 967        error_report("-cpu option cannot be empty");
 968        exit(1);
 969    }
 970
 971    oc = cpu_class_by_name(CPU_RESOLVING_TYPE, model_pieces[0]);
 972    if (oc == NULL) {
 973        error_report("unable to find CPU model '%s'", model_pieces[0]);
 974        g_strfreev(model_pieces);
 975        exit(EXIT_FAILURE);
 976    }
 977
 978    cpu_type = object_class_get_name(oc);
 979    cc = CPU_CLASS(oc);
 980    cc->parse_features(cpu_type, model_pieces[1], &error_fatal);
 981    g_strfreev(model_pieces);
 982    return cpu_type;
 983}
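/* For example (model name purely illustrative), parse_cpu_option("qemu64,+ssse3")
 * resolves the class for "qemu64" via cpu_class_by_name(), hands "+ssse3" to the
 * class's parse_features() hook, and returns the QOM type name of the CPU class.
 */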
 984
 985#if defined(CONFIG_USER_ONLY)
 986void tb_invalidate_phys_addr(target_ulong addr)
 987{
 988    mmap_lock();
 989    tb_invalidate_phys_page_range(addr, addr + 1);
 990    mmap_unlock();
 991}
 992
 993static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
 994{
 995    tb_invalidate_phys_addr(pc);
 996}
 997#else
 998void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr, MemTxAttrs attrs)
 999{
1000    ram_addr_t ram_addr;
1001    MemoryRegion *mr;
1002    hwaddr l = 1;
1003
1004    if (!tcg_enabled()) {
1005        return;
1006    }
1007
1008    RCU_READ_LOCK_GUARD();
1009    mr = address_space_translate(as, addr, &addr, &l, false, attrs);
1010    if (!(memory_region_is_ram(mr)
1011          || memory_region_is_romd(mr))) {
1012        return;
1013    }
1014    ram_addr = memory_region_get_ram_addr(mr) + addr;
1015    tb_invalidate_phys_page_range(ram_addr, ram_addr + 1);
1016}
1017
1018static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
1019{
1020    MemTxAttrs attrs;
1021    hwaddr phys = cpu_get_phys_page_attrs_debug(cpu, pc, &attrs);
1022    int asidx = cpu_asidx_from_attrs(cpu, attrs);
1023    if (phys != -1) {
1024        /* Locks grabbed by tb_invalidate_phys_addr */
1025        tb_invalidate_phys_addr(cpu->cpu_ases[asidx].as,
1026                                phys | (pc & ~TARGET_PAGE_MASK), attrs);
1027    }
1028}
1029#endif
1030
1031#ifndef CONFIG_USER_ONLY
1032/* Add a watchpoint.  */
1033int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
1034                          int flags, CPUWatchpoint **watchpoint)
1035{
1036    CPUWatchpoint *wp;
1037
1038    /* forbid ranges which are empty or run off the end of the address space */
1039    if (len == 0 || (addr + len - 1) < addr) {
1040        error_report("tried to set invalid watchpoint at %"
1041                     VADDR_PRIx ", len=%" VADDR_PRIu, addr, len);
1042        return -EINVAL;
1043    }
1044    wp = g_malloc(sizeof(*wp));
1045
1046    wp->vaddr = addr;
1047    wp->len = len;
1048    wp->flags = flags;
1049
1050    /* keep all GDB-injected watchpoints in front */
1051    if (flags & BP_GDB) {
1052        QTAILQ_INSERT_HEAD(&cpu->watchpoints, wp, entry);
1053    } else {
1054        QTAILQ_INSERT_TAIL(&cpu->watchpoints, wp, entry);
1055    }
1056
1057    tlb_flush_page(cpu, addr);
1058
1059    if (watchpoint)
1060        *watchpoint = wp;
1061    return 0;
1062}
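/* Illustrative only: the GDB stub would set a 4-byte write watchpoint with
 * something like
 *
 *     cpu_watchpoint_insert(cpu, addr, 4, BP_GDB | BP_MEM_WRITE, NULL);
 *
 * and the BP_GDB flag keeps it ahead of CPU-internal watchpoints in the list.
 */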
1063
1064/* Remove a specific watchpoint.  */
1065int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
1066                          int flags)
1067{
1068    CPUWatchpoint *wp;
1069
1070    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
1071        if (addr == wp->vaddr && len == wp->len
1072                && flags == (wp->flags & ~BP_WATCHPOINT_HIT)) {
1073            cpu_watchpoint_remove_by_ref(cpu, wp);
1074            return 0;
1075        }
1076    }
1077    return -ENOENT;
1078}
1079
1080/* Remove a specific watchpoint by reference.  */
1081void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
1082{
1083    QTAILQ_REMOVE(&cpu->watchpoints, watchpoint, entry);
1084
1085    tlb_flush_page(cpu, watchpoint->vaddr);
1086
1087    g_free(watchpoint);
1088}
1089
1090/* Remove all matching watchpoints.  */
1091void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
1092{
1093    CPUWatchpoint *wp, *next;
1094
1095    QTAILQ_FOREACH_SAFE(wp, &cpu->watchpoints, entry, next) {
1096        if (wp->flags & mask) {
1097            cpu_watchpoint_remove_by_ref(cpu, wp);
1098        }
1099    }
1100}
1101
1102/* Return true if this watchpoint address matches the specified
 1103 * access (i.e. the address range covered by the watchpoint overlaps
1104 * partially or completely with the address range covered by the
1105 * access).
1106 */
1107static inline bool watchpoint_address_matches(CPUWatchpoint *wp,
1108                                              vaddr addr, vaddr len)
1109{
1110    /* We know the lengths are non-zero, but a little caution is
1111     * required to avoid errors in the case where the range ends
1112     * exactly at the top of the address space and so addr + len
1113     * wraps round to zero.
1114     */
1115    vaddr wpend = wp->vaddr + wp->len - 1;
1116    vaddr addrend = addr + len - 1;
1117
1118    return !(addr > wpend || wp->vaddr > addrend);
1119}
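/* Worked example: a watchpoint with vaddr == 0x1000 and len == 4 has
 * wpend == 0x1003; an access with addr == 0x1002 and len == 8 has
 * addrend == 0x1009.  Neither addr > wpend nor vaddr > addrend holds,
 * so the ranges overlap and the function returns true.
 */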
1120
 1121/* Return the combined flags of all watchpoints that match addr + len.  */
1122int cpu_watchpoint_address_matches(CPUState *cpu, vaddr addr, vaddr len)
1123{
1124    CPUWatchpoint *wp;
1125    int ret = 0;
1126
1127    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
1128        if (watchpoint_address_matches(wp, addr, TARGET_PAGE_SIZE)) {
1129            ret |= wp->flags;
1130        }
1131    }
1132    return ret;
1133}
1134#endif /* !CONFIG_USER_ONLY */
1135
1136/* Add a breakpoint.  */
1137int cpu_breakpoint_insert(CPUState *cpu, vaddr pc, int flags,
1138                          CPUBreakpoint **breakpoint)
1139{
1140    CPUBreakpoint *bp;
1141
1142    bp = g_malloc(sizeof(*bp));
1143
1144    bp->pc = pc;
1145    bp->flags = flags;
1146
1147    /* keep all GDB-injected breakpoints in front */
1148    if (flags & BP_GDB) {
1149        QTAILQ_INSERT_HEAD(&cpu->breakpoints, bp, entry);
1150    } else {
1151        QTAILQ_INSERT_TAIL(&cpu->breakpoints, bp, entry);
1152    }
1153
1154    breakpoint_invalidate(cpu, pc);
1155
1156    if (breakpoint) {
1157        *breakpoint = bp;
1158    }
1159    return 0;
1160}
1161
1162/* Remove a specific breakpoint.  */
1163int cpu_breakpoint_remove(CPUState *cpu, vaddr pc, int flags)
1164{
1165    CPUBreakpoint *bp;
1166
1167    QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
1168        if (bp->pc == pc && bp->flags == flags) {
1169            cpu_breakpoint_remove_by_ref(cpu, bp);
1170            return 0;
1171        }
1172    }
1173    return -ENOENT;
1174}
1175
1176/* Remove a specific breakpoint by reference.  */
1177void cpu_breakpoint_remove_by_ref(CPUState *cpu, CPUBreakpoint *breakpoint)
1178{
1179    QTAILQ_REMOVE(&cpu->breakpoints, breakpoint, entry);
1180
1181    breakpoint_invalidate(cpu, breakpoint->pc);
1182
1183    g_free(breakpoint);
1184}
1185
1186/* Remove all matching breakpoints. */
1187void cpu_breakpoint_remove_all(CPUState *cpu, int mask)
1188{
1189    CPUBreakpoint *bp, *next;
1190
1191    QTAILQ_FOREACH_SAFE(bp, &cpu->breakpoints, entry, next) {
1192        if (bp->flags & mask) {
1193            cpu_breakpoint_remove_by_ref(cpu, bp);
1194        }
1195    }
1196}
1197
1198/* enable or disable single step mode. EXCP_DEBUG is returned by the
1199   CPU loop after each instruction */
1200void cpu_single_step(CPUState *cpu, int enabled)
1201{
1202    if (cpu->singlestep_enabled != enabled) {
1203        cpu->singlestep_enabled = enabled;
1204        if (kvm_enabled()) {
1205            kvm_update_guest_debug(cpu, 0);
1206        } else {
1207            /* must flush all the translated code to avoid inconsistencies */
1208            /* XXX: only flush what is necessary */
1209            tb_flush(cpu);
1210        }
1211    }
1212}
1213
1214void cpu_abort(CPUState *cpu, const char *fmt, ...)
1215{
1216    va_list ap;
1217    va_list ap2;
1218
1219    va_start(ap, fmt);
1220    va_copy(ap2, ap);
1221    fprintf(stderr, "qemu: fatal: ");
1222    vfprintf(stderr, fmt, ap);
1223    fprintf(stderr, "\n");
1224    cpu_dump_state(cpu, stderr, CPU_DUMP_FPU | CPU_DUMP_CCOP);
1225    if (qemu_log_separate()) {
1226        qemu_log_lock();
1227        qemu_log("qemu: fatal: ");
1228        qemu_log_vprintf(fmt, ap2);
1229        qemu_log("\n");
1230        log_cpu_state(cpu, CPU_DUMP_FPU | CPU_DUMP_CCOP);
1231        qemu_log_flush();
1232        qemu_log_unlock();
1233        qemu_log_close();
1234    }
1235    va_end(ap2);
1236    va_end(ap);
1237    replay_finish();
1238#if defined(CONFIG_USER_ONLY)
1239    {
1240        struct sigaction act;
1241        sigfillset(&act.sa_mask);
1242        act.sa_handler = SIG_DFL;
1243        act.sa_flags = 0;
1244        sigaction(SIGABRT, &act, NULL);
1245    }
1246#endif
1247    abort();
1248}
1249
1250#if !defined(CONFIG_USER_ONLY)
1251/* Called from RCU critical section */
1252static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
1253{
1254    RAMBlock *block;
1255
1256    block = atomic_rcu_read(&ram_list.mru_block);
1257    if (block && addr - block->offset < block->max_length) {
1258        return block;
1259    }
1260    RAMBLOCK_FOREACH(block) {
1261        if (addr - block->offset < block->max_length) {
1262            goto found;
1263        }
1264    }
1265
1266    fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
1267    abort();
1268
1269found:
1270    /* It is safe to write mru_block outside the iothread lock.  This
1271     * is what happens:
1272     *
1273     *     mru_block = xxx
1274     *     rcu_read_unlock()
1275     *                                        xxx removed from list
1276     *                  rcu_read_lock()
1277     *                  read mru_block
1278     *                                        mru_block = NULL;
1279     *                                        call_rcu(reclaim_ramblock, xxx);
1280     *                  rcu_read_unlock()
1281     *
1282     * atomic_rcu_set is not needed here.  The block was already published
1283     * when it was placed into the list.  Here we're just making an extra
1284     * copy of the pointer.
1285     */
1286    ram_list.mru_block = block;
1287    return block;
1288}
1289
1290static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
1291{
1292    CPUState *cpu;
1293    ram_addr_t start1;
1294    RAMBlock *block;
1295    ram_addr_t end;
1296
1297    assert(tcg_enabled());
1298    end = TARGET_PAGE_ALIGN(start + length);
1299    start &= TARGET_PAGE_MASK;
1300
1301    RCU_READ_LOCK_GUARD();
1302    block = qemu_get_ram_block(start);
1303    assert(block == qemu_get_ram_block(end - 1));
1304    start1 = (uintptr_t)ramblock_ptr(block, start - block->offset);
1305    CPU_FOREACH(cpu) {
1306        tlb_reset_dirty(cpu, start1, length);
1307    }
1308}
1309
1310/* Note: start and end must be within the same ram block.  */
1311bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
1312                                              ram_addr_t length,
1313                                              unsigned client)
1314{
1315    DirtyMemoryBlocks *blocks;
1316    unsigned long end, page;
1317    bool dirty = false;
1318    RAMBlock *ramblock;
1319    uint64_t mr_offset, mr_size;
1320
1321    if (length == 0) {
1322        return false;
1323    }
1324
1325    end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
1326    page = start >> TARGET_PAGE_BITS;
1327
1328    WITH_RCU_READ_LOCK_GUARD() {
1329        blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
1330        ramblock = qemu_get_ram_block(start);
1331        /* Range sanity check on the ramblock */
1332        assert(start >= ramblock->offset &&
1333               start + length <= ramblock->offset + ramblock->used_length);
1334
1335        while (page < end) {
1336            unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
1337            unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
1338            unsigned long num = MIN(end - page,
1339                                    DIRTY_MEMORY_BLOCK_SIZE - offset);
1340
1341            dirty |= bitmap_test_and_clear_atomic(blocks->blocks[idx],
1342                                                  offset, num);
1343            page += num;
1344        }
1345
1346        mr_offset = (ram_addr_t)(page << TARGET_PAGE_BITS) - ramblock->offset;
1347        mr_size = (end - page) << TARGET_PAGE_BITS;
1348        memory_region_clear_dirty_bitmap(ramblock->mr, mr_offset, mr_size);
1349    }
1350
1351    if (dirty && tcg_enabled()) {
1352        tlb_reset_dirty_range_all(start, length);
1353    }
1354
1355    return dirty;
1356}
1357
1358DirtyBitmapSnapshot *cpu_physical_memory_snapshot_and_clear_dirty
1359    (MemoryRegion *mr, hwaddr offset, hwaddr length, unsigned client)
1360{
1361    DirtyMemoryBlocks *blocks;
1362    ram_addr_t start = memory_region_get_ram_addr(mr) + offset;
1363    unsigned long align = 1UL << (TARGET_PAGE_BITS + BITS_PER_LEVEL);
1364    ram_addr_t first = QEMU_ALIGN_DOWN(start, align);
1365    ram_addr_t last  = QEMU_ALIGN_UP(start + length, align);
1366    DirtyBitmapSnapshot *snap;
1367    unsigned long page, end, dest;
1368
1369    snap = g_malloc0(sizeof(*snap) +
1370                     ((last - first) >> (TARGET_PAGE_BITS + 3)));
1371    snap->start = first;
1372    snap->end   = last;
1373
1374    page = first >> TARGET_PAGE_BITS;
1375    end  = last  >> TARGET_PAGE_BITS;
1376    dest = 0;
1377
1378    WITH_RCU_READ_LOCK_GUARD() {
1379        blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
1380
1381        while (page < end) {
1382            unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
1383            unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
1384            unsigned long num = MIN(end - page,
1385                                    DIRTY_MEMORY_BLOCK_SIZE - offset);
1386
1387            assert(QEMU_IS_ALIGNED(offset, (1 << BITS_PER_LEVEL)));
1388            assert(QEMU_IS_ALIGNED(num,    (1 << BITS_PER_LEVEL)));
1389            offset >>= BITS_PER_LEVEL;
1390
1391            bitmap_copy_and_clear_atomic(snap->dirty + dest,
1392                                         blocks->blocks[idx] + offset,
1393                                         num);
1394            page += num;
1395            dest += num >> BITS_PER_LEVEL;
1396        }
1397    }
1398
1399    if (tcg_enabled()) {
1400        tlb_reset_dirty_range_all(start, length);
1401    }
1402
1403    memory_region_clear_dirty_bitmap(mr, offset, length);
1404
1405    return snap;
1406}
1407
1408bool cpu_physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap,
1409                                            ram_addr_t start,
1410                                            ram_addr_t length)
1411{
1412    unsigned long page, end;
1413
1414    assert(start >= snap->start);
1415    assert(start + length <= snap->end);
1416
1417    end = TARGET_PAGE_ALIGN(start + length - snap->start) >> TARGET_PAGE_BITS;
1418    page = (start - snap->start) >> TARGET_PAGE_BITS;
1419
1420    while (page < end) {
1421        if (test_bit(page, snap->dirty)) {
1422            return true;
1423        }
1424        page++;
1425    }
1426    return false;
1427}
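/* Illustrative only: a display device could combine the two helpers above as
 *
 *     DirtyBitmapSnapshot *snap;
 *     ram_addr_t base = memory_region_get_ram_addr(mr);
 *     snap = cpu_physical_memory_snapshot_and_clear_dirty(mr, 0, size,
 *                                                         DIRTY_MEMORY_VGA);
 *     if (cpu_physical_memory_snapshot_get_dirty(snap, base + ofs, len)) {
 *         ... redraw the scanlines backed by [ofs, ofs + len) ...
 *     }
 *     g_free(snap);
 *
 * In practice callers use the memory_region_snapshot_* wrappers, which hide
 * the ram_addr_t arithmetic.
 */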
1428
1429/* Called from RCU critical section */
1430hwaddr memory_region_section_get_iotlb(CPUState *cpu,
1431                                       MemoryRegionSection *section)
1432{
1433    AddressSpaceDispatch *d = flatview_to_dispatch(section->fv);
1434    return section - d->map.sections;
1435}
1436#endif /* defined(CONFIG_USER_ONLY) */
1437
1438#if !defined(CONFIG_USER_ONLY)
1439
1440static int subpage_register(subpage_t *mmio, uint32_t start, uint32_t end,
1441                            uint16_t section);
1442static subpage_t *subpage_init(FlatView *fv, hwaddr base);
1443
1444static void *(*phys_mem_alloc)(size_t size, uint64_t *align, bool shared) =
1445                               qemu_anon_ram_alloc;
1446
1447/*
 1448 * Set a custom physical guest memory allocator.
1449 * Accelerators with unusual needs may need this.  Hopefully, we can
1450 * get rid of it eventually.
1451 */
1452void phys_mem_set_alloc(void *(*alloc)(size_t, uint64_t *align, bool shared))
1453{
1454    phys_mem_alloc = alloc;
1455}
1456
1457static uint16_t phys_section_add(PhysPageMap *map,
1458                                 MemoryRegionSection *section)
1459{
1460    /* The physical section number is ORed with a page-aligned
1461     * pointer to produce the iotlb entries.  Thus it should
1462     * never overflow into the page-aligned value.
1463     */
1464    assert(map->sections_nb < TARGET_PAGE_SIZE);
1465
1466    if (map->sections_nb == map->sections_nb_alloc) {
1467        map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
1468        map->sections = g_renew(MemoryRegionSection, map->sections,
1469                                map->sections_nb_alloc);
1470    }
1471    map->sections[map->sections_nb] = *section;
1472    memory_region_ref(section->mr);
1473    return map->sections_nb++;
1474}
1475
1476static void phys_section_destroy(MemoryRegion *mr)
1477{
1478    bool have_sub_page = mr->subpage;
1479
1480    memory_region_unref(mr);
1481
1482    if (have_sub_page) {
1483        subpage_t *subpage = container_of(mr, subpage_t, iomem);
1484        object_unref(OBJECT(&subpage->iomem));
1485        g_free(subpage);
1486    }
1487}
1488
1489static void phys_sections_free(PhysPageMap *map)
1490{
1491    while (map->sections_nb > 0) {
1492        MemoryRegionSection *section = &map->sections[--map->sections_nb];
1493        phys_section_destroy(section->mr);
1494    }
1495    g_free(map->sections);
1496    g_free(map->nodes);
1497}
1498
1499static void register_subpage(FlatView *fv, MemoryRegionSection *section)
1500{
1501    AddressSpaceDispatch *d = flatview_to_dispatch(fv);
1502    subpage_t *subpage;
1503    hwaddr base = section->offset_within_address_space
1504        & TARGET_PAGE_MASK;
1505    MemoryRegionSection *existing = phys_page_find(d, base);
1506    MemoryRegionSection subsection = {
1507        .offset_within_address_space = base,
1508        .size = int128_make64(TARGET_PAGE_SIZE),
1509    };
1510    hwaddr start, end;
1511
1512    assert(existing->mr->subpage || existing->mr == &io_mem_unassigned);
1513
1514    if (!(existing->mr->subpage)) {
1515        subpage = subpage_init(fv, base);
1516        subsection.fv = fv;
1517        subsection.mr = &subpage->iomem;
1518        phys_page_set(d, base >> TARGET_PAGE_BITS, 1,
1519                      phys_section_add(&d->map, &subsection));
1520    } else {
1521        subpage = container_of(existing->mr, subpage_t, iomem);
1522    }
1523    start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
1524    end = start + int128_get64(section->size) - 1;
1525    subpage_register(subpage, start, end,
1526                     phys_section_add(&d->map, section));
1527}
1528
1529
1530static void register_multipage(FlatView *fv,
1531                               MemoryRegionSection *section)
1532{
1533    AddressSpaceDispatch *d = flatview_to_dispatch(fv);
1534    hwaddr start_addr = section->offset_within_address_space;
1535    uint16_t section_index = phys_section_add(&d->map, section);
1536    uint64_t num_pages = int128_get64(int128_rshift(section->size,
1537                                                    TARGET_PAGE_BITS));
1538
1539    assert(num_pages);
1540    phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index);
1541}
1542
1543/*
1544 * The range in *section* may look like this:
1545 *
1546 *      |s|PPPPPPP|s|
1547 *
1548 * where s stands for subpage and P for page.
1549 */
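/* Worked example (assuming 4 KiB target pages): a section that starts at
 * address 0x1800 with size 0x3000 covers [0x1800, 0x47ff] and is registered
 * as a subpage for [0x1800, 0x1fff], a full page for [0x2000, 0x3fff], and a
 * trailing subpage for [0x4000, 0x47ff].
 */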
1550void flatview_add_to_dispatch(FlatView *fv, MemoryRegionSection *section)
1551{
1552    MemoryRegionSection remain = *section;
1553    Int128 page_size = int128_make64(TARGET_PAGE_SIZE);
1554
1555    /* register first subpage */
1556    if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) {
1557        uint64_t left = TARGET_PAGE_ALIGN(remain.offset_within_address_space)
1558                        - remain.offset_within_address_space;
1559
1560        MemoryRegionSection now = remain;
1561        now.size = int128_min(int128_make64(left), now.size);
1562        register_subpage(fv, &now);
1563        if (int128_eq(remain.size, now.size)) {
1564            return;
1565        }
1566        remain.size = int128_sub(remain.size, now.size);
1567        remain.offset_within_address_space += int128_get64(now.size);
1568        remain.offset_within_region += int128_get64(now.size);
1569    }
1570
1571    /* register whole pages */
1572    if (int128_ge(remain.size, page_size)) {
1573        MemoryRegionSection now = remain;
1574        now.size = int128_and(now.size, int128_neg(page_size));
1575        register_multipage(fv, &now);
1576        if (int128_eq(remain.size, now.size)) {
1577            return;
1578        }
1579        remain.size = int128_sub(remain.size, now.size);
1580        remain.offset_within_address_space += int128_get64(now.size);
1581        remain.offset_within_region += int128_get64(now.size);
1582    }
1583
1584    /* register last subpage */
1585    register_subpage(fv, &remain);
1586}
1587
1588void qemu_flush_coalesced_mmio_buffer(void)
1589{
1590    if (kvm_enabled())
1591        kvm_flush_coalesced_mmio_buffer();
1592}
1593
1594void qemu_mutex_lock_ramlist(void)
1595{
1596    qemu_mutex_lock(&ram_list.mutex);
1597}
1598
1599void qemu_mutex_unlock_ramlist(void)
1600{
1601    qemu_mutex_unlock(&ram_list.mutex);
1602}
1603
1604void ram_block_dump(Monitor *mon)
1605{
1606    RAMBlock *block;
1607    char *psize;
1608
1609    RCU_READ_LOCK_GUARD();
1610    monitor_printf(mon, "%24s %8s  %18s %18s %18s\n",
1611                   "Block Name", "PSize", "Offset", "Used", "Total");
1612    RAMBLOCK_FOREACH(block) {
1613        psize = size_to_str(block->page_size);
1614        monitor_printf(mon, "%24s %8s  0x%016" PRIx64 " 0x%016" PRIx64
1615                       " 0x%016" PRIx64 "\n", block->idstr, psize,
1616                       (uint64_t)block->offset,
1617                       (uint64_t)block->used_length,
1618                       (uint64_t)block->max_length);
1619        g_free(psize);
1620    }
1621}
1622
1623#ifdef __linux__
1624/*
1625 * FIXME TOCTTOU: this iterates over memory backends' mem-path, which
1626 * may or may not name the same files / on the same filesystem now as
1627 * when we actually open and map them.  Iterate over the file
1628 * descriptors instead, and use qemu_fd_getpagesize().
1629 */
1630static int find_min_backend_pagesize(Object *obj, void *opaque)
1631{
1632    long *hpsize_min = opaque;
1633
1634    if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
1635        HostMemoryBackend *backend = MEMORY_BACKEND(obj);
1636        long hpsize = host_memory_backend_pagesize(backend);
1637
1638        if (host_memory_backend_is_mapped(backend) && (hpsize < *hpsize_min)) {
1639            *hpsize_min = hpsize;
1640        }
1641    }
1642
1643    return 0;
1644}
1645
1646static int find_max_backend_pagesize(Object *obj, void *opaque)
1647{
1648    long *hpsize_max = opaque;
1649
1650    if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
1651        HostMemoryBackend *backend = MEMORY_BACKEND(obj);
1652        long hpsize = host_memory_backend_pagesize(backend);
1653
1654        if (host_memory_backend_is_mapped(backend) && (hpsize > *hpsize_max)) {
1655            *hpsize_max = hpsize;
1656        }
1657    }
1658
1659    return 0;
1660}
1661
1662/*
1663 * TODO: We assume right now that all mapped host memory backends are
1664 * used as RAM, however some might be used for different purposes.
1665 */
1666long qemu_minrampagesize(void)
1667{
1668    long hpsize = LONG_MAX;
1669    long mainrampagesize;
1670    Object *memdev_root;
1671    MachineState *ms = MACHINE(qdev_get_machine());
1672
1673    mainrampagesize = qemu_mempath_getpagesize(mem_path);
1674
1675    /* It's possible we have memory-backend objects with
1676     * hugepage-backed RAM.  These may get mapped into the system
1677     * address space via -numa parameters or memory hotplug
1678     * hooks.  We want to take these into account, but we
1679     * also want to make sure the supported hugepage
1680     * sizes are applicable across the entire range of memory
1681     * we may boot from, so we take the minimum across all
1682     * backends, and assume normal pages in cases where a
1683     * backend isn't backed by hugepages.
1684     */
1685    memdev_root = object_resolve_path("/objects", NULL);
1686    if (memdev_root) {
1687        object_child_foreach(memdev_root, find_min_backend_pagesize, &hpsize);
1688    }
1689    if (hpsize == LONG_MAX) {
1690        /* No additional memory regions found ==> Report main RAM page size */
1691        return mainrampagesize;
1692    }
1693
1694    /* If NUMA is disabled or the NUMA nodes are not backed by a
1695     * memory-backend, then there is at least one node using "normal" RAM,
1696     * so if its page size is smaller we have to report that size instead.
1697     */
1698    if (hpsize > mainrampagesize &&
1699        (ms->numa_state == NULL ||
1700         ms->numa_state->num_nodes == 0 ||
1701         ms->numa_state->nodes[0].node_memdev == NULL)) {
1702        static bool warned;
1703        if (!warned) {
1704            error_report("Huge page support disabled (n/a for main memory).");
1705            warned = true;
1706        }
1707        return mainrampagesize;
1708    }
1709
1710    return hpsize;
1711}
1712
1713long qemu_maxrampagesize(void)
1714{
1715    long pagesize = qemu_mempath_getpagesize(mem_path);
1716    Object *memdev_root = object_resolve_path("/objects", NULL);
1717
1718    if (memdev_root) {
1719        object_child_foreach(memdev_root, find_max_backend_pagesize,
1720                             &pagesize);
1721    }
1722    return pagesize;
1723}
1724#else
1725long qemu_minrampagesize(void)
1726{
1727    return qemu_real_host_page_size;
1728}
1729long qemu_maxrampagesize(void)
1730{
1731    return qemu_real_host_page_size;
1732}
1733#endif
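    /*
     * Illustrative example (hypothetical setup): with a mapped 1 GiB
     * hugepage memory backend and main RAM using normal 4 KiB host pages,
     * qemu_maxrampagesize() reports 1 GiB.  qemu_minrampagesize() reports
     * 1 GiB only if the NUMA nodes are memdev-backed (the code checks node
     * 0); otherwise it warns once and falls back to the 4 KiB main RAM
     * page size.
     */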
1734
1735#ifdef CONFIG_POSIX
1736static int64_t get_file_size(int fd)
1737{
1738    int64_t size;
1739#if defined(__linux__)
1740    struct stat st;
1741
1742    if (fstat(fd, &st) < 0) {
1743        return -errno;
1744    }
1745
1746    /* Special handling for devdax character devices */
1747    if (S_ISCHR(st.st_mode)) {
1748        g_autofree char *subsystem_path = NULL;
1749        g_autofree char *subsystem = NULL;
1750
1751        subsystem_path = g_strdup_printf("/sys/dev/char/%d:%d/subsystem",
1752                                         major(st.st_rdev), minor(st.st_rdev));
1753        subsystem = g_file_read_link(subsystem_path, NULL);
1754
1755        if (subsystem && g_str_has_suffix(subsystem, "/dax")) {
1756            g_autofree char *size_path = NULL;
1757            g_autofree char *size_str = NULL;
1758
1759            size_path = g_strdup_printf("/sys/dev/char/%d:%d/size",
1760                                    major(st.st_rdev), minor(st.st_rdev));
1761
1762            if (g_file_get_contents(size_path, &size_str, NULL, NULL)) {
1763                return g_ascii_strtoll(size_str, NULL, 0);
1764            }
1765        }
1766    }
1767#endif /* defined(__linux__) */
1768
1769    /* st.st_size may be zero for special files yet lseek(2) works */
1770    size = lseek(fd, 0, SEEK_END);
1771    if (size < 0) {
1772        return -errno;
1773    }
1774    return size;
1775}
1776
1777static int file_ram_open(const char *path,
1778                         const char *region_name,
1779                         bool *created,
1780                         Error **errp)
1781{
1782    char *filename;
1783    char *sanitized_name;
1784    char *c;
1785    int fd = -1;
1786
1787    *created = false;
1788    for (;;) {
1789        fd = open(path, O_RDWR);
1790        if (fd >= 0) {
1791            /* @path names an existing file, use it */
1792            break;
1793        }
1794        if (errno == ENOENT) {
1795            /* @path names a file that doesn't exist, create it */
1796            fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0644);
1797            if (fd >= 0) {
1798                *created = true;
1799                break;
1800            }
1801        } else if (errno == EISDIR) {
1802            /* @path names a directory, create a file there */
1803            /* Make name safe to use with mkstemp by replacing '/' with '_'. */
1804            sanitized_name = g_strdup(region_name);
1805            for (c = sanitized_name; *c != '\0'; c++) {
1806                if (*c == '/') {
1807                    *c = '_';
1808                }
1809            }
1810
1811            filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
1812                                       sanitized_name);
1813            g_free(sanitized_name);
1814
1815            fd = mkstemp(filename);
1816            if (fd >= 0) {
1817                unlink(filename);
1818                g_free(filename);
1819                break;
1820            }
1821            g_free(filename);
1822        }
1823        if (errno != EEXIST && errno != EINTR) {
1824            error_setg_errno(errp, errno,
1825                             "can't open backing store %s for guest RAM",
1826                             path);
1827            return -1;
1828        }
1829        /*
1830         * Try again on EINTR and EEXIST.  The latter happens when
1831         * something else creates the file between our two open() calls.
1832         */
1833    }
1834
1835    return fd;
1836}
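    /*
     * When @path names a directory, file_ram_open() creates a backing file
     * "qemu_back_mem.<region_name>.XXXXXX" in it via mkstemp() and unlinks
     * it immediately, so the backing store vanishes once the descriptor is
     * closed.  A hugetlbfs mount point passed via -mem-path (for example
     * /dev/hugepages, purely illustrative) takes this branch.
     */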
1837
1838static void *file_ram_alloc(RAMBlock *block,
1839                            ram_addr_t memory,
1840                            int fd,
1841                            bool truncate,
1842                            Error **errp)
1843{
1844    MachineState *ms = MACHINE(qdev_get_machine());
1845    void *area;
1846
1847    block->page_size = qemu_fd_getpagesize(fd);
1848    if (block->mr->align % block->page_size) {
1849        error_setg(errp, "alignment 0x%" PRIx64
1850                   " must be a multiple of page size 0x%zx",
1851                   block->mr->align, block->page_size);
1852        return NULL;
1853    } else if (block->mr->align && !is_power_of_2(block->mr->align)) {
1854        error_setg(errp, "alignment 0x%" PRIx64
1855                   " must be a power of two", block->mr->align);
1856        return NULL;
1857    }
1858    block->mr->align = MAX(block->page_size, block->mr->align);
1859#if defined(__s390x__)
1860    if (kvm_enabled()) {
1861        block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
1862    }
1863#endif
1864
1865    if (memory < block->page_size) {
1866        error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
1867                   "or larger than page size 0x%zx",
1868                   memory, block->page_size);
1869        return NULL;
1870    }
1871
1872    memory = ROUND_UP(memory, block->page_size);
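        /*
         * e.g. (illustrative): with a 2 MiB hugetlbfs page size, a 9 MiB
         * request is rounded up to 10 MiB here, and mr->align above has
         * already been raised to at least 2 MiB.
         */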
1873
1874    /*
1875     * ftruncate is not supported by hugetlbfs in older
1876     * hosts, so don't bother bailing out on errors.
1877     * If anything goes wrong with it under other filesystems,
1878     * mmap will fail.
1879     *
1880     * Do not truncate the non-empty backend file to avoid corrupting
1881     * the existing data in the file. Disabling shrinking is not
1882     * enough. For example, the current vNVDIMM implementation stores
1883     * the guest NVDIMM labels at the end of the backend file. If the
1884     * backend file is later extended, QEMU will not be able to find
1885     * those labels. Therefore, extending the non-empty backend file
1886     * is disabled as well.
1887     */
1888    if (truncate && ftruncate(fd, memory)) {
1889        perror("ftruncate");
1890    }
1891
1892    area = qemu_ram_mmap(fd, memory, block->mr->align,
1893                         block->flags & RAM_SHARED, block->flags & RAM_PMEM);
1894    if (area == MAP_FAILED) {
1895        error_setg_errno(errp, errno,
1896                         "unable to map backing store for guest RAM");
1897        return NULL;
1898    }
1899
1900    if (mem_prealloc) {
1901        os_mem_prealloc(fd, area, memory, ms->smp.cpus, errp);
1902        if (errp && *errp) {
1903            qemu_ram_munmap(fd, area, memory);
1904            return NULL;
1905        }
1906    }
1907
1908    block->fd = fd;
1909    return area;
1910}
1911#endif
1912
1913/* Allocate space within the ram_addr_t space that governs the
1914 * dirty bitmaps.
1915 * Called with the ramlist lock held.
1916 */
1917static ram_addr_t find_ram_offset(ram_addr_t size)
1918{
1919    RAMBlock *block, *next_block;
1920    ram_addr_t offset = RAM_ADDR_MAX, mingap = RAM_ADDR_MAX;
1921
1922    assert(size != 0); /* it would hand out the same offset multiple times */
1923
1924    if (QLIST_EMPTY_RCU(&ram_list.blocks)) {
1925        return 0;
1926    }
1927
1928    RAMBLOCK_FOREACH(block) {
1929        ram_addr_t candidate, next = RAM_ADDR_MAX;
1930
1931        /* Align blocks to start on a 'long' in the bitmap
1932         * which makes the bitmap sync'ing take the fast path.
1933         */
1934        candidate = block->offset + block->max_length;
1935        candidate = ROUND_UP(candidate, BITS_PER_LONG << TARGET_PAGE_BITS);
1936
1937        /* Search for the closest following block
1938         * and find the gap.
1939         */
1940        RAMBLOCK_FOREACH(next_block) {
1941            if (next_block->offset >= candidate) {
1942                next = MIN(next, next_block->offset);
1943            }
1944        }
1945
1946        /* If it fits, remember our place and the size of the
1947         * gap, but keep going in case we find a smaller gap to
1948         * fill and so avoid fragmentation.
1949         */
1950        if (next - candidate >= size && next - candidate < mingap) {
1951            offset = candidate;
1952            mingap = next - candidate;
1953        }
1954
1955        trace_find_ram_offset_loop(size, candidate, offset, next, mingap);
1956    }
1957
1958    if (offset == RAM_ADDR_MAX) {
1959        fprintf(stderr, "Failed to find gap of requested size: %" PRIu64 "\n",
1960                (uint64_t)size);
1961        abort();
1962    }
1963
1964    trace_find_ram_offset(size, offset);
1965
1966    return offset;
1967}
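    /*
     * Worked example (illustrative values): with existing blocks occupying
     * [0x00000000, 0x08000000) and [0x10000000, 0x18000000), a request for
     * a 0x04000000 block yields the candidates 0x08000000 and 0x18000000.
     * Both gaps are large enough, but the 0x08000000..0x10000000 gap is the
     * smaller one, so find_ram_offset() returns 0x08000000 (assumed here to
     * already sit on a BITS_PER_LONG << TARGET_PAGE_BITS boundary).
     */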
1968
1969static unsigned long last_ram_page(void)
1970{
1971    RAMBlock *block;
1972    ram_addr_t last = 0;
1973
1974    RCU_READ_LOCK_GUARD();
1975    RAMBLOCK_FOREACH(block) {
1976        last = MAX(last, block->offset + block->max_length);
1977    }
1978    return last >> TARGET_PAGE_BITS;
1979}
1980
1981static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
1982{
1983    int ret;
1984
1985    /* Use MADV_DONTDUMP, if user doesn't want the guest memory in the core */
1986    if (!machine_dump_guest_core(current_machine)) {
1987        ret = qemu_madvise(addr, size, QEMU_MADV_DONTDUMP);
1988        if (ret) {
1989            perror("qemu_madvise");
1990            fprintf(stderr, "madvise doesn't support MADV_DONTDUMP, "
1991                            "but dump_guest_core=off specified\n");
1992        }
1993    }
1994}
1995
1996const char *qemu_ram_get_idstr(RAMBlock *rb)
1997{
1998    return rb->idstr;
1999}
2000
2001void *qemu_ram_get_host_addr(RAMBlock *rb)
2002{
2003    return rb->host;
2004}
2005
2006ram_addr_t qemu_ram_get_offset(RAMBlock *rb)
2007{
2008    return rb->offset;
2009}
2010
2011ram_addr_t qemu_ram_get_used_length(RAMBlock *rb)
2012{
2013    return rb->used_length;
2014}
2015
2016bool qemu_ram_is_shared(RAMBlock *rb)
2017{
2018    return rb->flags & RAM_SHARED;
2019}
2020
2021/* Note: Only set at the start of postcopy */
2022bool qemu_ram_is_uf_zeroable(RAMBlock *rb)
2023{
2024    return rb->flags & RAM_UF_ZEROPAGE;
2025}
2026
2027void qemu_ram_set_uf_zeroable(RAMBlock *rb)
2028{
2029    rb->flags |= RAM_UF_ZEROPAGE;
2030}
2031
2032bool qemu_ram_is_migratable(RAMBlock *rb)
2033{
2034    return rb->flags & RAM_MIGRATABLE;
2035}
2036
2037void qemu_ram_set_migratable(RAMBlock *rb)
2038{
2039    rb->flags |= RAM_MIGRATABLE;
2040}
2041
2042void qemu_ram_unset_migratable(RAMBlock *rb)
2043{
2044    rb->flags &= ~RAM_MIGRATABLE;
2045}
2046
2047/* Called with iothread lock held.  */
2048void qemu_ram_set_idstr(RAMBlock *new_block, const char *name, DeviceState *dev)
2049{
2050    RAMBlock *block;
2051
2052    assert(new_block);
2053    assert(!new_block->idstr[0]);
2054
2055    if (dev) {
2056        char *id = qdev_get_dev_path(dev);
2057        if (id) {
2058            snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id);
2059            g_free(id);
2060        }
2061    }
2062    pstrcat(new_block->idstr, sizeof(new_block->idstr), name);
2063
2064    RCU_READ_LOCK_GUARD();
2065    RAMBLOCK_FOREACH(block) {
2066        if (block != new_block &&
2067            !strcmp(block->idstr, new_block->idstr)) {
2068            fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
2069                    new_block->idstr);
2070            abort();
2071        }
2072    }
2073}
2074
2075/* Called with iothread lock held.  */
2076void qemu_ram_unset_idstr(RAMBlock *block)
2077{
2078    /* FIXME: arch_init.c assumes that this is not called throughout
2079     * migration.  Ignore the problem since hot-unplug during migration
2080     * does not work anyway.
2081     */
2082    if (block) {
2083        memset(block->idstr, 0, sizeof(block->idstr));
2084    }
2085}
2086
2087size_t qemu_ram_pagesize(RAMBlock *rb)
2088{
2089    return rb->page_size;
2090}
2091
2092/* Returns the largest page size in use */
2093size_t qemu_ram_pagesize_largest(void)
2094{
2095    RAMBlock *block;
2096    size_t largest = 0;
2097
2098    RAMBLOCK_FOREACH(block) {
2099        largest = MAX(largest, qemu_ram_pagesize(block));
2100    }
2101
2102    return largest;
2103}
2104
2105static int memory_try_enable_merging(void *addr, size_t len)
2106{
2107    if (!machine_mem_merge(current_machine)) {
2108        /* disabled by the user */
2109        return 0;
2110    }
2111
2112    return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
2113}
2114
2115/* Only legal before the guest might have detected the memory size: e.g. on
2116 * incoming migration, or right after reset.
2117 *
2118 * As the memory core doesn't know how memory is accessed, it is up to
2119 * the resize callback to update device state and/or add assertions to detect
2120 * misuse, if necessary.
2121 */
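/* Usage sketch (hypothetical; migrated_size and the error handling are
 * purely illustrative):
 *
 *     qemu_ram_resize(block, migrated_size, &error_fatal);
 *
 * called during incoming migration on a block created with
 * qemu_ram_alloc_resizeable(), before the guest has seen the memory size.
 */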
2122int qemu_ram_resize(RAMBlock *block, ram_addr_t newsize, Error **errp)
2123{
2124    assert(block);
2125
2126    newsize = HOST_PAGE_ALIGN(newsize);
2127
2128    if (block->used_length == newsize) {
2129        return 0;
2130    }
2131
2132    if (!(block->flags & RAM_RESIZEABLE)) {
2133        error_setg_errno(errp, EINVAL,
2134                         "Length mismatch: %s: 0x" RAM_ADDR_FMT
2135                         " in != 0x" RAM_ADDR_FMT, block->idstr,
2136                         newsize, block->used_length);
2137        return -EINVAL;
2138    }
2139
2140    if (block->max_length < newsize) {
2141        error_setg_errno(errp, EINVAL,
2142                         "Length too large: %s: 0x" RAM_ADDR_FMT
2143                         " > 0x" RAM_ADDR_FMT, block->idstr,
2144                         newsize, block->max_length);
2145        return -EINVAL;
2146    }
2147
2148    cpu_physical_memory_clear_dirty_range(block->offset, block->used_length);
2149    block->used_length = newsize;
2150    cpu_physical_memory_set_dirty_range(block->offset, block->used_length,
2151                                        DIRTY_CLIENTS_ALL);
2152    memory_region_set_size(block->mr, newsize);
2153    if (block->resized) {
2154        block->resized(block->idstr, newsize, block->host);
2155    }
2156    return 0;
2157}
2158
2159/* Called with ram_list.mutex held */
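/* The dirty bitmap is grown RCU-style: a larger DirtyMemoryBlocks array is
 * allocated, the existing per-block bitmap pointers are copied into it (the
 * bitmaps themselves are shared, not duplicated), fresh bitmaps are created
 * only for the new blocks, and the old array is reclaimed with g_free_rcu()
 * once readers have moved on.
 */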
2160static void dirty_memory_extend(ram_addr_t old_ram_size,
2161                                ram_addr_t new_ram_size)
2162{
2163    ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size,
2164                                             DIRTY_MEMORY_BLOCK_SIZE);
2165    ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size,
2166                                             DIRTY_MEMORY_BLOCK_SIZE);
2167    int i;
2168
2169    /* Only need to extend if block count increased */
2170    if (new_num_blocks <= old_num_blocks) {
2171        return;
2172    }
2173
2174    for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
2175        DirtyMemoryBlocks *old_blocks;
2176        DirtyMemoryBlocks *new_blocks;
2177        int j;
2178
2179        old_blocks = atomic_rcu_read(&ram_list.dirty_memory[i]);
2180        new_blocks = g_malloc(sizeof(*new_blocks) +
2181                              sizeof(new_blocks->blocks[0]) * new_num_blocks);
2182
2183        if (old_num_blocks) {
2184            memcpy(new_blocks->blocks, old_blocks->blocks,
2185                   old_num_blocks * sizeof(old_blocks->blocks[0]));
2186        }
2187
2188        for (j = old_num_blocks; j < new_num_blocks; j++) {
2189            new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
2190        }
2191
2192        atomic_rcu_set(&ram_list.dirty_memory[i], new_blocks);
2193
2194        if (old_blocks) {
2195            g_free_rcu(old_blocks, rcu);
2196        }
2197    }
2198}
2199
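/* Assign @new_block an offset in the ram_addr_t space, allocate its host
 * memory (via Xen or phys_mem_alloc) unless a host pointer was supplied,
 * grow the dirty bitmaps, and insert the block into ram_list sorted from
 * biggest to smallest.  Takes the ramlist lock itself and reports failures
 * through @errp.
 */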
2200static void ram_block_add(RAMBlock *new_block, Error **errp, bool shared)
2201{
2202    RAMBlock *block;
2203    RAMBlock *last_block = NULL;
2204    ram_addr_t old_ram_size, new_ram_size;
2205    Error *err = NULL;
2206
2207    old_ram_size = last_ram_page();
2208
2209    qemu_mutex_lock_ramlist();
2210    new_block->offset = find_ram_offset(new_block->max_length);
2211
2212    if (!new_block->host) {
2213        if (xen_enabled()) {
2214            xen_ram_alloc(new_block->offset, new_block->max_length,
2215                          new_block->mr, &err);
2216            if (err) {
2217                error_propagate(errp, err);
2218                qemu_mutex_unlock_ramlist();
2219                return;
2220            }
2221        } else {
2222            new_block->host = phys_mem_alloc(new_block->max_length,
2223                                             &new_block->mr->align, shared);
2224            if (!new_block->host) {
2225                error_setg_errno(errp, errno,
2226                                 "cannot set up guest memory '%s'",
2227                                 memory_region_name(new_block->mr));
2228                qemu_mutex_unlock_ramlist();
2229                return;
2230            }
2231            memory_try_enable_merging(new_block->host, new_block->max_length);
2232        }
2233    }
2234
2235    new_ram_size = MAX(old_ram_size,
2236              (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
2237    if (new_ram_size > old_ram_size) {
2238        dirty_memory_extend(old_ram_size, new_ram_size);
2239    }
2240    /* Keep the list sorted from biggest to smallest block.  Unlike QTAILQ,
2241     * QLIST (which has an RCU-friendly variant) does not have insertion at
2242     * tail, so save the last element in last_block.
2243     */
2244    RAMBLOCK_FOREACH(block) {
2245        last_block = block;
2246        if (block->max_length < new_block->max_length) {
2247            break;
2248        }
2249    }
2250    if (block) {
2251        QLIST_INSERT_BEFORE_RCU(block, new_block, next);
2252    } else if (last_block) {
2253        QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
2254    } else { /* list is empty */
2255        QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
2256    }
2257    ram_list.mru_block = NULL;
2258
2259    /* Write list before version */
2260    smp_wmb();
2261    ram_list.version++;
2262    qemu_mutex_unlock_ramlist();
2263
2264    cpu_physical_memory_set_dirty_range(new_block->offset,
2265                                        new_block->used_length,
2266                                        DIRTY_CLIENTS_ALL);
2267
2268    if (new_block->host) {
2269        qemu_ram_setup_dump(new_block->host, new_block->max_length);
2270        qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE);
2271        /* MADV_DONTFORK is also needed by KVM in absence of synchronous MMU */
2272        qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_DONTFORK);
2273        ram_block_notify_add(new_block->host, new_block->max_length);
2274    }
2275}
2276
2277#ifdef CONFIG_POSIX
2278RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
2279                                 uint32_t ram_flags, int fd,
2280                                 Error **errp)
2281{
2282    RAMBlock *new_block;
2283    Error *local_err = NULL;
2284    int64_t file_size;
2285
2286    /* Just support these ram flags for now. */
2287    assert((ram_flags & ~(RAM_SHARED | RAM_PMEM)) == 0);
2288
2289    if (xen_enabled()) {
2290        error_setg(errp, "-mem-path not supported with Xen");
2291        return NULL;
2292    }
2293
2294    if (kvm_enabled() && !kvm_has_sync_mmu()) {
2295        error_setg(errp,
2296                   "host lacks kvm mmu notifiers, -mem-path unsupported");
2297        return NULL;
2298    }
2299
2300    if (phys_mem_alloc != qemu_anon_ram_alloc) {
2301        /*
2302         * file_ram_alloc() needs to allocate just like
2303         * phys_mem_alloc, but we haven't bothered to provide
2304         * a hook there.
2305         */
2306        error_setg(errp,
2307                   "-mem-path not supported with this accelerator");
2308        return NULL;
2309    }
2310
2311    size = HOST_PAGE_ALIGN(size);
2312    file_size = get_file_size(fd);
2313    if (file_size > 0 && file_size < size) {
2314        error_setg(errp, "backing store %s size 0x%" PRIx64
2315                   " does not match 'size' option 0x" RAM_ADDR_FMT,
2316                   mem_path, file_size, size);
2317        return NULL;
2318    }
2319
2320    new_block = g_malloc0(sizeof(*new_block));
2321    new_block->mr = mr;
2322    new_block->used_length = size;
2323    new_block->max_length = size;
2324    new_block->flags = ram_flags;
2325    new_block->host = file_ram_alloc(new_block, size, fd, !file_size, errp);
2326    if (!new_block->host) {
2327        g_free(new_block);
2328        return NULL;
2329    }
2330
2331    ram_block_add(new_block, &local_err, ram_flags & RAM_SHARED);
2332    if (local_err) {
2333        g_free(new_block);
2334        error_propagate(errp, local_err);
2335        return NULL;
2336    }
2337    return new_block;
2339}
2340
2341
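/* Usage sketch (the path and error handling are hypothetical): a shared,
 * hugetlbfs-backed block could be created with something like
 *
 *     RAMBlock *rb = qemu_ram_alloc_from_file(size, mr, RAM_SHARED,
 *                                             "/dev/hugepages", &err);
 *
 * which opens (or creates) the backing file via file_ram_open() and then
 * delegates to qemu_ram_alloc_from_fd().
 */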
2342RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
2343                                   uint32_t ram_flags, const char *mem_path,
2344                                   Error **errp)
2345{
2346    int fd;
2347    bool created;
2348    RAMBlock *block;
2349
2350    fd = file_ram_open(mem_path, memory_region_name(mr), &created, errp);
2351    if (fd < 0) {
2352        return NULL;
2353    }
2354
2355    block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, errp);
2356    if (!block) {
2357        if (created) {
2358            unlink(mem_path);
2359        }
2360        close(fd);
2361        return NULL;
2362    }
2363
2364    return block;
2365}
2366#endif
2367
2368static
2369RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
2370                                  void (*resized)(const char*,
2371                                                  uint64_t length,
2372                                                  void *host),
2373                                  void *host, bool resizeable, bool share,
2374                                  MemoryRegion *mr, Error **errp)
2375{
2376    RAMBlock *new_block;
2377    Error *local_err = NULL;
2378
2379    size = HOST_PAGE_ALIGN(size);
2380    max_size = HOST_PAGE_ALIGN(max_size);
2381    new_block = g_malloc0(sizeof(*new_block));
2382    new_block->mr = mr;
2383    new_block->resized = resized;
2384    new_block->used_length = size;
2385    new_block->max_length = max_size;
2386    assert(max_size >= size);
2387    new_block->fd = -1;
2388    new_block->page_size = qemu_real_host_page_size;
2389    new_block->host = host;
2390    if (host) {
2391        new_block->flags |= RAM_PREALLOC;
2392    }
2393    if (resizeable) {
2394        new_block->flags |= RAM_RESIZEABLE;
2395    }
2396    ram_block_add(new_block, &local_err, share);
2397    if (local_err) {
2398        g_free(new_block);
2399        error_propagate(errp, local_err);
2400        return NULL;
2401    }
2402    return new_block;
2403}
2404
2405RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
2406                                   MemoryRegion *mr, Error **errp)
2407{
2408    return qemu_ram_alloc_internal(size, size, NULL, host, false,
2409                                   false, mr, errp);
2410}
2411
2412RAMBlock *qemu_ram_alloc(ram_addr_t size, bool share,
2413                         MemoryRegion *mr, Error **errp)
2414{
2415    return qemu_ram_alloc_internal(size, size, NULL, NULL, false,
2416                                   share, mr, errp);
2417}
2418
2419RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
2420                                     void (*resized)(const char*,
2421                                                     uint64_t length,
2422                                                     void *host),
2423                                     MemoryRegion *mr, Error **errp)
2424{
2425    return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true,
2426                                   false, mr, errp);
2427}
2428
2429static void reclaim_ramblock(RAMBlock *block)
2430{
2431    if (block->flags & RAM_PREALLOC) {
2432        ;
2433    } else if (xen_enabled()) {
2434        xen_invalidate_map_cache_entry(block->host);
2435#ifndef _WIN32
2436    } else if (block->fd >= 0) {
2437        qemu_ram_munmap(block->fd, block->host, block->max_length);
2438        close(block->fd);
2439#endif
2440    } else {
2441        qemu_anon_ram_free(block->host, block->max_length);
2442    }
2443    g_free(block);
2444}
2445
2446void qemu_ram_free(RAMBlock *block)
2447{
2448    if (!block) {
2449        return;
2450    }
2451
2452    if (block->host) {
2453        ram_block_notify_remove(block->host, block->max_length);
2454    }
2455
2456    qemu_mutex_lock_ramlist();
2457    QLIST_REMOVE_RCU(block, next);
2458    ram_list.mru_block = NULL;
2459    /* Write list before version */
2460    smp_wmb();
2461    ram_list.version++;
2462    call_rcu(block, reclaim_ramblock, rcu);
2463    qemu_mutex_unlock_ramlist();
2464}
2465
2466#ifndef _WIN32
2467void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
2468{
2469    RAMBlock *block;
2470    ram_addr_t offset;
2471    int flags;
2472    void *area, *vaddr;
2473
2474    RAMBLOCK_FOREACH(block) {
2475        offset = addr - block->offset;
2476        if (offset < block->max_length) {
2477            vaddr = ramblock_ptr(block, offset);
2478            if (block->flags & RAM_PREALLOC) {
2479                ;
2480            } else if (xen_enabled()) {
2481                abort();
2482            } else {
2483                flags = MAP_FIXED;
2484                if (block->fd >= 0) {
2485                    flags |= (block->flags & RAM_SHARED ?
2486                              MAP_SHARED : MAP_PRIVATE);
2487                    area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
2488                                flags, block->fd, offset);
2489                } else {
2490                    /*
2491                     * Remap needs to match alloc.  Accelerators that
2492                     * set phys_mem_alloc never remap.  If they did,
2493                     * we'd need a remap hook here.
2494                     */
2495                    assert(phys_mem_alloc == qemu_anon_ram_alloc);
2496
2497                    flags |= MAP_PRIVATE | MAP_ANONYMOUS;
2498                    area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
2499                                flags, -1, 0);
2500                }
2501                if (area != vaddr) {
2502                    error_report("Could not remap addr: "
2503                                 RAM_ADDR_FMT "@" RAM_ADDR_FMT "",
2504                                 length, addr);
2505                    exit(1);
2506                }
2507                memory_try_enable_merging(vaddr, length);
2508                qemu_ram_setup_dump(vaddr, length);
2509            }
2510        }
2511    }
2512}
2513#endif /* !_WIN32 */
2514
2515/* Return a host pointer to ram allocated with qemu_ram_alloc.
2516 * This should not be used for general purpose DMA.  Use address_space_map
2517 * or address_space_rw instead. For local memory (e.g. video ram) that the
2518 * device owns, use memory_region_get_ram_ptr.
2519 *
2520 * Called within RCU critical section.
2521 */
2522void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr)
2523{
2524    RAMBlock *block = ram_block;
2525
2526    if (block == NULL) {
2527        block = qemu_get_ram_block(addr);
2528        addr -= block->offset;
2529    }
2530
2531    if (xen_enabled() && block->host == NULL) {
2532        /* We need to check if the requested address is in RAM
2533         * because we don't want to map the entire memory in QEMU.
2534         * In that case just map until the end of the page.
2535         */
2536        if (block->offset == 0) {
2537            return xen_map_cache(addr, 0, 0, false);
2538        }
2539
2540        block->host = xen_map_cache(block->offset, block->max_length, 1, false);
2541    }
2542    return ramblock_ptr(block, addr);
2543}
2544
2545/* Return a host pointer to guest's ram. Similar to qemu_map_ram_ptr
2546 * but takes a size argument.
2547 *
2548 * Called within RCU critical section.
2549 */
2550static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr,
2551                                 hwaddr *size, bool lock)
2552{
2553    RAMBlock *block = ram_block;
2554    if (*size == 0) {
2555        return NULL;
2556    }
2557
2558    if (block == NULL) {
2559        block = qemu_get_ram_block(addr);
2560        addr -= block->offset;
2561    }
2562    *size = MIN(*size, block->max_length - addr);
2563
2564    if (xen_enabled() && block->host == NULL) {
2565        /* We need to check if the requested address is in RAM
2566         * because we don't want to map the entire memory in QEMU.
2567         * In that case just map the requested area.
2568         */
2569        if (block->offset == 0) {
2570            return xen_map_cache(addr, *size, lock, lock);
2571        }
2572
2573        block->host = xen_map_cache(block->offset, block->max_length, 1, lock);
2574    }
2575
2576    return ramblock_ptr(block, addr);
2577}
2578
2579/* Return the offset of a host pointer within a RAMBlock */
2580ram_addr_t qemu_ram_block_host_offset(RAMBlock *rb, void *host)
2581{
2582    ram_addr_t res = (uint8_t *)host - (uint8_t *)rb->host;
2583    assert((uintptr_t)host >= (uintptr_t)rb->host);
2584    assert(res < rb->max_length);
2585
2586    return res;
2587}
2588
2589/*
2590 * Translates a host ptr back to a RAMBlock and an offset
2591 * in that RAMBlock.
2592 *
2593 * ptr: Host pointer to look up
2594 * round_offset: If true round the result offset down to a page boundary
2596 * *offset: set to result offset within the RAMBlock
2597 *
2598 * Returns: RAMBlock (or NULL if not found)
2599 *
2600 * By the time this function returns, the returned pointer is not protected
2601 * by RCU anymore.  If the caller is not within an RCU critical section and
2602 * does not hold the iothread lock, it must have other means of protecting the
2603 * pointer, such as a reference to the region that includes the incoming
2604 * ram_addr_t.
2605 */
2606RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
2607                                   ram_addr_t *offset)
2608{
2609    RAMBlock *block;
2610    uint8_t *host = ptr;
2611
2612    if (xen_enabled()) {
2613        ram_addr_t ram_addr;
2614        RCU_READ_LOCK_GUARD();
2615        ram_addr = xen_ram_addr_from_mapcache(ptr);
2616        block = qemu_get_ram_block(ram_addr);
2617        if (block) {
2618            *offset = ram_addr - block->offset;
2619        }
2620        return block;
2621    }
2622
2623    RCU_READ_LOCK_GUARD();
2624    block = atomic_rcu_read(&ram_list.mru_block);
2625    if (block && block->host && host - block->host < block->max_length) {
2626        goto found;
2627    }
2628
2629    RAMBLOCK_FOREACH(block) {
2630        /* This case can happen when the block is not mapped. */
2631        if (block->host == NULL) {
2632            continue;
2633        }
2634        if (host - block->host < block->max_length) {
2635            goto found;
2636        }
2637    }
2638
2639    return NULL;
2640
2641found:
2642    *offset = (host - block->host);
2643    if (round_offset) {
2644        *offset &= TARGET_PAGE_MASK;
2645    }
2646    return block;
2647}
2648
2649/*
2650 * Finds the named RAMBlock
2651 *
2652 * name: The name of RAMBlock to find
2653 *
2654 * Returns: RAMBlock (or NULL if not found)
2655 */
2656RAMBlock *qemu_ram_block_by_name(const char *name)
2657{
2658    RAMBlock *block;
2659
2660    RAMBLOCK_FOREACH(block) {
2661        if (!strcmp(name, block->idstr)) {
2662            return block;
2663        }
2664    }
2665
2666    return NULL;
2667}
2668
2669/* Some of the softmmu routines need to translate from a host pointer
2670   (typically a TLB entry) back to a ram offset.  */
2671ram_addr_t qemu_ram_addr_from_host(void *ptr)
2672{
2673    RAMBlock *block;
2674    ram_addr_t offset;
2675
2676    block = qemu_ram_block_from_host(ptr, false, &offset);
2677    if (!block) {
2678        return RAM_ADDR_INVALID;
2679    }
2680
2681    return block->offset + offset;
2682}
2683
2684/* Generate a debug exception if a watchpoint has been hit.  */
2685void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len,
2686                          MemTxAttrs attrs, int flags, uintptr_t ra)
2687{
2688    CPUClass *cc = CPU_GET_CLASS(cpu);
2689    CPUWatchpoint *wp;
2690
2691    assert(tcg_enabled());
2692    if (cpu->watchpoint_hit) {
2693        /*
2694         * We re-entered the check after replacing the TB.
2695         * Now raise the debug interrupt so that it will
2696         * trigger after the current instruction.
2697         */
2698        qemu_mutex_lock_iothread();
2699        cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG);
2700        qemu_mutex_unlock_iothread();
2701        return;
2702    }
2703
2704    addr = cc->adjust_watchpoint_address(cpu, addr, len);
2705    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
2706        if (watchpoint_address_matches(wp, addr, len)
2707            && (wp->flags & flags)) {
2708            if (flags == BP_MEM_READ) {
2709                wp->flags |= BP_WATCHPOINT_HIT_READ;
2710            } else {
2711                wp->flags |= BP_WATCHPOINT_HIT_WRITE;
2712            }
2713            wp->hitaddr = MAX(addr, wp->vaddr);
2714            wp->hitattrs = attrs;
2715            if (!cpu->watchpoint_hit) {
2716                if (wp->flags & BP_CPU &&
2717                    !cc->debug_check_watchpoint(cpu, wp)) {
2718                    wp->flags &= ~BP_WATCHPOINT_HIT;
2719                    continue;
2720                }
2721                cpu->watchpoint_hit = wp;
2722
2723                mmap_lock();
2724                tb_check_watchpoint(cpu, ra);
2725                if (wp->flags & BP_STOP_BEFORE_ACCESS) {
2726                    cpu->exception_index = EXCP_DEBUG;
2727                    mmap_unlock();
2728                    cpu_loop_exit_restore(cpu, ra);
2729                } else {
2730                    /* Force execution of one insn next time.  */
2731                    cpu->cflags_next_tb = 1 | curr_cflags();
2732                    mmap_unlock();
2733                    if (ra) {
2734                        cpu_restore_state(cpu, ra, true);
2735                    }
2736                    cpu_loop_exit_noexc(cpu);
2737                }
2738            }
2739        } else {
2740            wp->flags &= ~BP_WATCHPOINT_HIT;
2741        }
2742    }
2743}
2744
2745static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
2746                                 MemTxAttrs attrs, uint8_t *buf, hwaddr len);
2747static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
2748                                  const uint8_t *buf, hwaddr len);
2749static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len,
2750                                  bool is_write, MemTxAttrs attrs);
2751
2752static MemTxResult subpage_read(void *opaque, hwaddr addr, uint64_t *data,
2753                                unsigned len, MemTxAttrs attrs)
2754{
2755    subpage_t *subpage = opaque;
2756    uint8_t buf[8];
2757    MemTxResult res;
2758
2759#if defined(DEBUG_SUBPAGE)
2760    printf("%s: subpage %p len %u addr " TARGET_FMT_plx "\n", __func__,
2761           subpage, len, addr);
2762#endif
2763    res = flatview_read(subpage->fv, addr + subpage->base, attrs, buf, len);
2764    if (res) {
2765        return res;
2766    }
2767    *data = ldn_p(buf, len);
2768    return MEMTX_OK;
2769}
2770
2771static MemTxResult subpage_write(void *opaque, hwaddr addr,
2772                                 uint64_t value, unsigned len, MemTxAttrs attrs)
2773{
2774    subpage_t *subpage = opaque;
2775    uint8_t buf[8];
2776
2777#if defined(DEBUG_SUBPAGE)
2778    printf("%s: subpage %p len %u addr " TARGET_FMT_plx
2779           " value %"PRIx64"\n",
2780           __func__, subpage, len, addr, value);
2781#endif
2782    stn_p(buf, len, value);
2783    return flatview_write(subpage->fv, addr + subpage->base, attrs, buf, len);
2784}
2785
2786static bool subpage_accepts(void *opaque, hwaddr addr,
2787                            unsigned len, bool is_write,
2788                            MemTxAttrs attrs)
2789{
2790    subpage_t *subpage = opaque;
2791#if defined(DEBUG_SUBPAGE)
2792    printf("%s: subpage %p %c len %u addr " TARGET_FMT_plx "\n",
2793           __func__, subpage, is_write ? 'w' : 'r', len, addr);
2794#endif
2795
2796    return flatview_access_valid(subpage->fv, addr + subpage->base,
2797                                 len, is_write, attrs);
2798}
2799
2800static const MemoryRegionOps subpage_ops = {
2801    .read_with_attrs = subpage_read,
2802    .write_with_attrs = subpage_write,
2803    .impl.min_access_size = 1,
2804    .impl.max_access_size = 8,
2805    .valid.min_access_size = 1,
2806    .valid.max_access_size = 8,
2807    .valid.accepts = subpage_accepts,
2808    .endianness = DEVICE_NATIVE_ENDIAN,
2809};
2810
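/* A subpage covers a single target page whose contents are split across
 * several MemoryRegionSections.  subpage_register() points the byte range
 * [start, end] of that page at section index @section; e.g. with 4 KiB
 * target pages (illustrative), registering [0x000, 0x3ff] and [0x400, 0xfff]
 * gives the two halves of the page different sections.  subpage_read() and
 * subpage_write() above forward accesses back through the FlatView, which
 * dispatches to whichever section covers the accessed offset.
 */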
2811static int subpage_register(subpage_t *mmio, uint32_t start, uint32_t end,
2812                            uint16_t section)
2813{
2814    int idx, eidx;
2815
2816    if (start >= TARGET_PAGE_SIZE || end >= TARGET_PAGE_SIZE)
2817        return -1;
2818    idx = SUBPAGE_IDX(start);
2819    eidx = SUBPAGE_IDX(end);
2820#if defined(DEBUG_SUBPAGE)
2821    printf("%s: %p start %08x end %08x idx %08x eidx %08x section %d\n",
2822           __func__, mmio, start, end, idx, eidx, section);
2823#endif
2824    for (; idx <= eidx; idx++) {
2825        mmio->sub_section[idx] = section;
2826    }
2827
2828    return 0;
2829}
2830
2831static subpage_t *subpage_init(FlatView *fv, hwaddr base)
2832{
2833    subpage_t *mmio;
2834
2835    /* mmio->sub_section is set to PHYS_SECTION_UNASSIGNED with g_malloc0 */
2836    mmio = g_malloc0(sizeof(subpage_t) + TARGET_PAGE_SIZE * sizeof(uint16_t));
2837    mmio->fv = fv;
2838    mmio->base = base;
2839    memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio,
2840                          NULL, TARGET_PAGE_SIZE);
2841    mmio->iomem.subpage = true;
2842#if defined(DEBUG_SUBPAGE)
2843    printf("%s: %p base " TARGET_FMT_plx " len %08x\n", __func__,
2844           mmio, base, TARGET_PAGE_SIZE);
2845#endif
2846
2847    return mmio;
2848}
2849
2850static uint16_t dummy_section(PhysPageMap *map, FlatView *fv, MemoryRegion *mr)
2851{
2852    assert(fv);
2853    MemoryRegionSection section = {
2854        .fv = fv,
2855        .mr = mr,
2856        .offset_within_address_space = 0,
2857        .offset_within_region = 0,
2858        .size = int128_2_64(),
2859    };
2860
2861    return phys_section_add(map, &section);
2862}
2863
2864MemoryRegionSection *iotlb_to_section(CPUState *cpu,
2865                                      hwaddr index, MemTxAttrs attrs)
2866{
2867    int asidx = cpu_asidx_from_attrs(cpu, attrs);
2868    CPUAddressSpace *cpuas = &cpu->cpu_ases[asidx];
2869    AddressSpaceDispatch *d = atomic_rcu_read(&cpuas->memory_dispatch);
2870    MemoryRegionSection *sections = d->map.sections;
2871
2872    return &sections[index & ~TARGET_PAGE_MASK];
2873}
2874
2875static void io_mem_init(void)
2876{
2877    memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
2878                          NULL, UINT64_MAX);
2879}
2880
2881AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv)
2882{
2883    AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);
2884    uint16_t n;
2885
2886    n = dummy_section(&d->map, fv, &io_mem_unassigned);
2887    assert(n == PHYS_SECTION_UNASSIGNED);
2888
2889    d->phys_map  = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
2890
2891    return d;
2892}
2893
2894void address_space_dispatch_free(AddressSpaceDispatch *d)
2895{
2896    phys_sections_free(&d->map);
2897    g_free(d);
2898}
2899
2900static void do_nothing(CPUState *cpu, run_on_cpu_data d)
2901{
2902}
2903
2904static void tcg_log_global_after_sync(MemoryListener *listener)
2905{
2906    CPUAddressSpace *cpuas;
2907
2908    /* Wait for the CPU to end the current TB.  This avoids the following
2909     * race, which would let migration read stale memory:
2910     *
2911     *      vCPU                         migration
2912     *      ----------------------       -------------------------
2913     *      TLB check -> slow path
2914     *        notdirty_mem_write
2915     *          write to RAM
2916     *          mark dirty
2917     *                                   clear dirty flag
2918     *      TLB check -> fast path
2919     *                                   read memory
2920     *        write to RAM
2921     *
2922     * by pushing the migration thread's memory read after the vCPU thread has
2923     * written the memory.
2924     */
2925    if (replay_mode == REPLAY_MODE_NONE) {
2926        /*
2927         * VGA can make calls to this function while updating the screen.
2928         * run_on_cpu waits for the rr mutex.  No races are possible in
2929         * record/replay mode anyway, so run_on_cpu is only needed when
2930         * record/replay is not enabled.
2931         * record/replay is not enabled.
2932         */
2933        cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
2934        run_on_cpu(cpuas->cpu, do_nothing, RUN_ON_CPU_NULL);
2935    }
2936}
2937
2938static void tcg_commit(MemoryListener *listener)
2939{
2940    CPUAddressSpace *cpuas;
2941    AddressSpaceDispatch *d;
2942
2943    assert(tcg_enabled());
2944    /* since each CPU stores ram addresses in its TLB cache, we must
2945       reset the modified entries */
2946    cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
2947    cpu_reloading_memory_map();
2948    /* The CPU and TLB are protected by the iothread lock.
2949     * We reload the dispatch pointer now because cpu_reloading_memory_map()
2950     * may have split the RCU critical section.
2951     */
2952    d = address_space_to_dispatch(cpuas->as);
2953    atomic_rcu_set(&cpuas->memory_dispatch, d);
2954    tlb_flush(cpuas->cpu);
2955}
2956
2957static void memory_map_init(void)
2958{
2959    system_memory = g_malloc(sizeof(*system_memory));
2960
2961    memory_region_init(system_memory, NULL, "system", UINT64_MAX);
2962    address_space_init(&address_space_memory, system_memory, "memory");
2963
2964    system_io = g_malloc(sizeof(*system_io));
2965    memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io",
2966                          65536);
2967    address_space_init(&address_space_io, system_io, "I/O");
2968}
2969
2970MemoryRegion *get_system_memory(void)
2971{
2972    return system_memory;
2973}
2974
2975MemoryRegion *get_system_io(void)
2976{
2977    return system_io;
2978}
2979
2980#endif /* !defined(CONFIG_USER_ONLY) */
2981
2982/* physical memory access (slow version, mainly for debug) */
2983#if defined(CONFIG_USER_ONLY)
2984int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
2985                        uint8_t *buf, target_ulong len, int is_write)
2986{
2987    int flags;
2988    target_ulong l, page;
2989    void * p;
2990
2991    while (len > 0) {
2992        page = addr & TARGET_PAGE_MASK;
2993        l = (page + TARGET_PAGE_SIZE) - addr;
2994        if (l > len)
2995            l = len;
2996        flags = page_get_flags(page);
2997        if (!(flags & PAGE_VALID))
2998            return -1;
2999        if (is_write) {
3000            if (!(flags & PAGE_WRITE))
3001                return -1;
3002            /* XXX: this code should not depend on lock_user */
3003            if (!(p = lock_user(VERIFY_WRITE, addr, l, 0)))
3004                return -1;
3005            memcpy(p, buf, l);
3006            unlock_user(p, addr, l);
3007        } else {
3008            if (!(flags & PAGE_READ))
3009                return -1;
3010            /* XXX: this code should not depend on lock_user */
3011            if (!(p = lock_user(VERIFY_READ, addr, l, 1)))
3012                return -1;
3013            memcpy(buf, p, l);
3014            unlock_user(p, addr, 0);
3015        }
3016        len -= l;
3017        buf += l;
3018        addr += l;
3019    }
3020    return 0;
3021}
3022
3023#else
3024
3025static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
3026                                     hwaddr length)
3027{
3028    uint8_t dirty_log_mask = memory_region_get_dirty_log_mask(mr);
3029    addr += memory_region_get_ram_addr(mr);
3030
3031    /* No early return if dirty_log_mask is or becomes 0, because
3032     * cpu_physical_memory_set_dirty_range will still call
3033     * xen_modified_memory.
3034     */
3035    if (dirty_log_mask) {
3036        dirty_log_mask =
3037            cpu_physical_memory_range_includes_clean(addr, length, dirty_log_mask);
3038    }
3039    if (dirty_log_mask & (1 << DIRTY_MEMORY_CODE)) {
3040        assert(tcg_enabled());
3041        tb_invalidate_phys_range(addr, addr + length);
3042        dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
3043    }
3044    cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask);
3045}
3046
3047void memory_region_flush_rom_device(MemoryRegion *mr, hwaddr addr, hwaddr size)
3048{
3049    /*
3050     * In principle this function would work on other memory region types too,
3051     * but the ROM device use case is the only one where this operation is
3052     * necessary.  Other memory regions should use the
3053     * address_space_read/write() APIs.
3054     */
3055    assert(memory_region_is_romd(mr));
3056
3057    invalidate_and_set_dirty(mr, addr, size);
3058}
3059
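/* Worked example (illustrative): memory_access_size() clamps an access to
 * what the region supports and to the alignment of the address.  For a
 * region limited to 4-byte accesses that does not allow unaligned access,
 * an 8-byte request at address 0x1002 is clamped to 4 by the region limit
 * and then to 2 by the address alignment (0x1002 & -0x1002 is 2), so the
 * caller's loop ends up issuing 2-byte accesses.
 */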
3060static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
3061{
3062    unsigned access_size_max = mr->ops->valid.max_access_size;
3063
3064    /* Regions are assumed to support 1-4 byte accesses unless
3065       otherwise specified.  */
3066    if (access_size_max == 0) {
3067        access_size_max = 4;
3068    }
3069
3070    /* Bound the maximum access by the alignment of the address.  */
3071    if (!mr->ops->impl.unaligned) {
3072        unsigned align_size_max = addr & -addr;
3073        if (align_size_max != 0 && align_size_max < access_size_max) {
3074            access_size_max = align_size_max;
3075        }
3076    }
3077
3078    /* Don't attempt accesses larger than the maximum.  */
3079    if (l > access_size_max) {
3080        l = access_size_max;
3081    }
3082    l = pow2floor(l);
3083
3084    return l;
3085}
3086
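/* prepare_mmio_access() takes the iothread lock before dispatching to a
 * device that relies on the global lock (unless the caller already holds
 * it) and flushes any coalesced MMIO.  It returns true when the caller must
 * drop the lock after the access, which is what the release_lock
 * bookkeeping in the flatview_*_continue() loops below implements.
 */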
3087static bool prepare_mmio_access(MemoryRegion *mr)
3088{
3089    bool unlocked = !qemu_mutex_iothread_locked();
3090    bool release_lock = false;
3091
3092    if (unlocked && mr->global_locking) {
3093        qemu_mutex_lock_iothread();
3094        unlocked = false;
3095        release_lock = true;
3096    }
3097    if (mr->flush_coalesced_mmio) {
3098        if (unlocked) {
3099            qemu_mutex_lock_iothread();
3100        }
3101        qemu_flush_coalesced_mmio_buffer();
3102        if (unlocked) {
3103            qemu_mutex_unlock_iothread();
3104        }
3105    }
3106
3107    return release_lock;
3108}
3109
3110/* Called within RCU critical section.  */
3111static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr,
3112                                           MemTxAttrs attrs,
3113                                           const uint8_t *buf,
3114                                           hwaddr len, hwaddr addr1,
3115                                           hwaddr l, MemoryRegion *mr)
3116{
3117    uint8_t *ptr;
3118    uint64_t val;
3119    MemTxResult result = MEMTX_OK;
3120    bool release_lock = false;
3121
3122    for (;;) {
3123        if (!memory_access_is_direct(mr, true)) {
3124            release_lock |= prepare_mmio_access(mr);
3125            l = memory_access_size(mr, l, addr1);
3126            /* XXX: could force current_cpu to NULL to avoid
3127               potential bugs */
3128            val = ldn_he_p(buf, l);
3129            result |= memory_region_dispatch_write(mr, addr1, val,
3130                                                   size_memop(l), attrs);
3131        } else {
3132            /* RAM case */
3133            ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
3134            memcpy(ptr, buf, l);
3135            invalidate_and_set_dirty(mr, addr1, l);
3136        }
3137
3138        if (release_lock) {
3139            qemu_mutex_unlock_iothread();
3140            release_lock = false;
3141        }
3142
3143        len -= l;
3144        buf += l;
3145        addr += l;
3146
3147        if (!len) {
3148            break;
3149        }
3150
3151        l = len;
3152        mr = flatview_translate(fv, addr, &addr1, &l, true, attrs);
3153    }
3154
3155    return result;
3156}
3157
3158/* Called from RCU critical section.  */
3159static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
3160                                  const uint8_t *buf, hwaddr len)
3161{
3162    hwaddr l;
3163    hwaddr addr1;
3164    MemoryRegion *mr;
3165    MemTxResult result = MEMTX_OK;
3166
3167    l = len;
3168    mr = flatview_translate(fv, addr, &addr1, &l, true, attrs);
3169    result = flatview_write_continue(fv, addr, attrs, buf, len,
3170                                     addr1, l, mr);
3171
3172    return result;
3173}
3174
3175/* Called within RCU critical section.  */
3176MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr,
3177                                   MemTxAttrs attrs, uint8_t *buf,
3178                                   hwaddr len, hwaddr addr1, hwaddr l,
3179                                   MemoryRegion *mr)
3180{
3181    uint8_t *ptr;
3182    uint64_t val;
3183    MemTxResult result = MEMTX_OK;
3184    bool release_lock = false;
3185
3186    for (;;) {
3187        if (!memory_access_is_direct(mr, false)) {
3188            /* I/O case */
3189            release_lock |= prepare_mmio_access(mr);
3190            l = memory_access_size(mr, l, addr1);
3191            result |= memory_region_dispatch_read(mr, addr1, &val,
3192                                                  size_memop(l), attrs);
3193            stn_he_p(buf, l, val);
3194        } else {
3195            /* RAM case */
3196            ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
3197            memcpy(buf, ptr, l);
3198        }
3199
3200        if (release_lock) {
3201            qemu_mutex_unlock_iothread();
3202            release_lock = false;
3203        }
3204
3205        len -= l;
3206        buf += l;
3207        addr += l;
3208
3209        if (!len) {
3210            break;
3211        }
3212
3213        l = len;
3214        mr = flatview_translate(fv, addr, &addr1, &l, false, attrs);
3215    }
3216
3217    return result;
3218}
3219
3220/* Called from RCU critical section.  */
3221static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
3222                                 MemTxAttrs attrs, uint8_t *buf, hwaddr len)
3223{
3224    hwaddr l;
3225    hwaddr addr1;
3226    MemoryRegion *mr;
3227
3228    l = len;
3229    mr = flatview_translate(fv, addr, &addr1, &l, false, attrs);
3230    return flatview_read_continue(fv, addr, attrs, buf, len,
3231                                  addr1, l, mr);
3232}
3233
3234MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
3235                                    MemTxAttrs attrs, uint8_t *buf, hwaddr len)
3236{
3237    MemTxResult result = MEMTX_OK;
3238    FlatView *fv;
3239
3240    if (len > 0) {
3241        RCU_READ_LOCK_GUARD();
3242        fv = address_space_to_flatview(as);
3243        result = flatview_read(fv, addr, attrs, buf, len);
3244    }
3245
3246    return result;
3247}
3248
3249MemTxResult address_space_write(AddressSpace *as, hwaddr addr,
3250                                MemTxAttrs attrs,
3251                                const uint8_t *buf, hwaddr len)
3252{
3253    MemTxResult result = MEMTX_OK;
3254    FlatView *fv;
3255
3256    if (len > 0) {
3257        RCU_READ_LOCK_GUARD();
3258        fv = address_space_to_flatview(as);
3259        result = flatview_write(fv, addr, attrs, buf, len);
3260    }
3261
3262    return result;
3263}
3264
3265MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
3266                             uint8_t *buf, hwaddr len, bool is_write)
3267{
3268    if (is_write) {
3269        return address_space_write(as, addr, attrs, buf, len);
3270    } else {
3271        return address_space_read_full(as, addr, attrs, buf, len);
3272    }
3273}
3274
3275void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
3276                            hwaddr len, int is_write)
3277{
3278    address_space_rw(&address_space_memory, addr, MEMTXATTRS_UNSPECIFIED,
3279                     buf, len, is_write);
3280}
3281
3282enum write_rom_type {
3283    WRITE_DATA,
3284    FLUSH_CACHE,
3285};
3286
3287static inline MemTxResult address_space_write_rom_internal(AddressSpace *as,
3288                                                           hwaddr addr,
3289                                                           MemTxAttrs attrs,
3290                                                           const uint8_t *buf,
3291                                                           hwaddr len,
3292                                                           enum write_rom_type type)
3293{
3294    hwaddr l;
3295    uint8_t *ptr;
3296    hwaddr addr1;
3297    MemoryRegion *mr;
3298
3299    RCU_READ_LOCK_GUARD();
3300    while (len > 0) {
3301        l = len;
3302        mr = address_space_translate(as, addr, &addr1, &l, true, attrs);
3303
3304        if (!(memory_region_is_ram(mr) ||
3305              memory_region_is_romd(mr))) {
3306            l = memory_access_size(mr, l, addr1);
3307        } else {
3308            /* ROM/RAM case */
3309            ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
3310            switch (type) {
3311            case WRITE_DATA:
3312                memcpy(ptr, buf, l);
3313                invalidate_and_set_dirty(mr, addr1, l);
3314                break;
3315            case FLUSH_CACHE:
3316                flush_icache_range((uintptr_t)ptr, (uintptr_t)ptr + l);
3317                break;
3318            }
3319        }
3320        len -= l;
3321        buf += l;
3322        addr += l;
3323    }
3324    return MEMTX_OK;
3325}
3326
3327/* Used for ROM loading: can write to both RAM and ROM. */
3328MemTxResult address_space_write_rom(AddressSpace *as, hwaddr addr,
3329                                    MemTxAttrs attrs,
3330                                    const uint8_t *buf, hwaddr len)
3331{
3332    return address_space_write_rom_internal(as, addr, attrs,
3333                                            buf, len, WRITE_DATA);
3334}
3335
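/*
 * Illustrative sketch (not part of the original file): how a firmware
 * loader might use address_space_write_rom() to place a blob into a
 * ROM-backed region that a plain address_space_write() would not
 * modify.  The address and helper name are hypothetical.
 */
#if 0
static void load_firmware_blob(const uint8_t *blob, hwaddr size)
{
    address_space_write_rom(&address_space_memory, 0xfffc0000ULL,
                            MEMTXATTRS_UNSPECIFIED, blob, size);
}
#endif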
3336void cpu_flush_icache_range(hwaddr start, hwaddr len)
3337{
3338    /*
3339     * This function should do the same thing as an icache flush that was
3340     * triggered from within the guest.  For TCG we are always cache
3341     * coherent, so there is no need to flush anything.  For KVM / Xen we
3342     * need to flush at least the host's instruction cache.
3343     */
3344    if (tcg_enabled()) {
3345        return;
3346    }
3347
3348    address_space_write_rom_internal(&address_space_memory,
3349                                     start, MEMTXATTRS_UNSPECIFIED,
3350                                     NULL, len, FLUSH_CACHE);
3351}
3352
3353typedef struct {
3354    MemoryRegion *mr;
3355    void *buffer;
3356    hwaddr addr;
3357    hwaddr len;
3358    bool in_use;
3359} BounceBuffer;
3360
3361static BounceBuffer bounce;
3362
3363typedef struct MapClient {
3364    QEMUBH *bh;
3365    QLIST_ENTRY(MapClient) link;
3366} MapClient;
3367
3368QemuMutex map_client_list_lock;
3369static QLIST_HEAD(, MapClient) map_client_list
3370    = QLIST_HEAD_INITIALIZER(map_client_list);
3371
3372static void cpu_unregister_map_client_do(MapClient *client)
3373{
3374    QLIST_REMOVE(client, link);
3375    g_free(client);
3376}
3377
3378static void cpu_notify_map_clients_locked(void)
3379{
3380    MapClient *client;
3381
3382    while (!QLIST_EMPTY(&map_client_list)) {
3383        client = QLIST_FIRST(&map_client_list);
3384        qemu_bh_schedule(client->bh);
3385        cpu_unregister_map_client_do(client);
3386    }
3387}
3388
3389void cpu_register_map_client(QEMUBH *bh)
3390{
3391    MapClient *client = g_malloc(sizeof(*client));
3392
3393    qemu_mutex_lock(&map_client_list_lock);
3394    client->bh = bh;
3395    QLIST_INSERT_HEAD(&map_client_list, client, link);
3396    if (!atomic_read(&bounce.in_use)) {
3397        cpu_notify_map_clients_locked();
3398    }
3399    qemu_mutex_unlock(&map_client_list_lock);
3400}
3401
3402void cpu_exec_init_all(void)
3403{
3404    qemu_mutex_init(&ram_list.mutex);
3405    /* The data structures we set up here depend on knowing the page size,
3406     * so no more changes can be made after this point.
3407     * In an ideal world, nothing we did before we had finished the
3408     * machine setup would care about the target page size, and we could
3409     * do this much later, rather than requiring board models to state
3410     * up front what their requirements are.
3411     */
3412    finalize_target_page_bits();
3413    io_mem_init();
3414    memory_map_init();
3415    qemu_mutex_init(&map_client_list_lock);
3416}
3417
3418void cpu_unregister_map_client(QEMUBH *bh)
3419{
3420    MapClient *client;
3421
3422    qemu_mutex_lock(&map_client_list_lock);
3423    QLIST_FOREACH(client, &map_client_list, link) {
3424        if (client->bh == bh) {
3425            cpu_unregister_map_client_do(client);
3426            break;
3427        }
3428    }
3429    qemu_mutex_unlock(&map_client_list_lock);
3430}
3431
3432static void cpu_notify_map_clients(void)
3433{
3434    qemu_mutex_lock(&map_client_list_lock);
3435    cpu_notify_map_clients_locked();
3436    qemu_mutex_unlock(&map_client_list_lock);
3437}
3438
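/*
 * Illustrative sketch (hypothetical device, invented names): how a
 * device model can use the map-client machinery above to retry a DMA
 * transfer once the single bounce buffer is released.  The bottom half
 * is assumed to have been created with qemu_bh_new().
 */
#if 0
static void mydev_dma_retry_bh(void *opaque)
{
    MyDevState *s = opaque;      /* hypothetical device state */
    mydev_start_dma(s);          /* re-attempt the map/copy/unmap cycle */
}

static void mydev_dma_blocked(MyDevState *s)
{
    /* The BH is scheduled (once) when the bounce buffer frees up. */
    cpu_register_map_client(s->retry_bh);
}
#endif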
3439static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len,
3440                                  bool is_write, MemTxAttrs attrs)
3441{
3442    MemoryRegion *mr;
3443    hwaddr l, xlat;
3444
3445    while (len > 0) {
3446        l = len;
3447        mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
3448        if (!memory_access_is_direct(mr, is_write)) {
3449            l = memory_access_size(mr, l, addr);
3450            if (!memory_region_access_valid(mr, xlat, l, is_write, attrs)) {
3451                return false;
3452            }
3453        }
3454
3455        len -= l;
3456        addr += l;
3457    }
3458    return true;
3459}
3460
3461bool address_space_access_valid(AddressSpace *as, hwaddr addr,
3462                                hwaddr len, bool is_write,
3463                                MemTxAttrs attrs)
3464{
3465    FlatView *fv;
3466    bool result;
3467
3468    RCU_READ_LOCK_GUARD();
3469    fv = address_space_to_flatview(as);
3470    result = flatview_access_valid(fv, addr, len, is_write, attrs);
3471    return result;
3472}
3473
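/*
 * Illustrative sketch: probing a DMA window up front with
 * address_space_access_valid() so a device model can reject a bad
 * request cleanly instead of failing part-way through a transfer.
 * The helper name is hypothetical.
 */
#if 0
static bool mydev_dma_window_ok(AddressSpace *as, hwaddr base, hwaddr size)
{
    return address_space_access_valid(as, base, size, true,
                                      MEMTXATTRS_UNSPECIFIED);
}
#endif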
3474static hwaddr
3475flatview_extend_translation(FlatView *fv, hwaddr addr,
3476                            hwaddr target_len,
3477                            MemoryRegion *mr, hwaddr base, hwaddr len,
3478                            bool is_write, MemTxAttrs attrs)
3479{
3480    hwaddr done = 0;
3481    hwaddr xlat;
3482    MemoryRegion *this_mr;
3483
3484    for (;;) {
3485        target_len -= len;
3486        addr += len;
3487        done += len;
3488        if (target_len == 0) {
3489            return done;
3490        }
3491
3492        len = target_len;
3493        this_mr = flatview_translate(fv, addr, &xlat,
3494                                     &len, is_write, attrs);
3495        if (this_mr != mr || xlat != base + done) {
3496            return done;
3497        }
3498    }
3499}
3500
3501/* Map a physical memory region into a host virtual address.
3502 * May map a subset of the requested range, given by and returned in *plen.
3503 * May return NULL if resources needed to perform the mapping are exhausted.
3504 * Use only for reads OR writes - not for read-modify-write operations.
3505 * Use cpu_register_map_client() to know when retrying the map operation is
3506 * likely to succeed.
3507 */
3508void *address_space_map(AddressSpace *as,
3509                        hwaddr addr,
3510                        hwaddr *plen,
3511                        bool is_write,
3512                        MemTxAttrs attrs)
3513{
3514    hwaddr len = *plen;
3515    hwaddr l, xlat;
3516    MemoryRegion *mr;
3517    void *ptr;
3518    FlatView *fv;
3519
3520    if (len == 0) {
3521        return NULL;
3522    }
3523
3524    l = len;
3525    RCU_READ_LOCK_GUARD();
3526    fv = address_space_to_flatview(as);
3527    mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
3528
3529    if (!memory_access_is_direct(mr, is_write)) {
3530        if (atomic_xchg(&bounce.in_use, true)) {
3531            return NULL;
3532        }
3533        /* Avoid unbounded allocations */
3534        l = MIN(l, TARGET_PAGE_SIZE);
3535        bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
3536        bounce.addr = addr;
3537        bounce.len = l;
3538
3539        memory_region_ref(mr);
3540        bounce.mr = mr;
3541        if (!is_write) {
3542            flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED,
3543                          bounce.buffer, l);
3544        }
3545
3546        *plen = l;
3547        return bounce.buffer;
3548    }
3549
3550
3551    memory_region_ref(mr);
3552    *plen = flatview_extend_translation(fv, addr, len, mr, xlat,
3553                                        l, is_write, attrs);
3554    ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen, true);
3555
3556    return ptr;
3557}
3558
3559/* Unmaps a memory region previously mapped by address_space_map().
3560 * Will also mark the memory as dirty if is_write == 1.  access_len gives
3561 * the amount of memory that was actually read or written by the caller.
3562 */
3563void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
3564                         int is_write, hwaddr access_len)
3565{
3566    if (buffer != bounce.buffer) {
3567        MemoryRegion *mr;
3568        ram_addr_t addr1;
3569
3570        mr = memory_region_from_host(buffer, &addr1);
3571        assert(mr != NULL);
3572        if (is_write) {
3573            invalidate_and_set_dirty(mr, addr1, access_len);
3574        }
3575        if (xen_enabled()) {
3576            xen_invalidate_map_cache_entry(buffer);
3577        }
3578        memory_region_unref(mr);
3579        return;
3580    }
3581    if (is_write) {
3582        address_space_write(as, bounce.addr, MEMTXATTRS_UNSPECIFIED,
3583                            bounce.buffer, access_len);
3584    }
3585    qemu_vfree(bounce.buffer);
3586    bounce.buffer = NULL;
3587    memory_region_unref(bounce.mr);
3588    atomic_mb_set(&bounce.in_use, false);
3589    cpu_notify_map_clients();
3590}
3591
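/*
 * Illustrative sketch (hypothetical helper, not part of the original
 * file): the intended map -> access -> unmap cycle for a direct write
 * into guest memory.  Production device code usually goes through the
 * dma_memory_map() wrappers in include/sysemu/dma.h, which follow the
 * same pattern.
 */
#if 0
static bool fill_guest_buffer(AddressSpace *as, hwaddr addr, hwaddr size)
{
    hwaddr plen = size;
    void *host = address_space_map(as, addr, &plen, true,
                                   MEMTXATTRS_UNSPECIFIED);

    if (!host || plen < size) {
        /* Out of resources or a truncated mapping: a caller could
         * register a map client (see above) and retry later. */
        if (host) {
            address_space_unmap(as, host, plen, true, 0);
        }
        return false;
    }
    memset(host, 0, size);                   /* direct host access */
    address_space_unmap(as, host, plen, true, size);
    return true;
}
#endif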
3592void *cpu_physical_memory_map(hwaddr addr,
3593                              hwaddr *plen,
3594                              int is_write)
3595{
3596    return address_space_map(&address_space_memory, addr, plen, is_write,
3597                             MEMTXATTRS_UNSPECIFIED);
3598}
3599
3600void cpu_physical_memory_unmap(void *buffer, hwaddr len,
3601                               int is_write, hwaddr access_len)
3602{
3603    address_space_unmap(&address_space_memory, buffer, len, is_write, access_len);
3604}
3605
3606#define ARG1_DECL                AddressSpace *as
3607#define ARG1                     as
3608#define SUFFIX
3609#define TRANSLATE(...)           address_space_translate(as, __VA_ARGS__)
3610#define RCU_READ_LOCK(...)       rcu_read_lock()
3611#define RCU_READ_UNLOCK(...)     rcu_read_unlock()
3612#include "memory_ldst.inc.c"
3613
3614int64_t address_space_cache_init(MemoryRegionCache *cache,
3615                                 AddressSpace *as,
3616                                 hwaddr addr,
3617                                 hwaddr len,
3618                                 bool is_write)
3619{
3620    AddressSpaceDispatch *d;
3621    hwaddr l;
3622    MemoryRegion *mr;
3623
3624    assert(len > 0);
3625
3626    l = len;
3627    cache->fv = address_space_get_flatview(as);
3628    d = flatview_to_dispatch(cache->fv);
3629    cache->mrs = *address_space_translate_internal(d, addr, &cache->xlat, &l, true);
3630
3631    mr = cache->mrs.mr;
3632    memory_region_ref(mr);
3633    if (memory_access_is_direct(mr, is_write)) {
3634        /* We don't care about the memory attributes here as we're only
3635         * doing this if we found actual RAM, which behaves the same
3636         * regardless of attributes; so UNSPECIFIED is fine.
3637         */
3638        l = flatview_extend_translation(cache->fv, addr, len, mr,
3639                                        cache->xlat, l, is_write,
3640                                        MEMTXATTRS_UNSPECIFIED);
3641        cache->ptr = qemu_ram_ptr_length(mr->ram_block, cache->xlat, &l, true);
3642    } else {
3643        cache->ptr = NULL;
3644    }
3645
3646    cache->len = l;
3647    cache->is_write = is_write;
3648    return l;
3649}
3650
3651void address_space_cache_invalidate(MemoryRegionCache *cache,
3652                                    hwaddr addr,
3653                                    hwaddr access_len)
3654{
3655    assert(cache->is_write);
3656    if (likely(cache->ptr)) {
3657        invalidate_and_set_dirty(cache->mrs.mr, addr + cache->xlat, access_len);
3658    }
3659}
3660
3661void address_space_cache_destroy(MemoryRegionCache *cache)
3662{
3663    if (!cache->mrs.mr) {
3664        return;
3665    }
3666
3667    if (xen_enabled()) {
3668        xen_invalidate_map_cache_entry(cache->ptr);
3669    }
3670    memory_region_unref(cache->mrs.mr);
3671    flatview_unref(cache->fv);
3672    cache->mrs.mr = NULL;
3673    cache->fv = NULL;
3674}
3675
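/*
 * Illustrative sketch (hypothetical ring layout, not part of the
 * original file): the intended init / access / destroy life cycle of a
 * MemoryRegionCache, as used by code such as virtio that reads the same
 * guest structure repeatedly.
 */
#if 0
static uint16_t peek_ring_head(AddressSpace *as, hwaddr ring_pa)
{
    MemoryRegionCache cache;
    uint16_t head = 0;

    if (address_space_cache_init(&cache, as, ring_pa, sizeof(head),
                                 false) >= (int64_t)sizeof(head)) {
        /* RAM is accessed directly through cache.ptr; MMIO falls back
         * to the _cached_slow helpers defined in this file. */
        address_space_read_cached(&cache, 0, &head, sizeof(head));
    }
    address_space_cache_destroy(&cache);
    return head;
}
#endif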
3676/* Called from RCU critical section.  This function has the same
3677 * semantics as address_space_translate, but it only works on a
3678 * predefined range of a MemoryRegion that was mapped with
3679 * address_space_cache_init.
3680 */
3681static inline MemoryRegion *address_space_translate_cached(
3682    MemoryRegionCache *cache, hwaddr addr, hwaddr *xlat,
3683    hwaddr *plen, bool is_write, MemTxAttrs attrs)
3684{
3685    MemoryRegionSection section;
3686    MemoryRegion *mr;
3687    IOMMUMemoryRegion *iommu_mr;
3688    AddressSpace *target_as;
3689
3690    assert(!cache->ptr);
3691    *xlat = addr + cache->xlat;
3692
3693    mr = cache->mrs.mr;
3694    iommu_mr = memory_region_get_iommu(mr);
3695    if (!iommu_mr) {
3696        /* MMIO region.  */
3697        return mr;
3698    }
3699
3700    section = address_space_translate_iommu(iommu_mr, xlat, plen,
3701                                            NULL, is_write, true,
3702                                            &target_as, attrs);
3703    return section.mr;
3704}
3705
3706/* Called from RCU critical section. address_space_read_cached uses this
3707 * out of line function when the target is an MMIO or IOMMU region.
3708 */
3709void
3710address_space_read_cached_slow(MemoryRegionCache *cache, hwaddr addr,
3711                               void *buf, hwaddr len)
3712{
3713    hwaddr addr1, l;
3714    MemoryRegion *mr;
3715
3716    l = len;
3717    mr = address_space_translate_cached(cache, addr, &addr1, &l, false,
3718                                        MEMTXATTRS_UNSPECIFIED);
3719    flatview_read_continue(cache->fv,
3720                           addr, MEMTXATTRS_UNSPECIFIED, buf, len,
3721                           addr1, l, mr);
3722}
3723
3724/* Called from RCU critical section. address_space_write_cached uses this
3725 * out of line function when the target is an MMIO or IOMMU region.
3726 */
3727void
3728address_space_write_cached_slow(MemoryRegionCache *cache, hwaddr addr,
3729                                const void *buf, hwaddr len)
3730{
3731    hwaddr addr1, l;
3732    MemoryRegion *mr;
3733
3734    l = len;
3735    mr = address_space_translate_cached(cache, addr, &addr1, &l, true,
3736                                        MEMTXATTRS_UNSPECIFIED);
3737    flatview_write_continue(cache->fv,
3738                            addr, MEMTXATTRS_UNSPECIFIED, buf, len,
3739                            addr1, l, mr);
3740}
3741
3742#define ARG1_DECL                MemoryRegionCache *cache
3743#define ARG1                     cache
3744#define SUFFIX                   _cached_slow
3745#define TRANSLATE(...)           address_space_translate_cached(cache, __VA_ARGS__)
3746#define RCU_READ_LOCK()          ((void)0)
3747#define RCU_READ_UNLOCK()        ((void)0)
3748#include "memory_ldst.inc.c"
3749
3750/* virtual memory access for debug (includes writing to ROM) */
3751int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
3752                        uint8_t *buf, target_ulong len, int is_write)
3753{
3754    hwaddr phys_addr;
3755    target_ulong l, page;
3756
3757    cpu_synchronize_state(cpu);
3758    while (len > 0) {
3759        int asidx;
3760        MemTxAttrs attrs;
3761
3762        page = addr & TARGET_PAGE_MASK;
3763        phys_addr = cpu_get_phys_page_attrs_debug(cpu, page, &attrs);
3764        asidx = cpu_asidx_from_attrs(cpu, attrs);
3765        /* if no physical page mapped, return an error */
3766        if (phys_addr == -1) {
3767            return -1;
3768        }
3769        l = (page + TARGET_PAGE_SIZE) - addr;
3770        l = MIN(l, len);
3771        phys_addr += (addr & ~TARGET_PAGE_MASK);
3772        if (is_write) {
3773            address_space_write_rom(cpu->cpu_ases[asidx].as, phys_addr,
3774                                    attrs, buf, l);
3775        } else {
3776            address_space_rw(cpu->cpu_ases[asidx].as, phys_addr,
3777                             attrs, buf, l, 0);
3778        }
3779        len -= l;
3780        buf += l;
3781        addr += l;
3782    }
3783    return 0;
3784}
3785
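/*
 * Illustrative sketch: how a debug front end (gdbstub, monitor command)
 * pulls guest-virtual memory through cpu_memory_rw_debug().  The bytes
 * come back unconverted, i.e. in target byte order.  Helper name is
 * hypothetical.
 */
#if 0
static bool debug_read_u32(CPUState *cpu, target_ulong vaddr, uint32_t *val)
{
    return cpu_memory_rw_debug(cpu, vaddr, (uint8_t *)val,
                               sizeof(*val), 0) == 0;
}
#endif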
3786/*
3787 * Allows code that needs to deal with migration bitmaps etc. to still be
3788 * built target-independent.
3789 */
3790size_t qemu_target_page_size(void)
3791{
3792    return TARGET_PAGE_SIZE;
3793}
3794
3795int qemu_target_page_bits(void)
3796{
3797    return TARGET_PAGE_BITS;
3798}
3799
3800int qemu_target_page_bits_min(void)
3801{
3802    return TARGET_PAGE_BITS_MIN;
3803}
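/*
 * Illustrative sketch: sizing a dirty bitmap from target-independent
 * code (e.g. migration) using the accessors above.  The helper and
 * parameter names are assumptions.
 */
#if 0
static size_t dirty_bitmap_longs(uint64_t ram_bytes)
{
    uint64_t pages = ram_bytes >> qemu_target_page_bits();

    return DIV_ROUND_UP(pages, BITS_PER_LONG);
}
#endif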
3804#endif
3805
3806bool target_words_bigendian(void)
3807{
3808#if defined(TARGET_WORDS_BIGENDIAN)
3809    return true;
3810#else
3811    return false;
3812#endif
3813}
3814
3815#ifndef CONFIG_USER_ONLY
3816bool cpu_physical_memory_is_io(hwaddr phys_addr)
3817{
3818    MemoryRegion *mr;
3819    hwaddr l = 1;
3820    bool res;
3821
3822    RCU_READ_LOCK_GUARD();
3823    mr = address_space_translate(&address_space_memory,
3824                                 phys_addr, &phys_addr, &l, false,
3825                                 MEMTXATTRS_UNSPECIFIED);
3826
3827    res = !(memory_region_is_ram(mr) || memory_region_is_romd(mr));
3828    return res;
3829}
3830
3831int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
3832{
3833    RAMBlock *block;
3834    int ret = 0;
3835
3836    RCU_READ_LOCK_GUARD();
3837    RAMBLOCK_FOREACH(block) {
3838        ret = func(block, opaque);
3839        if (ret) {
3840            break;
3841        }
3842    }
3843    return ret;
3844}
3845
3846/*
3847 * Unmap pages of memory from start to start+length such that
3848 * they a) read as 0 and b) trigger whatever fault mechanism
3849 * the OS provides for postcopy.
3850 * The pages must be unmapped by the end of the function.
3851 *
3852 * Returns: 0 on success, non-zero on failure.
3853 */
3854int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
3855{
3856    int ret = -1;
3857
3858    uint8_t *host_startaddr = rb->host + start;
3859
3860    if ((uintptr_t)host_startaddr & (rb->page_size - 1)) {
3861        error_report("ram_block_discard_range: Unaligned start address: %p",
3862                     host_startaddr);
3863        goto err;
3864    }
3865
3866    if ((start + length) <= rb->used_length) {
3867        bool need_madvise, need_fallocate;
3868        uint8_t *host_endaddr = host_startaddr + length;
3869        if ((uintptr_t)host_endaddr & (rb->page_size - 1)) {
3870            error_report("ram_block_discard_range: Unaligned end address: %p",
3871                         host_endaddr);
3872            goto err;
3873        }
3874
3875        errno = ENOTSUP; /* If we are missing MADVISE etc */
3876
3877        /* The logic here is messy:
3878         *    madvise DONTNEED fails for hugepages
3879         *    fallocate works on hugepages and shmem
3880         */
3881        need_madvise = (rb->page_size == qemu_host_page_size);
3882        need_fallocate = rb->fd != -1;
3883        if (need_fallocate) {
3884            /* For a file, this causes the area of the file to be zeroed
3885             * when read, and for hugetlbfs it also causes the area to be
3886             * unmapped so a userfault will trigger.
3887             */
3888#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
3889            ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
3890                            start, length);
3891            if (ret) {
3892                ret = -errno;
3893                error_report("ram_block_discard_range: Failed to fallocate "
3894                             "%s:%" PRIx64 " +%zx (%d)",
3895                             rb->idstr, start, length, ret);
3896                goto err;
3897            }
3898#else
3899            ret = -ENOSYS;
3900            error_report("ram_block_discard_range: fallocate not available "
3901                         "for file %s:%" PRIx64 " +%zx (%d)",
3902                         rb->idstr, start, length, ret);
3903            goto err;
3904#endif
3905        }
3906        if (need_madvise) {
3907            /* For normal RAM this causes it to be unmapped,
3908             * for shared memory it causes the local mapping to disappear
3909             * and to fall back on the file contents (which we just
3910             * fallocate'd away).
3911             */
3912#if defined(CONFIG_MADVISE)
3913            ret = madvise(host_startaddr, length, MADV_DONTNEED);
3914            if (ret) {
3915                ret = -errno;
3916                error_report("ram_block_discard_range: Failed to discard range "
3917                             "%s:%" PRIx64 " +%zx (%d)",
3918                             rb->idstr, start, length, ret);
3919                goto err;
3920            }
3921#else
3922            ret = -ENOSYS;
3923            error_report("ram_block_discard_range: madvise not available "
3924                         "for %s:%" PRIx64 " +%zx (%d)",
3925                         rb->idstr, start, length, ret);
3926            goto err;
3927#endif
3928        }
3929        trace_ram_block_discard_range(rb->idstr, host_startaddr, length,
3930                                      need_madvise, need_fallocate, ret);
3931    } else {
3932        error_report("ram_block_discard_range: Overrun block '%s' (%" PRIu64
3933                     "/%zx/" RAM_ADDR_FMT")",
3934                     rb->idstr, start, length, rb->used_length);
3935    }
3936
3937err:
3938    return ret;
3939}
3940
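/*
 * Illustrative sketch (stand-alone, not a QEMU API): the same punch-hole
 * plus madvise pairing applied to a plain file-backed mapping, showing
 * why both steps are needed for shared or file-backed RAM.  Assumes
 * Linux with FALLOC_FL_PUNCH_HOLE support; names are invented.
 */
#if 0
static int discard_mapping_range(int fd, void *host, off_t off, size_t len)
{
    /* Drop the backing store so reads of the range return zeroes... */
    if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                  off, len) < 0) {
        return -errno;
    }
    /* ...and drop the local mapping so the next access faults the (now
     * empty) pages back in, which is what postcopy relies on. */
    if (madvise(host, len, MADV_DONTNEED) < 0) {
        return -errno;
    }
    return 0;
}
#endif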
3941bool ramblock_is_pmem(RAMBlock *rb)
3942{
3943    return rb->flags & RAM_PMEM;
3944}
3945
3946#endif
3947
3948void page_size_init(void)
3949{
3950    /* NOTE: we can always assume that qemu_host_page_size >=
3951       TARGET_PAGE_SIZE */
3952    if (qemu_host_page_size == 0) {
3953        qemu_host_page_size = qemu_real_host_page_size;
3954    }
3955    if (qemu_host_page_size < TARGET_PAGE_SIZE) {
3956        qemu_host_page_size = TARGET_PAGE_SIZE;
3957    }
3958    qemu_host_page_mask = -(intptr_t)qemu_host_page_size;
3959}
3960
3961#if !defined(CONFIG_USER_ONLY)
3962
3963static void mtree_print_phys_entries(int start, int end, int skip, int ptr)
3964{
3965    if (start == end - 1) {
3966        qemu_printf("\t%3d      ", start);
3967    } else {
3968        qemu_printf("\t%3d..%-3d ", start, end - 1);
3969    }
3970    qemu_printf(" skip=%d ", skip);
3971    if (ptr == PHYS_MAP_NODE_NIL) {
3972        qemu_printf(" ptr=NIL");
3973    } else if (!skip) {
3974        qemu_printf(" ptr=#%d", ptr);
3975    } else {
3976        qemu_printf(" ptr=[%d]", ptr);
3977    }
3978    qemu_printf("\n");
3979}
3980
3981#define MR_SIZE(size) (int128_nz(size) ? (hwaddr)int128_get64( \
3982                           int128_sub((size), int128_one())) : 0)
3983
3984void mtree_print_dispatch(AddressSpaceDispatch *d, MemoryRegion *root)
3985{
3986    int i;
3987
3988    qemu_printf("  Dispatch\n");
3989    qemu_printf("    Physical sections\n");
3990
3991    for (i = 0; i < d->map.sections_nb; ++i) {
3992        MemoryRegionSection *s = d->map.sections + i;
3993        const char *names[] = { " [unassigned]", " [not dirty]",
3994                                " [ROM]", " [watch]" };
3995
3996        qemu_printf("      #%d @" TARGET_FMT_plx ".." TARGET_FMT_plx
3997                    " %s%s%s%s%s",
3998            i,
3999            s->offset_within_address_space,
4000            s->offset_within_address_space + MR_SIZE(s->mr->size),
4001            s->mr->name ? s->mr->name : "(noname)",
4002            i < ARRAY_SIZE(names) ? names[i] : "",
4003            s->mr == root ? " [ROOT]" : "",
4004            s == d->mru_section ? " [MRU]" : "",
4005            s->mr->is_iommu ? " [iommu]" : "");
4006
4007        if (s->mr->alias) {
4008            qemu_printf(" alias=%s", s->mr->alias->name ?
4009                    s->mr->alias->name : "noname");
4010        }
4011        qemu_printf("\n");
4012    }
4013
4014    qemu_printf("    Nodes (%d bits per level, %d levels) ptr=[%d] skip=%d\n",
4015               P_L2_BITS, P_L2_LEVELS, d->phys_map.ptr, d->phys_map.skip);
4016    for (i = 0; i < d->map.nodes_nb; ++i) {
4017        int j, jprev;
4018        PhysPageEntry prev;
4019        Node *n = d->map.nodes + i;
4020
4021        qemu_printf("      [%d]\n", i);
4022
4023        for (j = 0, jprev = 0, prev = *n[0]; j < ARRAY_SIZE(*n); ++j) {
4024            PhysPageEntry *pe = *n + j;
4025
4026            if (pe->ptr == prev.ptr && pe->skip == prev.skip) {
4027                continue;
4028            }
4029
4030            mtree_print_phys_entries(jprev, j, prev.skip, prev.ptr);
4031
4032            jprev = j;
4033            prev = *pe;
4034        }
4035
4036        if (jprev != ARRAY_SIZE(*n)) {
4037            mtree_print_phys_entries(jprev, j, prev.skip, prev.ptr);
4038        }
4039    }
4040}
4041
4042#endif
4043