qemu/exec.c
   1/*
   2 *  Virtual page mapping
   3 *
   4 *  Copyright (c) 2003 Fabrice Bellard
   5 *
   6 * This library is free software; you can redistribute it and/or
   7 * modify it under the terms of the GNU Lesser General Public
   8 * License as published by the Free Software Foundation; either
   9 * version 2 of the License, or (at your option) any later version.
  10 *
  11 * This library is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 * Lesser General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU Lesser General Public
  17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18 */
  19#include "qemu/osdep.h"
  20#include "qapi/error.h"
  21#ifndef _WIN32
  22#endif
  23
  24#include "qemu/cutils.h"
  25#include "cpu.h"
  26#include "exec/exec-all.h"
  27#include "exec/target_page.h"
  28#include "tcg.h"
  29#include "hw/qdev-core.h"
  30#include "hw/qdev-properties.h"
  31#if !defined(CONFIG_USER_ONLY)
  32#include "hw/boards.h"
  33#include "hw/xen/xen.h"
  34#endif
  35#include "sysemu/kvm.h"
  36#include "sysemu/sysemu.h"
  37#include "qemu/timer.h"
  38#include "qemu/config-file.h"
  39#include "qemu/error-report.h"
  40#if defined(CONFIG_USER_ONLY)
  41#include "qemu.h"
  42#else /* !CONFIG_USER_ONLY */
  43#include "hw/hw.h"
  44#include "exec/memory.h"
  45#include "exec/ioport.h"
  46#include "sysemu/dma.h"
  47#include "sysemu/numa.h"
  48#include "sysemu/hw_accel.h"
  49#include "exec/address-spaces.h"
  50#include "sysemu/xen-mapcache.h"
  51#include "trace-root.h"
  52
  53#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
  54#include <fcntl.h>
  55#include <linux/falloc.h>
  56#endif
  57
  58#endif
  59#include "qemu/rcu_queue.h"
  60#include "qemu/main-loop.h"
  61#include "translate-all.h"
  62#include "sysemu/replay.h"
  63
  64#include "exec/memory-internal.h"
  65#include "exec/ram_addr.h"
  66#include "exec/log.h"
  67
  68#include "migration/vmstate.h"
  69
  70#include "qemu/range.h"
  71#ifndef _WIN32
  72#include "qemu/mmap-alloc.h"
  73#endif
  74
  75#include "monitor/monitor.h"
  76
  77//#define DEBUG_SUBPAGE
  78
  79#if !defined(CONFIG_USER_ONLY)
  80/* ram_list is read under rcu_read_lock()/rcu_read_unlock().  Writes
  81 * are protected by the ramlist lock.
  82 */
  83RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };
  84
  85static MemoryRegion *system_memory;
  86static MemoryRegion *system_io;
  87
  88AddressSpace address_space_io;
  89AddressSpace address_space_memory;
  90
  91MemoryRegion io_mem_rom, io_mem_notdirty;
  92static MemoryRegion io_mem_unassigned;
  93
  94/* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */
  95#define RAM_PREALLOC   (1 << 0)
  96
  97/* RAM is mmap-ed with MAP_SHARED */
  98#define RAM_SHARED     (1 << 1)
  99
 100/* Only a portion of RAM (used_length) is actually used, and migrated.
 101 * This used_length size can change across reboots.
 102 */
 103#define RAM_RESIZEABLE (1 << 2)
 104
 105#endif
 106
 107#ifdef TARGET_PAGE_BITS_VARY
 108int target_page_bits;
 109bool target_page_bits_decided;
 110#endif
 111
 112struct CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);
 113/* current CPU in the current thread. It is only valid inside
 114   cpu_exec() */
 115__thread CPUState *current_cpu;
 116/* 0 = Do not count executed instructions.
 117   1 = Precise instruction counting.
 118   2 = Adaptive rate instruction counting.  */
 119int use_icount;
 120
 121uintptr_t qemu_host_page_size;
 122intptr_t qemu_host_page_mask;
 123
 124bool set_preferred_target_page_bits(int bits)
 125{
 126    /* The target page size is the lowest common denominator for all
 127     * the CPUs in the system, so we can only make it smaller, never
 128     * larger. And we can't make it smaller once we've committed to
 129     * a particular size.
 130     */
 131#ifdef TARGET_PAGE_BITS_VARY
 132    assert(bits >= TARGET_PAGE_BITS_MIN);
 133    if (target_page_bits == 0 || target_page_bits > bits) {
 134        if (target_page_bits_decided) {
 135            return false;
 136        }
 137        target_page_bits = bits;
 138    }
 139#endif
 140    return true;
 141}
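/*
 * Illustrative sketch of how the negotiation above is meant to be used,
 * assuming a target built with TARGET_PAGE_BITS_VARY (the 12 here is just
 * an example value):
 *
 *     if (!set_preferred_target_page_bits(12)) {
 *         // too late to shrink: a translation already committed the size
 *     }
 *
 * Once finalize_target_page_bits() below has run, TARGET_PAGE_SIZE
 * evaluates to 1 << target_page_bits and can never shrink again.
 */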
 142
 143#if !defined(CONFIG_USER_ONLY)
 144
 145static void finalize_target_page_bits(void)
 146{
 147#ifdef TARGET_PAGE_BITS_VARY
 148    if (target_page_bits == 0) {
 149        target_page_bits = TARGET_PAGE_BITS_MIN;
 150    }
 151    target_page_bits_decided = true;
 152#endif
 153}
 154
 155typedef struct PhysPageEntry PhysPageEntry;
 156
 157struct PhysPageEntry {
  158    /* How many levels to skip down to the next node (each level is L2_SIZE wide); 0 for a leaf. */
 159    uint32_t skip : 6;
 160     /* index into phys_sections (!skip) or phys_map_nodes (skip) */
 161    uint32_t ptr : 26;
 162};
 163
 164#define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6)
 165
 166/* Size of the L2 (and L3, etc) page tables.  */
 167#define ADDR_SPACE_BITS 64
 168
 169#define P_L2_BITS 9
 170#define P_L2_SIZE (1 << P_L2_BITS)
 171
 172#define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)
 173
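/*
 * A worked example of the constants above, assuming a 12-bit target page:
 *
 *     P_L2_SIZE   = 1 << 9                  = 512 entries per Node
 *     P_L2_LEVELS = ((64 - 12 - 1) / 9) + 1 = 6
 *
 * so a physical address is resolved through up to six 9-bit indices above
 * the page offset (the topmost level only partially used, since
 * 6 * 9 > 64 - 12).
 */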
 174typedef PhysPageEntry Node[P_L2_SIZE];
 175
 176typedef struct PhysPageMap {
 177    struct rcu_head rcu;
 178
 179    unsigned sections_nb;
 180    unsigned sections_nb_alloc;
 181    unsigned nodes_nb;
 182    unsigned nodes_nb_alloc;
 183    Node *nodes;
 184    MemoryRegionSection *sections;
 185} PhysPageMap;
 186
 187struct AddressSpaceDispatch {
 188    MemoryRegionSection *mru_section;
 189    /* This is a multi-level map on the physical address space.
 190     * The bottom level has pointers to MemoryRegionSections.
 191     */
 192    PhysPageEntry phys_map;
 193    PhysPageMap map;
 194};
 195
 196#define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK)
 197typedef struct subpage_t {
 198    MemoryRegion iomem;
 199    FlatView *fv;
 200    hwaddr base;
 201    uint16_t sub_section[];
 202} subpage_t;
 203
 204#define PHYS_SECTION_UNASSIGNED 0
 205#define PHYS_SECTION_NOTDIRTY 1
 206#define PHYS_SECTION_ROM 2
 207#define PHYS_SECTION_WATCH 3
 208
 209static void io_mem_init(void);
 210static void memory_map_init(void);
 211static void tcg_commit(MemoryListener *listener);
 212
 213static MemoryRegion io_mem_watch;
 214
 215/**
 216 * CPUAddressSpace: all the information a CPU needs about an AddressSpace
 217 * @cpu: the CPU whose AddressSpace this is
 218 * @as: the AddressSpace itself
 219 * @memory_dispatch: its dispatch pointer (cached, RCU protected)
 220 * @tcg_as_listener: listener for tracking changes to the AddressSpace
 221 */
 222struct CPUAddressSpace {
 223    CPUState *cpu;
 224    AddressSpace *as;
 225    struct AddressSpaceDispatch *memory_dispatch;
 226    MemoryListener tcg_as_listener;
 227};
 228
 229struct DirtyBitmapSnapshot {
 230    ram_addr_t start;
 231    ram_addr_t end;
 232    unsigned long dirty[];
 233};
 234
 235#endif
 236
 237#if !defined(CONFIG_USER_ONLY)
 238
 239static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
 240{
 241    static unsigned alloc_hint = 16;
 242    if (map->nodes_nb + nodes > map->nodes_nb_alloc) {
 243        map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, alloc_hint);
 244        map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, map->nodes_nb + nodes);
 245        map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
 246        alloc_hint = map->nodes_nb_alloc;
 247    }
 248}
 249
 250static uint32_t phys_map_node_alloc(PhysPageMap *map, bool leaf)
 251{
 252    unsigned i;
 253    uint32_t ret;
 254    PhysPageEntry e;
 255    PhysPageEntry *p;
 256
 257    ret = map->nodes_nb++;
 258    p = map->nodes[ret];
 259    assert(ret != PHYS_MAP_NODE_NIL);
 260    assert(ret != map->nodes_nb_alloc);
 261
 262    e.skip = leaf ? 0 : 1;
 263    e.ptr = leaf ? PHYS_SECTION_UNASSIGNED : PHYS_MAP_NODE_NIL;
 264    for (i = 0; i < P_L2_SIZE; ++i) {
 265        memcpy(&p[i], &e, sizeof(e));
 266    }
 267    return ret;
 268}
 269
 270static void phys_page_set_level(PhysPageMap *map, PhysPageEntry *lp,
 271                                hwaddr *index, hwaddr *nb, uint16_t leaf,
 272                                int level)
 273{
 274    PhysPageEntry *p;
 275    hwaddr step = (hwaddr)1 << (level * P_L2_BITS);
 276
 277    if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
 278        lp->ptr = phys_map_node_alloc(map, level == 0);
 279    }
 280    p = map->nodes[lp->ptr];
 281    lp = &p[(*index >> (level * P_L2_BITS)) & (P_L2_SIZE - 1)];
 282
 283    while (*nb && lp < &p[P_L2_SIZE]) {
 284        if ((*index & (step - 1)) == 0 && *nb >= step) {
 285            lp->skip = 0;
 286            lp->ptr = leaf;
 287            *index += step;
 288            *nb -= step;
 289        } else {
 290            phys_page_set_level(map, lp, index, nb, leaf, level - 1);
 291        }
 292        ++lp;
 293    }
 294}
 295
 296static void phys_page_set(AddressSpaceDispatch *d,
 297                          hwaddr index, hwaddr nb,
 298                          uint16_t leaf)
 299{
 300    /* Wildly overreserve - it doesn't matter much. */
 301    phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS);
 302
 303    phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
 304}
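/*
 * Callers pass phys_page_set() a page index and a page count; for example
 * register_multipage() further down effectively does
 *
 *     phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages,
 *                   phys_section_add(&d->map, section));
 *
 * and phys_page_set_level() fills a whole entry at some level whenever the
 * remaining range is aligned to, and at least as large as, that level's
 * step of 1 << (level * P_L2_BITS) pages.
 */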
 305
  306/* Compact a non-leaf page entry: if the entry has a single child, update
  307 * our entry so we can skip the child and go directly to the destination.
 308 */
 309static void phys_page_compact(PhysPageEntry *lp, Node *nodes)
 310{
 311    unsigned valid_ptr = P_L2_SIZE;
 312    int valid = 0;
 313    PhysPageEntry *p;
 314    int i;
 315
 316    if (lp->ptr == PHYS_MAP_NODE_NIL) {
 317        return;
 318    }
 319
 320    p = nodes[lp->ptr];
 321    for (i = 0; i < P_L2_SIZE; i++) {
 322        if (p[i].ptr == PHYS_MAP_NODE_NIL) {
 323            continue;
 324        }
 325
 326        valid_ptr = i;
 327        valid++;
 328        if (p[i].skip) {
 329            phys_page_compact(&p[i], nodes);
 330        }
 331    }
 332
 333    /* We can only compress if there's only one child. */
 334    if (valid != 1) {
 335        return;
 336    }
 337
 338    assert(valid_ptr < P_L2_SIZE);
 339
 340    /* Don't compress if it won't fit in the # of bits we have. */
 341    if (lp->skip + p[valid_ptr].skip >= (1 << 3)) {
 342        return;
 343    }
 344
 345    lp->ptr = p[valid_ptr].ptr;
 346    if (!p[valid_ptr].skip) {
 347        /* If our only child is a leaf, make this a leaf. */
 348        /* By design, we should have made this node a leaf to begin with so we
 349         * should never reach here.
 350         * But since it's so simple to handle this, let's do it just in case we
 351         * change this rule.
 352         */
 353        lp->skip = 0;
 354    } else {
 355        lp->skip += p[valid_ptr].skip;
 356    }
 357}
 358
 359void address_space_dispatch_compact(AddressSpaceDispatch *d)
 360{
 361    if (d->phys_map.skip) {
 362        phys_page_compact(&d->phys_map, d->map.nodes);
 363    }
 364}
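/*
 * A sketch of the effect of compaction, for a sparse map where each of two
 * chained nodes has a single populated slot:
 *
 *     before:  root -(skip 1)-> A -(skip 1)-> B -> leaf section
 *     after:   root -(skip 2)--------------> B -> leaf section
 *
 * phys_page_find() then does one less node dereference per lookup; the
 * check above on lp->skip + p[valid_ptr].skip keeps the accumulated skip
 * within the entry's bit-field.
 */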
 365
 366static inline bool section_covers_addr(const MemoryRegionSection *section,
 367                                       hwaddr addr)
 368{
 369    /* Memory topology clips a memory region to [0, 2^64); size.hi > 0 means
 370     * the section must cover the entire address space.
 371     */
 372    return int128_gethi(section->size) ||
 373           range_covers_byte(section->offset_within_address_space,
 374                             int128_getlo(section->size), addr);
 375}
 376
 377static MemoryRegionSection *phys_page_find(AddressSpaceDispatch *d, hwaddr addr)
 378{
 379    PhysPageEntry lp = d->phys_map, *p;
 380    Node *nodes = d->map.nodes;
 381    MemoryRegionSection *sections = d->map.sections;
 382    hwaddr index = addr >> TARGET_PAGE_BITS;
 383    int i;
 384
 385    for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) {
 386        if (lp.ptr == PHYS_MAP_NODE_NIL) {
 387            return &sections[PHYS_SECTION_UNASSIGNED];
 388        }
 389        p = nodes[lp.ptr];
 390        lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
 391    }
 392
 393    if (section_covers_addr(&sections[lp.ptr], addr)) {
 394        return &sections[lp.ptr];
 395    } else {
 396        return &sections[PHYS_SECTION_UNASSIGNED];
 397    }
 398}
 399
 400bool memory_region_is_unassigned(MemoryRegion *mr)
 401{
 402    return mr != &io_mem_rom && mr != &io_mem_notdirty && !mr->rom_device
 403        && mr != &io_mem_watch;
 404}
 405
 406/* Called from RCU critical section */
 407static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
 408                                                        hwaddr addr,
 409                                                        bool resolve_subpage)
 410{
 411    MemoryRegionSection *section = atomic_read(&d->mru_section);
 412    subpage_t *subpage;
 413
 414    if (!section || section == &d->map.sections[PHYS_SECTION_UNASSIGNED] ||
 415        !section_covers_addr(section, addr)) {
 416        section = phys_page_find(d, addr);
 417        atomic_set(&d->mru_section, section);
 418    }
 419    if (resolve_subpage && section->mr->subpage) {
 420        subpage = container_of(section->mr, subpage_t, iomem);
 421        section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
 422    }
 423    return section;
 424}
 425
 426/* Called from RCU critical section */
 427static MemoryRegionSection *
 428address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *xlat,
 429                                 hwaddr *plen, bool resolve_subpage)
 430{
 431    MemoryRegionSection *section;
 432    MemoryRegion *mr;
 433    Int128 diff;
 434
 435    section = address_space_lookup_region(d, addr, resolve_subpage);
 436    /* Compute offset within MemoryRegionSection */
 437    addr -= section->offset_within_address_space;
 438
 439    /* Compute offset within MemoryRegion */
 440    *xlat = addr + section->offset_within_region;
 441
 442    mr = section->mr;
 443
 444    /* MMIO registers can be expected to perform full-width accesses based only
 445     * on their address, without considering adjacent registers that could
 446     * decode to completely different MemoryRegions.  When such registers
 447     * exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO
 448     * regions overlap wildly.  For this reason we cannot clamp the accesses
 449     * here.
 450     *
 451     * If the length is small (as is the case for address_space_ldl/stl),
 452     * everything works fine.  If the incoming length is large, however,
 453     * the caller really has to do the clamping through memory_access_size.
 454     */
 455    if (memory_region_is_ram(mr)) {
 456        diff = int128_sub(section->size, int128_make64(addr));
 457        *plen = int128_get64(int128_min(diff, int128_make64(*plen)));
 458    }
 459    return section;
 460}
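/*
 * A numeric example of the RAM clamp above: for a RAM section 0x2000 bytes
 * long, a lookup at offset 0x1800 into the section with *plen == 0x1000
 * returns *plen == 0x800 (the bytes left in the section), whereas an MMIO
 * section leaves *plen untouched and relies on the caller's
 * memory_access_size() clamping, as the comment explains.
 */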
 461
 462/**
 463 * flatview_do_translate - translate an address in FlatView
 464 *
 465 * @fv: the flat view that we want to translate on
 466 * @addr: the address to be translated in above address space
 467 * @xlat: the translated address offset within memory region. It
 468 *        cannot be @NULL.
 469 * @plen_out: valid read/write length of the translated address. It
 470 *            can be @NULL when we don't care about it.
  471 * @page_mask_out: page mask for the translated address. This is
  472 *            only meaningful for IOMMU-translated addresses, since
  473 *            the IOMMU may map pages larger than the target page
  474 *            size. It can be @NULL if we don't care about it.
 475 * @is_write: whether the translation operation is for write
 476 * @is_mmio: whether this can be MMIO, set true if it can
 477 *
  478 * This function is called from an RCU critical section.
 479 */
 480static MemoryRegionSection flatview_do_translate(FlatView *fv,
 481                                                 hwaddr addr,
 482                                                 hwaddr *xlat,
 483                                                 hwaddr *plen_out,
 484                                                 hwaddr *page_mask_out,
 485                                                 bool is_write,
 486                                                 bool is_mmio,
 487                                                 AddressSpace **target_as)
 488{
 489    IOMMUTLBEntry iotlb;
 490    MemoryRegionSection *section;
 491    IOMMUMemoryRegion *iommu_mr;
 492    IOMMUMemoryRegionClass *imrc;
 493    hwaddr page_mask = (hwaddr)(-1);
 494    hwaddr plen = (hwaddr)(-1);
 495
 496    if (plen_out) {
 497        plen = *plen_out;
 498    }
 499
 500    for (;;) {
 501        section = address_space_translate_internal(
 502                flatview_to_dispatch(fv), addr, &addr,
 503                &plen, is_mmio);
 504
 505        iommu_mr = memory_region_get_iommu(section->mr);
 506        if (!iommu_mr) {
 507            break;
 508        }
 509        imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
 510
 511        iotlb = imrc->translate(iommu_mr, addr, is_write ?
 512                                IOMMU_WO : IOMMU_RO);
 513        addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
 514                | (addr & iotlb.addr_mask));
 515        page_mask &= iotlb.addr_mask;
 516        plen = MIN(plen, (addr | iotlb.addr_mask) - addr + 1);
 517        if (!(iotlb.perm & (1 << is_write))) {
 518            goto translate_fail;
 519        }
 520
 521        fv = address_space_to_flatview(iotlb.target_as);
 522        *target_as = iotlb.target_as;
 523    }
 524
 525    *xlat = addr;
 526
 527    if (page_mask == (hwaddr)(-1)) {
 528        /* Not behind an IOMMU, use default page size. */
 529        page_mask = ~TARGET_PAGE_MASK;
 530    }
 531
 532    if (page_mask_out) {
 533        *page_mask_out = page_mask;
 534    }
 535
 536    if (plen_out) {
 537        *plen_out = plen;
 538    }
 539
 540    return *section;
 541
 542translate_fail:
 543    return (MemoryRegionSection) { .mr = &io_mem_unassigned };
 544}
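/*
 * Sketch of one iteration of the IOMMU loop above, assuming a 4 KiB IOMMU
 * page (iotlb.addr_mask == 0xfff) and iotlb.translated_addr == 0x80000000
 * for an input addr of 0x12345678:
 *
 *     addr = (0x80000000 & ~0xfff) | (0x12345678 & 0xfff) = 0x80000678
 *     plen = MIN(plen, (addr | 0xfff) - addr + 1)          = at most 0x988
 *
 * after which the loop restarts in iotlb.target_as, so translations may
 * recurse through nested IOMMUs.
 */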
 545
 546/* Called from RCU critical section */
 547IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr,
 548                                            bool is_write)
 549{
 550    MemoryRegionSection section;
 551    hwaddr xlat, page_mask;
 552
 553    /*
  554     * This can never be MMIO; we don't really care about plen here,
  555     * only about the page mask.
 556     */
 557    section = flatview_do_translate(address_space_to_flatview(as), addr, &xlat,
 558                                    NULL, &page_mask, is_write, false, &as);
 559
 560    /* Illegal translation */
 561    if (section.mr == &io_mem_unassigned) {
 562        goto iotlb_fail;
 563    }
 564
 565    /* Convert memory region offset into address space offset */
 566    xlat += section.offset_within_address_space -
 567        section.offset_within_region;
 568
 569    return (IOMMUTLBEntry) {
 570        .target_as = as,
 571        .iova = addr & ~page_mask,
 572        .translated_addr = xlat & ~page_mask,
 573        .addr_mask = page_mask,
  574        /* IOTLBs are for DMA, and DMA is only allowed on RAM. */
 575        .perm = IOMMU_RW,
 576    };
 577
 578iotlb_fail:
 579    return (IOMMUTLBEntry) {0};
 580}
 581
 582/* Called from RCU critical section */
 583MemoryRegion *flatview_translate(FlatView *fv, hwaddr addr, hwaddr *xlat,
 584                                 hwaddr *plen, bool is_write)
 585{
 586    MemoryRegion *mr;
 587    MemoryRegionSection section;
 588    AddressSpace *as = NULL;
 589
  590    /* This can be MMIO, so set up the MMIO bit. */
 591    section = flatview_do_translate(fv, addr, xlat, plen, NULL,
 592                                    is_write, true, &as);
 593    mr = section.mr;
 594
 595    if (xen_enabled() && memory_access_is_direct(mr, is_write)) {
 596        hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr;
 597        *plen = MIN(page, *plen);
 598    }
 599
 600    return mr;
 601}
 602
 603/* Called from RCU critical section */
 604MemoryRegionSection *
 605address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr,
 606                                  hwaddr *xlat, hwaddr *plen)
 607{
 608    MemoryRegionSection *section;
 609    AddressSpaceDispatch *d = atomic_rcu_read(&cpu->cpu_ases[asidx].memory_dispatch);
 610
 611    section = address_space_translate_internal(d, addr, xlat, plen, false);
 612
 613    assert(!memory_region_is_iommu(section->mr));
 614    return section;
 615}
 616#endif
 617
 618#if !defined(CONFIG_USER_ONLY)
 619
 620static int cpu_common_post_load(void *opaque, int version_id)
 621{
 622    CPUState *cpu = opaque;
 623
 624    /* 0x01 was CPU_INTERRUPT_EXIT. This line can be removed when the
 625       version_id is increased. */
 626    cpu->interrupt_request &= ~0x01;
 627    tlb_flush(cpu);
 628
 629    return 0;
 630}
 631
 632static int cpu_common_pre_load(void *opaque)
 633{
 634    CPUState *cpu = opaque;
 635
 636    cpu->exception_index = -1;
 637
 638    return 0;
 639}
 640
 641static bool cpu_common_exception_index_needed(void *opaque)
 642{
 643    CPUState *cpu = opaque;
 644
 645    return tcg_enabled() && cpu->exception_index != -1;
 646}
 647
 648static const VMStateDescription vmstate_cpu_common_exception_index = {
 649    .name = "cpu_common/exception_index",
 650    .version_id = 1,
 651    .minimum_version_id = 1,
 652    .needed = cpu_common_exception_index_needed,
 653    .fields = (VMStateField[]) {
 654        VMSTATE_INT32(exception_index, CPUState),
 655        VMSTATE_END_OF_LIST()
 656    }
 657};
 658
 659static bool cpu_common_crash_occurred_needed(void *opaque)
 660{
 661    CPUState *cpu = opaque;
 662
 663    return cpu->crash_occurred;
 664}
 665
 666static const VMStateDescription vmstate_cpu_common_crash_occurred = {
 667    .name = "cpu_common/crash_occurred",
 668    .version_id = 1,
 669    .minimum_version_id = 1,
 670    .needed = cpu_common_crash_occurred_needed,
 671    .fields = (VMStateField[]) {
 672        VMSTATE_BOOL(crash_occurred, CPUState),
 673        VMSTATE_END_OF_LIST()
 674    }
 675};
 676
 677const VMStateDescription vmstate_cpu_common = {
 678    .name = "cpu_common",
 679    .version_id = 1,
 680    .minimum_version_id = 1,
 681    .pre_load = cpu_common_pre_load,
 682    .post_load = cpu_common_post_load,
 683    .fields = (VMStateField[]) {
 684        VMSTATE_UINT32(halted, CPUState),
 685        VMSTATE_UINT32(interrupt_request, CPUState),
 686        VMSTATE_END_OF_LIST()
 687    },
 688    .subsections = (const VMStateDescription*[]) {
 689        &vmstate_cpu_common_exception_index,
 690        &vmstate_cpu_common_crash_occurred,
 691        NULL
 692    }
 693};
 694
 695#endif
 696
 697CPUState *qemu_get_cpu(int index)
 698{
 699    CPUState *cpu;
 700
 701    CPU_FOREACH(cpu) {
 702        if (cpu->cpu_index == index) {
 703            return cpu;
 704        }
 705    }
 706
 707    return NULL;
 708}
 709
 710#if !defined(CONFIG_USER_ONLY)
 711void cpu_address_space_init(CPUState *cpu, AddressSpace *as, int asidx)
 712{
 713    CPUAddressSpace *newas;
 714
 715    /* Target code should have set num_ases before calling us */
 716    assert(asidx < cpu->num_ases);
 717
 718    if (asidx == 0) {
 719        /* address space 0 gets the convenience alias */
 720        cpu->as = as;
 721    }
 722
 723    /* KVM cannot currently support multiple address spaces. */
 724    assert(asidx == 0 || !kvm_enabled());
 725
 726    if (!cpu->cpu_ases) {
 727        cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases);
 728    }
 729
 730    newas = &cpu->cpu_ases[asidx];
 731    newas->cpu = cpu;
 732    newas->as = as;
 733    if (tcg_enabled()) {
 734        newas->tcg_as_listener.commit = tcg_commit;
 735        memory_listener_register(&newas->tcg_as_listener, as);
 736    }
 737}
 738
 739AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx)
 740{
 741    /* Return the AddressSpace corresponding to the specified index */
 742    return cpu->cpu_ases[asidx].as;
 743}
 744#endif
 745
 746void cpu_exec_unrealizefn(CPUState *cpu)
 747{
 748    CPUClass *cc = CPU_GET_CLASS(cpu);
 749
 750    cpu_list_remove(cpu);
 751
 752    if (cc->vmsd != NULL) {
 753        vmstate_unregister(NULL, cc->vmsd, cpu);
 754    }
 755    if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
 756        vmstate_unregister(NULL, &vmstate_cpu_common, cpu);
 757    }
 758}
 759
 760Property cpu_common_props[] = {
 761#ifndef CONFIG_USER_ONLY
 762    /* Create a memory property for softmmu CPU object,
 763     * so users can wire up its memory. (This can't go in qom/cpu.c
 764     * because that file is compiled only once for both user-mode
 765     * and system builds.) The default if no link is set up is to use
 766     * the system address space.
 767     */
 768    DEFINE_PROP_LINK("memory", CPUState, memory, TYPE_MEMORY_REGION,
 769                     MemoryRegion *),
 770#endif
 771    DEFINE_PROP_END_OF_LIST(),
 772};
 773
 774void cpu_exec_initfn(CPUState *cpu)
 775{
 776    cpu->as = NULL;
 777    cpu->num_ases = 0;
 778
 779#ifndef CONFIG_USER_ONLY
 780    cpu->thread_id = qemu_get_thread_id();
 781    cpu->memory = system_memory;
 782    object_ref(OBJECT(cpu->memory));
 783#endif
 784}
 785
 786void cpu_exec_realizefn(CPUState *cpu, Error **errp)
 787{
 788    CPUClass *cc = CPU_GET_CLASS(cpu);
 789    static bool tcg_target_initialized;
 790
 791    cpu_list_add(cpu);
 792
 793    if (tcg_enabled() && !tcg_target_initialized) {
 794        tcg_target_initialized = true;
 795        cc->tcg_initialize();
 796    }
 797
 798#ifndef CONFIG_USER_ONLY
 799    if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
 800        vmstate_register(NULL, cpu->cpu_index, &vmstate_cpu_common, cpu);
 801    }
 802    if (cc->vmsd != NULL) {
 803        vmstate_register(NULL, cpu->cpu_index, cc->vmsd, cpu);
 804    }
 805#endif
 806}
 807
 808#if defined(CONFIG_USER_ONLY)
 809static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
 810{
 811    mmap_lock();
 812    tb_lock();
 813    tb_invalidate_phys_page_range(pc, pc + 1, 0);
 814    tb_unlock();
 815    mmap_unlock();
 816}
 817#else
 818static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
 819{
 820    MemTxAttrs attrs;
 821    hwaddr phys = cpu_get_phys_page_attrs_debug(cpu, pc, &attrs);
 822    int asidx = cpu_asidx_from_attrs(cpu, attrs);
 823    if (phys != -1) {
 824        /* Locks grabbed by tb_invalidate_phys_addr */
 825        tb_invalidate_phys_addr(cpu->cpu_ases[asidx].as,
 826                                phys | (pc & ~TARGET_PAGE_MASK));
 827    }
 828}
 829#endif
 830
 831#if defined(CONFIG_USER_ONLY)
 832void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
 833
 834{
 835}
 836
 837int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
 838                          int flags)
 839{
 840    return -ENOSYS;
 841}
 842
 843void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
 844{
 845}
 846
 847int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
 848                          int flags, CPUWatchpoint **watchpoint)
 849{
 850    return -ENOSYS;
 851}
 852#else
 853/* Add a watchpoint.  */
 854int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
 855                          int flags, CPUWatchpoint **watchpoint)
 856{
 857    CPUWatchpoint *wp;
 858
 859    /* forbid ranges which are empty or run off the end of the address space */
 860    if (len == 0 || (addr + len - 1) < addr) {
 861        error_report("tried to set invalid watchpoint at %"
 862                     VADDR_PRIx ", len=%" VADDR_PRIu, addr, len);
 863        return -EINVAL;
 864    }
 865    wp = g_malloc(sizeof(*wp));
 866
 867    wp->vaddr = addr;
 868    wp->len = len;
 869    wp->flags = flags;
 870
 871    /* keep all GDB-injected watchpoints in front */
 872    if (flags & BP_GDB) {
 873        QTAILQ_INSERT_HEAD(&cpu->watchpoints, wp, entry);
 874    } else {
 875        QTAILQ_INSERT_TAIL(&cpu->watchpoints, wp, entry);
 876    }
 877
 878    tlb_flush_page(cpu, addr);
 879
 880    if (watchpoint)
 881        *watchpoint = wp;
 882    return 0;
 883}
 884
 885/* Remove a specific watchpoint.  */
 886int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
 887                          int flags)
 888{
 889    CPUWatchpoint *wp;
 890
 891    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
 892        if (addr == wp->vaddr && len == wp->len
 893                && flags == (wp->flags & ~BP_WATCHPOINT_HIT)) {
 894            cpu_watchpoint_remove_by_ref(cpu, wp);
 895            return 0;
 896        }
 897    }
 898    return -ENOENT;
 899}
 900
 901/* Remove a specific watchpoint by reference.  */
 902void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
 903{
 904    QTAILQ_REMOVE(&cpu->watchpoints, watchpoint, entry);
 905
 906    tlb_flush_page(cpu, watchpoint->vaddr);
 907
 908    g_free(watchpoint);
 909}
 910
 911/* Remove all matching watchpoints.  */
 912void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
 913{
 914    CPUWatchpoint *wp, *next;
 915
 916    QTAILQ_FOREACH_SAFE(wp, &cpu->watchpoints, entry, next) {
 917        if (wp->flags & mask) {
 918            cpu_watchpoint_remove_by_ref(cpu, wp);
 919        }
 920    }
 921}
 922
 923/* Return true if this watchpoint address matches the specified
 924 * access (ie the address range covered by the watchpoint overlaps
 925 * partially or completely with the address range covered by the
 926 * access).
 927 */
 928static inline bool cpu_watchpoint_address_matches(CPUWatchpoint *wp,
 929                                                  vaddr addr,
 930                                                  vaddr len)
 931{
 932    /* We know the lengths are non-zero, but a little caution is
 933     * required to avoid errors in the case where the range ends
 934     * exactly at the top of the address space and so addr + len
 935     * wraps round to zero.
 936     */
 937    vaddr wpend = wp->vaddr + wp->len - 1;
 938    vaddr addrend = addr + len - 1;
 939
 940    return !(addr > wpend || wp->vaddr > addrend);
 941}
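/*
 * Example of the wrap-around the comment above guards against: a
 * watchpoint whose range ends exactly at the top of the address space has
 * wp->vaddr + wp->len == 0 after wrapping, but its inclusive end
 * (wpend == (vaddr)-1) is still well defined, so the two inclusive-end
 * comparisons remain correct where exclusive ends would not.
 */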
 942
 943#endif
 944
 945/* Add a breakpoint.  */
 946int cpu_breakpoint_insert(CPUState *cpu, vaddr pc, int flags,
 947                          CPUBreakpoint **breakpoint)
 948{
 949    CPUBreakpoint *bp;
 950
 951    bp = g_malloc(sizeof(*bp));
 952
 953    bp->pc = pc;
 954    bp->flags = flags;
 955
 956    /* keep all GDB-injected breakpoints in front */
 957    if (flags & BP_GDB) {
 958        QTAILQ_INSERT_HEAD(&cpu->breakpoints, bp, entry);
 959    } else {
 960        QTAILQ_INSERT_TAIL(&cpu->breakpoints, bp, entry);
 961    }
 962
 963    breakpoint_invalidate(cpu, pc);
 964
 965    if (breakpoint) {
 966        *breakpoint = bp;
 967    }
 968    return 0;
 969}
 970
 971/* Remove a specific breakpoint.  */
 972int cpu_breakpoint_remove(CPUState *cpu, vaddr pc, int flags)
 973{
 974    CPUBreakpoint *bp;
 975
 976    QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
 977        if (bp->pc == pc && bp->flags == flags) {
 978            cpu_breakpoint_remove_by_ref(cpu, bp);
 979            return 0;
 980        }
 981    }
 982    return -ENOENT;
 983}
 984
 985/* Remove a specific breakpoint by reference.  */
 986void cpu_breakpoint_remove_by_ref(CPUState *cpu, CPUBreakpoint *breakpoint)
 987{
 988    QTAILQ_REMOVE(&cpu->breakpoints, breakpoint, entry);
 989
 990    breakpoint_invalidate(cpu, breakpoint->pc);
 991
 992    g_free(breakpoint);
 993}
 994
 995/* Remove all matching breakpoints. */
 996void cpu_breakpoint_remove_all(CPUState *cpu, int mask)
 997{
 998    CPUBreakpoint *bp, *next;
 999
1000    QTAILQ_FOREACH_SAFE(bp, &cpu->breakpoints, entry, next) {
1001        if (bp->flags & mask) {
1002            cpu_breakpoint_remove_by_ref(cpu, bp);
1003        }
1004    }
1005}
1006
1007/* enable or disable single step mode. EXCP_DEBUG is returned by the
1008   CPU loop after each instruction */
1009void cpu_single_step(CPUState *cpu, int enabled)
1010{
1011    if (cpu->singlestep_enabled != enabled) {
1012        cpu->singlestep_enabled = enabled;
1013        if (kvm_enabled()) {
1014            kvm_update_guest_debug(cpu, 0);
1015        } else {
1016            /* must flush all the translated code to avoid inconsistencies */
1017            /* XXX: only flush what is necessary */
1018            tb_flush(cpu);
1019        }
1020    }
1021}
1022
1023void cpu_abort(CPUState *cpu, const char *fmt, ...)
1024{
1025    va_list ap;
1026    va_list ap2;
1027
1028    va_start(ap, fmt);
1029    va_copy(ap2, ap);
1030    fprintf(stderr, "qemu: fatal: ");
1031    vfprintf(stderr, fmt, ap);
1032    fprintf(stderr, "\n");
1033    cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU | CPU_DUMP_CCOP);
1034    if (qemu_log_separate()) {
1035        qemu_log_lock();
1036        qemu_log("qemu: fatal: ");
1037        qemu_log_vprintf(fmt, ap2);
1038        qemu_log("\n");
1039        log_cpu_state(cpu, CPU_DUMP_FPU | CPU_DUMP_CCOP);
1040        qemu_log_flush();
1041        qemu_log_unlock();
1042        qemu_log_close();
1043    }
1044    va_end(ap2);
1045    va_end(ap);
1046    replay_finish();
1047#if defined(CONFIG_USER_ONLY)
1048    {
1049        struct sigaction act;
1050        sigfillset(&act.sa_mask);
1051        act.sa_handler = SIG_DFL;
1052        sigaction(SIGABRT, &act, NULL);
1053    }
1054#endif
1055    abort();
1056}
1057
1058#if !defined(CONFIG_USER_ONLY)
1059/* Called from RCU critical section */
1060static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
1061{
1062    RAMBlock *block;
1063
1064    block = atomic_rcu_read(&ram_list.mru_block);
1065    if (block && addr - block->offset < block->max_length) {
1066        return block;
1067    }
1068    RAMBLOCK_FOREACH(block) {
1069        if (addr - block->offset < block->max_length) {
1070            goto found;
1071        }
1072    }
1073
1074    fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
1075    abort();
1076
1077found:
1078    /* It is safe to write mru_block outside the iothread lock.  This
1079     * is what happens:
1080     *
1081     *     mru_block = xxx
1082     *     rcu_read_unlock()
1083     *                                        xxx removed from list
1084     *                  rcu_read_lock()
1085     *                  read mru_block
1086     *                                        mru_block = NULL;
1087     *                                        call_rcu(reclaim_ramblock, xxx);
1088     *                  rcu_read_unlock()
1089     *
1090     * atomic_rcu_set is not needed here.  The block was already published
1091     * when it was placed into the list.  Here we're just making an extra
1092     * copy of the pointer.
1093     */
1094    ram_list.mru_block = block;
1095    return block;
1096}
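/*
 * Callers are expected to stay inside an RCU read-side section for as long
 * as they use the returned block, roughly (illustrative only):
 *
 *     rcu_read_lock();
 *     block = qemu_get_ram_block(addr);
 *     host = ramblock_ptr(block, addr - block->offset);
 *     ... use host ...
 *     rcu_read_unlock();
 *
 * tlb_reset_dirty_range_all() below follows exactly this pattern.
 */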
1097
1098static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
1099{
1100    CPUState *cpu;
1101    ram_addr_t start1;
1102    RAMBlock *block;
1103    ram_addr_t end;
1104
1105    end = TARGET_PAGE_ALIGN(start + length);
1106    start &= TARGET_PAGE_MASK;
1107
1108    rcu_read_lock();
1109    block = qemu_get_ram_block(start);
1110    assert(block == qemu_get_ram_block(end - 1));
1111    start1 = (uintptr_t)ramblock_ptr(block, start - block->offset);
1112    CPU_FOREACH(cpu) {
1113        tlb_reset_dirty(cpu, start1, length);
1114    }
1115    rcu_read_unlock();
1116}
1117
1118/* Note: start and end must be within the same ram block.  */
1119bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
1120                                              ram_addr_t length,
1121                                              unsigned client)
1122{
1123    DirtyMemoryBlocks *blocks;
1124    unsigned long end, page;
1125    bool dirty = false;
1126
1127    if (length == 0) {
1128        return false;
1129    }
1130
1131    end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
1132    page = start >> TARGET_PAGE_BITS;
1133
1134    rcu_read_lock();
1135
1136    blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
1137
1138    while (page < end) {
1139        unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
1140        unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
1141        unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);
1142
1143        dirty |= bitmap_test_and_clear_atomic(blocks->blocks[idx],
1144                                              offset, num);
1145        page += num;
1146    }
1147
1148    rcu_read_unlock();
1149
1150    if (dirty && tcg_enabled()) {
1151        tlb_reset_dirty_range_all(start, length);
1152    }
1153
1154    return dirty;
1155}
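/*
 * The dirty bitmap is sharded into blocks of DIRTY_MEMORY_BLOCK_SIZE pages
 * so it can be grown RCU-style (see dirty_memory_extend() below).  For a
 * given page number the indexing above is simply
 *
 *     idx    = page / DIRTY_MEMORY_BLOCK_SIZE;   // which block
 *     offset = page % DIRTY_MEMORY_BLOCK_SIZE;   // bit within that block
 *
 * and a range may straddle several blocks, hence the MIN() step size in
 * the loop.
 */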
1156
1157DirtyBitmapSnapshot *cpu_physical_memory_snapshot_and_clear_dirty
1158     (ram_addr_t start, ram_addr_t length, unsigned client)
1159{
1160    DirtyMemoryBlocks *blocks;
1161    unsigned long align = 1UL << (TARGET_PAGE_BITS + BITS_PER_LEVEL);
1162    ram_addr_t first = QEMU_ALIGN_DOWN(start, align);
1163    ram_addr_t last  = QEMU_ALIGN_UP(start + length, align);
1164    DirtyBitmapSnapshot *snap;
1165    unsigned long page, end, dest;
1166
1167    snap = g_malloc0(sizeof(*snap) +
1168                     ((last - first) >> (TARGET_PAGE_BITS + 3)));
1169    snap->start = first;
1170    snap->end   = last;
1171
1172    page = first >> TARGET_PAGE_BITS;
1173    end  = last  >> TARGET_PAGE_BITS;
1174    dest = 0;
1175
1176    rcu_read_lock();
1177
1178    blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
1179
1180    while (page < end) {
1181        unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
1182        unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
1183        unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);
1184
1185        assert(QEMU_IS_ALIGNED(offset, (1 << BITS_PER_LEVEL)));
1186        assert(QEMU_IS_ALIGNED(num,    (1 << BITS_PER_LEVEL)));
1187        offset >>= BITS_PER_LEVEL;
1188
1189        bitmap_copy_and_clear_atomic(snap->dirty + dest,
1190                                     blocks->blocks[idx] + offset,
1191                                     num);
1192        page += num;
1193        dest += num >> BITS_PER_LEVEL;
1194    }
1195
1196    rcu_read_unlock();
1197
1198    if (tcg_enabled()) {
1199        tlb_reset_dirty_range_all(start, length);
1200    }
1201
1202    return snap;
1203}
1204
1205bool cpu_physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap,
1206                                            ram_addr_t start,
1207                                            ram_addr_t length)
1208{
1209    unsigned long page, end;
1210
1211    assert(start >= snap->start);
1212    assert(start + length <= snap->end);
1213
1214    end = TARGET_PAGE_ALIGN(start + length - snap->start) >> TARGET_PAGE_BITS;
1215    page = (start - snap->start) >> TARGET_PAGE_BITS;
1216
1217    while (page < end) {
1218        if (test_bit(page, snap->dirty)) {
1219            return true;
1220        }
1221        page++;
1222    }
1223    return false;
1224}
1225
1226/* Called from RCU critical section */
1227hwaddr memory_region_section_get_iotlb(CPUState *cpu,
1228                                       MemoryRegionSection *section,
1229                                       target_ulong vaddr,
1230                                       hwaddr paddr, hwaddr xlat,
1231                                       int prot,
1232                                       target_ulong *address)
1233{
1234    hwaddr iotlb;
1235    CPUWatchpoint *wp;
1236
1237    if (memory_region_is_ram(section->mr)) {
1238        /* Normal RAM.  */
1239        iotlb = memory_region_get_ram_addr(section->mr) + xlat;
1240        if (!section->readonly) {
1241            iotlb |= PHYS_SECTION_NOTDIRTY;
1242        } else {
1243            iotlb |= PHYS_SECTION_ROM;
1244        }
1245    } else {
1246        AddressSpaceDispatch *d;
1247
1248        d = flatview_to_dispatch(section->fv);
1249        iotlb = section - d->map.sections;
1250        iotlb += xlat;
1251    }
1252
1253    /* Make accesses to pages with watchpoints go via the
1254       watchpoint trap routines.  */
1255    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
1256        if (cpu_watchpoint_address_matches(wp, vaddr, TARGET_PAGE_SIZE)) {
1257            /* Avoid trapping reads of pages with a write breakpoint. */
1258            if ((prot & PAGE_WRITE) || (wp->flags & BP_MEM_READ)) {
1259                iotlb = PHYS_SECTION_WATCH + paddr;
1260                *address |= TLB_MMIO;
1261                break;
1262            }
1263        }
1264    }
1265
1266    return iotlb;
1267}
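/*
 * The value returned above packs a section index into the low bits of a
 * page-aligned address.  For dirty-tracked RAM at ram_addr 0x40000000, for
 * instance:
 *
 *     iotlb = 0x40000000 | PHYS_SECTION_NOTDIRTY;   // == 0x40000001
 *
 * This is unambiguous only because phys_section_add() below asserts that
 * section numbers stay under TARGET_PAGE_SIZE, leaving the page-offset
 * bits free for the encoding.
 */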
1268#endif /* defined(CONFIG_USER_ONLY) */
1269
1270#if !defined(CONFIG_USER_ONLY)
1271
1272static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
1273                             uint16_t section);
1274static subpage_t *subpage_init(FlatView *fv, hwaddr base);
1275
1276static void *(*phys_mem_alloc)(size_t size, uint64_t *align) =
1277                               qemu_anon_ram_alloc;
1278
1279/*
 1280 * Set a custom physical guest memory allocator.
1281 * Accelerators with unusual needs may need this.  Hopefully, we can
1282 * get rid of it eventually.
1283 */
1284void phys_mem_set_alloc(void *(*alloc)(size_t, uint64_t *align))
1285{
1286    phys_mem_alloc = alloc;
1287}
1288
1289static uint16_t phys_section_add(PhysPageMap *map,
1290                                 MemoryRegionSection *section)
1291{
1292    /* The physical section number is ORed with a page-aligned
1293     * pointer to produce the iotlb entries.  Thus it should
1294     * never overflow into the page-aligned value.
1295     */
1296    assert(map->sections_nb < TARGET_PAGE_SIZE);
1297
1298    if (map->sections_nb == map->sections_nb_alloc) {
1299        map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
1300        map->sections = g_renew(MemoryRegionSection, map->sections,
1301                                map->sections_nb_alloc);
1302    }
1303    map->sections[map->sections_nb] = *section;
1304    memory_region_ref(section->mr);
1305    return map->sections_nb++;
1306}
1307
1308static void phys_section_destroy(MemoryRegion *mr)
1309{
1310    bool have_sub_page = mr->subpage;
1311
1312    memory_region_unref(mr);
1313
1314    if (have_sub_page) {
1315        subpage_t *subpage = container_of(mr, subpage_t, iomem);
1316        object_unref(OBJECT(&subpage->iomem));
1317        g_free(subpage);
1318    }
1319}
1320
1321static void phys_sections_free(PhysPageMap *map)
1322{
1323    while (map->sections_nb > 0) {
1324        MemoryRegionSection *section = &map->sections[--map->sections_nb];
1325        phys_section_destroy(section->mr);
1326    }
1327    g_free(map->sections);
1328    g_free(map->nodes);
1329}
1330
1331static void register_subpage(FlatView *fv, MemoryRegionSection *section)
1332{
1333    AddressSpaceDispatch *d = flatview_to_dispatch(fv);
1334    subpage_t *subpage;
1335    hwaddr base = section->offset_within_address_space
1336        & TARGET_PAGE_MASK;
1337    MemoryRegionSection *existing = phys_page_find(d, base);
1338    MemoryRegionSection subsection = {
1339        .offset_within_address_space = base,
1340        .size = int128_make64(TARGET_PAGE_SIZE),
1341    };
1342    hwaddr start, end;
1343
1344    assert(existing->mr->subpage || existing->mr == &io_mem_unassigned);
1345
1346    if (!(existing->mr->subpage)) {
1347        subpage = subpage_init(fv, base);
1348        subsection.fv = fv;
1349        subsection.mr = &subpage->iomem;
1350        phys_page_set(d, base >> TARGET_PAGE_BITS, 1,
1351                      phys_section_add(&d->map, &subsection));
1352    } else {
1353        subpage = container_of(existing->mr, subpage_t, iomem);
1354    }
1355    start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
1356    end = start + int128_get64(section->size) - 1;
1357    subpage_register(subpage, start, end,
1358                     phys_section_add(&d->map, section));
1359}
1360
1361
1362static void register_multipage(FlatView *fv,
1363                               MemoryRegionSection *section)
1364{
1365    AddressSpaceDispatch *d = flatview_to_dispatch(fv);
1366    hwaddr start_addr = section->offset_within_address_space;
1367    uint16_t section_index = phys_section_add(&d->map, section);
1368    uint64_t num_pages = int128_get64(int128_rshift(section->size,
1369                                                    TARGET_PAGE_BITS));
1370
1371    assert(num_pages);
1372    phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index);
1373}
1374
1375void flatview_add_to_dispatch(FlatView *fv, MemoryRegionSection *section)
1376{
1377    MemoryRegionSection now = *section, remain = *section;
1378    Int128 page_size = int128_make64(TARGET_PAGE_SIZE);
1379
1380    if (now.offset_within_address_space & ~TARGET_PAGE_MASK) {
1381        uint64_t left = TARGET_PAGE_ALIGN(now.offset_within_address_space)
1382                       - now.offset_within_address_space;
1383
1384        now.size = int128_min(int128_make64(left), now.size);
1385        register_subpage(fv, &now);
1386    } else {
1387        now.size = int128_zero();
1388    }
1389    while (int128_ne(remain.size, now.size)) {
1390        remain.size = int128_sub(remain.size, now.size);
1391        remain.offset_within_address_space += int128_get64(now.size);
1392        remain.offset_within_region += int128_get64(now.size);
1393        now = remain;
1394        if (int128_lt(remain.size, page_size)) {
1395            register_subpage(fv, &now);
1396        } else if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) {
1397            now.size = page_size;
1398            register_subpage(fv, &now);
1399        } else {
1400            now.size = int128_and(now.size, int128_neg(page_size));
1401            register_multipage(fv, &now);
1402        }
1403    }
1404}
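/*
 * Example of the split performed above for a section covering
 * [0x1800, 0x5200) with 4 KiB target pages:
 *
 *     [0x1800, 0x2000)  unaligned head      -> register_subpage()
 *     [0x2000, 0x5000)  whole pages         -> register_multipage()
 *     [0x5000, 0x5200)  partial tail page   -> register_subpage()
 *
 * so only the unaligned edges pay for the subpage indirection.
 */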
1405
1406void qemu_flush_coalesced_mmio_buffer(void)
1407{
1408    if (kvm_enabled())
1409        kvm_flush_coalesced_mmio_buffer();
1410}
1411
1412void qemu_mutex_lock_ramlist(void)
1413{
1414    qemu_mutex_lock(&ram_list.mutex);
1415}
1416
1417void qemu_mutex_unlock_ramlist(void)
1418{
1419    qemu_mutex_unlock(&ram_list.mutex);
1420}
1421
1422void ram_block_dump(Monitor *mon)
1423{
1424    RAMBlock *block;
1425    char *psize;
1426
1427    rcu_read_lock();
1428    monitor_printf(mon, "%24s %8s  %18s %18s %18s\n",
1429                   "Block Name", "PSize", "Offset", "Used", "Total");
1430    RAMBLOCK_FOREACH(block) {
1431        psize = size_to_str(block->page_size);
1432        monitor_printf(mon, "%24s %8s  0x%016" PRIx64 " 0x%016" PRIx64
1433                       " 0x%016" PRIx64 "\n", block->idstr, psize,
1434                       (uint64_t)block->offset,
1435                       (uint64_t)block->used_length,
1436                       (uint64_t)block->max_length);
1437        g_free(psize);
1438    }
1439    rcu_read_unlock();
1440}
1441
1442#ifdef __linux__
1443/*
1444 * FIXME TOCTTOU: this iterates over memory backends' mem-path, which
1445 * may or may not name the same files / on the same filesystem now as
1446 * when we actually open and map them.  Iterate over the file
1447 * descriptors instead, and use qemu_fd_getpagesize().
1448 */
1449static int find_max_supported_pagesize(Object *obj, void *opaque)
1450{
1451    char *mem_path;
1452    long *hpsize_min = opaque;
1453
1454    if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
1455        mem_path = object_property_get_str(obj, "mem-path", NULL);
1456        if (mem_path) {
1457            long hpsize = qemu_mempath_getpagesize(mem_path);
1458            g_free(mem_path);
1459            if (hpsize < *hpsize_min) {
1460                *hpsize_min = hpsize;
1461            }
1462        } else {
1463            *hpsize_min = getpagesize();
1464        }
1465    }
1466
1467    return 0;
1468}
1469
1470long qemu_getrampagesize(void)
1471{
1472    long hpsize = LONG_MAX;
1473    long mainrampagesize;
1474    Object *memdev_root;
1475
1476    if (mem_path) {
1477        mainrampagesize = qemu_mempath_getpagesize(mem_path);
1478    } else {
1479        mainrampagesize = getpagesize();
1480    }
1481
1482    /* it's possible we have memory-backend objects with
1483     * hugepage-backed RAM. these may get mapped into system
1484     * address space via -numa parameters or memory hotplug
1485     * hooks. we want to take these into account, but we
1486     * also want to make sure these supported hugepage
1487     * sizes are applicable across the entire range of memory
1488     * we may boot from, so we take the min across all
1489     * backends, and assume normal pages in cases where a
1490     * backend isn't backed by hugepages.
1491     */
1492    memdev_root = object_resolve_path("/objects", NULL);
1493    if (memdev_root) {
1494        object_child_foreach(memdev_root, find_max_supported_pagesize, &hpsize);
1495    }
1496    if (hpsize == LONG_MAX) {
1497        /* No additional memory regions found ==> Report main RAM page size */
1498        return mainrampagesize;
1499    }
1500
1501    /* If NUMA is disabled or the NUMA nodes are not backed with a
1502     * memory-backend, then there is at least one node using "normal" RAM,
1503     * so if its page size is smaller we have got to report that size instead.
1504     */
1505    if (hpsize > mainrampagesize &&
1506        (nb_numa_nodes == 0 || numa_info[0].node_memdev == NULL)) {
1507        static bool warned;
1508        if (!warned) {
1509            error_report("Huge page support disabled (n/a for main memory).");
1510            warned = true;
1511        }
1512        return mainrampagesize;
1513    }
1514
1515    return hpsize;
1516}
1517#else
1518long qemu_getrampagesize(void)
1519{
1520    return getpagesize();
1521}
1522#endif
1523
1524#ifdef __linux__
1525static int64_t get_file_size(int fd)
1526{
1527    int64_t size = lseek(fd, 0, SEEK_END);
1528    if (size < 0) {
1529        return -errno;
1530    }
1531    return size;
1532}
1533
1534static int file_ram_open(const char *path,
1535                         const char *region_name,
1536                         bool *created,
1537                         Error **errp)
1538{
1539    char *filename;
1540    char *sanitized_name;
1541    char *c;
1542    int fd = -1;
1543
1544    *created = false;
1545    for (;;) {
1546        fd = open(path, O_RDWR);
1547        if (fd >= 0) {
1548            /* @path names an existing file, use it */
1549            break;
1550        }
1551        if (errno == ENOENT) {
1552            /* @path names a file that doesn't exist, create it */
1553            fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0644);
1554            if (fd >= 0) {
1555                *created = true;
1556                break;
1557            }
1558        } else if (errno == EISDIR) {
1559            /* @path names a directory, create a file there */
1560            /* Make name safe to use with mkstemp by replacing '/' with '_'. */
1561            sanitized_name = g_strdup(region_name);
1562            for (c = sanitized_name; *c != '\0'; c++) {
1563                if (*c == '/') {
1564                    *c = '_';
1565                }
1566            }
1567
1568            filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
1569                                       sanitized_name);
1570            g_free(sanitized_name);
1571
1572            fd = mkstemp(filename);
1573            if (fd >= 0) {
1574                unlink(filename);
1575                g_free(filename);
1576                break;
1577            }
1578            g_free(filename);
1579        }
1580        if (errno != EEXIST && errno != EINTR) {
1581            error_setg_errno(errp, errno,
1582                             "can't open backing store %s for guest RAM",
1583                             path);
1584            return -1;
1585        }
1586        /*
1587         * Try again on EINTR and EEXIST.  The latter happens when
1588         * something else creates the file between our two open().
1589         */
1590    }
1591
1592    return fd;
1593}
1594
1595static void *file_ram_alloc(RAMBlock *block,
1596                            ram_addr_t memory,
1597                            int fd,
1598                            bool truncate,
1599                            Error **errp)
1600{
1601    void *area;
1602
1603    block->page_size = qemu_fd_getpagesize(fd);
1604    block->mr->align = block->page_size;
1605#if defined(__s390x__)
1606    if (kvm_enabled()) {
1607        block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
1608    }
1609#endif
1610
1611    if (memory < block->page_size) {
1612        error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
1613                   "or larger than page size 0x%zx",
1614                   memory, block->page_size);
1615        return NULL;
1616    }
1617
1618    memory = ROUND_UP(memory, block->page_size);
1619
1620    /*
1621     * ftruncate is not supported by hugetlbfs in older
1622     * hosts, so don't bother bailing out on errors.
1623     * If anything goes wrong with it under other filesystems,
1624     * mmap will fail.
1625     *
1626     * Do not truncate the non-empty backend file to avoid corrupting
1627     * the existing data in the file. Disabling shrinking is not
1628     * enough. For example, the current vNVDIMM implementation stores
1629     * the guest NVDIMM labels at the end of the backend file. If the
1630     * backend file is later extended, QEMU will not be able to find
1631     * those labels. Therefore, extending the non-empty backend file
1632     * is disabled as well.
1633     */
1634    if (truncate && ftruncate(fd, memory)) {
1635        perror("ftruncate");
1636    }
1637
1638    area = qemu_ram_mmap(fd, memory, block->mr->align,
1639                         block->flags & RAM_SHARED);
1640    if (area == MAP_FAILED) {
1641        error_setg_errno(errp, errno,
1642                         "unable to map backing store for guest RAM");
1643        return NULL;
1644    }
1645
1646    if (mem_prealloc) {
1647        os_mem_prealloc(fd, area, memory, smp_cpus, errp);
1648        if (errp && *errp) {
1649            qemu_ram_munmap(area, memory);
1650            return NULL;
1651        }
1652    }
1653
1654    block->fd = fd;
1655    return area;
1656}
1657#endif
1658
1659/* Called with the ramlist lock held.  */
1660static ram_addr_t find_ram_offset(ram_addr_t size)
1661{
1662    RAMBlock *block, *next_block;
1663    ram_addr_t offset = RAM_ADDR_MAX, mingap = RAM_ADDR_MAX;
1664
1665    assert(size != 0); /* it would hand out same offset multiple times */
1666
1667    if (QLIST_EMPTY_RCU(&ram_list.blocks)) {
1668        return 0;
1669    }
1670
1671    RAMBLOCK_FOREACH(block) {
1672        ram_addr_t end, next = RAM_ADDR_MAX;
1673
1674        end = block->offset + block->max_length;
1675
1676        RAMBLOCK_FOREACH(next_block) {
1677            if (next_block->offset >= end) {
1678                next = MIN(next, next_block->offset);
1679            }
1680        }
1681        if (next - end >= size && next - end < mingap) {
1682            offset = end;
1683            mingap = next - end;
1684        }
1685    }
1686
1687    if (offset == RAM_ADDR_MAX) {
1688        fprintf(stderr, "Failed to find gap of requested size: %" PRIu64 "\n",
1689                (uint64_t)size);
1690        abort();
1691    }
1692
1693    return offset;
1694}
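/*
 * find_ram_offset() is a best-fit search over the gaps between existing
 * blocks.  With blocks at [0, 0x8000000) and [0xc000000, 0x10000000), for
 * example, a request for 0x2000000 bytes lands at offset 0x8000000 (the
 * 0x4000000-byte gap) rather than after the last block, keeping the
 * ram_addr_t space compact.
 */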
1695
1696unsigned long last_ram_page(void)
1697{
1698    RAMBlock *block;
1699    ram_addr_t last = 0;
1700
1701    rcu_read_lock();
1702    RAMBLOCK_FOREACH(block) {
1703        last = MAX(last, block->offset + block->max_length);
1704    }
1705    rcu_read_unlock();
1706    return last >> TARGET_PAGE_BITS;
1707}
1708
1709static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
1710{
1711    int ret;
1712
1713    /* Use MADV_DONTDUMP, if user doesn't want the guest memory in the core */
1714    if (!machine_dump_guest_core(current_machine)) {
1715        ret = qemu_madvise(addr, size, QEMU_MADV_DONTDUMP);
1716        if (ret) {
1717            perror("qemu_madvise");
1718            fprintf(stderr, "madvise doesn't support MADV_DONTDUMP, "
1719                            "but dump_guest_core=off specified\n");
1720        }
1721    }
1722}
1723
1724const char *qemu_ram_get_idstr(RAMBlock *rb)
1725{
1726    return rb->idstr;
1727}
1728
1729bool qemu_ram_is_shared(RAMBlock *rb)
1730{
1731    return rb->flags & RAM_SHARED;
1732}
1733
1734/* Called with iothread lock held.  */
1735void qemu_ram_set_idstr(RAMBlock *new_block, const char *name, DeviceState *dev)
1736{
1737    RAMBlock *block;
1738
1739    assert(new_block);
1740    assert(!new_block->idstr[0]);
1741
1742    if (dev) {
1743        char *id = qdev_get_dev_path(dev);
1744        if (id) {
1745            snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id);
1746            g_free(id);
1747        }
1748    }
1749    pstrcat(new_block->idstr, sizeof(new_block->idstr), name);
1750
1751    rcu_read_lock();
1752    RAMBLOCK_FOREACH(block) {
1753        if (block != new_block &&
1754            !strcmp(block->idstr, new_block->idstr)) {
1755            fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
1756                    new_block->idstr);
1757            abort();
1758        }
1759    }
1760    rcu_read_unlock();
1761}
1762
1763/* Called with iothread lock held.  */
1764void qemu_ram_unset_idstr(RAMBlock *block)
1765{
1766    /* FIXME: arch_init.c assumes that this is not called throughout
1767     * migration.  Ignore the problem since hot-unplug during migration
1768     * does not work anyway.
1769     */
1770    if (block) {
1771        memset(block->idstr, 0, sizeof(block->idstr));
1772    }
1773}
1774
1775size_t qemu_ram_pagesize(RAMBlock *rb)
1776{
1777    return rb->page_size;
1778}
1779
1780/* Returns the largest size of page in use */
1781size_t qemu_ram_pagesize_largest(void)
1782{
1783    RAMBlock *block;
1784    size_t largest = 0;
1785
1786    RAMBLOCK_FOREACH(block) {
1787        largest = MAX(largest, qemu_ram_pagesize(block));
1788    }
1789
1790    return largest;
1791}
1792
1793static int memory_try_enable_merging(void *addr, size_t len)
1794{
1795    if (!machine_mem_merge(current_machine)) {
1796        /* disabled by the user */
1797        return 0;
1798    }
1799
1800    return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
1801}
1802
1803/* Only legal before guest might have detected the memory size: e.g. on
1804 * incoming migration, or right after reset.
1805 *
1806 * As the memory core doesn't know how memory is accessed, it is up to
1807 * the resize callback to update device state and/or add assertions to detect
1808 * misuse, if necessary.
1809 */
1810int qemu_ram_resize(RAMBlock *block, ram_addr_t newsize, Error **errp)
1811{
1812    assert(block);
1813
1814    newsize = HOST_PAGE_ALIGN(newsize);
1815
1816    if (block->used_length == newsize) {
1817        return 0;
1818    }
1819
1820    if (!(block->flags & RAM_RESIZEABLE)) {
1821        error_setg_errno(errp, EINVAL,
1822                         "Length mismatch: %s: 0x" RAM_ADDR_FMT
1823                         " in != 0x" RAM_ADDR_FMT, block->idstr,
1824                         newsize, block->used_length);
1825        return -EINVAL;
1826    }
1827
1828    if (block->max_length < newsize) {
1829        error_setg_errno(errp, EINVAL,
1830                         "Length too large: %s: 0x" RAM_ADDR_FMT
1831                         " > 0x" RAM_ADDR_FMT, block->idstr,
1832                         newsize, block->max_length);
1833        return -EINVAL;
1834    }
1835
1836    cpu_physical_memory_clear_dirty_range(block->offset, block->used_length);
1837    block->used_length = newsize;
1838    cpu_physical_memory_set_dirty_range(block->offset, block->used_length,
1839                                        DIRTY_CLIENTS_ALL);
1840    memory_region_set_size(block->mr, newsize);
1841    if (block->resized) {
1842        block->resized(block->idstr, newsize, block->host);
1843    }
1844    return 0;
1845}
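
/*
 * A minimal sketch of driving the resize path above, e.g. from incoming
 * migration.  The RAMBlock pointer and the new size are hypothetical
 * placeholders; real callers take them from the migrated device state.
 */
static inline void example_apply_incoming_size(RAMBlock *rb,
                                               ram_addr_t incoming_size)
{
    Error *local_err = NULL;

    /* Only blocks created with RAM_RESIZEABLE may change used_length. */
    if (qemu_ram_resize(rb, incoming_size, &local_err) < 0) {
        error_report_err(local_err);
    }
}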
1846
1847/* Called with ram_list.mutex held */
1848static void dirty_memory_extend(ram_addr_t old_ram_size,
1849                                ram_addr_t new_ram_size)
1850{
1851    ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size,
1852                                             DIRTY_MEMORY_BLOCK_SIZE);
1853    ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size,
1854                                             DIRTY_MEMORY_BLOCK_SIZE);
1855    int i;
1856
1857    /* Only need to extend if block count increased */
1858    if (new_num_blocks <= old_num_blocks) {
1859        return;
1860    }
1861
1862    for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
1863        DirtyMemoryBlocks *old_blocks;
1864        DirtyMemoryBlocks *new_blocks;
1865        int j;
1866
1867        old_blocks = atomic_rcu_read(&ram_list.dirty_memory[i]);
1868        new_blocks = g_malloc(sizeof(*new_blocks) +
1869                              sizeof(new_blocks->blocks[0]) * new_num_blocks);
1870
1871        if (old_num_blocks) {
1872            memcpy(new_blocks->blocks, old_blocks->blocks,
1873                   old_num_blocks * sizeof(old_blocks->blocks[0]));
1874        }
1875
1876        for (j = old_num_blocks; j < new_num_blocks; j++) {
1877            new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
1878        }
1879
1880        atomic_rcu_set(&ram_list.dirty_memory[i], new_blocks);
1881
1882        if (old_blocks) {
1883            g_free_rcu(old_blocks, rcu);
1884        }
1885    }
1886}
1887
1888static void ram_block_add(RAMBlock *new_block, Error **errp)
1889{
1890    RAMBlock *block;
1891    RAMBlock *last_block = NULL;
1892    ram_addr_t old_ram_size, new_ram_size;
1893    Error *err = NULL;
1894
1895    old_ram_size = last_ram_page();
1896
1897    qemu_mutex_lock_ramlist();
1898    new_block->offset = find_ram_offset(new_block->max_length);
1899
1900    if (!new_block->host) {
1901        if (xen_enabled()) {
1902            xen_ram_alloc(new_block->offset, new_block->max_length,
1903                          new_block->mr, &err);
1904            if (err) {
1905                error_propagate(errp, err);
1906                qemu_mutex_unlock_ramlist();
1907                return;
1908            }
1909        } else {
1910            new_block->host = phys_mem_alloc(new_block->max_length,
1911                                             &new_block->mr->align);
1912            if (!new_block->host) {
1913                error_setg_errno(errp, errno,
1914                                 "cannot set up guest memory '%s'",
1915                                 memory_region_name(new_block->mr));
1916                qemu_mutex_unlock_ramlist();
1917                return;
1918            }
1919            memory_try_enable_merging(new_block->host, new_block->max_length);
1920        }
1921    }
1922
1923    new_ram_size = MAX(old_ram_size,
1924              (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
1925    if (new_ram_size > old_ram_size) {
1926        dirty_memory_extend(old_ram_size, new_ram_size);
1927    }
1928    /* Keep the list sorted from biggest to smallest block.  Unlike QTAILQ,
1929     * QLIST (which has an RCU-friendly variant) does not have insertion at
1930     * tail, so save the last element in last_block.
1931     */
1932    RAMBLOCK_FOREACH(block) {
1933        last_block = block;
1934        if (block->max_length < new_block->max_length) {
1935            break;
1936        }
1937    }
1938    if (block) {
1939        QLIST_INSERT_BEFORE_RCU(block, new_block, next);
1940    } else if (last_block) {
1941        QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
1942    } else { /* list is empty */
1943        QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
1944    }
1945    ram_list.mru_block = NULL;
1946
1947    /* Write list before version */
1948    smp_wmb();
1949    ram_list.version++;
1950    qemu_mutex_unlock_ramlist();
1951
1952    cpu_physical_memory_set_dirty_range(new_block->offset,
1953                                        new_block->used_length,
1954                                        DIRTY_CLIENTS_ALL);
1955
1956    if (new_block->host) {
1957        qemu_ram_setup_dump(new_block->host, new_block->max_length);
1958        qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE);
1959        /* MADV_DONTFORK is also needed by KVM in absence of synchronous MMU */
1960        qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_DONTFORK);
1961        ram_block_notify_add(new_block->host, new_block->max_length);
1962    }
1963}
1964
1965#ifdef __linux__
1966RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
1967                                 bool share, int fd,
1968                                 Error **errp)
1969{
1970    RAMBlock *new_block;
1971    Error *local_err = NULL;
1972    int64_t file_size;
1973
1974    if (xen_enabled()) {
1975        error_setg(errp, "-mem-path not supported with Xen");
1976        return NULL;
1977    }
1978
1979    if (kvm_enabled() && !kvm_has_sync_mmu()) {
1980        error_setg(errp,
1981                   "host lacks kvm mmu notifiers, -mem-path unsupported");
1982        return NULL;
1983    }
1984
1985    if (phys_mem_alloc != qemu_anon_ram_alloc) {
1986        /*
1987         * file_ram_alloc() needs to allocate just like
1988         * phys_mem_alloc, but we haven't bothered to provide
1989         * a hook there.
1990         */
1991        error_setg(errp,
1992                   "-mem-path not supported with this accelerator");
1993        return NULL;
1994    }
1995
1996    size = HOST_PAGE_ALIGN(size);
1997    file_size = get_file_size(fd);
1998    if (file_size > 0 && file_size < size) {
1999        error_setg(errp, "backing store %s size 0x%" PRIx64
2000                   " does not match 'size' option 0x" RAM_ADDR_FMT,
2001                   mem_path, file_size, size);
2002        return NULL;
2003    }
2004
2005    new_block = g_malloc0(sizeof(*new_block));
2006    new_block->mr = mr;
2007    new_block->used_length = size;
2008    new_block->max_length = size;
2009    new_block->flags = share ? RAM_SHARED : 0;
2010    new_block->host = file_ram_alloc(new_block, size, fd, !file_size, errp);
2011    if (!new_block->host) {
2012        g_free(new_block);
2013        return NULL;
2014    }
2015
2016    ram_block_add(new_block, &local_err);
2017    if (local_err) {
2018        g_free(new_block);
2019        error_propagate(errp, local_err);
2020        return NULL;
2021    }
2022    return new_block;
2023
2024}
2025
2026
2027RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
2028                                   bool share, const char *mem_path,
2029                                   Error **errp)
2030{
2031    int fd;
2032    bool created;
2033    RAMBlock *block;
2034
2035    fd = file_ram_open(mem_path, memory_region_name(mr), &created, errp);
2036    if (fd < 0) {
2037        return NULL;
2038    }
2039
2040    block = qemu_ram_alloc_from_fd(size, mr, share, fd, errp);
2041    if (!block) {
2042        if (created) {
2043            unlink(mem_path);
2044        }
2045        close(fd);
2046        return NULL;
2047    }
2048
2049    return block;
2050}
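
/*
 * Usage sketch for the file-backed allocator above, assuming the caller
 * already owns an initialized MemoryRegion.  The hugetlbfs path is a
 * hypothetical example; any mount that qemu_ram_mmap() can map works.
 */
static inline RAMBlock *example_alloc_hugepage_backed(MemoryRegion *mr,
                                                      ram_addr_t size)
{
    Error *local_err = NULL;
    RAMBlock *rb;

    rb = qemu_ram_alloc_from_file(size, mr, true /* share */,
                                  "/dev/hugepages/guest-ram", &local_err);
    if (!rb) {
        error_report_err(local_err);
    }
    return rb;
}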
2051#endif
2052
2053static
2054RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
2055                                  void (*resized)(const char*,
2056                                                  uint64_t length,
2057                                                  void *host),
2058                                  void *host, bool resizeable,
2059                                  MemoryRegion *mr, Error **errp)
2060{
2061    RAMBlock *new_block;
2062    Error *local_err = NULL;
2063
2064    size = HOST_PAGE_ALIGN(size);
2065    max_size = HOST_PAGE_ALIGN(max_size);
2066    new_block = g_malloc0(sizeof(*new_block));
2067    new_block->mr = mr;
2068    new_block->resized = resized;
2069    new_block->used_length = size;
2070    new_block->max_length = max_size;
2071    assert(max_size >= size);
2072    new_block->fd = -1;
2073    new_block->page_size = getpagesize();
2074    new_block->host = host;
2075    if (host) {
2076        new_block->flags |= RAM_PREALLOC;
2077    }
2078    if (resizeable) {
2079        new_block->flags |= RAM_RESIZEABLE;
2080    }
2081    ram_block_add(new_block, &local_err);
2082    if (local_err) {
2083        g_free(new_block);
2084        error_propagate(errp, local_err);
2085        return NULL;
2086    }
2087    return new_block;
2088}
2089
2090RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
2091                                   MemoryRegion *mr, Error **errp)
2092{
2093    return qemu_ram_alloc_internal(size, size, NULL, host, false, mr, errp);
2094}
2095
2096RAMBlock *qemu_ram_alloc(ram_addr_t size, MemoryRegion *mr, Error **errp)
2097{
2098    return qemu_ram_alloc_internal(size, size, NULL, NULL, false, mr, errp);
2099}
2100
2101RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
2102                                     void (*resized)(const char*,
2103                                                     uint64_t length,
2104                                                     void *host),
2105                                     MemoryRegion *mr, Error **errp)
2106{
2107    return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true, mr, errp);
2108}
2109
2110static void reclaim_ramblock(RAMBlock *block)
2111{
2112    if (block->flags & RAM_PREALLOC) {
2113        ;
2114    } else if (xen_enabled()) {
2115        xen_invalidate_map_cache_entry(block->host);
2116#ifndef _WIN32
2117    } else if (block->fd >= 0) {
2118        qemu_ram_munmap(block->host, block->max_length);
2119        close(block->fd);
2120#endif
2121    } else {
2122        qemu_anon_ram_free(block->host, block->max_length);
2123    }
2124    g_free(block);
2125}
2126
2127void qemu_ram_free(RAMBlock *block)
2128{
2129    if (!block) {
2130        return;
2131    }
2132
2133    if (block->host) {
2134        ram_block_notify_remove(block->host, block->max_length);
2135    }
2136
2137    qemu_mutex_lock_ramlist();
2138    QLIST_REMOVE_RCU(block, next);
2139    ram_list.mru_block = NULL;
2140    /* Write list before version */
2141    smp_wmb();
2142    ram_list.version++;
2143    call_rcu(block, reclaim_ramblock, rcu);
2144    qemu_mutex_unlock_ramlist();
2145}
2146
2147#ifndef _WIN32
2148void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
2149{
2150    RAMBlock *block;
2151    ram_addr_t offset;
2152    int flags;
2153    void *area, *vaddr;
2154
2155    RAMBLOCK_FOREACH(block) {
2156        offset = addr - block->offset;
2157        if (offset < block->max_length) {
2158            vaddr = ramblock_ptr(block, offset);
2159            if (block->flags & RAM_PREALLOC) {
2160                ;
2161            } else if (xen_enabled()) {
2162                abort();
2163            } else {
2164                flags = MAP_FIXED;
2165                if (block->fd >= 0) {
2166                    flags |= (block->flags & RAM_SHARED ?
2167                              MAP_SHARED : MAP_PRIVATE);
2168                    area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
2169                                flags, block->fd, offset);
2170                } else {
2171                    /*
2172                     * Remap needs to match alloc.  Accelerators that
2173                     * set phys_mem_alloc never remap.  If they did,
2174                     * we'd need a remap hook here.
2175                     */
2176                    assert(phys_mem_alloc == qemu_anon_ram_alloc);
2177
2178                    flags |= MAP_PRIVATE | MAP_ANONYMOUS;
2179                    area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
2180                                flags, -1, 0);
2181                }
2182                if (area != vaddr) {
2183                    fprintf(stderr, "Could not remap addr: "
2184                            RAM_ADDR_FMT "@" RAM_ADDR_FMT "\n",
2185                            length, addr);
2186                    exit(1);
2187                }
2188                memory_try_enable_merging(vaddr, length);
2189                qemu_ram_setup_dump(vaddr, length);
2190            }
2191        }
2192    }
2193}
2194#endif /* !_WIN32 */
2195
2196/* Return a host pointer to ram allocated with qemu_ram_alloc.
2197 * This should not be used for general purpose DMA.  Use address_space_map
2198 * or address_space_rw instead. For local memory (e.g. video ram) that the
2199 * device owns, use memory_region_get_ram_ptr.
2200 *
2201 * Called within RCU critical section.
2202 */
2203void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr)
2204{
2205    RAMBlock *block = ram_block;
2206
2207    if (block == NULL) {
2208        block = qemu_get_ram_block(addr);
2209        addr -= block->offset;
2210    }
2211
2212    if (xen_enabled() && block->host == NULL) {
2213        /* We need to check if the requested address is in the RAM
2214         * because we don't want to map the entire memory in QEMU.
2215         * In that case just map until the end of the page.
2216         */
2217        if (block->offset == 0) {
2218            return xen_map_cache(addr, 0, 0, false);
2219        }
2220
2221        block->host = xen_map_cache(block->offset, block->max_length, 1, false);
2222    }
2223    return ramblock_ptr(block, addr);
2224}
2225
2226/* Return a host pointer to guest's ram. Similar to qemu_map_ram_ptr
2227 * but takes a size argument.
2228 *
2229 * Called within RCU critical section.
2230 */
2231static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr,
2232                                 hwaddr *size, bool lock)
2233{
2234    RAMBlock *block = ram_block;
2235    if (*size == 0) {
2236        return NULL;
2237    }
2238
2239    if (block == NULL) {
2240        block = qemu_get_ram_block(addr);
2241        addr -= block->offset;
2242    }
2243    *size = MIN(*size, block->max_length - addr);
2244
2245    if (xen_enabled() && block->host == NULL) {
2246        /* We need to check if the requested address is in the RAM
2247         * because we don't want to map the entire memory in QEMU.
2248         * In that case just map the requested area.
2249         */
2250        if (block->offset == 0) {
2251            return xen_map_cache(addr, *size, lock, lock);
2252        }
2253
2254        block->host = xen_map_cache(block->offset, block->max_length, 1, lock);
2255    }
2256
2257    return ramblock_ptr(block, addr);
2258}
2259
2260/*
2261 * Translates a host ptr back to a RAMBlock, a ram_addr and an offset
2262 * in that RAMBlock.
2263 *
2264 * ptr: Host pointer to look up
2265 * round_offset: If true round the result offset down to a page boundary
2266 * *ram_addr: set to result ram_addr
2267 * *offset: set to result offset within the RAMBlock
2268 *
2269 * Returns: RAMBlock (or NULL if not found)
2270 *
2271 * By the time this function returns, the returned pointer is not protected
2272 * by RCU anymore.  If the caller is not within an RCU critical section and
2273 * does not hold the iothread lock, it must have other means of protecting the
2274 * pointer, such as a reference to the region that includes the incoming
2275 * ram_addr_t.
2276 */
2277RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
2278                                   ram_addr_t *offset)
2279{
2280    RAMBlock *block;
2281    uint8_t *host = ptr;
2282
2283    if (xen_enabled()) {
2284        ram_addr_t ram_addr;
2285        rcu_read_lock();
2286        ram_addr = xen_ram_addr_from_mapcache(ptr);
2287        block = qemu_get_ram_block(ram_addr);
2288        if (block) {
2289            *offset = ram_addr - block->offset;
2290        }
2291        rcu_read_unlock();
2292        return block;
2293    }
2294
2295    rcu_read_lock();
2296    block = atomic_rcu_read(&ram_list.mru_block);
2297    if (block && block->host && host - block->host < block->max_length) {
2298        goto found;
2299    }
2300
2301    RAMBLOCK_FOREACH(block) {
2302        /* This case happens when the block is not mapped. */
2303        if (block->host == NULL) {
2304            continue;
2305        }
2306        if (host - block->host < block->max_length) {
2307            goto found;
2308        }
2309    }
2310
2311    rcu_read_unlock();
2312    return NULL;
2313
2314found:
2315    *offset = (host - block->host);
2316    if (round_offset) {
2317        *offset &= TARGET_PAGE_MASK;
2318    }
2319    rcu_read_unlock();
2320    return block;
2321}
2322
2323/*
2324 * Finds the named RAMBlock
2325 *
2326 * name: The name of RAMBlock to find
2327 *
2328 * Returns: RAMBlock (or NULL if not found)
2329 */
2330RAMBlock *qemu_ram_block_by_name(const char *name)
2331{
2332    RAMBlock *block;
2333
2334    RAMBLOCK_FOREACH(block) {
2335        if (!strcmp(name, block->idstr)) {
2336            return block;
2337        }
2338    }
2339
2340    return NULL;
2341}
2342
2343/* Some of the softmmu routines need to translate from a host pointer
2344   (typically a TLB entry) back to a ram offset.  */
2345ram_addr_t qemu_ram_addr_from_host(void *ptr)
2346{
2347    RAMBlock *block;
2348    ram_addr_t offset;
2349
2350    block = qemu_ram_block_from_host(ptr, false, &offset);
2351    if (!block) {
2352        return RAM_ADDR_INVALID;
2353    }
2354
2355    return block->offset + offset;
2356}
2357
2358/* Called within RCU critical section. */
2359void memory_notdirty_write_prepare(NotDirtyInfo *ndi,
2360                          CPUState *cpu,
2361                          vaddr mem_vaddr,
2362                          ram_addr_t ram_addr,
2363                          unsigned size)
2364{
2365    ndi->cpu = cpu;
2366    ndi->ram_addr = ram_addr;
2367    ndi->mem_vaddr = mem_vaddr;
2368    ndi->size = size;
2369    ndi->locked = false;
2370
2371    assert(tcg_enabled());
2372    if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
2373        ndi->locked = true;
2374        tb_lock();
2375        tb_invalidate_phys_page_fast(ram_addr, size);
2376    }
2377}
2378
2379/* Called within RCU critical section. */
2380void memory_notdirty_write_complete(NotDirtyInfo *ndi)
2381{
2382    if (ndi->locked) {
2383        tb_unlock();
2384    }
2385
2386    /* Set both VGA and migration bits for simplicity and to remove
2387     * the notdirty callback faster.
2388     */
2389    cpu_physical_memory_set_dirty_range(ndi->ram_addr, ndi->size,
2390                                        DIRTY_CLIENTS_NOCODE);
2391    /* we remove the notdirty callback only if the code has been
2392       flushed */
2393    if (!cpu_physical_memory_is_clean(ndi->ram_addr)) {
2394        tlb_set_dirty(ndi->cpu, ndi->mem_vaddr);
2395    }
2396}
2397
2398/* Called within RCU critical section.  */
2399static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
2400                               uint64_t val, unsigned size)
2401{
2402    NotDirtyInfo ndi;
2403
2404    memory_notdirty_write_prepare(&ndi, current_cpu, current_cpu->mem_io_vaddr,
2405                         ram_addr, size);
2406
2407    switch (size) {
2408    case 1:
2409        stb_p(qemu_map_ram_ptr(NULL, ram_addr), val);
2410        break;
2411    case 2:
2412        stw_p(qemu_map_ram_ptr(NULL, ram_addr), val);
2413        break;
2414    case 4:
2415        stl_p(qemu_map_ram_ptr(NULL, ram_addr), val);
2416        break;
2417    case 8:
2418        stq_p(qemu_map_ram_ptr(NULL, ram_addr), val);
2419        break;
2420    default:
2421        abort();
2422    }
2423    memory_notdirty_write_complete(&ndi);
2424}
2425
2426static bool notdirty_mem_accepts(void *opaque, hwaddr addr,
2427                                 unsigned size, bool is_write)
2428{
2429    return is_write;
2430}
2431
2432static const MemoryRegionOps notdirty_mem_ops = {
2433    .write = notdirty_mem_write,
2434    .valid.accepts = notdirty_mem_accepts,
2435    .endianness = DEVICE_NATIVE_ENDIAN,
2436    .valid = {
2437        .min_access_size = 1,
2438        .max_access_size = 8,
2439        .unaligned = false,
2440    },
2441    .impl = {
2442        .min_access_size = 1,
2443        .max_access_size = 8,
2444        .unaligned = false,
2445    },
2446};
2447
2448/* Generate a debug exception if a watchpoint has been hit.  */
2449static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags)
2450{
2451    CPUState *cpu = current_cpu;
2452    CPUClass *cc = CPU_GET_CLASS(cpu);
2453    target_ulong vaddr;
2454    CPUWatchpoint *wp;
2455
2456    assert(tcg_enabled());
2457    if (cpu->watchpoint_hit) {
2458        /* We re-entered the check after replacing the TB. Now raise
2459         * the debug interrupt so that it will trigger after the
2460         * current instruction. */
2461        cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG);
2462        return;
2463    }
2464    vaddr = (cpu->mem_io_vaddr & TARGET_PAGE_MASK) + offset;
2465    vaddr = cc->adjust_watchpoint_address(cpu, vaddr, len);
2466    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
2467        if (cpu_watchpoint_address_matches(wp, vaddr, len)
2468            && (wp->flags & flags)) {
2469            if (flags == BP_MEM_READ) {
2470                wp->flags |= BP_WATCHPOINT_HIT_READ;
2471            } else {
2472                wp->flags |= BP_WATCHPOINT_HIT_WRITE;
2473            }
2474            wp->hitaddr = vaddr;
2475            wp->hitattrs = attrs;
2476            if (!cpu->watchpoint_hit) {
2477                if (wp->flags & BP_CPU &&
2478                    !cc->debug_check_watchpoint(cpu, wp)) {
2479                    wp->flags &= ~BP_WATCHPOINT_HIT;
2480                    continue;
2481                }
2482                cpu->watchpoint_hit = wp;
2483
2484                /* Both tb_lock and iothread_mutex will be reset when
2485                 * cpu_loop_exit or cpu_loop_exit_noexc longjmp
2486                 * back into the cpu_exec main loop.
2487                 */
2488                tb_lock();
2489                tb_check_watchpoint(cpu);
2490                if (wp->flags & BP_STOP_BEFORE_ACCESS) {
2491                    cpu->exception_index = EXCP_DEBUG;
2492                    cpu_loop_exit(cpu);
2493                } else {
2494                    /* Force execution of one insn next time.  */
2495                    cpu->cflags_next_tb = 1 | curr_cflags();
2496                    cpu_loop_exit_noexc(cpu);
2497                }
2498            }
2499        } else {
2500            wp->flags &= ~BP_WATCHPOINT_HIT;
2501        }
2502    }
2503}
2504
2505/* Watchpoint access routines.  Watchpoints are inserted using TLB tricks,
2506   so these check for a hit then pass through to the normal out-of-line
2507   phys routines.  */
2508static MemTxResult watch_mem_read(void *opaque, hwaddr addr, uint64_t *pdata,
2509                                  unsigned size, MemTxAttrs attrs)
2510{
2511    MemTxResult res;
2512    uint64_t data;
2513    int asidx = cpu_asidx_from_attrs(current_cpu, attrs);
2514    AddressSpace *as = current_cpu->cpu_ases[asidx].as;
2515
2516    check_watchpoint(addr & ~TARGET_PAGE_MASK, size, attrs, BP_MEM_READ);
2517    switch (size) {
2518    case 1:
2519        data = address_space_ldub(as, addr, attrs, &res);
2520        break;
2521    case 2:
2522        data = address_space_lduw(as, addr, attrs, &res);
2523        break;
2524    case 4:
2525        data = address_space_ldl(as, addr, attrs, &res);
2526        break;
2527    case 8:
2528        data = address_space_ldq(as, addr, attrs, &res);
2529        break;
2530    default: abort();
2531    }
2532    *pdata = data;
2533    return res;
2534}
2535
2536static MemTxResult watch_mem_write(void *opaque, hwaddr addr,
2537                                   uint64_t val, unsigned size,
2538                                   MemTxAttrs attrs)
2539{
2540    MemTxResult res;
2541    int asidx = cpu_asidx_from_attrs(current_cpu, attrs);
2542    AddressSpace *as = current_cpu->cpu_ases[asidx].as;
2543
2544    check_watchpoint(addr & ~TARGET_PAGE_MASK, size, attrs, BP_MEM_WRITE);
2545    switch (size) {
2546    case 1:
2547        address_space_stb(as, addr, val, attrs, &res);
2548        break;
2549    case 2:
2550        address_space_stw(as, addr, val, attrs, &res);
2551        break;
2552    case 4:
2553        address_space_stl(as, addr, val, attrs, &res);
2554        break;
2555    case 8:
2556        address_space_stq(as, addr, val, attrs, &res);
2557        break;
2558    default: abort();
2559    }
2560    return res;
2561}
2562
2563static const MemoryRegionOps watch_mem_ops = {
2564    .read_with_attrs = watch_mem_read,
2565    .write_with_attrs = watch_mem_write,
2566    .endianness = DEVICE_NATIVE_ENDIAN,
2567    .valid = {
2568        .min_access_size = 1,
2569        .max_access_size = 8,
2570        .unaligned = false,
2571    },
2572    .impl = {
2573        .min_access_size = 1,
2574        .max_access_size = 8,
2575        .unaligned = false,
2576    },
2577};
2578
2579static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
2580                                      MemTxAttrs attrs, uint8_t *buf, int len);
2581static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
2582                                  const uint8_t *buf, int len);
2583static bool flatview_access_valid(FlatView *fv, hwaddr addr, int len,
2584                                  bool is_write);
2585
2586static MemTxResult subpage_read(void *opaque, hwaddr addr, uint64_t *data,
2587                                unsigned len, MemTxAttrs attrs)
2588{
2589    subpage_t *subpage = opaque;
2590    uint8_t buf[8];
2591    MemTxResult res;
2592
2593#if defined(DEBUG_SUBPAGE)
2594    printf("%s: subpage %p len %u addr " TARGET_FMT_plx "\n", __func__,
2595           subpage, len, addr);
2596#endif
2597    res = flatview_read(subpage->fv, addr + subpage->base, attrs, buf, len);
2598    if (res) {
2599        return res;
2600    }
2601    switch (len) {
2602    case 1:
2603        *data = ldub_p(buf);
2604        return MEMTX_OK;
2605    case 2:
2606        *data = lduw_p(buf);
2607        return MEMTX_OK;
2608    case 4:
2609        *data = ldl_p(buf);
2610        return MEMTX_OK;
2611    case 8:
2612        *data = ldq_p(buf);
2613        return MEMTX_OK;
2614    default:
2615        abort();
2616    }
2617}
2618
2619static MemTxResult subpage_write(void *opaque, hwaddr addr,
2620                                 uint64_t value, unsigned len, MemTxAttrs attrs)
2621{
2622    subpage_t *subpage = opaque;
2623    uint8_t buf[8];
2624
2625#if defined(DEBUG_SUBPAGE)
2626    printf("%s: subpage %p len %u addr " TARGET_FMT_plx
2627           " value %"PRIx64"\n",
2628           __func__, subpage, len, addr, value);
2629#endif
2630    switch (len) {
2631    case 1:
2632        stb_p(buf, value);
2633        break;
2634    case 2:
2635        stw_p(buf, value);
2636        break;
2637    case 4:
2638        stl_p(buf, value);
2639        break;
2640    case 8:
2641        stq_p(buf, value);
2642        break;
2643    default:
2644        abort();
2645    }
2646    return flatview_write(subpage->fv, addr + subpage->base, attrs, buf, len);
2647}
2648
2649static bool subpage_accepts(void *opaque, hwaddr addr,
2650                            unsigned len, bool is_write)
2651{
2652    subpage_t *subpage = opaque;
2653#if defined(DEBUG_SUBPAGE)
2654    printf("%s: subpage %p %c len %u addr " TARGET_FMT_plx "\n",
2655           __func__, subpage, is_write ? 'w' : 'r', len, addr);
2656#endif
2657
2658    return flatview_access_valid(subpage->fv, addr + subpage->base,
2659                                 len, is_write);
2660}
2661
2662static const MemoryRegionOps subpage_ops = {
2663    .read_with_attrs = subpage_read,
2664    .write_with_attrs = subpage_write,
2665    .impl.min_access_size = 1,
2666    .impl.max_access_size = 8,
2667    .valid.min_access_size = 1,
2668    .valid.max_access_size = 8,
2669    .valid.accepts = subpage_accepts,
2670    .endianness = DEVICE_NATIVE_ENDIAN,
2671};
2672
2673static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
2674                             uint16_t section)
2675{
2676    int idx, eidx;
2677
2678    if (start >= TARGET_PAGE_SIZE || end >= TARGET_PAGE_SIZE)
2679        return -1;
2680    idx = SUBPAGE_IDX(start);
2681    eidx = SUBPAGE_IDX(end);
2682#if defined(DEBUG_SUBPAGE)
2683    printf("%s: %p start %08x end %08x idx %08x eidx %08x section %d\n",
2684           __func__, mmio, start, end, idx, eidx, section);
2685#endif
2686    for (; idx <= eidx; idx++) {
2687        mmio->sub_section[idx] = section;
2688    }
2689
2690    return 0;
2691}
2692
2693static subpage_t *subpage_init(FlatView *fv, hwaddr base)
2694{
2695    subpage_t *mmio;
2696
2697    mmio = g_malloc0(sizeof(subpage_t) + TARGET_PAGE_SIZE * sizeof(uint16_t));
2698    mmio->fv = fv;
2699    mmio->base = base;
2700    memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio,
2701                          NULL, TARGET_PAGE_SIZE);
2702    mmio->iomem.subpage = true;
2703#if defined(DEBUG_SUBPAGE)
2704    printf("%s: %p base " TARGET_FMT_plx " len %08x\n", __func__,
2705           mmio, base, TARGET_PAGE_SIZE);
2706#endif
2707    subpage_register(mmio, 0, TARGET_PAGE_SIZE-1, PHYS_SECTION_UNASSIGNED);
2708
2709    return mmio;
2710}
2711
2712static uint16_t dummy_section(PhysPageMap *map, FlatView *fv, MemoryRegion *mr)
2713{
2714    assert(fv);
2715    MemoryRegionSection section = {
2716        .fv = fv,
2717        .mr = mr,
2718        .offset_within_address_space = 0,
2719        .offset_within_region = 0,
2720        .size = int128_2_64(),
2721    };
2722
2723    return phys_section_add(map, &section);
2724}
2725
2726MemoryRegion *iotlb_to_region(CPUState *cpu, hwaddr index, MemTxAttrs attrs)
2727{
2728    int asidx = cpu_asidx_from_attrs(cpu, attrs);
2729    CPUAddressSpace *cpuas = &cpu->cpu_ases[asidx];
2730    AddressSpaceDispatch *d = atomic_rcu_read(&cpuas->memory_dispatch);
2731    MemoryRegionSection *sections = d->map.sections;
2732
2733    return sections[index & ~TARGET_PAGE_MASK].mr;
2734}
2735
2736static void io_mem_init(void)
2737{
2738    memory_region_init_io(&io_mem_rom, NULL, &unassigned_mem_ops, NULL, NULL, UINT64_MAX);
2739    memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
2740                          NULL, UINT64_MAX);
2741
2742    /* io_mem_notdirty calls tb_invalidate_phys_page_fast,
2743     * which can be called without the iothread mutex.
2744     */
2745    memory_region_init_io(&io_mem_notdirty, NULL, &notdirty_mem_ops, NULL,
2746                          NULL, UINT64_MAX);
2747    memory_region_clear_global_locking(&io_mem_notdirty);
2748
2749    memory_region_init_io(&io_mem_watch, NULL, &watch_mem_ops, NULL,
2750                          NULL, UINT64_MAX);
2751}
2752
2753AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv)
2754{
2755    AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);
2756    uint16_t n;
2757
2758    n = dummy_section(&d->map, fv, &io_mem_unassigned);
2759    assert(n == PHYS_SECTION_UNASSIGNED);
2760    n = dummy_section(&d->map, fv, &io_mem_notdirty);
2761    assert(n == PHYS_SECTION_NOTDIRTY);
2762    n = dummy_section(&d->map, fv, &io_mem_rom);
2763    assert(n == PHYS_SECTION_ROM);
2764    n = dummy_section(&d->map, fv, &io_mem_watch);
2765    assert(n == PHYS_SECTION_WATCH);
2766
2767    d->phys_map  = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
2768
2769    return d;
2770}
2771
2772void address_space_dispatch_free(AddressSpaceDispatch *d)
2773{
2774    phys_sections_free(&d->map);
2775    g_free(d);
2776}
2777
2778static void tcg_commit(MemoryListener *listener)
2779{
2780    CPUAddressSpace *cpuas;
2781    AddressSpaceDispatch *d;
2782
2783    /* since each CPU stores ram addresses in its TLB cache, we must
2784       reset the modified entries */
2785    cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
2786    cpu_reloading_memory_map();
2787    /* The CPU and TLB are protected by the iothread lock.
2788     * We reload the dispatch pointer now because cpu_reloading_memory_map()
2789     * may have split the RCU critical section.
2790     */
2791    d = address_space_to_dispatch(cpuas->as);
2792    atomic_rcu_set(&cpuas->memory_dispatch, d);
2793    tlb_flush(cpuas->cpu);
2794}
2795
2796static void memory_map_init(void)
2797{
2798    system_memory = g_malloc(sizeof(*system_memory));
2799
2800    memory_region_init(system_memory, NULL, "system", UINT64_MAX);
2801    address_space_init(&address_space_memory, system_memory, "memory");
2802
2803    system_io = g_malloc(sizeof(*system_io));
2804    memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io",
2805                          65536);
2806    address_space_init(&address_space_io, system_io, "I/O");
2807}
2808
2809MemoryRegion *get_system_memory(void)
2810{
2811    return system_memory;
2812}
2813
2814MemoryRegion *get_system_io(void)
2815{
2816    return system_io;
2817}
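
/*
 * Sketch of the usual consumer of get_system_memory(): board code creates an
 * MMIO region and maps it at a fixed guest-physical base.  example_dev_ops,
 * the opaque pointer and the 0x10000000 base are hypothetical; the mapping
 * itself goes through the generic memory API.
 */
static inline void example_map_mmio(MemoryRegion *dev_mr,
                                    const MemoryRegionOps *example_dev_ops,
                                    void *dev_opaque)
{
    memory_region_init_io(dev_mr, NULL, example_dev_ops, dev_opaque,
                          "example-dev-mmio", 0x1000);
    memory_region_add_subregion(get_system_memory(), 0x10000000, dev_mr);
}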
2818
2819#endif /* !defined(CONFIG_USER_ONLY) */
2820
2821/* physical memory access (slow version, mainly for debug) */
2822#if defined(CONFIG_USER_ONLY)
2823int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
2824                        uint8_t *buf, int len, int is_write)
2825{
2826    int l, flags;
2827    target_ulong page;
2828    void * p;
2829
2830    while (len > 0) {
2831        page = addr & TARGET_PAGE_MASK;
2832        l = (page + TARGET_PAGE_SIZE) - addr;
2833        if (l > len)
2834            l = len;
2835        flags = page_get_flags(page);
2836        if (!(flags & PAGE_VALID))
2837            return -1;
2838        if (is_write) {
2839            if (!(flags & PAGE_WRITE))
2840                return -1;
2841            /* XXX: this code should not depend on lock_user */
2842            if (!(p = lock_user(VERIFY_WRITE, addr, l, 0)))
2843                return -1;
2844            memcpy(p, buf, l);
2845            unlock_user(p, addr, l);
2846        } else {
2847            if (!(flags & PAGE_READ))
2848                return -1;
2849            /* XXX: this code should not depend on lock_user */
2850            if (!(p = lock_user(VERIFY_READ, addr, l, 1)))
2851                return -1;
2852            memcpy(buf, p, l);
2853            unlock_user(p, addr, 0);
2854        }
2855        len -= l;
2856        buf += l;
2857        addr += l;
2858    }
2859    return 0;
2860}
2861
2862#else
2863
2864static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
2865                                     hwaddr length)
2866{
2867    uint8_t dirty_log_mask = memory_region_get_dirty_log_mask(mr);
2868    addr += memory_region_get_ram_addr(mr);
2869
2870    /* No early return if dirty_log_mask is or becomes 0, because
2871     * cpu_physical_memory_set_dirty_range will still call
2872     * xen_modified_memory.
2873     */
2874    if (dirty_log_mask) {
2875        dirty_log_mask =
2876            cpu_physical_memory_range_includes_clean(addr, length, dirty_log_mask);
2877    }
2878    if (dirty_log_mask & (1 << DIRTY_MEMORY_CODE)) {
2879        assert(tcg_enabled());
2880        tb_lock();
2881        tb_invalidate_phys_range(addr, addr + length);
2882        tb_unlock();
2883        dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
2884    }
2885    cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask);
2886}
2887
2888static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
2889{
2890    unsigned access_size_max = mr->ops->valid.max_access_size;
2891
2892    /* Regions are assumed to support 1-4 byte accesses unless
2893       otherwise specified.  */
2894    if (access_size_max == 0) {
2895        access_size_max = 4;
2896    }
2897
2898    /* Bound the maximum access by the alignment of the address.  */
2899    if (!mr->ops->impl.unaligned) {
2900        unsigned align_size_max = addr & -addr;
2901        if (align_size_max != 0 && align_size_max < access_size_max) {
2902            access_size_max = align_size_max;
2903        }
2904    }
2905
2906    /* Don't attempt accesses larger than the maximum.  */
2907    if (l > access_size_max) {
2908        l = access_size_max;
2909    }
2910    l = pow2floor(l);
2911
2912    return l;
2913}
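
/*
 * Worked example for the bounding above, with hypothetical numbers: a 6-byte
 * access at address 0x1002 to a region with valid.max_access_size == 4 and
 * impl.unaligned == false.  The alignment bound is 0x1002 & -0x1002 == 2, so
 * access_size_max drops to 2 and the first dispatch is a 2-byte access; the
 * read/write loops below then advance and retry with the remainder.
 */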
2914
2915static bool prepare_mmio_access(MemoryRegion *mr)
2916{
2917    bool unlocked = !qemu_mutex_iothread_locked();
2918    bool release_lock = false;
2919
2920    if (unlocked && mr->global_locking) {
2921        qemu_mutex_lock_iothread();
2922        unlocked = false;
2923        release_lock = true;
2924    }
2925    if (mr->flush_coalesced_mmio) {
2926        if (unlocked) {
2927            qemu_mutex_lock_iothread();
2928        }
2929        qemu_flush_coalesced_mmio_buffer();
2930        if (unlocked) {
2931            qemu_mutex_unlock_iothread();
2932        }
2933    }
2934
2935    return release_lock;
2936}
2937
2938/* Called within RCU critical section.  */
2939static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr,
2940                                           MemTxAttrs attrs,
2941                                           const uint8_t *buf,
2942                                           int len, hwaddr addr1,
2943                                           hwaddr l, MemoryRegion *mr)
2944{
2945    uint8_t *ptr;
2946    uint64_t val;
2947    MemTxResult result = MEMTX_OK;
2948    bool release_lock = false;
2949
2950    for (;;) {
2951        if (!memory_access_is_direct(mr, true)) {
2952            release_lock |= prepare_mmio_access(mr);
2953            l = memory_access_size(mr, l, addr1);
2954            /* XXX: could force current_cpu to NULL to avoid
2955               potential bugs */
2956            switch (l) {
2957            case 8:
2958                /* 64 bit write access */
2959                val = ldq_p(buf);
2960                result |= memory_region_dispatch_write(mr, addr1, val, 8,
2961                                                       attrs);
2962                break;
2963            case 4:
2964                /* 32 bit write access */
2965                val = (uint32_t)ldl_p(buf);
2966                result |= memory_region_dispatch_write(mr, addr1, val, 4,
2967                                                       attrs);
2968                break;
2969            case 2:
2970                /* 16 bit write access */
2971                val = lduw_p(buf);
2972                result |= memory_region_dispatch_write(mr, addr1, val, 2,
2973                                                       attrs);
2974                break;
2975            case 1:
2976                /* 8 bit write access */
2977                val = ldub_p(buf);
2978                result |= memory_region_dispatch_write(mr, addr1, val, 1,
2979                                                       attrs);
2980                break;
2981            default:
2982                abort();
2983            }
2984        } else {
2985            /* RAM case */
2986            ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
2987            memcpy(ptr, buf, l);
2988            invalidate_and_set_dirty(mr, addr1, l);
2989        }
2990
2991        if (release_lock) {
2992            qemu_mutex_unlock_iothread();
2993            release_lock = false;
2994        }
2995
2996        len -= l;
2997        buf += l;
2998        addr += l;
2999
3000        if (!len) {
3001            break;
3002        }
3003
3004        l = len;
3005        mr = flatview_translate(fv, addr, &addr1, &l, true);
3006    }
3007
3008    return result;
3009}
3010
3011/* Called from RCU critical section.  */
3012static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
3013                                  const uint8_t *buf, int len)
3014{
3015    hwaddr l;
3016    hwaddr addr1;
3017    MemoryRegion *mr;
3018    MemTxResult result = MEMTX_OK;
3019
3020    l = len;
3021    mr = flatview_translate(fv, addr, &addr1, &l, true);
3022    result = flatview_write_continue(fv, addr, attrs, buf, len,
3023                                     addr1, l, mr);
3024
3025    return result;
3026}
3027
3028/* Called within RCU critical section.  */
3029MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr,
3030                                   MemTxAttrs attrs, uint8_t *buf,
3031                                   int len, hwaddr addr1, hwaddr l,
3032                                   MemoryRegion *mr)
3033{
3034    uint8_t *ptr;
3035    uint64_t val;
3036    MemTxResult result = MEMTX_OK;
3037    bool release_lock = false;
3038
3039    for (;;) {
3040        if (!memory_access_is_direct(mr, false)) {
3041            /* I/O case */
3042            release_lock |= prepare_mmio_access(mr);
3043            l = memory_access_size(mr, l, addr1);
3044            switch (l) {
3045            case 8:
3046                /* 64 bit read access */
3047                result |= memory_region_dispatch_read(mr, addr1, &val, 8,
3048                                                      attrs);
3049                stq_p(buf, val);
3050                break;
3051            case 4:
3052                /* 32 bit read access */
3053                result |= memory_region_dispatch_read(mr, addr1, &val, 4,
3054                                                      attrs);
3055                stl_p(buf, val);
3056                break;
3057            case 2:
3058                /* 16 bit read access */
3059                result |= memory_region_dispatch_read(mr, addr1, &val, 2,
3060                                                      attrs);
3061                stw_p(buf, val);
3062                break;
3063            case 1:
3064                /* 8 bit read access */
3065                result |= memory_region_dispatch_read(mr, addr1, &val, 1,
3066                                                      attrs);
3067                stb_p(buf, val);
3068                break;
3069            default:
3070                abort();
3071            }
3072        } else {
3073            /* RAM case */
3074            ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
3075            memcpy(buf, ptr, l);
3076        }
3077
3078        if (release_lock) {
3079            qemu_mutex_unlock_iothread();
3080            release_lock = false;
3081        }
3082
3083        len -= l;
3084        buf += l;
3085        addr += l;
3086
3087        if (!len) {
3088            break;
3089        }
3090
3091        l = len;
3092        mr = flatview_translate(fv, addr, &addr1, &l, false);
3093    }
3094
3095    return result;
3096}
3097
3098/* Called from RCU critical section.  */
3099static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
3100                                 MemTxAttrs attrs, uint8_t *buf, int len)
3101{
3102    hwaddr l;
3103    hwaddr addr1;
3104    MemoryRegion *mr;
3105
3106    l = len;
3107    mr = flatview_translate(fv, addr, &addr1, &l, false);
3108    return flatview_read_continue(fv, addr, attrs, buf, len,
3109                                  addr1, l, mr);
3110}
3111
3112MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
3113                                    MemTxAttrs attrs, uint8_t *buf, int len)
3114{
3115    MemTxResult result = MEMTX_OK;
3116    FlatView *fv;
3117
3118    if (len > 0) {
3119        rcu_read_lock();
3120        fv = address_space_to_flatview(as);
3121        result = flatview_read(fv, addr, attrs, buf, len);
3122        rcu_read_unlock();
3123    }
3124
3125    return result;
3126}
3127
3128MemTxResult address_space_write(AddressSpace *as, hwaddr addr,
3129                                MemTxAttrs attrs,
3130                                const uint8_t *buf, int len)
3131{
3132    MemTxResult result = MEMTX_OK;
3133    FlatView *fv;
3134
3135    if (len > 0) {
3136        rcu_read_lock();
3137        fv = address_space_to_flatview(as);
3138        result = flatview_write(fv, addr, attrs, buf, len);
3139        rcu_read_unlock();
3140    }
3141
3142    return result;
3143}
3144
3145MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
3146                             uint8_t *buf, int len, bool is_write)
3147{
3148    if (is_write) {
3149        return address_space_write(as, addr, attrs, buf, len);
3150    } else {
3151        return address_space_read_full(as, addr, attrs, buf, len);
3152    }
3153}
3154
3155void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
3156                            int len, int is_write)
3157{
3158    address_space_rw(&address_space_memory, addr, MEMTXATTRS_UNSPECIFIED,
3159                     buf, len, is_write);
3160}
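
/*
 * DMA sketch on top of the helpers above: copy a descriptor to a hypothetical
 * guest-physical address and check the MemTxResult instead of assuming the
 * write landed.  The names are placeholders.
 */
static inline bool example_dma_write_desc(hwaddr desc_gpa, const uint8_t *desc,
                                          int desc_len)
{
    MemTxResult res = address_space_write(&address_space_memory, desc_gpa,
                                          MEMTXATTRS_UNSPECIFIED,
                                          desc, desc_len);
    return res == MEMTX_OK;
}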
3161
3162enum write_rom_type {
3163    WRITE_DATA,
3164    FLUSH_CACHE,
3165};
3166
3167static inline void cpu_physical_memory_write_rom_internal(AddressSpace *as,
3168    hwaddr addr, const uint8_t *buf, int len, enum write_rom_type type)
3169{
3170    hwaddr l;
3171    uint8_t *ptr;
3172    hwaddr addr1;
3173    MemoryRegion *mr;
3174
3175    rcu_read_lock();
3176    while (len > 0) {
3177        l = len;
3178        mr = address_space_translate(as, addr, &addr1, &l, true);
3179
3180        if (!(memory_region_is_ram(mr) ||
3181              memory_region_is_romd(mr))) {
3182            l = memory_access_size(mr, l, addr1);
3183        } else {
3184            /* ROM/RAM case */
3185            ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
3186            switch (type) {
3187            case WRITE_DATA:
3188                memcpy(ptr, buf, l);
3189                invalidate_and_set_dirty(mr, addr1, l);
3190                break;
3191            case FLUSH_CACHE:
3192                flush_icache_range((uintptr_t)ptr, (uintptr_t)ptr + l);
3193                break;
3194            }
3195        }
3196        len -= l;
3197        buf += l;
3198        addr += l;
3199    }
3200    rcu_read_unlock();
3201}
3202
3203/* used for ROM loading : can write in RAM and ROM */
3204void cpu_physical_memory_write_rom(AddressSpace *as, hwaddr addr,
3205                                   const uint8_t *buf, int len)
3206{
3207    cpu_physical_memory_write_rom_internal(as, addr, buf, len, WRITE_DATA);
3208}
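
/*
 * ROM-loading sketch: the helper above writes straight into the backing
 * memory of RAM and ROM-device regions, which is what firmware loaders need.
 * The reset-vector address and the blob are hypothetical placeholders.
 */
static inline void example_install_reset_vector(const uint8_t *blob, int len)
{
    cpu_physical_memory_write_rom(&address_space_memory, 0xfffffff0ULL,
                                  blob, len);
}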
3209
3210void cpu_flush_icache_range(hwaddr start, int len)
3211{
3212    /*
3213     * This function should do the same thing as an icache flush that was
3214     * triggered from within the guest. For TCG we are always cache coherent,
3215     * so there is no need to flush anything. For KVM / Xen we need to flush
3216     * the host's instruction cache at least.
3217     */
3218    if (tcg_enabled()) {
3219        return;
3220    }
3221
3222    cpu_physical_memory_write_rom_internal(&address_space_memory,
3223                                           start, NULL, len, FLUSH_CACHE);
3224}
3225
3226typedef struct {
3227    MemoryRegion *mr;
3228    void *buffer;
3229    hwaddr addr;
3230    hwaddr len;
3231    bool in_use;
3232} BounceBuffer;
3233
3234static BounceBuffer bounce;
3235
3236typedef struct MapClient {
3237    QEMUBH *bh;
3238    QLIST_ENTRY(MapClient) link;
3239} MapClient;
3240
3241QemuMutex map_client_list_lock;
3242static QLIST_HEAD(map_client_list, MapClient) map_client_list
3243    = QLIST_HEAD_INITIALIZER(map_client_list);
3244
3245static void cpu_unregister_map_client_do(MapClient *client)
3246{
3247    QLIST_REMOVE(client, link);
3248    g_free(client);
3249}
3250
3251static void cpu_notify_map_clients_locked(void)
3252{
3253    MapClient *client;
3254
3255    while (!QLIST_EMPTY(&map_client_list)) {
3256        client = QLIST_FIRST(&map_client_list);
3257        qemu_bh_schedule(client->bh);
3258        cpu_unregister_map_client_do(client);
3259    }
3260}
3261
3262void cpu_register_map_client(QEMUBH *bh)
3263{
3264    MapClient *client = g_malloc(sizeof(*client));
3265
3266    qemu_mutex_lock(&map_client_list_lock);
3267    client->bh = bh;
3268    QLIST_INSERT_HEAD(&map_client_list, client, link);
3269    if (!atomic_read(&bounce.in_use)) {
3270        cpu_notify_map_clients_locked();
3271    }
3272    qemu_mutex_unlock(&map_client_list_lock);
3273}
3274
3275void cpu_exec_init_all(void)
3276{
3277    qemu_mutex_init(&ram_list.mutex);
3278    /* The data structures we set up here depend on knowing the page size,
3279     * so no more changes can be made after this point.
3280     * In an ideal world, nothing we did before we had finished the
3281     * machine setup would care about the target page size, and we could
3282     * do this much later, rather than requiring board models to state
3283     * up front what their requirements are.
3284     */
3285    finalize_target_page_bits();
3286    io_mem_init();
3287    memory_map_init();
3288    qemu_mutex_init(&map_client_list_lock);
3289}
3290
3291void cpu_unregister_map_client(QEMUBH *bh)
3292{
3293    MapClient *client;
3294
3295    qemu_mutex_lock(&map_client_list_lock);
3296    QLIST_FOREACH(client, &map_client_list, link) {
3297        if (client->bh == bh) {
3298            cpu_unregister_map_client_do(client);
3299            break;
3300        }
3301    }
3302    qemu_mutex_unlock(&map_client_list_lock);
3303}
3304
3305static void cpu_notify_map_clients(void)
3306{
3307    qemu_mutex_lock(&map_client_list_lock);
3308    cpu_notify_map_clients_locked();
3309    qemu_mutex_unlock(&map_client_list_lock);
3310}
3311
3312static bool flatview_access_valid(FlatView *fv, hwaddr addr, int len,
3313                                  bool is_write)
3314{
3315    MemoryRegion *mr;
3316    hwaddr l, xlat;
3317
3318    while (len > 0) {
3319        l = len;
3320        mr = flatview_translate(fv, addr, &xlat, &l, is_write);
3321        if (!memory_access_is_direct(mr, is_write)) {
3322            l = memory_access_size(mr, l, addr);
3323            if (!memory_region_access_valid(mr, xlat, l, is_write)) {
3324                return false;
3325            }
3326        }
3327
3328        len -= l;
3329        addr += l;
3330    }
3331    return true;
3332}
3333
3334bool address_space_access_valid(AddressSpace *as, hwaddr addr,
3335                                int len, bool is_write)
3336{
3337    FlatView *fv;
3338    bool result;
3339
3340    rcu_read_lock();
3341    fv = address_space_to_flatview(as);
3342    result = flatview_access_valid(fv, addr, len, is_write);
3343    rcu_read_unlock();
3344    return result;
3345}
3346
3347static hwaddr
3348flatview_extend_translation(FlatView *fv, hwaddr addr,
3349                                 hwaddr target_len,
3350                                 MemoryRegion *mr, hwaddr base, hwaddr len,
3351                                 bool is_write)
3352{
3353    hwaddr done = 0;
3354    hwaddr xlat;
3355    MemoryRegion *this_mr;
3356
3357    for (;;) {
3358        target_len -= len;
3359        addr += len;
3360        done += len;
3361        if (target_len == 0) {
3362            return done;
3363        }
3364
3365        len = target_len;
3366        this_mr = flatview_translate(fv, addr, &xlat,
3367                                                   &len, is_write);
3368        if (this_mr != mr || xlat != base + done) {
3369            return done;
3370        }
3371    }
3372}
3373
3374/* Map a physical memory region into a host virtual address.
3375 * May map a subset of the requested range, given by and returned in *plen.
3376 * May return NULL if resources needed to perform the mapping are exhausted.
3377 * Use only for reads OR writes - not for read-modify-write operations.
3378 * Use cpu_register_map_client() to know when retrying the map operation is
3379 * likely to succeed.
3380 */
3381void *address_space_map(AddressSpace *as,
3382                        hwaddr addr,
3383                        hwaddr *plen,
3384                        bool is_write)
3385{
3386    hwaddr len = *plen;
3387    hwaddr l, xlat;
3388    MemoryRegion *mr;
3389    void *ptr;
3390    FlatView *fv;
3391
3392    if (len == 0) {
3393        return NULL;
3394    }
3395
3396    l = len;
3397    rcu_read_lock();
3398    fv = address_space_to_flatview(as);
3399    mr = flatview_translate(fv, addr, &xlat, &l, is_write);
3400
3401    if (!memory_access_is_direct(mr, is_write)) {
3402        if (atomic_xchg(&bounce.in_use, true)) {
3403            rcu_read_unlock();
3404            return NULL;
3405        }
3406        /* Avoid unbounded allocations */
3407        l = MIN(l, TARGET_PAGE_SIZE);
3408        bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
3409        bounce.addr = addr;
3410        bounce.len = l;
3411
3412        memory_region_ref(mr);
3413        bounce.mr = mr;
3414        if (!is_write) {
3415            flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED,
3416                               bounce.buffer, l);
3417        }
3418
3419        rcu_read_unlock();
3420        *plen = l;
3421        return bounce.buffer;
3422    }
3423
3424
3425    memory_region_ref(mr);
3426    *plen = flatview_extend_translation(fv, addr, len, mr, xlat,
3427                                             l, is_write);
3428    ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen, true);
3429    rcu_read_unlock();
3430
3431    return ptr;
3432}
3433
3434/* Unmaps a memory region previously mapped by address_space_map().
3435 * Will also mark the memory as dirty if is_write == 1.  access_len gives
3436 * the amount of memory that was actually read or written by the caller.
3437 */
3438void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
3439                         int is_write, hwaddr access_len)
3440{
3441    if (buffer != bounce.buffer) {
3442        MemoryRegion *mr;
3443        ram_addr_t addr1;
3444
3445        mr = memory_region_from_host(buffer, &addr1);
3446        assert(mr != NULL);
3447        if (is_write) {
3448            invalidate_and_set_dirty(mr, addr1, access_len);
3449        }
3450        if (xen_enabled()) {
3451            xen_invalidate_map_cache_entry(buffer);
3452        }
3453        memory_region_unref(mr);
3454        return;
3455    }
3456    if (is_write) {
3457        address_space_write(as, bounce.addr, MEMTXATTRS_UNSPECIFIED,
3458                            bounce.buffer, access_len);
3459    }
3460    qemu_vfree(bounce.buffer);
3461    bounce.buffer = NULL;
3462    memory_region_unref(bounce.mr);
3463    atomic_mb_set(&bounce.in_use, false);
3464    cpu_notify_map_clients();
3465}
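
/*
 * Illustrative sketch (not part of the build): the usual read-side pattern
 * for the map/unmap pair.  "gpa", "size" and "dest" are hypothetical; real
 * callers such as the DMA helpers also register with
 * cpu_register_map_client() to retry when NULL is returned.
 *
 *   hwaddr plen = size;
 *   void *p = address_space_map(as, gpa, &plen, false);
 *   if (p) {
 *       memcpy(dest, p, plen);   // plen may have been reduced
 *       address_space_unmap(as, p, plen, false, plen);
 *   }
 */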
3466
3467void *cpu_physical_memory_map(hwaddr addr,
3468                              hwaddr *plen,
3469                              int is_write)
3470{
3471    return address_space_map(&address_space_memory, addr, plen, is_write);
3472}
3473
3474void cpu_physical_memory_unmap(void *buffer, hwaddr len,
3475                               int is_write, hwaddr access_len)
3476{
3477    return address_space_unmap(&address_space_memory, buffer, len,
                               is_write, access_len);
3478}
3479
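/*
 * The macros below instantiate the address_space_ld/st accessors from
 * memory_ldst.inc.c for a plain AddressSpace: ARG1 is the AddressSpace
 * itself and every access is translated with address_space_translate().
 */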
3480#define ARG1_DECL                AddressSpace *as
3481#define ARG1                     as
3482#define SUFFIX
3483#define TRANSLATE(...)           address_space_translate(as, __VA_ARGS__)
3484#define IS_DIRECT(mr, is_write)  memory_access_is_direct(mr, is_write)
3485#define MAP_RAM(mr, ofs)         qemu_map_ram_ptr((mr)->ram_block, ofs)
3486#define INVALIDATE(mr, ofs, len) invalidate_and_set_dirty(mr, ofs, len)
3487#define RCU_READ_LOCK(...)       rcu_read_lock()
3488#define RCU_READ_UNLOCK(...)     rcu_read_unlock()
3489#include "memory_ldst.inc.c"
3490
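/*
 * MemoryRegionCache support.  In this implementation the "cache" merely
 * records the AddressSpace, base address and length; the _cached accessors
 * generated below still translate on every access via
 * address_space_translate(cache->as, cache->xlat + addr, ...).
 */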
3491int64_t address_space_cache_init(MemoryRegionCache *cache,
3492                                 AddressSpace *as,
3493                                 hwaddr addr,
3494                                 hwaddr len,
3495                                 bool is_write)
3496{
3497    cache->len = len;
3498    cache->as = as;
3499    cache->xlat = addr;
3500    return len;
3501}
3502
3503void address_space_cache_invalidate(MemoryRegionCache *cache,
3504                                    hwaddr addr,
3505                                    hwaddr access_len)
3506{
3507}
3508
3509void address_space_cache_destroy(MemoryRegionCache *cache)
3510{
3511    cache->as = NULL;
3512}
3513
3514#define ARG1_DECL                MemoryRegionCache *cache
3515#define ARG1                     cache
3516#define SUFFIX                   _cached
3517#define TRANSLATE(addr, ...)     \
3518    address_space_translate(cache->as, cache->xlat + (addr), __VA_ARGS__)
3519#define IS_DIRECT(mr, is_write)  true
3520#define MAP_RAM(mr, ofs)         qemu_map_ram_ptr((mr)->ram_block, ofs)
3521#define INVALIDATE(mr, ofs, len) invalidate_and_set_dirty(mr, ofs, len)
3522#define RCU_READ_LOCK()          rcu_read_lock()
3523#define RCU_READ_UNLOCK()        rcu_read_unlock()
3524#include "memory_ldst.inc.c"
3525
3526/* virtual memory access for debug (includes writing to ROM) */
3527int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
3528                        uint8_t *buf, int len, int is_write)
3529{
3530    int l;
3531    hwaddr phys_addr;
3532    target_ulong page;
3533
3534    cpu_synchronize_state(cpu);
3535    while (len > 0) {
3536        int asidx;
3537        MemTxAttrs attrs;
3538
3539        page = addr & TARGET_PAGE_MASK;
3540        phys_addr = cpu_get_phys_page_attrs_debug(cpu, page, &attrs);
3541        asidx = cpu_asidx_from_attrs(cpu, attrs);
3542        /* if no physical page mapped, return an error */
3543        if (phys_addr == -1)
3544            return -1;
3545        l = (page + TARGET_PAGE_SIZE) - addr;
3546        if (l > len)
3547            l = len;
3548        phys_addr += (addr & ~TARGET_PAGE_MASK);
3549        if (is_write) {
3550            cpu_physical_memory_write_rom(cpu->cpu_ases[asidx].as,
3551                                          phys_addr, buf, l);
3552        } else {
3553            address_space_rw(cpu->cpu_ases[asidx].as, phys_addr,
3554                             MEMTXATTRS_UNSPECIFIED,
3555                             buf, l, 0);
3556        }
3557        len -= l;
3558        buf += l;
3559        addr += l;
3560    }
3561    return 0;
3562}
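
/*
 * Illustrative sketch (hypothetical caller): this is the gdbstub-style way
 * to peek at guest virtual memory; "vaddr" and "val" are assumed locals.
 *
 *   uint32_t val;
 *   if (cpu_memory_rw_debug(cpu, vaddr, (uint8_t *)&val,
 *                           sizeof(val), 0) < 0) {
 *       // no physical page is mapped at vaddr
 *   }
 */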
3563
3564/*
3565 * Allows code that needs to deal with migration bitmaps etc. to still be
3566 * built target independent.
3567 */
3568size_t qemu_target_page_size(void)
3569{
3570    return TARGET_PAGE_SIZE;
3571}
3572
3573int qemu_target_page_bits(void)
3574{
3575    return TARGET_PAGE_BITS;
3576}
3577
3578int qemu_target_page_bits_min(void)
3579{
3580    return TARGET_PAGE_BITS_MIN;
3581}
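
/*
 * Illustrative sketch: target-independent code (for example the migration
 * bitmap handling mentioned above) can convert a byte count into guest
 * pages without pulling in per-target headers; "size" is an assumed local.
 *
 *   uint64_t pages = size / qemu_target_page_size();
 */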
3582#endif
3583
3584/*
3585 * A helper function for the _utterly broken_ virtio device model to find out if
3586 * it's running on a big endian machine. Don't do this at home kids!
3587 */
3588bool target_words_bigendian(void);
3589bool target_words_bigendian(void)
3590{
3591#if defined(TARGET_WORDS_BIGENDIAN)
3592    return true;
3593#else
3594    return false;
3595#endif
3596}
3597
3598#ifndef CONFIG_USER_ONLY
3599bool cpu_physical_memory_is_io(hwaddr phys_addr)
3600{
3601    MemoryRegion *mr;
3602    hwaddr l = 1;
3603    bool res;
3604
3605    rcu_read_lock();
3606    mr = address_space_translate(&address_space_memory,
3607                                 phys_addr, &phys_addr, &l, false);
3608
3609    res = !(memory_region_is_ram(mr) || memory_region_is_romd(mr));
3610    rcu_read_unlock();
3611    return res;
3612}
3613
3614int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
3615{
3616    RAMBlock *block;
3617    int ret = 0;
3618
3619    rcu_read_lock();
3620    RAMBLOCK_FOREACH(block) {
3621        ret = func(block->idstr, block->host, block->offset,
3622                   block->used_length, opaque);
3623        if (ret) {
3624            break;
3625        }
3626    }
3627    rcu_read_unlock();
3628    return ret;
3629}
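
/*
 * Illustrative sketch: a RAMBlockIterFunc that sums up the used RAM of all
 * blocks; the parameters mirror the call made above and a non-zero return
 * value stops the walk early.  "sum_ram" is a hypothetical helper.
 *
 *   static int sum_ram(const char *idstr, void *host, ram_addr_t offset,
 *                      ram_addr_t used_length, void *opaque)
 *   {
 *       *(uint64_t *)opaque += used_length;
 *       return 0;
 *   }
 *
 *   uint64_t total = 0;
 *   qemu_ram_foreach_block(sum_ram, &total);
 */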
3630
3631/*
3632 * Unmap pages of memory from start to start+length such that
3633 * they a) read as 0, b) trigger whatever fault mechanism
3634 * the OS provides for postcopy.
3635 * The pages must be unmapped by the end of the function.
3636 * Returns: 0 on success, non-0 on failure
3637 *
3638 */
3639int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
3640{
3641    int ret = -1;
3642
3643    uint8_t *host_startaddr = rb->host + start;
3644
3645    if ((uintptr_t)host_startaddr & (rb->page_size - 1)) {
3646        error_report("ram_block_discard_range: Unaligned start address: %p",
3647                     host_startaddr);
3648        goto err;
3649    }
3650
3651    if ((start + length) <= rb->used_length) {
3652        uint8_t *host_endaddr = host_startaddr + length;
3653        if ((uintptr_t)host_endaddr & (rb->page_size - 1)) {
3654            error_report("ram_block_discard_range: Unaligned end address: %p",
3655                         host_endaddr);
3656            goto err;
3657        }
3658
3659        errno = ENOTSUP; /* If we are missing MADVISE etc */
3660
3661        if (rb->page_size == qemu_host_page_size) {
3662#if defined(CONFIG_MADVISE)
3663            /* Note: We need the madvise MADV_DONTNEED behaviour of definitely
3664             * freeing the page.
3665             */
3666            ret = madvise(host_startaddr, length, MADV_DONTNEED);
3667#endif
3668        } else {
3669            /* Huge page case  - unfortunately it can't do DONTNEED, but
3670             * it can do the equivalent by FALLOC_FL_PUNCH_HOLE in the
3671             * huge page file.
3672             */
3673#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
3674            ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
3675                            start, length);
3676#endif
3677        }
3678        if (ret) {
3679            ret = -errno;
3680            error_report("ram_block_discard_range: Failed to discard range "
3681                         "%s:%" PRIx64 " +%zx (%d)",
3682                         rb->idstr, start, length, ret);
3683        }
3684    } else {
3685        error_report("ram_block_discard_range: Overrun block '%s' (%" PRIu64
3686                     "/%zx/" RAM_ADDR_FMT")",
3687                     rb->idstr, start, length, rb->used_length);
3688    }
3689
3690err:
3691    return ret;
3692}
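
/*
 * Illustrative sketch (hypothetical caller): postcopy-style discard of a
 * single page of a block, with "offset" assumed to be page aligned as the
 * checks above require.
 *
 *   if (ram_block_discard_range(rb, offset, qemu_ram_pagesize(rb)) != 0) {
 *       // the page could not be dropped; postcopy treats this as fatal
 *   }
 */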
3693
3694#endif
3695
3696void page_size_init(void)
3697{
3698    /* NOTE: we can always suppose that qemu_host_page_size >=
3699       TARGET_PAGE_SIZE */
3700    if (qemu_host_page_size == 0) {
3701        qemu_host_page_size = qemu_real_host_page_size;
3702    }
3703    if (qemu_host_page_size < TARGET_PAGE_SIZE) {
3704        qemu_host_page_size = TARGET_PAGE_SIZE;
3705    }
3706    qemu_host_page_mask = -(intptr_t)qemu_host_page_size;
3707}
3708
3709#if !defined(CONFIG_USER_ONLY)
3710
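/*
 * Debug helpers for dumping an AddressSpaceDispatch from the monitor's
 * memory-tree command: mtree_print_phys_entries() prints one compressed row
 * of phys_map radix-tree entries and mtree_print_dispatch() walks the
 * sections and nodes of a dispatch.
 */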
3711static void mtree_print_phys_entries(fprintf_function mon, void *f,
3712                                     int start, int end, int skip, int ptr)
3713{
3714    if (start == end - 1) {
3715        mon(f, "\t%3d      ", start);
3716    } else {
3717        mon(f, "\t%3d..%-3d ", start, end - 1);
3718    }
3719    mon(f, " skip=%d ", skip);
3720    if (ptr == PHYS_MAP_NODE_NIL) {
3721        mon(f, " ptr=NIL");
3722    } else if (!skip) {
3723        mon(f, " ptr=#%d", ptr);
3724    } else {
3725        mon(f, " ptr=[%d]", ptr);
3726    }
3727    mon(f, "\n");
3728}
3729
3730#define MR_SIZE(size) (int128_nz(size) ? (hwaddr)int128_get64( \
3731                           int128_sub((size), int128_one())) : 0)
3732
3733void mtree_print_dispatch(fprintf_function mon, void *f,
3734                          AddressSpaceDispatch *d, MemoryRegion *root)
3735{
3736    int i;
3737
3738    mon(f, "  Dispatch\n");
3739    mon(f, "    Physical sections\n");
3740
3741    for (i = 0; i < d->map.sections_nb; ++i) {
3742        MemoryRegionSection *s = d->map.sections + i;
3743        const char *names[] = { " [unassigned]", " [not dirty]",
3744                                " [ROM]", " [watch]" };
3745
3746        mon(f, "      #%d @" TARGET_FMT_plx ".." TARGET_FMT_plx " %s%s%s%s%s",
3747            i,
3748            s->offset_within_address_space,
3749            s->offset_within_address_space + MR_SIZE(s->mr->size),
3750            s->mr->name ? s->mr->name : "(noname)",
3751            i < ARRAY_SIZE(names) ? names[i] : "",
3752            s->mr == root ? " [ROOT]" : "",
3753            s == d->mru_section ? " [MRU]" : "",
3754            s->mr->is_iommu ? " [iommu]" : "");
3755
3756        if (s->mr->alias) {
3757            mon(f, " alias=%s", s->mr->alias->name ?
3758                    s->mr->alias->name : "noname");
3759        }
3760        mon(f, "\n");
3761    }
3762
3763    mon(f, "    Nodes (%d bits per level, %d levels) ptr=[%d] skip=%d\n",
3764               P_L2_BITS, P_L2_LEVELS, d->phys_map.ptr, d->phys_map.skip);
3765    for (i = 0; i < d->map.nodes_nb; ++i) {
3766        int j, jprev;
3767        PhysPageEntry prev;
3768        Node *n = d->map.nodes + i;
3769
3770        mon(f, "      [%d]\n", i);
3771
3772        for (j = 0, jprev = 0, prev = *n[0]; j < ARRAY_SIZE(*n); ++j) {
3773            PhysPageEntry *pe = *n + j;
3774
3775            if (pe->ptr == prev.ptr && pe->skip == prev.skip) {
3776                continue;
3777            }
3778
3779            mtree_print_phys_entries(mon, f, jprev, j, prev.skip, prev.ptr);
3780
3781            jprev = j;
3782            prev = *pe;
3783        }
3784
3785        if (jprev != ARRAY_SIZE(*n)) {
3786            mtree_print_phys_entries(mon, f, jprev, j, prev.skip, prev.ptr);
3787        }
3788    }
3789}
3790
3791#endif
3792