qemu/exec.c
   1/*
   2 *  Virtual page mapping
   3 *
   4 *  Copyright (c) 2003 Fabrice Bellard
   5 *
   6 * This library is free software; you can redistribute it and/or
   7 * modify it under the terms of the GNU Lesser General Public
   8 * License as published by the Free Software Foundation; either
   9 * version 2 of the License, or (at your option) any later version.
  10 *
  11 * This library is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 * Lesser General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU Lesser General Public
  17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20#include "qemu/osdep.h"
  21#include "qemu-common.h"
  22#include "qapi/error.h"
  23
  24#include "qemu/cutils.h"
  25#include "cpu.h"
  26#include "exec/exec-all.h"
  27#include "exec/target_page.h"
  28#include "tcg/tcg.h"
  29#include "hw/qdev-core.h"
  30#include "hw/qdev-properties.h"
  31#if !defined(CONFIG_USER_ONLY)
  32#include "hw/core/cpu-exec-gpio.h"
  33#include "hw/boards.h"
  34#include "hw/xen/xen.h"
  35#endif
  36#include "sysemu/kvm.h"
  37#include "sysemu/sysemu.h"
  38#include "sysemu/tcg.h"
  39#include "sysemu/qtest.h"
  40#include "qemu/timer.h"
  41#include "qemu/config-file.h"
  42#include "qemu/error-report.h"
  43#include "qemu/qemu-print.h"
  44#if defined(CONFIG_USER_ONLY)
  45#include "qemu.h"
  46#else /* !CONFIG_USER_ONLY */
  47#include "exec/memory.h"
  48#include "exec/ioport.h"
  49#include "sysemu/dma.h"
  50#include "sysemu/hostmem.h"
  51#include "sysemu/hw_accel.h"
  52#include "exec/address-spaces.h"
  53#include "sysemu/xen-mapcache.h"
  54#include "trace-root.h"
  55
  56#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
  57#include <linux/falloc.h>
  58#endif
  59
  60#endif
  61#include "qemu/rcu_queue.h"
  62#include "qemu/main-loop.h"
  63#include "translate-all.h"
  64#include "sysemu/replay.h"
  65
  66#include "exec/memory-internal.h"
  67#include "exec/ram_addr.h"
  68#include "exec/log.h"
  69
  70#include "qemu/pmem.h"
  71
  72#include "migration/vmstate.h"
  73
  74#include "qemu/range.h"
  75#ifndef _WIN32
  76#include "qemu/mmap-alloc.h"
  77#endif
  78
  79#include "monitor/monitor.h"
  80
  81//#define DEBUG_SUBPAGE
  82
  83#if !defined(CONFIG_USER_ONLY)
  84/* ram_list is read under rcu_read_lock()/rcu_read_unlock().  Writes
  85 * are protected by the ramlist lock.
  86 */
  87RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };
  88
  89static MemoryRegion *system_memory;
  90static MemoryRegion *system_io;
  91
  92AddressSpace address_space_io;
  93AddressSpace address_space_memory;
  94
  95static MemoryRegion io_mem_unassigned;
  96#endif
  97
  98CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);
  99
 100/* current CPU in the current thread. It is only valid inside
 101   cpu_exec() */
 102__thread CPUState *current_cpu;
 103
 104uintptr_t qemu_host_page_size;
 105intptr_t qemu_host_page_mask;
 106
 107#if !defined(CONFIG_USER_ONLY)
 108/* 0 = Do not count executed instructions.
 109   1 = Precise instruction counting.
 110   2 = Adaptive rate instruction counting.  */
 111int use_icount;
 112
 113typedef struct PhysPageEntry PhysPageEntry;
 114
 115struct PhysPageEntry {
  116    /* How many bits to skip to the next level (in units of L2_SIZE). 0 for a leaf. */
 117    uint32_t skip : 6;
  118    /* index into phys_sections (!skip) or phys_map_nodes (skip) */
 119    uint32_t ptr : 26;
 120};
 121
 122#define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6)
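/* The all-ones value of the 26-bit 'ptr' field (0x3ffffff); it marks a
 * radix-tree slot whose child node has not been allocated yet.
 */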
 123
 124/* Size of the L2 (and L3, etc) page tables.  */
 125#define ADDR_SPACE_BITS 64
 126
 127#define P_L2_BITS 9
 128#define P_L2_SIZE (1 << P_L2_BITS)
 129
 130#define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)
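/* For example, with 4 KiB target pages (TARGET_PAGE_BITS == 12) this gives
 * ((64 - 12 - 1) / 9) + 1 = 6 levels, enough to resolve the remaining
 * 52 bits of page index at 9 bits per level.
 */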
 131
 132typedef PhysPageEntry Node[P_L2_SIZE];
 133
 134typedef struct PhysPageMap {
 135    struct rcu_head rcu;
 136
 137    unsigned sections_nb;
 138    unsigned sections_nb_alloc;
 139    unsigned nodes_nb;
 140    unsigned nodes_nb_alloc;
 141    Node *nodes;
 142    MemoryRegionSection *sections;
 143} PhysPageMap;
 144
 145struct AddressSpaceDispatch {
 146    MemoryRegionSection *mru_section;
 147    /* This is a multi-level map on the physical address space.
 148     * The bottom level has pointers to MemoryRegionSections.
 149     */
 150    PhysPageEntry phys_map;
 151    PhysPageMap map;
 152};
 153
 154#define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK)
 155typedef struct subpage_t {
 156    MemoryRegion iomem;
 157    FlatView *fv;
 158    hwaddr base;
 159    uint16_t sub_section[];
 160} subpage_t;
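/* sub_section[] holds one phys_sections index per byte offset within the
 * page (selected via SUBPAGE_IDX() above), so several MemoryRegionSections
 * can share a single target page.
 */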
 161
 162#define PHYS_SECTION_UNASSIGNED 0
 163
 164static void io_mem_init(void);
 165static void memory_map_init(void);
 166static void tcg_log_global_after_sync(MemoryListener *listener);
 167static void tcg_commit(MemoryListener *listener);
 168
 169/**
 170 * CPUAddressSpace: all the information a CPU needs about an AddressSpace
 171 * @cpu: the CPU whose AddressSpace this is
 172 * @as: the AddressSpace itself
 173 * @memory_dispatch: its dispatch pointer (cached, RCU protected)
 174 * @tcg_as_listener: listener for tracking changes to the AddressSpace
 175 */
 176struct CPUAddressSpace {
 177    CPUState *cpu;
 178    AddressSpace *as;
 179    struct AddressSpaceDispatch *memory_dispatch;
 180    MemoryListener tcg_as_listener;
 181};
 182
 183struct DirtyBitmapSnapshot {
 184    ram_addr_t start;
 185    ram_addr_t end;
 186    unsigned long dirty[];
 187};
 188
 189#endif
 190
 191#if !defined(CONFIG_USER_ONLY)
 192
 193static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
 194{
 195    static unsigned alloc_hint = 16;
 196    if (map->nodes_nb + nodes > map->nodes_nb_alloc) {
 197        map->nodes_nb_alloc = MAX(alloc_hint, map->nodes_nb + nodes);
 198        map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
 199        alloc_hint = map->nodes_nb_alloc;
 200    }
 201}
 202
 203static uint32_t phys_map_node_alloc(PhysPageMap *map, bool leaf)
 204{
 205    unsigned i;
 206    uint32_t ret;
 207    PhysPageEntry e;
 208    PhysPageEntry *p;
 209
 210    ret = map->nodes_nb++;
 211    p = map->nodes[ret];
 212    assert(ret != PHYS_MAP_NODE_NIL);
 213    assert(ret != map->nodes_nb_alloc);
 214
 215    e.skip = leaf ? 0 : 1;
 216    e.ptr = leaf ? PHYS_SECTION_UNASSIGNED : PHYS_MAP_NODE_NIL;
 217    for (i = 0; i < P_L2_SIZE; ++i) {
 218        memcpy(&p[i], &e, sizeof(e));
 219    }
 220    return ret;
 221}
 222
 223static void phys_page_set_level(PhysPageMap *map, PhysPageEntry *lp,
 224                                hwaddr *index, uint64_t *nb, uint16_t leaf,
 225                                int level)
 226{
 227    PhysPageEntry *p;
 228    hwaddr step = (hwaddr)1 << (level * P_L2_BITS);
 229
 230    if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
 231        lp->ptr = phys_map_node_alloc(map, level == 0);
 232    }
 233    p = map->nodes[lp->ptr];
 234    lp = &p[(*index >> (level * P_L2_BITS)) & (P_L2_SIZE - 1)];
 235
 236    while (*nb && lp < &p[P_L2_SIZE]) {
 237        if ((*index & (step - 1)) == 0 && *nb >= step) {
 238            lp->skip = 0;
 239            lp->ptr = leaf;
 240            *index += step;
 241            *nb -= step;
 242        } else {
 243            phys_page_set_level(map, lp, index, nb, leaf, level - 1);
 244        }
 245        ++lp;
 246    }
 247}
 248
 249static void phys_page_set(AddressSpaceDispatch *d,
 250                          hwaddr index, uint64_t nb,
 251                          uint16_t leaf)
 252{
 253    /* Wildly overreserve - it doesn't matter much. */
 254    phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS);
 255
 256    phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
 257}
 258
  259/* Compact a non-leaf page entry. Simply detect that the entry has a single child,
 260 * and update our entry so we can skip it and go directly to the destination.
 261 */
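/* Chains of single-child nodes collapse transitively: the child's 'skip'
 * count is added to ours, so a later lookup jumps from this entry straight
 * to the final destination in one step.
 */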
 262static void phys_page_compact(PhysPageEntry *lp, Node *nodes)
 263{
 264    unsigned valid_ptr = P_L2_SIZE;
 265    int valid = 0;
 266    PhysPageEntry *p;
 267    int i;
 268
 269    if (lp->ptr == PHYS_MAP_NODE_NIL) {
 270        return;
 271    }
 272
 273    p = nodes[lp->ptr];
 274    for (i = 0; i < P_L2_SIZE; i++) {
 275        if (p[i].ptr == PHYS_MAP_NODE_NIL) {
 276            continue;
 277        }
 278
 279        valid_ptr = i;
 280        valid++;
 281        if (p[i].skip) {
 282            phys_page_compact(&p[i], nodes);
 283        }
 284    }
 285
 286    /* We can only compress if there's only one child. */
 287    if (valid != 1) {
 288        return;
 289    }
 290
 291    assert(valid_ptr < P_L2_SIZE);
 292
 293    /* Don't compress if it won't fit in the # of bits we have. */
 294    if (P_L2_LEVELS >= (1 << 6) &&
 295        lp->skip + p[valid_ptr].skip >= (1 << 6)) {
 296        return;
 297    }
 298
 299    lp->ptr = p[valid_ptr].ptr;
 300    if (!p[valid_ptr].skip) {
 301        /* If our only child is a leaf, make this a leaf. */
 302        /* By design, we should have made this node a leaf to begin with so we
 303         * should never reach here.
 304         * But since it's so simple to handle this, let's do it just in case we
 305         * change this rule.
 306         */
 307        lp->skip = 0;
 308    } else {
 309        lp->skip += p[valid_ptr].skip;
 310    }
 311}
 312
 313void address_space_dispatch_compact(AddressSpaceDispatch *d)
 314{
 315    if (d->phys_map.skip) {
 316        phys_page_compact(&d->phys_map, d->map.nodes);
 317    }
 318}
 319
 320static inline bool section_covers_addr(const MemoryRegionSection *section,
 321                                       hwaddr addr)
 322{
 323    /* Memory topology clips a memory region to [0, 2^64); size.hi > 0 means
 324     * the section must cover the entire address space.
 325     */
 326    return int128_gethi(section->size) ||
 327           range_covers_byte(section->offset_within_address_space,
 328                             int128_getlo(section->size), addr);
 329}
 330
 331static MemoryRegionSection *phys_page_find(AddressSpaceDispatch *d, hwaddr addr)
 332{
 333    PhysPageEntry lp = d->phys_map, *p;
 334    Node *nodes = d->map.nodes;
 335    MemoryRegionSection *sections = d->map.sections;
 336    hwaddr index = addr >> TARGET_PAGE_BITS;
 337    int i;
 338
 339    for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) {
 340        if (lp.ptr == PHYS_MAP_NODE_NIL) {
 341            return &sections[PHYS_SECTION_UNASSIGNED];
 342        }
 343        p = nodes[lp.ptr];
 344        lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
 345    }
 346
 347    if (section_covers_addr(&sections[lp.ptr], addr)) {
 348        return &sections[lp.ptr];
 349    } else {
 350        return &sections[PHYS_SECTION_UNASSIGNED];
 351    }
 352}
 353
 354/* Called from RCU critical section */
 355static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
 356                                                        hwaddr addr,
 357                                                        bool resolve_subpage)
 358{
 359    MemoryRegionSection *section = atomic_read(&d->mru_section);
 360    subpage_t *subpage;
 361
 362    if (!section || section == &d->map.sections[PHYS_SECTION_UNASSIGNED] ||
 363        !section_covers_addr(section, addr)) {
 364        section = phys_page_find(d, addr);
 365        atomic_set(&d->mru_section, section);
 366    }
 367    if (resolve_subpage && section->mr->subpage) {
 368        subpage = container_of(section->mr, subpage_t, iomem);
 369        section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
 370    }
 371    return section;
 372}
 373
 374/* Called from RCU critical section */
 375static MemoryRegionSection *
 376address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *xlat,
 377                                 hwaddr *plen, bool resolve_subpage)
 378{
 379    MemoryRegionSection *section;
 380    MemoryRegion *mr;
 381    Int128 diff;
 382
 383    section = address_space_lookup_region(d, addr, resolve_subpage);
 384    /* Compute offset within MemoryRegionSection */
 385    addr -= section->offset_within_address_space;
 386
 387    /* Compute offset within MemoryRegion */
 388    *xlat = addr + section->offset_within_region;
 389
 390    mr = section->mr;
 391
 392    /* MMIO registers can be expected to perform full-width accesses based only
 393     * on their address, without considering adjacent registers that could
 394     * decode to completely different MemoryRegions.  When such registers
 395     * exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO
 396     * regions overlap wildly.  For this reason we cannot clamp the accesses
 397     * here.
 398     *
 399     * If the length is small (as is the case for address_space_ldl/stl),
 400     * everything works fine.  If the incoming length is large, however,
 401     * the caller really has to do the clamping through memory_access_size.
 402     */
 403    if (memory_region_is_ram(mr)) {
 404        diff = int128_sub(section->size, int128_make64(addr));
 405        *plen = int128_get64(int128_min(diff, int128_make64(*plen)));
 406    }
 407    return section;
 408}
 409
 410/**
 411 * address_space_translate_iommu - translate an address through an IOMMU
 412 * memory region and then through the target address space.
 413 *
 414 * @iommu_mr: the IOMMU memory region that we start the translation from
  415 * @xlat: on entry, the address to be translated through the IOMMU;
  416 *        on return, the translated address offset within the destination
  417 *        memory region.  It cannot be %NULL.
 418 * @plen_out: valid read/write length of the translated address. It
 419 *            cannot be %NULL.
  420 * @page_mask_out: page mask for the translated address.  This is only
  421 *            meaningful for IOMMU-translated addresses, since the IOMMU
  422 *            may map pages larger than TARGET_PAGE_SIZE.  It can be
  423 *            %NULL if we don't care about it.
 424 * @is_write: whether the translation operation is for write
 425 * @is_mmio: whether this can be MMIO, set true if it can
 426 * @target_as: the address space targeted by the IOMMU
 427 * @attrs: transaction attributes
 428 *
 429 * This function is called from RCU critical section.  It is the common
 430 * part of flatview_do_translate and address_space_translate_cached.
 431 */
 432static MemoryRegionSection address_space_translate_iommu(IOMMUMemoryRegion *iommu_mr,
 433                                                         hwaddr *xlat,
 434                                                         hwaddr *plen_out,
 435                                                         hwaddr *page_mask_out,
 436                                                         bool is_write,
 437                                                         bool is_mmio,
 438                                                         AddressSpace **target_as,
 439                                                         MemTxAttrs attrs)
 440{
 441    MemoryRegionSection *section;
 442    hwaddr page_mask = (hwaddr)-1;
 443
 444    do {
 445        hwaddr addr = *xlat;
 446        IOMMUMemoryRegionClass *imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
 447        int iommu_idx = 0;
 448        IOMMUTLBEntry iotlb;
 449
 450        if (imrc->attrs_to_index) {
 451            iommu_idx = imrc->attrs_to_index(iommu_mr, attrs);
 452        }
 453
 454        if (imrc->translate_attr) {
 455            iotlb = imrc->translate_attr(iommu_mr, addr, is_write, &attrs);
 456        } else {
 457            iotlb = imrc->translate(iommu_mr, addr, is_write ?
 458                                    IOMMU_WO : IOMMU_RO, iommu_idx);
 459        }
 460
 461        if (!(iotlb.perm & (1 << is_write))) {
 462            goto unassigned;
 463        }
 464
 465        addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
 466                | (addr & iotlb.addr_mask));
 467        page_mask &= iotlb.addr_mask;
 468        *plen_out = MIN(*plen_out, (addr | iotlb.addr_mask) - addr + 1);
 469        *target_as = iotlb.target_as;
 470
 471        section = address_space_translate_internal(
 472                address_space_to_dispatch(iotlb.target_as), addr, xlat,
 473                plen_out, is_mmio);
 474
 475        iommu_mr = memory_region_get_iommu(section->mr);
 476    } while (unlikely(iommu_mr));
 477
 478    if (page_mask_out) {
 479        *page_mask_out = page_mask;
 480    }
 481    return *section;
 482
 483unassigned:
 484    return (MemoryRegionSection) { .mr = &io_mem_unassigned };
 485}
 486
 487/**
 488 * flatview_do_translate - translate an address in FlatView
 489 *
 490 * @fv: the flat view that we want to translate on
 491 * @addr: the address to be translated in above address space
 492 * @xlat: the translated address offset within memory region. It
 493 *        cannot be @NULL.
 494 * @plen_out: valid read/write length of the translated address. It
 495 *            can be @NULL when we don't care about it.
  496 * @page_mask_out: page mask for the translated address.  This is only
  497 *            meaningful for IOMMU-translated addresses, since the IOMMU
  498 *            may map pages larger than TARGET_PAGE_SIZE.  It can be
  499 *            @NULL if we don't care about it.
 500 * @is_write: whether the translation operation is for write
 501 * @is_mmio: whether this can be MMIO, set true if it can
 502 * @target_as: the address space targeted by the IOMMU
 503 * @attrs: memory transaction attributes
 504 *
 505 * This function is called from RCU critical section
 506 */
 507static MemoryRegionSection flatview_do_translate(FlatView *fv,
 508                                                 hwaddr addr,
 509                                                 hwaddr *xlat,
 510                                                 hwaddr *plen_out,
 511                                                 hwaddr *page_mask_out,
 512                                                 bool is_write,
 513                                                 bool is_mmio,
 514                                                 AddressSpace **target_as,
 515                                                 MemTxAttrs attrs)
 516{
 517    MemoryRegionSection *section;
 518    IOMMUMemoryRegion *iommu_mr;
 519    hwaddr plen = (hwaddr)(-1);
 520
 521    if (!plen_out) {
 522        plen_out = &plen;
 523    }
 524
 525    section = address_space_translate_internal(
 526            flatview_to_dispatch(fv), addr, xlat,
 527            plen_out, is_mmio);
 528
 529    iommu_mr = memory_region_get_iommu(section->mr);
 530    if (unlikely(iommu_mr)) {
 531        return address_space_translate_iommu(iommu_mr, xlat,
 532                                             plen_out, page_mask_out,
 533                                             is_write, is_mmio,
 534                                             target_as, attrs);
 535    }
 536    if (page_mask_out) {
 537        /* Not behind an IOMMU, use default page size. */
 538        *page_mask_out = ~TARGET_PAGE_MASK;
 539    }
 540
 541    return *section;
 542}
 543
 544/* Called from RCU critical section */
 545IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr,
 546                                            bool is_write, MemTxAttrs attrs)
 547{
 548    MemoryRegionSection section;
 549    hwaddr xlat, page_mask;
 550
 551    /*
  552     * This can never be MMIO; we don't really care about plen,
  553     * only about the page mask.
 554     */
 555    section = flatview_do_translate(address_space_to_flatview(as), addr, &xlat,
 556                                    NULL, &page_mask, is_write, false, &as,
 557                                    attrs);
 558
 559    /* Illegal translation */
 560    if (section.mr == &io_mem_unassigned) {
 561        goto iotlb_fail;
 562    }
 563
 564    /* Convert memory region offset into address space offset */
 565    xlat += section.offset_within_address_space -
 566        section.offset_within_region;
 567
 568    return (IOMMUTLBEntry) {
 569        .target_as = as,
 570        .iova = addr & ~page_mask,
 571        .translated_addr = xlat & ~page_mask,
 572        .addr_mask = page_mask,
  573        /* IOTLBs are for DMA, and DMA is only allowed on RAM. */
 574        .perm = IOMMU_RW,
 575    };
 576
 577iotlb_fail:
 578    return (IOMMUTLBEntry) {0};
 579}
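/*
 * Example (sketch, not a real caller): inside an RCU critical section a
 * device model could query the mapping covering a guest address like this:
 *
 *     IOMMUTLBEntry entry;
 *
 *     entry = address_space_get_iotlb_entry(as, iova, false,
 *                                           MEMTXATTRS_UNSPECIFIED);
 *     if (entry.perm != IOMMU_NONE) {
 *         // [entry.iova, entry.iova + entry.addr_mask] translates to
 *         // entry.translated_addr within entry.target_as
 *     }
 */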
 580
 581/* Called from RCU critical section */
 582MemoryRegion *flatview_translate(FlatView *fv, hwaddr addr, hwaddr *xlat,
 583                                 hwaddr *plen, bool is_write,
 584                                 MemTxAttrs attrs)
 585{
 586    MemoryRegion *mr;
 587    MemoryRegionSection section;
 588    AddressSpace *as = NULL;
 589
  590    /* This can be MMIO, so set up the MMIO bit. */
 591    section = flatview_do_translate(fv, addr, xlat, plen, NULL,
 592                                    is_write, true, &as, attrs);
 593    mr = section.mr;
 594
 595    if (xen_enabled() && memory_access_is_direct(mr, is_write)) {
 596        hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr;
 597        *plen = MIN(page, *plen);
 598    }
 599
 600    return mr;
 601}
 602
 603typedef struct TCGIOMMUNotifier {
 604    IOMMUNotifier n;
 605    MemoryRegion *mr;
 606    CPUState *cpu;
 607    int iommu_idx;
 608    bool active;
 609} TCGIOMMUNotifier;
 610
 611static void tcg_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
 612{
 613    TCGIOMMUNotifier *notifier = container_of(n, TCGIOMMUNotifier, n);
 614
 615    if (!notifier->active) {
 616        return;
 617    }
 618    tlb_flush(notifier->cpu);
 619    notifier->active = false;
 620    /* We leave the notifier struct on the list to avoid reallocating it later.
 621     * Generally the number of IOMMUs a CPU deals with will be small.
 622     * In any case we can't unregister the iommu notifier from a notify
 623     * callback.
 624     */
 625}
 626
 627static void tcg_register_iommu_notifier(CPUState *cpu,
 628                                        IOMMUMemoryRegion *iommu_mr,
 629                                        int iommu_idx)
 630{
 631    /* Make sure this CPU has an IOMMU notifier registered for this
 632     * IOMMU/IOMMU index combination, so that we can flush its TLB
 633     * when the IOMMU tells us the mappings we've cached have changed.
 634     */
 635    MemoryRegion *mr = MEMORY_REGION(iommu_mr);
 636    TCGIOMMUNotifier *notifier;
 637    Error *err = NULL;
 638    int i, ret;
 639
 640    for (i = 0; i < cpu->iommu_notifiers->len; i++) {
 641        notifier = g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i);
 642        if (notifier->mr == mr && notifier->iommu_idx == iommu_idx) {
 643            break;
 644        }
 645    }
 646    if (i == cpu->iommu_notifiers->len) {
 647        /* Not found, add a new entry at the end of the array */
 648        cpu->iommu_notifiers = g_array_set_size(cpu->iommu_notifiers, i + 1);
 649        notifier = g_new0(TCGIOMMUNotifier, 1);
 650        g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i) = notifier;
 651
 652        notifier->mr = mr;
 653        notifier->iommu_idx = iommu_idx;
 654        notifier->cpu = cpu;
 655        /* Rather than trying to register interest in the specific part
 656         * of the iommu's address space that we've accessed and then
 657         * expand it later as subsequent accesses touch more of it, we
 658         * just register interest in the whole thing, on the assumption
 659         * that iommu reconfiguration will be rare.
 660         */
 661        iommu_notifier_init(&notifier->n,
 662                            tcg_iommu_unmap_notify,
 663                            IOMMU_NOTIFIER_UNMAP,
 664                            0,
 665                            HWADDR_MAX,
 666                            iommu_idx);
 667        ret = memory_region_register_iommu_notifier(notifier->mr, &notifier->n,
 668                                                    &err);
 669        if (ret) {
 670            error_report_err(err);
 671            exit(1);
 672        }
 673    }
 674
 675    if (!notifier->active) {
 676        notifier->active = true;
 677    }
 678}
 679
 680static void tcg_iommu_free_notifier_list(CPUState *cpu)
 681{
 682    /* Destroy the CPU's notifier list */
 683    int i;
 684    TCGIOMMUNotifier *notifier;
 685
 686    for (i = 0; i < cpu->iommu_notifiers->len; i++) {
 687        notifier = g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i);
 688        memory_region_unregister_iommu_notifier(notifier->mr, &notifier->n);
 689        g_free(notifier);
 690    }
 691    g_array_free(cpu->iommu_notifiers, true);
 692}
 693
 694/* Called from RCU critical section */
 695MemoryRegionSection *
 696address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr,
 697                                  hwaddr *xlat, hwaddr *plen,
 698                                  MemTxAttrs attrs, int *prot)
 699{
 700    MemoryRegionSection *section;
 701    IOMMUMemoryRegion *iommu_mr;
 702    IOMMUMemoryRegionClass *imrc;
 703    IOMMUTLBEntry iotlb;
 704    int iommu_idx;
 705    AddressSpaceDispatch *d = atomic_rcu_read(&cpu->cpu_ases[asidx].memory_dispatch);
 706    struct {
 707        AddressSpace *as;
 708        MemoryRegionSection *section;
 709        hwaddr addr;
 710        hwaddr plen;
 711    } root = {0};
 712
 713    root.as = cpu->cpu_ases[asidx].as;
 714    root.addr = addr;
 715    iotlb.target_as = root.as;
 716
 717    for (;;) {
 718        section = address_space_translate_internal(d, addr, &addr, plen, false);
 719        if (!root.section) {
 720            root.section = section;
 721            root.plen = *plen;
 722        }
 723
 724        iommu_mr = memory_region_get_iommu(section->mr);
 725        if (!iommu_mr) {
 726            break;
 727        }
 728
 729        imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
 730
 731        iommu_idx = imrc->attrs_to_index(iommu_mr, attrs);
 732        tcg_register_iommu_notifier(cpu, iommu_mr, iommu_idx);
 733        /* We need all the permissions, so pass IOMMU_NONE so the IOMMU
 734         * doesn't short-cut its translation table walk.
 735         */
 736        if (imrc->translate_attr) {
 737            iotlb = imrc->translate_attr(iommu_mr, addr, IOMMU_NONE, &attrs);
 738        } else {
 739            iotlb = imrc->translate(iommu_mr, addr, IOMMU_NONE, iommu_idx);
 740        }
 741        addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
 742                | (addr & iotlb.addr_mask));
 743        /* Update the caller's prot bits to remove permissions the IOMMU
 744         * is giving us a failure response for. If we get down to no
 745         * permissions left at all we can give up now.
 746         */
 747        if (!(iotlb.perm & IOMMU_RO)) {
 748            *prot &= ~(PAGE_READ | PAGE_EXEC);
 749        }
 750        if (!(iotlb.perm & IOMMU_WO)) {
 751            *prot &= ~PAGE_WRITE;
 752        }
 753
 754        if (!*prot) {
 755            goto translate_fail;
 756        }
 757
 758        d = flatview_to_dispatch(address_space_to_flatview(iotlb.target_as));
 759    }
 760
 761    assert(!memory_region_is_iommu(section->mr));
 762    if (!memory_region_is_ram(section->mr) && iotlb.target_as != root.as) {
 763        section = root.section;
 764        addr = root.addr;
 765        *plen = root.plen;
 766    }
 767
 768    *xlat = addr;
 769    return section;
 770
 771translate_fail:
 772    return &d->map.sections[PHYS_SECTION_UNASSIGNED];
 773}
 774#endif
 775
 776#if !defined(CONFIG_USER_ONLY)
 777
 778static int cpu_common_post_load(void *opaque, int version_id)
 779{
 780    CPUState *cpu = opaque;
 781
 782    /* 0x01 was CPU_INTERRUPT_EXIT. This line can be removed when the
 783       version_id is increased. */
 784    cpu->interrupt_request &= ~0x01;
 785    tlb_flush(cpu);
 786
 787    /* loadvm has just updated the content of RAM, bypassing the
 788     * usual mechanisms that ensure we flush TBs for writes to
 789     * memory we've translated code from. So we must flush all TBs,
 790     * which will now be stale.
 791     */
 792    tb_flush(cpu);
 793
 794    return 0;
 795}
 796
 797static int cpu_common_pre_load(void *opaque)
 798{
 799    CPUState *cpu = opaque;
 800
 801    cpu->exception_index = -1;
 802
 803    return 0;
 804}
 805
 806static bool cpu_common_exception_index_needed(void *opaque)
 807{
 808    CPUState *cpu = opaque;
 809
 810    return tcg_enabled() && cpu->exception_index != -1;
 811}
 812
 813static const VMStateDescription vmstate_cpu_common_exception_index = {
 814    .name = "cpu_common/exception_index",
 815    .version_id = 1,
 816    .minimum_version_id = 1,
 817    .needed = cpu_common_exception_index_needed,
 818    .fields = (VMStateField[]) {
 819        VMSTATE_INT32(exception_index, CPUState),
 820        VMSTATE_END_OF_LIST()
 821    }
 822};
 823
 824static bool cpu_common_crash_occurred_needed(void *opaque)
 825{
 826    CPUState *cpu = opaque;
 827
 828    return cpu->crash_occurred;
 829}
 830
 831static const VMStateDescription vmstate_cpu_common_crash_occurred = {
 832    .name = "cpu_common/crash_occurred",
 833    .version_id = 1,
 834    .minimum_version_id = 1,
 835    .needed = cpu_common_crash_occurred_needed,
 836    .fields = (VMStateField[]) {
 837        VMSTATE_BOOL(crash_occurred, CPUState),
 838        VMSTATE_END_OF_LIST()
 839    }
 840};
 841
 842const VMStateDescription vmstate_cpu_common = {
 843    .name = "cpu_common",
 844    .version_id = 1,
 845    .minimum_version_id = 1,
 846    .pre_load = cpu_common_pre_load,
 847    .post_load = cpu_common_post_load,
 848    .fields = (VMStateField[]) {
 849        VMSTATE_UINT32(halted, CPUState),
 850        VMSTATE_UINT32(interrupt_request, CPUState),
 851        VMSTATE_END_OF_LIST()
 852    },
 853    .subsections = (const VMStateDescription*[]) {
 854        &vmstate_cpu_common_exception_index,
 855        &vmstate_cpu_common_crash_occurred,
 856        NULL
 857    }
 858};
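/* The subsections above are only included in the migration stream when
 * their .needed() callback returns true, keeping the wire format compatible
 * with older QEMU versions that predate them.
 */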
 859
 860#endif
 861
 862CPUState *qemu_get_cpu(int index)
 863{
 864    CPUState *cpu;
 865
 866    CPU_FOREACH(cpu) {
 867        if (cpu->cpu_index == index) {
 868            return cpu;
 869        }
 870    }
 871
 872    return NULL;
 873}
 874
 875#if !defined(CONFIG_USER_ONLY)
 876void cpu_address_space_init(CPUState *cpu, int asidx,
 877                            const char *prefix, MemoryRegion *mr)
 878{
 879    CPUAddressSpace *newas;
 880    AddressSpace *as = g_new0(AddressSpace, 1);
 881    char *as_name;
 882
 883    assert(mr);
 884    as_name = g_strdup_printf("%s-%d", prefix, cpu->cpu_index);
 885    address_space_init(as, mr, as_name);
 886    g_free(as_name);
 887
 888    /* Target code should have set num_ases before calling us */
 889    assert(asidx < cpu->num_ases);
 890
 891    if (asidx == 0) {
 892        /* address space 0 gets the convenience alias */
 893        cpu->as = as;
 894    }
 895
 896    /* KVM cannot currently support multiple address spaces. */
 897    assert(asidx == 0 || !kvm_enabled());
 898
 899    if (!cpu->cpu_ases) {
 900        cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases);
 901    }
 902
 903    newas = &cpu->cpu_ases[asidx];
 904    newas->cpu = cpu;
 905    newas->as = as;
 906    if (tcg_enabled()) {
 907        newas->tcg_as_listener.log_global_after_sync = tcg_log_global_after_sync;
 908        newas->tcg_as_listener.commit = tcg_commit;
 909        memory_listener_register(&newas->tcg_as_listener, as);
 910    }
 911}
 912
 913AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx)
 914{
 915    /* Return the AddressSpace corresponding to the specified index */
 916    return cpu->cpu_ases[asidx].as;
 917}
 918#endif
 919
 920void cpu_exec_unrealizefn(CPUState *cpu)
 921{
 922    CPUClass *cc = CPU_GET_CLASS(cpu);
 923
 924    cpu_list_remove(cpu);
 925
 926    if (cc->vmsd != NULL) {
 927        vmstate_unregister(NULL, cc->vmsd, cpu);
 928    }
 929    if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
 930        vmstate_unregister(NULL, &vmstate_cpu_common, cpu);
 931    }
 932#ifndef CONFIG_USER_ONLY
 933    tcg_iommu_free_notifier_list(cpu);
 934#endif
 935}
 936
 937Property cpu_common_props[] = {
 938#ifndef CONFIG_USER_ONLY
 939    /* Create a memory property for softmmu CPU object,
 940     * so users can wire up its memory. (This can't go in hw/core/cpu.c
 941     * because that file is compiled only once for both user-mode
 942     * and system builds.) The default if no link is set up is to use
 943     * the system address space.
 944     */
 945    DEFINE_PROP_LINK("memory", CPUState, memory, TYPE_MEMORY_REGION,
 946                     MemoryRegion *),
 947#endif
  948    DEFINE_PROP_BOOL("halt", CPUState, halt_pin, false),
 949    DEFINE_PROP_STRING("gdb-id", CPUState, gdb_id),
 950    DEFINE_PROP_END_OF_LIST(),
 951};
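/*
 * Example (sketch): a board model can point a CPU at a private memory view
 * by setting the "memory" link property on the CPU object before it is
 * realized (e.g. via object_property_set_link()); if the link is left
 * unset, cpu_exec_initfn() below falls back to the global system memory.
 */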
 952
 953void cpu_exec_reset(CPUState *cpu)
 954{
 955#ifndef CONFIG_USER_ONLY
  956    /* Re-apply the requested halt/reset pin state so it is not lost across the reset */
 957    bool old_halt = cpu->halt_pin;
 958    bool old_reset = cpu->reset_pin;
 959
 960    cpu_halt_gpio(cpu, 0, old_halt);
 961    cpu_reset_gpio(cpu, 0, old_reset);
 962#endif
 963}
 964
 965void cpu_exec_initfn(CPUState *cpu)
 966{
 967    cpu->as = NULL;
 968    cpu->num_ases = 0;
 969
 970#ifndef CONFIG_USER_ONLY
 971    cpu->thread_id = qemu_get_thread_id();
 972    cpu->memory = system_memory;
 973    object_ref(OBJECT(cpu->memory));
 974
 975    /* Xilinx: The GPIO lines we use */
 976    qdev_init_gpio_in_named(DEVICE(cpu), cpu_reset_gpio, "reset", 1);
 977    qdev_init_gpio_in_named(DEVICE(cpu), cpu_halt_gpio, "halt", 1);
 978#endif
 979}
 980
 981void cpu_exec_realizefn(CPUState *cpu, Error **errp)
 982{
 983    CPUClass *cc = CPU_GET_CLASS(cpu);
 984    static bool tcg_target_initialized;
 985
 986    cpu_list_add(cpu);
 987
 988    if (tcg_enabled() && !tcg_target_initialized) {
 989        tcg_target_initialized = true;
 990        cc->tcg_initialize();
 991    }
 992    tlb_init(cpu);
 993
 994    qemu_plugin_vcpu_init_hook(cpu);
 995
 996#ifdef CONFIG_USER_ONLY
 997    assert(cc->vmsd == NULL);
 998#else /* !CONFIG_USER_ONLY */
 999    if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
1000        vmstate_register(NULL, cpu->cpu_index, &vmstate_cpu_common, cpu);
1001    }
1002    if (cc->vmsd != NULL) {
1003        vmstate_register(NULL, cpu->cpu_index, cc->vmsd, cpu);
1004    }
1005
1006    cpu->iommu_notifiers = g_array_new(false, true, sizeof(TCGIOMMUNotifier *));
1007#endif
1008}
1009
1010const char *parse_cpu_option(const char *cpu_option)
1011{
1012    ObjectClass *oc;
1013    CPUClass *cc;
1014    gchar **model_pieces;
1015    const char *cpu_type;
1016
1017    model_pieces = g_strsplit(cpu_option, ",", 2);
1018    if (!model_pieces[0]) {
1019        error_report("-cpu option cannot be empty");
1020        exit(1);
1021    }
1022
1023    oc = cpu_class_by_name(CPU_RESOLVING_TYPE, model_pieces[0]);
1024    if (oc == NULL) {
1025        error_report("unable to find CPU model '%s'", model_pieces[0]);
1026        g_strfreev(model_pieces);
1027        exit(EXIT_FAILURE);
1028    }
1029
1030    cpu_type = object_class_get_name(oc);
1031    cc = CPU_CLASS(oc);
1032    cc->parse_features(cpu_type, model_pieces[1], &error_fatal);
1033    g_strfreev(model_pieces);
1034    return cpu_type;
1035}
1036
1037#if defined(CONFIG_USER_ONLY)
1038void tb_invalidate_phys_addr(target_ulong addr)
1039{
1040    mmap_lock();
1041    tb_invalidate_phys_page_range(addr, addr + 1);
1042    mmap_unlock();
1043}
1044
1045static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
1046{
1047    tb_invalidate_phys_addr(pc);
1048}
1049#else
1050void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr, MemTxAttrs attrs)
1051{
1052    ram_addr_t ram_addr;
1053    MemoryRegion *mr;
1054    hwaddr l = 1;
1055
1056    if (!tcg_enabled()) {
1057        return;
1058    }
1059
1060    RCU_READ_LOCK_GUARD();
1061    mr = address_space_translate(as, addr, &addr, &l, false, attrs);
1062    if (!(memory_region_is_ram(mr)
1063          || memory_region_is_romd(mr))) {
1064        return;
1065    }
1066    ram_addr = memory_region_get_ram_addr(mr) + addr;
1067    tb_invalidate_phys_page_range(ram_addr, ram_addr + 1);
1068}
1069
1070static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
1071{
1072    /*
1073     * There may not be a virtual to physical translation for the pc
1074     * right now, but there may exist cached TB for this pc.
1075     * Flush the whole TB cache to force re-translation of such TBs.
1076     * This is heavyweight, but we're debugging anyway.
1077     */
1078    tb_flush(cpu);
1079}
1080#endif
1081
1082#ifndef CONFIG_USER_ONLY
1083/* Add a watchpoint.  */
1084int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
1085                          int flags, CPUWatchpoint **watchpoint)
1086{
1087    CPUWatchpoint *wp;
1088
1089    /* forbid ranges which are empty or run off the end of the address space */
1090    if (len == 0 || (addr + len - 1) < addr) {
1091        error_report("tried to set invalid watchpoint at %"
1092                     VADDR_PRIx ", len=%" VADDR_PRIu, addr, len);
1093        return -EINVAL;
1094    }
1095    wp = g_malloc(sizeof(*wp));
1096
1097    wp->vaddr = addr;
1098    wp->len = len;
1099    wp->flags = flags;
1100
1101    /* keep all GDB-injected watchpoints in front */
1102    if (flags & BP_GDB) {
1103        QTAILQ_INSERT_HEAD(&cpu->watchpoints, wp, entry);
1104    } else {
1105        QTAILQ_INSERT_TAIL(&cpu->watchpoints, wp, entry);
1106    }
1107
1108    tlb_flush_page(cpu, addr);
1109
1110    if (watchpoint)
1111        *watchpoint = wp;
1112    return 0;
1113}
1114
1115/* Remove a specific watchpoint.  */
1116int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
1117                          int flags)
1118{
1119    CPUWatchpoint *wp;
1120
1121    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
1122        if (addr == wp->vaddr && len == wp->len
1123                && flags == (wp->flags & ~BP_WATCHPOINT_HIT)) {
1124            cpu_watchpoint_remove_by_ref(cpu, wp);
1125            return 0;
1126        }
1127    }
1128    return -ENOENT;
1129}
1130
1131/* Remove a specific watchpoint by reference.  */
1132void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
1133{
1134    QTAILQ_REMOVE(&cpu->watchpoints, watchpoint, entry);
1135
1136    tlb_flush_page(cpu, watchpoint->vaddr);
1137
1138    g_free(watchpoint);
1139}
1140
1141/* Remove all matching watchpoints.  */
1142void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
1143{
1144    CPUWatchpoint *wp, *next;
1145
1146    QTAILQ_FOREACH_SAFE(wp, &cpu->watchpoints, entry, next) {
1147        if (wp->flags & mask) {
1148            cpu_watchpoint_remove_by_ref(cpu, wp);
1149        }
1150    }
1151}
1152
1153/* Return true if this watchpoint address matches the specified
1154 * access (ie the address range covered by the watchpoint overlaps
1155 * partially or completely with the address range covered by the
1156 * access).
1157 */
1158static inline bool watchpoint_address_matches(CPUWatchpoint *wp,
1159                                              vaddr addr, vaddr len)
1160{
1161    /* We know the lengths are non-zero, but a little caution is
1162     * required to avoid errors in the case where the range ends
1163     * exactly at the top of the address space and so addr + len
1164     * wraps round to zero.
1165     */
1166    vaddr wpend = wp->vaddr + wp->len - 1;
1167    vaddr addrend = addr + len - 1;
1168
1169    return !(addr > wpend || wp->vaddr > addrend);
1170}
1171
 1172/* Return the combined flags of all watchpoints that overlap [addr, addr + len).  */
1173int cpu_watchpoint_address_matches(CPUState *cpu, vaddr addr, vaddr len)
1174{
1175    CPUWatchpoint *wp;
1176    int ret = 0;
1177
1178    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
1179        if (watchpoint_address_matches(wp, addr, len)) {
1180            ret |= wp->flags;
1181        }
1182    }
1183    return ret;
1184}
1185#endif /* !CONFIG_USER_ONLY */
1186
1187/* Add a breakpoint.  */
1188int cpu_breakpoint_insert(CPUState *cpu, vaddr pc, int flags,
1189                          CPUBreakpoint **breakpoint)
1190{
1191    CPUBreakpoint *bp;
1192
1193    bp = g_malloc(sizeof(*bp));
1194
1195    bp->pc = pc;
1196    bp->flags = flags;
1197
1198    /* keep all GDB-injected breakpoints in front */
1199    if (flags & BP_GDB) {
1200        QTAILQ_INSERT_HEAD(&cpu->breakpoints, bp, entry);
1201    } else {
1202        QTAILQ_INSERT_TAIL(&cpu->breakpoints, bp, entry);
1203    }
1204
1205    breakpoint_invalidate(cpu, pc);
1206
1207    if (breakpoint) {
1208        *breakpoint = bp;
1209    }
1210    return 0;
1211}
1212
1213/* Remove a specific breakpoint.  */
1214int cpu_breakpoint_remove(CPUState *cpu, vaddr pc, int flags)
1215{
1216    CPUBreakpoint *bp;
1217
1218    QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
1219        if (bp->pc == pc && bp->flags == flags) {
1220            cpu_breakpoint_remove_by_ref(cpu, bp);
1221            return 0;
1222        }
1223    }
1224    return -ENOENT;
1225}
1226
1227/* Remove a specific breakpoint by reference.  */
1228void cpu_breakpoint_remove_by_ref(CPUState *cpu, CPUBreakpoint *breakpoint)
1229{
1230    QTAILQ_REMOVE(&cpu->breakpoints, breakpoint, entry);
1231
1232    breakpoint_invalidate(cpu, breakpoint->pc);
1233
1234    g_free(breakpoint);
1235}
1236
1237/* Remove all matching breakpoints. */
1238void cpu_breakpoint_remove_all(CPUState *cpu, int mask)
1239{
1240    CPUBreakpoint *bp, *next;
1241
1242    QTAILQ_FOREACH_SAFE(bp, &cpu->breakpoints, entry, next) {
1243        if (bp->flags & mask) {
1244            cpu_breakpoint_remove_by_ref(cpu, bp);
1245        }
1246    }
1247}
1248
1249/* enable or disable single step mode. EXCP_DEBUG is returned by the
1250   CPU loop after each instruction */
1251void cpu_single_step(CPUState *cpu, int enabled)
1252{
1253    if (cpu->singlestep_enabled != enabled) {
1254        cpu->singlestep_enabled = enabled;
1255        if (kvm_enabled()) {
1256            kvm_update_guest_debug(cpu, 0);
1257        } else {
1258            /* must flush all the translated code to avoid inconsistencies */
1259            /* XXX: only flush what is necessary */
1260            tb_flush(cpu);
1261        }
1262    }
1263}
1264
1265void cpu_abort(CPUState *cpu, const char *fmt, ...)
1266{
1267    va_list ap;
1268    va_list ap2;
1269
1270    va_start(ap, fmt);
1271    va_copy(ap2, ap);
1272    fprintf(stderr, "qemu: fatal: ");
1273    vfprintf(stderr, fmt, ap);
1274    fprintf(stderr, "\n");
1275    cpu_dump_state(cpu, stderr, CPU_DUMP_FPU | CPU_DUMP_CCOP);
1276    if (qemu_log_separate()) {
1277        FILE *logfile = qemu_log_lock();
1278        qemu_log("qemu: fatal: ");
1279        qemu_log_vprintf(fmt, ap2);
1280        qemu_log("\n");
1281        log_cpu_state(cpu, CPU_DUMP_FPU | CPU_DUMP_CCOP);
1282        qemu_log_flush();
1283        qemu_log_unlock(logfile);
1284        qemu_log_close();
1285    }
1286    va_end(ap2);
1287    va_end(ap);
1288    replay_finish();
1289#if defined(CONFIG_USER_ONLY)
1290    {
1291        struct sigaction act;
1292        sigfillset(&act.sa_mask);
1293        act.sa_handler = SIG_DFL;
1294        act.sa_flags = 0;
1295        sigaction(SIGABRT, &act, NULL);
1296    }
1297#endif
1298    abort();
1299}
1300
1301#if !defined(CONFIG_USER_ONLY)
1302/* Called from RCU critical section */
1303static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
1304{
1305    RAMBlock *block;
1306
1307    block = atomic_rcu_read(&ram_list.mru_block);
1308    if (block && addr - block->offset < block->max_length) {
1309        return block;
1310    }
1311    RAMBLOCK_FOREACH(block) {
1312        if (addr - block->offset < block->max_length) {
1313            goto found;
1314        }
1315    }
1316
1317    fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
1318    abort();
1319
1320found:
1321    /* It is safe to write mru_block outside the iothread lock.  This
1322     * is what happens:
1323     *
1324     *     mru_block = xxx
1325     *     rcu_read_unlock()
1326     *                                        xxx removed from list
1327     *                  rcu_read_lock()
1328     *                  read mru_block
1329     *                                        mru_block = NULL;
1330     *                                        call_rcu(reclaim_ramblock, xxx);
1331     *                  rcu_read_unlock()
1332     *
1333     * atomic_rcu_set is not needed here.  The block was already published
1334     * when it was placed into the list.  Here we're just making an extra
1335     * copy of the pointer.
1336     */
1337    ram_list.mru_block = block;
1338    return block;
1339}
1340
1341static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
1342{
1343    CPUState *cpu;
1344    ram_addr_t start1;
1345    RAMBlock *block;
1346    ram_addr_t end;
1347
1348    assert(tcg_enabled());
1349    end = TARGET_PAGE_ALIGN(start + length);
1350    start &= TARGET_PAGE_MASK;
1351
1352    RCU_READ_LOCK_GUARD();
1353    block = qemu_get_ram_block(start);
1354    assert(block == qemu_get_ram_block(end - 1));
1355    start1 = (uintptr_t)ramblock_ptr(block, start - block->offset);
1356    CPU_FOREACH(cpu) {
1357        tlb_reset_dirty(cpu, start1, length);
1358    }
1359}
1360
1361/* Note: start and end must be within the same ram block.  */
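/* Returns true if any page in [start, start + length) was dirty for @client.
 * The corresponding dirty bits are cleared and, when TCG is enabled, the
 * affected TLB entries are reset so that later writes are trapped again.
 */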
1362bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
1363                                              ram_addr_t length,
1364                                              unsigned client)
1365{
1366    DirtyMemoryBlocks *blocks;
1367    unsigned long end, page, start_page;
1368    bool dirty = false;
1369    RAMBlock *ramblock;
1370    uint64_t mr_offset, mr_size;
1371
1372    if (length == 0) {
1373        return false;
1374    }
1375
1376    end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
1377    start_page = start >> TARGET_PAGE_BITS;
1378    page = start_page;
1379
1380    WITH_RCU_READ_LOCK_GUARD() {
1381        blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
1382        ramblock = qemu_get_ram_block(start);
1383        /* Range sanity check on the ramblock */
1384        assert(start >= ramblock->offset &&
1385               start + length <= ramblock->offset + ramblock->used_length);
1386
1387        while (page < end) {
1388            unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
1389            unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
1390            unsigned long num = MIN(end - page,
1391                                    DIRTY_MEMORY_BLOCK_SIZE - offset);
1392
1393            dirty |= bitmap_test_and_clear_atomic(blocks->blocks[idx],
1394                                                  offset, num);
1395            page += num;
1396        }
1397
1398        mr_offset = (ram_addr_t)(start_page << TARGET_PAGE_BITS) - ramblock->offset;
1399        mr_size = (end - start_page) << TARGET_PAGE_BITS;
1400        memory_region_clear_dirty_bitmap(ramblock->mr, mr_offset, mr_size);
1401    }
1402
1403    if (dirty && tcg_enabled()) {
1404        tlb_reset_dirty_range_all(start, length);
1405    }
1406
1407    return dirty;
1408}
1409
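/* Atomically copy and clear the dirty bitmap for @client over the given
 * region of @mr.  The snapshot is rounded out to whole bitmap words and can
 * be queried later with cpu_physical_memory_snapshot_get_dirty().
 */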
1410DirtyBitmapSnapshot *cpu_physical_memory_snapshot_and_clear_dirty
1411    (MemoryRegion *mr, hwaddr offset, hwaddr length, unsigned client)
1412{
1413    DirtyMemoryBlocks *blocks;
1414    ram_addr_t start = memory_region_get_ram_addr(mr) + offset;
1415    unsigned long align = 1UL << (TARGET_PAGE_BITS + BITS_PER_LEVEL);
1416    ram_addr_t first = QEMU_ALIGN_DOWN(start, align);
1417    ram_addr_t last  = QEMU_ALIGN_UP(start + length, align);
1418    DirtyBitmapSnapshot *snap;
1419    unsigned long page, end, dest;
1420
1421    snap = g_malloc0(sizeof(*snap) +
1422                     ((last - first) >> (TARGET_PAGE_BITS + 3)));
1423    snap->start = first;
1424    snap->end   = last;
1425
1426    page = first >> TARGET_PAGE_BITS;
1427    end  = last  >> TARGET_PAGE_BITS;
1428    dest = 0;
1429
1430    WITH_RCU_READ_LOCK_GUARD() {
1431        blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
1432
1433        while (page < end) {
1434            unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
1435            unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
1436            unsigned long num = MIN(end - page,
1437                                    DIRTY_MEMORY_BLOCK_SIZE - offset);
1438
1439            assert(QEMU_IS_ALIGNED(offset, (1 << BITS_PER_LEVEL)));
1440            assert(QEMU_IS_ALIGNED(num,    (1 << BITS_PER_LEVEL)));
1441            offset >>= BITS_PER_LEVEL;
1442
1443            bitmap_copy_and_clear_atomic(snap->dirty + dest,
1444                                         blocks->blocks[idx] + offset,
1445                                         num);
1446            page += num;
1447            dest += num >> BITS_PER_LEVEL;
1448        }
1449    }
1450
1451    if (tcg_enabled()) {
1452        tlb_reset_dirty_range_all(start, length);
1453    }
1454
1455    memory_region_clear_dirty_bitmap(mr, offset, length);
1456
1457    return snap;
1458}
1459
1460bool cpu_physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap,
1461                                            ram_addr_t start,
1462                                            ram_addr_t length)
1463{
1464    unsigned long page, end;
1465
1466    assert(start >= snap->start);
1467    assert(start + length <= snap->end);
1468
1469    end = TARGET_PAGE_ALIGN(start + length - snap->start) >> TARGET_PAGE_BITS;
1470    page = (start - snap->start) >> TARGET_PAGE_BITS;
1471
1472    while (page < end) {
1473        if (test_bit(page, snap->dirty)) {
1474            return true;
1475        }
1476        page++;
1477    }
1478    return false;
1479}
1480
1481/* Called from RCU critical section */
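/* Returns the index of @section within the dispatch's sections[] array;
 * this is the value that gets combined into the CPU TLB's iotlb entries
 * (see the comment in phys_section_add() below).
 */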
1482hwaddr memory_region_section_get_iotlb(CPUState *cpu,
1483                                       MemoryRegionSection *section)
1484{
1485    AddressSpaceDispatch *d = flatview_to_dispatch(section->fv);
1486    return section - d->map.sections;
1487}
1488#endif /* defined(CONFIG_USER_ONLY) */
1489
1490#if !defined(CONFIG_USER_ONLY)
1491
1492static int subpage_register(subpage_t *mmio, uint32_t start, uint32_t end,
1493                            uint16_t section);
1494static subpage_t *subpage_init(FlatView *fv, hwaddr base);
1495
1496static void *(*phys_mem_alloc)(size_t size, uint64_t *align, bool shared) =
1497                               qemu_anon_ram_alloc;
1498
1499/*
 1500 * Set a custom physical guest memory allocator.
1501 * Accelerators with unusual needs may need this.  Hopefully, we can
1502 * get rid of it eventually.
1503 */
1504void phys_mem_set_alloc(void *(*alloc)(size_t, uint64_t *align, bool shared))
1505{
1506    phys_mem_alloc = alloc;
1507}
1508
1509static uint16_t phys_section_add(PhysPageMap *map,
1510                                 MemoryRegionSection *section)
1511{
1512    /* The physical section number is ORed with a page-aligned
1513     * pointer to produce the iotlb entries.  Thus it should
1514     * never overflow into the page-aligned value.
1515     */
1516    assert(map->sections_nb < TARGET_PAGE_SIZE);
1517
1518    if (map->sections_nb == map->sections_nb_alloc) {
1519        map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
1520        map->sections = g_renew(MemoryRegionSection, map->sections,
1521                                map->sections_nb_alloc);
1522    }
1523    map->sections[map->sections_nb] = *section;
1524    memory_region_ref(section->mr);
1525    return map->sections_nb++;
1526}
1527
1528static void phys_section_destroy(MemoryRegion *mr)
1529{
1530    bool have_sub_page = mr->subpage;
1531
1532    memory_region_unref(mr);
1533
1534    if (have_sub_page) {
1535        subpage_t *subpage = container_of(mr, subpage_t, iomem);
1536        object_unref(OBJECT(&subpage->iomem));
1537        g_free(subpage);
1538    }
1539}
1540
1541static void phys_sections_free(PhysPageMap *map)
1542{
1543    while (map->sections_nb > 0) {
1544        MemoryRegionSection *section = &map->sections[--map->sections_nb];
1545        phys_section_destroy(section->mr);
1546    }
1547    g_free(map->sections);
1548    g_free(map->nodes);
1549}
1550
1551static void register_subpage(FlatView *fv, MemoryRegionSection *section)
1552{
1553    AddressSpaceDispatch *d = flatview_to_dispatch(fv);
1554    subpage_t *subpage;
1555    hwaddr base = section->offset_within_address_space
1556        & TARGET_PAGE_MASK;
1557    MemoryRegionSection *existing = phys_page_find(d, base);
1558    MemoryRegionSection subsection = {
1559        .offset_within_address_space = base,
1560        .size = int128_make64(TARGET_PAGE_SIZE),
1561    };
1562    hwaddr start, end;
1563
1564    assert(existing->mr->subpage || existing->mr == &io_mem_unassigned);
1565
1566    if (!(existing->mr->subpage)) {
1567        subpage = subpage_init(fv, base);
1568        subsection.fv = fv;
1569        subsection.mr = &subpage->iomem;
1570        phys_page_set(d, base >> TARGET_PAGE_BITS, 1,
1571                      phys_section_add(&d->map, &subsection));
1572    } else {
1573        subpage = container_of(existing->mr, subpage_t, iomem);
1574    }
1575    start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
1576    end = start + int128_get64(section->size) - 1;
1577    subpage_register(subpage, start, end,
1578                     phys_section_add(&d->map, section));
1579}
1580
1581
1582static void register_multipage(FlatView *fv,
1583                               MemoryRegionSection *section)
1584{
1585    AddressSpaceDispatch *d = flatview_to_dispatch(fv);
1586    hwaddr start_addr = section->offset_within_address_space;
1587    uint16_t section_index = phys_section_add(&d->map, section);
1588    uint64_t num_pages = int128_get64(int128_rshift(section->size,
1589                                                    TARGET_PAGE_BITS));
1590
1591    assert(num_pages);
1592    phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index);
1593}
1594
1595/*
1596 * The range in *section* may look like this:
1597 *
1598 *      |s|PPPPPPP|s|
1599 *
1600 * where s stands for subpage and P for page.
1601 */
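/* For example, assuming 4 KiB target pages, a section covering
 * [0x0800, 0x2900) is split into a leading subpage [0x0800, 0x1000), one
 * full page [0x1000, 0x2000), and a trailing subpage [0x2000, 0x2900).
 */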
1602void flatview_add_to_dispatch(FlatView *fv, MemoryRegionSection *section)
1603{
1604    MemoryRegionSection remain = *section;
1605    Int128 page_size = int128_make64(TARGET_PAGE_SIZE);
1606
1607    /* register first subpage */
1608    if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) {
1609        uint64_t left = TARGET_PAGE_ALIGN(remain.offset_within_address_space)
1610                        - remain.offset_within_address_space;
1611
1612        MemoryRegionSection now = remain;
1613        now.size = int128_min(int128_make64(left), now.size);
1614        register_subpage(fv, &now);
1615        if (int128_eq(remain.size, now.size)) {
1616            return;
1617        }
1618        remain.size = int128_sub(remain.size, now.size);
1619        remain.offset_within_address_space += int128_get64(now.size);
1620        remain.offset_within_region += int128_get64(now.size);
1621    }
1622
1623    /* register whole pages */
1624    if (int128_ge(remain.size, page_size)) {
1625        MemoryRegionSection now = remain;
1626        now.size = int128_and(now.size, int128_neg(page_size));
1627        register_multipage(fv, &now);
1628        if (int128_eq(remain.size, now.size)) {
1629            return;
1630        }
1631        remain.size = int128_sub(remain.size, now.size);
1632        remain.offset_within_address_space += int128_get64(now.size);
1633        remain.offset_within_region += int128_get64(now.size);
1634    }
1635
1636    /* register last subpage */
1637    register_subpage(fv, &remain);
1638}
1639
1640void qemu_flush_coalesced_mmio_buffer(void)
1641{
1642    if (kvm_enabled())
1643        kvm_flush_coalesced_mmio_buffer();
1644}
1645
1646void qemu_mutex_lock_ramlist(void)
1647{
1648    qemu_mutex_lock(&ram_list.mutex);
1649}
1650
1651void qemu_mutex_unlock_ramlist(void)
1652{
1653    qemu_mutex_unlock(&ram_list.mutex);
1654}
1655
1656void ram_block_dump(Monitor *mon)
1657{
1658    RAMBlock *block;
1659    char *psize;
1660
1661    RCU_READ_LOCK_GUARD();
1662    monitor_printf(mon, "%24s %8s  %18s %18s %18s\n",
1663                   "Block Name", "PSize", "Offset", "Used", "Total");
1664    RAMBLOCK_FOREACH(block) {
1665        psize = size_to_str(block->page_size);
1666        monitor_printf(mon, "%24s %8s  0x%016" PRIx64 " 0x%016" PRIx64
1667                       " 0x%016" PRIx64 "\n", block->idstr, psize,
1668                       (uint64_t)block->offset,
1669                       (uint64_t)block->used_length,
1670                       (uint64_t)block->max_length);
1671        g_free(psize);
1672    }
1673}
1674
1675#ifdef __linux__
1676/*
1677 * FIXME TOCTTOU: this iterates over memory backends' mem-path, which
1678 * may or may not name the same files / on the same filesystem now as
1679 * when we actually open and map them.  Iterate over the file
1680 * descriptors instead, and use qemu_fd_getpagesize().
1681 */
1682static int find_min_backend_pagesize(Object *obj, void *opaque)
1683{
1684    long *hpsize_min = opaque;
1685
1686    if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
1687        HostMemoryBackend *backend = MEMORY_BACKEND(obj);
1688        long hpsize = host_memory_backend_pagesize(backend);
1689
1690        if (host_memory_backend_is_mapped(backend) && (hpsize < *hpsize_min)) {
1691            *hpsize_min = hpsize;
1692        }
1693    }
1694
1695    return 0;
1696}
1697
1698static int find_max_backend_pagesize(Object *obj, void *opaque)
1699{
1700    long *hpsize_max = opaque;
1701
1702    if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
1703        HostMemoryBackend *backend = MEMORY_BACKEND(obj);
1704        long hpsize = host_memory_backend_pagesize(backend);
1705
1706        if (host_memory_backend_is_mapped(backend) && (hpsize > *hpsize_max)) {
1707            *hpsize_max = hpsize;
1708        }
1709    }
1710
1711    return 0;
1712}
1713
1714/*
1715 * TODO: For now we assume that all mapped host memory backends are
1716 * used as RAM; however, some might be used for different purposes.
1717 */
1718long qemu_minrampagesize(void)
1719{
1720    long hpsize = LONG_MAX;
1721    Object *memdev_root = object_resolve_path("/objects", NULL);
1722
1723    object_child_foreach(memdev_root, find_min_backend_pagesize, &hpsize);
1724    return hpsize;
1725}
1726
1727long qemu_maxrampagesize(void)
1728{
1729    long pagesize = 0;
1730    Object *memdev_root = object_resolve_path("/objects", NULL);
1731
1732    object_child_foreach(memdev_root, find_max_backend_pagesize, &pagesize);
1733    return pagesize;
1734}
1735#else
1736long qemu_minrampagesize(void)
1737{
1738    return qemu_real_host_page_size;
1739}
1740long qemu_maxrampagesize(void)
1741{
1742    return qemu_real_host_page_size;
1743}
1744#endif
1745
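/*
 * Return the size of the file backing @fd, or -errno on failure.  On Linux,
 * devdax character devices report their size through sysfs rather than
 * fstat(), so read /sys/dev/char/<major>:<minor>/size for them; everything
 * else falls back to lseek(fd, 0, SEEK_END).
 */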
1746static int64_t get_file_size(int fd)
1747{
1748    int64_t size;
1749#if defined(__linux__)
1750    struct stat st;
1751
1752    if (fstat(fd, &st) < 0) {
1753        return -errno;
1754    }
1755
1756    /* Special handling for devdax character devices */
1757    if (S_ISCHR(st.st_mode)) {
1758        g_autofree char *subsystem_path = NULL;
1759        g_autofree char *subsystem = NULL;
1760
1761        subsystem_path = g_strdup_printf("/sys/dev/char/%d:%d/subsystem",
1762                                         major(st.st_rdev), minor(st.st_rdev));
1763        subsystem = g_file_read_link(subsystem_path, NULL);
1764
1765        if (subsystem && g_str_has_suffix(subsystem, "/dax")) {
1766            g_autofree char *size_path = NULL;
1767            g_autofree char *size_str = NULL;
1768
1769            size_path = g_strdup_printf("/sys/dev/char/%d:%d/size",
1770                                    major(st.st_rdev), minor(st.st_rdev));
1771
1772            if (g_file_get_contents(size_path, &size_str, NULL, NULL)) {
1773                return g_ascii_strtoll(size_str, NULL, 0);
1774            }
1775        }
1776    }
1777#endif /* defined(__linux__) */
1778
1779    /* st.st_size may be zero for special files yet lseek(2) works */
1780    size = lseek(fd, 0, SEEK_END);
1781    if (size < 0) {
1782        return -errno;
1783    }
1784    return size;
1785}
1786
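/*
 * Open (or create) the backing store for guest RAM at @path.  If @path is a
 * directory, create a temporary file in it named after @region_name.
 * Retries on EINTR and EEXIST; sets *created when a new file was created.
 * Returns the file descriptor, or -1 with @errp set on failure.
 */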
1787static int file_ram_open(const char *path,
1788                         const char *region_name,
1789                         bool *created,
1790                         Error **errp)
1791{
1792    char *filename;
1793    char *sanitized_name;
1794    char *c;
1795    int fd = -1;
1796
1797    *created = false;
1798    for (;;) {
1799#ifdef _WIN32
1800        fd = _open_osfhandle((intptr_t)CreateFile(path,
1801                   GENERIC_READ | GENERIC_WRITE,
1802                   FILE_SHARE_READ | FILE_SHARE_WRITE,
1803                   NULL, OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL),
1804                   _O_RDWR);
1805#else
1806        fd = open(path, O_RDWR);
1807#endif
1808        if (fd >= 0) {
1809            /* @path names an existing file, use it */
1810            break;
1811        }
1812        if (errno == ENOENT) {
1813            /* @path names a file that doesn't exist, create it */
1814            fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0644);
1815            if (fd >= 0) {
1816                *created = true;
1817                break;
1818            }
1819        } else if (errno == EISDIR) {
1820            /* @path names a directory, create a file there */
1821            /* Make name safe to use with mkstemp by replacing '/' with '_'. */
1822            sanitized_name = g_strdup(region_name);
1823            for (c = sanitized_name; *c != '\0'; c++) {
1824                if (*c == '/') {
1825                    *c = '_';
1826                }
1827            }
1828
1829            filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
1830                                       sanitized_name);
1831            g_free(sanitized_name);
1832
1833#ifdef _WIN32
1834            fd = _open(_mktemp(filename), _O_CREAT | _O_RDWR);
1835#else
1836            fd = mkstemp(filename);
1837#endif
1838            if (fd >= 0) {
1839                unlink(filename);
1840                g_free(filename);
1841                break;
1842            }
1843            g_free(filename);
1844        }
1845        if (errno != EEXIST && errno != EINTR) {
1846            error_setg_errno(errp, errno,
1847                             "can't open backing store %s for guest RAM",
1848                             path);
1849            return -1;
1850        }
1851        /*
1852         * Try again on EINTR and EEXIST.  The latter happens when
1853         * something else creates the file between our two open().
1854         */
1855    }
1856
1857    return fd;
1858}
1859
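/*
 * Map @memory bytes of the file behind @fd for @block, honouring the
 * backing file's page size and the memory region's alignment.  Returns the
 * mapped area, or NULL with @errp set on failure.
 */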
1860static void *file_ram_alloc(RAMBlock *block,
1861                            ram_addr_t memory,
1862                            int fd,
1863                            bool truncate,
1864                            Error **errp)
1865{
1866    void *area;
1867
1868#ifdef _WIN32
1869    SYSTEM_INFO SysInfo;
1870    GetSystemInfo(&SysInfo);
1871    block->page_size = SysInfo.dwPageSize;
1872#else
1873    block->page_size = qemu_fd_getpagesize(fd);
1874#endif
1875
1876    if (block->mr->align % block->page_size) {
1877        error_setg(errp, "alignment 0x%" PRIx64
1878                   " must be a multiple of page size 0x%zx",
1879                   block->mr->align, block->page_size);
1880        return NULL;
1881    } else if (block->mr->align && !is_power_of_2(block->mr->align)) {
1882        error_setg(errp, "alignment 0x%" PRIx64
1883                   " must be a power of two", block->mr->align);
1884        return NULL;
1885    }
1886    block->mr->align = MAX(block->page_size, block->mr->align);
1887#if defined(__s390x__)
1888    if (kvm_enabled()) {
1889        block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
1890    }
1891#endif
1892
1893    if (memory < block->page_size) {
1894        error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
1895                   "or larger than page size 0x%zx",
1896                   memory, block->page_size);
1897        return NULL;
1898    }
1899
1900    memory = ROUND_UP(memory, block->page_size);
1901
1902    /*
1903     * ftruncate is not supported by hugetlbfs in older
1904     * hosts, so don't bother bailing out on errors.
1905     * If anything goes wrong with it under other filesystems,
1906     * mmap will fail.
1907     *
1908     * Do not truncate the non-empty backend file to avoid corrupting
1909     * the existing data in the file. Disabling shrinking is not
1910     * enough. For example, the current vNVDIMM implementation stores
1911     * the guest NVDIMM labels at the end of the backend file. If the
1912     * backend file is later extended, QEMU will not be able to find
1913     * those labels. Therefore, extending the non-empty backend file
1914     * is disabled as well.
1915     */
1916    if (truncate && ftruncate(fd, memory)) {
1917        perror("ftruncate");
1918    }
1919
1920#ifdef _WIN32
1921    HANDLE fd_temp = (HANDLE)_get_osfhandle(fd);
1922    block->hMapFile = CreateFileMapping(fd_temp, NULL, PAGE_READWRITE,
1923                                        memory >> 32,
1924                                        memory, NULL);
1925    area = MapViewOfFile(block->hMapFile, FILE_MAP_ALL_ACCESS, 0, 0, 0);
1926    if (area == NULL) {
1927#else
1928    area = qemu_ram_mmap(fd, memory, block->mr->align,
1929                         block->flags & RAM_SHARED, block->flags & RAM_PMEM);
1930    if (area == MAP_FAILED) {
1931#endif
1932        error_setg_errno(errp, errno,
1933                         "unable to map backing store for guest RAM");
1934        return NULL;
1935    }
1936
1937    block->fd = fd;
1938    return area;
1939}
1940
1941/* Allocate space within the ram_addr_t space that governs the
1942 * dirty bitmaps.
1943 * Called with the ramlist lock held.
1944 */
1945static ram_addr_t find_ram_offset(ram_addr_t size)
1946{
1947    RAMBlock *block, *next_block;
1948    ram_addr_t offset = RAM_ADDR_MAX, mingap = RAM_ADDR_MAX;
1949
1950    assert(size != 0); /* it would hand out the same offset multiple times */
1951
1952    if (QLIST_EMPTY_RCU(&ram_list.blocks)) {
1953        return 0;
1954    }
1955
1956    RAMBLOCK_FOREACH(block) {
1957        ram_addr_t candidate, next = RAM_ADDR_MAX;
1958
1959        /* Align blocks to start on a 'long' in the bitmap,
1960         * which makes the bitmap syncing take the fast path.
1961         */
1962        candidate = block->offset + block->max_length;
1963        candidate = ROUND_UP(candidate, BITS_PER_LONG << TARGET_PAGE_BITS);
1964
1965        /* Search for the closest following block
1966         * and find the gap.
1967         */
1968        RAMBLOCK_FOREACH(next_block) {
1969            if (next_block->offset >= candidate) {
1970                next = MIN(next, next_block->offset);
1971            }
1972        }
1973
1974        /* If the block fits, remember this offset and the size of
1975         * the gap, but keep going in case we can find a smaller
1976         * gap to fill, thus avoiding fragmentation.
1977         */
1978        if (next - candidate >= size && next - candidate < mingap) {
1979            offset = candidate;
1980            mingap = next - candidate;
1981        }
1982
1983        trace_find_ram_offset_loop(size, candidate, offset, next, mingap);
1984    }
1985
1986    if (offset == RAM_ADDR_MAX) {
1987        fprintf(stderr, "Failed to find gap of requested size: %" PRIu64 "\n",
1988                (uint64_t)size);
1989        abort();
1990    }
1991
1992    trace_find_ram_offset(size, offset);
1993
1994    return offset;
1995}
1996
1997static unsigned long last_ram_page(void)
1998{
1999    RAMBlock *block;
2000    ram_addr_t last = 0;
2001
2002    RCU_READ_LOCK_GUARD();
2003    RAMBLOCK_FOREACH(block) {
2004        last = MAX(last, block->offset + block->max_length);
2005    }
2006    return last >> TARGET_PAGE_BITS;
2007}
2008
2009static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
2010{
2011    int ret;
2012
2013    /* Use MADV_DONTDUMP if the user doesn't want the guest memory in the core */
2014    if (!machine_dump_guest_core(current_machine)) {
2015        ret = qemu_madvise(addr, size, QEMU_MADV_DONTDUMP);
2016        if (ret) {
2017            perror("qemu_madvise");
2018            fprintf(stderr, "madvise doesn't support MADV_DONTDUMP, "
2019                            "but dump_guest_core=off specified\n");
2020        }
2021    }
2022}
2023
2024const char *qemu_ram_get_idstr(RAMBlock *rb)
2025{
2026    return rb->idstr;
2027}
2028
2029void *qemu_ram_get_host_addr(RAMBlock *rb)
2030{
2031    return rb->host;
2032}
2033
2034ram_addr_t qemu_ram_get_offset(RAMBlock *rb)
2035{
2036    return rb->offset;
2037}
2038
2039ram_addr_t qemu_ram_get_used_length(RAMBlock *rb)
2040{
2041    return rb->used_length;
2042}
2043
2044bool qemu_ram_is_shared(RAMBlock *rb)
2045{
2046    return rb->flags & RAM_SHARED;
2047}
2048
2049/* Note: Only set at the start of postcopy */
2050bool qemu_ram_is_uf_zeroable(RAMBlock *rb)
2051{
2052    return rb->flags & RAM_UF_ZEROPAGE;
2053}
2054
2055void qemu_ram_set_uf_zeroable(RAMBlock *rb)
2056{
2057    rb->flags |= RAM_UF_ZEROPAGE;
2058}
2059
2060bool qemu_ram_is_migratable(RAMBlock *rb)
2061{
2062    return rb->flags & RAM_MIGRATABLE;
2063}
2064
2065void qemu_ram_set_migratable(RAMBlock *rb)
2066{
2067    rb->flags |= RAM_MIGRATABLE;
2068}
2069
2070void qemu_ram_unset_migratable(RAMBlock *rb)
2071{
2072    rb->flags &= ~RAM_MIGRATABLE;
2073}
2074
2075/* Called with iothread lock held.  */
2076void qemu_ram_set_idstr(RAMBlock *new_block, const char *name, DeviceState *dev)
2077{
2078    RAMBlock *block;
2079
2080    assert(new_block);
2081    assert(!new_block->idstr[0]);
2082
2083    if (dev) {
2084        char *id = qdev_get_dev_path(dev);
2085        if (id) {
2086            snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id);
2087            g_free(id);
2088        }
2089    }
2090    pstrcat(new_block->idstr, sizeof(new_block->idstr), name);
2091
2092    RCU_READ_LOCK_GUARD();
2093    RAMBLOCK_FOREACH(block) {
2094        if (block != new_block &&
2095            !strcmp(block->idstr, new_block->idstr)) {
2096            fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
2097                    new_block->idstr);
2098            abort();
2099        }
2100    }
2101}
2102
2103/* Called with iothread lock held.  */
2104void qemu_ram_unset_idstr(RAMBlock *block)
2105{
2106    /* FIXME: arch_init.c assumes that this is not called throughout
2107     * migration.  Ignore the problem since hot-unplug during migration
2108     * does not work anyway.
2109     */
2110    if (block) {
2111        memset(block->idstr, 0, sizeof(block->idstr));
2112    }
2113}
2114
2115size_t qemu_ram_pagesize(RAMBlock *rb)
2116{
2117    return rb->page_size;
2118}
2119
2120/* Return the largest page size in use */
2121size_t qemu_ram_pagesize_largest(void)
2122{
2123    RAMBlock *block;
2124    size_t largest = 0;
2125
2126    RAMBLOCK_FOREACH(block) {
2127        largest = MAX(largest, qemu_ram_pagesize(block));
2128    }
2129
2130    return largest;
2131}
2132
2133static int memory_try_enable_merging(void *addr, size_t len)
2134{
2135    if (!machine_mem_merge(current_machine)) {
2136        /* disabled by the user */
2137        return 0;
2138    }
2139
2140    return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
2141}
2142
2143/* Only legal before the guest might have detected the memory size: e.g. on
2144 * incoming migration, or right after reset.
2145 *
2146 * As the memory core doesn't know how the memory is accessed, it is up to
2147 * the resize callback to update device state and/or add assertions to detect
2148 * misuse, if necessary.
2149 */
2150int qemu_ram_resize(RAMBlock *block, ram_addr_t newsize, Error **errp)
2151{
2152    const ram_addr_t unaligned_size = newsize;
2153
2154    assert(block);
2155
2156    newsize = HOST_PAGE_ALIGN(newsize);
2157
2158    if (block->used_length == newsize) {
2159        /*
2160         * We don't have to resize the ram block (which only knows aligned
2161         * sizes), however, we have to notify if the unaligned size changed.
2162         */
2163        if (unaligned_size != memory_region_size(block->mr)) {
2164            memory_region_set_size(block->mr, unaligned_size);
2165            if (block->resized) {
2166                block->resized(block->idstr, unaligned_size, block->host);
2167            }
2168        }
2169        return 0;
2170    }
2171
2172    if (!(block->flags & RAM_RESIZEABLE)) {
2173        error_setg_errno(errp, EINVAL,
2174                         "Length mismatch: %s: 0x" RAM_ADDR_FMT
2175                         " in != 0x" RAM_ADDR_FMT, block->idstr,
2176                         newsize, block->used_length);
2177        return -EINVAL;
2178    }
2179
2180    if (block->max_length < newsize) {
2181        error_setg_errno(errp, EINVAL,
2182                         "Length too large: %s: 0x" RAM_ADDR_FMT
2183                         " > 0x" RAM_ADDR_FMT, block->idstr,
2184                         newsize, block->max_length);
2185        return -EINVAL;
2186    }
2187
2188    cpu_physical_memory_clear_dirty_range(block->offset, block->used_length);
2189    block->used_length = newsize;
2190    cpu_physical_memory_set_dirty_range(block->offset, block->used_length,
2191                                        DIRTY_CLIENTS_ALL);
2192    memory_region_set_size(block->mr, unaligned_size);
2193    if (block->resized) {
2194        block->resized(block->idstr, unaligned_size, block->host);
2195    }
2196    return 0;
2197}
2198
2199/*
2200 * Trigger a sync of the given ram block for range [start, start + length)
2201 * with the backing store, if one is available.
2202 * Otherwise this is a no-op.
2203 * Note: this is supposed to be a synchronous operation.
2204 */
2205void qemu_ram_msync(RAMBlock *block, ram_addr_t start, ram_addr_t length)
2206{
2207    /* The requested range should fit within the block range */
2208    g_assert((start + length) <= block->used_length);
2209
2210#ifdef CONFIG_LIBPMEM
2211    /* The lack of support for pmem should not block the sync */
2212    if (ramblock_is_pmem(block)) {
2213        void *addr = ramblock_ptr(block, start);
2214        pmem_persist(addr, length);
2215        return;
2216    }
2217#endif
2218    if (block->fd >= 0) {
2219        /**
2220         * If there is no support for PMEM or the memory has not been
2221         * specified as persistent (or is not pmem), fall back to msync().
2222         * Less optimal, but it still achieves the same goal.
2223         */
2224        void *addr = ramblock_ptr(block, start);
2225        if (qemu_msync(addr, length, block->fd)) {
2226            warn_report("%s: failed to sync memory range: start: "
2227                    RAM_ADDR_FMT " length: " RAM_ADDR_FMT,
2228                    __func__, start, length);
2229        }
2230    }
2231}
2232
2233/* Called with ram_list.mutex held */
2234static void dirty_memory_extend(ram_addr_t old_ram_size,
2235                                ram_addr_t new_ram_size)
2236{
2237    ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size,
2238                                             DIRTY_MEMORY_BLOCK_SIZE);
2239    ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size,
2240                                             DIRTY_MEMORY_BLOCK_SIZE);
2241    int i;
2242
2243    /* Only need to extend if block count increased */
2244    if (new_num_blocks <= old_num_blocks) {
2245        return;
2246    }
2247
2248    for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
2249        DirtyMemoryBlocks *old_blocks;
2250        DirtyMemoryBlocks *new_blocks;
2251        int j;
2252
2253        old_blocks = atomic_rcu_read(&ram_list.dirty_memory[i]);
2254        new_blocks = g_malloc(sizeof(*new_blocks) +
2255                              sizeof(new_blocks->blocks[0]) * new_num_blocks);
2256
2257        if (old_num_blocks) {
2258            memcpy(new_blocks->blocks, old_blocks->blocks,
2259                   old_num_blocks * sizeof(old_blocks->blocks[0]));
2260        }
2261
2262        for (j = old_num_blocks; j < new_num_blocks; j++) {
2263            new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
2264        }
2265
2266        atomic_rcu_set(&ram_list.dirty_memory[i], new_blocks);
2267
2268        if (old_blocks) {
2269            g_free_rcu(old_blocks, rcu);
2270        }
2271    }
2272}
2273
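/*
 * Insert @new_block into ram_list, which is kept sorted from biggest to
 * smallest block.  Allocates the host memory if none was provided, extends
 * the dirty memory bitmaps and marks the new range dirty.
 */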
2274static void ram_block_add(RAMBlock *new_block, Error **errp, bool shared)
2275{
2276    RAMBlock *block;
2277    RAMBlock *last_block = NULL;
2278    ram_addr_t old_ram_size, new_ram_size;
2279    Error *err = NULL;
2280
2281    old_ram_size = last_ram_page();
2282
2283    qemu_mutex_lock_ramlist();
2284    new_block->offset = find_ram_offset(new_block->max_length);
2285
2286    if (!new_block->host) {
2287        if (xen_enabled()) {
2288            xen_ram_alloc(new_block->offset, new_block->max_length,
2289                          new_block->mr, &err);
2290            if (err) {
2291                error_propagate(errp, err);
2292                qemu_mutex_unlock_ramlist();
2293                return;
2294            }
2295        } else {
2296            new_block->host = phys_mem_alloc(new_block->max_length,
2297                                             &new_block->mr->align, shared);
2298            if (!new_block->host) {
2299                error_setg_errno(errp, errno,
2300                                 "cannot set up guest memory '%s'",
2301                                 memory_region_name(new_block->mr));
2302                qemu_mutex_unlock_ramlist();
2303                return;
2304            }
2305            memory_try_enable_merging(new_block->host, new_block->max_length);
2306        }
2307    }
2308
2309    new_ram_size = MAX(old_ram_size,
2310              (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
2311    if (new_ram_size > old_ram_size) {
2312        dirty_memory_extend(old_ram_size, new_ram_size);
2313    }
2314    /* Keep the list sorted from biggest to smallest block.  Unlike QTAILQ,
2315     * QLIST (which has an RCU-friendly variant) does not have insertion at
2316     * tail, so save the last element in last_block.
2317     */
2318    RAMBLOCK_FOREACH(block) {
2319        last_block = block;
2320        if (block->max_length < new_block->max_length) {
2321            break;
2322        }
2323    }
2324    if (block) {
2325        QLIST_INSERT_BEFORE_RCU(block, new_block, next);
2326    } else if (last_block) {
2327        QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
2328    } else { /* list is empty */
2329        QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
2330    }
2331    ram_list.mru_block = NULL;
2332
2333    /* Write list before version */
2334    smp_wmb();
2335    ram_list.version++;
2336    qemu_mutex_unlock_ramlist();
2337
2338    cpu_physical_memory_set_dirty_range(new_block->offset,
2339                                        new_block->used_length,
2340                                        DIRTY_CLIENTS_ALL);
2341
2342    if (new_block->host) {
2343        qemu_ram_setup_dump(new_block->host, new_block->max_length);
2344        qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE);
2345        /*
2346         * MADV_DONTFORK is also needed by KVM in the absence of a synchronous
2347         * MMU.  Configure it unless the machine is a qtest server, in which
2348         * case KVM is not used and it may be forked (e.g. for fuzzing purposes).
2349         */
2350        if (!qtest_enabled()) {
2351            qemu_madvise(new_block->host, new_block->max_length,
2352                         QEMU_MADV_DONTFORK);
2353        }
2354        ram_block_notify_add(new_block->host, new_block->max_length);
2355    }
2356}
2357
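/*
 * Allocate a RAM block of @size bytes backed by the already-open file
 * descriptor @fd.  Only RAM_SHARED and RAM_PMEM are accepted in @ram_flags.
 * Returns NULL with @errp set on failure.
 */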
2358RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
2359                                 uint32_t ram_flags, int fd,
2360                                 Error **errp)
2361{
2362    RAMBlock *new_block;
2363    Error *local_err = NULL;
2364    int64_t file_size;
2365
2366    /* Only these ram flags are supported for now. */
2367    assert((ram_flags & ~(RAM_SHARED | RAM_PMEM)) == 0);
2368
2369    if (xen_enabled()) {
2370        error_setg(errp, "-mem-path not supported with Xen");
2371        return NULL;
2372    }
2373
2374    if (kvm_enabled() && !kvm_has_sync_mmu()) {
2375        error_setg(errp,
2376                   "host lacks kvm mmu notifiers, -mem-path unsupported");
2377        return NULL;
2378    }
2379
2380    if (phys_mem_alloc != qemu_anon_ram_alloc) {
2381        /*
2382         * file_ram_alloc() needs to allocate just like
2383         * phys_mem_alloc, but we haven't bothered to provide
2384         * a hook there.
2385         */
2386        error_setg(errp,
2387                   "-mem-path not supported with this accelerator");
2388        return NULL;
2389    }
2390
2391    size = HOST_PAGE_ALIGN(size);
2392    file_size = get_file_size(fd);
2393    if (file_size > 0 && file_size < size) {
2394        error_setg(errp, "backing store size 0x%" PRIx64
2395                   " does not match 'size' option 0x" RAM_ADDR_FMT,
2396                   file_size, size);
2397        return NULL;
2398    }
2399
2400    new_block = g_malloc0(sizeof(*new_block));
2401    new_block->mr = mr;
2402    new_block->used_length = size;
2403    new_block->max_length = size;
2404    new_block->flags = ram_flags;
2405    new_block->host = file_ram_alloc(new_block, size, fd, !file_size, errp);
2406    if (!new_block->host) {
2407        g_free(new_block);
2408        return NULL;
2409    }
2410
2411    ram_block_add(new_block, &local_err, ram_flags & RAM_SHARED);
2412    if (local_err) {
2413        g_free(new_block);
2414        error_propagate(errp, local_err);
2415        return NULL;
2416    }
2417    return new_block;
2418
2419}
2420
2421
2422RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
2423                                   uint32_t ram_flags, const char *mem_path,
2424                                   Error **errp)
2425{
2426    int fd;
2427    bool created;
2428    RAMBlock *block;
2429
2430    fd = file_ram_open(mem_path, memory_region_name(mr), &created, errp);
2431    if (fd < 0) {
2432        return NULL;
2433    }
2434
2435    block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, errp);
2436    if (!block) {
2437        if (created) {
2438            unlink(mem_path);
2439        }
2440#ifdef _WIN32
2441        _close(fd);
2442#else
2443        close(fd);
2444#endif
2445        return NULL;
2446    }
2447
2448    return block;
2449}
2450
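/*
 * Common helper behind the qemu_ram_alloc*() variants: build a RAMBlock of
 * @size bytes (growable up to @max_size when @resizeable), optionally backed
 * by a caller-provided @host pointer, and add it to ram_list.
 */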
2451static
2452RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
2453                                  void (*resized)(const char*,
2454                                                  uint64_t length,
2455                                                  void *host),
2456                                  void *host, bool resizeable, bool share,
2457                                  MemoryRegion *mr, Error **errp)
2458{
2459    RAMBlock *new_block;
2460    Error *local_err = NULL;
2461
2462    size = HOST_PAGE_ALIGN(size);
2463    max_size = HOST_PAGE_ALIGN(max_size);
2464    new_block = g_malloc0(sizeof(*new_block));
2465    new_block->mr = mr;
2466    new_block->resized = resized;
2467    new_block->used_length = size;
2468    new_block->max_length = max_size;
2469    assert(max_size >= size);
2470    new_block->fd = -1;
2471    new_block->page_size = qemu_real_host_page_size;
2472    new_block->host = host;
2473    if (host) {
2474        new_block->flags |= RAM_PREALLOC;
2475    }
2476    if (resizeable) {
2477        new_block->flags |= RAM_RESIZEABLE;
2478    }
2479    ram_block_add(new_block, &local_err, share);
2480    if (local_err) {
2481        g_free(new_block);
2482        error_propagate(errp, local_err);
2483        return NULL;
2484    }
2485    return new_block;
2486}
2487
2488RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
2489                                   MemoryRegion *mr, Error **errp)
2490{
2491    return qemu_ram_alloc_internal(size, size, NULL, host, false,
2492                                   false, mr, errp);
2493}
2494
2495RAMBlock *qemu_ram_alloc(ram_addr_t size, bool share,
2496                         MemoryRegion *mr, Error **errp)
2497{
2498    return qemu_ram_alloc_internal(size, size, NULL, NULL, false,
2499                                   share, mr, errp);
2500}
2501
2502RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
2503                                     void (*resized)(const char*,
2504                                                     uint64_t length,
2505                                                     void *host),
2506                                     MemoryRegion *mr, Error **errp)
2507{
2508    return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true,
2509                                   false, mr, errp);
2510}
2511
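/* RCU callback: release the host mapping of @block and free the block. */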
2512static void reclaim_ramblock(RAMBlock *block)
2513{
2514    if (block->flags & RAM_PREALLOC) {
2515        ;
2516    } else if (xen_enabled()) {
2517        xen_invalidate_map_cache_entry(block->host);
2518    } else if (block->fd >= 0) {
2519#ifdef _WIN32
2520        UnmapViewOfFile(block->host);
2521        CloseHandle(block->hMapFile);
2522        _close(block->fd);
2523#else
2524        qemu_ram_munmap(block->fd, block->host, block->max_length);
2525        close(block->fd);
2526#endif
2527    } else {
2528        qemu_anon_ram_free(block->host, block->max_length);
2529    }
2530    g_free(block);
2531}
2532
2533void qemu_ram_free(RAMBlock *block)
2534{
2535    if (!block) {
2536        return;
2537    }
2538
2539    if (block->host) {
2540        ram_block_notify_remove(block->host, block->max_length);
2541    }
2542
2543    qemu_mutex_lock_ramlist();
2544    QLIST_REMOVE_RCU(block, next);
2545    ram_list.mru_block = NULL;
2546    /* Write list before version */
2547    smp_wmb();
2548    ram_list.version++;
2549    call_rcu(block, reclaim_ramblock, rcu);
2550    qemu_mutex_unlock_ramlist();
2551}
2552
2553#ifndef _WIN32
2554void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
2555{
2556    RAMBlock *block;
2557    ram_addr_t offset;
2558    int flags;
2559    void *area, *vaddr;
2560
2561    RAMBLOCK_FOREACH(block) {
2562        offset = addr - block->offset;
2563        if (offset < block->max_length) {
2564            vaddr = ramblock_ptr(block, offset);
2565            if (block->flags & RAM_PREALLOC) {
2566                ;
2567            } else if (xen_enabled()) {
2568                abort();
2569            } else {
2570                flags = MAP_FIXED;
2571                if (block->fd >= 0) {
2572                    flags |= (block->flags & RAM_SHARED ?
2573                              MAP_SHARED : MAP_PRIVATE);
2574                    area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
2575                                flags, block->fd, offset);
2576                } else {
2577                    /*
2578                     * Remap needs to match alloc.  Accelerators that
2579                     * set phys_mem_alloc never remap.  If they did,
2580                     * we'd need a remap hook here.
2581                     */
2582                    assert(phys_mem_alloc == qemu_anon_ram_alloc);
2583
2584                    flags |= MAP_PRIVATE | MAP_ANONYMOUS;
2585                    area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
2586                                flags, -1, 0);
2587                }
2588                if (area != vaddr) {
2589                    error_report("Could not remap addr: "
2590                                 RAM_ADDR_FMT "@" RAM_ADDR_FMT "",
2591                                 length, addr);
2592                    exit(1);
2593                }
2594                memory_try_enable_merging(vaddr, length);
2595                qemu_ram_setup_dump(vaddr, length);
2596            }
2597        }
2598    }
2599}
2600#endif /* !_WIN32 */
2601
2602/* Return a host pointer to ram allocated with qemu_ram_alloc.
2603 * This should not be used for general purpose DMA.  Use address_space_map
2604 * or address_space_rw instead. For local memory (e.g. video ram) that the
2605 * device owns, use memory_region_get_ram_ptr.
2606 *
2607 * Called within RCU critical section.
2608 */
2609void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr)
2610{
2611    RAMBlock *block = ram_block;
2612
2613    if (block == NULL) {
2614        block = qemu_get_ram_block(addr);
2615        addr -= block->offset;
2616    }
2617
2618    if (xen_enabled() && block->host == NULL) {
2619        /* We need to check if the requested address is in the RAM
2620         * because we don't want to map the entire memory in QEMU.
2621         * In that case just map until the end of the page.
2622         */
2623        if (block->offset == 0) {
2624            return xen_map_cache(addr, 0, 0, false);
2625        }
2626
2627        block->host = xen_map_cache(block->offset, block->max_length, 1, false);
2628    }
2629    return ramblock_ptr(block, addr);
2630}
2631
2632/* Return a host pointer to guest's ram. Similar to qemu_map_ram_ptr
2633 * but takes a size argument.
2634 *
2635 * Called within RCU critical section.
2636 */
2637static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr,
2638                                 hwaddr *size, bool lock)
2639{
2640    RAMBlock *block = ram_block;
2641    if (*size == 0) {
2642        return NULL;
2643    }
2644
2645    if (block == NULL) {
2646        block = qemu_get_ram_block(addr);
2647        addr -= block->offset;
2648    }
2649    *size = MIN(*size, block->max_length - addr);
2650
2651    if (xen_enabled() && block->host == NULL) {
2652        /* We need to check if the requested address is in the RAM
2653         * because we don't want to map the entire memory in QEMU.
2654         * In that case just map the requested area.
2655         */
2656        if (block->offset == 0) {
2657            return xen_map_cache(addr, *size, lock, lock);
2658        }
2659
2660        block->host = xen_map_cache(block->offset, block->max_length, 1, lock);
2661    }
2662
2663    return ramblock_ptr(block, addr);
2664}
2665
2666/* Return the offset of a host pointer within a RAMBlock */
2667ram_addr_t qemu_ram_block_host_offset(RAMBlock *rb, void *host)
2668{
2669    ram_addr_t res = (uint8_t *)host - (uint8_t *)rb->host;
2670    assert((uintptr_t)host >= (uintptr_t)rb->host);
2671    assert(res < rb->max_length);
2672
2673    return res;
2674}
2675
2676/*
2677 * Translates a host ptr back to a RAMBlock, a ram_addr and an offset
2678 * in that RAMBlock.
2679 *
2680 * ptr: Host pointer to look up
2681 * round_offset: If true, round the result offset down to a page boundary
2682 * *offset: set to the result offset within the RAMBlock
2684 *
2685 * Returns: RAMBlock (or NULL if not found)
2686 *
2687 * By the time this function returns, the returned pointer is not protected
2688 * by RCU anymore.  If the caller is not within an RCU critical section and
2689 * does not hold the iothread lock, it must have other means of protecting the
2690 * pointer, such as a reference to the region that includes the incoming
2691 * ram_addr_t.
2692 */
2693RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
2694                                   ram_addr_t *offset)
2695{
2696    RAMBlock *block;
2697    uint8_t *host = ptr;
2698
2699    if (xen_enabled()) {
2700        ram_addr_t ram_addr;
2701        RCU_READ_LOCK_GUARD();
2702        ram_addr = xen_ram_addr_from_mapcache(ptr);
2703        block = qemu_get_ram_block(ram_addr);
2704        if (block) {
2705            *offset = ram_addr - block->offset;
2706        }
2707        return block;
2708    }
2709
2710    RCU_READ_LOCK_GUARD();
2711    block = atomic_rcu_read(&ram_list.mru_block);
2712    if (block && block->host && host - block->host < block->max_length) {
2713        goto found;
2714    }
2715
2716    RAMBLOCK_FOREACH(block) {
2717        /* This case happens when the block is not mapped. */
2718        if (block->host == NULL) {
2719            continue;
2720        }
2721        if (host - block->host < block->max_length) {
2722            goto found;
2723        }
2724    }
2725
2726    return NULL;
2727
2728found:
2729    *offset = (host - block->host);
2730    if (round_offset) {
2731        *offset &= TARGET_PAGE_MASK;
2732    }
2733    return block;
2734}
2735
2736/*
2737 * Finds the named RAMBlock
2738 *
2739 * name: The name of RAMBlock to find
2740 *
2741 * Returns: RAMBlock (or NULL if not found)
2742 */
2743RAMBlock *qemu_ram_block_by_name(const char *name)
2744{
2745    RAMBlock *block;
2746
2747    RAMBLOCK_FOREACH(block) {
2748        if (!strcmp(name, block->idstr)) {
2749            return block;
2750        }
2751    }
2752
2753    return NULL;
2754}
2755
2756/* Some of the softmmu routines need to translate from a host pointer
2757   (typically a TLB entry) back to a ram offset.  */
2758ram_addr_t qemu_ram_addr_from_host(void *ptr)
2759{
2760    RAMBlock *block;
2761    ram_addr_t offset;
2762
2763    block = qemu_ram_block_from_host(ptr, false, &offset);
2764    if (!block) {
2765        return RAM_ADDR_INVALID;
2766    }
2767
2768    return block->offset + offset;
2769}
2770
2771/* Generate a debug exception if a watchpoint has been hit.  */
2772void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len,
2773                          MemTxAttrs attrs, int flags, uintptr_t ra)
2774{
2775    CPUClass *cc = CPU_GET_CLASS(cpu);
2776    CPUWatchpoint *wp;
2777
2778    assert(tcg_enabled());
2779    if (cpu->watchpoint_hit) {
2780        /*
2781         * We re-entered the check after replacing the TB.
2782         * Now raise the debug interrupt so that it will
2783         * trigger after the current instruction.
2784         */
2785        qemu_mutex_lock_iothread();
2786        cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG);
2787        qemu_mutex_unlock_iothread();
2788        return;
2789    }
2790
2791    addr = cc->adjust_watchpoint_address(cpu, addr, len);
2792    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
2793        if (watchpoint_address_matches(wp, addr, len)
2794            && (wp->flags & flags)) {
2795            if (flags == BP_MEM_READ) {
2796                wp->flags |= BP_WATCHPOINT_HIT_READ;
2797            } else {
2798                wp->flags |= BP_WATCHPOINT_HIT_WRITE;
2799            }
2800            wp->hitaddr = MAX(addr, wp->vaddr);
2801            wp->hitattrs = attrs;
2802            if (!cpu->watchpoint_hit) {
2803                if (wp->flags & BP_CPU &&
2804                    !cc->debug_check_watchpoint(cpu, wp)) {
2805                    wp->flags &= ~BP_WATCHPOINT_HIT;
2806                    continue;
2807                }
2808                cpu->watchpoint_hit = wp;
2809
2810                mmap_lock();
2811                tb_check_watchpoint(cpu, ra);
2812                if (wp->flags & BP_STOP_BEFORE_ACCESS) {
2813                    cpu->exception_index = EXCP_DEBUG;
2814                    mmap_unlock();
2815                    cpu_loop_exit_restore(cpu, ra);
2816                } else {
2817                    /* Force execution of one insn next time.  */
2818                    cpu->cflags_next_tb = 1 | curr_cflags();
2819                    mmap_unlock();
2820                    if (ra) {
2821                        cpu_restore_state(cpu, ra, true);
2822                    }
2823                    cpu_loop_exit_noexc(cpu);
2824                }
2825            }
2826        } else {
2827            wp->flags &= ~BP_WATCHPOINT_HIT;
2828        }
2829    }
2830}
2831
2832static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
2833                                 MemTxAttrs attrs, void *buf, hwaddr len);
2834static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
2835                                  const void *buf, hwaddr len);
2836static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len,
2837                                  bool is_write, MemTxAttrs attrs);
2838
2839static MemTxResult subpage_read(void *opaque, hwaddr addr, uint64_t *data,
2840                                unsigned len, MemTxAttrs attrs)
2841{
2842    subpage_t *subpage = opaque;
2843    uint8_t buf[8];
2844    MemTxResult res;
2845
2846#if defined(DEBUG_SUBPAGE)
2847    printf("%s: subpage %p len %u addr " TARGET_FMT_plx "\n", __func__,
2848           subpage, len, addr);
2849#endif
2850    res = flatview_read(subpage->fv, addr + subpage->base, attrs, buf, len);
2851    if (res) {
2852        return res;
2853    }
2854    *data = ldn_p(buf, len);
2855    return MEMTX_OK;
2856}
2857
2858static MemTxResult subpage_write(void *opaque, hwaddr addr,
2859                                 uint64_t value, unsigned len, MemTxAttrs attrs)
2860{
2861    subpage_t *subpage = opaque;
2862    uint8_t buf[8];
2863
2864#if defined(DEBUG_SUBPAGE)
2865    printf("%s: subpage %p len %u addr " TARGET_FMT_plx
2866           " value %"PRIx64"\n",
2867           __func__, subpage, len, addr, value);
2868#endif
2869    stn_p(buf, len, value);
2870    return flatview_write(subpage->fv, addr + subpage->base, attrs, buf, len);
2871}
2872
2873static bool subpage_accepts(void *opaque, hwaddr addr,
2874                            unsigned len, bool is_write,
2875                            MemTxAttrs attrs)
2876{
2877    subpage_t *subpage = opaque;
2878#if defined(DEBUG_SUBPAGE)
2879    printf("%s: subpage %p %c len %u addr " TARGET_FMT_plx "\n",
2880           __func__, subpage, is_write ? 'w' : 'r', len, addr);
2881#endif
2882
2883    return flatview_access_valid(subpage->fv, addr + subpage->base,
2884                                 len, is_write, attrs);
2885}
2886
2887static const MemoryRegionOps subpage_ops = {
2888    .read_with_attrs = subpage_read,
2889    .write_with_attrs = subpage_write,
2890    .impl.min_access_size = 1,
2891    .impl.max_access_size = 8,
2892    .valid.min_access_size = 1,
2893    .valid.max_access_size = 8,
2894    .valid.accepts = subpage_accepts,
2895    .endianness = DEVICE_NATIVE_ENDIAN,
2896};
2897
2898static int subpage_register(subpage_t *mmio, uint32_t start, uint32_t end,
2899                            uint16_t section)
2900{
2901    int idx, eidx;
2902
2903    if (start >= TARGET_PAGE_SIZE || end >= TARGET_PAGE_SIZE)
2904        return -1;
2905    idx = SUBPAGE_IDX(start);
2906    eidx = SUBPAGE_IDX(end);
2907#if defined(DEBUG_SUBPAGE)
2908    printf("%s: %p start %08x end %08x idx %08x eidx %08x section %d\n",
2909           __func__, mmio, start, end, idx, eidx, section);
2910#endif
2911    for (; idx <= eidx; idx++) {
2912        mmio->sub_section[idx] = section;
2913    }
2914
2915    return 0;
2916}
2917
2918static subpage_t *subpage_init(FlatView *fv, hwaddr base)
2919{
2920    subpage_t *mmio;
2921
2922    /* mmio->sub_section is set to PHYS_SECTION_UNASSIGNED with g_malloc0 */
2923    mmio = g_malloc0(sizeof(subpage_t) + TARGET_PAGE_SIZE * sizeof(uint16_t));
2924    mmio->fv = fv;
2925    mmio->base = base;
2926    memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio,
2927                          NULL, TARGET_PAGE_SIZE);
2928    mmio->iomem.subpage = true;
2929#if defined(DEBUG_SUBPAGE)
2930    printf("%s: %p base " TARGET_FMT_plx " len %08x\n", __func__,
2931           mmio, base, TARGET_PAGE_SIZE);
2932#endif
2933
2934    return mmio;
2935}
2936
2937static uint16_t dummy_section(PhysPageMap *map, FlatView *fv, MemoryRegion *mr)
2938{
2939    assert(fv);
2940    MemoryRegionSection section = {
2941        .fv = fv,
2942        .mr = mr,
2943        .offset_within_address_space = 0,
2944        .offset_within_region = 0,
2945        .size = int128_2_64(),
2946    };
2947
2948    return phys_section_add(map, &section);
2949}
2950
2951MemoryRegionSection *iotlb_to_section(CPUState *cpu,
2952                                      hwaddr index, MemTxAttrs attrs)
2953{
2954    int asidx = cpu_asidx_from_attrs(cpu, attrs);
2955    CPUAddressSpace *cpuas = &cpu->cpu_ases[asidx];
2956    AddressSpaceDispatch *d = atomic_rcu_read(&cpuas->memory_dispatch);
2957    MemoryRegionSection *sections = d->map.sections;
2958
2959    return &sections[index & ~TARGET_PAGE_MASK];
2960}
2961
2962static void io_mem_init(void)
2963{
2964    memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
2965                          NULL, UINT64_MAX);
2966}
2967
2968AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv)
2969{
2970    AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);
2971    uint16_t n;
2972
2973    n = dummy_section(&d->map, fv, &io_mem_unassigned);
2974    assert(n == PHYS_SECTION_UNASSIGNED);
2975
2976    d->phys_map  = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
2977
2978    return d;
2979}
2980
2981void address_space_dispatch_free(AddressSpaceDispatch *d)
2982{
2983    phys_sections_free(&d->map);
2984    g_free(d);
2985}
2986
2987static void do_nothing(CPUState *cpu, run_on_cpu_data d)
2988{
2989}
2990
2991static void tcg_log_global_after_sync(MemoryListener *listener)
2992{
2993    CPUAddressSpace *cpuas;
2994
2995    /* Wait for the CPU to end the current TB.  This avoids the following
2996     * incorrect race:
2997     *
2998     *      vCPU                         migration
2999     *      ----------------------       -------------------------
3000     *      TLB check -> slow path
3001     *        notdirty_mem_write
3002     *          write to RAM
3003     *          mark dirty
3004     *                                   clear dirty flag
3005     *      TLB check -> fast path
3006     *                                   read memory
3007     *        write to RAM
3008     *
3009     * by pushing the migration thread's memory read after the vCPU thread has
3010     * written the memory.
3011     */
3012    if (replay_mode == REPLAY_MODE_NONE) {
3013        /*
3014         * VGA can call this function while updating the screen.
3015         * In record/replay mode that would deadlock, because
3016         * run_on_cpu waits for the rr mutex.  No such race is
3017         * possible under record/replay anyway, so run_on_cpu is only
3018         * needed when record/replay is not enabled.
3019         */
3020        cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
3021        run_on_cpu(cpuas->cpu, do_nothing, RUN_ON_CPU_NULL);
3022    }
3023}
3024
3025static void tcg_commit(MemoryListener *listener)
3026{
3027    CPUAddressSpace *cpuas;
3028    AddressSpaceDispatch *d;
3029
3030    assert(tcg_enabled());
3031    /* since each CPU stores ram addresses in its TLB cache, we must
3032       reset the modified entries */
3033    cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
3034    cpu_reloading_memory_map();
3035    /* The CPU and TLB are protected by the iothread lock.
3036     * We reload the dispatch pointer now because cpu_reloading_memory_map()
3037     * may have split the RCU critical section.
3038     */
3039    d = address_space_to_dispatch(cpuas->as);
3040    atomic_rcu_set(&cpuas->memory_dispatch, d);
3041    tlb_flush(cpuas->cpu);
3042}
3043
3044static void memory_map_init(void)
3045{
3046    system_memory = g_malloc(sizeof(*system_memory));
3047
3048    memory_region_init(system_memory, NULL, "system", UINT64_MAX);
3049    address_space_init(&address_space_memory, system_memory, "memory");
3050
3051    system_io = g_malloc(sizeof(*system_io));
3052    memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io",
3053                          65536);
3054    address_space_init(&address_space_io, system_io, "I/O");
3055}
3056
3057MemoryRegion *get_system_memory(void)
3058{
3059    return system_memory;
3060}
3061
3062MemoryRegion *get_system_io(void)
3063{
3064    return system_io;
3065}
3066
3067#endif /* !defined(CONFIG_USER_ONLY) */
3068
3069/* physical memory access (slow version, mainly for debug) */
3070#if defined(CONFIG_USER_ONLY)
3071int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
3072                        void *ptr, target_ulong len, bool is_write)
3073{
3074    int flags;
3075    target_ulong l, page;
3076    void * p;
3077    uint8_t *buf = ptr;
3078
3079    while (len > 0) {
3080        page = addr & TARGET_PAGE_MASK;
3081        l = (page + TARGET_PAGE_SIZE) - addr;
3082        if (l > len)
3083            l = len;
3084        flags = page_get_flags(page);
3085        if (!(flags & PAGE_VALID))
3086            return -1;
3087        if (is_write) {
3088            if (!(flags & PAGE_WRITE))
3089                return -1;
3090            /* XXX: this code should not depend on lock_user */
3091            if (!(p = lock_user(VERIFY_WRITE, addr, l, 0)))
3092                return -1;
3093            memcpy(p, buf, l);
3094            unlock_user(p, addr, l);
3095        } else {
3096            if (!(flags & PAGE_READ))
3097                return -1;
3098            /* XXX: this code should not depend on lock_user */
3099            if (!(p = lock_user(VERIFY_READ, addr, l, 1)))
3100                return -1;
3101            memcpy(buf, p, l);
3102            unlock_user(p, addr, 0);
3103        }
3104        len -= l;
3105        buf += l;
3106        addr += l;
3107    }
3108    return 0;
3109}
3110
3111void cpu_set_mr(Object *obj, Visitor *v, void *opaque,
3112                const char *name, Error **errp)
3113{
3114}
3115
3116#else
3117
3118void cpu_set_mr(Object *obj, Visitor *v, void *opaque,
3119                const char *name, Error **errp)
3120{
3121    CPUState *cpu = CPU(obj);
3122    Error *local_err = NULL;
3123    char *path = NULL;
3124
3125    visit_type_str(v, name, &path, &local_err);
3126
3127    if (!local_err && strcmp(path, "") != 0) {
3128        cpu->memory = MEMORY_REGION(object_resolve_link(obj, name, path,
3129                                &local_err));
3130    }
3131
3132    if (local_err) {
3133        error_propagate(errp, local_err);
3134        return;
3135    }
3136
3137    object_ref(OBJECT(cpu->memory));
3138    cpu->as = g_malloc0(sizeof(AddressSpace));
3139    address_space_init(cpu->as, cpu->memory, NULL);
3140}
3141
3142static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
3143                                     hwaddr length)
3144{
3145    uint8_t dirty_log_mask = memory_region_get_dirty_log_mask(mr);
3146    addr += memory_region_get_ram_addr(mr);
3147
3148    /* No early return if dirty_log_mask is or becomes 0, because
3149     * cpu_physical_memory_set_dirty_range will still call
3150     * xen_modified_memory.
3151     */
3152    if (dirty_log_mask) {
3153        dirty_log_mask =
3154            cpu_physical_memory_range_includes_clean(addr, length, dirty_log_mask);
3155    }
3156    if (dirty_log_mask & (1 << DIRTY_MEMORY_CODE)) {
3157        assert(tcg_enabled());
3158        tb_invalidate_phys_range(addr, addr + length);
3159        dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
3160    }
3161    cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask);
3162}
3163
3164void memory_region_flush_rom_device(MemoryRegion *mr, hwaddr addr, hwaddr size)
3165{
3166    /*
3167     * In principle this function would work on other memory region types too,
3168     * but the ROM device use case is the only one where this operation is
3169     * necessary.  Other memory regions should use the
3170     * address_space_read/write() APIs.
3171     */
3172    assert(memory_region_is_romd(mr));
3173
3174    invalidate_and_set_dirty(mr, addr, size);
3175}
3176
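/*
 * Clamp an access of length @l at @addr to what region @mr accepts: bound it
 * by the region's maximum access size (4 bytes if unspecified) and, for
 * regions without unaligned support, by the alignment of @addr, then round
 * down to a power of two.  For example (illustrative only), an 8-byte access
 * at an address aligned only to 2 bytes is reduced to a 2-byte access.
 */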
3177static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
3178{
3179    unsigned access_size_max = mr->ops->valid.max_access_size;
3180
3181    /* Regions are assumed to support 1-4 byte accesses unless
3182       otherwise specified.  */
3183    if (access_size_max == 0) {
3184        access_size_max = 4;
3185    }
3186
3187    /* Bound the maximum access by the alignment of the address.  */
3188    if (!mr->ops->impl.unaligned) {
3189        unsigned align_size_max = addr & -addr;
3190        if (align_size_max != 0 && align_size_max < access_size_max) {
3191            access_size_max = align_size_max;
3192        }
3193    }
3194
3195    /* Don't attempt accesses larger than the maximum.  */
3196    if (l > access_size_max) {
3197        l = access_size_max;
3198    }
3199    l = pow2floor(l);
3200
3201    return l;
3202}
3203
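/*
 * Grab the iothread lock for MMIO regions that rely on the global lock and
 * flush the coalesced MMIO buffer if the region requires it.  Returns true
 * when the caller must drop the iothread lock again afterwards.
 */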
3204static bool prepare_mmio_access(MemoryRegion *mr)
3205{
3206    bool unlocked = !qemu_mutex_iothread_locked();
3207    bool release_lock = false;
3208
3209    if (unlocked && mr->global_locking) {
3210        qemu_mutex_lock_iothread();
3211        unlocked = false;
3212        release_lock = true;
3213    }
3214    if (mr->flush_coalesced_mmio) {
3215        if (unlocked) {
3216            qemu_mutex_lock_iothread();
3217        }
3218        qemu_flush_coalesced_mmio_buffer();
3219        if (unlocked) {
3220            qemu_mutex_unlock_iothread();
3221        }
3222    }
3223
3224    return release_lock;
3225}
3226
3227/* Called within RCU critical section.  */
3228static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr,
3229                                           MemTxAttrs attrs,
3230                                           const void *ptr,
3231                                           hwaddr len, hwaddr addr1,
3232                                           hwaddr l, MemoryRegion *mr)
3233{
3234    uint8_t *ram_ptr;
3235    uint64_t val;
3236    MemTxResult result = MEMTX_OK;
3237    bool release_lock = false;
3238    const uint8_t *buf = ptr;
3239
3240    for (;;) {
3241        if (!memory_access_is_direct(mr, true)) {
3242            release_lock |= prepare_mmio_access(mr);
3243            l = memory_access_size(mr, l, addr1);
3244            /* XXX: could force current_cpu to NULL to avoid
3245               potential bugs */
3246            if (l <= 8) {
3247                val = ldn_he_p(buf, l);
3248                result |= memory_region_dispatch_write(mr, addr1, val,
3249                                                   size_memop(l), attrs);
3250            } else {
3251                if (mr->ops->access) {
3252                    MemoryTransaction tr = {
3253                        .data.p8 = (uint8_t *) buf,
3254                        .rw = true,
3255                        .addr = addr1,
3256                        .size = l,
3257                        .attr = attrs,
3258                        .opaque = mr->opaque,
3259                    };
3260                    mr->ops->access(&tr);
3261                } else {
3262                    abort();
3263                }
3264            }
3265        } else {
3266            /* RAM case */
3267            ram_ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
3268            memcpy(ram_ptr, buf, l);
3269            invalidate_and_set_dirty(mr, addr1, l);
3270        }
3271
3272        if (release_lock) {
3273            qemu_mutex_unlock_iothread();
3274            release_lock = false;
3275        }
3276
3277        len -= l;
3278        buf += l;
3279        addr += l;
3280
3281        if (!len) {
3282            break;
3283        }
3284
3285        l = len;
3286        mr = flatview_translate(fv, addr, &addr1, &l, true, attrs);
3287    }
3288
3289    return result;
3290}
3291
3292/* Called from RCU critical section.  */
3293static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
3294                                  const void *buf, hwaddr len)
3295{
3296    hwaddr l;
3297    hwaddr addr1;
3298    MemoryRegion *mr;
3299    MemTxResult result = MEMTX_OK;
3300
3301    l = len;
3302    mr = flatview_translate(fv, addr, &addr1, &l, true, attrs);
3303    result = flatview_write_continue(fv, addr, attrs, buf, len,
3304                                     addr1, l, mr);
3305
3306    return result;
3307}
3308
3309/* Called within RCU critical section.  */
3310MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr,
3311                                   MemTxAttrs attrs, void *ptr,
3312                                   hwaddr len, hwaddr addr1, hwaddr l,
3313                                   MemoryRegion *mr)
3314{
3315    uint8_t *ram_ptr;
3316    uint64_t val;
3317    MemTxResult result = MEMTX_OK;
3318    bool release_lock = false;
3319    uint8_t *buf = ptr;
3320
3321    for (;;) {
3322        if (!memory_access_is_direct(mr, false)) {
3323            /* I/O case */
3324            release_lock |= prepare_mmio_access(mr);
3325            l = memory_access_size(mr, l, addr1);
3326            if (l <= 8) {
3327                result |= memory_region_dispatch_read(mr, addr1, &val,
3328                                                      size_memop(l), attrs);
3329                stn_he_p(buf, l, val);
3330            } else {
3331                if (mr->ops->access) {
3332                    MemoryTransaction tr = {
3333                        .data.p8 = buf,
3334                        .rw = false,
3335                        .addr = addr1,
3336                        .size = l,
3337                        .attr = attrs,
3338                        .opaque = mr->opaque,
3339                    };
3340                    mr->ops->access(&tr);
3341                } else {
3342                    abort();
3343                }
3344            }
3345        } else {
3346            /* RAM case */
3347            ram_ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
3348            memcpy(buf, ram_ptr, l);
3349        }
3350
3351        if (release_lock) {
3352            qemu_mutex_unlock_iothread();
3353            release_lock = false;
3354        }
3355
3356        len -= l;
3357        buf += l;
3358        addr += l;
3359
3360        if (!len) {
3361            break;
3362        }
3363
3364        l = len;
3365        mr = flatview_translate(fv, addr, &addr1, &l, false, attrs);
3366    }
3367
3368    return result;
3369}
3370
3371/* Called from RCU critical section.  */
3372static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
3373                                 MemTxAttrs attrs, void *buf, hwaddr len)
3374{
3375    hwaddr l;
3376    hwaddr addr1;
3377    MemoryRegion *mr;
3378
3379    l = len;
3380    mr = flatview_translate(fv, addr, &addr1, &l, false, attrs);
3381    return flatview_read_continue(fv, addr, attrs, buf, len,
3382                                  addr1, l, mr);
3383}
3384
3385MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
3386                                    MemTxAttrs attrs, void *buf, hwaddr len)
3387{
3388    MemTxResult result = MEMTX_OK;
3389    FlatView *fv;
3390
3391    if (len > 0) {
3392        RCU_READ_LOCK_GUARD();
3393        fv = address_space_to_flatview(as);
3394        result = flatview_read(fv, addr, attrs, buf, len);
3395    }
3396
3397    return result;
3398}
3399
3400MemTxResult address_space_write(AddressSpace *as, hwaddr addr,
3401                                MemTxAttrs attrs,
3402                                const void *buf, hwaddr len)
3403{
3404    MemTxResult result = MEMTX_OK;
3405    FlatView *fv;
3406
3407    if (len > 0) {
3408        RCU_READ_LOCK_GUARD();
3409        fv = address_space_to_flatview(as);
3410        result = flatview_write(fv, addr, attrs, buf, len);
3411    }
3412
3413    return result;
3414}
3415
3416MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
3417                             void *buf, hwaddr len, bool is_write)
3418{
3419    if (is_write) {
3420        return address_space_write(as, addr, attrs, buf, len);
3421    } else {
3422        return address_space_read_full(as, addr, attrs, buf, len);
3423    }
3424}
3425
3426void cpu_physical_memory_rw(hwaddr addr, void *buf,
3427                            hwaddr len, bool is_write)
3428{
3429    address_space_rw(&address_space_memory, addr, MEMTXATTRS_UNSPECIFIED,
3430                     buf, len, is_write);
3431}
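
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * how a device model might use the address_space_read()/address_space_write()
 * wrappers above. The helper name and guest addresses are hypothetical; the
 * declarations it needs all come from headers this file already includes.
 */
static MemTxResult example_dma_word_copy(AddressSpace *as, hwaddr src,
                                         hwaddr dst)
{
    uint32_t word;
    MemTxResult res;

    /* Read four bytes of guest memory; MEMTX_OK signals success. */
    res = address_space_read(as, src, MEMTXATTRS_UNSPECIFIED,
                             &word, sizeof(word));
    if (res != MEMTX_OK) {
        return res;
    }
    /* Write them back at a different guest-physical address. */
    return address_space_write(as, dst, MEMTXATTRS_UNSPECIFIED,
                               &word, sizeof(word));
}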
3432
3433enum write_rom_type {
3434    WRITE_DATA,
3435    FLUSH_CACHE,
3436};
3437
3438static inline MemTxResult address_space_write_rom_internal(AddressSpace *as,
3439                                                           hwaddr addr,
3440                                                           MemTxAttrs attrs,
3441                                                           const void *ptr,
3442                                                           hwaddr len,
3443                                                           enum write_rom_type type)
3444{
3445    hwaddr l;
3446    uint8_t *ram_ptr;
3447    hwaddr addr1;
3448    MemoryRegion *mr;
3449    const uint8_t *buf = ptr;
3450
3451    RCU_READ_LOCK_GUARD();
3452    while (len > 0) {
3453        l = len;
3454        mr = address_space_translate(as, addr, &addr1, &l, true, attrs);
3455
3456        if (!(memory_region_is_ram(mr) ||
3457              memory_region_is_romd(mr))) {
3458            if (type == WRITE_DATA) {
3459                address_space_rw(as, addr, MEMTXATTRS_UNSPECIFIED,
3460                                 (uint8_t *) buf, l, true);
3461            } else {
3462                l = memory_access_size(mr, l, addr1);
3463            }
3464        } else {
3465            /* ROM/RAM case */
3466            ram_ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
3467            switch (type) {
3468            case WRITE_DATA:
3469                memcpy(ram_ptr, buf, l);
3470                invalidate_and_set_dirty(mr, addr1, l);
3471                break;
3472            case FLUSH_CACHE:
3473                flush_icache_range((uintptr_t)ram_ptr, (uintptr_t)ram_ptr + l);
3474                break;
3475            }
3476        }
3477        len -= l;
3478        buf += l;
3479        addr += l;
3480    }
3481    return MEMTX_OK;
3482}
3483
3484/* Used for ROM loading: can write to both RAM and ROM. */
3485MemTxResult address_space_write_rom(AddressSpace *as, hwaddr addr,
3486                                    MemTxAttrs attrs,
3487                                    const void *buf, hwaddr len)
3488{
3489    return address_space_write_rom_internal(as, addr, attrs,
3490                                            buf, len, WRITE_DATA);
3491}
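
/*
 * Illustrative sketch (editor's addition, not in the original source): using
 * address_space_write_rom() the way a firmware loader might, so the bytes
 * land even in regions the guest sees as read-only. The blob and its load
 * address are made up; real board code normally goes through the ROM loader
 * in hw/core/loader.c rather than calling this directly.
 */
static void example_load_firmware_blob(void)
{
    static const uint8_t blob[] = { 0xde, 0xad, 0xbe, 0xef };

    address_space_write_rom(&address_space_memory, 0x0,
                            MEMTXATTRS_UNSPECIFIED, blob, sizeof(blob));
}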
3492
3493void cpu_flush_icache_range(hwaddr start, hwaddr len)
3494{
3495    /*
3496     * This function should do the same thing as an icache flush that was
3497     * triggered from within the guest. For TCG we are always cache coherent,
3498     * so there is no need to flush anything. For KVM / Xen we need to flush
3499     * the host's instruction cache at least.
3500     */
3501    if (tcg_enabled()) {
3502        return;
3503    }
3504
3505    address_space_write_rom_internal(&address_space_memory,
3506                                     start, MEMTXATTRS_UNSPECIFIED,
3507                                     NULL, len, FLUSH_CACHE);
3508}
3509
3510typedef struct {
3511    MemoryRegion *mr;
3512    void *buffer;
3513    hwaddr addr;
3514    hwaddr len;
3515    bool in_use;
3516} BounceBuffer;
3517
3518static BounceBuffer bounce;
3519
3520typedef struct MapClient {
3521    QEMUBH *bh;
3522    QLIST_ENTRY(MapClient) link;
3523} MapClient;
3524
3525QemuMutex map_client_list_lock;
3526static QLIST_HEAD(, MapClient) map_client_list
3527    = QLIST_HEAD_INITIALIZER(map_client_list);
3528
3529static void cpu_unregister_map_client_do(MapClient *client)
3530{
3531    QLIST_REMOVE(client, link);
3532    g_free(client);
3533}
3534
3535static void cpu_notify_map_clients_locked(void)
3536{
3537    MapClient *client;
3538
3539    while (!QLIST_EMPTY(&map_client_list)) {
3540        client = QLIST_FIRST(&map_client_list);
3541        qemu_bh_schedule(client->bh);
3542        cpu_unregister_map_client_do(client);
3543    }
3544}
3545
3546void cpu_register_map_client(QEMUBH *bh)
3547{
3548    MapClient *client = g_malloc(sizeof(*client));
3549
3550    qemu_mutex_lock(&map_client_list_lock);
3551    client->bh = bh;
3552    QLIST_INSERT_HEAD(&map_client_list, client, link);
3553    if (!atomic_read(&bounce.in_use)) {
3554        cpu_notify_map_clients_locked();
3555    }
3556    qemu_mutex_unlock(&map_client_list_lock);
3557}
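
/*
 * Illustrative sketch (editor's addition, not part of the original file): a
 * caller whose address_space_map() attempt returned NULL because the single
 * bounce buffer was busy can register a bottom half to be told when a retry
 * is worthwhile. The example_* names are hypothetical.
 */
static void example_retry_map_bh(void *opaque)
{
    /* Retry the address_space_map() call here; the client has already
     * been unregistered by the time this bottom half runs. */
}

static void example_wait_for_bounce_buffer(void)
{
    QEMUBH *bh = qemu_bh_new(example_retry_map_bh, NULL);

    cpu_register_map_client(bh);
}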
3558
3559void cpu_exec_init_all(void)
3560{
3561    qemu_mutex_init(&ram_list.mutex);
3562    /* The data structures we set up here depend on knowing the page size,
3563     * so no more changes can be made after this point.
3564     * In an ideal world, nothing we did before we had finished the
3565     * machine setup would care about the target page size, and we could
3566     * do this much later, rather than requiring board models to state
3567     * up front what their requirements are.
3568     */
3569    finalize_target_page_bits();
3570    io_mem_init();
3571    memory_map_init();
3572    qemu_mutex_init(&map_client_list_lock);
3573}
3574
3575void cpu_unregister_map_client(QEMUBH *bh)
3576{
3577    MapClient *client;
3578
3579    qemu_mutex_lock(&map_client_list_lock);
3580    QLIST_FOREACH(client, &map_client_list, link) {
3581        if (client->bh == bh) {
3582            cpu_unregister_map_client_do(client);
3583            break;
3584        }
3585    }
3586    qemu_mutex_unlock(&map_client_list_lock);
3587}
3588
3589static void cpu_notify_map_clients(void)
3590{
3591    qemu_mutex_lock(&map_client_list_lock);
3592    cpu_notify_map_clients_locked();
3593    qemu_mutex_unlock(&map_client_list_lock);
3594}
3595
3596static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len,
3597                                  bool is_write, MemTxAttrs attrs)
3598{
3599    MemoryRegion *mr;
3600    hwaddr l, xlat;
3601
3602    while (len > 0) {
3603        l = len;
3604        mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
3605        if (!memory_access_is_direct(mr, is_write)) {
3606            l = memory_access_size(mr, l, addr);
3607            if (!memory_region_access_valid(mr, xlat, l, is_write, attrs)) {
3608                return false;
3609            }
3610        }
3611
3612        len -= l;
3613        addr += l;
3614    }
3615    return true;
3616}
3617
3618bool address_space_access_valid(AddressSpace *as, hwaddr addr,
3619                                hwaddr len, bool is_write,
3620                                MemTxAttrs attrs)
3621{
3622    FlatView *fv;
3623    bool result;
3624
3625    RCU_READ_LOCK_GUARD();
3626    fv = address_space_to_flatview(as);
3627    result = flatview_access_valid(fv, addr, len, is_write, attrs);
3628    return result;
3629}
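
/*
 * Illustrative sketch (editor's addition, not in the original source): a DMA
 * controller model can probe a transfer window with
 * address_space_access_valid() before starting, and report a device-level
 * error instead of discovering a bad mapping partway through the transfer.
 * The helper name is hypothetical.
 */
static bool example_dma_window_is_writable(AddressSpace *as, hwaddr base,
                                           hwaddr size)
{
    return address_space_access_valid(as, base, size, true,
                                      MEMTXATTRS_UNSPECIFIED);
}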
3630
3631static hwaddr
3632flatview_extend_translation(FlatView *fv, hwaddr addr,
3633                            hwaddr target_len,
3634                            MemoryRegion *mr, hwaddr base, hwaddr len,
3635                            bool is_write, MemTxAttrs attrs)
3636{
3637    hwaddr done = 0;
3638    hwaddr xlat;
3639    MemoryRegion *this_mr;
3640
3641    for (;;) {
3642        target_len -= len;
3643        addr += len;
3644        done += len;
3645        if (target_len == 0) {
3646            return done;
3647        }
3648
3649        len = target_len;
3650        this_mr = flatview_translate(fv, addr, &xlat,
3651                                     &len, is_write, attrs);
3652        if (this_mr != mr || xlat != base + done) {
3653            return done;
3654        }
3655    }
3656}
3657
3658/* Map a physical memory region into a host virtual address.
3659 * May map a subset of the requested range, given by and returned in *plen.
3660 * May return NULL if resources needed to perform the mapping are exhausted.
3661 * Use only for reads OR writes - not for read-modify-write operations.
3662 * Use cpu_register_map_client() to know when retrying the map operation is
3663 * likely to succeed.
3664 */
3665void *address_space_map(AddressSpace *as,
3666                        hwaddr addr,
3667                        hwaddr *plen,
3668                        bool is_write,
3669                        MemTxAttrs attrs)
3670{
3671    hwaddr len = *plen;
3672    hwaddr l, xlat;
3673    MemoryRegion *mr;
3674    void *ptr;
3675    FlatView *fv;
3676
3677    if (len == 0) {
3678        return NULL;
3679    }
3680
3681    l = len;
3682    RCU_READ_LOCK_GUARD();
3683    fv = address_space_to_flatview(as);
3684    mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
3685
3686    if (!memory_access_is_direct(mr, is_write)) {
3687        if (atomic_xchg(&bounce.in_use, true)) {
3688            return NULL;
3689        }
3690        /* Avoid unbounded allocations */
3691        l = MIN(l, TARGET_PAGE_SIZE);
3692        bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
3693        bounce.addr = addr;
3694        bounce.len = l;
3695
3696        memory_region_ref(mr);
3697        bounce.mr = mr;
3698        if (!is_write) {
3699            flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED,
3700                               bounce.buffer, l);
3701        }
3702
3703        *plen = l;
3704        return bounce.buffer;
3705    }
3706
3707
3708    memory_region_ref(mr);
3709    *plen = flatview_extend_translation(fv, addr, len, mr, xlat,
3710                                        l, is_write, attrs);
3711    ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen, true);
3712
3713    return ptr;
3714}
3715
3716/* Unmaps a memory region previously mapped by address_space_map().
3717 * Will also mark the memory as dirty if is_write is true.  access_len gives
3718 * the amount of memory that was actually read or written by the caller.
3719 */
3720void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
3721                         bool is_write, hwaddr access_len)
3722{
3723    if (buffer != bounce.buffer) {
3724        MemoryRegion *mr;
3725        ram_addr_t addr1;
3726
3727        mr = memory_region_from_host(buffer, &addr1);
3728        assert(mr != NULL);
3729        if (is_write) {
3730            invalidate_and_set_dirty(mr, addr1, access_len);
3731        }
3732        if (xen_enabled()) {
3733            xen_invalidate_map_cache_entry(buffer);
3734        }
3735        memory_region_unref(mr);
3736        return;
3737    }
3738    if (is_write) {
3739        address_space_write(as, bounce.addr, MEMTXATTRS_UNSPECIFIED,
3740                            bounce.buffer, access_len);
3741    }
3742    qemu_vfree(bounce.buffer);
3743    bounce.buffer = NULL;
3744    memory_region_unref(bounce.mr);
3745    atomic_mb_set(&bounce.in_use, false);
3746    cpu_notify_map_clients();
3747}
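
/*
 * Illustrative sketch (editor's addition, not part of the original file): the
 * usual map/modify/unmap pattern built on the two functions above. The
 * mapping may cover less than the requested length, and may fail outright
 * while the bounce buffer is in use. The helper name is hypothetical.
 */
static void example_zero_guest_buffer(AddressSpace *as, hwaddr addr,
                                      hwaddr len)
{
    hwaddr mapped = len;
    void *host = address_space_map(as, addr, &mapped, true,
                                   MEMTXATTRS_UNSPECIFIED);

    if (!host) {
        return; /* resources exhausted; see cpu_register_map_client() */
    }
    memset(host, 0, mapped);    /* direct access through the host pointer */
    address_space_unmap(as, host, mapped, true, mapped);
}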
3748
3749void *cpu_physical_memory_map(hwaddr addr,
3750                              hwaddr *plen,
3751                              bool is_write)
3752{
3753    return address_space_map(&address_space_memory, addr, plen, is_write,
3754                             MEMTXATTRS_UNSPECIFIED);
3755}
3756
3757void cpu_physical_memory_unmap(void *buffer, hwaddr len,
3758                               bool is_write, hwaddr access_len)
3759{
3760    return address_space_unmap(&address_space_memory, buffer, len, is_write, access_len);
3761}
3762
3763#define ARG1_DECL                AddressSpace *as
3764#define ARG1                     as
3765#define SUFFIX
3766#define TRANSLATE(...)           address_space_translate(as, __VA_ARGS__)
3767#define RCU_READ_LOCK(...)       rcu_read_lock()
3768#define RCU_READ_UNLOCK(...)     rcu_read_unlock()
3769#include "memory_ldst.inc.c"
3770
3771int64_t address_space_cache_init(MemoryRegionCache *cache,
3772                                 AddressSpace *as,
3773                                 hwaddr addr,
3774                                 hwaddr len,
3775                                 bool is_write)
3776{
3777    AddressSpaceDispatch *d;
3778    hwaddr l;
3779    MemoryRegion *mr;
3780
3781    assert(len > 0);
3782
3783    l = len;
3784    cache->fv = address_space_get_flatview(as);
3785    d = flatview_to_dispatch(cache->fv);
3786    cache->mrs = *address_space_translate_internal(d, addr, &cache->xlat, &l, true);
3787
3788    mr = cache->mrs.mr;
3789    memory_region_ref(mr);
3790    if (memory_access_is_direct(mr, is_write)) {
3791        /* We don't care about the memory attributes here as we're only
3792         * doing this if we found actual RAM, which behaves the same
3793         * regardless of attributes; so UNSPECIFIED is fine.
3794         */
3795        l = flatview_extend_translation(cache->fv, addr, len, mr,
3796                                        cache->xlat, l, is_write,
3797                                        MEMTXATTRS_UNSPECIFIED);
3798        cache->ptr = qemu_ram_ptr_length(mr->ram_block, cache->xlat, &l, true);
3799    } else {
3800        cache->ptr = NULL;
3801    }
3802
3803    cache->len = l;
3804    cache->is_write = is_write;
3805    return l;
3806}
3807
3808void address_space_cache_invalidate(MemoryRegionCache *cache,
3809                                    hwaddr addr,
3810                                    hwaddr access_len)
3811{
3812    assert(cache->is_write);
3813    if (likely(cache->ptr)) {
3814        invalidate_and_set_dirty(cache->mrs.mr, addr + cache->xlat, access_len);
3815    }
3816}
3817
3818void address_space_cache_destroy(MemoryRegionCache *cache)
3819{
3820    if (!cache->mrs.mr) {
3821        return;
3822    }
3823
3824    if (xen_enabled()) {
3825        xen_invalidate_map_cache_entry(cache->ptr);
3826    }
3827    memory_region_unref(cache->mrs.mr);
3828    flatview_unref(cache->fv);
3829    cache->mrs.mr = NULL;
3830    cache->fv = NULL;
3831}
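
/*
 * Illustrative sketch (editor's addition, not in the original source): how
 * virtio-style code uses a MemoryRegionCache to pay the translation cost of
 * a guest memory window once and then issue many cheap accesses into it.
 * The window size and offsets are made-up values.
 */
static void example_cached_read(AddressSpace *as, hwaddr ring_base)
{
    MemoryRegionCache cache;
    uint16_t idx;
    int64_t cached;

    /* Translate (and, for RAM, map) the window once... */
    cached = address_space_cache_init(&cache, as, ring_base, 64, false);
    if (cached >= (int64_t)sizeof(idx)) {
        /* ...then read from it as often as needed, at offsets relative
         * to ring_base. */
        address_space_read_cached(&cache, 0, &idx, sizeof(idx));
    }
    address_space_cache_destroy(&cache);
}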
3832
3833/* Called from RCU critical section.  This function has the same
3834 * semantics as address_space_translate, but it only works on a
3835 * predefined range of a MemoryRegion that was mapped with
3836 * address_space_cache_init.
3837 */
3838static inline MemoryRegion *address_space_translate_cached(
3839    MemoryRegionCache *cache, hwaddr addr, hwaddr *xlat,
3840    hwaddr *plen, bool is_write, MemTxAttrs attrs)
3841{
3842    MemoryRegionSection section;
3843    MemoryRegion *mr;
3844    IOMMUMemoryRegion *iommu_mr;
3845    AddressSpace *target_as;
3846
3847    assert(!cache->ptr);
3848    *xlat = addr + cache->xlat;
3849
3850    mr = cache->mrs.mr;
3851    iommu_mr = memory_region_get_iommu(mr);
3852    if (!iommu_mr) {
3853        /* MMIO region.  */
3854        return mr;
3855    }
3856
3857    section = address_space_translate_iommu(iommu_mr, xlat, plen,
3858                                            NULL, is_write, true,
3859                                            &target_as, attrs);
3860    return section.mr;
3861}
3862
3863/* Called from RCU critical section. address_space_read_cached uses this
3864 * out of line function when the target is an MMIO or IOMMU region.
3865 */
3866void
3867address_space_read_cached_slow(MemoryRegionCache *cache, hwaddr addr,
3868                                   void *buf, hwaddr len)
3869{
3870    hwaddr addr1, l;
3871    MemoryRegion *mr;
3872
3873    l = len;
3874    mr = address_space_translate_cached(cache, addr, &addr1, &l, false,
3875                                        MEMTXATTRS_UNSPECIFIED);
3876    flatview_read_continue(cache->fv,
3877                           addr, MEMTXATTRS_UNSPECIFIED, buf, len,
3878                           addr1, l, mr);
3879}
3880
3881/* Called from RCU critical section. address_space_write_cached uses this
3882 * out of line function when the target is an MMIO or IOMMU region.
3883 */
3884void
3885address_space_write_cached_slow(MemoryRegionCache *cache, hwaddr addr,
3886                                    const void *buf, hwaddr len)
3887{
3888    hwaddr addr1, l;
3889    MemoryRegion *mr;
3890
3891    l = len;
3892    mr = address_space_translate_cached(cache, addr, &addr1, &l, true,
3893                                        MEMTXATTRS_UNSPECIFIED);
3894    flatview_write_continue(cache->fv,
3895                            addr, MEMTXATTRS_UNSPECIFIED, buf, len,
3896                            addr1, l, mr);
3897}
3898
3899#define ARG1_DECL                MemoryRegionCache *cache
3900#define ARG1                     cache
3901#define SUFFIX                   _cached_slow
3902#define TRANSLATE(...)           address_space_translate_cached(cache, __VA_ARGS__)
3903#define RCU_READ_LOCK()          ((void)0)
3904#define RCU_READ_UNLOCK()        ((void)0)
3905#include "memory_ldst.inc.c"
3906
3907/* virtual memory access for debug (includes writing to ROM) */
3908int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
3909                        void *ptr, target_ulong len, bool is_write)
3910{
3911    hwaddr phys_addr;
3912    target_ulong l, page;
3913    uint8_t *buf = ptr;
3914
3915    cpu_synchronize_state(cpu);
3916    while (len > 0) {
3917        int asidx;
3918        MemTxAttrs attrs;
3919
3920        page = addr & TARGET_PAGE_MASK;
3921        phys_addr = cpu_get_phys_page_attrs_debug(cpu, page, &attrs);
3922        asidx = cpu_asidx_from_attrs(cpu, attrs);
3923        /* if no physical page mapped, return an error */
3924        if (phys_addr == -1)
3925            return -1;
3926        l = (page + TARGET_PAGE_SIZE) - addr;
3927        if (l > len)
3928            l = len;
3929        phys_addr += (addr & ~TARGET_PAGE_MASK);
3930        if (is_write) {
3931            address_space_write_rom(cpu->cpu_ases[asidx].as, phys_addr,
3932                                    attrs, buf, l);
3933        } else {
3934            address_space_read(cpu->cpu_ases[asidx].as, phys_addr, attrs, buf,
3935                               l);
3936        }
3937        len -= l;
3938        buf += l;
3939        addr += l;
3940    }
3941    return 0;
3942}
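
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * reading guest *virtual* memory the way the gdbstub or the monitor does.
 * Using first_cpu and a caller-supplied virtual address is only an example.
 */
static void example_peek_guest_virtual(target_ulong vaddr)
{
    uint8_t bytes[16];

    if (cpu_memory_rw_debug(first_cpu, vaddr, bytes, sizeof(bytes), false)) {
        /* non-zero return: no physical page is mapped at this address */
    }
}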
3943
3944/*
3945 * Allows code that needs to deal with migration bitmaps, etc. to still be
3946 * built target-independent.
3947 */
3948size_t qemu_target_page_size(void)
3949{
3950    return TARGET_PAGE_SIZE;
3951}
3952
3953int qemu_target_page_bits(void)
3954{
3955    return TARGET_PAGE_BITS;
3956}
3957
3958int qemu_target_page_bits_min(void)
3959{
3960    return TARGET_PAGE_BITS_MIN;
3961}
3962#endif
3963
3964bool target_words_bigendian(void)
3965{
3966#if defined(TARGET_WORDS_BIGENDIAN)
3967    return true;
3968#else
3969    return false;
3970#endif
3971}
3972
3973#ifndef CONFIG_USER_ONLY
3974bool cpu_physical_memory_is_io(hwaddr phys_addr)
3975{
3976    MemoryRegion *mr;
3977    hwaddr l = 1;
3978    bool res;
3979
3980    RCU_READ_LOCK_GUARD();
3981    mr = address_space_translate(&address_space_memory,
3982                                 phys_addr, &phys_addr, &l, false,
3983                                 MEMTXATTRS_UNSPECIFIED);
3984
3985    res = !(memory_region_is_ram(mr) || memory_region_is_romd(mr));
3986    return res;
3987}
3988
3989int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
3990{
3991    RAMBlock *block;
3992    int ret = 0;
3993
3994    RCU_READ_LOCK_GUARD();
3995    RAMBLOCK_FOREACH(block) {
3996        ret = func(block, opaque);
3997        if (ret) {
3998            break;
3999        }
4000    }
4001    return ret;
4002}
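
/*
 * Illustrative sketch (editor's addition, not in the original source): a
 * RAMBlockIterFunc callback that totals guest RAM. RAMBlock's fields are
 * visible here because this file includes exec/ram_addr.h; the example_*
 * names are hypothetical.
 */
static int example_sum_block(RAMBlock *rb, void *opaque)
{
    uint64_t *total = opaque;

    *total += rb->used_length;
    return 0;   /* a non-zero return would stop the iteration early */
}

static uint64_t example_total_ram(void)
{
    uint64_t total = 0;

    qemu_ram_foreach_block(example_sum_block, &total);
    return total;
}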
4003
4004/*
4005 * Unmap pages of memory from start to start+length such that
4006 * they a) read as 0, b) trigger whatever fault mechanism
4007 * the OS provides for postcopy.
4008 * The pages must be unmapped by the end of the function.
4009 * Returns: 0 on success, non-zero on failure
4010 *
4011 */
4012int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
4013{
4014    int ret = -1;
4015
4016    uint8_t *host_startaddr = rb->host + start;
4017
4018    if (!QEMU_PTR_IS_ALIGNED(host_startaddr, rb->page_size)) {
4019        error_report("ram_block_discard_range: Unaligned start address: %p",
4020                     host_startaddr);
4021        goto err;
4022    }
4023
4024    if ((start + length) <= rb->used_length) {
4025        bool need_madvise, need_fallocate;
4026        if (!QEMU_IS_ALIGNED(length, rb->page_size)) {
4027            error_report("ram_block_discard_range: Unaligned length: %zx",
4028                         length);
4029            goto err;
4030        }
4031
4032        errno = ENOTSUP; /* If we are missing MADVISE etc */
4033
4034        /* The logic here is messy:
4035         *    madvise DONTNEED fails for hugepages;
4036         *    fallocate works on hugepages and shmem.
4037         */
4038        need_madvise = (rb->page_size == qemu_host_page_size);
4039        need_fallocate = rb->fd != -1;
4040        if (need_fallocate) {
4041            /* For a file, this causes the area of the file to be zeroed
4042             * when read, and for hugetlbfs also causes it to be unmapped
4043             * so a userfault will trigger.
4044             */
4045#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
4046            ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
4047                            start, length);
4048            if (ret) {
4049                ret = -errno;
4050                error_report("ram_block_discard_range: Failed to fallocate "
4051                             "%s:%" PRIx64 " +%zx (%d)",
4052                             rb->idstr, start, length, ret);
4053                goto err;
4054            }
4055#else
4056            ret = -ENOSYS;
4057            error_report("ram_block_discard_range: fallocate not available/file "
4058                         "%s:%" PRIx64 " +%zx (%d)",
4059                         rb->idstr, start, length, ret);
4060            goto err;
4061#endif
4062        }
4063        if (need_madvise) {
4064            /* For normal RAM this causes it to be unmapped,
4065             * for shared memory it causes the local mapping to disappear
4066             * and to fall back on the file contents (which we just
4067             * fallocate'd away).
4068             */
4069#if defined(CONFIG_MADVISE)
4070            ret = madvise(host_startaddr, length, MADV_DONTNEED);
4071            if (ret) {
4072                ret = -errno;
4073                error_report("ram_block_discard_range: Failed to discard range "
4074                             "%s:%" PRIx64 " +%zx (%d)",
4075                             rb->idstr, start, length, ret);
4076                goto err;
4077            }
4078#else
4079            ret = -ENOSYS;
4080            error_report("ram_block_discard_range: MADVISE not available "
4081                         "%s:%" PRIx64 " +%zx (%d)",
4082                         rb->idstr, start, length, ret);
4083            goto err;
4084#endif
4085        }
4086        trace_ram_block_discard_range(rb->idstr, host_startaddr, length,
4087                                      need_madvise, need_fallocate, ret);
4088    } else {
4089        error_report("ram_block_discard_range: Overrun block '%s' (%" PRIu64
4090                     "/%zx/" RAM_ADDR_FMT")",
4091                     rb->idstr, start, length, rb->used_length);
4092    }
4093
4094err:
4095    return ret;
4096}
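
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * postcopy-style code discards a page-aligned chunk of a block so that a
 * later guest access faults and can be filled from the migration stream.
 * Discarding only the first page of a block is purely an example.
 */
static int example_discard_first_page(RAMBlock *rb)
{
    return ram_block_discard_range(rb, 0, rb->page_size);
}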
4097
4098bool ramblock_is_pmem(RAMBlock *rb)
4099{
4100    return rb->flags & RAM_PMEM;
4101}
4102
4103#endif
4104
4105void page_size_init(void)
4106{
4107    /* NOTE: we can always suppose that qemu_host_page_size >=
4108       TARGET_PAGE_SIZE */
4109    if (qemu_host_page_size == 0) {
4110        qemu_host_page_size = qemu_real_host_page_size;
4111    }
4112    if (qemu_host_page_size < TARGET_PAGE_SIZE) {
4113        qemu_host_page_size = TARGET_PAGE_SIZE;
4114    }
4115    qemu_host_page_mask = -(intptr_t)qemu_host_page_size;
4116}
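
/*
 * Worked example (editor's note, not in the original file): with 4 KiB host
 * pages, qemu_host_page_size == 0x1000 and qemu_host_page_mask ==
 * -(intptr_t)0x1000, i.e. ...fffff000. So (addr & qemu_host_page_mask)
 * rounds addr down to a host-page boundary, while
 * (addr & ~qemu_host_page_mask) is the offset within that page.
 */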
4117
4118#if !defined(CONFIG_USER_ONLY)
4119
4120static void mtree_print_phys_entries(int start, int end, int skip, int ptr)
4121{
4122    if (start == end - 1) {
4123        qemu_printf("\t%3d      ", start);
4124    } else {
4125        qemu_printf("\t%3d..%-3d ", start, end - 1);
4126    }
4127    qemu_printf(" skip=%d ", skip);
4128    if (ptr == PHYS_MAP_NODE_NIL) {
4129        qemu_printf(" ptr=NIL");
4130    } else if (!skip) {
4131        qemu_printf(" ptr=#%d", ptr);
4132    } else {
4133        qemu_printf(" ptr=[%d]", ptr);
4134    }
4135    qemu_printf("\n");
4136}
4137
4138#define MR_SIZE(size) (int128_nz(size) ? (hwaddr)int128_get64( \
4139                           int128_sub((size), int128_one())) : 0)
4140
4141void mtree_print_dispatch(AddressSpaceDispatch *d, MemoryRegion *root)
4142{
4143    int i;
4144
4145    qemu_printf("  Dispatch\n");
4146    qemu_printf("    Physical sections\n");
4147
4148    for (i = 0; i < d->map.sections_nb; ++i) {
4149        MemoryRegionSection *s = d->map.sections + i;
4150        const char *names[] = { " [unassigned]", " [not dirty]",
4151                                " [ROM]", " [watch]" };
4152
4153        qemu_printf("      #%d @" TARGET_FMT_plx ".." TARGET_FMT_plx
4154                    " %s%s%s%s%s",
4155            i,
4156            s->offset_within_address_space,
4157            s->offset_within_address_space + MR_SIZE(s->mr->size),
4158            s->mr->name ? s->mr->name : "(noname)",
4159            i < ARRAY_SIZE(names) ? names[i] : "",
4160            s->mr == root ? " [ROOT]" : "",
4161            s == d->mru_section ? " [MRU]" : "",
4162            s->mr->is_iommu ? " [iommu]" : "");
4163
4164        if (s->mr->alias) {
4165            qemu_printf(" alias=%s", s->mr->alias->name ?
4166                    s->mr->alias->name : "noname");
4167        }
4168        qemu_printf("\n");
4169    }
4170
4171    qemu_printf("    Nodes (%d bits per level, %d levels) ptr=[%d] skip=%d\n",
4172               P_L2_BITS, P_L2_LEVELS, d->phys_map.ptr, d->phys_map.skip);
4173    for (i = 0; i < d->map.nodes_nb; ++i) {
4174        int j, jprev;
4175        PhysPageEntry prev;
4176        Node *n = d->map.nodes + i;
4177
4178        qemu_printf("      [%d]\n", i);
4179
4180        for (j = 0, jprev = 0, prev = *n[0]; j < ARRAY_SIZE(*n); ++j) {
4181            PhysPageEntry *pe = *n + j;
4182
4183            if (pe->ptr == prev.ptr && pe->skip == prev.skip) {
4184                continue;
4185            }
4186
4187            mtree_print_phys_entries(jprev, j, prev.skip, prev.ptr);
4188
4189            jprev = j;
4190            prev = *pe;
4191        }
4192
4193        if (jprev != ARRAY_SIZE(*n)) {
4194            mtree_print_phys_entries(jprev, j, prev.skip, prev.ptr);
4195        }
4196    }
4197}
4198
4199#endif
4200