qemu/block.c
<<
>>
Prefs
   1/*
   2 * QEMU System Emulator block driver
   3 *
   4 * Copyright (c) 2003 Fabrice Bellard
   5 * Copyright (c) 2020 Virtuozzo International GmbH.
   6 *
   7 * Permission is hereby granted, free of charge, to any person obtaining a copy
   8 * of this software and associated documentation files (the "Software"), to deal
   9 * in the Software without restriction, including without limitation the rights
  10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 * copies of the Software, and to permit persons to whom the Software is
  12 * furnished to do so, subject to the following conditions:
  13 *
  14 * The above copyright notice and this permission notice shall be included in
  15 * all copies or substantial portions of the Software.
  16 *
  17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  23 * THE SOFTWARE.
  24 */
  25
  26#include "qemu/osdep.h"
  27#include "block/trace.h"
  28#include "block/block_int.h"
  29#include "block/blockjob.h"
  30#include "block/dirty-bitmap.h"
  31#include "block/fuse.h"
  32#include "block/nbd.h"
  33#include "block/qdict.h"
  34#include "qemu/error-report.h"
  35#include "block/module_block.h"
  36#include "qemu/main-loop.h"
  37#include "qemu/module.h"
  38#include "qapi/error.h"
  39#include "qobject/qdict.h"
  40#include "qobject/qjson.h"
  41#include "qobject/qnull.h"
  42#include "qobject/qstring.h"
  43#include "qapi/qobject-output-visitor.h"
  44#include "qapi/qapi-visit-block-core.h"
  45#include "system/block-backend.h"
  46#include "qemu/notify.h"
  47#include "qemu/option.h"
  48#include "qemu/coroutine.h"
  49#include "block/qapi.h"
  50#include "qemu/timer.h"
  51#include "qemu/cutils.h"
  52#include "qemu/id.h"
  53#include "qemu/range.h"
  54#include "qemu/rcu.h"
  55#include "block/coroutines.h"
  56
  57#ifdef CONFIG_BSD
  58#include <sys/ioctl.h>
  59#include <sys/queue.h>
  60#if defined(HAVE_SYS_DISK_H)
  61#include <sys/disk.h>
  62#endif
  63#endif
  64
  65#ifdef _WIN32
  66#include <windows.h>
  67#endif
  68
  69#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
  70
  71/* Protected by BQL */
  72static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
  73    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
  74
  75/* Protected by BQL */
  76static QTAILQ_HEAD(, BlockDriverState) all_bdrv_states =
  77    QTAILQ_HEAD_INITIALIZER(all_bdrv_states);
  78
  79/* Protected by BQL */
  80static QLIST_HEAD(, BlockDriver) bdrv_drivers =
  81    QLIST_HEAD_INITIALIZER(bdrv_drivers);
  82
  83static BlockDriverState *bdrv_open_inherit(const char *filename,
  84                                           const char *reference,
  85                                           QDict *options, int flags,
  86                                           BlockDriverState *parent,
  87                                           const BdrvChildClass *child_class,
  88                                           BdrvChildRole child_role,
  89                                           bool parse_filename,
  90                                           Error **errp);
  91
  92static bool bdrv_recurse_has_child(BlockDriverState *bs,
  93                                   BlockDriverState *child);
  94
  95static void GRAPH_WRLOCK
  96bdrv_replace_child_noperm(BdrvChild *child, BlockDriverState *new_bs);
  97
  98static void GRAPH_WRLOCK
  99bdrv_remove_child(BdrvChild *child, Transaction *tran);
 100
 101static int bdrv_reopen_prepare(BDRVReopenState *reopen_state,
 102                               BlockReopenQueue *queue,
 103                               Transaction *change_child_tran, Error **errp);
 104static void bdrv_reopen_commit(BDRVReopenState *reopen_state);
 105static void bdrv_reopen_abort(BDRVReopenState *reopen_state);
 106
 107static bool bdrv_backing_overridden(BlockDriverState *bs);
 108
 109static bool GRAPH_RDLOCK
 110bdrv_change_aio_context(BlockDriverState *bs, AioContext *ctx,
 111                        GHashTable *visited, Transaction *tran, Error **errp);
 112
 113/* If non-zero, use only whitelisted block drivers */
 114static int use_bdrv_whitelist;
 115
 116#ifdef _WIN32
 117static int is_windows_drive_prefix(const char *filename)
 118{
 119    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
 120             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
 121            filename[1] == ':');
 122}
 123
 124int is_windows_drive(const char *filename)
 125{
 126    if (is_windows_drive_prefix(filename) &&
 127        filename[2] == '\0')
 128        return 1;
 129    if (strstart(filename, "\\\\.\\", NULL) ||
 130        strstart(filename, "//./", NULL))
 131        return 1;
 132    return 0;
 133}
 134#endif
 135
 136size_t bdrv_opt_mem_align(BlockDriverState *bs)
 137{
 138    if (!bs || !bs->drv) {
 139        /* page size or 4k (hdd sector size) should be on the safe side */
 140        return MAX(4096, qemu_real_host_page_size());
 141    }
 142    IO_CODE();
 143
 144    return bs->bl.opt_mem_alignment;
 145}
 146
 147size_t bdrv_min_mem_align(BlockDriverState *bs)
 148{
 149    if (!bs || !bs->drv) {
 150        /* page size or 4k (hdd sector size) should be on the safe side */
 151        return MAX(4096, qemu_real_host_page_size());
 152    }
 153    IO_CODE();
 154
 155    return bs->bl.min_mem_alignment;
 156}
 157
 158/* check if the path starts with "<protocol>:" */
 159int path_has_protocol(const char *path)
 160{
 161    const char *p;
 162
 163#ifdef _WIN32
 164    if (is_windows_drive(path) ||
 165        is_windows_drive_prefix(path)) {
 166        return 0;
 167    }
 168    p = path + strcspn(path, ":/\\");
 169#else
 170    p = path + strcspn(path, ":/");
 171#endif
 172
 173    return *p == ':';
 174}
 175
 176int path_is_absolute(const char *path)
 177{
 178#ifdef _WIN32
 179    /* specific case for names like: "\\.\d:" */
 180    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
 181        return 1;
 182    }
 183    return (*path == '/' || *path == '\\');
 184#else
 185    return (*path == '/');
 186#endif
 187}
 188
 189/* if filename is absolute, just return its duplicate. Otherwise, build a
 190   path to it by considering it is relative to base_path. URL are
 191   supported. */
 192char *path_combine(const char *base_path, const char *filename)
 193{
 194    const char *protocol_stripped = NULL;
 195    const char *p, *p1;
 196    char *result;
 197    int len;
 198
 199    if (path_is_absolute(filename)) {
 200        return g_strdup(filename);
 201    }
 202
 203    if (path_has_protocol(base_path)) {
 204        protocol_stripped = strchr(base_path, ':');
 205        if (protocol_stripped) {
 206            protocol_stripped++;
 207        }
 208    }
 209    p = protocol_stripped ?: base_path;
 210
 211    p1 = strrchr(base_path, '/');
 212#ifdef _WIN32
 213    {
 214        const char *p2;
 215        p2 = strrchr(base_path, '\\');
 216        if (!p1 || p2 > p1) {
 217            p1 = p2;
 218        }
 219    }
 220#endif
 221    if (p1) {
 222        p1++;
 223    } else {
 224        p1 = base_path;
 225    }
 226    if (p1 > p) {
 227        p = p1;
 228    }
 229    len = p - base_path;
 230
 231    result = g_malloc(len + strlen(filename) + 1);
 232    memcpy(result, base_path, len);
 233    strcpy(result + len, filename);
 234
 235    return result;
 236}
 237
 238/*
 239 * Helper function for bdrv_parse_filename() implementations to remove optional
 240 * protocol prefixes (especially "file:") from a filename and for putting the
 241 * stripped filename into the options QDict if there is such a prefix.
 242 */
 243void bdrv_parse_filename_strip_prefix(const char *filename, const char *prefix,
 244                                      QDict *options)
 245{
 246    if (strstart(filename, prefix, &filename)) {
 247        /* Stripping the explicit protocol prefix may result in a protocol
 248         * prefix being (wrongly) detected (if the filename contains a colon) */
 249        if (path_has_protocol(filename)) {
 250            GString *fat_filename;
 251
 252            /* This means there is some colon before the first slash; therefore,
 253             * this cannot be an absolute path */
 254            assert(!path_is_absolute(filename));
 255
 256            /* And we can thus fix the protocol detection issue by prefixing it
 257             * by "./" */
 258            fat_filename = g_string_new("./");
 259            g_string_append(fat_filename, filename);
 260
 261            assert(!path_has_protocol(fat_filename->str));
 262
 263            qdict_put(options, "filename",
 264                      qstring_from_gstring(fat_filename));
 265        } else {
 266            /* If no protocol prefix was detected, we can use the shortened
 267             * filename as-is */
 268            qdict_put_str(options, "filename", filename);
 269        }
 270    }
 271}
 272
 273
 274/* Returns whether the image file is opened as read-only. Note that this can
 275 * return false and writing to the image file is still not possible because the
 276 * image is inactivated. */
 277bool bdrv_is_read_only(BlockDriverState *bs)
 278{
 279    IO_CODE();
 280    return !(bs->open_flags & BDRV_O_RDWR);
 281}
 282
 283static int GRAPH_RDLOCK
 284bdrv_can_set_read_only(BlockDriverState *bs, bool read_only,
 285                       bool ignore_allow_rdw, Error **errp)
 286{
 287    IO_CODE();
 288
 289    /* Do not set read_only if copy_on_read is enabled */
 290    if (bs->copy_on_read && read_only) {
 291        error_setg(errp, "Can't set node '%s' to r/o with copy-on-read enabled",
 292                   bdrv_get_device_or_node_name(bs));
 293        return -EINVAL;
 294    }
 295
 296    /* Do not clear read_only if it is prohibited */
 297    if (!read_only && !(bs->open_flags & BDRV_O_ALLOW_RDWR) &&
 298        !ignore_allow_rdw)
 299    {
 300        error_setg(errp, "Node '%s' is read only",
 301                   bdrv_get_device_or_node_name(bs));
 302        return -EPERM;
 303    }
 304
 305    return 0;
 306}
 307
 308/*
 309 * Called by a driver that can only provide a read-only image.
 310 *
 311 * Returns 0 if the node is already read-only or it could switch the node to
 312 * read-only because BDRV_O_AUTO_RDONLY is set.
 313 *
 314 * Returns -EACCES if the node is read-write and BDRV_O_AUTO_RDONLY is not set
 315 * or bdrv_can_set_read_only() forbids making the node read-only. If @errmsg
 316 * is not NULL, it is used as the error message for the Error object.
 317 */
 318int bdrv_apply_auto_read_only(BlockDriverState *bs, const char *errmsg,
 319                              Error **errp)
 320{
 321    int ret = 0;
 322    IO_CODE();
 323
 324    if (!(bs->open_flags & BDRV_O_RDWR)) {
 325        return 0;
 326    }
 327    if (!(bs->open_flags & BDRV_O_AUTO_RDONLY)) {
 328        goto fail;
 329    }
 330
 331    ret = bdrv_can_set_read_only(bs, true, false, NULL);
 332    if (ret < 0) {
 333        goto fail;
 334    }
 335
 336    bs->open_flags &= ~BDRV_O_RDWR;
 337
 338    return 0;
 339
 340fail:
 341    error_setg(errp, "%s", errmsg ?: "Image is read-only");
 342    return -EACCES;
 343}
 344
 345/*
 346 * If @backing is empty, this function returns NULL without setting
 347 * @errp.  In all other cases, NULL will only be returned with @errp
 348 * set.
 349 *
 350 * Therefore, a return value of NULL without @errp set means that
 351 * there is no backing file; if @errp is set, there is one but its
 352 * absolute filename cannot be generated.
 353 */
 354char *bdrv_get_full_backing_filename_from_filename(const char *backed,
 355                                                   const char *backing,
 356                                                   Error **errp)
 357{
 358    if (backing[0] == '\0') {
 359        return NULL;
 360    } else if (path_has_protocol(backing) || path_is_absolute(backing)) {
 361        return g_strdup(backing);
 362    } else if (backed[0] == '\0' || strstart(backed, "json:", NULL)) {
 363        error_setg(errp, "Cannot use relative backing file names for '%s'",
 364                   backed);
 365        return NULL;
 366    } else {
 367        return path_combine(backed, backing);
 368    }
 369}
 370
 371/*
 372 * If @filename is empty or NULL, this function returns NULL without
 373 * setting @errp.  In all other cases, NULL will only be returned with
 374 * @errp set.
 375 */
 376static char * GRAPH_RDLOCK
 377bdrv_make_absolute_filename(BlockDriverState *relative_to,
 378                            const char *filename, Error **errp)
 379{
 380    char *dir, *full_name;
 381
 382    if (!filename || filename[0] == '\0') {
 383        return NULL;
 384    } else if (path_has_protocol(filename) || path_is_absolute(filename)) {
 385        return g_strdup(filename);
 386    }
 387
 388    dir = bdrv_dirname(relative_to, errp);
 389    if (!dir) {
 390        return NULL;
 391    }
 392
 393    full_name = g_strconcat(dir, filename, NULL);
 394    g_free(dir);
 395    return full_name;
 396}
 397
 398char *bdrv_get_full_backing_filename(BlockDriverState *bs, Error **errp)
 399{
 400    GLOBAL_STATE_CODE();
 401    return bdrv_make_absolute_filename(bs, bs->backing_file, errp);
 402}
 403
 404void bdrv_register(BlockDriver *bdrv)
 405{
 406    assert(bdrv->format_name);
 407    GLOBAL_STATE_CODE();
 408    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
 409}
 410
 411BlockDriverState *bdrv_new(void)
 412{
 413    BlockDriverState *bs;
 414    int i;
 415
 416    GLOBAL_STATE_CODE();
 417
 418    bs = g_new0(BlockDriverState, 1);
 419    QLIST_INIT(&bs->dirty_bitmaps);
 420    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
 421        QLIST_INIT(&bs->op_blockers[i]);
 422    }
 423    qemu_mutex_init(&bs->reqs_lock);
 424    qemu_mutex_init(&bs->dirty_bitmap_mutex);
 425    bs->refcnt = 1;
 426    bs->aio_context = qemu_get_aio_context();
 427
 428    qemu_co_queue_init(&bs->flush_queue);
 429
 430    qemu_co_mutex_init(&bs->bsc_modify_lock);
 431    bs->block_status_cache = g_new0(BdrvBlockStatusCache, 1);
 432
 433    for (i = 0; i < bdrv_drain_all_count; i++) {
 434        bdrv_do_drained_begin_quiesce(bs, NULL);
 435    }
 436
 437    QTAILQ_INSERT_TAIL(&all_bdrv_states, bs, bs_list);
 438
 439    return bs;
 440}
 441
 442static BlockDriver *bdrv_do_find_format(const char *format_name)
 443{
 444    BlockDriver *drv1;
 445    GLOBAL_STATE_CODE();
 446
 447    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
 448        if (!strcmp(drv1->format_name, format_name)) {
 449            return drv1;
 450        }
 451    }
 452
 453    return NULL;
 454}
 455
 456BlockDriver *bdrv_find_format(const char *format_name)
 457{
 458    BlockDriver *drv1;
 459    int i;
 460
 461    GLOBAL_STATE_CODE();
 462
 463    drv1 = bdrv_do_find_format(format_name);
 464    if (drv1) {
 465        return drv1;
 466    }
 467
 468    /* The driver isn't registered, maybe we need to load a module */
 469    for (i = 0; i < (int)ARRAY_SIZE(block_driver_modules); ++i) {
 470        if (!strcmp(block_driver_modules[i].format_name, format_name)) {
 471            Error *local_err = NULL;
 472            int rv = block_module_load(block_driver_modules[i].library_name,
 473                                       &local_err);
 474            if (rv > 0) {
 475                return bdrv_do_find_format(format_name);
 476            } else if (rv < 0) {
 477                error_report_err(local_err);
 478            }
 479            break;
 480        }
 481    }
 482    return NULL;
 483}
 484
 485static int bdrv_format_is_whitelisted(const char *format_name, bool read_only)
 486{
 487    static const char *whitelist_rw[] = {
 488        CONFIG_BDRV_RW_WHITELIST
 489        NULL
 490    };
 491    static const char *whitelist_ro[] = {
 492        CONFIG_BDRV_RO_WHITELIST
 493        NULL
 494    };
 495    const char **p;
 496
 497    if (!whitelist_rw[0] && !whitelist_ro[0]) {
 498        return 1;               /* no whitelist, anything goes */
 499    }
 500
 501    for (p = whitelist_rw; *p; p++) {
 502        if (!strcmp(format_name, *p)) {
 503            return 1;
 504        }
 505    }
 506    if (read_only) {
 507        for (p = whitelist_ro; *p; p++) {
 508            if (!strcmp(format_name, *p)) {
 509                return 1;
 510            }
 511        }
 512    }
 513    return 0;
 514}
 515
 516int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
 517{
 518    GLOBAL_STATE_CODE();
 519    return bdrv_format_is_whitelisted(drv->format_name, read_only);
 520}
 521
 522bool bdrv_uses_whitelist(void)
 523{
 524    return use_bdrv_whitelist;
 525}
 526
 527typedef struct CreateCo {
 528    BlockDriver *drv;
 529    char *filename;
 530    QemuOpts *opts;
 531    int ret;
 532    Error *err;
 533} CreateCo;
 534
 535int coroutine_fn bdrv_co_create(BlockDriver *drv, const char *filename,
 536                                QemuOpts *opts, Error **errp)
 537{
 538    ERRP_GUARD();
 539    int ret;
 540    GLOBAL_STATE_CODE();
 541
 542    if (!drv->bdrv_co_create_opts) {
 543        error_setg(errp, "Driver '%s' does not support image creation",
 544                   drv->format_name);
 545        return -ENOTSUP;
 546    }
 547
 548    ret = drv->bdrv_co_create_opts(drv, filename, opts, errp);
 549    if (ret < 0 && !*errp) {
 550        error_setg_errno(errp, -ret, "Could not create image");
 551    }
 552
 553    return ret;
 554}
 555
 556/**
 557 * Helper function for bdrv_create_file_fallback(): Resize @blk to at
 558 * least the given @minimum_size.
 559 *
 560 * On success, return @blk's actual length.
 561 * Otherwise, return -errno.
 562 */
 563static int64_t coroutine_fn GRAPH_UNLOCKED
 564create_file_fallback_truncate(BlockBackend *blk, int64_t minimum_size,
 565                              Error **errp)
 566{
 567    Error *local_err = NULL;
 568    int64_t size;
 569    int ret;
 570
 571    GLOBAL_STATE_CODE();
 572
 573    ret = blk_co_truncate(blk, minimum_size, false, PREALLOC_MODE_OFF, 0,
 574                          &local_err);
 575    if (ret < 0 && ret != -ENOTSUP) {
 576        error_propagate(errp, local_err);
 577        return ret;
 578    }
 579
 580    size = blk_co_getlength(blk);
 581    if (size < 0) {
 582        error_free(local_err);
 583        error_setg_errno(errp, -size,
 584                         "Failed to inquire the new image file's length");
 585        return size;
 586    }
 587
 588    if (size < minimum_size) {
 589        /* Need to grow the image, but we failed to do that */
 590        error_propagate(errp, local_err);
 591        return -ENOTSUP;
 592    }
 593
 594    error_free(local_err);
 595    local_err = NULL;
 596
 597    return size;
 598}
 599
 600/**
 601 * Helper function for bdrv_create_file_fallback(): Zero the first
 602 * sector to remove any potentially pre-existing image header.
 603 */
 604static int coroutine_fn
 605create_file_fallback_zero_first_sector(BlockBackend *blk,
 606                                       int64_t current_size,
 607                                       Error **errp)
 608{
 609    int64_t bytes_to_clear;
 610    int ret;
 611
 612    GLOBAL_STATE_CODE();
 613
 614    bytes_to_clear = MIN(current_size, BDRV_SECTOR_SIZE);
 615    if (bytes_to_clear) {
 616        ret = blk_co_pwrite_zeroes(blk, 0, bytes_to_clear, BDRV_REQ_MAY_UNMAP);
 617        if (ret < 0) {
 618            error_setg_errno(errp, -ret,
 619                             "Failed to clear the new image's first sector");
 620            return ret;
 621        }
 622    }
 623
 624    return 0;
 625}
 626
 627/**
 628 * Simple implementation of bdrv_co_create_opts for protocol drivers
 629 * which only support creation via opening a file
 630 * (usually existing raw storage device)
 631 */
 632int coroutine_fn bdrv_co_create_opts_simple(BlockDriver *drv,
 633                                            const char *filename,
 634                                            QemuOpts *opts,
 635                                            Error **errp)
 636{
 637    ERRP_GUARD();
 638    BlockBackend *blk;
 639    QDict *options;
 640    int64_t size = 0;
 641    char *buf = NULL;
 642    PreallocMode prealloc;
 643    Error *local_err = NULL;
 644    int ret;
 645
 646    GLOBAL_STATE_CODE();
 647
 648    size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
 649    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
 650    prealloc = qapi_enum_parse(&PreallocMode_lookup, buf,
 651                               PREALLOC_MODE_OFF, &local_err);
 652    g_free(buf);
 653    if (local_err) {
 654        error_propagate(errp, local_err);
 655        return -EINVAL;
 656    }
 657
 658    if (prealloc != PREALLOC_MODE_OFF) {
 659        error_setg(errp, "Unsupported preallocation mode '%s'",
 660                   PreallocMode_str(prealloc));
 661        return -ENOTSUP;
 662    }
 663
 664    options = qdict_new();
 665    qdict_put_str(options, "driver", drv->format_name);
 666
 667    blk = blk_co_new_open(filename, NULL, options,
 668                          BDRV_O_RDWR | BDRV_O_RESIZE, errp);
 669    if (!blk) {
 670        error_prepend(errp, "Protocol driver '%s' does not support creating "
 671                      "new images, so an existing image must be selected as "
 672                      "the target; however, opening the given target as an "
 673                      "existing image failed: ",
 674                      drv->format_name);
 675        return -EINVAL;
 676    }
 677
 678    size = create_file_fallback_truncate(blk, size, errp);
 679    if (size < 0) {
 680        ret = size;
 681        goto out;
 682    }
 683
 684    ret = create_file_fallback_zero_first_sector(blk, size, errp);
 685    if (ret < 0) {
 686        goto out;
 687    }
 688
 689    ret = 0;
 690out:
 691    blk_co_unref(blk);
 692    return ret;
 693}
 694
 695int coroutine_fn bdrv_co_create_file(const char *filename, QemuOpts *opts,
 696                                     Error **errp)
 697{
 698    QemuOpts *protocol_opts;
 699    BlockDriver *drv;
 700    QDict *qdict;
 701    int ret;
 702
 703    GLOBAL_STATE_CODE();
 704
 705    drv = bdrv_find_protocol(filename, true, errp);
 706    if (drv == NULL) {
 707        return -ENOENT;
 708    }
 709
 710    if (!drv->create_opts) {
 711        error_setg(errp, "Driver '%s' does not support image creation",
 712                   drv->format_name);
 713        return -ENOTSUP;
 714    }
 715
 716    /*
 717     * 'opts' contains a QemuOptsList with a combination of format and protocol
 718     * default values.
 719     *
 720     * The format properly removes its options, but the default values remain
 721     * in 'opts->list'.  So if the protocol has options with the same name
 722     * (e.g. rbd has 'cluster_size' as qcow2), it will see the default values
 723     * of the format, since for overlapping options, the format wins.
 724     *
 725     * To avoid this issue, lets convert QemuOpts to QDict, in this way we take
 726     * only the set options, and then convert it back to QemuOpts, using the
 727     * create_opts of the protocol. So the new QemuOpts, will contain only the
 728     * protocol defaults.
 729     */
 730    qdict = qemu_opts_to_qdict(opts, NULL);
 731    protocol_opts = qemu_opts_from_qdict(drv->create_opts, qdict, errp);
 732    if (protocol_opts == NULL) {
 733        ret = -EINVAL;
 734        goto out;
 735    }
 736
 737    ret = bdrv_co_create(drv, filename, protocol_opts, errp);
 738out:
 739    qemu_opts_del(protocol_opts);
 740    qobject_unref(qdict);
 741    return ret;
 742}
 743
 744int coroutine_fn bdrv_co_delete_file(BlockDriverState *bs, Error **errp)
 745{
 746    Error *local_err = NULL;
 747    int ret;
 748
 749    IO_CODE();
 750    assert(bs != NULL);
 751    assert_bdrv_graph_readable();
 752
 753    if (!bs->drv) {
 754        error_setg(errp, "Block node '%s' is not opened", bs->filename);
 755        return -ENOMEDIUM;
 756    }
 757
 758    if (!bs->drv->bdrv_co_delete_file) {
 759        error_setg(errp, "Driver '%s' does not support image deletion",
 760                   bs->drv->format_name);
 761        return -ENOTSUP;
 762    }
 763
 764    ret = bs->drv->bdrv_co_delete_file(bs, &local_err);
 765    if (ret < 0) {
 766        error_propagate(errp, local_err);
 767    }
 768
 769    return ret;
 770}
 771
 772void coroutine_fn bdrv_co_delete_file_noerr(BlockDriverState *bs)
 773{
 774    Error *local_err = NULL;
 775    int ret;
 776    IO_CODE();
 777
 778    if (!bs) {
 779        return;
 780    }
 781
 782    ret = bdrv_co_delete_file(bs, &local_err);
 783    /*
 784     * ENOTSUP will happen if the block driver doesn't support
 785     * the 'bdrv_co_delete_file' interface. This is a predictable
 786     * scenario and shouldn't be reported back to the user.
 787     */
 788    if (ret == -ENOTSUP) {
 789        error_free(local_err);
 790    } else if (ret < 0) {
 791        error_report_err(local_err);
 792    }
 793}
 794
 795/**
 796 * Try to get @bs's logical and physical block size.
 797 * On success, store them in @bsz struct and return 0.
 798 * On failure return -errno.
 799 * @bs must not be empty.
 800 */
 801int bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
 802{
 803    BlockDriver *drv = bs->drv;
 804    BlockDriverState *filtered = bdrv_filter_bs(bs);
 805    GLOBAL_STATE_CODE();
 806
 807    if (drv && drv->bdrv_probe_blocksizes) {
 808        return drv->bdrv_probe_blocksizes(bs, bsz);
 809    } else if (filtered) {
 810        return bdrv_probe_blocksizes(filtered, bsz);
 811    }
 812
 813    return -ENOTSUP;
 814}
 815
 816/**
 817 * Try to get @bs's geometry (cyls, heads, sectors).
 818 * On success, store them in @geo struct and return 0.
 819 * On failure return -errno.
 820 * @bs must not be empty.
 821 */
 822int bdrv_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
 823{
 824    BlockDriver *drv = bs->drv;
 825    BlockDriverState *filtered;
 826
 827    GLOBAL_STATE_CODE();
 828    GRAPH_RDLOCK_GUARD_MAINLOOP();
 829
 830    if (drv && drv->bdrv_probe_geometry) {
 831        return drv->bdrv_probe_geometry(bs, geo);
 832    }
 833
 834    filtered = bdrv_filter_bs(bs);
 835    if (filtered) {
 836        return bdrv_probe_geometry(filtered, geo);
 837    }
 838
 839    return -ENOTSUP;
 840}
 841
 842/*
 843 * Create a uniquely-named empty temporary file.
 844 * Return the actual file name used upon success, otherwise NULL.
 845 * This string should be freed with g_free() when not needed any longer.
 846 *
 847 * Note: creating a temporary file for the caller to (re)open is
 848 * inherently racy. Use g_file_open_tmp() instead whenever practical.
 849 */
 850char *create_tmp_file(Error **errp)
 851{
 852    int fd;
 853    const char *tmpdir;
 854    g_autofree char *filename = NULL;
 855
 856    tmpdir = g_get_tmp_dir();
 857#ifndef _WIN32
 858    /*
 859     * See commit 69bef79 ("block: use /var/tmp instead of /tmp for -snapshot")
 860     *
 861     * This function is used to create temporary disk images (like -snapshot),
 862     * so the files can become very large. /tmp is often a tmpfs where as
 863     * /var/tmp is usually on a disk, so more appropriate for disk images.
 864     */
 865    if (!g_strcmp0(tmpdir, "/tmp")) {
 866        tmpdir = "/var/tmp";
 867    }
 868#endif
 869
 870    filename = g_strdup_printf("%s/vl.XXXXXX", tmpdir);
 871    fd = g_mkstemp(filename);
 872    if (fd < 0) {
 873        error_setg_errno(errp, errno, "Could not open temporary file '%s'",
 874                         filename);
 875        return NULL;
 876    }
 877    close(fd);
 878
 879    return g_steal_pointer(&filename);
 880}
 881
 882/*
 883 * Detect host devices. By convention, /dev/cdrom[N] is always
 884 * recognized as a host CDROM.
 885 */
 886static BlockDriver *find_hdev_driver(const char *filename)
 887{
 888    int score_max = 0, score;
 889    BlockDriver *drv = NULL, *d;
 890    GLOBAL_STATE_CODE();
 891
 892    QLIST_FOREACH(d, &bdrv_drivers, list) {
 893        if (d->bdrv_probe_device) {
 894            score = d->bdrv_probe_device(filename);
 895            if (score > score_max) {
 896                score_max = score;
 897                drv = d;
 898            }
 899        }
 900    }
 901
 902    return drv;
 903}
 904
 905static BlockDriver *bdrv_do_find_protocol(const char *protocol)
 906{
 907    BlockDriver *drv1;
 908    GLOBAL_STATE_CODE();
 909
 910    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
 911        if (drv1->protocol_name && !strcmp(drv1->protocol_name, protocol)) {
 912            return drv1;
 913        }
 914    }
 915
 916    return NULL;
 917}
 918
 919BlockDriver *bdrv_find_protocol(const char *filename,
 920                                bool allow_protocol_prefix,
 921                                Error **errp)
 922{
 923    BlockDriver *drv1;
 924    char protocol[128];
 925    int len;
 926    const char *p;
 927    int i;
 928
 929    GLOBAL_STATE_CODE();
 930
 931    /*
 932     * XXX(hch): we really should not let host device detection
 933     * override an explicit protocol specification, but moving this
 934     * later breaks access to device names with colons in them.
 935     * Thanks to the brain-dead persistent naming schemes on udev-
 936     * based Linux systems those actually are quite common.
 937     */
 938    drv1 = find_hdev_driver(filename);
 939    if (drv1) {
 940        return drv1;
 941    }
 942
 943    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
 944        return &bdrv_file;
 945    }
 946
 947    p = strchr(filename, ':');
 948    assert(p != NULL);
 949    len = p - filename;
 950    if (len > sizeof(protocol) - 1)
 951        len = sizeof(protocol) - 1;
 952    memcpy(protocol, filename, len);
 953    protocol[len] = '\0';
 954
 955    drv1 = bdrv_do_find_protocol(protocol);
 956    if (drv1) {
 957        return drv1;
 958    }
 959
 960    for (i = 0; i < (int)ARRAY_SIZE(block_driver_modules); ++i) {
 961        if (block_driver_modules[i].protocol_name &&
 962            !strcmp(block_driver_modules[i].protocol_name, protocol)) {
 963            int rv = block_module_load(block_driver_modules[i].library_name, errp);
 964            if (rv > 0) {
 965                drv1 = bdrv_do_find_protocol(protocol);
 966            } else if (rv < 0) {
 967                return NULL;
 968            }
 969            break;
 970        }
 971    }
 972
 973    if (!drv1) {
 974        error_setg(errp, "Unknown protocol '%s'", protocol);
 975    }
 976    return drv1;
 977}
 978
 979/*
 980 * Guess image format by probing its contents.
 981 * This is not a good idea when your image is raw (CVE-2008-2004), but
 982 * we do it anyway for backward compatibility.
 983 *
 984 * @buf         contains the image's first @buf_size bytes.
 985 * @buf_size    is the buffer size in bytes (generally BLOCK_PROBE_BUF_SIZE,
 986 *              but can be smaller if the image file is smaller)
 987 * @filename    is its filename.
 988 *
 989 * For all block drivers, call the bdrv_probe() method to get its
 990 * probing score.
 991 * Return the first block driver with the highest probing score.
 992 */
 993BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
 994                            const char *filename)
 995{
 996    int score_max = 0, score;
 997    BlockDriver *drv = NULL, *d;
 998    IO_CODE();
 999
1000    QLIST_FOREACH(d, &bdrv_drivers, list) {
1001        if (d->bdrv_probe) {
1002            score = d->bdrv_probe(buf, buf_size, filename);
1003            if (score > score_max) {
1004                score_max = score;
1005                drv = d;
1006            }
1007        }
1008    }
1009
1010    return drv;
1011}
1012
1013static int find_image_format(BlockBackend *file, const char *filename,
1014                             BlockDriver **pdrv, Error **errp)
1015{
1016    BlockDriver *drv;
1017    uint8_t buf[BLOCK_PROBE_BUF_SIZE];
1018    int ret = 0;
1019
1020    GLOBAL_STATE_CODE();
1021
1022    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
1023    if (blk_is_sg(file) || !blk_is_inserted(file) || blk_getlength(file) == 0) {
1024        *pdrv = &bdrv_raw;
1025        return ret;
1026    }
1027
1028    ret = blk_pread(file, 0, sizeof(buf), buf, 0);
1029    if (ret < 0) {
1030        error_setg_errno(errp, -ret, "Could not read image for determining its "
1031                         "format");
1032        *pdrv = NULL;
1033        return ret;
1034    }
1035
1036    drv = bdrv_probe_all(buf, sizeof(buf), filename);
1037    if (!drv) {
1038        error_setg(errp, "Could not determine image format: No compatible "
1039                   "driver found");
1040        *pdrv = NULL;
1041        return -ENOENT;
1042    }
1043
1044    *pdrv = drv;
1045    return 0;
1046}
1047
1048/**
1049 * Set the current 'total_sectors' value
1050 * Return 0 on success, -errno on error.
1051 */
1052int coroutine_fn bdrv_co_refresh_total_sectors(BlockDriverState *bs,
1053                                               int64_t hint)
1054{
1055    BlockDriver *drv = bs->drv;
1056    IO_CODE();
1057    assert_bdrv_graph_readable();
1058
1059    if (!drv) {
1060        return -ENOMEDIUM;
1061    }
1062
1063    /* Do not attempt drv->bdrv_co_getlength() on scsi-generic devices */
1064    if (bdrv_is_sg(bs))
1065        return 0;
1066
1067    /* query actual device if possible, otherwise just trust the hint */
1068    if (drv->bdrv_co_getlength) {
1069        int64_t length = drv->bdrv_co_getlength(bs);
1070        if (length < 0) {
1071            return length;
1072        }
1073        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
1074    }
1075
1076    bs->total_sectors = hint;
1077
1078    if (bs->total_sectors * BDRV_SECTOR_SIZE > BDRV_MAX_LENGTH) {
1079        return -EFBIG;
1080    }
1081
1082    return 0;
1083}
1084
1085/**
1086 * Combines a QDict of new block driver @options with any missing options taken
1087 * from @old_options, so that leaving out an option defaults to its old value.
1088 */
1089static void bdrv_join_options(BlockDriverState *bs, QDict *options,
1090                              QDict *old_options)
1091{
1092    GLOBAL_STATE_CODE();
1093    if (bs->drv && bs->drv->bdrv_join_options) {
1094        bs->drv->bdrv_join_options(options, old_options);
1095    } else {
1096        qdict_join(options, old_options, false);
1097    }
1098}
1099
1100static BlockdevDetectZeroesOptions bdrv_parse_detect_zeroes(QemuOpts *opts,
1101                                                            int open_flags,
1102                                                            Error **errp)
1103{
1104    Error *local_err = NULL;
1105    char *value = qemu_opt_get_del(opts, "detect-zeroes");
1106    BlockdevDetectZeroesOptions detect_zeroes =
1107        qapi_enum_parse(&BlockdevDetectZeroesOptions_lookup, value,
1108                        BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF, &local_err);
1109    GLOBAL_STATE_CODE();
1110    g_free(value);
1111    if (local_err) {
1112        error_propagate(errp, local_err);
1113        return detect_zeroes;
1114    }
1115
1116    if (detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP &&
1117        !(open_flags & BDRV_O_UNMAP))
1118    {
1119        error_setg(errp, "setting detect-zeroes to unmap is not allowed "
1120                   "without setting discard operation to unmap");
1121    }
1122
1123    return detect_zeroes;
1124}
1125
1126/**
1127 * Set open flags for aio engine
1128 *
1129 * Return 0 on success, -1 if the engine specified is invalid
1130 */
1131int bdrv_parse_aio(const char *mode, int *flags)
1132{
1133    if (!strcmp(mode, "threads")) {
1134        /* do nothing, default */
1135    } else if (!strcmp(mode, "native")) {
1136        *flags |= BDRV_O_NATIVE_AIO;
1137#ifdef CONFIG_LINUX_IO_URING
1138    } else if (!strcmp(mode, "io_uring")) {
1139        *flags |= BDRV_O_IO_URING;
1140#endif
1141    } else {
1142        return -1;
1143    }
1144
1145    return 0;
1146}
1147
1148/**
1149 * Set open flags for a given discard mode
1150 *
1151 * Return 0 on success, -1 if the discard mode was invalid.
1152 */
1153int bdrv_parse_discard_flags(const char *mode, int *flags)
1154{
1155    *flags &= ~BDRV_O_UNMAP;
1156
1157    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
1158        /* do nothing */
1159    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
1160        *flags |= BDRV_O_UNMAP;
1161    } else {
1162        return -1;
1163    }
1164
1165    return 0;
1166}
1167
1168/**
1169 * Set open flags for a given cache mode
1170 *
1171 * Return 0 on success, -1 if the cache mode was invalid.
1172 */
1173int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough)
1174{
1175    *flags &= ~BDRV_O_CACHE_MASK;
1176
1177    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
1178        *writethrough = false;
1179        *flags |= BDRV_O_NOCACHE;
1180    } else if (!strcmp(mode, "directsync")) {
1181        *writethrough = true;
1182        *flags |= BDRV_O_NOCACHE;
1183    } else if (!strcmp(mode, "writeback")) {
1184        *writethrough = false;
1185    } else if (!strcmp(mode, "unsafe")) {
1186        *writethrough = false;
1187        *flags |= BDRV_O_NO_FLUSH;
1188    } else if (!strcmp(mode, "writethrough")) {
1189        *writethrough = true;
1190    } else {
1191        return -1;
1192    }
1193
1194    return 0;
1195}
1196
1197static char *bdrv_child_get_parent_desc(BdrvChild *c)
1198{
1199    BlockDriverState *parent = c->opaque;
1200    return g_strdup_printf("node '%s'", bdrv_get_node_name(parent));
1201}
1202
1203static void GRAPH_RDLOCK bdrv_child_cb_drained_begin(BdrvChild *child)
1204{
1205    BlockDriverState *bs = child->opaque;
1206    bdrv_do_drained_begin_quiesce(bs, NULL);
1207}
1208
1209static bool GRAPH_RDLOCK bdrv_child_cb_drained_poll(BdrvChild *child)
1210{
1211    BlockDriverState *bs = child->opaque;
1212    return bdrv_drain_poll(bs, NULL, false);
1213}
1214
1215static void GRAPH_RDLOCK bdrv_child_cb_drained_end(BdrvChild *child)
1216{
1217    BlockDriverState *bs = child->opaque;
1218    bdrv_drained_end(bs);
1219}
1220
1221static int bdrv_child_cb_inactivate(BdrvChild *child)
1222{
1223    BlockDriverState *bs = child->opaque;
1224    GLOBAL_STATE_CODE();
1225    assert(bs->open_flags & BDRV_O_INACTIVE);
1226    return 0;
1227}
1228
1229static bool GRAPH_RDLOCK
1230bdrv_child_cb_change_aio_ctx(BdrvChild *child, AioContext *ctx,
1231                             GHashTable *visited, Transaction *tran,
1232                             Error **errp)
1233{
1234    BlockDriverState *bs = child->opaque;
1235    return bdrv_change_aio_context(bs, ctx, visited, tran, errp);
1236}
1237
1238/*
1239 * Returns the options and flags that a temporary snapshot should get, based on
1240 * the originally requested flags (the originally requested image will have
1241 * flags like a backing file)
1242 */
1243static void bdrv_temp_snapshot_options(int *child_flags, QDict *child_options,
1244                                       int parent_flags, QDict *parent_options)
1245{
1246    GLOBAL_STATE_CODE();
1247    *child_flags = (parent_flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
1248
1249    /* For temporary files, unconditional cache=unsafe is fine */
1250    qdict_set_default_str(child_options, BDRV_OPT_CACHE_DIRECT, "off");
1251    qdict_set_default_str(child_options, BDRV_OPT_CACHE_NO_FLUSH, "on");
1252
1253    /* Copy the read-only and discard options from the parent */
1254    qdict_copy_default(child_options, parent_options, BDRV_OPT_READ_ONLY);
1255    qdict_copy_default(child_options, parent_options, BDRV_OPT_DISCARD);
1256
1257    /* aio=native doesn't work for cache.direct=off, so disable it for the
1258     * temporary snapshot */
1259    *child_flags &= ~BDRV_O_NATIVE_AIO;
1260}
1261
1262static void GRAPH_WRLOCK bdrv_backing_attach(BdrvChild *c)
1263{
1264    BlockDriverState *parent = c->opaque;
1265    BlockDriverState *backing_hd = c->bs;
1266
1267    GLOBAL_STATE_CODE();
1268    assert(!parent->backing_blocker);
1269    error_setg(&parent->backing_blocker,
1270               "node is used as backing hd of '%s'",
1271               bdrv_get_device_or_node_name(parent));
1272
1273    bdrv_refresh_filename(backing_hd);
1274
1275    parent->open_flags &= ~BDRV_O_NO_BACKING;
1276
1277    bdrv_op_block_all(backing_hd, parent->backing_blocker);
1278    /* Otherwise we won't be able to commit or stream */
1279    bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET,
1280                    parent->backing_blocker);
1281    bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_STREAM,
1282                    parent->backing_blocker);
1283    /*
1284     * We do backup in 3 ways:
1285     * 1. drive backup
1286     *    The target bs is new opened, and the source is top BDS
1287     * 2. blockdev backup
1288     *    Both the source and the target are top BDSes.
1289     * 3. internal backup(used for block replication)
1290     *    Both the source and the target are backing file
1291     *
1292     * In case 1 and 2, neither the source nor the target is the backing file.
1293     * In case 3, we will block the top BDS, so there is only one block job
1294     * for the top BDS and its backing chain.
1295     */
1296    bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_SOURCE,
1297                    parent->backing_blocker);
1298    bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_TARGET,
1299                    parent->backing_blocker);
1300}
1301
1302static void bdrv_backing_detach(BdrvChild *c)
1303{
1304    BlockDriverState *parent = c->opaque;
1305
1306    GLOBAL_STATE_CODE();
1307    assert(parent->backing_blocker);
1308    bdrv_op_unblock_all(c->bs, parent->backing_blocker);
1309    error_free(parent->backing_blocker);
1310    parent->backing_blocker = NULL;
1311}
1312
1313static int bdrv_backing_update_filename(BdrvChild *c, BlockDriverState *base,
1314                                        const char *filename,
1315                                        bool backing_mask_protocol,
1316                                        Error **errp)
1317{
1318    BlockDriverState *parent = c->opaque;
1319    bool read_only = bdrv_is_read_only(parent);
1320    int ret;
1321    const char *format_name;
1322    GLOBAL_STATE_CODE();
1323
1324    if (read_only) {
1325        ret = bdrv_reopen_set_read_only(parent, false, errp);
1326        if (ret < 0) {
1327            return ret;
1328        }
1329    }
1330
1331    if (base->drv) {
1332        /*
1333         * If the new base image doesn't have a format driver layer, which we
1334         * detect by the fact that @base is a protocol driver, we record
1335         * 'raw' as the format instead of putting the protocol name as the
1336         * backing format
1337         */
1338        if (backing_mask_protocol && base->drv->protocol_name) {
1339            format_name = "raw";
1340        } else {
1341            format_name = base->drv->format_name;
1342        }
1343    } else {
1344        format_name = "";
1345    }
1346
1347    ret = bdrv_change_backing_file(parent, filename, format_name, false);
1348    if (ret < 0) {
1349        error_setg_errno(errp, -ret, "Could not update backing file link");
1350    }
1351
1352    if (read_only) {
1353        bdrv_reopen_set_read_only(parent, true, NULL);
1354    }
1355
1356    return ret;
1357}
1358
1359/*
1360 * Returns the options and flags that a generic child of a BDS should
1361 * get, based on the given options and flags for the parent BDS.
1362 */
1363static void bdrv_inherited_options(BdrvChildRole role, bool parent_is_format,
1364                                   int *child_flags, QDict *child_options,
1365                                   int parent_flags, QDict *parent_options)
1366{
1367    int flags = parent_flags;
1368    GLOBAL_STATE_CODE();
1369
1370    /*
1371     * First, decide whether to set, clear, or leave BDRV_O_PROTOCOL.
1372     * Generally, the question to answer is: Should this child be
1373     * format-probed by default?
1374     */
1375
1376    /*
1377     * Pure and non-filtered data children of non-format nodes should
1378     * be probed by default (even when the node itself has BDRV_O_PROTOCOL
1379     * set).  This only affects a very limited set of drivers (namely
1380     * quorum and blkverify when this comment was written).
1381     * Force-clear BDRV_O_PROTOCOL then.
1382     */
1383    if (!parent_is_format &&
1384        (role & BDRV_CHILD_DATA) &&
1385        !(role & (BDRV_CHILD_METADATA | BDRV_CHILD_FILTERED)))
1386    {
1387        flags &= ~BDRV_O_PROTOCOL;
1388    }
1389
1390    /*
1391     * All children of format nodes (except for COW children) and all
1392     * metadata children in general should never be format-probed.
1393     * Force-set BDRV_O_PROTOCOL then.
1394     */
1395    if ((parent_is_format && !(role & BDRV_CHILD_COW)) ||
1396        (role & BDRV_CHILD_METADATA))
1397    {
1398        flags |= BDRV_O_PROTOCOL;
1399    }
1400
1401    /*
1402     * If the cache mode isn't explicitly set, inherit direct and no-flush from
1403     * the parent.
1404     */
1405    qdict_copy_default(child_options, parent_options, BDRV_OPT_CACHE_DIRECT);
1406    qdict_copy_default(child_options, parent_options, BDRV_OPT_CACHE_NO_FLUSH);
1407    qdict_copy_default(child_options, parent_options, BDRV_OPT_FORCE_SHARE);
1408
1409    if (role & BDRV_CHILD_COW) {
1410        /* backing files are opened read-only by default */
1411        qdict_set_default_str(child_options, BDRV_OPT_READ_ONLY, "on");
1412        qdict_set_default_str(child_options, BDRV_OPT_AUTO_READ_ONLY, "off");
1413    } else {
1414        /* Inherit the read-only option from the parent if it's not set */
1415        qdict_copy_default(child_options, parent_options, BDRV_OPT_READ_ONLY);
1416        qdict_copy_default(child_options, parent_options,
1417                           BDRV_OPT_AUTO_READ_ONLY);
1418    }
1419
1420    /*
1421     * bdrv_co_pdiscard() respects unmap policy for the parent, so we
1422     * can default to enable it on lower layers regardless of the
1423     * parent option.
1424     */
1425    qdict_set_default_str(child_options, BDRV_OPT_DISCARD, "unmap");
1426
1427    /* Clear flags that only apply to the top layer */
1428    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);
1429
1430    if (role & BDRV_CHILD_METADATA) {
1431        flags &= ~BDRV_O_NO_IO;
1432    }
1433    if (role & BDRV_CHILD_COW) {
1434        flags &= ~BDRV_O_TEMPORARY;
1435    }
1436
1437    *child_flags = flags;
1438}
1439
1440static void GRAPH_WRLOCK bdrv_child_cb_attach(BdrvChild *child)
1441{
1442    BlockDriverState *bs = child->opaque;
1443
1444    assert_bdrv_graph_writable();
1445    QLIST_INSERT_HEAD(&bs->children, child, next);
1446    if (bs->drv->is_filter || (child->role & BDRV_CHILD_FILTERED)) {
1447        /*
1448         * Here we handle filters and block/raw-format.c when it behave like
1449         * filter. They generally have a single PRIMARY child, which is also the
1450         * FILTERED child, and that they may have multiple more children, which
1451         * are neither PRIMARY nor FILTERED. And never we have a COW child here.
1452         * So bs->file will be the PRIMARY child, unless the PRIMARY child goes
1453         * into bs->backing on exceptional cases; and bs->backing will be
1454         * nothing else.
1455         */
1456        assert(!(child->role & BDRV_CHILD_COW));
1457        if (child->role & BDRV_CHILD_PRIMARY) {
1458            assert(child->role & BDRV_CHILD_FILTERED);
1459            assert(!bs->backing);
1460            assert(!bs->file);
1461
1462            if (bs->drv->filtered_child_is_backing) {
1463                bs->backing = child;
1464            } else {
1465                bs->file = child;
1466            }
1467        } else {
1468            assert(!(child->role & BDRV_CHILD_FILTERED));
1469        }
1470    } else if (child->role & BDRV_CHILD_COW) {
1471        assert(bs->drv->supports_backing);
1472        assert(!(child->role & BDRV_CHILD_PRIMARY));
1473        assert(!bs->backing);
1474        bs->backing = child;
1475        bdrv_backing_attach(child);
1476    } else if (child->role & BDRV_CHILD_PRIMARY) {
1477        assert(!bs->file);
1478        bs->file = child;
1479    }
1480}
1481
1482static void GRAPH_WRLOCK bdrv_child_cb_detach(BdrvChild *child)
1483{
1484    BlockDriverState *bs = child->opaque;
1485
1486    if (child->role & BDRV_CHILD_COW) {
1487        bdrv_backing_detach(child);
1488    }
1489
1490    assert_bdrv_graph_writable();
1491    QLIST_REMOVE(child, next);
1492    if (child == bs->backing) {
1493        assert(child != bs->file);
1494        bs->backing = NULL;
1495    } else if (child == bs->file) {
1496        bs->file = NULL;
1497    }
1498}
1499
1500static int bdrv_child_cb_update_filename(BdrvChild *c, BlockDriverState *base,
1501                                         const char *filename,
1502                                         bool backing_mask_protocol,
1503                                         Error **errp)
1504{
1505    if (c->role & BDRV_CHILD_COW) {
1506        return bdrv_backing_update_filename(c, base, filename,
1507                                            backing_mask_protocol,
1508                                            errp);
1509    }
1510    return 0;
1511}
1512
1513AioContext *child_of_bds_get_parent_aio_context(BdrvChild *c)
1514{
1515    BlockDriverState *bs = c->opaque;
1516    IO_CODE();
1517
1518    return bdrv_get_aio_context(bs);
1519}
1520
/*
 * Child class for children whose parent is a block node: every callback
 * below treats the child's opaque pointer as the owning BlockDriverState.
 */
const BdrvChildClass child_of_bds = {
    .parent_is_bds   = true,
    .get_parent_desc = bdrv_child_get_parent_desc,
    .inherit_options = bdrv_inherited_options,
    /* drain requests are forwarded to the owning node */
    .drained_begin   = bdrv_child_cb_drained_begin,
    .drained_poll    = bdrv_child_cb_drained_poll,
    .drained_end     = bdrv_child_cb_drained_end,
    /* maintain the owner's children list and its ->file / ->backing links */
    .attach          = bdrv_child_cb_attach,
    .detach          = bdrv_child_cb_detach,
    .inactivate      = bdrv_child_cb_inactivate,
    .change_aio_ctx  = bdrv_child_cb_change_aio_ctx,
    /* only does work for COW (backing) children */
    .update_filename = bdrv_child_cb_update_filename,
    .get_parent_aio_context = child_of_bds_get_parent_aio_context,
};
1535
1536AioContext *bdrv_child_get_parent_aio_context(BdrvChild *c)
1537{
1538    IO_CODE();
1539    return c->klass->get_parent_aio_context(c);
1540}
1541
1542static int bdrv_open_flags(BlockDriverState *bs, int flags)
1543{
1544    int open_flags = flags;
1545    GLOBAL_STATE_CODE();
1546
1547    /*
1548     * Clear flags that are internal to the block layer before opening the
1549     * image.
1550     */
1551    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);
1552
1553    return open_flags;
1554}
1555
1556static void update_flags_from_options(int *flags, QemuOpts *opts)
1557{
1558    GLOBAL_STATE_CODE();
1559
1560    *flags &= ~(BDRV_O_CACHE_MASK | BDRV_O_RDWR | BDRV_O_AUTO_RDONLY);
1561
1562    if (qemu_opt_get_bool_del(opts, BDRV_OPT_CACHE_NO_FLUSH, false)) {
1563        *flags |= BDRV_O_NO_FLUSH;
1564    }
1565
1566    if (qemu_opt_get_bool_del(opts, BDRV_OPT_CACHE_DIRECT, false)) {
1567        *flags |= BDRV_O_NOCACHE;
1568    }
1569
1570    if (!qemu_opt_get_bool_del(opts, BDRV_OPT_READ_ONLY, false)) {
1571        *flags |= BDRV_O_RDWR;
1572    }
1573
1574    if (qemu_opt_get_bool_del(opts, BDRV_OPT_AUTO_READ_ONLY, false)) {
1575        *flags |= BDRV_O_AUTO_RDONLY;
1576    }
1577
1578    if (!qemu_opt_get_bool_del(opts, BDRV_OPT_ACTIVE, true)) {
1579        *flags |= BDRV_O_INACTIVE;
1580    }
1581}
1582
1583static void update_options_from_flags(QDict *options, int flags)
1584{
1585    GLOBAL_STATE_CODE();
1586    if (!qdict_haskey(options, BDRV_OPT_CACHE_DIRECT)) {
1587        qdict_put_bool(options, BDRV_OPT_CACHE_DIRECT, flags & BDRV_O_NOCACHE);
1588    }
1589    if (!qdict_haskey(options, BDRV_OPT_CACHE_NO_FLUSH)) {
1590        qdict_put_bool(options, BDRV_OPT_CACHE_NO_FLUSH,
1591                       flags & BDRV_O_NO_FLUSH);
1592    }
1593    if (!qdict_haskey(options, BDRV_OPT_READ_ONLY)) {
1594        qdict_put_bool(options, BDRV_OPT_READ_ONLY, !(flags & BDRV_O_RDWR));
1595    }
1596    if (!qdict_haskey(options, BDRV_OPT_AUTO_READ_ONLY)) {
1597        qdict_put_bool(options, BDRV_OPT_AUTO_READ_ONLY,
1598                       flags & BDRV_O_AUTO_RDONLY);
1599    }
1600}
1601
1602static void bdrv_assign_node_name(BlockDriverState *bs,
1603                                  const char *node_name,
1604                                  Error **errp)
1605{
1606    char *gen_node_name = NULL;
1607    GLOBAL_STATE_CODE();
1608
1609    if (!node_name) {
1610        node_name = gen_node_name = id_generate(ID_BLOCK);
1611    } else if (!id_wellformed(node_name)) {
1612        /*
1613         * Check for empty string or invalid characters, but not if it is
1614         * generated (generated names use characters not available to the user)
1615         */
1616        error_setg(errp, "Invalid node-name: '%s'", node_name);
1617        return;
1618    }
1619
1620    /* takes care of avoiding namespaces collisions */
1621    if (blk_by_name(node_name)) {
1622        error_setg(errp, "node-name=%s is conflicting with a device id",
1623                   node_name);
1624        goto out;
1625    }
1626
1627    /* takes care of avoiding duplicates node names */
1628    if (bdrv_find_node(node_name)) {
1629        error_setg(errp, "Duplicate nodes with node-name='%s'", node_name);
1630        goto out;
1631    }
1632
1633    /* Make sure that the node name isn't truncated */
1634    if (strlen(node_name) >= sizeof(bs->node_name)) {
1635        error_setg(errp, "Node name too long");
1636        goto out;
1637    }
1638
1639    /* copy node name into the bs and insert it into the graph list */
1640    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
1641    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
1642out:
1643    g_free(gen_node_name);
1644}
1645
/*
 * Bind driver @drv to node @bs and open it.
 *
 * Assigns the node name, allocates the driver's private state
 * (bs->opaque, drv->instance_size zeroed bytes), calls drv->bdrv_open()
 * if the driver provides it, then refreshes the total sector count and
 * the block limits.
 *
 * @node_name   name for the node, or NULL to have one generated
 * @options     options passed on to drv->bdrv_open()
 * @open_flags  flags passed on to drv->bdrv_open()
 *
 * Returns 0 on success and a negative errno value on failure, with @errp
 * set.  Cleanup differs by failure point:
 *   - node name assignment fails: nothing has been set up yet;
 *   - drv->bdrv_open() fails: bs->drv, bs->file and bs->opaque are reset
 *     here (open_failed label);
 *   - refreshing sectors or limits fails: the function returns directly
 *     and bs->drv / bs->opaque stay set.
 *     NOTE(review): presumably the caller's teardown of @bs closes the
 *     driver in that case (bdrv_new_open_driver_opts() calls bdrv_unref())
 *     -- confirm for other callers.
 */
static int no_coroutine_fn GRAPH_UNLOCKED
bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv, const char *node_name,
                 QDict *options, int open_flags, Error **errp)
{
    Error *local_err = NULL;
    int i, ret;
    GLOBAL_STATE_CODE();

    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    /* A driver that needs a filename must have been given one by now */
    assert(!drv->bdrv_needs_filename || bs->filename[0]);
    if (drv->bdrv_open) {
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    } else {
        ret = 0;
    }

    if (ret < 0) {
        /* Prefer the driver's own error; otherwise build a generic one */
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto open_failed;
    }

    /* The driver may only advertise flags that are part of BDRV_REQ_MASK */
    assert(!(bs->supported_read_flags & ~BDRV_REQ_MASK));
    assert(!(bs->supported_write_flags & ~BDRV_REQ_MASK));

    /*
     * Always allow the BDRV_REQ_REGISTERED_BUF optimization hint. This saves
     * drivers that pass read/write requests through to a child the trouble of
     * declaring support explicitly.
     *
     * Drivers must not propagate this flag accidentally when they initiate I/O
     * to a bounce buffer. That case should be rare though.
     */
    bs->supported_read_flags |= BDRV_REQ_REGISTERED_BUF;
    bs->supported_write_flags |= BDRV_REQ_REGISTERED_BUF;

    ret = bdrv_refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        return ret;
    }

    bdrv_graph_rdlock_main_loop();
    bdrv_refresh_limits(bs, NULL, &local_err);
    bdrv_graph_rdunlock_main_loop();

    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }

    assert(bdrv_opt_mem_align(bs) != 0);
    assert(bdrv_min_mem_align(bs) != 0);
    assert(is_power_of_2(bs->bl.request_alignment));

    /*
     * Call the driver's drain_begin callback once per level of
     * bs->quiesce_counter.
     */
    for (i = 0; i < bs->quiesce_counter; i++) {
        if (drv->bdrv_drain_begin) {
            drv->bdrv_drain_begin(bs);
        }
    }

    return 0;
open_failed:
    bs->drv = NULL;

    /* Drop a file child that may have been attached before the failure */
    bdrv_graph_wrlock_drained();
    if (bs->file != NULL) {
        bdrv_unref_child(bs, bs->file);
        assert(!bs->file);
    }
    bdrv_graph_wrunlock();

    g_free(bs->opaque);
    bs->opaque = NULL;
    return ret;
}
1735
1736/*
1737 * Create and open a block node.
1738 *
1739 * @options is a QDict of options to pass to the block drivers, or NULL for an
1740 * empty set of options. The reference to the QDict belongs to the block layer
1741 * after the call (even on failure), so if the caller intends to reuse the
1742 * dictionary, it needs to use qobject_ref() before calling bdrv_open.
1743 */
1744BlockDriverState *bdrv_new_open_driver_opts(BlockDriver *drv,
1745                                            const char *node_name,
1746                                            QDict *options, int flags,
1747                                            Error **errp)
1748{
1749    BlockDriverState *bs;
1750    int ret;
1751
1752    GLOBAL_STATE_CODE();
1753
1754    bs = bdrv_new();
1755    bs->open_flags = flags;
1756    bs->options = options ?: qdict_new();
1757    bs->explicit_options = qdict_clone_shallow(bs->options);
1758    bs->opaque = NULL;
1759
1760    update_options_from_flags(bs->options, flags);
1761
1762    ret = bdrv_open_driver(bs, drv, node_name, bs->options, flags, errp);
1763    if (ret < 0) {
1764        qobject_unref(bs->explicit_options);
1765        bs->explicit_options = NULL;
1766        qobject_unref(bs->options);
1767        bs->options = NULL;
1768        bdrv_unref(bs);
1769        return NULL;
1770    }
1771
1772    return bs;
1773}
1774
1775/* Create and open a block node. */
1776BlockDriverState *bdrv_new_open_driver(BlockDriver *drv, const char *node_name,
1777                                       int flags, Error **errp)
1778{
1779    GLOBAL_STATE_CODE();
1780    return bdrv_new_open_driver_opts(drv, node_name, NULL, flags, errp);
1781}
1782
/*
 * Driver-independent runtime options of a block node.  bdrv_open_common()
 * creates a QemuOpts from this list and absorbs the matching entries of
 * the node's option QDict into it.
 */
QemuOptsList bdrv_runtime_opts = {
    .name = "bdrv_common",
    .head = QTAILQ_HEAD_INITIALIZER(bdrv_runtime_opts.head),
    .desc = {
        {
            .name = "node-name",
            .type = QEMU_OPT_STRING,
            .help = "Node name of the block device node",
        },
        {
            .name = "driver",
            .type = QEMU_OPT_STRING,
            .help = "Block driver to use for the node",
        },
        {
            .name = BDRV_OPT_CACHE_DIRECT,
            .type = QEMU_OPT_BOOL,
            .help = "Bypass software writeback cache on the host",
        },
        {
            .name = BDRV_OPT_CACHE_NO_FLUSH,
            .type = QEMU_OPT_BOOL,
            .help = "Ignore flush requests",
        },
        {
            .name = BDRV_OPT_ACTIVE,
            .type = QEMU_OPT_BOOL,
            .help = "Node is activated",
        },
        {
            .name = BDRV_OPT_READ_ONLY,
            .type = QEMU_OPT_BOOL,
            .help = "Node is opened in read-only mode",
        },
        {
            .name = BDRV_OPT_AUTO_READ_ONLY,
            .type = QEMU_OPT_BOOL,
            .help = "Node can become read-only if opening read-write fails",
        },
        {
            /* parsed by bdrv_parse_detect_zeroes() */
            .name = "detect-zeroes",
            .type = QEMU_OPT_STRING,
            .help = "try to optimize zero writes (off, on, unmap)",
        },
        {
            /* parsed by bdrv_parse_discard_flags() */
            .name = BDRV_OPT_DISCARD,
            .type = QEMU_OPT_STRING,
            .help = "discard operation (ignore/off, unmap/on)",
        },
        {
            .name = BDRV_OPT_FORCE_SHARE,
            .type = QEMU_OPT_BOOL,
            .help = "always accept other writers (default: off)",
        },
        { /* end of list */ }
    },
};
1840
/*
 * Minimal set of image creation options: the virtual disk size and a
 * preallocation mode (the help text lists "off" as the only allowed
 * value).
 * NOTE(review): the users of this list are not visible in this part of
 * the file -- confirm where it is consumed.
 */
QemuOptsList bdrv_create_opts_simple = {
    .name = "simple-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(bdrv_create_opts_simple.head),
    .desc = {
        {
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size"
        },
        {
            .name = BLOCK_OPT_PREALLOC,
            .type = QEMU_OPT_STRING,
            .help = "Preallocation mode (allowed values: off)"
        },
        { /* end of list */ }
    }
};
1858
1859/*
1860 * Common part for opening disk images and files
1861 *
1862 * Removes all processed options from *options.
1863 */
1864static int bdrv_open_common(BlockDriverState *bs, BlockBackend *file,
1865                            QDict *options, Error **errp)
1866{
1867    int ret, open_flags;
1868    const char *filename;
1869    const char *driver_name = NULL;
1870    const char *node_name = NULL;
1871    const char *discard;
1872    QemuOpts *opts;
1873    BlockDriver *drv;
1874    Error *local_err = NULL;
1875    bool ro;
1876
1877    GLOBAL_STATE_CODE();
1878
1879    bdrv_graph_rdlock_main_loop();
1880    assert(bs->file == NULL);
1881    assert(options != NULL && bs->options != options);
1882    bdrv_graph_rdunlock_main_loop();
1883
1884    opts = qemu_opts_create(&bdrv_runtime_opts, NULL, 0, &error_abort);
1885    if (!qemu_opts_absorb_qdict(opts, options, errp)) {
1886        ret = -EINVAL;
1887        goto fail_opts;
1888    }
1889
1890    update_flags_from_options(&bs->open_flags, opts);
1891
1892    driver_name = qemu_opt_get(opts, "driver");
1893    drv = bdrv_find_format(driver_name);
1894    assert(drv != NULL);
1895
1896    bs->force_share = qemu_opt_get_bool(opts, BDRV_OPT_FORCE_SHARE, false);
1897
1898    if (bs->force_share && (bs->open_flags & BDRV_O_RDWR)) {
1899        error_setg(errp,
1900                   BDRV_OPT_FORCE_SHARE
1901                   "=on can only be used with read-only images");
1902        ret = -EINVAL;
1903        goto fail_opts;
1904    }
1905
1906    if (file != NULL) {
1907        bdrv_graph_rdlock_main_loop();
1908        bdrv_refresh_filename(blk_bs(file));
1909        bdrv_graph_rdunlock_main_loop();
1910
1911        filename = blk_bs(file)->filename;
1912    } else {
1913        /*
1914         * Caution: while qdict_get_try_str() is fine, getting
1915         * non-string types would require more care.  When @options
1916         * come from -blockdev or blockdev_add, its members are typed
1917         * according to the QAPI schema, but when they come from
1918         * -drive, they're all QString.
1919         */
1920        filename = qdict_get_try_str(options, "filename");
1921    }
1922
1923    if (drv->bdrv_needs_filename && (!filename || !filename[0])) {
1924        error_setg(errp, "The '%s' block driver requires a file name",
1925                   drv->format_name);
1926        ret = -EINVAL;
1927        goto fail_opts;
1928    }
1929
1930    trace_bdrv_open_common(bs, filename ?: "", bs->open_flags,
1931                           drv->format_name);
1932
1933    ro = bdrv_is_read_only(bs);
1934
1935    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, ro)) {
1936        if (!ro && bdrv_is_whitelisted(drv, true)) {
1937            bdrv_graph_rdlock_main_loop();
1938            ret = bdrv_apply_auto_read_only(bs, NULL, NULL);
1939            bdrv_graph_rdunlock_main_loop();
1940        } else {
1941            ret = -ENOTSUP;
1942        }
1943        if (ret < 0) {
1944            error_setg(errp,
1945                       !ro && bdrv_is_whitelisted(drv, true)
1946                       ? "Driver '%s' can only be used for read-only devices"
1947                       : "Driver '%s' is not whitelisted",
1948                       drv->format_name);
1949            goto fail_opts;
1950        }
1951    }
1952
1953    /* bdrv_new() and bdrv_close() make it so */
1954    assert(qatomic_read(&bs->copy_on_read) == 0);
1955
1956    if (bs->open_flags & BDRV_O_COPY_ON_READ) {
1957        if (!ro) {
1958            bdrv_enable_copy_on_read(bs);
1959        } else {
1960            error_setg(errp, "Can't use copy-on-read on read-only device");
1961            ret = -EINVAL;
1962            goto fail_opts;
1963        }
1964    }
1965
1966    discard = qemu_opt_get(opts, BDRV_OPT_DISCARD);
1967    if (discard != NULL) {
1968        if (bdrv_parse_discard_flags(discard, &bs->open_flags) != 0) {
1969            error_setg(errp, "Invalid discard option");
1970            ret = -EINVAL;
1971            goto fail_opts;
1972        }
1973    }
1974
1975    bs->detect_zeroes =
1976        bdrv_parse_detect_zeroes(opts, bs->open_flags, &local_err);
1977    if (local_err) {
1978        error_propagate(errp, local_err);
1979        ret = -EINVAL;
1980        goto fail_opts;
1981    }
1982
1983    if (filename != NULL) {
1984        pstrcpy(bs->filename, sizeof(bs->filename), filename);
1985    } else {
1986        bs->filename[0] = '\0';
1987    }
1988    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);
1989
1990    /* Open the image, either directly or using a protocol */
1991    open_flags = bdrv_open_flags(bs, bs->open_flags);
1992    node_name = qemu_opt_get(opts, "node-name");
1993
1994    assert(!drv->protocol_name || file == NULL);
1995    ret = bdrv_open_driver(bs, drv, node_name, options, open_flags, errp);
1996    if (ret < 0) {
1997        goto fail_opts;
1998    }
1999
2000    qemu_opts_del(opts);
2001    return 0;
2002
2003fail_opts:
2004    qemu_opts_del(opts);
2005    return ret;
2006}
2007
2008static QDict *parse_json_filename(const char *filename, Error **errp)
2009{
2010    ERRP_GUARD();
2011    QObject *options_obj;
2012    QDict *options;
2013    int ret;
2014    GLOBAL_STATE_CODE();
2015
2016    ret = strstart(filename, "json:", &filename);
2017    assert(ret);
2018
2019    options_obj = qobject_from_json(filename, errp);
2020    if (!options_obj) {
2021        error_prepend(errp, "Could not parse the JSON options: ");
2022        return NULL;
2023    }
2024
2025    options = qobject_to(QDict, options_obj);
2026    if (!options) {
2027        qobject_unref(options_obj);
2028        error_setg(errp, "Invalid JSON object given");
2029        return NULL;
2030    }
2031
2032    qdict_flatten(options);
2033
2034    return options;
2035}
2036
2037static void parse_json_protocol(QDict *options, const char **pfilename,
2038                                Error **errp)
2039{
2040    QDict *json_options;
2041    Error *local_err = NULL;
2042    GLOBAL_STATE_CODE();
2043
2044    /* Parse json: pseudo-protocol */
2045    if (!*pfilename || !g_str_has_prefix(*pfilename, "json:")) {
2046        return;
2047    }
2048
2049    json_options = parse_json_filename(*pfilename, &local_err);
2050    if (local_err) {
2051        error_propagate(errp, local_err);
2052        return;
2053    }
2054
2055    /* Options given in the filename have lower priority than options
2056     * specified directly */
2057    qdict_join(options, json_options, false);
2058    qobject_unref(json_options);
2059    *pfilename = NULL;
2060}
2061
/*
 * Fills in default options for opening images and converts the legacy
 * filename/flags pair to option QDict entries.
 * The BDRV_O_PROTOCOL flag in *flags will be set or cleared accordingly if a
 * block driver has been specified explicitly.
 *
 * @options: in/out options dictionary; "filename" and "driver" entries may
 *           be added, and "filename" may be removed again after
 *           driver-specific parsing.
 * @filename: legacy filename, may be NULL.
 * @flags: in/out open flags.
 * @allow_parse_filename: whether the driver may parse @filename when it was
 *                        taken over from the legacy argument.
 *
 * Returns 0 on success, -ENOENT if the "driver" option names an unknown
 * driver, and -EINVAL for every other failure.  @errp is set on failure.
 */
static int bdrv_fill_options(QDict **options, const char *filename,
                             int *flags, bool allow_parse_filename,
                             Error **errp)
{
    const char *drvname;
    bool protocol = *flags & BDRV_O_PROTOCOL;
    bool parse_filename = false;
    BlockDriver *drv = NULL;
    Error *local_err = NULL;

    GLOBAL_STATE_CODE();

    /*
     * Caution: while qdict_get_try_str() is fine, getting non-string
     * types would require more care.  When @options come from
     * -blockdev or blockdev_add, its members are typed according to
     * the QAPI schema, but when they come from -drive, they're all
     * QString.
     */
    drvname = qdict_get_try_str(*options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        if (!drv) {
            error_setg(errp, "Unknown driver '%s'", drvname);
            return -ENOENT;
        }
        /* If the user has explicitly specified the driver, this choice should
         * override the BDRV_O_PROTOCOL flag */
        protocol = drv->protocol_name;
    }

    /* Make *flags agree with the (possibly overridden) protocol decision */
    if (protocol) {
        *flags |= BDRV_O_PROTOCOL;
    } else {
        *flags &= ~BDRV_O_PROTOCOL;
    }

    /* Translate cache options from flags into options */
    update_options_from_flags(*options, *flags);

    /* Fetch the file name from the options QDict if necessary */
    if (protocol && filename) {
        if (!qdict_haskey(*options, "filename")) {
            qdict_put_str(*options, "filename", filename);
            /* Only a filename taken from the legacy argument gets parsed */
            parse_filename = allow_parse_filename;
        } else {
            error_setg(errp, "Can't specify 'file' and 'filename' options at "
                             "the same time");
            return -EINVAL;
        }
    }

    /* Find the right block driver */
    /* See cautionary note on accessing @options above */
    /* From here on, @filename refers to the string stored in the QDict */
    filename = qdict_get_try_str(*options, "filename");

    if (!drvname && protocol) {
        if (filename) {
            drv = bdrv_find_protocol(filename, parse_filename, errp);
            if (!drv) {
                return -EINVAL;
            }

            drvname = drv->format_name;
            qdict_put_str(*options, "driver", drvname);
        } else {
            error_setg(errp, "Must specify either driver or file");
            return -EINVAL;
        }
    }

    /* A protocol-level open always has a driver by now */
    assert(drv || !protocol);

    /* Driver-specific filename parsing */
    if (drv && drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        /* Drop the raw filename unless the driver says it needs it */
        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        }
    }

    return 0;
}
2156
/*
 * One element of a BlockReopenQueue; the queue is walked through the
 * @entry link (see bdrv_reopen_get_flags()).
 */
typedef struct BlockReopenQueueEntry {
     /* NOTE(review): presumably set once this entry was prepared - confirm */
     bool prepared;
     /* Reopen state; its .bs and .flags identify the node and its new flags */
     BDRVReopenState state;
     /* Tail queue linkage */
     QTAILQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;
2162
2163/*
2164 * Return the flags that @bs will have after the reopens in @q have
2165 * successfully completed. If @q is NULL (or @bs is not contained in @q),
2166 * return the current flags.
2167 */
2168static int bdrv_reopen_get_flags(BlockReopenQueue *q, BlockDriverState *bs)
2169{
2170    BlockReopenQueueEntry *entry;
2171
2172    if (q != NULL) {
2173        QTAILQ_FOREACH(entry, q, entry) {
2174            if (entry->state.bs == bs) {
2175                return entry->state.flags;
2176            }
2177        }
2178    }
2179
2180    return bs->open_flags;
2181}
2182
2183/* Returns whether the image file can be written to after the reopen queue @q
2184 * has been successfully applied, or right now if @q is NULL. */
2185static bool bdrv_is_writable_after_reopen(BlockDriverState *bs,
2186                                          BlockReopenQueue *q)
2187{
2188    int flags = bdrv_reopen_get_flags(q, bs);
2189
2190    return (flags & (BDRV_O_RDWR | BDRV_O_INACTIVE)) == BDRV_O_RDWR;
2191}
2192
2193/*
2194 * Return whether the BDS can be written to.  This is not necessarily
2195 * the same as !bdrv_is_read_only(bs), as inactivated images may not
2196 * be written to but do not count as read-only images.
2197 */
2198bool bdrv_is_writable(BlockDriverState *bs)
2199{
2200    IO_CODE();
2201    return bdrv_is_writable_after_reopen(bs, NULL);
2202}
2203
2204static char *bdrv_child_user_desc(BdrvChild *c)
2205{
2206    GLOBAL_STATE_CODE();
2207    return c->klass->get_parent_desc(c);
2208}
2209
2210/*
2211 * Check that @a allows everything that @b needs. @a and @b must reference same
2212 * child node.
2213 */
2214static bool bdrv_a_allow_b(BdrvChild *a, BdrvChild *b, Error **errp)
2215{
2216    const char *child_bs_name;
2217    g_autofree char *a_user = NULL;
2218    g_autofree char *b_user = NULL;
2219    g_autofree char *perms = NULL;
2220
2221    assert(a->bs);
2222    assert(a->bs == b->bs);
2223    GLOBAL_STATE_CODE();
2224
2225    if ((b->perm & a->shared_perm) == b->perm) {
2226        return true;
2227    }
2228
2229    child_bs_name = bdrv_get_node_name(b->bs);
2230    a_user = bdrv_child_user_desc(a);
2231    b_user = bdrv_child_user_desc(b);
2232    perms = bdrv_perm_names(b->perm & ~a->shared_perm);
2233
2234    error_setg(errp, "Permission conflict on node '%s': permissions '%s' are "
2235               "both required by %s (uses node '%s' as '%s' child) and "
2236               "unshared by %s (uses node '%s' as '%s' child).",
2237               child_bs_name, perms,
2238               b_user, child_bs_name, b->name,
2239               a_user, child_bs_name, a->name);
2240
2241    return false;
2242}
2243
2244static bool GRAPH_RDLOCK
2245bdrv_parent_perms_conflict(BlockDriverState *bs, Error **errp)
2246{
2247    BdrvChild *a, *b;
2248    GLOBAL_STATE_CODE();
2249
2250    /*
2251     * During the loop we'll look at each pair twice. That's correct because
2252     * bdrv_a_allow_b() is asymmetric and we should check each pair in both
2253     * directions.
2254     */
2255    QLIST_FOREACH(a, &bs->parents, next_parent) {
2256        QLIST_FOREACH(b, &bs->parents, next_parent) {
2257            if (a == b) {
2258                continue;
2259            }
2260
2261            if (!bdrv_a_allow_b(a, b, errp)) {
2262                return true;
2263            }
2264        }
2265    }
2266
2267    return false;
2268}
2269
2270static void GRAPH_RDLOCK
2271bdrv_child_perm(BlockDriverState *bs, BlockDriverState *child_bs,
2272                BdrvChild *c, BdrvChildRole role,
2273                BlockReopenQueue *reopen_queue,
2274                uint64_t parent_perm, uint64_t parent_shared,
2275                uint64_t *nperm, uint64_t *nshared)
2276{
2277    assert(bs->drv && bs->drv->bdrv_child_perm);
2278    GLOBAL_STATE_CODE();
2279    bs->drv->bdrv_child_perm(bs, c, role, reopen_queue,
2280                             parent_perm, parent_shared,
2281                             nperm, nshared);
2282    /* TODO Take force_share from reopen_queue */
2283    if (child_bs && child_bs->force_share) {
2284        *nshared = BLK_PERM_ALL;
2285    }
2286}
2287
2288/*
2289 * Adds the whole subtree of @bs (including @bs itself) to the @list (except for
2290 * nodes that are already in the @list, of course) so that final list is
2291 * topologically sorted. Return the result (GSList @list object is updated, so
2292 * don't use old reference after function call).
2293 *
2294 * On function start @list must be already topologically sorted and for any node
2295 * in the @list the whole subtree of the node must be in the @list as well. The
2296 * simplest way to satisfy this criteria: use only result of
2297 * bdrv_topological_dfs() or NULL as @list parameter.
2298 */
2299static GSList * GRAPH_RDLOCK
2300bdrv_topological_dfs(GSList *list, GHashTable *found, BlockDriverState *bs)
2301{
2302    BdrvChild *child;
2303    g_autoptr(GHashTable) local_found = NULL;
2304
2305    GLOBAL_STATE_CODE();
2306
2307    if (!found) {
2308        assert(!list);
2309        found = local_found = g_hash_table_new(NULL, NULL);
2310    }
2311
2312    if (g_hash_table_contains(found, bs)) {
2313        return list;
2314    }
2315    g_hash_table_add(found, bs);
2316
2317    QLIST_FOREACH(child, &bs->children, next) {
2318        list = bdrv_topological_dfs(list, found, child->bs);
2319    }
2320
2321    return g_slist_prepend(list, bs);
2322}
2323
/*
 * Undo record created by bdrv_child_set_perm(): remembers the permissions
 * a child had before they were overwritten, so that the transaction abort
 * handler can restore them.
 */
typedef struct BdrvChildSetPermState {
    /* Child whose permissions were changed */
    BdrvChild *child;
    /* Value of child->perm before the change */
    uint64_t old_perm;
    /* Value of child->shared_perm before the change */
    uint64_t old_shared_perm;
} BdrvChildSetPermState;
2329
2330static void bdrv_child_set_perm_abort(void *opaque)
2331{
2332    BdrvChildSetPermState *s = opaque;
2333
2334    GLOBAL_STATE_CODE();
2335
2336    s->child->perm = s->old_perm;
2337    s->child->shared_perm = s->old_shared_perm;
2338}
2339
/*
 * Transaction action used by bdrv_child_set_perm(): abort restores the old
 * permissions, clean frees the BdrvChildSetPermState.  No commit handler is
 * set.
 *
 * NOTE(review): "pem" in the identifier looks like a typo for "perm".
 */
static TransactionActionDrv bdrv_child_set_pem_drv = {
    .abort = bdrv_child_set_perm_abort,
    .clean = g_free,
};
2344
2345static void bdrv_child_set_perm(BdrvChild *c, uint64_t perm,
2346                                uint64_t shared, Transaction *tran)
2347{
2348    BdrvChildSetPermState *s = g_new(BdrvChildSetPermState, 1);
2349    GLOBAL_STATE_CODE();
2350
2351    *s = (BdrvChildSetPermState) {
2352        .child = c,
2353        .old_perm = c->perm,
2354        .old_shared_perm = c->shared_perm,
2355    };
2356
2357    c->perm = perm;
2358    c->shared_perm = shared;
2359
2360    tran_add(tran, &bdrv_child_set_pem_drv, s);
2361}
2362
2363static void GRAPH_RDLOCK bdrv_drv_set_perm_commit(void *opaque)
2364{
2365    BlockDriverState *bs = opaque;
2366    uint64_t cumulative_perms, cumulative_shared_perms;
2367    GLOBAL_STATE_CODE();
2368
2369    if (bs->drv->bdrv_set_perm) {
2370        bdrv_get_cumulative_perm(bs, &cumulative_perms,
2371                                 &cumulative_shared_perms);
2372        bs->drv->bdrv_set_perm(bs, cumulative_perms, cumulative_shared_perms);
2373    }
2374}
2375
2376static void GRAPH_RDLOCK bdrv_drv_set_perm_abort(void *opaque)
2377{
2378    BlockDriverState *bs = opaque;
2379    GLOBAL_STATE_CODE();
2380
2381    if (bs->drv->bdrv_abort_perm_update) {
2382        bs->drv->bdrv_abort_perm_update(bs);
2383    }
2384}
2385
/*
 * Transaction action registered by bdrv_drv_set_perm(): forwards commit and
 * abort of a permission update to the block driver.  The opaque pointer is
 * the BlockDriverState itself, so there is no clean handler.
 */
TransactionActionDrv bdrv_drv_set_perm_drv = {
    .abort = bdrv_drv_set_perm_abort,
    .commit = bdrv_drv_set_perm_commit,
};
2390
2391/*
2392 * After calling this function, the transaction @tran may only be completed
2393 * while holding a reader lock for the graph.
2394 */
2395static int GRAPH_RDLOCK
2396bdrv_drv_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared_perm,
2397                  Transaction *tran, Error **errp)
2398{
2399    GLOBAL_STATE_CODE();
2400    if (!bs->drv) {
2401        return 0;
2402    }
2403
2404    if (bs->drv->bdrv_check_perm) {
2405        int ret = bs->drv->bdrv_check_perm(bs, perm, shared_perm, errp);
2406        if (ret < 0) {
2407            return ret;
2408        }
2409    }
2410
2411    if (tran) {
2412        tran_add(tran, &bdrv_drv_set_perm_drv, bs);
2413    }
2414
2415    return 0;
2416}
2417
/*
 * Undo record created by bdrv_replace_child_tran(): remembers which node a
 * child pointed to before it was replaced.
 */
typedef struct BdrvReplaceChildState {
    /* Child whose node was replaced */
    BdrvChild *child;
    /* Node that @child referenced before the replacement (may be NULL) */
    BlockDriverState *old_bs;
} BdrvReplaceChildState;
2422
2423static void GRAPH_WRLOCK bdrv_replace_child_commit(void *opaque)
2424{
2425    BdrvReplaceChildState *s = opaque;
2426    GLOBAL_STATE_CODE();
2427
2428    bdrv_schedule_unref(s->old_bs);
2429}
2430
/*
 * Transaction abort handler: make the child point at its old node again and
 * drop the reference to the node that had been put in its place.
 */
static void GRAPH_WRLOCK bdrv_replace_child_abort(void *opaque)
{
    BdrvReplaceChildState *s = opaque;
    /* Node installed by bdrv_replace_child_tran(); NULL if child was emptied */
    BlockDriverState *new_bs = s->child->bs;

    GLOBAL_STATE_CODE();
    assert_bdrv_graph_writable();

    /* old_bs reference is transparently moved from @s to @s->child */
    if (!s->child->bs) {
        /*
         * The parents were undrained when removing old_bs from the child. New
         * requests can't have been made, though, because the child was empty.
         *
         * TODO Make bdrv_replace_child_noperm() transactionable to avoid
         * undraining the parent in the first place. Once this is done, having
         * new_bs drained when calling bdrv_replace_child_tran() is not a
         * requirement any more.
         */
        bdrv_parent_drained_begin_single(s->child);
        assert(!bdrv_parent_drained_poll_single(s->child));
    }
    /* The parent must be quiesced before the child is switched back */
    assert(s->child->quiesced_parent);
    bdrv_replace_child_noperm(s->child, s->old_bs);

    /* Release the reference taken on new_bs by bdrv_replace_child_tran() */
    bdrv_unref(new_bs);
}
2458
/*
 * Transaction action used by bdrv_replace_child_tran(): commit schedules
 * the unref of the old node, abort restores it, clean frees the
 * BdrvReplaceChildState.
 */
static TransactionActionDrv bdrv_replace_child_drv = {
    .commit = bdrv_replace_child_commit,
    .abort = bdrv_replace_child_abort,
    .clean = g_free,
};
2464
2465/*
2466 * bdrv_replace_child_tran
2467 *
2468 * Note: real unref of old_bs is done only on commit.
2469 *
2470 * Both @child->bs and @new_bs (if non-NULL) must be drained. @new_bs must be
2471 * kept drained until the transaction is completed.
2472 *
2473 * After calling this function, the transaction @tran may only be completed
2474 * while holding a writer lock for the graph.
2475 *
2476 * The function doesn't update permissions, caller is responsible for this.
2477 */
2478static void GRAPH_WRLOCK
2479bdrv_replace_child_tran(BdrvChild *child, BlockDriverState *new_bs,
2480                        Transaction *tran)
2481{
2482    BdrvReplaceChildState *s = g_new(BdrvReplaceChildState, 1);
2483
2484    assert(child->quiesced_parent);
2485    assert(!new_bs || new_bs->quiesce_counter);
2486
2487    *s = (BdrvReplaceChildState) {
2488        .child = child,
2489        .old_bs = child->bs,
2490    };
2491    tran_add(tran, &bdrv_replace_child_drv, s);
2492
2493    if (new_bs) {
2494        bdrv_ref(new_bs);
2495    }
2496
2497    bdrv_replace_child_noperm(child, new_bs);
2498    /* old_bs reference is transparently moved from @child to @s */
2499}
2500
2501/*
2502 * Refresh permissions in @bs subtree. The function is intended to be called
2503 * after some graph modification that was done without permission update.
2504 *
2505 * After calling this function, the transaction @tran may only be completed
2506 * while holding a reader lock for the graph.
2507 */
2508static int GRAPH_RDLOCK
2509bdrv_node_refresh_perm(BlockDriverState *bs, BlockReopenQueue *q,
2510                       Transaction *tran, Error **errp)
2511{
2512    BlockDriver *drv = bs->drv;
2513    BdrvChild *c;
2514    int ret;
2515    uint64_t cumulative_perms, cumulative_shared_perms;
2516    GLOBAL_STATE_CODE();
2517
2518    bdrv_get_cumulative_perm(bs, &cumulative_perms, &cumulative_shared_perms);
2519
2520    /* Write permissions never work with read-only images */
2521    if ((cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) &&
2522        !bdrv_is_writable_after_reopen(bs, q))
2523    {
2524        if (!bdrv_is_writable_after_reopen(bs, NULL)) {
2525            error_setg(errp, "Block node is read-only");
2526        } else {
2527            error_setg(errp, "Read-only block node '%s' cannot support "
2528                       "read-write users", bdrv_get_node_name(bs));
2529        }
2530
2531        return -EPERM;
2532    }
2533
2534    /*
2535     * Unaligned requests will automatically be aligned to bl.request_alignment
2536     * and without RESIZE we can't extend requests to write to space beyond the
2537     * end of the image, so it's required that the image size is aligned.
2538     */
2539    if ((cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) &&
2540        !(cumulative_perms & BLK_PERM_RESIZE))
2541    {
2542        if ((bs->total_sectors * BDRV_SECTOR_SIZE) % bs->bl.request_alignment) {
2543            error_setg(errp, "Cannot get 'write' permission without 'resize': "
2544                             "Image size is not a multiple of request "
2545                             "alignment");
2546            return -EPERM;
2547        }
2548    }
2549
2550    /* Check this node */
2551    if (!drv) {
2552        return 0;
2553    }
2554
2555    ret = bdrv_drv_set_perm(bs, cumulative_perms, cumulative_shared_perms, tran,
2556                            errp);
2557    if (ret < 0) {
2558        return ret;
2559    }
2560
2561    /* Drivers that never have children can omit .bdrv_child_perm() */
2562    if (!drv->bdrv_child_perm) {
2563        assert(QLIST_EMPTY(&bs->children));
2564        return 0;
2565    }
2566
2567    /* Check all children */
2568    QLIST_FOREACH(c, &bs->children, next) {
2569        uint64_t cur_perm, cur_shared;
2570
2571        bdrv_child_perm(bs, c->bs, c, c->role, q,
2572                        cumulative_perms, cumulative_shared_perms,
2573                        &cur_perm, &cur_shared);
2574        bdrv_child_set_perm(c, cur_perm, cur_shared, tran);
2575    }
2576
2577    return 0;
2578}
2579
2580/*
2581 * @list is a product of bdrv_topological_dfs() (may be called several times) -
2582 * a topologically sorted subgraph.
2583 *
2584 * After calling this function, the transaction @tran may only be completed
2585 * while holding a reader lock for the graph.
2586 */
2587static int GRAPH_RDLOCK
2588bdrv_do_refresh_perms(GSList *list, BlockReopenQueue *q, Transaction *tran,
2589                      Error **errp)
2590{
2591    int ret;
2592    BlockDriverState *bs;
2593    GLOBAL_STATE_CODE();
2594
2595    for ( ; list; list = list->next) {
2596        bs = list->data;
2597
2598        if (bdrv_parent_perms_conflict(bs, errp)) {
2599            return -EINVAL;
2600        }
2601
2602        ret = bdrv_node_refresh_perm(bs, q, tran, errp);
2603        if (ret < 0) {
2604            return ret;
2605        }
2606    }
2607
2608    return 0;
2609}
2610
2611/*
2612 * @list is any list of nodes. List is completed by all subtrees and
2613 * topologically sorted. It's not a problem if some node occurs in the @list
2614 * several times.
2615 *
2616 * After calling this function, the transaction @tran may only be completed
2617 * while holding a reader lock for the graph.
2618 */
2619static int GRAPH_RDLOCK
2620bdrv_list_refresh_perms(GSList *list, BlockReopenQueue *q, Transaction *tran,
2621                        Error **errp)
2622{
2623    g_autoptr(GHashTable) found = g_hash_table_new(NULL, NULL);
2624    g_autoptr(GSList) refresh_list = NULL;
2625
2626    for ( ; list; list = list->next) {
2627        refresh_list = bdrv_topological_dfs(refresh_list, found, list->data);
2628    }
2629
2630    return bdrv_do_refresh_perms(refresh_list, q, tran, errp);
2631}
2632
2633void bdrv_get_cumulative_perm(BlockDriverState *bs, uint64_t *perm,
2634                              uint64_t *shared_perm)
2635{
2636    BdrvChild *c;
2637    uint64_t cumulative_perms = 0;
2638    uint64_t cumulative_shared_perms = BLK_PERM_ALL;
2639
2640    GLOBAL_STATE_CODE();
2641
2642    QLIST_FOREACH(c, &bs->parents, next_parent) {
2643        cumulative_perms |= c->perm;
2644        cumulative_shared_perms &= c->shared_perm;
2645    }
2646
2647    *perm = cumulative_perms;
2648    *shared_perm = cumulative_shared_perms;
2649}
2650
2651char *bdrv_perm_names(uint64_t perm)
2652{
2653    struct perm_name {
2654        uint64_t perm;
2655        const char *name;
2656    } permissions[] = {
2657        { BLK_PERM_CONSISTENT_READ, "consistent read" },
2658        { BLK_PERM_WRITE,           "write" },
2659        { BLK_PERM_WRITE_UNCHANGED, "write unchanged" },
2660        { BLK_PERM_RESIZE,          "resize" },
2661        { 0, NULL }
2662    };
2663
2664    GString *result = g_string_sized_new(30);
2665    struct perm_name *p;
2666
2667    for (p = permissions; p->name; p++) {
2668        if (perm & p->perm) {
2669            if (result->len > 0) {
2670                g_string_append(result, ", ");
2671            }
2672            g_string_append(result, p->name);
2673        }
2674    }
2675
2676    return g_string_free(result, FALSE);
2677}
2678
2679
2680/*
2681 * @tran is allowed to be NULL. In this case no rollback is possible.
2682 *
2683 * After calling this function, the transaction @tran may only be completed
2684 * while holding a reader lock for the graph.
2685 */
2686static int GRAPH_RDLOCK
2687bdrv_refresh_perms(BlockDriverState *bs, Transaction *tran, Error **errp)
2688{
2689    int ret;
2690    Transaction *local_tran = NULL;
2691    g_autoptr(GSList) list = bdrv_topological_dfs(NULL, NULL, bs);
2692    GLOBAL_STATE_CODE();
2693
2694    if (!tran) {
2695        tran = local_tran = tran_new();
2696    }
2697
2698    ret = bdrv_do_refresh_perms(list, NULL, tran, errp);
2699
2700    if (local_tran) {
2701        tran_finalize(local_tran, ret);
2702    }
2703
2704    return ret;
2705}
2706
2707int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, uint64_t shared,
2708                            Error **errp)
2709{
2710    Error *local_err = NULL;
2711    Transaction *tran = tran_new();
2712    int ret;
2713
2714    GLOBAL_STATE_CODE();
2715
2716    bdrv_child_set_perm(c, perm, shared, tran);
2717
2718    ret = bdrv_refresh_perms(c->bs, tran, &local_err);
2719
2720    tran_finalize(tran, ret);
2721
2722    if (ret < 0) {
2723        if ((perm & ~c->perm) || (c->shared_perm & ~shared)) {
2724            /* tighten permissions */
2725            error_propagate(errp, local_err);
2726        } else {
2727            /*
2728             * Our caller may intend to only loosen restrictions and
2729             * does not expect this function to fail.  Errors are not
2730             * fatal in such a case, so we can just hide them from our
2731             * caller.
2732             */
2733            error_free(local_err);
2734            ret = 0;
2735        }
2736    }
2737
2738    return ret;
2739}
2740
2741int bdrv_child_refresh_perms(BlockDriverState *bs, BdrvChild *c, Error **errp)
2742{
2743    uint64_t parent_perms, parent_shared;
2744    uint64_t perms, shared;
2745
2746    GLOBAL_STATE_CODE();
2747
2748    bdrv_get_cumulative_perm(bs, &parent_perms, &parent_shared);
2749    bdrv_child_perm(bs, c->bs, c, c->role, NULL,
2750                    parent_perms, parent_shared, &perms, &shared);
2751
2752    return bdrv_child_try_set_perm(c, perms, shared, errp);
2753}
2754
2755/*
2756 * Default implementation for .bdrv_child_perm() for block filters:
2757 * Forward CONSISTENT_READ, WRITE, WRITE_UNCHANGED, and RESIZE to the
2758 * filtered child.
2759 */
2760static void bdrv_filter_default_perms(BlockDriverState *bs, BdrvChild *c,
2761                                      BdrvChildRole role,
2762                                      BlockReopenQueue *reopen_queue,
2763                                      uint64_t perm, uint64_t shared,
2764                                      uint64_t *nperm, uint64_t *nshared)
2765{
2766    GLOBAL_STATE_CODE();
2767    *nperm = perm & DEFAULT_PERM_PASSTHROUGH;
2768    *nshared = (shared & DEFAULT_PERM_PASSTHROUGH) | DEFAULT_PERM_UNCHANGED;
2769}
2770
2771static void bdrv_default_perms_for_cow(BlockDriverState *bs, BdrvChild *c,
2772                                       BdrvChildRole role,
2773                                       BlockReopenQueue *reopen_queue,
2774                                       uint64_t perm, uint64_t shared,
2775                                       uint64_t *nperm, uint64_t *nshared)
2776{
2777    assert(role & BDRV_CHILD_COW);
2778    GLOBAL_STATE_CODE();
2779
2780    /*
2781     * We want consistent read from backing files if the parent needs it.
2782     * No other operations are performed on backing files.
2783     */
2784    perm &= BLK_PERM_CONSISTENT_READ;
2785
2786    /*
2787     * If the parent can deal with changing data, we're okay with a
2788     * writable and resizable backing file.
2789     * TODO Require !(perm & BLK_PERM_CONSISTENT_READ), too?
2790     */
2791    if (shared & BLK_PERM_WRITE) {
2792        shared = BLK_PERM_WRITE | BLK_PERM_RESIZE;
2793    } else {
2794        shared = 0;
2795    }
2796
2797    shared |= BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED;
2798
2799    if (bs->open_flags & BDRV_O_INACTIVE) {
2800        shared |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
2801    }
2802
2803    *nperm = perm;
2804    *nshared = shared;
2805}
2806
/*
 * Default permissions for a child that stores metadata and/or data of @bs;
 * @role must include BDRV_CHILD_METADATA or BDRV_CHILD_DATA.
 *
 * Starts from the filter defaults and then adjusts the required (*@nperm)
 * and shared (*@nshared) permissions according to the role bits, the flags
 * the node will have after @reopen_queue is applied, and whether the node
 * is currently inactive.
 */
static void bdrv_default_perms_for_storage(BlockDriverState *bs, BdrvChild *c,
                                           BdrvChildRole role,
                                           BlockReopenQueue *reopen_queue,
                                           uint64_t perm, uint64_t shared,
                                           uint64_t *nperm, uint64_t *nshared)
{
    int flags;

    GLOBAL_STATE_CODE();
    assert(role & (BDRV_CHILD_METADATA | BDRV_CHILD_DATA));

    flags = bdrv_reopen_get_flags(reopen_queue, bs);

    /*
     * Apart from the modifications below, the same permissions are
     * forwarded and left alone as for filters
     */
    bdrv_filter_default_perms(bs, c, role, reopen_queue,
                              perm, shared, &perm, &shared);

    if (role & BDRV_CHILD_METADATA) {
        /* Format drivers may touch metadata even if the guest doesn't write */
        if (bdrv_is_writable_after_reopen(bs, reopen_queue)) {
            perm |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
        }

        /*
         * bs->file always needs to be consistent because of the
         * metadata. We can never allow other users to resize or write
         * to it.
         */
        if (!(flags & BDRV_O_NO_IO)) {
            perm |= BLK_PERM_CONSISTENT_READ;
        }
        shared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
    }

    if (role & BDRV_CHILD_DATA) {
        /*
         * Technically, everything in this block is a subset of the
         * BDRV_CHILD_METADATA path taken above, and so this could
         * be an "else if" branch.  However, that is not obvious, and
         * this function is not performance critical, therefore we let
         * this be an independent "if".
         */

        /*
         * We cannot allow other users to resize the file because the
         * format driver might have some assumptions about the size
         * (e.g. because it is stored in metadata, or because the file
         * is split into fixed-size data files).
         */
        shared &= ~BLK_PERM_RESIZE;

        /*
         * WRITE_UNCHANGED often cannot be performed as such on the
         * data file.  For example, the qcow2 driver may still need to
         * write copied clusters on copy-on-read.
         */
        if (perm & BLK_PERM_WRITE_UNCHANGED) {
            perm |= BLK_PERM_WRITE;
        }

        /*
         * If the data file is written to, the format driver may
         * expect to be able to resize it by writing beyond the EOF.
         */
        if (perm & BLK_PERM_WRITE) {
            perm |= BLK_PERM_RESIZE;
        }
    }

    /*
     * An inactive node shares WRITE and RESIZE again.  Note that this checks
     * the node's current open_flags, not the flags from @reopen_queue.
     */
    if (bs->open_flags & BDRV_O_INACTIVE) {
        shared |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
    }

    *nperm = perm;
    *nshared = shared;
}
2886
2887void bdrv_default_perms(BlockDriverState *bs, BdrvChild *c,
2888                        BdrvChildRole role, BlockReopenQueue *reopen_queue,
2889                        uint64_t perm, uint64_t shared,
2890                        uint64_t *nperm, uint64_t *nshared)
2891{
2892    GLOBAL_STATE_CODE();
2893    if (role & BDRV_CHILD_FILTERED) {
2894        assert(!(role & (BDRV_CHILD_DATA | BDRV_CHILD_METADATA |
2895                         BDRV_CHILD_COW)));
2896        bdrv_filter_default_perms(bs, c, role, reopen_queue,
2897                                  perm, shared, nperm, nshared);
2898    } else if (role & BDRV_CHILD_COW) {
2899        assert(!(role & (BDRV_CHILD_DATA | BDRV_CHILD_METADATA)));
2900        bdrv_default_perms_for_cow(bs, c, role, reopen_queue,
2901                                   perm, shared, nperm, nshared);
2902    } else if (role & (BDRV_CHILD_METADATA | BDRV_CHILD_DATA)) {
2903        bdrv_default_perms_for_storage(bs, c, role, reopen_queue,
2904                                       perm, shared, nperm, nshared);
2905    } else {
2906        g_assert_not_reached();
2907    }
2908}
2909
2910uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm)
2911{
2912    static const uint64_t permissions[] = {
2913        [BLOCK_PERMISSION_CONSISTENT_READ]  = BLK_PERM_CONSISTENT_READ,
2914        [BLOCK_PERMISSION_WRITE]            = BLK_PERM_WRITE,
2915        [BLOCK_PERMISSION_WRITE_UNCHANGED]  = BLK_PERM_WRITE_UNCHANGED,
2916        [BLOCK_PERMISSION_RESIZE]           = BLK_PERM_RESIZE,
2917    };
2918
2919    QEMU_BUILD_BUG_ON(ARRAY_SIZE(permissions) != BLOCK_PERMISSION__MAX);
2920    QEMU_BUILD_BUG_ON(1UL << ARRAY_SIZE(permissions) != BLK_PERM_ALL + 1);
2921
2922    assert(qapi_perm < BLOCK_PERMISSION__MAX);
2923
2924    return permissions[qapi_perm];
2925}
2926
2927/*
2928 * Replaces the node that a BdrvChild points to without updating permissions.
2929 *
2930 * If @new_bs is non-NULL, the parent of @child must already be drained through
2931 * @child.
2932 */
2933static void GRAPH_WRLOCK
2934bdrv_replace_child_noperm(BdrvChild *child, BlockDriverState *new_bs)
2935{
2936    BlockDriverState *old_bs = child->bs;
2937    int new_bs_quiesce_counter;
2938
2939    assert(!child->frozen);
2940
2941    /*
2942     * If we want to change the BdrvChild to point to a drained node as its new
2943     * child->bs, we need to make sure that its new parent is drained, too. In
2944     * other words, either child->quiesce_parent must already be true or we must
2945     * be able to set it and keep the parent's quiesce_counter consistent with
2946     * that, but without polling or starting new requests (this function
2947     * guarantees that it doesn't poll, and starting new requests would be
2948     * against the invariants of drain sections).
2949     *
2950     * To keep things simple, we pick the first option (child->quiesce_parent
2951     * must already be true). We also generalise the rule a bit to make it
2952     * easier to verify in callers and more likely to be covered in test cases:
2953     * The parent must be quiesced through this child even if new_bs isn't
2954     * currently drained.
2955     *
2956     * The only exception is for callers that always pass new_bs == NULL. In
2957     * this case, we obviously never need to consider the case of a drained
2958     * new_bs, so we can keep the callers simpler by allowing them not to drain
2959     * the parent.
2960     */
2961    assert(!new_bs || child->quiesced_parent);
2962    assert(old_bs != new_bs);
2963    GLOBAL_STATE_CODE();
2964
2965    if (old_bs && new_bs) {
2966        assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs));
2967    }
2968
2969    if (old_bs) {
2970        if (child->klass->detach) {
2971            child->klass->detach(child);
2972        }
2973        QLIST_REMOVE(child, next_parent);
2974    }
2975
2976    child->bs = new_bs;
2977
2978    if (new_bs) {
2979        QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);
2980        if (child->klass->attach) {
2981            child->klass->attach(child);
2982        }
2983    }
2984
2985    /*
2986     * If the parent was drained through this BdrvChild previously, but new_bs
2987     * is not drained, allow requests to come in only after the new node has
2988     * been attached.
2989     */
2990    new_bs_quiesce_counter = (new_bs ? new_bs->quiesce_counter : 0);
2991    if (!new_bs_quiesce_counter && child->quiesced_parent) {
2992        bdrv_parent_drained_end_single(child);
2993    }
2994}
2995
2996/**
2997 * Free the given @child.
2998 *
2999 * The child must be empty (i.e. `child->bs == NULL`) and it must be
3000 * unused (i.e. not in a children list).
3001 */
3002static void bdrv_child_free(BdrvChild *child)
3003{
3004    assert(!child->bs);
3005    GLOBAL_STATE_CODE();
3006    GRAPH_RDLOCK_GUARD_MAINLOOP();
3007
3008    assert(!child->next.le_prev); /* not in children list */
3009
3010    g_free(child->name);
3011    g_free(child);
3012}
3013
/*
 * Undo information recorded by bdrv_attach_child_common() and consumed by
 * bdrv_attach_child_common_abort() when the transaction is rolled back.
 */
typedef struct BdrvAttachChildCommonState {
    /* The child created by the attach; abort detaches and frees it */
    BdrvChild *child;
    /* AioContext of the parent as sampled before the attach changed anything */
    AioContext *old_parent_ctx;
    /* AioContext of the child node as sampled before the attach */
    AioContext *old_child_ctx;
} BdrvAttachChildCommonState;
3019
3020static void GRAPH_WRLOCK bdrv_attach_child_common_abort(void *opaque)
3021{
3022    BdrvAttachChildCommonState *s = opaque;
3023    BlockDriverState *bs = s->child->bs;
3024
3025    GLOBAL_STATE_CODE();
3026    assert_bdrv_graph_writable();
3027
3028    bdrv_replace_child_noperm(s->child, NULL);
3029
3030    if (bdrv_get_aio_context(bs) != s->old_child_ctx) {
3031        bdrv_try_change_aio_context_locked(bs, s->old_child_ctx, NULL,
3032                                           &error_abort);
3033    }
3034
3035    if (bdrv_child_get_parent_aio_context(s->child) != s->old_parent_ctx) {
3036        Transaction *tran;
3037        GHashTable *visited;
3038        bool ret;
3039
3040        tran = tran_new();
3041
3042        /* No need to visit `child`, because it has been detached already */
3043        visited = g_hash_table_new(NULL, NULL);
3044        ret = s->child->klass->change_aio_ctx(s->child, s->old_parent_ctx,
3045                                              visited, tran, &error_abort);
3046        g_hash_table_destroy(visited);
3047
3048        /* transaction is supposed to always succeed */
3049        assert(ret == true);
3050        tran_commit(tran);
3051    }
3052
3053    bdrv_schedule_unref(bs);
3054    bdrv_child_free(s->child);
3055}
3056
/*
 * Transaction action registered by bdrv_attach_child_common().  Only an
 * abort handler is needed to undo the attach; no .commit handler is set.
 * .clean releases the BdrvAttachChildCommonState with g_free()
 * (presumably invoked by the transaction core once the transaction has been
 * finalized either way -- confirm against the TransactionActionDrv contract).
 */
static TransactionActionDrv bdrv_attach_child_common_drv = {
    .abort = bdrv_attach_child_common_abort,
    .clean = g_free,
};
3061
3062/*
3063 * Common part of attaching bdrv child to bs or to blk or to job
3064 *
3065 * Function doesn't update permissions, caller is responsible for this.
3066 *
3067 * After calling this function, the transaction @tran may only be completed
3068 * while holding a writer lock for the graph.
3069 *
3070 * Returns new created child.
3071 *
3072 * Both @parent_bs and @child_bs can move to a different AioContext in this
3073 * function.
3074 *
3075 * All block nodes must be drained before this function is called until after
3076 * the transaction is finalized.
3077 */
3078static BdrvChild * GRAPH_WRLOCK
3079bdrv_attach_child_common(BlockDriverState *child_bs,
3080                         const char *child_name,
3081                         const BdrvChildClass *child_class,
3082                         BdrvChildRole child_role,
3083                         uint64_t perm, uint64_t shared_perm,
3084                         void *opaque,
3085                         Transaction *tran, Error **errp)
3086{
3087    BdrvChild *new_child;
3088    AioContext *parent_ctx;
3089    AioContext *child_ctx = bdrv_get_aio_context(child_bs);
3090
3091    assert(child_class->get_parent_desc);
3092    GLOBAL_STATE_CODE();
3093
3094    if (bdrv_is_inactive(child_bs) && (perm & ~BLK_PERM_CONSISTENT_READ)) {
3095        g_autofree char *perm_names = bdrv_perm_names(perm);
3096        error_setg(errp, "Permission '%s' unavailable on inactive node",
3097                   perm_names);
3098        return NULL;
3099    }
3100
3101    new_child = g_new(BdrvChild, 1);
3102    *new_child = (BdrvChild) {
3103        .bs             = NULL,
3104        .name           = g_strdup(child_name),
3105        .klass          = child_class,
3106        .role           = child_role,
3107        .perm           = perm,
3108        .shared_perm    = shared_perm,
3109        .opaque         = opaque,
3110    };
3111
3112    /*
3113     * If the AioContexts don't match, first try to move the subtree of
3114     * child_bs into the AioContext of the new parent. If this doesn't work,
3115     * try moving the parent into the AioContext of child_bs instead.
3116     */
3117    parent_ctx = bdrv_child_get_parent_aio_context(new_child);
3118    if (child_ctx != parent_ctx) {
3119        Error *local_err = NULL;
3120        int ret = bdrv_try_change_aio_context_locked(child_bs, parent_ctx, NULL,
3121                                                     &local_err);
3122
3123        if (ret < 0 && child_class->change_aio_ctx) {
3124            Transaction *aio_ctx_tran = tran_new();
3125            GHashTable *visited = g_hash_table_new(NULL, NULL);
3126            bool ret_child;
3127
3128            g_hash_table_add(visited, new_child);
3129            ret_child = child_class->change_aio_ctx(new_child, child_ctx,
3130                                                    visited, aio_ctx_tran,
3131                                                    NULL);
3132            if (ret_child == true) {
3133                error_free(local_err);
3134                ret = 0;
3135            }
3136            tran_finalize(aio_ctx_tran, ret_child == true ? 0 : -1);
3137            g_hash_table_destroy(visited);
3138        }
3139
3140        if (ret < 0) {
3141            error_propagate(errp, local_err);
3142            bdrv_child_free(new_child);
3143            return NULL;
3144        }
3145    }
3146
3147    bdrv_ref(child_bs);
3148    /*
3149     * Let every new BdrvChild start with a drained parent. Inserting the child
3150     * in the graph with bdrv_replace_child_noperm() will undrain it if
3151     * @child_bs is not drained.
3152     *
3153     * The child was only just created and is not yet visible in global state
3154     * until bdrv_replace_child_noperm() inserts it into the graph, so nobody
3155     * could have sent requests and polling is not necessary.
3156     *
3157     * Note that this means that the parent isn't fully drained yet, we only
3158     * stop new requests from coming in. This is fine, we don't care about the
3159     * old requests here, they are not for this child. If another place enters a
3160     * drain section for the same parent, but wants it to be fully quiesced, it
3161     * will not run most of the code in .drained_begin() again (which is not
3162     * a problem, we already did this), but it will still poll until the parent
3163     * is fully quiesced, so it will not be negatively affected either.
3164     */
3165    bdrv_parent_drained_begin_single(new_child);
3166    bdrv_replace_child_noperm(new_child, child_bs);
3167
3168    BdrvAttachChildCommonState *s = g_new(BdrvAttachChildCommonState, 1);
3169    *s = (BdrvAttachChildCommonState) {
3170        .child = new_child,
3171        .old_parent_ctx = parent_ctx,
3172        .old_child_ctx = child_ctx,
3173    };
3174    tran_add(tran, &bdrv_attach_child_common_drv, s);
3175
3176    return new_child;
3177}
3178
3179/*
3180 * Function doesn't update permissions, caller is responsible for this.
3181 *
3182 * Both @parent_bs and @child_bs can move to a different AioContext in this
3183 * function.
3184 *
3185 * After calling this function, the transaction @tran may only be completed
3186 * while holding a writer lock for the graph.
3187 *
3188 * All block nodes must be drained before this function is called until after
3189 * the transaction is finalized.
3190 */
3191static BdrvChild * GRAPH_WRLOCK
3192bdrv_attach_child_noperm(BlockDriverState *parent_bs,
3193                         BlockDriverState *child_bs,
3194                         const char *child_name,
3195                         const BdrvChildClass *child_class,
3196                         BdrvChildRole child_role,
3197                         Transaction *tran,
3198                         Error **errp)
3199{
3200    uint64_t perm, shared_perm;
3201
3202    assert(parent_bs->drv);
3203    GLOBAL_STATE_CODE();
3204
3205    if (bdrv_recurse_has_child(child_bs, parent_bs)) {
3206        error_setg(errp, "Making '%s' a %s child of '%s' would create a cycle",
3207                   child_bs->node_name, child_name, parent_bs->node_name);
3208        return NULL;
3209    }
3210    if (bdrv_is_inactive(child_bs) && !bdrv_is_inactive(parent_bs)) {
3211        error_setg(errp, "Inactive '%s' can't be a %s child of active '%s'",
3212                   child_bs->node_name, child_name, parent_bs->node_name);
3213        return NULL;
3214    }
3215
3216    bdrv_get_cumulative_perm(parent_bs, &perm, &shared_perm);
3217    bdrv_child_perm(parent_bs, child_bs, NULL, child_role, NULL,
3218                    perm, shared_perm, &perm, &shared_perm);
3219
3220    return bdrv_attach_child_common(child_bs, child_name, child_class,
3221                                    child_role, perm, shared_perm, parent_bs,
3222                                    tran, errp);
3223}
3224
3225/*
3226 * This function steals the reference to child_bs from the caller.
3227 * That reference is later dropped by bdrv_root_unref_child().
3228 *
3229 * On failure NULL is returned, errp is set and the reference to
3230 * child_bs is also dropped.
3231 *
3232 * All block nodes must be drained.
3233 */
3234BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
3235                                  const char *child_name,
3236                                  const BdrvChildClass *child_class,
3237                                  BdrvChildRole child_role,
3238                                  uint64_t perm, uint64_t shared_perm,
3239                                  void *opaque, Error **errp)
3240{
3241    int ret;
3242    BdrvChild *child;
3243    Transaction *tran = tran_new();
3244
3245    GLOBAL_STATE_CODE();
3246
3247    child = bdrv_attach_child_common(child_bs, child_name, child_class,
3248                                   child_role, perm, shared_perm, opaque,
3249                                   tran, errp);
3250    if (!child) {
3251        ret = -EINVAL;
3252        goto out;
3253    }
3254
3255    ret = bdrv_refresh_perms(child_bs, tran, errp);
3256
3257out:
3258    tran_finalize(tran, ret);
3259
3260    bdrv_schedule_unref(child_bs);
3261
3262    return ret < 0 ? NULL : child;
3263}
3264
3265/*
3266 * This function transfers the reference to child_bs from the caller
3267 * to parent_bs. That reference is later dropped by parent_bs on
3268 * bdrv_close() or if someone calls bdrv_unref_child().
3269 *
3270 * On failure NULL is returned, errp is set and the reference to
3271 * child_bs is also dropped.
3272 *
3273 * All block nodes must be drained.
3274 */
3275BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
3276                             BlockDriverState *child_bs,
3277                             const char *child_name,
3278                             const BdrvChildClass *child_class,
3279                             BdrvChildRole child_role,
3280                             Error **errp)
3281{
3282    int ret;
3283    BdrvChild *child;
3284    Transaction *tran = tran_new();
3285
3286    GLOBAL_STATE_CODE();
3287
3288    child = bdrv_attach_child_noperm(parent_bs, child_bs, child_name,
3289                                     child_class, child_role, tran, errp);
3290    if (!child) {
3291        ret = -EINVAL;
3292        goto out;
3293    }
3294
3295    ret = bdrv_refresh_perms(parent_bs, tran, errp);
3296    if (ret < 0) {
3297        goto out;
3298    }
3299
3300out:
3301    tran_finalize(tran, ret);
3302
3303    bdrv_schedule_unref(child_bs);
3304
3305    return ret < 0 ? NULL : child;
3306}
3307
3308/*
3309 * Callers must ensure that child->frozen is false.
3310 *
3311 * All block nodes must be drained.
3312 */
3313void bdrv_root_unref_child(BdrvChild *child)
3314{
3315    BlockDriverState *child_bs = child->bs;
3316
3317    GLOBAL_STATE_CODE();
3318    bdrv_replace_child_noperm(child, NULL);
3319    bdrv_child_free(child);
3320
3321    if (child_bs) {
3322        /*
3323         * Update permissions for old node. We're just taking a parent away, so
3324         * we're loosening restrictions. Errors of permission update are not
3325         * fatal in this case, ignore them.
3326         */
3327        bdrv_refresh_perms(child_bs, NULL, NULL);
3328
3329        /*
3330         * When the parent requiring a non-default AioContext is removed, the
3331         * node moves back to the main AioContext
3332         */
3333        bdrv_try_change_aio_context_locked(child_bs, qemu_get_aio_context(),
3334                                           NULL, NULL);
3335    }
3336
3337    bdrv_schedule_unref(child_bs);
3338}
3339
/*
 * Undo information recorded by bdrv_set_inherits_from() so that
 * bdrv_set_inherits_from_abort() can restore the previous pointer.
 */
typedef struct BdrvSetInheritsFrom {
    /* Node whose inherits_from pointer was changed */
    BlockDriverState *bs;
    /* Value of bs->inherits_from before the change */
    BlockDriverState *old_inherits_from;
} BdrvSetInheritsFrom;
3344
3345static void bdrv_set_inherits_from_abort(void *opaque)
3346{
3347    BdrvSetInheritsFrom *s = opaque;
3348
3349    s->bs->inherits_from = s->old_inherits_from;
3350}
3351
/*
 * Transaction action registered by bdrv_set_inherits_from().  Only an abort
 * handler is needed; .clean releases the BdrvSetInheritsFrom state with
 * g_free().
 */
static TransactionActionDrv bdrv_set_inherits_from_drv = {
    .abort = bdrv_set_inherits_from_abort,
    .clean = g_free,
};
3356
3357/* @tran is allowed to be NULL. In this case no rollback is possible */
3358static void bdrv_set_inherits_from(BlockDriverState *bs,
3359                                   BlockDriverState *new_inherits_from,
3360                                   Transaction *tran)
3361{
3362    if (tran) {
3363        BdrvSetInheritsFrom *s = g_new(BdrvSetInheritsFrom, 1);
3364
3365        *s = (BdrvSetInheritsFrom) {
3366            .bs = bs,
3367            .old_inherits_from = bs->inherits_from,
3368        };
3369
3370        tran_add(tran, &bdrv_set_inherits_from_drv, s);
3371    }
3372
3373    bs->inherits_from = new_inherits_from;
3374}
3375
3376/**
3377 * Clear all inherits_from pointers from children and grandchildren of
3378 * @root that point to @root, where necessary.
3379 * @tran is allowed to be NULL. In this case no rollback is possible
3380 */
3381static void GRAPH_WRLOCK
3382bdrv_unset_inherits_from(BlockDriverState *root, BdrvChild *child,
3383                         Transaction *tran)
3384{
3385    BdrvChild *c;
3386
3387    if (child->bs->inherits_from == root) {
3388        /*
3389         * Remove inherits_from only when the last reference between root and
3390         * child->bs goes away.
3391         */
3392        QLIST_FOREACH(c, &root->children, next) {
3393            if (c != child && c->bs == child->bs) {
3394                break;
3395            }
3396        }
3397        if (c == NULL) {
3398            bdrv_set_inherits_from(child->bs, NULL, tran);
3399        }
3400    }
3401
3402    QLIST_FOREACH(c, &child->bs->children, next) {
3403        bdrv_unset_inherits_from(root, c, tran);
3404    }
3405}
3406
3407/*
3408 * Callers must ensure that child->frozen is false.
3409 *
3410 * All block nodes must be drained.
3411 */
3412void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child)
3413{
3414    GLOBAL_STATE_CODE();
3415    if (child == NULL) {
3416        return;
3417    }
3418
3419    bdrv_unset_inherits_from(parent, child, NULL);
3420    bdrv_root_unref_child(child);
3421}
3422
3423
3424static void GRAPH_RDLOCK
3425bdrv_parent_cb_change_media(BlockDriverState *bs, bool load)
3426{
3427    BdrvChild *c;
3428    GLOBAL_STATE_CODE();
3429    QLIST_FOREACH(c, &bs->parents, next_parent) {
3430        if (c->klass->change_media) {
3431            c->klass->change_media(c, load);
3432        }
3433    }
3434}
3435
3436/* Return true if you can reach parent going through child->inherits_from
3437 * recursively. If parent or child are NULL, return false */
3438static bool bdrv_inherits_from_recursive(BlockDriverState *child,
3439                                         BlockDriverState *parent)
3440{
3441    while (child && child != parent) {
3442        child = child->inherits_from;
3443    }
3444
3445    return child != NULL;
3446}
3447
3448/*
3449 * Return the BdrvChildRole for @bs's backing child.  bs->backing is
3450 * mostly used for COW backing children (role = COW), but also for
3451 * filtered children (role = FILTERED | PRIMARY).
3452 */
3453static BdrvChildRole bdrv_backing_role(BlockDriverState *bs)
3454{
3455    if (bs->drv && bs->drv->is_filter) {
3456        return BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY;
3457    } else {
3458        return BDRV_CHILD_COW;
3459    }
3460}
3461
3462/*
3463 * Sets the bs->backing or bs->file link of a BDS. A new reference is created;
3464 * callers which don't need their own reference any more must call bdrv_unref().
3465 *
3466 * If the respective child is already present (i.e. we're detaching a node),
3467 * that child node must be drained.
3468 *
3469 * Function doesn't update permissions, caller is responsible for this.
3470 *
3471 * Both @parent_bs and @child_bs can move to a different AioContext in this
3472 * function.
3473 *
3474 * After calling this function, the transaction @tran may only be completed
3475 * while holding a writer lock for the graph.
3476 *
3477 * All block nodes must be drained before this function is called until after
3478 * the transaction is finalized.
3479 */
3480static int GRAPH_WRLOCK
3481bdrv_set_file_or_backing_noperm(BlockDriverState *parent_bs,
3482                                BlockDriverState *child_bs,
3483                                bool is_backing,
3484                                Transaction *tran, Error **errp)
3485{
3486    bool update_inherits_from =
3487        bdrv_inherits_from_recursive(child_bs, parent_bs);
3488    BdrvChild *child = is_backing ? parent_bs->backing : parent_bs->file;
3489    BdrvChildRole role;
3490
3491    GLOBAL_STATE_CODE();
3492
3493    if (!parent_bs->drv) {
3494        /*
3495         * Node without drv is an object without a class :/. TODO: finally fix
3496         * qcow2 driver to never clear bs->drv and implement format corruption
3497         * handling in other way.
3498         */
3499        error_setg(errp, "Node corrupted");
3500        return -EINVAL;
3501    }
3502
3503    if (child && child->frozen) {
3504        error_setg(errp, "Cannot change frozen '%s' link from '%s' to '%s'",
3505                   child->name, parent_bs->node_name, child->bs->node_name);
3506        return -EPERM;
3507    }
3508
3509    if (is_backing && !parent_bs->drv->is_filter &&
3510        !parent_bs->drv->supports_backing)
3511    {
3512        error_setg(errp, "Driver '%s' of node '%s' does not support backing "
3513                   "files", parent_bs->drv->format_name, parent_bs->node_name);
3514        return -EINVAL;
3515    }
3516
3517    if (parent_bs->drv->is_filter) {
3518        role = BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY;
3519    } else if (is_backing) {
3520        role = BDRV_CHILD_COW;
3521    } else {
3522        /*
3523         * We only can use same role as it is in existing child. We don't have
3524         * infrastructure to determine role of file child in generic way
3525         */
3526        if (!child) {
3527            error_setg(errp, "Cannot set file child to format node without "
3528                       "file child");
3529            return -EINVAL;
3530        }
3531        role = child->role;
3532    }
3533
3534    if (child) {
3535        assert(child->bs->quiesce_counter);
3536        bdrv_unset_inherits_from(parent_bs, child, tran);
3537        bdrv_remove_child(child, tran);
3538    }
3539
3540    if (!child_bs) {
3541        goto out;
3542    }
3543
3544    child = bdrv_attach_child_noperm(parent_bs, child_bs,
3545                                     is_backing ? "backing" : "file",
3546                                     &child_of_bds, role,
3547                                     tran, errp);
3548    if (!child) {
3549        return -EINVAL;
3550    }
3551
3552
3553    /*
3554     * If inherits_from pointed recursively to bs then let's update it to
3555     * point directly to bs (else it will become NULL).
3556     */
3557    if (update_inherits_from) {
3558        bdrv_set_inherits_from(child_bs, parent_bs, tran);
3559    }
3560
3561out:
3562    bdrv_refresh_limits(parent_bs, tran, NULL);
3563
3564    return 0;
3565}
3566
3567/*
3568 * Both @bs and @backing_hd can move to a different AioContext in this
3569 * function.
3570 *
3571 * All block nodes must be drained.
3572 */
3573int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd,
3574                        Error **errp)
3575{
3576    int ret;
3577    Transaction *tran = tran_new();
3578
3579    GLOBAL_STATE_CODE();
3580    assert(bs->quiesce_counter > 0);
3581    if (bs->backing) {
3582        assert(bs->backing->bs->quiesce_counter > 0);
3583    }
3584
3585    ret = bdrv_set_file_or_backing_noperm(bs, backing_hd, true, tran, errp);
3586    if (ret < 0) {
3587        goto out;
3588    }
3589
3590    ret = bdrv_refresh_perms(bs, tran, errp);
3591out:
3592    tran_finalize(tran, ret);
3593    return ret;
3594}
3595
3596/*
3597 * Opens the backing file for a BlockDriverState if not yet open
3598 *
3599 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
3600 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
3601 * itself, all options starting with "${bdref_key}." are considered part of the
3602 * BlockdevRef.
3603 *
3604 * TODO Can this be unified with bdrv_open_image()?
3605 */
3606int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
3607                           const char *bdref_key, Error **errp)
3608{
3609    ERRP_GUARD();
3610    char *backing_filename = NULL;
3611    char *bdref_key_dot;
3612    const char *reference = NULL;
3613    int ret = 0;
3614    bool implicit_backing = false;
3615    BlockDriverState *backing_hd;
3616    QDict *options;
3617    QDict *tmp_parent_options = NULL;
3618    Error *local_err = NULL;
3619
3620    GLOBAL_STATE_CODE();
3621
3622    bdrv_graph_rdlock_main_loop();
3623
3624    if (bs->backing != NULL) {
3625        goto free_exit;
3626    }
3627
3628    /* NULL means an empty set of options */
3629    if (parent_options == NULL) {
3630        tmp_parent_options = qdict_new();
3631        parent_options = tmp_parent_options;
3632    }
3633
3634    bs->open_flags &= ~BDRV_O_NO_BACKING;
3635
3636    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
3637    qdict_extract_subqdict(parent_options, &options, bdref_key_dot);
3638    g_free(bdref_key_dot);
3639
3640    /*
3641     * Caution: while qdict_get_try_str() is fine, getting non-string
3642     * types would require more care.  When @parent_options come from
3643     * -blockdev or blockdev_add, its members are typed according to
3644     * the QAPI schema, but when they come from -drive, they're all
3645     * QString.
3646     */
3647    reference = qdict_get_try_str(parent_options, bdref_key);
3648    if (reference || qdict_haskey(options, "file.filename")) {
3649        /* keep backing_filename NULL */
3650    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
3651        qobject_unref(options);
3652        goto free_exit;
3653    } else {
3654        if (qdict_size(options) == 0) {
3655            /* If the user specifies options that do not modify the
3656             * backing file's behavior, we might still consider it the
3657             * implicit backing file.  But it's easier this way, and
3658             * just specifying some of the backing BDS's options is
3659             * only possible with -drive anyway (otherwise the QAPI
3660             * schema forces the user to specify everything). */
3661            implicit_backing = !strcmp(bs->auto_backing_file, bs->backing_file);
3662        }
3663
3664        backing_filename = bdrv_get_full_backing_filename(bs, &local_err);
3665        if (local_err) {
3666            ret = -EINVAL;
3667            error_propagate(errp, local_err);
3668            qobject_unref(options);
3669            goto free_exit;
3670        }
3671    }
3672
3673    if (!bs->drv || !bs->drv->supports_backing) {
3674        ret = -EINVAL;
3675        error_setg(errp, "Driver doesn't support backing files");
3676        qobject_unref(options);
3677        goto free_exit;
3678    }
3679
3680    if (!reference &&
3681        bs->backing_format[0] != '\0' && !qdict_haskey(options, "driver")) {
3682        qdict_put_str(options, "driver", bs->backing_format);
3683    }
3684
3685    backing_hd = bdrv_open_inherit(backing_filename, reference, options, 0, bs,
3686                                   &child_of_bds, bdrv_backing_role(bs), true,
3687                                   errp);
3688    if (!backing_hd) {
3689        bs->open_flags |= BDRV_O_NO_BACKING;
3690        error_prepend(errp, "Could not open backing file: ");
3691        ret = -EINVAL;
3692        goto free_exit;
3693    }
3694
3695    if (implicit_backing) {
3696        bdrv_refresh_filename(backing_hd);
3697        pstrcpy(bs->auto_backing_file, sizeof(bs->auto_backing_file),
3698                backing_hd->filename);
3699    }
3700
3701    /* Hook up the backing file link; drop our reference, bs owns the
3702     * backing_hd reference now */
3703    bdrv_graph_rdunlock_main_loop();
3704    bdrv_graph_wrlock_drained();
3705    ret = bdrv_set_backing_hd(bs, backing_hd, errp);
3706    bdrv_graph_wrunlock();
3707    bdrv_graph_rdlock_main_loop();
3708    bdrv_unref(backing_hd);
3709
3710    if (ret < 0) {
3711        goto free_exit;
3712    }
3713
3714    qdict_del(parent_options, bdref_key);
3715
3716free_exit:
3717    g_free(backing_filename);
3718    qobject_unref(tmp_parent_options);
3719    bdrv_graph_rdunlock_main_loop();
3720    return ret;
3721}
3722
3723static BlockDriverState *
3724bdrv_open_child_bs(const char *filename, QDict *options, const char *bdref_key,
3725                   BlockDriverState *parent, const BdrvChildClass *child_class,
3726                   BdrvChildRole child_role, bool allow_none,
3727                   bool parse_filename, Error **errp)
3728{
3729    BlockDriverState *bs = NULL;
3730    QDict *image_options;
3731    char *bdref_key_dot;
3732    const char *reference;
3733
3734    assert(child_class != NULL);
3735
3736    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
3737    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
3738    g_free(bdref_key_dot);
3739
3740    /*
3741     * Caution: while qdict_get_try_str() is fine, getting non-string
3742     * types would require more care.  When @options come from
3743     * -blockdev or blockdev_add, its members are typed according to
3744     * the QAPI schema, but when they come from -drive, they're all
3745     * QString.
3746     */
3747    reference = qdict_get_try_str(options, bdref_key);
3748    if (!filename && !reference && !qdict_size(image_options)) {
3749        if (!allow_none) {
3750            error_setg(errp, "A block device must be specified for \"%s\"",
3751                       bdref_key);
3752        }
3753        qobject_unref(image_options);
3754        goto done;
3755    }
3756
3757    bs = bdrv_open_inherit(filename, reference, image_options, 0,
3758                           parent, child_class, child_role, parse_filename,
3759                           errp);
3760    if (!bs) {
3761        goto done;
3762    }
3763
3764done:
3765    qdict_del(options, bdref_key);
3766    return bs;
3767}
3768
3769static BdrvChild * GRAPH_UNLOCKED
3770bdrv_open_child_common(const char *filename, QDict *options,
3771                       const char *bdref_key, BlockDriverState *parent,
3772                       const BdrvChildClass *child_class,
3773                       BdrvChildRole child_role, bool allow_none,
3774                       bool parse_filename, Error **errp)
3775{
3776    BlockDriverState *bs;
3777    BdrvChild *child;
3778
3779    GLOBAL_STATE_CODE();
3780
3781    bs = bdrv_open_child_bs(filename, options, bdref_key, parent, child_class,
3782                            child_role, allow_none, parse_filename, errp);
3783    if (bs == NULL) {
3784        return NULL;
3785    }
3786
3787    bdrv_graph_wrlock_drained();
3788    child = bdrv_attach_child(parent, bs, bdref_key, child_class, child_role,
3789                              errp);
3790    bdrv_graph_wrunlock();
3791
3792    return child;
3793}
3794
3795/*
3796 * Opens a disk image whose options are given as BlockdevRef in another block
3797 * device's options.
3798 *
3799 * If allow_none is true, no image will be opened if filename is false and no
3800 * BlockdevRef is given. NULL will be returned, but errp remains unset.
3801 *
3802 * bdrev_key specifies the key for the image's BlockdevRef in the options QDict.
3803 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
3804 * itself, all options starting with "${bdref_key}." are considered part of the
3805 * BlockdevRef.
3806 *
3807 * The BlockdevRef will be removed from the options QDict.
3808 *
3809 * @parent can move to a different AioContext in this function.
3810 */
3811BdrvChild *bdrv_open_child(const char *filename,
3812                           QDict *options, const char *bdref_key,
3813                           BlockDriverState *parent,
3814                           const BdrvChildClass *child_class,
3815                           BdrvChildRole child_role,
3816                           bool allow_none, Error **errp)
3817{
3818    return bdrv_open_child_common(filename, options, bdref_key, parent,
3819                                  child_class, child_role, allow_none, false,
3820                                  errp);
3821}
3822
3823/*
3824 * This does mostly the same as bdrv_open_child(), but for opening the primary
3825 * child of a node. A notable difference from bdrv_open_child() is that it
3826 * enables filename parsing for protocol names (including json:).
3827 *
3828 * @parent can move to a different AioContext in this function.
3829 */
3830int bdrv_open_file_child(const char *filename,
3831                         QDict *options, const char *bdref_key,
3832                         BlockDriverState *parent, Error **errp)
3833{
3834    BdrvChildRole role;
3835
3836    /* commit_top and mirror_top don't use this function */
3837    assert(!parent->drv->filtered_child_is_backing);
3838    role = parent->drv->is_filter ?
3839        (BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY) : BDRV_CHILD_IMAGE;
3840
3841    if (!bdrv_open_child_common(filename, options, bdref_key, parent,
3842                                &child_of_bds, role, false, true, errp))
3843    {
3844        return -EINVAL;
3845    }
3846
3847    return 0;
3848}
3849
3850/*
3851 * TODO Future callers may need to specify parent/child_class in order for
3852 * option inheritance to work. Existing callers use it for the root node.
3853 */
3854BlockDriverState *bdrv_open_blockdev_ref(BlockdevRef *ref, Error **errp)
3855{
3856    BlockDriverState *bs = NULL;
3857    QObject *obj = NULL;
3858    QDict *qdict = NULL;
3859    const char *reference = NULL;
3860    Visitor *v = NULL;
3861
3862    GLOBAL_STATE_CODE();
3863
3864    if (ref->type == QTYPE_QSTRING) {
3865        reference = ref->u.reference;
3866    } else {
3867        BlockdevOptions *options = &ref->u.definition;
3868        assert(ref->type == QTYPE_QDICT);
3869
3870        v = qobject_output_visitor_new(&obj);
3871        visit_type_BlockdevOptions(v, NULL, &options, &error_abort);
3872        visit_complete(v, &obj);
3873
3874        qdict = qobject_to(QDict, obj);
3875        qdict_flatten(qdict);
3876
3877        /* bdrv_open_inherit() defaults to the values in bdrv_flags (for
3878         * compatibility with other callers) rather than what we want as the
3879         * real defaults. Apply the defaults here instead. */
3880        qdict_set_default_str(qdict, BDRV_OPT_CACHE_DIRECT, "off");
3881        qdict_set_default_str(qdict, BDRV_OPT_CACHE_NO_FLUSH, "off");
3882        qdict_set_default_str(qdict, BDRV_OPT_READ_ONLY, "off");
3883        qdict_set_default_str(qdict, BDRV_OPT_AUTO_READ_ONLY, "off");
3884
3885    }
3886
3887    bs = bdrv_open_inherit(NULL, reference, qdict, 0, NULL, NULL, 0, false,
3888                           errp);
3889    obj = NULL;
3890    qobject_unref(obj);
3891    visit_free(v);
3892    return bs;
3893}
3894
3895static BlockDriverState *bdrv_append_temp_snapshot(BlockDriverState *bs,
3896                                                   int flags,
3897                                                   QDict *snapshot_options,
3898                                                   Error **errp)
3899{
3900    ERRP_GUARD();
3901    g_autofree char *tmp_filename = NULL;
3902    int64_t total_size;
3903    QemuOpts *opts = NULL;
3904    BlockDriverState *bs_snapshot = NULL;
3905    int ret;
3906
3907    GLOBAL_STATE_CODE();
3908
3909    /* if snapshot, we create a temporary backing file and open it
3910       instead of opening 'filename' directly */
3911
3912    /* Get the required size from the image */
3913    total_size = bdrv_getlength(bs);
3914
3915    if (total_size < 0) {
3916        error_setg_errno(errp, -total_size, "Could not get image size");
3917        goto out;
3918    }
3919
3920    /* Create the temporary image */
3921    tmp_filename = create_tmp_file(errp);
3922    if (!tmp_filename) {
3923        goto out;
3924    }
3925
3926    opts = qemu_opts_create(bdrv_qcow2.create_opts, NULL, 0,
3927                            &error_abort);
3928    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size, &error_abort);
3929    ret = bdrv_create(&bdrv_qcow2, tmp_filename, opts, errp);
3930    qemu_opts_del(opts);
3931    if (ret < 0) {
3932        error_prepend(errp, "Could not create temporary overlay '%s': ",
3933                      tmp_filename);
3934        goto out;
3935    }
3936
3937    /* Prepare options QDict for the temporary file */
3938    qdict_put_str(snapshot_options, "file.driver", "file");
3939    qdict_put_str(snapshot_options, "file.filename", tmp_filename);
3940    qdict_put_str(snapshot_options, "driver", "qcow2");
3941
3942    bs_snapshot = bdrv_open(NULL, NULL, snapshot_options, flags, errp);
3943    snapshot_options = NULL;
3944    if (!bs_snapshot) {
3945        goto out;
3946    }
3947
3948    ret = bdrv_append(bs_snapshot, bs, errp);
3949    if (ret < 0) {
3950        bs_snapshot = NULL;
3951        goto out;
3952    }
3953
3954out:
3955    qobject_unref(snapshot_options);
3956    return bs_snapshot;
3957}
3958
3959/*
3960 * Opens a disk image (raw, qcow2, vmdk, ...)
3961 *
3962 * options is a QDict of options to pass to the block drivers, or NULL for an
3963 * empty set of options. The reference to the QDict belongs to the block layer
3964 * after the call (even on failure), so if the caller intends to reuse the
3965 * dictionary, it needs to use qobject_ref() before calling bdrv_open.
3966 *
3967 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
3968 * If it is not NULL, the referenced BDS will be reused.
3969 *
3970 * The reference parameter may be used to specify an existing block device which
3971 * should be opened. If specified, neither options nor a filename may be given,
3972 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
3973 */
3974static BlockDriverState * no_coroutine_fn
3975bdrv_open_inherit(const char *filename, const char *reference, QDict *options,
3976                  int flags, BlockDriverState *parent,
3977                  const BdrvChildClass *child_class, BdrvChildRole child_role,
3978                  bool parse_filename, Error **errp)
3979{
3980    int ret;
3981    BlockBackend *file = NULL;
3982    BlockDriverState *bs;
3983    BlockDriver *drv = NULL;
3984    BdrvChild *child;
3985    const char *drvname;
3986    const char *backing;
3987    Error *local_err = NULL;
3988    QDict *snapshot_options = NULL;
3989    int snapshot_flags = 0;
3990
3991    assert(!child_class || !flags);
3992    assert(!child_class == !parent);
3993    GLOBAL_STATE_CODE();
3994    assert(!qemu_in_coroutine());
3995
3996    /* TODO We'll eventually have to take a writer lock in this function */
3997    GRAPH_RDLOCK_GUARD_MAINLOOP();
3998
3999    if (reference) {
4000        bool options_non_empty = options ? qdict_size(options) : false;
4001        qobject_unref(options);
4002
4003        if (filename || options_non_empty) {
4004            error_setg(errp, "Cannot reference an existing block device with "
4005                       "additional options or a new filename");
4006            return NULL;
4007        }
4008
4009        bs = bdrv_lookup_bs(reference, reference, errp);
4010        if (!bs) {
4011            return NULL;
4012        }
4013
4014        bdrv_ref(bs);
4015        return bs;
4016    }
4017
4018    bs = bdrv_new();
4019
4020    /* NULL means an empty set of options */
4021    if (options == NULL) {
4022        options = qdict_new();
4023    }
4024
4025    /* json: syntax counts as explicit options, as if in the QDict */
4026    if (parse_filename) {
4027        parse_json_protocol(options, &filename, &local_err);
4028        if (local_err) {
4029            goto fail;
4030        }
4031    }
4032
4033    bs->explicit_options = qdict_clone_shallow(options);
4034
4035    if (child_class) {
4036        bool parent_is_format;
4037
4038        if (parent->drv) {
4039            parent_is_format = parent->drv->is_format;
4040        } else {
4041            /*
4042             * parent->drv is not set yet because this node is opened for
4043             * (potential) format probing.  That means that @parent is going
4044             * to be a format node.
4045             */
4046            parent_is_format = true;
4047        }
4048
4049        bs->inherits_from = parent;
4050        child_class->inherit_options(child_role, parent_is_format,
4051                                     &flags, options,
4052                                     parent->open_flags, parent->options);
4053    }
4054
4055    ret = bdrv_fill_options(&options, filename, &flags, parse_filename,
4056                            &local_err);
4057    if (ret < 0) {
4058        goto fail;
4059    }
4060
4061    /*
4062     * Set the BDRV_O_RDWR and BDRV_O_ALLOW_RDWR flags.
4063     * Caution: getting a boolean member of @options requires care.
4064     * When @options come from -blockdev or blockdev_add, members are
4065     * typed according to the QAPI schema, but when they come from
4066     * -drive, they're all QString.
4067     */
4068    if (g_strcmp0(qdict_get_try_str(options, BDRV_OPT_READ_ONLY), "on") &&
4069        !qdict_get_try_bool(options, BDRV_OPT_READ_ONLY, false)) {
4070        flags |= (BDRV_O_RDWR | BDRV_O_ALLOW_RDWR);
4071    } else {
4072        flags &= ~BDRV_O_RDWR;
4073    }
4074
4075    if (flags & BDRV_O_SNAPSHOT) {
4076        snapshot_options = qdict_new();
4077        bdrv_temp_snapshot_options(&snapshot_flags, snapshot_options,
4078                                   flags, options);
4079        /* Let bdrv_backing_options() override "read-only" */
4080        qdict_del(options, BDRV_OPT_READ_ONLY);
4081        bdrv_inherited_options(BDRV_CHILD_COW, true,
4082                               &flags, options, flags, options);
4083    }
4084
4085    bs->open_flags = flags;
4086    bs->options = options;
4087    options = qdict_clone_shallow(options);
4088
4089    /* Find the right image format driver */
4090    /* See cautionary note on accessing @options above */
4091    drvname = qdict_get_try_str(options, "driver");
4092    if (drvname) {
4093        drv = bdrv_find_format(drvname);
4094        if (!drv) {
4095            error_setg(errp, "Unknown driver: '%s'", drvname);
4096            goto fail;
4097        }
4098    }
4099
4100    assert(drvname || !(flags & BDRV_O_PROTOCOL));
4101
4102    /* See cautionary note on accessing @options above */
4103    backing = qdict_get_try_str(options, "backing");
4104    if (qobject_to(QNull, qdict_get(options, "backing")) != NULL ||
4105        (backing && *backing == '\0'))
4106    {
4107        if (backing) {
4108            warn_report("Use of \"backing\": \"\" is deprecated; "
4109                        "use \"backing\": null instead");
4110        }
4111        flags |= BDRV_O_NO_BACKING;
4112        qdict_del(bs->explicit_options, "backing");
4113        qdict_del(bs->options, "backing");
4114        qdict_del(options, "backing");
4115    }
4116
4117    /* Open image file without format layer. This BlockBackend is only used for
4118     * probing, the block drivers will do their own bdrv_open_child() for the
4119     * same BDS, which is why we put the node name back into options. */
4120    if ((flags & BDRV_O_PROTOCOL) == 0) {
4121        BlockDriverState *file_bs;
4122
4123        file_bs = bdrv_open_child_bs(filename, options, "file", bs,
4124                                     &child_of_bds, BDRV_CHILD_IMAGE,
4125                                     true, true, &local_err);
4126        if (local_err) {
4127            goto fail;
4128        }
4129        if (file_bs != NULL) {
4130            /* Not requesting BLK_PERM_CONSISTENT_READ because we're only
4131             * looking at the header to guess the image format. This works even
4132             * in cases where a guest would not see a consistent state. */
4133            AioContext *ctx = bdrv_get_aio_context(file_bs);
4134            file = blk_new(ctx, 0, BLK_PERM_ALL);
4135            blk_insert_bs(file, file_bs, &local_err);
4136            bdrv_unref(file_bs);
4137
4138            if (local_err) {
4139                goto fail;
4140            }
4141
4142            qdict_put_str(options, "file", bdrv_get_node_name(file_bs));
4143        }
4144    }
4145
4146    /* Image format probing */
4147    bs->probed = !drv;
4148    if (!drv && file) {
4149        ret = find_image_format(file, filename, &drv, &local_err);
4150        if (ret < 0) {
4151            goto fail;
4152        }
4153        /*
4154         * This option update would logically belong in bdrv_fill_options(),
4155         * but we first need to open bs->file for the probing to work, while
4156         * opening bs->file already requires the (mostly) final set of options
4157         * so that cache mode etc. can be inherited.
4158         *
4159         * Adding the driver later is somewhat ugly, but it's not an option
4160         * that would ever be inherited, so it's correct. We just need to make
4161         * sure to update both bs->options (which has the full effective
4162         * options for bs) and options (which has file.* already removed).
4163         */
4164        qdict_put_str(bs->options, "driver", drv->format_name);
4165        qdict_put_str(options, "driver", drv->format_name);
4166    } else if (!drv) {
4167        error_setg(errp, "Must specify either driver or file");
4168        goto fail;
4169    }
4170
4171    /* BDRV_O_PROTOCOL must be set iff a protocol BDS is about to be created */
4172    assert(!!(flags & BDRV_O_PROTOCOL) == !!drv->protocol_name);
4173    /* file must be NULL if a protocol BDS is about to be created
4174     * (the inverse results in an error message from bdrv_open_common()) */
4175    assert(!(flags & BDRV_O_PROTOCOL) || !file);
4176
4177    /* Open the image */
4178    ret = bdrv_open_common(bs, file, options, &local_err);
4179    if (ret < 0) {
4180        goto fail;
4181    }
4182
4183    if (file) {
4184        blk_unref(file);
4185        file = NULL;
4186    }
4187
4188    /* If there is a backing file, use it */
4189    if ((flags & BDRV_O_NO_BACKING) == 0) {
4190        ret = bdrv_open_backing_file(bs, options, "backing", &local_err);
4191        if (ret < 0) {
4192            goto close_and_fail;
4193        }
4194    }
4195
4196    /* Remove all children options and references
4197     * from bs->options and bs->explicit_options */
4198    QLIST_FOREACH(child, &bs->children, next) {
4199        char *child_key_dot;
4200        child_key_dot = g_strdup_printf("%s.", child->name);
4201        qdict_extract_subqdict(bs->explicit_options, NULL, child_key_dot);
4202        qdict_extract_subqdict(bs->options, NULL, child_key_dot);
4203        qdict_del(bs->explicit_options, child->name);
4204        qdict_del(bs->options, child->name);
4205        g_free(child_key_dot);
4206    }
4207
4208    /* Check if any unknown options were used */
4209    if (qdict_size(options) != 0) {
4210        const QDictEntry *entry = qdict_first(options);
4211        if (flags & BDRV_O_PROTOCOL) {
4212            error_setg(errp, "Block protocol '%s' doesn't support the option "
4213                       "'%s'", drv->format_name, entry->key);
4214        } else {
4215            error_setg(errp,
4216                       "Block format '%s' does not support the option '%s'",
4217                       drv->format_name, entry->key);
4218        }
4219
4220        goto close_and_fail;
4221    }
4222
4223    bdrv_parent_cb_change_media(bs, true);
4224
4225    qobject_unref(options);
4226    options = NULL;
4227
4228    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
4229     * temporary snapshot afterwards. */
4230    if (snapshot_flags) {
4231        BlockDriverState *snapshot_bs;
4232        snapshot_bs = bdrv_append_temp_snapshot(bs, snapshot_flags,
4233                                                snapshot_options, &local_err);
4234        snapshot_options = NULL;
4235        if (local_err) {
4236            goto close_and_fail;
4237        }
4238        /* We are not going to return bs but the overlay on top of it
4239         * (snapshot_bs); thus, we have to drop the strong reference to bs
4240         * (which we obtained by calling bdrv_new()). bs will not be deleted,
4241         * though, because the overlay still has a reference to it. */
4242        bdrv_unref(bs);
4243        bs = snapshot_bs;
4244    }
4245
4246    return bs;
4247
4248fail:
4249    blk_unref(file);
4250    qobject_unref(snapshot_options);
4251    qobject_unref(bs->explicit_options);
4252    qobject_unref(bs->options);
4253    qobject_unref(options);
4254    bs->options = NULL;
4255    bs->explicit_options = NULL;
4256    bdrv_unref(bs);
4257    error_propagate(errp, local_err);
4258    return NULL;
4259
4260close_and_fail:
4261    bdrv_unref(bs);
4262    qobject_unref(snapshot_options);
4263    qobject_unref(options);
4264    error_propagate(errp, local_err);
4265    return NULL;
4266}
4267
4268BlockDriverState *bdrv_open(const char *filename, const char *reference,
4269                            QDict *options, int flags, Error **errp)
4270{
4271    GLOBAL_STATE_CODE();
4272
4273    return bdrv_open_inherit(filename, reference, options, flags, NULL,
4274                             NULL, 0, true, errp);
4275}
4276
/*
 * Return true if @str is equal to one of the strings in the NULL-terminated
 * array @list.  A NULL @str or a NULL @list never matches.
 */
static bool is_str_in_list(const char *str, const char *const *list)
{
    const char *const *cur;

    if (!str || !list) {
        return false;
    }

    for (cur = list; *cur != NULL; cur++) {
        if (strcmp(str, *cur) == 0) {
            return true;
        }
    }

    return false;
}
4290
4291/*
4292 * Check that every option set in @bs->options is also set in
4293 * @new_opts.
4294 *
4295 * Options listed in the common_options list and in
4296 * @bs->drv->mutable_opts are skipped.
4297 *
4298 * Return 0 on success, otherwise return -EINVAL and set @errp.
4299 */
4300static int bdrv_reset_options_allowed(BlockDriverState *bs,
4301                                      const QDict *new_opts, Error **errp)
4302{
4303    const QDictEntry *e;
4304    /* These options are common to all block drivers and are handled
4305     * in bdrv_reopen_prepare() so they can be left out of @new_opts */
4306    const char *const common_options[] = {
4307        "node-name", "discard", "cache.direct", "cache.no-flush",
4308        "read-only", "auto-read-only", "detect-zeroes", NULL
4309    };
4310
4311    for (e = qdict_first(bs->options); e; e = qdict_next(bs->options, e)) {
4312        if (!qdict_haskey(new_opts, e->key) &&
4313            !is_str_in_list(e->key, common_options) &&
4314            !is_str_in_list(e->key, bs->drv->mutable_opts)) {
4315            error_setg(errp, "Option '%s' cannot be reset "
4316                       "to its default value", e->key);
4317            return -EINVAL;
4318        }
4319    }
4320
4321    return 0;
4322}
4323
4324/*
4325 * Returns true if @child can be reached recursively from @bs
4326 */
4327static bool GRAPH_RDLOCK
4328bdrv_recurse_has_child(BlockDriverState *bs, BlockDriverState *child)
4329{
4330    BdrvChild *c;
4331
4332    if (bs == child) {
4333        return true;
4334    }
4335
4336    QLIST_FOREACH(c, &bs->children, next) {
4337        if (bdrv_recurse_has_child(c->bs, child)) {
4338            return true;
4339        }
4340    }
4341
4342    return false;
4343}
4344
4345/*
4346 * Adds a BlockDriverState to a simple queue for an atomic, transactional
4347 * reopen of multiple devices.
4348 *
4349 * bs_queue can either be an existing BlockReopenQueue that has had QTAILQ_INIT
4350 * already performed, or alternatively may be NULL a new BlockReopenQueue will
4351 * be created and initialized. This newly created BlockReopenQueue should be
4352 * passed back in for subsequent calls that are intended to be of the same
4353 * atomic 'set'.
4354 *
4355 * bs is the BlockDriverState to add to the reopen queue.
4356 *
4357 * options contains the changed options for the associated bs
4358 * (the BlockReopenQueue takes ownership)
4359 *
4360 * flags contains the open flags for the associated bs
4361 *
4362 * returns a pointer to bs_queue, which is either the newly allocated
4363 * bs_queue, or the existing bs_queue being used.
4364 *
4365 * bs must be drained.
4366 */
4367static BlockReopenQueue * GRAPH_RDLOCK
4368bdrv_reopen_queue_child(BlockReopenQueue *bs_queue, BlockDriverState *bs,
4369                        QDict *options, const BdrvChildClass *klass,
4370                        BdrvChildRole role, bool parent_is_format,
4371                        QDict *parent_options, int parent_flags,
4372                        bool keep_old_opts)
4373{
4374    assert(bs != NULL);
4375
4376    BlockReopenQueueEntry *bs_entry;
4377    BdrvChild *child;
4378    QDict *old_options, *explicit_options, *options_copy;
4379    int flags;
4380    QemuOpts *opts;
4381
4382    GLOBAL_STATE_CODE();
4383
4384    assert(bs->quiesce_counter > 0);
4385
4386    if (bs_queue == NULL) {
4387        bs_queue = g_new0(BlockReopenQueue, 1);
4388        QTAILQ_INIT(bs_queue);
4389    }
4390
4391    if (!options) {
4392        options = qdict_new();
4393    }
4394
4395    /* Check if this BlockDriverState is already in the queue */
4396    QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
4397        if (bs == bs_entry->state.bs) {
4398            break;
4399        }
4400    }
4401
4402    /*
4403     * Precedence of options:
4404     * 1. Explicitly passed in options (highest)
4405     * 2. Retained from explicitly set options of bs
4406     * 3. Inherited from parent node
4407     * 4. Retained from effective options of bs
4408     */
4409
4410    /* Old explicitly set values (don't overwrite by inherited value) */
4411    if (bs_entry || keep_old_opts) {
4412        old_options = qdict_clone_shallow(bs_entry ?
4413                                          bs_entry->state.explicit_options :
4414                                          bs->explicit_options);
4415        bdrv_join_options(bs, options, old_options);
4416        qobject_unref(old_options);
4417    }
4418
4419    explicit_options = qdict_clone_shallow(options);
4420
4421    /* Inherit from parent node */
4422    if (parent_options) {
4423        flags = 0;
4424        klass->inherit_options(role, parent_is_format, &flags, options,
4425                               parent_flags, parent_options);
4426    } else {
4427        flags = bdrv_get_flags(bs);
4428    }
4429
4430    if (keep_old_opts) {
4431        /* Old values are used for options that aren't set yet */
4432        old_options = qdict_clone_shallow(bs->options);
4433        bdrv_join_options(bs, options, old_options);
4434        qobject_unref(old_options);
4435    }
4436
4437    /* We have the final set of options so let's update the flags */
4438    options_copy = qdict_clone_shallow(options);
4439    opts = qemu_opts_create(&bdrv_runtime_opts, NULL, 0, &error_abort);
4440    qemu_opts_absorb_qdict(opts, options_copy, NULL);
4441    update_flags_from_options(&flags, opts);
4442    qemu_opts_del(opts);
4443    qobject_unref(options_copy);
4444
4445    /* bdrv_open_inherit() sets and clears some additional flags internally */
4446    flags &= ~BDRV_O_PROTOCOL;
4447    if (flags & BDRV_O_RDWR) {
4448        flags |= BDRV_O_ALLOW_RDWR;
4449    }
4450
4451    if (!bs_entry) {
4452        bs_entry = g_new0(BlockReopenQueueEntry, 1);
4453        QTAILQ_INSERT_TAIL(bs_queue, bs_entry, entry);
4454    } else {
4455        qobject_unref(bs_entry->state.options);
4456        qobject_unref(bs_entry->state.explicit_options);
4457    }
4458
4459    bs_entry->state.bs = bs;
4460    bs_entry->state.options = options;
4461    bs_entry->state.explicit_options = explicit_options;
4462    bs_entry->state.flags = flags;
4463
4464    /*
4465     * If keep_old_opts is false then it means that unspecified
4466     * options must be reset to their original value. We don't allow
4467     * resetting 'backing' but we need to know if the option is
4468     * missing in order to decide if we have to return an error.
4469     */
4470    if (!keep_old_opts) {
4471        bs_entry->state.backing_missing =
4472            !qdict_haskey(options, "backing") &&
4473            !qdict_haskey(options, "backing.driver");
4474    }
4475
4476    QLIST_FOREACH(child, &bs->children, next) {
4477        QDict *new_child_options = NULL;
4478        bool child_keep_old = keep_old_opts;
4479
4480        /* reopen can only change the options of block devices that were
4481         * implicitly created and inherited options. For other (referenced)
4482         * block devices, a syntax like "backing.foo" results in an error. */
4483        if (child->bs->inherits_from != bs) {
4484            continue;
4485        }
4486
4487        /* Check if the options contain a child reference */
4488        if (qdict_haskey(options, child->name)) {
4489            const char *childref = qdict_get_try_str(options, child->name);
4490            /*
4491             * The current child must not be reopened if the child
4492             * reference is null or points to a different node.
4493             */
4494            if (g_strcmp0(childref, child->bs->node_name)) {
4495                continue;
4496            }
4497            /*
4498             * If the child reference points to the current child then
4499             * reopen it with its existing set of options (note that
4500             * it can still inherit new options from the parent).
4501             */
4502            child_keep_old = true;
4503        } else {
4504            /* Extract child options ("child-name.*") */
4505            char *child_key_dot = g_strdup_printf("%s.", child->name);
4506            qdict_extract_subqdict(explicit_options, NULL, child_key_dot);
4507            qdict_extract_subqdict(options, &new_child_options, child_key_dot);
4508            g_free(child_key_dot);
4509        }
4510
4511        bdrv_reopen_queue_child(bs_queue, child->bs, new_child_options,
4512                                child->klass, child->role, bs->drv->is_format,
4513                                options, flags, child_keep_old);
4514    }
4515
4516    return bs_queue;
4517}
4518
4519BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
4520                                    BlockDriverState *bs,
4521                                    QDict *options, bool keep_old_opts)
4522{
4523    GLOBAL_STATE_CODE();
4524
4525    if (bs_queue == NULL) {
4526        /* Paired with bdrv_drain_all_end() in bdrv_reopen_queue_free(). */
4527        bdrv_drain_all_begin();
4528    }
4529
4530    GRAPH_RDLOCK_GUARD_MAINLOOP();
4531
4532    return bdrv_reopen_queue_child(bs_queue, bs, options, NULL, 0, false,
4533                                   NULL, 0, keep_old_opts);
4534}
4535
4536void bdrv_reopen_queue_free(BlockReopenQueue *bs_queue)
4537{
4538    GLOBAL_STATE_CODE();
4539    if (bs_queue) {
4540        BlockReopenQueueEntry *bs_entry, *next;
4541        QTAILQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
4542            qobject_unref(bs_entry->state.explicit_options);
4543            qobject_unref(bs_entry->state.options);
4544            g_free(bs_entry);
4545        }
4546        g_free(bs_queue);
4547
4548        /* Paired with bdrv_drain_all_begin() in bdrv_reopen_queue(). */
4549        bdrv_drain_all_end();
4550    }
4551}
4552
4553/*
4554 * Reopen multiple BlockDriverStates atomically & transactionally.
4555 *
4556 * The queue passed in (bs_queue) must have been built up previous
4557 * via bdrv_reopen_queue().
4558 *
4559 * Reopens all BDS specified in the queue, with the appropriate
4560 * flags.  All devices are prepared for reopen, and failure of any
4561 * device will cause all device changes to be abandoned, and intermediate
4562 * data cleaned up.
4563 *
4564 * If all devices prepare successfully, then the changes are committed
4565 * to all devices.
4566 *
4567 * All affected nodes must be drained between bdrv_reopen_queue() and
4568 * bdrv_reopen_multiple().
4569 *
4570 * To be called from the main thread, with all other AioContexts unlocked.
4571 */
4572int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
4573{
4574    int ret = -1;
4575    BlockReopenQueueEntry *bs_entry, *next;
4576    Transaction *tran = tran_new();
4577    g_autoptr(GSList) refresh_list = NULL;
4578
4579    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
4580    assert(bs_queue != NULL);
4581    GLOBAL_STATE_CODE();
4582
4583    QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
4584        ret = bdrv_flush(bs_entry->state.bs);
4585        if (ret < 0) {
4586            error_setg_errno(errp, -ret, "Error flushing drive");
4587            goto abort;
4588        }
4589    }
4590
4591    QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
4592        assert(bs_entry->state.bs->quiesce_counter > 0);
4593        ret = bdrv_reopen_prepare(&bs_entry->state, bs_queue, tran, errp);
4594        if (ret < 0) {
4595            goto abort;
4596        }
4597        bs_entry->prepared = true;
4598    }
4599
4600    QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
4601        BDRVReopenState *state = &bs_entry->state;
4602
4603        refresh_list = g_slist_prepend(refresh_list, state->bs);
4604        if (state->old_backing_bs) {
4605            refresh_list = g_slist_prepend(refresh_list, state->old_backing_bs);
4606        }
4607        if (state->old_file_bs) {
4608            refresh_list = g_slist_prepend(refresh_list, state->old_file_bs);
4609        }
4610    }
4611
4612    /*
4613     * Note that file-posix driver rely on permission update done during reopen
4614     * (even if no permission changed), because it wants "new" permissions for
4615     * reconfiguring the fd and that's why it does it in raw_check_perm(), not
4616     * in raw_reopen_prepare() which is called with "old" permissions.
4617     */
4618    bdrv_graph_rdlock_main_loop();
4619    ret = bdrv_list_refresh_perms(refresh_list, bs_queue, tran, errp);
4620    bdrv_graph_rdunlock_main_loop();
4621
4622    if (ret < 0) {
4623        goto abort;
4624    }
4625
4626    /*
4627     * If we reach this point, we have success and just need to apply the
4628     * changes.
4629     *
4630     * Reverse order is used to comfort qcow2 driver: on commit it need to write
4631     * IN_USE flag to the image, to mark bitmaps in the image as invalid. But
4632     * children are usually goes after parents in reopen-queue, so go from last
4633     * to first element.
4634     */
4635    QTAILQ_FOREACH_REVERSE(bs_entry, bs_queue, entry) {
4636        bdrv_reopen_commit(&bs_entry->state);
4637    }
4638
4639    bdrv_graph_wrlock();
4640    tran_commit(tran);
4641    bdrv_graph_wrunlock();
4642
4643    QTAILQ_FOREACH_REVERSE(bs_entry, bs_queue, entry) {
4644        BlockDriverState *bs = bs_entry->state.bs;
4645
4646        if (bs->drv->bdrv_reopen_commit_post) {
4647            bs->drv->bdrv_reopen_commit_post(&bs_entry->state);
4648        }
4649    }
4650
4651    ret = 0;
4652    goto cleanup;
4653
4654abort:
4655    bdrv_graph_wrlock();
4656    tran_abort(tran);
4657    bdrv_graph_wrunlock();
4658
4659    QTAILQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
4660        if (bs_entry->prepared) {
4661            bdrv_reopen_abort(&bs_entry->state);
4662        }
4663    }
4664
4665cleanup:
4666    bdrv_reopen_queue_free(bs_queue);
4667
4668    return ret;
4669}
4670
4671int bdrv_reopen(BlockDriverState *bs, QDict *opts, bool keep_old_opts,
4672                Error **errp)
4673{
4674    BlockReopenQueue *queue;
4675
4676    GLOBAL_STATE_CODE();
4677
4678    queue = bdrv_reopen_queue(NULL, bs, opts, keep_old_opts);
4679
4680    return bdrv_reopen_multiple(queue, errp);
4681}
4682
4683int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only,
4684                              Error **errp)
4685{
4686    QDict *opts = qdict_new();
4687
4688    GLOBAL_STATE_CODE();
4689
4690    qdict_put_bool(opts, BDRV_OPT_READ_ONLY, read_only);
4691
4692    return bdrv_reopen(bs, opts, true, errp);
4693}
4694
4695/*
4696 * Take a BDRVReopenState and check if the value of 'backing' in the
4697 * reopen_state->options QDict is valid or not.
4698 *
4699 * If 'backing' is missing from the QDict then return 0.
4700 *
4701 * If 'backing' contains the node name of the backing file of
4702 * reopen_state->bs then return 0.
4703 *
4704 * If 'backing' contains a different node name (or is null) then check
4705 * whether the current backing file can be replaced with the new one.
4706 * If that's the case then reopen_state->replace_backing_bs is set to
4707 * true and reopen_state->new_backing_bs contains a pointer to the new
4708 * backing BlockDriverState (or NULL).
4709 *
4710 * After calling this function, the transaction @tran may only be completed
4711 * while holding a writer lock for the graph.
4712 *
4713 * Return 0 on success, otherwise return < 0 and set @errp.
4714 *
4715 * @reopen_state->bs can move to a different AioContext in this function.
4716 *
4717 * All block nodes must be drained before this function is called until after
4718 * the transaction is finalized.
4719 */
4720static int GRAPH_UNLOCKED
4721bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state,
4722                                  bool is_backing, Transaction *tran,
4723                                  Error **errp)
4724{
4725    BlockDriverState *bs = reopen_state->bs;
4726    BlockDriverState *new_child_bs;
4727    BlockDriverState *old_child_bs;
4728
4729    const char *child_name = is_backing ? "backing" : "file";
4730    QObject *value;
4731    const char *str;
4732    bool has_child;
4733    int ret;
4734
4735    GLOBAL_STATE_CODE();
4736
4737    value = qdict_get(reopen_state->options, child_name);
4738    if (value == NULL) {
4739        return 0;
4740    }
4741
4742    bdrv_graph_rdlock_main_loop();
4743
4744    switch (qobject_type(value)) {
4745    case QTYPE_QNULL:
4746        assert(is_backing); /* The 'file' option does not allow a null value */
4747        new_child_bs = NULL;
4748        break;
4749    case QTYPE_QSTRING:
4750        str = qstring_get_str(qobject_to(QString, value));
4751        new_child_bs = bdrv_lookup_bs(NULL, str, errp);
4752        if (new_child_bs == NULL) {
4753            ret = -EINVAL;
4754            goto out_rdlock;
4755        }
4756
4757        has_child = bdrv_recurse_has_child(new_child_bs, bs);
4758        if (has_child) {
4759            error_setg(errp, "Making '%s' a %s child of '%s' would create a "
4760                       "cycle", str, child_name, bs->node_name);
4761            ret = -EINVAL;
4762            goto out_rdlock;
4763        }
4764        break;
4765    default:
4766        /*
4767         * The options QDict has been flattened, so 'backing' and 'file'
4768         * do not allow any other data type here.
4769         */
4770        g_assert_not_reached();
4771    }
4772
4773    old_child_bs = is_backing ? child_bs(bs->backing) : child_bs(bs->file);
4774    if (old_child_bs == new_child_bs) {
4775        ret = 0;
4776        goto out_rdlock;
4777    }
4778
4779    if (old_child_bs) {
4780        if (bdrv_skip_implicit_filters(old_child_bs) == new_child_bs) {
4781            ret = 0;
4782            goto out_rdlock;
4783        }
4784
4785        if (old_child_bs->implicit) {
4786            error_setg(errp, "Cannot replace implicit %s child of %s",
4787                       child_name, bs->node_name);
4788            ret = -EPERM;
4789            goto out_rdlock;
4790        }
4791    }
4792
4793    if (bs->drv->is_filter && !old_child_bs) {
4794        /*
4795         * Filters always have a file or a backing child, so we are trying to
4796         * change wrong child
4797         */
4798        error_setg(errp, "'%s' is a %s filter node that does not support a "
4799                   "%s child", bs->node_name, bs->drv->format_name, child_name);
4800        ret = -EINVAL;
4801        goto out_rdlock;
4802    }
4803
4804    if (is_backing) {
4805        reopen_state->old_backing_bs = old_child_bs;
4806    } else {
4807        reopen_state->old_file_bs = old_child_bs;
4808    }
4809
4810    if (old_child_bs) {
4811        bdrv_ref(old_child_bs);
4812        assert(old_child_bs->quiesce_counter > 0);
4813    }
4814
4815    bdrv_graph_rdunlock_main_loop();
4816    bdrv_graph_wrlock();
4817
4818    ret = bdrv_set_file_or_backing_noperm(bs, new_child_bs, is_backing,
4819                                          tran, errp);
4820
4821    bdrv_graph_wrunlock();
4822
4823    if (old_child_bs) {
4824        bdrv_unref(old_child_bs);
4825    }
4826
4827    return ret;
4828
4829out_rdlock:
4830    bdrv_graph_rdunlock_main_loop();
4831    return ret;
4832}
4833
4834/*
4835 * Prepares a BlockDriverState for reopen. All changes are staged in the
4836 * 'opaque' field of the BDRVReopenState, which is used and allocated by
4837 * the block driver layer .bdrv_reopen_prepare()
4838 *
4839 * bs is the BlockDriverState to reopen
4840 * flags are the new open flags
4841 * queue is the reopen queue
4842 *
4843 * Returns 0 on success, non-zero on error.  On error errp will be set
4844 * as well.
4845 *
4846 * On failure, bdrv_reopen_abort() will be called to clean up any data.
4847 * It is the responsibility of the caller to then call the abort() or
4848 * commit() for any other BDS that have been left in a prepare() state
4849 *
4850 * After calling this function, the transaction @change_child_tran may only be
4851 * completed while holding a writer lock for the graph.
4852 *
4853 * All block nodes must be drained before this function is called until after
4854 * the transaction is finalized.
4855 */
4856static int GRAPH_UNLOCKED
4857bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
4858                    Transaction *change_child_tran, Error **errp)
4859{
4860    int ret = -1;
4861    int old_flags;
4862    Error *local_err = NULL;
4863    BlockDriver *drv;
4864    QemuOpts *opts;
4865    QDict *orig_reopen_opts;
4866    char *discard = NULL;
4867    bool read_only;
4868    bool drv_prepared = false;
4869
4870    assert(reopen_state != NULL);
4871    assert(reopen_state->bs->drv != NULL);
4872    GLOBAL_STATE_CODE();
4873    drv = reopen_state->bs->drv;
4874
4875    /* This function and each driver's bdrv_reopen_prepare() remove
4876     * entries from reopen_state->options as they are processed, so
4877     * we need to make a copy of the original QDict. */
4878    orig_reopen_opts = qdict_clone_shallow(reopen_state->options);
4879
4880    /* Process generic block layer options */
4881    opts = qemu_opts_create(&bdrv_runtime_opts, NULL, 0, &error_abort);
4882    if (!qemu_opts_absorb_qdict(opts, reopen_state->options, errp)) {
4883        ret = -EINVAL;
4884        goto error;
4885    }
4886
4887    /* This was already called in bdrv_reopen_queue_child() so the flags
4888     * are up-to-date. This time we simply want to remove the options from
4889     * QemuOpts in order to indicate that they have been processed. */
4890    old_flags = reopen_state->flags;
4891    update_flags_from_options(&reopen_state->flags, opts);
4892    assert(old_flags == reopen_state->flags);
4893
4894    discard = qemu_opt_get_del(opts, BDRV_OPT_DISCARD);
4895    if (discard != NULL) {
4896        if (bdrv_parse_discard_flags(discard, &reopen_state->flags) != 0) {
4897            error_setg(errp, "Invalid discard option");
4898            ret = -EINVAL;
4899            goto error;
4900        }
4901    }
4902
4903    reopen_state->detect_zeroes =
4904        bdrv_parse_detect_zeroes(opts, reopen_state->flags, &local_err);
4905    if (local_err) {
4906        error_propagate(errp, local_err);
4907        ret = -EINVAL;
4908        goto error;
4909    }
4910
4911    /* All other options (including node-name and driver) must be unchanged.
4912     * Put them back into the QDict, so that they are checked at the end
4913     * of this function. */
4914    qemu_opts_to_qdict(opts, reopen_state->options);
4915
4916    /* If we are to stay read-only, do not allow permission change
4917     * to r/w. Attempting to set to r/w may fail if either BDRV_O_ALLOW_RDWR is
4918     * not set, or if the BDS still has copy_on_read enabled */
4919    read_only = !(reopen_state->flags & BDRV_O_RDWR);
4920
4921    bdrv_graph_rdlock_main_loop();
4922    ret = bdrv_can_set_read_only(reopen_state->bs, read_only, true, &local_err);
4923    bdrv_graph_rdunlock_main_loop();
4924    if (local_err) {
4925        error_propagate(errp, local_err);
4926        goto error;
4927    }
4928
4929    if (drv->bdrv_reopen_prepare) {
4930        /*
4931         * If a driver-specific option is missing, it means that we
4932         * should reset it to its default value.
4933         * But not all options allow that, so we need to check it first.
4934         */
4935        ret = bdrv_reset_options_allowed(reopen_state->bs,
4936                                         reopen_state->options, errp);
4937        if (ret) {
4938            goto error;
4939        }
4940
4941        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
4942        if (ret) {
4943            if (local_err != NULL) {
4944                error_propagate(errp, local_err);
4945            } else {
4946                bdrv_graph_rdlock_main_loop();
4947                bdrv_refresh_filename(reopen_state->bs);
4948                bdrv_graph_rdunlock_main_loop();
4949                error_setg(errp, "failed while preparing to reopen image '%s'",
4950                           reopen_state->bs->filename);
4951            }
4952            goto error;
4953        }
4954    } else {
4955        /* It is currently mandatory to have a bdrv_reopen_prepare()
4956         * handler for each supported drv. */
4957        bdrv_graph_rdlock_main_loop();
4958        error_setg(errp, "Block format '%s' used by node '%s' "
4959                   "does not support reopening files", drv->format_name,
4960                   bdrv_get_device_or_node_name(reopen_state->bs));
4961        bdrv_graph_rdunlock_main_loop();
4962        ret = -1;
4963        goto error;
4964    }
4965
4966    drv_prepared = true;
4967
4968    /*
4969     * We must provide the 'backing' option if the BDS has a backing
4970     * file or if the image file has a backing file name as part of
4971     * its metadata. Otherwise the 'backing' option can be omitted.
4972     */
4973    bdrv_graph_rdlock_main_loop();
4974    if (drv->supports_backing && reopen_state->backing_missing &&
4975        (reopen_state->bs->backing || reopen_state->bs->backing_file[0])) {
4976        error_setg(errp, "backing is missing for '%s'",
4977                   reopen_state->bs->node_name);
4978        bdrv_graph_rdunlock_main_loop();
4979        ret = -EINVAL;
4980        goto error;
4981    }
4982    bdrv_graph_rdunlock_main_loop();
4983
4984    /*
4985     * Allow changing the 'backing' option. The new value can be
4986     * either a reference to an existing node (using its node name)
4987     * or NULL to simply detach the current backing file.
4988     */
4989    ret = bdrv_reopen_parse_file_or_backing(reopen_state, true,
4990                                            change_child_tran, errp);
4991    if (ret < 0) {
4992        goto error;
4993    }
4994    qdict_del(reopen_state->options, "backing");
4995
4996    /* Allow changing the 'file' option. In this case NULL is not allowed */
4997    ret = bdrv_reopen_parse_file_or_backing(reopen_state, false,
4998                                            change_child_tran, errp);
4999    if (ret < 0) {
5000        goto error;
5001    }
5002    qdict_del(reopen_state->options, "file");
5003
5004    /* Options that are not handled are only okay if they are unchanged
5005     * compared to the old state. It is expected that some options are only
5006     * used for the initial open, but not reopen (e.g. filename) */
5007    if (qdict_size(reopen_state->options)) {
5008        const QDictEntry *entry = qdict_first(reopen_state->options);
5009
5010        GRAPH_RDLOCK_GUARD_MAINLOOP();
5011
5012        do {
5013            QObject *new = entry->value;
5014            QObject *old = qdict_get(reopen_state->bs->options, entry->key);
5015
5016            /* Allow child references (child_name=node_name) as long as they
5017             * point to the current child (i.e. everything stays the same). */
5018            if (qobject_type(new) == QTYPE_QSTRING) {
5019                BdrvChild *child;
5020                QLIST_FOREACH(child, &reopen_state->bs->children, next) {
5021                    if (!strcmp(child->name, entry->key)) {
5022                        break;
5023                    }
5024                }
5025
5026                if (child) {
5027                    if (!strcmp(child->bs->node_name,
5028                                qstring_get_str(qobject_to(QString, new)))) {
5029                        continue; /* Found child with this name, skip option */
5030                    }
5031                }
5032            }
5033
5034            /*
5035             * TODO: When using -drive to specify blockdev options, all values
5036             * will be strings; however, when using -blockdev, blockdev-add or
5037             * filenames using the json:{} pseudo-protocol, they will be
5038             * correctly typed.
5039             * In contrast, reopening options are (currently) always strings
5040             * (because you can only specify them through qemu-io; all other
5041             * callers do not specify any options).
5042             * Therefore, when using anything other than -drive to create a BDS,
5043             * this cannot detect non-string options as unchanged, because
5044             * qobject_is_equal() always returns false for objects of different
5045             * type.  In the future, this should be remedied by correctly typing
5046             * all options.  For now, this is not too big of an issue because
5047             * the user can simply omit options which cannot be changed anyway,
5048             * so they will stay unchanged.
5049             */
5050            if (!qobject_is_equal(new, old)) {
5051                error_setg(errp, "Cannot change the option '%s'", entry->key);
5052                ret = -EINVAL;
5053                goto error;
5054            }
5055        } while ((entry = qdict_next(reopen_state->options, entry)));
5056    }
5057
5058    ret = 0;
5059
5060    /* Restore the original reopen_state->options QDict */
5061    qobject_unref(reopen_state->options);
5062    reopen_state->options = qobject_ref(orig_reopen_opts);
5063
5064error:
5065    if (ret < 0 && drv_prepared) {
5066        /* drv->bdrv_reopen_prepare() has succeeded, so we need to
5067         * call drv->bdrv_reopen_abort() before signaling an error
5068         * (bdrv_reopen_multiple() will not call bdrv_reopen_abort()
5069         * when the respective bdrv_reopen_prepare() has failed) */
5070        if (drv->bdrv_reopen_abort) {
5071            drv->bdrv_reopen_abort(reopen_state);
5072        }
5073    }
5074    qemu_opts_del(opts);
5075    qobject_unref(orig_reopen_opts);
5076    g_free(discard);
5077    return ret;
5078}
5079
5080/*
5081 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
5082 * makes them final by swapping the staging BlockDriverState contents into
5083 * the active BlockDriverState contents.
5084 */
5085static void GRAPH_UNLOCKED bdrv_reopen_commit(BDRVReopenState *reopen_state)
5086{
5087    BlockDriver *drv;
5088    BlockDriverState *bs;
5089    BdrvChild *child;
5090
5091    assert(reopen_state != NULL);
5092    bs = reopen_state->bs;
5093    drv = bs->drv;
5094    assert(drv != NULL);
5095    GLOBAL_STATE_CODE();
5096
5097    /* If there are any driver level actions to take */
5098    if (drv->bdrv_reopen_commit) {
5099        drv->bdrv_reopen_commit(reopen_state);
5100    }
5101
5102    GRAPH_RDLOCK_GUARD_MAINLOOP();
5103
5104    /* set BDS specific flags now */
5105    qobject_unref(bs->explicit_options);
5106    qobject_unref(bs->options);
5107    qobject_ref(reopen_state->explicit_options);
5108    qobject_ref(reopen_state->options);
5109
5110    bs->explicit_options   = reopen_state->explicit_options;
5111    bs->options            = reopen_state->options;
5112    bs->open_flags         = reopen_state->flags;
5113    bs->detect_zeroes      = reopen_state->detect_zeroes;
5114
5115    /* Remove child references from bs->options and bs->explicit_options.
5116     * Child options were already removed in bdrv_reopen_queue_child() */
5117    QLIST_FOREACH(child, &bs->children, next) {
5118        qdict_del(bs->explicit_options, child->name);
5119        qdict_del(bs->options, child->name);
5120    }
5121    /* backing is probably removed, so it's not handled by previous loop */
5122    qdict_del(bs->explicit_options, "backing");
5123    qdict_del(bs->options, "backing");
5124
5125    bdrv_refresh_limits(bs, NULL, NULL);
5126    bdrv_refresh_total_sectors(bs, bs->total_sectors);
5127}
5128
5129/*
5130 * Abort the reopen, and delete and free the staged changes in
5131 * reopen_state
5132 */
5133static void GRAPH_UNLOCKED bdrv_reopen_abort(BDRVReopenState *reopen_state)
5134{
5135    BlockDriver *drv;
5136
5137    assert(reopen_state != NULL);
5138    drv = reopen_state->bs->drv;
5139    assert(drv != NULL);
5140    GLOBAL_STATE_CODE();
5141
5142    if (drv->bdrv_reopen_abort) {
5143        drv->bdrv_reopen_abort(reopen_state);
5144    }
5145}
5146
5147
5148static void GRAPH_UNLOCKED bdrv_close(BlockDriverState *bs)
5149{
5150    BdrvAioNotifier *ban, *ban_next;
5151    BdrvChild *child, *next;
5152
5153    GLOBAL_STATE_CODE();
5154    assert(!bs->refcnt);
5155
5156    bdrv_drained_begin(bs); /* complete I/O */
5157    bdrv_flush(bs);
5158    bdrv_drain(bs); /* in case flush left pending I/O */
5159
5160    if (bs->drv) {
5161        if (bs->drv->bdrv_close) {
5162            /* Must unfreeze all children, so bdrv_unref_child() works */
5163            bs->drv->bdrv_close(bs);
5164        }
5165        bs->drv = NULL;
5166    }
5167
5168    bdrv_graph_wrlock_drained();
5169    QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
5170        bdrv_unref_child(bs, child);
5171    }
5172
5173    assert(!bs->backing);
5174    assert(!bs->file);
5175    bdrv_graph_wrunlock();
5176
5177    g_free(bs->opaque);
5178    bs->opaque = NULL;
5179    qatomic_set(&bs->copy_on_read, 0);
5180    bs->backing_file[0] = '\0';
5181    bs->backing_format[0] = '\0';
5182    bs->total_sectors = 0;
5183    bs->encrypted = false;
5184    bs->sg = false;
5185    qobject_unref(bs->options);
5186    qobject_unref(bs->explicit_options);
5187    bs->options = NULL;
5188    bs->explicit_options = NULL;
5189    qobject_unref(bs->full_open_options);
5190    bs->full_open_options = NULL;
5191    g_free(bs->block_status_cache);
5192    bs->block_status_cache = NULL;
5193
5194    bdrv_release_named_dirty_bitmaps(bs);
5195    assert(QLIST_EMPTY(&bs->dirty_bitmaps));
5196
5197    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
5198        g_free(ban);
5199    }
5200    QLIST_INIT(&bs->aio_notifiers);
5201    bdrv_drained_end(bs);
5202
5203    /*
5204     * If we're still inside some bdrv_drain_all_begin()/end() sections, end
5205     * them now since this BDS won't exist anymore when bdrv_drain_all_end()
5206     * gets called.
5207     */
5208    if (bs->quiesce_counter) {
5209        bdrv_drain_all_end_quiesce(bs);
5210    }
5211}
5212
5213void bdrv_close_all(void)
5214{
5215    GLOBAL_STATE_CODE();
5216    assert(job_next(NULL) == NULL);
5217
5218    /* Drop references from requests still in flight, such as canceled block
5219     * jobs whose AIO context has not been polled yet */
5220    bdrv_drain_all();
5221
5222    blk_remove_all_bs();
5223    blockdev_close_all_bdrv_states();
5224
5225    assert(QTAILQ_EMPTY(&all_bdrv_states));
5226}
5227
5228static bool GRAPH_RDLOCK should_update_child(BdrvChild *c, BlockDriverState *to)
5229{
5230    GQueue *queue;
5231    GHashTable *found;
5232    bool ret;
5233
5234    if (c->klass->stay_at_node) {
5235        return false;
5236    }
5237
5238    /* If the child @c belongs to the BDS @to, replacing the current
5239     * c->bs by @to would mean to create a loop.
5240     *
5241     * Such a case occurs when appending a BDS to a backing chain.
5242     * For instance, imagine the following chain:
5243     *
5244     *   guest device -> node A -> further backing chain...
5245     *
5246     * Now we create a new BDS B which we want to put on top of this
5247     * chain, so we first attach A as its backing node:
5248     *
5249     *                   node B
5250     *                     |
5251     *                     v
5252     *   guest device -> node A -> further backing chain...
5253     *
5254     * Finally we want to replace A by B.  When doing that, we want to
5255     * replace all pointers to A by pointers to B -- except for the
5256     * pointer from B because (1) that would create a loop, and (2)
5257     * that pointer should simply stay intact:
5258     *
5259     *   guest device -> node B
5260     *                     |
5261     *                     v
5262     *                   node A -> further backing chain...
5263     *
5264     * In general, when replacing a node A (c->bs) by a node B (@to),
5265     * if A is a child of B, that means we cannot replace A by B there
5266     * because that would create a loop.  Silently detaching A from B
5267     * is also not really an option.  So overall just leaving A in
5268     * place there is the most sensible choice.
5269     *
5270     * We would also create a loop in any cases where @c is only
5271     * indirectly referenced by @to. Prevent this by returning false
5272     * if @c is found (by breadth-first search) anywhere in the whole
5273     * subtree of @to.
5274     */
5275
5276    ret = true;
5277    found = g_hash_table_new(NULL, NULL);
5278    g_hash_table_add(found, to);
5279    queue = g_queue_new();
5280    g_queue_push_tail(queue, to);
5281
5282    while (!g_queue_is_empty(queue)) {
5283        BlockDriverState *v = g_queue_pop_head(queue);
5284        BdrvChild *c2;
5285
5286        QLIST_FOREACH(c2, &v->children, next) {
5287            if (c2 == c) {
5288                ret = false;
5289                break;
5290            }
5291
5292            if (g_hash_table_contains(found, c2->bs)) {
5293                continue;
5294            }
5295
5296            g_queue_push_tail(queue, c2->bs);
5297            g_hash_table_add(found, c2->bs);
5298        }
5299    }
5300
5301    g_queue_free(queue);
5302    g_hash_table_destroy(found);
5303
5304    return ret;
5305}
5306
5307static void bdrv_remove_child_commit(void *opaque)
5308{
5309    GLOBAL_STATE_CODE();
5310    bdrv_child_free(opaque);
5311}
5312
5313static TransactionActionDrv bdrv_remove_child_drv = {
5314    .commit = bdrv_remove_child_commit,
5315};
5316
5317/*
5318 * Function doesn't update permissions, caller is responsible for this.
5319 *
5320 * @child->bs (if non-NULL) must be drained.
5321 *
5322 * After calling this function, the transaction @tran may only be completed
5323 * while holding a writer lock for the graph.
5324 */
5325static void GRAPH_WRLOCK bdrv_remove_child(BdrvChild *child, Transaction *tran)
5326{
5327    if (!child) {
5328        return;
5329    }
5330
5331    if (child->bs) {
5332        assert(child->quiesced_parent);
5333        bdrv_replace_child_tran(child, NULL, tran);
5334    }
5335
5336    tran_add(tran, &bdrv_remove_child_drv, child);
5337}
5338
5339/*
5340 * Both @from and @to (if non-NULL) must be drained. @to must be kept drained
5341 * until the transaction is completed.
5342 *
5343 * After calling this function, the transaction @tran may only be completed
5344 * while holding a writer lock for the graph.
5345 */
5346static int GRAPH_WRLOCK
5347bdrv_replace_node_noperm(BlockDriverState *from,
5348                         BlockDriverState *to,
5349                         bool auto_skip, Transaction *tran,
5350                         Error **errp)
5351{
5352    BdrvChild *c, *next;
5353
5354    GLOBAL_STATE_CODE();
5355
5356    assert(from->quiesce_counter);
5357    assert(to->quiesce_counter);
5358
5359    QLIST_FOREACH_SAFE(c, &from->parents, next_parent, next) {
5360        assert(c->bs == from);
5361        if (!should_update_child(c, to)) {
5362            if (auto_skip) {
5363                continue;
5364            }
5365            error_setg(errp, "Should not change '%s' link to '%s'",
5366                       c->name, from->node_name);
5367            return -EINVAL;
5368        }
5369        if (c->frozen) {
5370            error_setg(errp, "Cannot change '%s' link to '%s'",
5371                       c->name, from->node_name);
5372            return -EPERM;
5373        }
5374        bdrv_replace_child_tran(c, to, tran);
5375    }
5376
5377    return 0;
5378}
5379
5380/*
5381 * Switch all parents of @from to point to @to instead. @from and @to must be in
5382 * the same AioContext and both must be drained.
5383 *
5384 * With auto_skip=true bdrv_replace_node_common skips updating from parents
5385 * if it creates a parent-child relation loop or if parent is block-job.
5386 *
5387 * With auto_skip=false the error is returned if from has a parent which should
5388 * not be updated.
5389 *
5390 * With @detach_subchain=true @to must be in a backing chain of @from. In this
5391 * case backing link of the cow-parent of @to is removed.
5392 */
5393static int GRAPH_WRLOCK
5394bdrv_replace_node_common(BlockDriverState *from, BlockDriverState *to,
5395                         bool auto_skip, bool detach_subchain, Error **errp)
5396{
5397    Transaction *tran = tran_new();
5398    g_autoptr(GSList) refresh_list = NULL;
5399    BlockDriverState *to_cow_parent = NULL;
5400    int ret;
5401
5402    GLOBAL_STATE_CODE();
5403
5404    assert(from->quiesce_counter);
5405    assert(to->quiesce_counter);
5406    assert(bdrv_get_aio_context(from) == bdrv_get_aio_context(to));
5407
5408    if (detach_subchain) {
5409        assert(bdrv_chain_contains(from, to));
5410        assert(from != to);
5411        for (to_cow_parent = from;
5412             bdrv_filter_or_cow_bs(to_cow_parent) != to;
5413             to_cow_parent = bdrv_filter_or_cow_bs(to_cow_parent))
5414        {
5415            ;
5416        }
5417    }
5418
5419    /*
5420     * Do the replacement without permission update.
5421     * Replacement may influence the permissions, we should calculate new
5422     * permissions based on new graph. If we fail, we'll roll-back the
5423     * replacement.
5424     */
5425    ret = bdrv_replace_node_noperm(from, to, auto_skip, tran, errp);
5426    if (ret < 0) {
5427        goto out;
5428    }
5429
5430    if (detach_subchain) {
5431        /* to_cow_parent is already drained because from is drained */
5432        bdrv_remove_child(bdrv_filter_or_cow_child(to_cow_parent), tran);
5433    }
5434
5435    refresh_list = g_slist_prepend(refresh_list, to);
5436    refresh_list = g_slist_prepend(refresh_list, from);
5437
5438    ret = bdrv_list_refresh_perms(refresh_list, NULL, tran, errp);
5439    if (ret < 0) {
5440        goto out;
5441    }
5442
5443    ret = 0;
5444
5445out:
5446    tran_finalize(tran, ret);
5447    return ret;
5448}
5449
5450int bdrv_replace_node(BlockDriverState *from, BlockDriverState *to,
5451                      Error **errp)
5452{
5453    return bdrv_replace_node_common(from, to, true, false, errp);
5454}
5455
5456int bdrv_drop_filter(BlockDriverState *bs, Error **errp)
5457{
5458    BlockDriverState *child_bs;
5459    int ret;
5460
5461    GLOBAL_STATE_CODE();
5462
5463    bdrv_graph_rdlock_main_loop();
5464    child_bs = bdrv_filter_or_cow_bs(bs);
5465    bdrv_graph_rdunlock_main_loop();
5466
5467    bdrv_drained_begin(child_bs);
5468    bdrv_graph_wrlock();
5469    ret = bdrv_replace_node_common(bs, child_bs, true, true, errp);
5470    bdrv_graph_wrunlock();
5471    bdrv_drained_end(child_bs);
5472
5473    return ret;
5474}
5475
5476/*
5477 * Add new bs contents at the top of an image chain while the chain is
5478 * live, while keeping required fields on the top layer.
5479 *
5480 * This will modify the BlockDriverState fields, and swap contents
5481 * between bs_new and bs_top. Both bs_new and bs_top are modified.
5482 *
5483 * bs_new must not be attached to a BlockBackend and must not have backing
5484 * child.
5485 *
5486 * This function does not create any image files.
5487 */
5488int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
5489                Error **errp)
5490{
5491    int ret;
5492    BdrvChild *child;
5493    Transaction *tran = tran_new();
5494
5495    GLOBAL_STATE_CODE();
5496
5497    bdrv_graph_rdlock_main_loop();
5498    assert(!bs_new->backing);
5499    bdrv_graph_rdunlock_main_loop();
5500
5501    bdrv_graph_wrlock_drained();
5502
5503    child = bdrv_attach_child_noperm(bs_new, bs_top, "backing",
5504                                     &child_of_bds, bdrv_backing_role(bs_new),
5505                                     tran, errp);
5506    if (!child) {
5507        ret = -EINVAL;
5508        goto out;
5509    }
5510
5511    ret = bdrv_replace_node_noperm(bs_top, bs_new, true, tran, errp);
5512    if (ret < 0) {
5513        goto out;
5514    }
5515
5516    ret = bdrv_refresh_perms(bs_new, tran, errp);
5517out:
5518    tran_finalize(tran, ret);
5519
5520    bdrv_refresh_limits(bs_top, NULL, NULL);
5521    bdrv_graph_wrunlock();
5522
5523    return ret;
5524}
5525
5526/* Not for empty child */
5527int bdrv_replace_child_bs(BdrvChild *child, BlockDriverState *new_bs,
5528                          Error **errp)
5529{
5530    int ret;
5531    Transaction *tran = tran_new();
5532    g_autoptr(GSList) refresh_list = NULL;
5533    BlockDriverState *old_bs = child->bs;
5534
5535    GLOBAL_STATE_CODE();
5536
5537    bdrv_ref(old_bs);
5538    bdrv_drained_begin(old_bs);
5539    bdrv_drained_begin(new_bs);
5540    bdrv_graph_wrlock();
5541
5542    bdrv_replace_child_tran(child, new_bs, tran);
5543
5544    refresh_list = g_slist_prepend(refresh_list, old_bs);
5545    refresh_list = g_slist_prepend(refresh_list, new_bs);
5546
5547    ret = bdrv_list_refresh_perms(refresh_list, NULL, tran, errp);
5548
5549    tran_finalize(tran, ret);
5550
5551    bdrv_graph_wrunlock();
5552    bdrv_drained_end(old_bs);
5553    bdrv_drained_end(new_bs);
5554    bdrv_unref(old_bs);
5555
5556    return ret;
5557}
5558
5559static void bdrv_delete(BlockDriverState *bs)
5560{
5561    assert(bdrv_op_blocker_is_empty(bs));
5562    assert(!bs->refcnt);
5563    GLOBAL_STATE_CODE();
5564
5565    /* remove from list, if necessary */
5566    if (bs->node_name[0] != '\0') {
5567        QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
5568    }
5569    QTAILQ_REMOVE(&all_bdrv_states, bs, bs_list);
5570
5571    bdrv_close(bs);
5572
5573    qemu_mutex_destroy(&bs->reqs_lock);
5574
5575    g_free(bs);
5576}
5577
5578
5579/*
5580 * Replace @bs by newly created block node.
5581 *
5582 * @options is a QDict of options to pass to the block drivers, or NULL for an
5583 * empty set of options. The reference to the QDict belongs to the block layer
5584 * after the call (even on failure), so if the caller intends to reuse the
5585 * dictionary, it needs to use qobject_ref() before calling bdrv_open.
5586 *
5587 * The caller must make sure that @bs stays in the same AioContext, i.e.
5588 * @options must not refer to nodes in a different AioContext.
5589 */
5590BlockDriverState *bdrv_insert_node(BlockDriverState *bs, QDict *options,
5591                                   int flags, Error **errp)
5592{
5593    ERRP_GUARD();
5594    int ret;
5595    AioContext *ctx = bdrv_get_aio_context(bs);
5596    BlockDriverState *new_node_bs = NULL;
5597    const char *drvname, *node_name;
5598    BlockDriver *drv;
5599
5600    drvname = qdict_get_try_str(options, "driver");
5601    if (!drvname) {
5602        error_setg(errp, "driver is not specified");
5603        goto fail;
5604    }
5605
5606    drv = bdrv_find_format(drvname);
5607    if (!drv) {
5608        error_setg(errp, "Unknown driver: '%s'", drvname);
5609        goto fail;
5610    }
5611
5612    node_name = qdict_get_try_str(options, "node-name");
5613
5614    GLOBAL_STATE_CODE();
5615
5616    new_node_bs = bdrv_new_open_driver_opts(drv, node_name, options, flags,
5617                                            errp);
5618    assert(bdrv_get_aio_context(bs) == ctx);
5619
5620    options = NULL; /* bdrv_new_open_driver() eats options */
5621    if (!new_node_bs) {
5622        error_prepend(errp, "Could not create node: ");
5623        goto fail;
5624    }
5625
5626    /*
5627     * Make sure that @bs doesn't go away until we have successfully attached
5628     * all of its parents to @new_node_bs and undrained it again.
5629     */
5630    bdrv_ref(bs);
5631    bdrv_drained_begin(bs);
5632    bdrv_drained_begin(new_node_bs);
5633    bdrv_graph_wrlock();
5634    ret = bdrv_replace_node(bs, new_node_bs, errp);
5635    bdrv_graph_wrunlock();
5636    bdrv_drained_end(new_node_bs);
5637    bdrv_drained_end(bs);
5638    bdrv_unref(bs);
5639
5640    if (ret < 0) {
5641        error_prepend(errp, "Could not replace node: ");
5642        goto fail;
5643    }
5644
5645    return new_node_bs;
5646
5647fail:
5648    qobject_unref(options);
5649    bdrv_unref(new_node_bs);
5650    return NULL;
5651}
5652
5653/*
5654 * Run consistency checks on an image
5655 *
5656 * Returns 0 if the check could be completed (it doesn't mean that the image is
5657 * free of errors) or -errno when an internal error occurred. The results of the
5658 * check are stored in res.
5659 */
5660int coroutine_fn bdrv_co_check(BlockDriverState *bs,
5661                               BdrvCheckResult *res, BdrvCheckMode fix)
5662{
5663    IO_CODE();
5664    assert_bdrv_graph_readable();
5665    if (bs->drv == NULL) {
5666        return -ENOMEDIUM;
5667    }
5668    if (bs->drv->bdrv_co_check == NULL) {
5669        return -ENOTSUP;
5670    }
5671
5672    memset(res, 0, sizeof(*res));
5673    return bs->drv->bdrv_co_check(bs, res, fix);
5674}
5675
5676/*
5677 * Return values:
5678 * 0        - success
5679 * -EINVAL  - backing format specified, but no file
5680 * -ENOSPC  - can't update the backing file because no space is left in the
5681 *            image file header
5682 * -ENOTSUP - format driver doesn't support changing the backing file
5683 */
5684int coroutine_fn
5685bdrv_co_change_backing_file(BlockDriverState *bs, const char *backing_file,
5686                            const char *backing_fmt, bool require)
5687{
5688    BlockDriver *drv = bs->drv;
5689    int ret;
5690
5691    IO_CODE();
5692
5693    if (!drv) {
5694        return -ENOMEDIUM;
5695    }
5696
5697    /* Backing file format doesn't make sense without a backing file */
5698    if (backing_fmt && !backing_file) {
5699        return -EINVAL;
5700    }
5701
5702    if (require && backing_file && !backing_fmt) {
5703        return -EINVAL;
5704    }
5705
5706    if (drv->bdrv_co_change_backing_file != NULL) {
5707        ret = drv->bdrv_co_change_backing_file(bs, backing_file, backing_fmt);
5708    } else {
5709        ret = -ENOTSUP;
5710    }
5711
5712    if (ret == 0) {
5713        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
5714        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
5715        pstrcpy(bs->auto_backing_file, sizeof(bs->auto_backing_file),
5716                backing_file ?: "");
5717    }
5718    return ret;
5719}
5720
5721/*
5722 * Finds the first non-filter node above bs in the chain between
5723 * active and bs.  The returned node is either an immediate parent of
5724 * bs, or there are only filter nodes between the two.
5725 *
5726 * Returns NULL if bs is not found in active's image chain,
5727 * or if active == bs.
5728 *
5729 * Returns the bottommost base image if bs == NULL.
5730 */
5731BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
5732                                    BlockDriverState *bs)
5733{
5734
5735    GLOBAL_STATE_CODE();
5736
5737    bs = bdrv_skip_filters(bs);
5738    active = bdrv_skip_filters(active);
5739
5740    while (active) {
5741        BlockDriverState *next = bdrv_backing_chain_next(active);
5742        if (bs == next) {
5743            return active;
5744        }
5745        active = next;
5746    }
5747
5748    return NULL;
5749}
5750
5751/* Given a BDS, searches for the base layer. */
5752BlockDriverState *bdrv_find_base(BlockDriverState *bs)
5753{
5754    GLOBAL_STATE_CODE();
5755
5756    return bdrv_find_overlay(bs, NULL);
5757}
5758
5759/*
5760 * Return true if at least one of the COW (backing) and filter links
5761 * between @bs and @base is frozen. @errp is set if that's the case.
5762 * @base must be reachable from @bs, or NULL.
5763 */
5764static bool GRAPH_RDLOCK
5765bdrv_is_backing_chain_frozen(BlockDriverState *bs, BlockDriverState *base,
5766                             Error **errp)
5767{
5768    BlockDriverState *i;
5769    BdrvChild *child;
5770
5771    GLOBAL_STATE_CODE();
5772
5773    for (i = bs; i != base; i = child_bs(child)) {
5774        child = bdrv_filter_or_cow_child(i);
5775
5776        if (child && child->frozen) {
5777            error_setg(errp, "Cannot change '%s' link from '%s' to '%s'",
5778                       child->name, i->node_name, child->bs->node_name);
5779            return true;
5780        }
5781    }
5782
5783    return false;
5784}
5785
5786/*
5787 * Freeze all COW (backing) and filter links between @bs and @base.
5788 * If any of the links is already frozen the operation is aborted and
5789 * none of the links are modified.
5790 * @base must be reachable from @bs, or NULL.
5791 * Returns 0 on success. On failure returns < 0 and sets @errp.
5792 */
5793int bdrv_freeze_backing_chain(BlockDriverState *bs, BlockDriverState *base,
5794                              Error **errp)
5795{
5796    BlockDriverState *i;
5797    BdrvChild *child;
5798
5799    GLOBAL_STATE_CODE();
5800
5801    if (bdrv_is_backing_chain_frozen(bs, base, errp)) {
5802        return -EPERM;
5803    }
5804
5805    for (i = bs; i != base; i = child_bs(child)) {
5806        child = bdrv_filter_or_cow_child(i);
5807        if (child && child->bs->never_freeze) {
5808            error_setg(errp, "Cannot freeze '%s' link to '%s'",
5809                       child->name, child->bs->node_name);
5810            return -EPERM;
5811        }
5812    }
5813
5814    for (i = bs; i != base; i = child_bs(child)) {
5815        child = bdrv_filter_or_cow_child(i);
5816        if (child) {
5817            child->frozen = true;
5818        }
5819    }
5820
5821    return 0;
5822}
5823
5824/*
5825 * Unfreeze all COW (backing) and filter links between @bs and @base.
5826 * The caller must ensure that all links are frozen before using this
5827 * function.
5828 * @base must be reachable from @bs, or NULL.
5829 */
5830void bdrv_unfreeze_backing_chain(BlockDriverState *bs, BlockDriverState *base)
5831{
5832    BlockDriverState *i;
5833    BdrvChild *child;
5834
5835    GLOBAL_STATE_CODE();
5836
5837    for (i = bs; i != base; i = child_bs(child)) {
5838        child = bdrv_filter_or_cow_child(i);
5839        if (child) {
5840            assert(child->frozen);
5841            child->frozen = false;
5842        }
5843    }
5844}
5845
/*
 * Drops images above 'base' up to and including 'top', and sets the image
 * above 'top' to have base as its backing file.
 *
 * Requires that the overlay to 'top' is opened r/w, so that the backing file
 * information in 'bs' can be properly updated.
 *
 * E.g., this will convert the following chain:
 * bottom <- base <- intermediate <- top <- active
 *
 * to
 *
 * bottom <- base <- active
 *
 * It is allowed for bottom==base, in which case it converts:
 *
 * base <- intermediate <- top <- active
 *
 * to
 *
 * base <- active
 *
 * If backing_file_str is non-NULL, it will be used when modifying top's
 * overlay image metadata.  If it is NULL, base->filename is used after
 * calling bdrv_refresh_filename(base).
 *
 * backing_mask_protocol is passed unchanged to the update_filename
 * callback of each parent of 'top'.
 *
 * Error conditions:
 *  if active == top, that is considered an error
 *
 * Returns 0 on success.  Returns -EIO if 'top' or 'base' has no driver, if
 * 'base' is not in the chain of 'top', or if replacing the node reported an
 * error; otherwise the negative value returned by a failing update_filename
 * callback.  Errors are reported with error_report_err() rather than being
 * handed back to the caller.
 */
int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base,
                           const char *backing_file_str,
                           bool backing_mask_protocol)
{
    BlockDriverState *explicit_top = top;
    bool update_inherits_from;
    BdrvChild *c;
    Error *local_err = NULL;
    int ret = -EIO;
    g_autoptr(GSList) updated_children = NULL;
    GSList *p;

    GLOBAL_STATE_CODE();

    /* Keep 'top' alive until the end of this function */
    bdrv_ref(top);
    bdrv_drained_begin(base);
    bdrv_graph_wrlock();

    if (!top->drv || !base->drv) {
        goto exit_wrlock;
    }

    /* Make sure that base is in the backing chain of top */
    if (!bdrv_chain_contains(top, base)) {
        goto exit_wrlock;
    }

    /* If 'base' recursively inherits from 'top' then we should set
     * base->inherits_from to top->inherits_from after 'top' and all
     * other intermediate nodes have been dropped.
     * If 'top' is an implicit node (e.g. "commit_top") we should skip
     * it because no one inherits from it. We use explicit_top for that. */
    explicit_top = bdrv_skip_implicit_filters(explicit_top);
    update_inherits_from = bdrv_inherits_from_recursive(base, explicit_top);

    /* success - we can delete the intermediate states, and link top->base */
    if (!backing_file_str) {
        bdrv_refresh_filename(base);
        backing_file_str = base->filename;
    }

    /*
     * Remember the parents of 'top' before the replacement, so that their
     * update_filename callbacks can be invoked afterwards.
     */
    QLIST_FOREACH(c, &top->parents, next_parent) {
        updated_children = g_slist_prepend(updated_children, c);
    }

    /*
     * It seems correct to pass detach_subchain=true here, but it triggers
     * one more yet not fixed bug, when due to nested aio_poll loop we switch to
     * another drained section, which modify the graph (for example, removing
     * the child, which we keep in updated_children list). So, it's a TODO.
     *
     * Note, bug triggered if pass detach_subchain=true here and run
     * test-bdrv-drain. test_drop_intermediate_poll() test-case will crash.
     * That's a FIXME.
     */
    bdrv_replace_node_common(top, base, false, false, &local_err);
    bdrv_graph_wrunlock();

    /* The write lock is already released here, hence 'exit' and not 'exit_wrlock' */
    if (local_err) {
        error_report_err(local_err);
        goto exit;
    }

    for (p = updated_children; p; p = p->next) {
        c = p->data;

        if (c->klass->update_filename) {
            ret = c->klass->update_filename(c, base, backing_file_str,
                                            backing_mask_protocol,
                                            &local_err);
            if (ret < 0) {
                /*
                 * TODO: Actually, we want to rollback all previous iterations
                 * of this loop, and (which is almost impossible) previous
                 * bdrv_replace_node()...
                 *
                 * Note, that c->klass->update_filename may lead to permission
                 * update, so it's a bad idea to call it inside permission
                 * update transaction of bdrv_replace_node.
                 */
                error_report_err(local_err);
                goto exit;
            }
        }
    }

    if (update_inherits_from) {
        base->inherits_from = explicit_top->inherits_from;
    }

    ret = 0;
    goto exit;

exit_wrlock:
    /* Early failure paths still hold the graph write lock */
    bdrv_graph_wrunlock();
exit:
    bdrv_drained_end(base);
    bdrv_unref(top);
    return ret;
}
5975
5976/**
5977 * Implementation of BlockDriver.bdrv_co_get_allocated_file_size() that
5978 * sums the size of all data-bearing children.  (This excludes backing
5979 * children.)
5980 */
5981static int64_t coroutine_fn GRAPH_RDLOCK
5982bdrv_sum_allocated_file_size(BlockDriverState *bs)
5983{
5984    BdrvChild *child;
5985    int64_t child_size, sum = 0;
5986
5987    QLIST_FOREACH(child, &bs->children, next) {
5988        if (child->role & (BDRV_CHILD_DATA | BDRV_CHILD_METADATA |
5989                           BDRV_CHILD_FILTERED))
5990        {
5991            child_size = bdrv_co_get_allocated_file_size(child->bs);
5992            if (child_size < 0) {
5993                return child_size;
5994            }
5995            sum += child_size;
5996        }
5997    }
5998
5999    return sum;
6000}
6001
6002/**
6003 * Length of a allocated file in bytes. Sparse files are counted by actual
6004 * allocated space. Return < 0 if error or unknown.
6005 */
6006int64_t coroutine_fn bdrv_co_get_allocated_file_size(BlockDriverState *bs)
6007{
6008    BlockDriver *drv = bs->drv;
6009    IO_CODE();
6010    assert_bdrv_graph_readable();
6011
6012    if (!drv) {
6013        return -ENOMEDIUM;
6014    }
6015    if (drv->bdrv_co_get_allocated_file_size) {
6016        return drv->bdrv_co_get_allocated_file_size(bs);
6017    }
6018
6019    if (drv->protocol_name) {
6020        /*
6021         * Protocol drivers default to -ENOTSUP (most of their data is
6022         * not stored in any of their children (if they even have any),
6023         * so there is no generic way to figure it out).
6024         */
6025        return -ENOTSUP;
6026    } else if (drv->is_filter) {
6027        /* Filter drivers default to the size of their filtered child */
6028        return bdrv_co_get_allocated_file_size(bdrv_filter_bs(bs));
6029    } else {
6030        /* Other drivers default to summing their children's sizes */
6031        return bdrv_sum_allocated_file_size(bs);
6032    }
6033}
6034
6035/*
6036 * bdrv_measure:
6037 * @drv: Format driver
6038 * @opts: Creation options for new image
6039 * @in_bs: Existing image containing data for new image (may be NULL)
6040 * @errp: Error object
6041 * Returns: A #BlockMeasureInfo (free using qapi_free_BlockMeasureInfo())
6042 *          or NULL on error
6043 *
6044 * Calculate file size required to create a new image.
6045 *
6046 * If @in_bs is given then space for allocated clusters and zero clusters
6047 * from that image are included in the calculation.  If @opts contains a
6048 * backing file that is shared by @in_bs then backing clusters may be omitted
6049 * from the calculation.
6050 *
6051 * If @in_bs is NULL then the calculation includes no allocated clusters
6052 * unless a preallocation option is given in @opts.
6053 *
6054 * Note that @in_bs may use a different BlockDriver from @drv.
6055 *
6056 * If an error occurs the @errp pointer is set.
6057 */
6058BlockMeasureInfo *bdrv_measure(BlockDriver *drv, QemuOpts *opts,
6059                               BlockDriverState *in_bs, Error **errp)
6060{
6061    IO_CODE();
6062    if (!drv->bdrv_measure) {
6063        error_setg(errp, "Block driver '%s' does not support size measurement",
6064                   drv->format_name);
6065        return NULL;
6066    }
6067
6068    return drv->bdrv_measure(opts, in_bs, errp);
6069}
6070
6071/**
6072 * Return number of sectors on success, -errno on error.
6073 */
6074int64_t coroutine_fn bdrv_co_nb_sectors(BlockDriverState *bs)
6075{
6076    BlockDriver *drv = bs->drv;
6077    IO_CODE();
6078    assert_bdrv_graph_readable();
6079
6080    if (!drv)
6081        return -ENOMEDIUM;
6082
6083    if (bs->bl.has_variable_length) {
6084        int ret = bdrv_co_refresh_total_sectors(bs, bs->total_sectors);
6085        if (ret < 0) {
6086            return ret;
6087        }
6088    }
6089    return bs->total_sectors;
6090}
6091
6092/*
6093 * This wrapper is written by hand because this function is in the hot I/O path,
6094 * via blk_get_geometry.
6095 */
6096int64_t coroutine_mixed_fn bdrv_nb_sectors(BlockDriverState *bs)
6097{
6098    BlockDriver *drv = bs->drv;
6099    IO_CODE();
6100
6101    if (!drv)
6102        return -ENOMEDIUM;
6103
6104    if (bs->bl.has_variable_length) {
6105        int ret = bdrv_refresh_total_sectors(bs, bs->total_sectors);
6106        if (ret < 0) {
6107            return ret;
6108        }
6109    }
6110
6111    return bs->total_sectors;
6112}
6113
6114/**
6115 * Return length in bytes on success, -errno on error.
6116 * The length is always a multiple of BDRV_SECTOR_SIZE.
6117 */
6118int64_t coroutine_fn bdrv_co_getlength(BlockDriverState *bs)
6119{
6120    int64_t ret;
6121    IO_CODE();
6122    assert_bdrv_graph_readable();
6123
6124    ret = bdrv_co_nb_sectors(bs);
6125    if (ret < 0) {
6126        return ret;
6127    }
6128    if (ret > INT64_MAX / BDRV_SECTOR_SIZE) {
6129        return -EFBIG;
6130    }
6131    return ret * BDRV_SECTOR_SIZE;
6132}
6133
6134bool bdrv_is_sg(BlockDriverState *bs)
6135{
6136    IO_CODE();
6137    return bs->sg;
6138}
6139
6140/**
6141 * Return whether the given node supports compressed writes.
6142 */
6143bool bdrv_supports_compressed_writes(BlockDriverState *bs)
6144{
6145    BlockDriverState *filtered;
6146    IO_CODE();
6147
6148    if (!bs->drv || !block_driver_can_compress(bs->drv)) {
6149        return false;
6150    }
6151
6152    filtered = bdrv_filter_bs(bs);
6153    if (filtered) {
6154        /*
6155         * Filters can only forward compressed writes, so we have to
6156         * check the child.
6157         */
6158        return bdrv_supports_compressed_writes(filtered);
6159    }
6160
6161    return true;
6162}
6163
6164const char *bdrv_get_format_name(BlockDriverState *bs)
6165{
6166    IO_CODE();
6167    return bs->drv ? bs->drv->format_name : NULL;
6168}
6169
/*
 * qsort() comparator for arrays of C strings: @a and @b each point to a
 * string pointer, and the strings are ordered with strcmp().
 */
static int qsort_strcmp(const void *a, const void *b)
{
    const char *const *lhs = a;
    const char *const *rhs = b;

    return strcmp(*lhs, *rhs);
}
6174
6175void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
6176                         void *opaque, bool read_only)
6177{
6178    BlockDriver *drv;
6179    int count = 0;
6180    int i;
6181    const char **formats = NULL;
6182
6183    GLOBAL_STATE_CODE();
6184
6185    QLIST_FOREACH(drv, &bdrv_drivers, list) {
6186        if (drv->format_name) {
6187            bool found = false;
6188
6189            if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, read_only)) {
6190                continue;
6191            }
6192
6193            i = count;
6194            while (formats && i && !found) {
6195                found = !strcmp(formats[--i], drv->format_name);
6196            }
6197
6198            if (!found) {
6199                formats = g_renew(const char *, formats, count + 1);
6200                formats[count++] = drv->format_name;
6201            }
6202        }
6203    }
6204
6205    for (i = 0; i < (int)ARRAY_SIZE(block_driver_modules); i++) {
6206        const char *format_name = block_driver_modules[i].format_name;
6207
6208        if (format_name) {
6209            bool found = false;
6210            int j = count;
6211
6212            if (use_bdrv_whitelist &&
6213                !bdrv_format_is_whitelisted(format_name, read_only)) {
6214                continue;
6215            }
6216
6217            while (formats && j && !found) {
6218                found = !strcmp(formats[--j], format_name);
6219            }
6220
6221            if (!found) {
6222                formats = g_renew(const char *, formats, count + 1);
6223                formats[count++] = format_name;
6224            }
6225        }
6226    }
6227
6228    qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
6229
6230    for (i = 0; i < count; i++) {
6231        it(opaque, formats[i]);
6232    }
6233
6234    g_free(formats);
6235}
6236
6237/* This function is to find a node in the bs graph */
6238BlockDriverState *bdrv_find_node(const char *node_name)
6239{
6240    BlockDriverState *bs;
6241
6242    assert(node_name);
6243    GLOBAL_STATE_CODE();
6244
6245    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
6246        if (!strcmp(node_name, bs->node_name)) {
6247            return bs;
6248        }
6249    }
6250    return NULL;
6251}
6252
6253/* Put this QMP function here so it can access the static graph_bdrv_states. */
6254BlockDeviceInfoList *bdrv_named_nodes_list(bool flat,
6255                                           Error **errp)
6256{
6257    BlockDeviceInfoList *list;
6258    BlockDriverState *bs;
6259
6260    GLOBAL_STATE_CODE();
6261    GRAPH_RDLOCK_GUARD_MAINLOOP();
6262
6263    list = NULL;
6264    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
6265        BlockDeviceInfo *info = bdrv_block_device_info(NULL, bs, flat, errp);
6266        if (!info) {
6267            qapi_free_BlockDeviceInfoList(list);
6268            return NULL;
6269        }
6270        QAPI_LIST_PREPEND(list, info);
6271    }
6272
6273    return list;
6274}
6275
/*
 * Working state used while assembling an XDbgBlockGraph: the graph being
 * filled in, plus a table that hands out a numeric id per graph object.
 */
typedef struct XDbgBlockGraphConstructor {
    XDbgBlockGraph *graph;      /* graph under construction */
    GHashTable *graph_nodes;    /* object pointer -> id; ids start at 1 */
} XDbgBlockGraphConstructor;
6280
6281static XDbgBlockGraphConstructor *xdbg_graph_new(void)
6282{
6283    XDbgBlockGraphConstructor *gr = g_new(XDbgBlockGraphConstructor, 1);
6284
6285    gr->graph = g_new0(XDbgBlockGraph, 1);
6286    gr->graph_nodes = g_hash_table_new(NULL, NULL);
6287
6288    return gr;
6289}
6290
6291static XDbgBlockGraph *xdbg_graph_finalize(XDbgBlockGraphConstructor *gr)
6292{
6293    XDbgBlockGraph *graph = gr->graph;
6294
6295    g_hash_table_destroy(gr->graph_nodes);
6296    g_free(gr);
6297
6298    return graph;
6299}
6300
6301static uintptr_t xdbg_graph_node_num(XDbgBlockGraphConstructor *gr, void *node)
6302{
6303    uintptr_t ret = (uintptr_t)g_hash_table_lookup(gr->graph_nodes, node);
6304
6305    if (ret != 0) {
6306        return ret;
6307    }
6308
6309    /*
6310     * Start counting from 1, not 0, because 0 interferes with not-found (NULL)
6311     * answer of g_hash_table_lookup.
6312     */
6313    ret = g_hash_table_size(gr->graph_nodes) + 1;
6314    g_hash_table_insert(gr->graph_nodes, node, (void *)ret);
6315
6316    return ret;
6317}
6318
6319static void xdbg_graph_add_node(XDbgBlockGraphConstructor *gr, void *node,
6320                                XDbgBlockGraphNodeType type, const char *name)
6321{
6322    XDbgBlockGraphNode *n;
6323
6324    n = g_new0(XDbgBlockGraphNode, 1);
6325
6326    n->id = xdbg_graph_node_num(gr, node);
6327    n->type = type;
6328    n->name = g_strdup(name);
6329
6330    QAPI_LIST_PREPEND(gr->graph->nodes, n);
6331}
6332
6333static void xdbg_graph_add_edge(XDbgBlockGraphConstructor *gr, void *parent,
6334                                const BdrvChild *child)
6335{
6336    BlockPermission qapi_perm;
6337    XDbgBlockGraphEdge *edge;
6338    GLOBAL_STATE_CODE();
6339
6340    edge = g_new0(XDbgBlockGraphEdge, 1);
6341
6342    edge->parent = xdbg_graph_node_num(gr, parent);
6343    edge->child = xdbg_graph_node_num(gr, child->bs);
6344    edge->name = g_strdup(child->name);
6345
6346    for (qapi_perm = 0; qapi_perm < BLOCK_PERMISSION__MAX; qapi_perm++) {
6347        uint64_t flag = bdrv_qapi_perm_to_blk_perm(qapi_perm);
6348
6349        if (flag & child->perm) {
6350            QAPI_LIST_PREPEND(edge->perm, qapi_perm);
6351        }
6352        if (flag & child->shared_perm) {
6353            QAPI_LIST_PREPEND(edge->shared_perm, qapi_perm);
6354        }
6355    }
6356
6357    QAPI_LIST_PREPEND(gr->graph->edges, edge);
6358}
6359
6360
6361XDbgBlockGraph *bdrv_get_xdbg_block_graph(Error **errp)
6362{
6363    BlockBackend *blk;
6364    BlockJob *job;
6365    BlockDriverState *bs;
6366    BdrvChild *child;
6367    XDbgBlockGraphConstructor *gr = xdbg_graph_new();
6368
6369    GLOBAL_STATE_CODE();
6370
6371    for (blk = blk_all_next(NULL); blk; blk = blk_all_next(blk)) {
6372        char *allocated_name = NULL;
6373        const char *name = blk_name(blk);
6374
6375        if (!*name) {
6376            name = allocated_name = blk_get_attached_dev_id(blk);
6377        }
6378        xdbg_graph_add_node(gr, blk, XDBG_BLOCK_GRAPH_NODE_TYPE_BLOCK_BACKEND,
6379                           name);
6380        g_free(allocated_name);
6381        if (blk_root(blk)) {
6382            xdbg_graph_add_edge(gr, blk, blk_root(blk));
6383        }
6384    }
6385
6386    WITH_JOB_LOCK_GUARD() {
6387        for (job = block_job_next_locked(NULL); job;
6388             job = block_job_next_locked(job)) {
6389            GSList *el;
6390
6391            xdbg_graph_add_node(gr, job, XDBG_BLOCK_GRAPH_NODE_TYPE_BLOCK_JOB,
6392                                job->job.id);
6393            for (el = job->nodes; el; el = el->next) {
6394                xdbg_graph_add_edge(gr, job, (BdrvChild *)el->data);
6395            }
6396        }
6397    }
6398
6399    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
6400        xdbg_graph_add_node(gr, bs, XDBG_BLOCK_GRAPH_NODE_TYPE_BLOCK_DRIVER,
6401                           bs->node_name);
6402        QLIST_FOREACH(child, &bs->children, next) {
6403            xdbg_graph_add_edge(gr, bs, child);
6404        }
6405    }
6406
6407    return xdbg_graph_finalize(gr);
6408}
6409
6410BlockDriverState *bdrv_lookup_bs(const char *device,
6411                                 const char *node_name,
6412                                 Error **errp)
6413{
6414    BlockBackend *blk;
6415    BlockDriverState *bs;
6416
6417    GLOBAL_STATE_CODE();
6418
6419    if (device) {
6420        blk = blk_by_name(device);
6421
6422        if (blk) {
6423            bs = blk_bs(blk);
6424            if (!bs) {
6425                error_setg(errp, "Device '%s' has no medium", device);
6426            }
6427
6428            return bs;
6429        }
6430    }
6431
6432    if (node_name) {
6433        bs = bdrv_find_node(node_name);
6434
6435        if (bs) {
6436            return bs;
6437        }
6438    }
6439
6440    error_setg(errp, "Cannot find device=\'%s\' nor node-name=\'%s\'",
6441                     device ? device : "",
6442                     node_name ? node_name : "");
6443    return NULL;
6444}
6445
6446/* If 'base' is in the same chain as 'top', return true. Otherwise,
6447 * return false.  If either argument is NULL, return false. */
6448bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
6449{
6450
6451    GLOBAL_STATE_CODE();
6452
6453    while (top && top != base) {
6454        top = bdrv_filter_or_cow_bs(top);
6455    }
6456
6457    return top != NULL;
6458}
6459
6460BlockDriverState *bdrv_next_node(BlockDriverState *bs)
6461{
6462    GLOBAL_STATE_CODE();
6463    if (!bs) {
6464        return QTAILQ_FIRST(&graph_bdrv_states);
6465    }
6466    return QTAILQ_NEXT(bs, node_list);
6467}
6468
6469BlockDriverState *bdrv_next_all_states(BlockDriverState *bs)
6470{
6471    GLOBAL_STATE_CODE();
6472    if (!bs) {
6473        return QTAILQ_FIRST(&all_bdrv_states);
6474    }
6475    return QTAILQ_NEXT(bs, bs_list);
6476}
6477
6478const char *bdrv_get_node_name(const BlockDriverState *bs)
6479{
6480    IO_CODE();
6481    return bs->node_name;
6482}
6483
6484const char *bdrv_get_parent_name(const BlockDriverState *bs)
6485{
6486    BdrvChild *c;
6487    const char *name;
6488    IO_CODE();
6489
6490    /* If multiple parents have a name, just pick the first one. */
6491    QLIST_FOREACH(c, &bs->parents, next_parent) {
6492        if (c->klass->get_name) {
6493            name = c->klass->get_name(c);
6494            if (name && *name) {
6495                return name;
6496            }
6497        }
6498    }
6499
6500    return NULL;
6501}
6502
6503/* TODO check what callers really want: bs->node_name or blk_name() */
6504const char *bdrv_get_device_name(const BlockDriverState *bs)
6505{
6506    IO_CODE();
6507    return bdrv_get_parent_name(bs) ?: "";
6508}
6509
6510/* This can be used to identify nodes that might not have a device
6511 * name associated. Since node and device names live in the same
6512 * namespace, the result is unambiguous. The exception is if both are
6513 * absent, then this returns an empty (non-null) string. */
6514const char *bdrv_get_device_or_node_name(const BlockDriverState *bs)
6515{
6516    IO_CODE();
6517    return bdrv_get_parent_name(bs) ?: bs->node_name;
6518}
6519
6520int bdrv_get_flags(BlockDriverState *bs)
6521{
6522    IO_CODE();
6523    return bs->open_flags;
6524}
6525
6526int bdrv_has_zero_init_1(BlockDriverState *bs)
6527{
6528    GLOBAL_STATE_CODE();
6529    return 1;
6530}
6531
6532int coroutine_mixed_fn bdrv_has_zero_init(BlockDriverState *bs)
6533{
6534    BlockDriverState *filtered;
6535    GLOBAL_STATE_CODE();
6536
6537    if (!bs->drv) {
6538        return 0;
6539    }
6540
6541    /* If BS is a copy on write image, it is initialized to
6542       the contents of the base image, which may not be zeroes.  */
6543    if (bdrv_cow_child(bs)) {
6544        return 0;
6545    }
6546    if (bs->drv->bdrv_has_zero_init) {
6547        return bs->drv->bdrv_has_zero_init(bs);
6548    }
6549
6550    filtered = bdrv_filter_bs(bs);
6551    if (filtered) {
6552        return bdrv_has_zero_init(filtered);
6553    }
6554
6555    /* safe default */
6556    return 0;
6557}
6558
6559bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
6560{
6561    IO_CODE();
6562    if (!(bs->open_flags & BDRV_O_UNMAP)) {
6563        return false;
6564    }
6565
6566    return bs->supported_zero_flags & BDRV_REQ_MAY_UNMAP;
6567}
6568
6569void bdrv_get_backing_filename(BlockDriverState *bs,
6570                               char *filename, int filename_size)
6571{
6572    IO_CODE();
6573    pstrcpy(filename, filename_size, bs->backing_file);
6574}
6575
6576int coroutine_fn bdrv_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
6577{
6578    int ret;
6579    BlockDriver *drv = bs->drv;
6580    IO_CODE();
6581    assert_bdrv_graph_readable();
6582
6583    /* if bs->drv == NULL, bs is closed, so there's nothing to do here */
6584    if (!drv) {
6585        return -ENOMEDIUM;
6586    }
6587    if (!drv->bdrv_co_get_info) {
6588        BlockDriverState *filtered = bdrv_filter_bs(bs);
6589        if (filtered) {
6590            return bdrv_co_get_info(filtered, bdi);
6591        }
6592        return -ENOTSUP;
6593    }
6594    memset(bdi, 0, sizeof(*bdi));
6595    ret = drv->bdrv_co_get_info(bs, bdi);
6596    if (bdi->subcluster_size == 0) {
6597        /*
6598         * If the driver left this unset, subclusters are not supported.
6599         * Then it is safe to treat each cluster as having only one subcluster.
6600         */
6601        bdi->subcluster_size = bdi->cluster_size;
6602    }
6603    if (ret < 0) {
6604        return ret;
6605    }
6606
6607    if (bdi->cluster_size > BDRV_MAX_ALIGNMENT) {
6608        return -EINVAL;
6609    }
6610
6611    return 0;
6612}
6613
6614ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs,
6615                                          Error **errp)
6616{
6617    BlockDriver *drv = bs->drv;
6618    IO_CODE();
6619    if (drv && drv->bdrv_get_specific_info) {
6620        return drv->bdrv_get_specific_info(bs, errp);
6621    }
6622    return NULL;
6623}
6624
6625BlockStatsSpecific *bdrv_get_specific_stats(BlockDriverState *bs)
6626{
6627    BlockDriver *drv = bs->drv;
6628    IO_CODE();
6629    if (!drv || !drv->bdrv_get_specific_stats) {
6630        return NULL;
6631    }
6632    return drv->bdrv_get_specific_stats(bs);
6633}
6634
6635void coroutine_fn bdrv_co_debug_event(BlockDriverState *bs, BlkdebugEvent event)
6636{
6637    IO_CODE();
6638    assert_bdrv_graph_readable();
6639
6640    if (!bs || !bs->drv || !bs->drv->bdrv_co_debug_event) {
6641        return;
6642    }
6643
6644    bs->drv->bdrv_co_debug_event(bs, event);
6645}
6646
6647static BlockDriverState * GRAPH_RDLOCK
6648bdrv_find_debug_node(BlockDriverState *bs)
6649{
6650    GLOBAL_STATE_CODE();
6651    while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
6652        bs = bdrv_primary_bs(bs);
6653    }
6654
6655    if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
6656        assert(bs->drv->bdrv_debug_remove_breakpoint);
6657        return bs;
6658    }
6659
6660    return NULL;
6661}
6662
6663int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
6664                          const char *tag)
6665{
6666    GLOBAL_STATE_CODE();
6667    GRAPH_RDLOCK_GUARD_MAINLOOP();
6668
6669    bs = bdrv_find_debug_node(bs);
6670    if (bs) {
6671        return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
6672    }
6673
6674    return -ENOTSUP;
6675}
6676
6677int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
6678{
6679    GLOBAL_STATE_CODE();
6680    GRAPH_RDLOCK_GUARD_MAINLOOP();
6681
6682    bs = bdrv_find_debug_node(bs);
6683    if (bs) {
6684        return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
6685    }
6686
6687    return -ENOTSUP;
6688}
6689
6690int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
6691{
6692    GLOBAL_STATE_CODE();
6693    GRAPH_RDLOCK_GUARD_MAINLOOP();
6694
6695    while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
6696        bs = bdrv_primary_bs(bs);
6697    }
6698
6699    if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
6700        return bs->drv->bdrv_debug_resume(bs, tag);
6701    }
6702
6703    return -ENOTSUP;
6704}
6705
6706bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
6707{
6708    GLOBAL_STATE_CODE();
6709    GRAPH_RDLOCK_GUARD_MAINLOOP();
6710
6711    while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
6712        bs = bdrv_primary_bs(bs);
6713    }
6714
6715    if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
6716        return bs->drv->bdrv_debug_is_suspended(bs, tag);
6717    }
6718
6719    return false;
6720}
6721
6722/* backing_file can either be relative, or absolute, or a protocol.  If it is
6723 * relative, it must be relative to the chain.  So, passing in bs->filename
6724 * from a BDS as backing_file should not be done, as that may be relative to
6725 * the CWD rather than the chain. */
6726BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
6727        const char *backing_file)
6728{
6729    char *filename_full = NULL;
6730    char *backing_file_full = NULL;
6731    char *filename_tmp = NULL;
6732    int is_protocol = 0;
6733    bool filenames_refreshed = false;
6734    BlockDriverState *curr_bs = NULL;
6735    BlockDriverState *retval = NULL;
6736    BlockDriverState *bs_below;
6737
6738    GLOBAL_STATE_CODE();
6739    GRAPH_RDLOCK_GUARD_MAINLOOP();
6740
6741    if (!bs || !bs->drv || !backing_file) {
6742        return NULL;
6743    }
6744
6745    filename_full     = g_malloc(PATH_MAX);
6746    backing_file_full = g_malloc(PATH_MAX);
6747
6748    is_protocol = path_has_protocol(backing_file);
6749
6750    /*
6751     * Being largely a legacy function, skip any filters here
6752     * (because filters do not have normal filenames, so they cannot
6753     * match anyway; and allowing json:{} filenames is a bit out of
6754     * scope).
6755     */
6756    for (curr_bs = bdrv_skip_filters(bs);
6757         bdrv_cow_child(curr_bs) != NULL;
6758         curr_bs = bs_below)
6759    {
6760        bs_below = bdrv_backing_chain_next(curr_bs);
6761
6762        if (bdrv_backing_overridden(curr_bs)) {
6763            /*
6764             * If the backing file was overridden, we can only compare
6765             * directly against the backing node's filename.
6766             */
6767
6768            if (!filenames_refreshed) {
6769                /*
6770                 * This will automatically refresh all of the
6771                 * filenames in the rest of the backing chain, so we
6772                 * only need to do this once.
6773                 */
6774                bdrv_refresh_filename(bs_below);
6775                filenames_refreshed = true;
6776            }
6777
6778            if (strcmp(backing_file, bs_below->filename) == 0) {
6779                retval = bs_below;
6780                break;
6781            }
6782        } else if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
6783            /*
6784             * If either of the filename paths is actually a protocol, then
6785             * compare unmodified paths; otherwise make paths relative.
6786             */
6787            char *backing_file_full_ret;
6788
6789            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
6790                retval = bs_below;
6791                break;
6792            }
6793            /* Also check against the full backing filename for the image */
6794            backing_file_full_ret = bdrv_get_full_backing_filename(curr_bs,
6795                                                                   NULL);
6796            if (backing_file_full_ret) {
6797                bool equal = strcmp(backing_file, backing_file_full_ret) == 0;
6798                g_free(backing_file_full_ret);
6799                if (equal) {
6800                    retval = bs_below;
6801                    break;
6802                }
6803            }
6804        } else {
6805            /* If not an absolute filename path, make it relative to the current
6806             * image's filename path */
6807            filename_tmp = bdrv_make_absolute_filename(curr_bs, backing_file,
6808                                                       NULL);
6809            /* We are going to compare canonicalized absolute pathnames */
6810            if (!filename_tmp || !realpath(filename_tmp, filename_full)) {
6811                g_free(filename_tmp);
6812                continue;
6813            }
6814            g_free(filename_tmp);
6815
6816            /* We need to make sure the backing filename we are comparing against
6817             * is relative to the current image filename (or absolute) */
6818            filename_tmp = bdrv_get_full_backing_filename(curr_bs, NULL);
6819            if (!filename_tmp || !realpath(filename_tmp, backing_file_full)) {
6820                g_free(filename_tmp);
6821                continue;
6822            }
6823            g_free(filename_tmp);
6824
6825            if (strcmp(backing_file_full, filename_full) == 0) {
6826                retval = bs_below;
6827                break;
6828            }
6829        }
6830    }
6831
6832    g_free(filename_full);
6833    g_free(backing_file_full);
6834    return retval;
6835}
6836
6837void bdrv_init(void)
6838{
6839#ifdef CONFIG_BDRV_WHITELIST_TOOLS
6840    use_bdrv_whitelist = 1;
6841#endif
6842    module_call_init(MODULE_INIT_BLOCK);
6843}
6844
6845void bdrv_init_with_whitelist(void)
6846{
6847    use_bdrv_whitelist = 1;
6848    bdrv_init();
6849}
6850
6851bool bdrv_is_inactive(BlockDriverState *bs) {
6852    return bs->open_flags & BDRV_O_INACTIVE;
6853}
6854
6855int bdrv_activate(BlockDriverState *bs, Error **errp)
6856{
6857    BdrvChild *child, *parent;
6858    Error *local_err = NULL;
6859    int ret;
6860    BdrvDirtyBitmap *bm;
6861
6862    GLOBAL_STATE_CODE();
6863    GRAPH_RDLOCK_GUARD_MAINLOOP();
6864
6865    if (!bs->drv)  {
6866        return -ENOMEDIUM;
6867    }
6868
6869    QLIST_FOREACH(child, &bs->children, next) {
6870        bdrv_activate(child->bs, &local_err);
6871        if (local_err) {
6872            error_propagate(errp, local_err);
6873            return -EINVAL;
6874        }
6875    }
6876
6877    /*
6878     * Update permissions, they may differ for inactive nodes.
6879     *
6880     * Note that the required permissions of inactive images are always a
6881     * subset of the permissions required after activating the image. This
6882     * allows us to just get the permissions upfront without restricting
6883     * bdrv_co_invalidate_cache().
6884     *
6885     * It also means that in error cases, we don't have to try and revert to
6886     * the old permissions (which is an operation that could fail, too). We can
6887     * just keep the extended permissions for the next time that an activation
6888     * of the image is tried.
6889     */
6890    if (bs->open_flags & BDRV_O_INACTIVE) {
6891        bs->open_flags &= ~BDRV_O_INACTIVE;
6892        ret = bdrv_refresh_perms(bs, NULL, errp);
6893        if (ret < 0) {
6894            bs->open_flags |= BDRV_O_INACTIVE;
6895            return ret;
6896        }
6897
6898        ret = bdrv_invalidate_cache(bs, errp);
6899        if (ret < 0) {
6900            bs->open_flags |= BDRV_O_INACTIVE;
6901            return ret;
6902        }
6903
6904        FOR_EACH_DIRTY_BITMAP(bs, bm) {
6905            bdrv_dirty_bitmap_skip_store(bm, false);
6906        }
6907
6908        ret = bdrv_refresh_total_sectors(bs, bs->total_sectors);
6909        if (ret < 0) {
6910            bs->open_flags |= BDRV_O_INACTIVE;
6911            error_setg_errno(errp, -ret, "Could not refresh total sector count");
6912            return ret;
6913        }
6914    }
6915
6916    QLIST_FOREACH(parent, &bs->parents, next_parent) {
6917        if (parent->klass->activate) {
6918            parent->klass->activate(parent, &local_err);
6919            if (local_err) {
6920                bs->open_flags |= BDRV_O_INACTIVE;
6921                error_propagate(errp, local_err);
6922                return -EINVAL;
6923            }
6924        }
6925    }
6926
6927    return 0;
6928}
6929
6930int coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, Error **errp)
6931{
6932    Error *local_err = NULL;
6933    IO_CODE();
6934
6935    assert(!(bs->open_flags & BDRV_O_INACTIVE));
6936    assert_bdrv_graph_readable();
6937
6938    if (bs->drv->bdrv_co_invalidate_cache) {
6939        bs->drv->bdrv_co_invalidate_cache(bs, &local_err);
6940        if (local_err) {
6941            error_propagate(errp, local_err);
6942            return -EINVAL;
6943        }
6944    }
6945
6946    return 0;
6947}
6948
6949void bdrv_activate_all(Error **errp)
6950{
6951    BlockDriverState *bs;
6952    BdrvNextIterator it;
6953
6954    GLOBAL_STATE_CODE();
6955    GRAPH_RDLOCK_GUARD_MAINLOOP();
6956
6957    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
6958        int ret;
6959
6960        ret = bdrv_activate(bs, errp);
6961        if (ret < 0) {
6962            bdrv_next_cleanup(&it);
6963            return;
6964        }
6965    }
6966}
6967
6968static bool GRAPH_RDLOCK
6969bdrv_has_bds_parent(BlockDriverState *bs, bool only_active)
6970{
6971    BdrvChild *parent;
6972    GLOBAL_STATE_CODE();
6973
6974    QLIST_FOREACH(parent, &bs->parents, next_parent) {
6975        if (parent->klass->parent_is_bds) {
6976            BlockDriverState *parent_bs = parent->opaque;
6977            if (!only_active || !(parent_bs->open_flags & BDRV_O_INACTIVE)) {
6978                return true;
6979            }
6980        }
6981    }
6982
6983    return false;
6984}
6985
6986static int GRAPH_RDLOCK
6987bdrv_inactivate_recurse(BlockDriverState *bs, bool top_level)
6988{
6989    BdrvChild *child, *parent;
6990    int ret;
6991    uint64_t cumulative_perms, cumulative_shared_perms;
6992
6993    GLOBAL_STATE_CODE();
6994
6995    assert(bs->quiesce_counter > 0);
6996
6997    if (!bs->drv) {
6998        return -ENOMEDIUM;
6999    }
7000
7001    /* Make sure that we don't inactivate a child before its parent.
7002     * It will be covered by recursion from the yet active parent. */
7003    if (bdrv_has_bds_parent(bs, true)) {
7004        return 0;
7005    }
7006
7007    /*
7008     * Inactivating an already inactive node on user request is harmless, but if
7009     * a child is already inactive before its parent, that's bad.
7010     */
7011    if (bs->open_flags & BDRV_O_INACTIVE) {
7012        assert(top_level);
7013        return 0;
7014    }
7015
7016    /* Inactivate this node */
7017    if (bs->drv->bdrv_inactivate) {
7018        ret = bs->drv->bdrv_inactivate(bs);
7019        if (ret < 0) {
7020            return ret;
7021        }
7022    }
7023
7024    QLIST_FOREACH(parent, &bs->parents, next_parent) {
7025        if (parent->klass->inactivate) {
7026            ret = parent->klass->inactivate(parent);
7027            if (ret < 0) {
7028                return ret;
7029            }
7030        }
7031    }
7032
7033    bdrv_get_cumulative_perm(bs, &cumulative_perms,
7034                             &cumulative_shared_perms);
7035    if (cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) {
7036        /* Our inactive parents still need write access. Inactivation failed. */
7037        return -EPERM;
7038    }
7039
7040    bs->open_flags |= BDRV_O_INACTIVE;
7041
7042    /*
7043     * Update permissions, they may differ for inactive nodes.
7044     * We only tried to loosen restrictions, so errors are not fatal, ignore
7045     * them.
7046     */
7047    bdrv_refresh_perms(bs, NULL, NULL);
7048
7049    /* Recursively inactivate children */
7050    QLIST_FOREACH(child, &bs->children, next) {
7051        ret = bdrv_inactivate_recurse(child->bs, false);
7052        if (ret < 0) {
7053            return ret;
7054        }
7055    }
7056
7057    return 0;
7058}
7059
7060/* All block nodes must be drained. */
7061int bdrv_inactivate(BlockDriverState *bs, Error **errp)
7062{
7063    int ret;
7064
7065    GLOBAL_STATE_CODE();
7066
7067    if (bdrv_has_bds_parent(bs, true)) {
7068        error_setg(errp, "Node has active parent node");
7069        return -EPERM;
7070    }
7071
7072    ret = bdrv_inactivate_recurse(bs, true);
7073    if (ret < 0) {
7074        error_setg_errno(errp, -ret, "Failed to inactivate node");
7075        return ret;
7076    }
7077
7078    return 0;
7079}
7080
7081int bdrv_inactivate_all(void)
7082{
7083    BlockDriverState *bs = NULL;
7084    BdrvNextIterator it;
7085    int ret = 0;
7086
7087    GLOBAL_STATE_CODE();
7088
7089    bdrv_drain_all_begin();
7090    bdrv_graph_rdlock_main_loop();
7091
7092    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
7093        /* Nodes with BDS parents are covered by recursion from the last
7094         * parent that gets inactivated. Don't inactivate them a second
7095         * time if that has already happened. */
7096        if (bdrv_has_bds_parent(bs, false)) {
7097            continue;
7098        }
7099        ret = bdrv_inactivate_recurse(bs, true);
7100        if (ret < 0) {
7101            bdrv_next_cleanup(&it);
7102            break;
7103        }
7104    }
7105
7106    bdrv_graph_rdunlock_main_loop();
7107    bdrv_drain_all_end();
7108
7109    return ret;
7110}
7111
7112/**************************************************************/
7113/* removable device support */
7114
7115/**
7116 * Return TRUE if the media is present
7117 */
7118bool coroutine_fn bdrv_co_is_inserted(BlockDriverState *bs)
7119{
7120    BlockDriver *drv = bs->drv;
7121    BdrvChild *child;
7122    IO_CODE();
7123    assert_bdrv_graph_readable();
7124
7125    if (!drv) {
7126        return false;
7127    }
7128    if (drv->bdrv_co_is_inserted) {
7129        return drv->bdrv_co_is_inserted(bs);
7130    }
7131    QLIST_FOREACH(child, &bs->children, next) {
7132        if (!bdrv_co_is_inserted(child->bs)) {
7133            return false;
7134        }
7135    }
7136    return true;
7137}
7138
7139/**
7140 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
7141 */
7142void coroutine_fn bdrv_co_eject(BlockDriverState *bs, bool eject_flag)
7143{
7144    BlockDriver *drv = bs->drv;
7145    IO_CODE();
7146    assert_bdrv_graph_readable();
7147
7148    if (drv && drv->bdrv_co_eject) {
7149        drv->bdrv_co_eject(bs, eject_flag);
7150    }
7151}
7152
7153/**
7154 * Lock or unlock the media (if it is locked, the user won't be able
7155 * to eject it manually).
7156 */
7157void coroutine_fn bdrv_co_lock_medium(BlockDriverState *bs, bool locked)
7158{
7159    BlockDriver *drv = bs->drv;
7160    IO_CODE();
7161    assert_bdrv_graph_readable();
7162    trace_bdrv_lock_medium(bs, locked);
7163
7164    if (drv && drv->bdrv_co_lock_medium) {
7165        drv->bdrv_co_lock_medium(bs, locked);
7166    }
7167}
7168
7169/* Get a reference to bs */
7170void bdrv_ref(BlockDriverState *bs)
7171{
7172    GLOBAL_STATE_CODE();
7173    bs->refcnt++;
7174}
7175
7176/* Release a previously grabbed reference to bs.
7177 * If after releasing, reference count is zero, the BlockDriverState is
7178 * deleted. */
7179void bdrv_unref(BlockDriverState *bs)
7180{
7181    GLOBAL_STATE_CODE();
7182    if (!bs) {
7183        return;
7184    }
7185    assert(bs->refcnt > 0);
7186    if (--bs->refcnt == 0) {
7187        bdrv_delete(bs);
7188    }
7189}
7190
7191static void bdrv_schedule_unref_bh(void *opaque)
7192{
7193    BlockDriverState *bs = opaque;
7194
7195    bdrv_unref(bs);
7196}
7197
7198/*
7199 * Release a BlockDriverState reference while holding the graph write lock.
7200 *
7201 * Calling bdrv_unref() directly is forbidden while holding the graph lock
7202 * because bdrv_close() both involves polling and taking the graph lock
7203 * internally. bdrv_schedule_unref() instead delays decreasing the refcount and
7204 * possibly closing @bs until the graph lock is released.
7205 */
7206void bdrv_schedule_unref(BlockDriverState *bs)
7207{
7208    if (!bs) {
7209        return;
7210    }
7211    aio_bh_schedule_oneshot(qemu_get_aio_context(), bdrv_schedule_unref_bh, bs);
7212}
7213
7214struct BdrvOpBlocker {
7215    Error *reason;
7216    QLIST_ENTRY(BdrvOpBlocker) list;
7217};
7218
7219bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
7220{
7221    BdrvOpBlocker *blocker;
7222    GLOBAL_STATE_CODE();
7223
7224    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
7225    if (!QLIST_EMPTY(&bs->op_blockers[op])) {
7226        blocker = QLIST_FIRST(&bs->op_blockers[op]);
7227        error_propagate_prepend(errp, error_copy(blocker->reason),
7228                                "Node '%s' is busy: ",
7229                                bdrv_get_device_or_node_name(bs));
7230        return true;
7231    }
7232    return false;
7233}
7234
7235void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
7236{
7237    BdrvOpBlocker *blocker;
7238    GLOBAL_STATE_CODE();
7239    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
7240
7241    blocker = g_new0(BdrvOpBlocker, 1);
7242    blocker->reason = reason;
7243    QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
7244}
7245
7246void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
7247{
7248    BdrvOpBlocker *blocker, *next;
7249    GLOBAL_STATE_CODE();
7250    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
7251    QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) {
7252        if (blocker->reason == reason) {
7253            QLIST_REMOVE(blocker, list);
7254            g_free(blocker);
7255        }
7256    }
7257}
7258
7259void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
7260{
7261    int i;
7262    GLOBAL_STATE_CODE();
7263    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
7264        bdrv_op_block(bs, i, reason);
7265    }
7266}
7267
7268void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
7269{
7270    int i;
7271    GLOBAL_STATE_CODE();
7272    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
7273        bdrv_op_unblock(bs, i, reason);
7274    }
7275}
7276
7277bool bdrv_op_blocker_is_empty(BlockDriverState *bs)
7278{
7279    int i;
7280    GLOBAL_STATE_CODE();
7281    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
7282        if (!QLIST_EMPTY(&bs->op_blockers[i])) {
7283            return false;
7284        }
7285    }
7286    return true;
7287}
7288
7289void bdrv_img_create(const char *filename, const char *fmt,
7290                     const char *base_filename, const char *base_fmt,
7291                     char *options, uint64_t img_size, int flags, bool quiet,
7292                     Error **errp)
7293{
7294    QemuOptsList *create_opts = NULL;
7295    QemuOpts *opts = NULL;
7296    const char *backing_fmt, *backing_file;
7297    int64_t size;
7298    BlockDriver *drv, *proto_drv;
7299    Error *local_err = NULL;
7300    int ret = 0;
7301
7302    GLOBAL_STATE_CODE();
7303
7304    /* Find driver and parse its options */
7305    drv = bdrv_find_format(fmt);
7306    if (!drv) {
7307        error_setg(errp, "Unknown file format '%s'", fmt);
7308        return;
7309    }
7310
7311    proto_drv = bdrv_find_protocol(filename, true, errp);
7312    if (!proto_drv) {
7313        return;
7314    }
7315
7316    if (!drv->create_opts) {
7317        error_setg(errp, "Format driver '%s' does not support image creation",
7318                   drv->format_name);
7319        return;
7320    }
7321
7322    if (!proto_drv->create_opts) {
7323        error_setg(errp, "Protocol driver '%s' does not support image creation",
7324                   proto_drv->format_name);
7325        return;
7326    }
7327
7328    /* Create parameter list */
7329    create_opts = qemu_opts_append(create_opts, drv->create_opts);
7330    create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
7331
7332    opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
7333
7334    /* Parse -o options */
7335    if (options) {
7336        if (!qemu_opts_do_parse(opts, options, NULL, errp)) {
7337            goto out;
7338        }
7339    }
7340
7341    if (!qemu_opt_get(opts, BLOCK_OPT_SIZE)) {
7342        qemu_opt_set_number(opts, BLOCK_OPT_SIZE, img_size, &error_abort);
7343    } else if (img_size != UINT64_C(-1)) {
7344        error_setg(errp, "The image size must be specified only once");
7345        goto out;
7346    }
7347
7348    if (base_filename) {
7349        if (!qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, base_filename,
7350                          NULL)) {
7351            error_setg(errp, "Backing file not supported for file format '%s'",
7352                       fmt);
7353            goto out;
7354        }
7355    }
7356
7357    if (base_fmt) {
7358        if (!qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, base_fmt, NULL)) {
7359            error_setg(errp, "Backing file format not supported for file "
7360                             "format '%s'", fmt);
7361            goto out;
7362        }
7363    }
7364
7365    backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
7366    if (backing_file) {
7367        if (!strcmp(filename, backing_file)) {
7368            error_setg(errp, "Error: Trying to create an image with the "
7369                             "same filename as the backing file");
7370            goto out;
7371        }
7372        if (backing_file[0] == '\0') {
7373            error_setg(errp, "Expected backing file name, got empty string");
7374            goto out;
7375        }
7376    }
7377
7378    backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
7379
7380    /* The size for the image must always be specified, unless we have a backing
7381     * file and we have not been forbidden from opening it. */
7382    size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, img_size);
7383    if (backing_file && !(flags & BDRV_O_NO_BACKING)) {
7384        BlockDriverState *bs;
7385        char *full_backing;
7386        int back_flags;
7387        QDict *backing_options = NULL;
7388
7389        full_backing =
7390            bdrv_get_full_backing_filename_from_filename(filename, backing_file,
7391                                                         &local_err);
7392        if (local_err) {
7393            goto out;
7394        }
7395        assert(full_backing);
7396
7397        /*
7398         * No need to do I/O here, which allows us to open encrypted
7399         * backing images without needing the secret
7400         */
7401        back_flags = flags;
7402        back_flags &= ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
7403        back_flags |= BDRV_O_NO_IO;
7404
7405        backing_options = qdict_new();
7406        if (backing_fmt) {
7407            qdict_put_str(backing_options, "driver", backing_fmt);
7408        }
7409        qdict_put_bool(backing_options, BDRV_OPT_FORCE_SHARE, true);
7410
7411        bs = bdrv_open(full_backing, NULL, backing_options, back_flags,
7412                       &local_err);
7413        g_free(full_backing);
7414        if (!bs) {
7415            error_append_hint(&local_err, "Could not open backing image.\n");
7416            goto out;
7417        } else {
7418            if (!backing_fmt) {
7419                error_setg(&local_err,
7420                           "Backing file specified without backing format");
7421                error_append_hint(&local_err, "Detected format of %s.\n",
7422                                  bs->drv->format_name);
7423                goto out;
7424            }
7425            if (size == -1) {
7426                /* Opened BS, have no size */
7427                size = bdrv_getlength(bs);
7428                if (size < 0) {
7429                    error_setg_errno(errp, -size, "Could not get size of '%s'",
7430                                     backing_file);
7431                    bdrv_unref(bs);
7432                    goto out;
7433                }
7434                qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size, &error_abort);
7435            }
7436            bdrv_unref(bs);
7437        }
7438        /* (backing_file && !(flags & BDRV_O_NO_BACKING)) */
7439    } else if (backing_file && !backing_fmt) {
7440        error_setg(&local_err,
7441                   "Backing file specified without backing format");
7442        goto out;
7443    }
7444
7445    /* Parameter 'size' is not needed for detached LUKS header */
7446    if (size == -1 &&
7447        !(!strcmp(fmt, "luks") &&
7448          qemu_opt_get_bool(opts, "detached-header", false))) {
7449        error_setg(errp, "Image creation needs a size parameter");
7450        goto out;
7451    }
7452
7453    if (!quiet) {
7454        printf("Formatting '%s', fmt=%s ", filename, fmt);
7455        qemu_opts_print(opts, " ");
7456        puts("");
7457        fflush(stdout);
7458    }
7459
7460    ret = bdrv_create(drv, filename, opts, &local_err);
7461
7462    if (ret == -EFBIG) {
7463        /* This is generally a better message than whatever the driver would
7464         * deliver (especially because of the cluster_size_hint), since that
7465         * is most probably not much different from "image too large". */
7466        const char *cluster_size_hint = "";
7467        if (qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 0)) {
7468            cluster_size_hint = " (try using a larger cluster size)";
7469        }
7470        error_setg(errp, "The image size is too large for file format '%s'"
7471                   "%s", fmt, cluster_size_hint);
7472        error_free(local_err);
7473        local_err = NULL;
7474    }
7475
7476out:
7477    qemu_opts_del(opts);
7478    qemu_opts_free(create_opts);
7479    error_propagate(errp, local_err);
7480}
7481
7482AioContext *bdrv_get_aio_context(BlockDriverState *bs)
7483{
7484    IO_CODE();
7485    return bs ? bs->aio_context : qemu_get_aio_context();
7486}
7487
7488AioContext *coroutine_fn bdrv_co_enter(BlockDriverState *bs)
7489{
7490    Coroutine *self = qemu_coroutine_self();
7491    AioContext *old_ctx = qemu_coroutine_get_aio_context(self);
7492    AioContext *new_ctx;
7493    IO_CODE();
7494
7495    /*
7496     * Increase bs->in_flight to ensure that this operation is completed before
7497     * moving the node to a different AioContext. Read new_ctx only afterwards.
7498     */
7499    bdrv_inc_in_flight(bs);
7500
7501    new_ctx = bdrv_get_aio_context(bs);
7502    aio_co_reschedule_self(new_ctx);
7503    return old_ctx;
7504}
7505
7506void coroutine_fn bdrv_co_leave(BlockDriverState *bs, AioContext *old_ctx)
7507{
7508    IO_CODE();
7509    aio_co_reschedule_self(old_ctx);
7510    bdrv_dec_in_flight(bs);
7511}
7512
7513static void bdrv_do_remove_aio_context_notifier(BdrvAioNotifier *ban)
7514{
7515    GLOBAL_STATE_CODE();
7516    QLIST_REMOVE(ban, list);
7517    g_free(ban);
7518}
7519
7520static void bdrv_detach_aio_context(BlockDriverState *bs)
7521{
7522    BdrvAioNotifier *baf, *baf_tmp;
7523
7524    assert(!bs->walking_aio_notifiers);
7525    GLOBAL_STATE_CODE();
7526    bs->walking_aio_notifiers = true;
7527    QLIST_FOREACH_SAFE(baf, &bs->aio_notifiers, list, baf_tmp) {
7528        if (baf->deleted) {
7529            bdrv_do_remove_aio_context_notifier(baf);
7530        } else {
7531            baf->detach_aio_context(baf->opaque);
7532        }
7533    }
7534    /* Never mind iterating again to check for ->deleted.  bdrv_close() will
7535     * remove remaining aio notifiers if we aren't called again.
7536     */
7537    bs->walking_aio_notifiers = false;
7538
7539    if (bs->drv && bs->drv->bdrv_detach_aio_context) {
7540        bs->drv->bdrv_detach_aio_context(bs);
7541    }
7542
7543    bs->aio_context = NULL;
7544}
7545
7546static void bdrv_attach_aio_context(BlockDriverState *bs,
7547                                    AioContext *new_context)
7548{
7549    BdrvAioNotifier *ban, *ban_tmp;
7550    GLOBAL_STATE_CODE();
7551
7552    bs->aio_context = new_context;
7553
7554    if (bs->drv && bs->drv->bdrv_attach_aio_context) {
7555        bs->drv->bdrv_attach_aio_context(bs, new_context);
7556    }
7557
7558    assert(!bs->walking_aio_notifiers);
7559    bs->walking_aio_notifiers = true;
7560    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_tmp) {
7561        if (ban->deleted) {
7562            bdrv_do_remove_aio_context_notifier(ban);
7563        } else {
7564            ban->attached_aio_context(new_context, ban->opaque);
7565        }
7566    }
7567    bs->walking_aio_notifiers = false;
7568}
7569
7570typedef struct BdrvStateSetAioContext {
7571    AioContext *new_ctx;
7572    BlockDriverState *bs;
7573} BdrvStateSetAioContext;
7574
7575/*
7576 * Changes the AioContext of @child to @ctx and recursively for the associated
7577 * block nodes and all their children and parents. Returns true if the change is
7578 * possible and the transaction @tran can be continued. Returns false and sets
7579 * @errp if not and the transaction must be aborted.
7580 *
7581 * @visited will accumulate all visited BdrvChild objects. The caller is
7582 * responsible for freeing the list afterwards.
7583 *
7584 * Must be called with the affected block nodes drained.
7585 */
7586static bool GRAPH_RDLOCK
7587bdrv_parent_change_aio_context(BdrvChild *c, AioContext *ctx,
7588                               GHashTable *visited, Transaction *tran,
7589                               Error **errp)
7590{
7591    GLOBAL_STATE_CODE();
7592    if (g_hash_table_contains(visited, c)) {
7593        return true;
7594    }
7595    g_hash_table_add(visited, c);
7596
7597    /*
7598     * A BdrvChildClass that doesn't handle AioContext changes cannot
7599     * tolerate any AioContext changes
7600     */
7601    if (!c->klass->change_aio_ctx) {
7602        char *user = bdrv_child_user_desc(c);
7603        error_setg(errp, "Changing iothreads is not supported by %s", user);
7604        g_free(user);
7605        return false;
7606    }
7607    if (!c->klass->change_aio_ctx(c, ctx, visited, tran, errp)) {
7608        assert(!errp || *errp);
7609        return false;
7610    }
7611    return true;
7612}
7613
7614/*
7615 * Changes the AioContext of @c->bs to @ctx and recursively for all its children
7616 * and parents. Returns true if the change is possible and the transaction @tran
7617 * can be continued. Returns false and sets @errp if not and the transaction
7618 * must be aborted.
7619 *
7620 * @visited will accumulate all visited BdrvChild objects. The caller is
7621 * responsible for freeing the list afterwards.
7622 *
7623 * Must be called with the affected block nodes drained.
7624 */
7625bool bdrv_child_change_aio_context(BdrvChild *c, AioContext *ctx,
7626                                   GHashTable *visited, Transaction *tran,
7627                                   Error **errp)
7628{
7629    GLOBAL_STATE_CODE();
7630    if (g_hash_table_contains(visited, c)) {
7631        return true;
7632    }
7633    g_hash_table_add(visited, c);
7634    return bdrv_change_aio_context(c->bs, ctx, visited, tran, errp);
7635}
7636
/* Transaction .clean callback: free the BdrvStateSetAioContext in @opaque. */
static void bdrv_set_aio_context_clean(void *opaque)
{
    g_free(opaque);
}
7643
7644static void bdrv_set_aio_context_commit(void *opaque)
7645{
7646    BdrvStateSetAioContext *state = (BdrvStateSetAioContext *) opaque;
7647    BlockDriverState *bs = (BlockDriverState *) state->bs;
7648    AioContext *new_context = state->new_ctx;
7649
7650    bdrv_detach_aio_context(bs);
7651    bdrv_attach_aio_context(bs, new_context);
7652}
7653
7654static TransactionActionDrv set_aio_context = {
7655    .commit = bdrv_set_aio_context_commit,
7656    .clean = bdrv_set_aio_context_clean,
7657};
7658
7659/*
7660 * Changes the AioContext used for fd handlers, timers, and BHs by this
7661 * BlockDriverState and all its children and parents.
7662 *
7663 * Must be called from the main AioContext.
7664 *
7665 * @visited will accumulate all visited BdrvChild objects. The caller is
7666 * responsible for freeing the list afterwards.
7667 *
7668 * @bs must be drained.
7669 */
7670static bool GRAPH_RDLOCK
7671bdrv_change_aio_context(BlockDriverState *bs, AioContext *ctx,
7672                        GHashTable *visited, Transaction *tran, Error **errp)
7673{
7674    BdrvChild *c;
7675    BdrvStateSetAioContext *state;
7676
7677    GLOBAL_STATE_CODE();
7678
7679    if (bdrv_get_aio_context(bs) == ctx) {
7680        return true;
7681    }
7682
7683    QLIST_FOREACH(c, &bs->parents, next_parent) {
7684        if (!bdrv_parent_change_aio_context(c, ctx, visited, tran, errp)) {
7685            return false;
7686        }
7687    }
7688
7689    QLIST_FOREACH(c, &bs->children, next) {
7690        if (!bdrv_child_change_aio_context(c, ctx, visited, tran, errp)) {
7691            return false;
7692        }
7693    }
7694
7695    state = g_new(BdrvStateSetAioContext, 1);
7696    *state = (BdrvStateSetAioContext) {
7697        .new_ctx = ctx,
7698        .bs = bs,
7699    };
7700
7701    assert(bs->quiesce_counter > 0);
7702
7703    tran_add(tran, &set_aio_context, state);
7704
7705    return true;
7706}
7707
7708/*
7709 * Change bs's and recursively all of its parents' and children's AioContext
7710 * to the given new context, returning an error if that isn't possible.
7711 *
7712 * If ignore_child is not NULL, that child (and its subgraph) will not
7713 * be touched.
7714 *
7715 * Called with the graph lock held.
7716 *
7717 * Called while all bs are drained.
7718 */
7719int bdrv_try_change_aio_context_locked(BlockDriverState *bs, AioContext *ctx,
7720                                       BdrvChild *ignore_child, Error **errp)
7721{
7722    Transaction *tran;
7723    GHashTable *visited;
7724    int ret;
7725    GLOBAL_STATE_CODE();
7726
7727    /*
7728     * Recursion phase: go through all nodes of the graph.
7729     * Take care of checking that all nodes support changing AioContext,
7730     * building a linear list of callbacks to run if everything is successful
7731     * (the transaction itself).
7732     */
7733    tran = tran_new();
7734    visited = g_hash_table_new(NULL, NULL);
7735    if (ignore_child) {
7736        g_hash_table_add(visited, ignore_child);
7737    }
7738    ret = bdrv_change_aio_context(bs, ctx, visited, tran, errp);
7739    g_hash_table_destroy(visited);
7740
7741    /*
7742     * Linear phase: go through all callbacks collected in the transaction.
7743     * Run all callbacks collected in the recursion to switch every node's
7744     * AioContext (transaction commit), or undo all changes done in the
7745     * recursion (transaction abort).
7746     */
7747
7748    if (!ret) {
7749        /* Just run clean() callbacks. No AioContext changed. */
7750        tran_abort(tran);
7751        return -EPERM;
7752    }
7753
7754    tran_commit(tran);
7755    return 0;
7756}
7757
7758/*
7759 * Change bs's and recursively all of its parents' and children's AioContext
7760 * to the given new context, returning an error if that isn't possible.
7761 *
7762 * If ignore_child is not NULL, that child (and its subgraph) will not
7763 * be touched.
7764 */
7765int bdrv_try_change_aio_context(BlockDriverState *bs, AioContext *ctx,
7766                                BdrvChild *ignore_child, Error **errp)
7767{
7768    int ret;
7769
7770    GLOBAL_STATE_CODE();
7771
7772    bdrv_drain_all_begin();
7773    bdrv_graph_rdlock_main_loop();
7774    ret = bdrv_try_change_aio_context_locked(bs, ctx, ignore_child, errp);
7775    bdrv_graph_rdunlock_main_loop();
7776    bdrv_drain_all_end();
7777
7778    return ret;
7779}
7780
7781void bdrv_add_aio_context_notifier(BlockDriverState *bs,
7782        void (*attached_aio_context)(AioContext *new_context, void *opaque),
7783        void (*detach_aio_context)(void *opaque), void *opaque)
7784{
7785    BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
7786    *ban = (BdrvAioNotifier){
7787        .attached_aio_context = attached_aio_context,
7788        .detach_aio_context   = detach_aio_context,
7789        .opaque               = opaque
7790    };
7791    GLOBAL_STATE_CODE();
7792
7793    QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
7794}
7795
7796void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
7797                                      void (*attached_aio_context)(AioContext *,
7798                                                                   void *),
7799                                      void (*detach_aio_context)(void *),
7800                                      void *opaque)
7801{
7802    BdrvAioNotifier *ban, *ban_next;
7803    GLOBAL_STATE_CODE();
7804
7805    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
7806        if (ban->attached_aio_context == attached_aio_context &&
7807            ban->detach_aio_context   == detach_aio_context   &&
7808            ban->opaque               == opaque               &&
7809            ban->deleted              == false)
7810        {
7811            if (bs->walking_aio_notifiers) {
7812                ban->deleted = true;
7813            } else {
7814                bdrv_do_remove_aio_context_notifier(ban);
7815            }
7816            return;
7817        }
7818    }
7819
7820    abort();
7821}
7822
7823int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts,
7824                       BlockDriverAmendStatusCB *status_cb, void *cb_opaque,
7825                       bool force,
7826                       Error **errp)
7827{
7828    GLOBAL_STATE_CODE();
7829    if (!bs->drv) {
7830        error_setg(errp, "Node is ejected");
7831        return -ENOMEDIUM;
7832    }
7833    if (!bs->drv->bdrv_amend_options) {
7834        error_setg(errp, "Block driver '%s' does not support option amendment",
7835                   bs->drv->format_name);
7836        return -ENOTSUP;
7837    }
7838    return bs->drv->bdrv_amend_options(bs, opts, status_cb,
7839                                       cb_opaque, force, errp);
7840}
7841
7842/*
7843 * This function checks whether the given @to_replace is allowed to be
7844 * replaced by a node that always shows the same data as @bs.  This is
7845 * used for example to verify whether the mirror job can replace
7846 * @to_replace by the target mirrored from @bs.
7847 * To be replaceable, @bs and @to_replace may either be guaranteed to
7848 * always show the same data (because they are only connected through
7849 * filters), or some driver may allow replacing one of its children
7850 * because it can guarantee that this child's data is not visible at
7851 * all (for example, for dissenting quorum children that have no other
7852 * parents).
7853 */
7854bool bdrv_recurse_can_replace(BlockDriverState *bs,
7855                              BlockDriverState *to_replace)
7856{
7857    BlockDriverState *filtered;
7858
7859    GLOBAL_STATE_CODE();
7860
7861    if (!bs || !bs->drv) {
7862        return false;
7863    }
7864
7865    if (bs == to_replace) {
7866        return true;
7867    }
7868
7869    /* See what the driver can do */
7870    if (bs->drv->bdrv_recurse_can_replace) {
7871        return bs->drv->bdrv_recurse_can_replace(bs, to_replace);
7872    }
7873
7874    /* For filters without an own implementation, we can recurse on our own */
7875    filtered = bdrv_filter_bs(bs);
7876    if (filtered) {
7877        return bdrv_recurse_can_replace(filtered, to_replace);
7878    }
7879
7880    /* Safe default */
7881    return false;
7882}
7883
7884/*
7885 * Check whether the given @node_name can be replaced by a node that
7886 * has the same data as @parent_bs.  If so, return @node_name's BDS;
7887 * NULL otherwise.
7888 *
7889 * @node_name must be a (recursive) *child of @parent_bs (or this
7890 * function will return NULL).
7891 *
7892 * The result (whether the node can be replaced or not) is only valid
7893 * for as long as no graph or permission changes occur.
7894 */
7895BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs,
7896                                        const char *node_name, Error **errp)
7897{
7898    BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
7899
7900    GLOBAL_STATE_CODE();
7901
7902    if (!to_replace_bs) {
7903        error_setg(errp, "Failed to find node with node-name='%s'", node_name);
7904        return NULL;
7905    }
7906
7907    if (bdrv_op_is_blocked(to_replace_bs, BLOCK_OP_TYPE_REPLACE, errp)) {
7908        return NULL;
7909    }
7910
7911    /* We don't want arbitrary node of the BDS chain to be replaced only the top
7912     * most non filter in order to prevent data corruption.
7913     * Another benefit is that this tests exclude backing files which are
7914     * blocked by the backing blockers.
7915     */
7916    if (!bdrv_recurse_can_replace(parent_bs, to_replace_bs)) {
7917        error_setg(errp, "Cannot replace '%s' by a node mirrored from '%s', "
7918                   "because it cannot be guaranteed that doing so would not "
7919                   "lead to an abrupt change of visible data",
7920                   node_name, parent_bs->node_name);
7921        return NULL;
7922    }
7923
7924    return to_replace_bs;
7925}
7926
7927/**
7928 * Iterates through the list of runtime option keys that are said to
7929 * be "strong" for a BDS.  An option is called "strong" if it changes
7930 * a BDS's data.  For example, the null block driver's "size" and
7931 * "read-zeroes" options are strong, but its "latency-ns" option is
7932 * not.
7933 *
7934 * If a key returned by this function ends with a dot, all options
7935 * starting with that prefix are strong.
7936 */
7937static const char *const *strong_options(BlockDriverState *bs,
7938                                         const char *const *curopt)
7939{
7940    static const char *const global_options[] = {
7941        "driver", "filename", NULL
7942    };
7943
7944    if (!curopt) {
7945        return &global_options[0];
7946    }
7947
7948    curopt++;
7949    if (curopt == &global_options[ARRAY_SIZE(global_options) - 1] && bs->drv) {
7950        curopt = bs->drv->strong_runtime_opts;
7951    }
7952
7953    return (curopt && *curopt) ? curopt : NULL;
7954}
7955
7956/**
7957 * Copies all strong runtime options from bs->options to the given
7958 * QDict.  The set of strong option keys is determined by invoking
7959 * strong_options().
7960 *
7961 * Returns true iff any strong option was present in bs->options (and
7962 * thus copied to the target QDict) with the exception of "filename"
7963 * and "driver".  The caller is expected to use this value to decide
7964 * whether the existence of strong options prevents the generation of
7965 * a plain filename.
7966 */
7967static bool append_strong_runtime_options(QDict *d, BlockDriverState *bs)
7968{
7969    bool found_any = false;
7970    const char *const *option_name = NULL;
7971
7972    if (!bs->drv) {
7973        return false;
7974    }
7975
7976    while ((option_name = strong_options(bs, option_name))) {
7977        bool option_given = false;
7978
7979        assert(strlen(*option_name) > 0);
7980        if ((*option_name)[strlen(*option_name) - 1] != '.') {
7981            QObject *entry = qdict_get(bs->options, *option_name);
7982            if (!entry) {
7983                continue;
7984            }
7985
7986            qdict_put_obj(d, *option_name, qobject_ref(entry));
7987            option_given = true;
7988        } else {
7989            const QDictEntry *entry;
7990            for (entry = qdict_first(bs->options); entry;
7991                 entry = qdict_next(bs->options, entry))
7992            {
7993                if (strstart(qdict_entry_key(entry), *option_name, NULL)) {
7994                    qdict_put_obj(d, qdict_entry_key(entry),
7995                                  qobject_ref(qdict_entry_value(entry)));
7996                    option_given = true;
7997                }
7998            }
7999        }
8000
8001        /* While "driver" and "filename" need to be included in a JSON filename,
8002         * their existence does not prohibit generation of a plain filename. */
8003        if (!found_any && option_given &&
8004            strcmp(*option_name, "driver") && strcmp(*option_name, "filename"))
8005        {
8006            found_any = true;
8007        }
8008    }
8009
8010    if (!qdict_haskey(d, "driver")) {
8011        /* Drivers created with bdrv_new_open_driver() may not have a
8012         * @driver option.  Add it here. */
8013        qdict_put_str(d, "driver", bs->drv->format_name);
8014    }
8015
8016    return found_any;
8017}
8018
8019/* Note: This function may return false positives; it may return true
8020 * even if opening the backing file specified by bs's image header
8021 * would result in exactly bs->backing. */
8022static bool GRAPH_RDLOCK bdrv_backing_overridden(BlockDriverState *bs)
8023{
8024    GLOBAL_STATE_CODE();
8025    if (bs->backing) {
8026        return strcmp(bs->auto_backing_file,
8027                      bs->backing->bs->filename);
8028    } else {
8029        /* No backing BDS, so if the image header reports any backing
8030         * file, it must have been suppressed */
8031        return bs->auto_backing_file[0] != '\0';
8032    }
8033}
8034
8035/* Updates the following BDS fields:
8036 *  - exact_filename: A filename which may be used for opening a block device
8037 *                    which (mostly) equals the given BDS (even without any
8038 *                    other options; so reading and writing must return the same
8039 *                    results, but caching etc. may be different)
8040 *  - full_open_options: Options which, when given when opening a block device
8041 *                       (without a filename), result in a BDS (mostly)
8042 *                       equalling the given one
8043 *  - filename: If exact_filename is set, it is copied here. Otherwise,
8044 *              full_open_options is converted to a JSON object, prefixed with
8045 *              "json:" (for use through the JSON pseudo protocol) and put here.
8046 */
8047void bdrv_refresh_filename(BlockDriverState *bs)
8048{
8049    BlockDriver *drv = bs->drv;
8050    BdrvChild *child;
8051    BlockDriverState *primary_child_bs;
8052    QDict *opts;
8053    bool backing_overridden;
8054    bool generate_json_filename; /* Whether our default implementation should
8055                                    fill exact_filename (false) or not (true) */
8056
8057    GLOBAL_STATE_CODE();
8058
8059    if (!drv) {
8060        return;
8061    }
8062
8063    /* This BDS's file name may depend on any of its children's file names, so
8064     * refresh those first */
8065    QLIST_FOREACH(child, &bs->children, next) {
8066        bdrv_refresh_filename(child->bs);
8067    }
8068
8069    if (bs->implicit) {
8070        /* For implicit nodes, just copy everything from the single child */
8071        child = QLIST_FIRST(&bs->children);
8072        assert(QLIST_NEXT(child, next) == NULL);
8073
8074        pstrcpy(bs->exact_filename, sizeof(bs->exact_filename),
8075                child->bs->exact_filename);
8076        pstrcpy(bs->filename, sizeof(bs->filename), child->bs->filename);
8077
8078        qobject_unref(bs->full_open_options);
8079        bs->full_open_options = qobject_ref(child->bs->full_open_options);
8080
8081        return;
8082    }
8083
8084    backing_overridden = bdrv_backing_overridden(bs);
8085
8086    if (bs->open_flags & BDRV_O_NO_IO) {
8087        /* Without I/O, the backing file does not change anything.
8088         * Therefore, in such a case (primarily qemu-img), we can
8089         * pretend the backing file has not been overridden even if
8090         * it technically has been. */
8091        backing_overridden = false;
8092    }
8093
8094    /* Gather the options QDict */
8095    opts = qdict_new();
8096    generate_json_filename = append_strong_runtime_options(opts, bs);
8097    generate_json_filename |= backing_overridden;
8098
8099    if (drv->bdrv_gather_child_options) {
8100        /* Some block drivers may not want to present all of their children's
8101         * options, or name them differently from BdrvChild.name */
8102        drv->bdrv_gather_child_options(bs, opts, backing_overridden);
8103    } else {
8104        QLIST_FOREACH(child, &bs->children, next) {
8105            if (child == bs->backing && !backing_overridden) {
8106                /* We can skip the backing BDS if it has not been overridden */
8107                continue;
8108            }
8109
8110            qdict_put(opts, child->name,
8111                      qobject_ref(child->bs->full_open_options));
8112        }
8113
8114        if (backing_overridden && !bs->backing) {
8115            /* Force no backing file */
8116            qdict_put_null(opts, "backing");
8117        }
8118    }
8119
8120    qobject_unref(bs->full_open_options);
8121    bs->full_open_options = opts;
8122
8123    primary_child_bs = bdrv_primary_bs(bs);
8124
8125    if (drv->bdrv_refresh_filename) {
8126        /* Obsolete information is of no use here, so drop the old file name
8127         * information before refreshing it */
8128        bs->exact_filename[0] = '\0';
8129
8130        drv->bdrv_refresh_filename(bs);
8131    } else if (primary_child_bs) {
8132        /*
8133         * Try to reconstruct valid information from the underlying
8134         * file -- this only works for format nodes (filter nodes
8135         * cannot be probed and as such must be selected by the user
8136         * either through an options dict, or through a special
8137         * filename which the filter driver must construct in its
8138         * .bdrv_refresh_filename() implementation).
8139         */
8140
8141        bs->exact_filename[0] = '\0';
8142
8143        /*
8144         * We can use the underlying file's filename if:
8145         * - it has a filename,
8146         * - the current BDS is not a filter,
8147         * - the file is a protocol BDS, and
8148         * - opening that file (as this BDS's format) will automatically create
8149         *   the BDS tree we have right now, that is:
8150         *   - the user did not significantly change this BDS's behavior with
8151         *     some explicit (strong) options
8152         *   - no non-file child of this BDS has been overridden by the user
8153         *   Both of these conditions are represented by generate_json_filename.
8154         */
8155        if (primary_child_bs->exact_filename[0] &&
8156            primary_child_bs->drv->protocol_name &&
8157            !drv->is_filter && !generate_json_filename)
8158        {
8159            strcpy(bs->exact_filename, primary_child_bs->exact_filename);
8160        }
8161    }
8162
8163    if (bs->exact_filename[0]) {
8164        pstrcpy(bs->filename, sizeof(bs->filename), bs->exact_filename);
8165    } else {
8166        GString *json = qobject_to_json(QOBJECT(bs->full_open_options));
8167        if (snprintf(bs->filename, sizeof(bs->filename), "json:%s",
8168                     json->str) >= sizeof(bs->filename)) {
8169            /* Give user a hint if we truncated things. */
8170            strcpy(bs->filename + sizeof(bs->filename) - 4, "...");
8171        }
8172        g_string_free(json, true);
8173    }
8174}
8175
8176char *bdrv_dirname(BlockDriverState *bs, Error **errp)
8177{
8178    BlockDriver *drv = bs->drv;
8179    BlockDriverState *child_bs;
8180
8181    GLOBAL_STATE_CODE();
8182
8183    if (!drv) {
8184        error_setg(errp, "Node '%s' is ejected", bs->node_name);
8185        return NULL;
8186    }
8187
8188    if (drv->bdrv_dirname) {
8189        return drv->bdrv_dirname(bs, errp);
8190    }
8191
8192    child_bs = bdrv_primary_bs(bs);
8193    if (child_bs) {
8194        return bdrv_dirname(child_bs, errp);
8195    }
8196
8197    bdrv_refresh_filename(bs);
8198    if (bs->exact_filename[0] != '\0') {
8199        return path_combine(bs->exact_filename, "");
8200    }
8201
8202    error_setg(errp, "Cannot generate a base directory for %s nodes",
8203               drv->format_name);
8204    return NULL;
8205}
8206
8207/*
8208 * Hot add a BDS's child. Used in combination with bdrv_del_child, so the user
8209 * can take a child offline when it is broken and take a new child online.
8210 *
8211 * All block nodes must be drained.
8212 */
8213void bdrv_add_child(BlockDriverState *parent_bs, BlockDriverState *child_bs,
8214                    Error **errp)
8215{
8216    GLOBAL_STATE_CODE();
8217    if (!parent_bs->drv || !parent_bs->drv->bdrv_add_child) {
8218        error_setg(errp, "The node %s does not support adding a child",
8219                   bdrv_get_device_or_node_name(parent_bs));
8220        return;
8221    }
8222
8223    /*
8224     * Non-zoned block drivers do not follow zoned storage constraints
8225     * (i.e. sequential writes to zones). Refuse mixing zoned and non-zoned
8226     * drivers in a graph.
8227     */
8228    if (!parent_bs->drv->supports_zoned_children &&
8229        child_bs->bl.zoned == BLK_Z_HM) {
8230        /*
8231         * The host-aware model allows zoned storage constraints and random
8232         * write. Allow mixing host-aware and non-zoned drivers. Using
8233         * host-aware device as a regular device.
8234         */
8235        error_setg(errp, "Cannot add a %s child to a %s parent",
8236                   child_bs->bl.zoned == BLK_Z_HM ? "zoned" : "non-zoned",
8237                   parent_bs->drv->supports_zoned_children ?
8238                   "support zoned children" : "not support zoned children");
8239        return;
8240    }
8241
8242    if (!QLIST_EMPTY(&child_bs->parents)) {
8243        error_setg(errp, "The node %s already has a parent",
8244                   child_bs->node_name);
8245        return;
8246    }
8247
8248    parent_bs->drv->bdrv_add_child(parent_bs, child_bs, errp);
8249}
8250
8251/*
8252 * Hot remove a BDS's child. Used in combination with bdrv_add_child, so the
8253 * user can take a child offline when it is broken and take a new child online.
8254 *
8255 * All block nodes must be drained.
8256 */
8257void bdrv_del_child(BlockDriverState *parent_bs, BdrvChild *child, Error **errp)
8258{
8259    BdrvChild *tmp;
8260
8261    GLOBAL_STATE_CODE();
8262    if (!parent_bs->drv || !parent_bs->drv->bdrv_del_child) {
8263        error_setg(errp, "The node %s does not support removing a child",
8264                   bdrv_get_device_or_node_name(parent_bs));
8265        return;
8266    }
8267
8268    QLIST_FOREACH(tmp, &parent_bs->children, next) {
8269        if (tmp == child) {
8270            break;
8271        }
8272    }
8273
8274    if (!tmp) {
8275        error_setg(errp, "The node %s does not have a child named %s",
8276                   bdrv_get_device_or_node_name(parent_bs),
8277                   bdrv_get_device_or_node_name(child->bs));
8278        return;
8279    }
8280
8281    parent_bs->drv->bdrv_del_child(parent_bs, child, errp);
8282}
8283
8284int bdrv_make_empty(BdrvChild *c, Error **errp)
8285{
8286    BlockDriver *drv = c->bs->drv;
8287    int ret;
8288
8289    GLOBAL_STATE_CODE();
8290    assert(c->perm & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED));
8291
8292    if (!drv->bdrv_make_empty) {
8293        error_setg(errp, "%s does not support emptying nodes",
8294                   drv->format_name);
8295        return -ENOTSUP;
8296    }
8297
8298    ret = drv->bdrv_make_empty(c->bs);
8299    if (ret < 0) {
8300        error_setg_errno(errp, -ret, "Failed to empty %s",
8301                         c->bs->filename);
8302        return ret;
8303    }
8304
8305    return 0;
8306}
8307
8308/*
8309 * Return the child that @bs acts as an overlay for, and from which data may be
8310 * copied in COW or COR operations.  Usually this is the backing file.
8311 */
8312BdrvChild *bdrv_cow_child(BlockDriverState *bs)
8313{
8314    IO_CODE();
8315
8316    if (!bs || !bs->drv) {
8317        return NULL;
8318    }
8319
8320    if (bs->drv->is_filter) {
8321        return NULL;
8322    }
8323
8324    if (!bs->backing) {
8325        return NULL;
8326    }
8327
8328    assert(bs->backing->role & BDRV_CHILD_COW);
8329    return bs->backing;
8330}
8331
8332/*
8333 * If @bs acts as a filter for exactly one of its children, return
8334 * that child.
8335 */
8336BdrvChild *bdrv_filter_child(BlockDriverState *bs)
8337{
8338    BdrvChild *c;
8339    IO_CODE();
8340
8341    if (!bs || !bs->drv) {
8342        return NULL;
8343    }
8344
8345    if (!bs->drv->is_filter) {
8346        return NULL;
8347    }
8348
8349    /* Only one of @backing or @file may be used */
8350    assert(!(bs->backing && bs->file));
8351
8352    c = bs->backing ?: bs->file;
8353    if (!c) {
8354        return NULL;
8355    }
8356
8357    assert(c->role & BDRV_CHILD_FILTERED);
8358    return c;
8359}
8360
8361/*
8362 * Return either the result of bdrv_cow_child() or bdrv_filter_child(),
8363 * whichever is non-NULL.
8364 *
8365 * Return NULL if both are NULL.
8366 */
8367BdrvChild *bdrv_filter_or_cow_child(BlockDriverState *bs)
8368{
8369    BdrvChild *cow_child = bdrv_cow_child(bs);
8370    BdrvChild *filter_child = bdrv_filter_child(bs);
8371    IO_CODE();
8372
8373    /* Filter nodes cannot have COW backing files */
8374    assert(!(cow_child && filter_child));
8375
8376    return cow_child ?: filter_child;
8377}
8378
8379/*
8380 * Return the primary child of this node: For filters, that is the
8381 * filtered child.  For other nodes, that is usually the child storing
8382 * metadata.
8383 * (A generally more helpful description is that this is (usually) the
8384 * child that has the same filename as @bs.)
8385 *
8386 * Drivers do not necessarily have a primary child; for example quorum
8387 * does not.
8388 */
8389BdrvChild *bdrv_primary_child(BlockDriverState *bs)
8390{
8391    BdrvChild *c, *found = NULL;
8392    IO_CODE();
8393
8394    QLIST_FOREACH(c, &bs->children, next) {
8395        if (c->role & BDRV_CHILD_PRIMARY) {
8396            assert(!found);
8397            found = c;
8398        }
8399    }
8400
8401    return found;
8402}
8403
8404static BlockDriverState * GRAPH_RDLOCK
8405bdrv_do_skip_filters(BlockDriverState *bs, bool stop_on_explicit_filter)
8406{
8407    BdrvChild *c;
8408
8409    if (!bs) {
8410        return NULL;
8411    }
8412
8413    while (!(stop_on_explicit_filter && !bs->implicit)) {
8414        c = bdrv_filter_child(bs);
8415        if (!c) {
8416            /*
8417             * A filter that is embedded in a working block graph must
8418             * have a child.  Assert this here so this function does
8419             * not return a filter node that is not expected by the
8420             * caller.
8421             */
8422            assert(!bs->drv || !bs->drv->is_filter);
8423            break;
8424        }
8425        bs = c->bs;
8426    }
8427    /*
8428     * Note that this treats nodes with bs->drv == NULL as not being
8429     * filters (bs->drv == NULL should be replaced by something else
8430     * anyway).
8431     * The advantage of this behavior is that this function will thus
8432     * always return a non-NULL value (given a non-NULL @bs).
8433     */
8434
8435    return bs;
8436}
8437
8438/*
8439 * Return the first BDS that has not been added implicitly or that
8440 * does not have a filtered child down the chain starting from @bs
8441 * (including @bs itself).
8442 */
8443BlockDriverState *bdrv_skip_implicit_filters(BlockDriverState *bs)
8444{
8445    GLOBAL_STATE_CODE();
8446    return bdrv_do_skip_filters(bs, true);
8447}
8448
8449/*
8450 * Return the first BDS that does not have a filtered child down the
8451 * chain starting from @bs (including @bs itself).
8452 */
8453BlockDriverState *bdrv_skip_filters(BlockDriverState *bs)
8454{
8455    IO_CODE();
8456    return bdrv_do_skip_filters(bs, false);
8457}
8458
8459/*
8460 * For a backing chain, return the first non-filter backing image of
8461 * the first non-filter image.
8462 */
8463BlockDriverState *bdrv_backing_chain_next(BlockDriverState *bs)
8464{
8465    IO_CODE();
8466    return bdrv_skip_filters(bdrv_cow_bs(bdrv_skip_filters(bs)));
8467}
8468
8469/**
8470 * Check whether [offset, offset + bytes) overlaps with the cached
8471 * block-status data region.
8472 *
8473 * If so, and @pnum is not NULL, set *pnum to `bsc.data_end - offset`,
8474 * which is what bdrv_bsc_is_data()'s interface needs.
8475 * Otherwise, *pnum is not touched.
8476 */
8477static bool bdrv_bsc_range_overlaps_locked(BlockDriverState *bs,
8478                                           int64_t offset, int64_t bytes,
8479                                           int64_t *pnum)
8480{
8481    BdrvBlockStatusCache *bsc = qatomic_rcu_read(&bs->block_status_cache);
8482    bool overlaps;
8483
8484    overlaps =
8485        qatomic_read(&bsc->valid) &&
8486        ranges_overlap(offset, bytes, bsc->data_start,
8487                       bsc->data_end - bsc->data_start);
8488
8489    if (overlaps && pnum) {
8490        *pnum = bsc->data_end - offset;
8491    }
8492
8493    return overlaps;
8494}
8495
8496/**
8497 * See block_int.h for this function's documentation.
8498 */
8499bool bdrv_bsc_is_data(BlockDriverState *bs, int64_t offset, int64_t *pnum)
8500{
8501    IO_CODE();
8502    RCU_READ_LOCK_GUARD();
8503    return bdrv_bsc_range_overlaps_locked(bs, offset, 1, pnum);
8504}
8505
8506/**
8507 * See block_int.h for this function's documentation.
8508 */
8509void bdrv_bsc_invalidate_range(BlockDriverState *bs,
8510                               int64_t offset, int64_t bytes)
8511{
8512    IO_CODE();
8513    RCU_READ_LOCK_GUARD();
8514
8515    if (bdrv_bsc_range_overlaps_locked(bs, offset, bytes, NULL)) {
8516        qatomic_set(&bs->block_status_cache->valid, false);
8517    }
8518}
8519
8520/**
8521 * See block_int.h for this function's documentation.
8522 */
8523void bdrv_bsc_fill(BlockDriverState *bs, int64_t offset, int64_t bytes)
8524{
8525    BdrvBlockStatusCache *new_bsc = g_new(BdrvBlockStatusCache, 1);
8526    BdrvBlockStatusCache *old_bsc;
8527    IO_CODE();
8528
8529    *new_bsc = (BdrvBlockStatusCache) {
8530        .valid = true,
8531        .data_start = offset,
8532        .data_end = offset + bytes,
8533    };
8534
8535    QEMU_LOCK_GUARD(&bs->bsc_modify_lock);
8536
8537    old_bsc = qatomic_rcu_read(&bs->block_status_cache);
8538    qatomic_rcu_set(&bs->block_status_cache, new_bsc);
8539    if (old_bsc) {
8540        g_free_rcu(old_bsc, rcu);
8541    }
8542}
8543