qemu/block.c
<<
>>
Prefs
   1/*
   2 * QEMU System Emulator block driver
   3 *
   4 * Copyright (c) 2003 Fabrice Bellard
   5 * Copyright (c) 2020 Virtuozzo International GmbH.
   6 *
   7 * Permission is hereby granted, free of charge, to any person obtaining a copy
   8 * of this software and associated documentation files (the "Software"), to deal
   9 * in the Software without restriction, including without limitation the rights
  10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 * copies of the Software, and to permit persons to whom the Software is
  12 * furnished to do so, subject to the following conditions:
  13 *
  14 * The above copyright notice and this permission notice shall be included in
  15 * all copies or substantial portions of the Software.
  16 *
  17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  23 * THE SOFTWARE.
  24 */
  25
  26#include "qemu/osdep.h"
  27#include "block/trace.h"
  28#include "block/block_int.h"
  29#include "block/blockjob.h"
  30#include "block/fuse.h"
  31#include "block/nbd.h"
  32#include "block/qdict.h"
  33#include "qemu/error-report.h"
  34#include "block/module_block.h"
  35#include "qemu/main-loop.h"
  36#include "qemu/module.h"
  37#include "qapi/error.h"
  38#include "qapi/qmp/qdict.h"
  39#include "qapi/qmp/qjson.h"
  40#include "qapi/qmp/qnull.h"
  41#include "qapi/qmp/qstring.h"
  42#include "qapi/qobject-output-visitor.h"
  43#include "qapi/qapi-visit-block-core.h"
  44#include "sysemu/block-backend.h"
  45#include "qemu/notify.h"
  46#include "qemu/option.h"
  47#include "qemu/coroutine.h"
  48#include "block/qapi.h"
  49#include "qemu/timer.h"
  50#include "qemu/cutils.h"
  51#include "qemu/id.h"
  52#include "block/coroutines.h"
  53
  54#ifdef CONFIG_BSD
  55#include <sys/ioctl.h>
  56#include <sys/queue.h>
  57#if defined(HAVE_SYS_DISK_H)
  58#include <sys/disk.h>
  59#endif
  60#endif
  61
  62#ifdef _WIN32
  63#include <windows.h>
  64#endif
  65
  66#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
  67
/*
 * A list of block driver states.  Nothing in the nearby code touches it.
 * NOTE(review): presumably the nodes that are part of the block graph
 * under a node name -- confirm against the users of this list.
 */
static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);

/* Every BlockDriverState allocated by bdrv_new() is appended to this list */
static QTAILQ_HEAD(, BlockDriverState) all_bdrv_states =
    QTAILQ_HEAD_INITIALIZER(all_bdrv_states);

/* Registered block drivers; bdrv_register() inserts new entries at the head */
static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);
  76
/* Forward declarations of file-local (static) helpers */

/* Node opening */
static BlockDriverState *bdrv_open_inherit(const char *filename,
                                           const char *reference,
                                           QDict *options, int flags,
                                           BlockDriverState *parent,
                                           const BdrvChildClass *child_class,
                                           BdrvChildRole child_role,
                                           Error **errp);

/* Child replacement and removal */
static void bdrv_replace_child_noperm(BdrvChild *child,
                                      BlockDriverState *new_bs);
static void bdrv_remove_file_or_backing_child(BlockDriverState *bs,
                                              BdrvChild *child,
                                              Transaction *tran);
static void bdrv_remove_filter_or_cow_child(BlockDriverState *bs,
                                            Transaction *tran);

/* Reopen: prepare, then either commit or abort */
static int bdrv_reopen_prepare(BDRVReopenState *reopen_state,
                               BlockReopenQueue *queue,
                               Transaction *change_child_tran, Error **errp);
static void bdrv_reopen_commit(BDRVReopenState *reopen_state);
static void bdrv_reopen_abort(BDRVReopenState *reopen_state);
  98
  99/* If non-zero, use only whitelisted block drivers */
 100static int use_bdrv_whitelist;
 101
 102#ifdef _WIN32
 103static int is_windows_drive_prefix(const char *filename)
 104{
 105    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
 106             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
 107            filename[1] == ':');
 108}
 109
 110int is_windows_drive(const char *filename)
 111{
 112    if (is_windows_drive_prefix(filename) &&
 113        filename[2] == '\0')
 114        return 1;
 115    if (strstart(filename, "\\\\.\\", NULL) ||
 116        strstart(filename, "//./", NULL))
 117        return 1;
 118    return 0;
 119}
 120#endif
 121
 122size_t bdrv_opt_mem_align(BlockDriverState *bs)
 123{
 124    if (!bs || !bs->drv) {
 125        /* page size or 4k (hdd sector size) should be on the safe side */
 126        return MAX(4096, qemu_real_host_page_size);
 127    }
 128
 129    return bs->bl.opt_mem_alignment;
 130}
 131
 132size_t bdrv_min_mem_align(BlockDriverState *bs)
 133{
 134    if (!bs || !bs->drv) {
 135        /* page size or 4k (hdd sector size) should be on the safe side */
 136        return MAX(4096, qemu_real_host_page_size);
 137    }
 138
 139    return bs->bl.min_mem_alignment;
 140}
 141
 142/* check if the path starts with "<protocol>:" */
 143int path_has_protocol(const char *path)
 144{
 145    const char *p;
 146
 147#ifdef _WIN32
 148    if (is_windows_drive(path) ||
 149        is_windows_drive_prefix(path)) {
 150        return 0;
 151    }
 152    p = path + strcspn(path, ":/\\");
 153#else
 154    p = path + strcspn(path, ":/");
 155#endif
 156
 157    return *p == ':';
 158}
 159
 160int path_is_absolute(const char *path)
 161{
 162#ifdef _WIN32
 163    /* specific case for names like: "\\.\d:" */
 164    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
 165        return 1;
 166    }
 167    return (*path == '/' || *path == '\\');
 168#else
 169    return (*path == '/');
 170#endif
 171}
 172
 173/* if filename is absolute, just return its duplicate. Otherwise, build a
 174   path to it by considering it is relative to base_path. URL are
 175   supported. */
 176char *path_combine(const char *base_path, const char *filename)
 177{
 178    const char *protocol_stripped = NULL;
 179    const char *p, *p1;
 180    char *result;
 181    int len;
 182
 183    if (path_is_absolute(filename)) {
 184        return g_strdup(filename);
 185    }
 186
 187    if (path_has_protocol(base_path)) {
 188        protocol_stripped = strchr(base_path, ':');
 189        if (protocol_stripped) {
 190            protocol_stripped++;
 191        }
 192    }
 193    p = protocol_stripped ?: base_path;
 194
 195    p1 = strrchr(base_path, '/');
 196#ifdef _WIN32
 197    {
 198        const char *p2;
 199        p2 = strrchr(base_path, '\\');
 200        if (!p1 || p2 > p1) {
 201            p1 = p2;
 202        }
 203    }
 204#endif
 205    if (p1) {
 206        p1++;
 207    } else {
 208        p1 = base_path;
 209    }
 210    if (p1 > p) {
 211        p = p1;
 212    }
 213    len = p - base_path;
 214
 215    result = g_malloc(len + strlen(filename) + 1);
 216    memcpy(result, base_path, len);
 217    strcpy(result + len, filename);
 218
 219    return result;
 220}
 221
 222/*
 223 * Helper function for bdrv_parse_filename() implementations to remove optional
 224 * protocol prefixes (especially "file:") from a filename and for putting the
 225 * stripped filename into the options QDict if there is such a prefix.
 226 */
 227void bdrv_parse_filename_strip_prefix(const char *filename, const char *prefix,
 228                                      QDict *options)
 229{
 230    if (strstart(filename, prefix, &filename)) {
 231        /* Stripping the explicit protocol prefix may result in a protocol
 232         * prefix being (wrongly) detected (if the filename contains a colon) */
 233        if (path_has_protocol(filename)) {
 234            GString *fat_filename;
 235
 236            /* This means there is some colon before the first slash; therefore,
 237             * this cannot be an absolute path */
 238            assert(!path_is_absolute(filename));
 239
 240            /* And we can thus fix the protocol detection issue by prefixing it
 241             * by "./" */
 242            fat_filename = g_string_new("./");
 243            g_string_append(fat_filename, filename);
 244
 245            assert(!path_has_protocol(fat_filename->str));
 246
 247            qdict_put(options, "filename",
 248                      qstring_from_gstring(fat_filename));
 249        } else {
 250            /* If no protocol prefix was detected, we can use the shortened
 251             * filename as-is */
 252            qdict_put_str(options, "filename", filename);
 253        }
 254    }
 255}
 256
 257
 258/* Returns whether the image file is opened as read-only. Note that this can
 259 * return false and writing to the image file is still not possible because the
 260 * image is inactivated. */
 261bool bdrv_is_read_only(BlockDriverState *bs)
 262{
 263    return !(bs->open_flags & BDRV_O_RDWR);
 264}
 265
 266int bdrv_can_set_read_only(BlockDriverState *bs, bool read_only,
 267                           bool ignore_allow_rdw, Error **errp)
 268{
 269    /* Do not set read_only if copy_on_read is enabled */
 270    if (bs->copy_on_read && read_only) {
 271        error_setg(errp, "Can't set node '%s' to r/o with copy-on-read enabled",
 272                   bdrv_get_device_or_node_name(bs));
 273        return -EINVAL;
 274    }
 275
 276    /* Do not clear read_only if it is prohibited */
 277    if (!read_only && !(bs->open_flags & BDRV_O_ALLOW_RDWR) &&
 278        !ignore_allow_rdw)
 279    {
 280        error_setg(errp, "Node '%s' is read only",
 281                   bdrv_get_device_or_node_name(bs));
 282        return -EPERM;
 283    }
 284
 285    return 0;
 286}
 287
 288/*
 289 * Called by a driver that can only provide a read-only image.
 290 *
 291 * Returns 0 if the node is already read-only or it could switch the node to
 292 * read-only because BDRV_O_AUTO_RDONLY is set.
 293 *
 294 * Returns -EACCES if the node is read-write and BDRV_O_AUTO_RDONLY is not set
 295 * or bdrv_can_set_read_only() forbids making the node read-only. If @errmsg
 296 * is not NULL, it is used as the error message for the Error object.
 297 */
 298int bdrv_apply_auto_read_only(BlockDriverState *bs, const char *errmsg,
 299                              Error **errp)
 300{
 301    int ret = 0;
 302
 303    if (!(bs->open_flags & BDRV_O_RDWR)) {
 304        return 0;
 305    }
 306    if (!(bs->open_flags & BDRV_O_AUTO_RDONLY)) {
 307        goto fail;
 308    }
 309
 310    ret = bdrv_can_set_read_only(bs, true, false, NULL);
 311    if (ret < 0) {
 312        goto fail;
 313    }
 314
 315    bs->open_flags &= ~BDRV_O_RDWR;
 316
 317    return 0;
 318
 319fail:
 320    error_setg(errp, "%s", errmsg ?: "Image is read-only");
 321    return -EACCES;
 322}
 323
 324/*
 325 * If @backing is empty, this function returns NULL without setting
 326 * @errp.  In all other cases, NULL will only be returned with @errp
 327 * set.
 328 *
 329 * Therefore, a return value of NULL without @errp set means that
 330 * there is no backing file; if @errp is set, there is one but its
 331 * absolute filename cannot be generated.
 332 */
 333char *bdrv_get_full_backing_filename_from_filename(const char *backed,
 334                                                   const char *backing,
 335                                                   Error **errp)
 336{
 337    if (backing[0] == '\0') {
 338        return NULL;
 339    } else if (path_has_protocol(backing) || path_is_absolute(backing)) {
 340        return g_strdup(backing);
 341    } else if (backed[0] == '\0' || strstart(backed, "json:", NULL)) {
 342        error_setg(errp, "Cannot use relative backing file names for '%s'",
 343                   backed);
 344        return NULL;
 345    } else {
 346        return path_combine(backed, backing);
 347    }
 348}
 349
 350/*
 351 * If @filename is empty or NULL, this function returns NULL without
 352 * setting @errp.  In all other cases, NULL will only be returned with
 353 * @errp set.
 354 */
 355static char *bdrv_make_absolute_filename(BlockDriverState *relative_to,
 356                                         const char *filename, Error **errp)
 357{
 358    char *dir, *full_name;
 359
 360    if (!filename || filename[0] == '\0') {
 361        return NULL;
 362    } else if (path_has_protocol(filename) || path_is_absolute(filename)) {
 363        return g_strdup(filename);
 364    }
 365
 366    dir = bdrv_dirname(relative_to, errp);
 367    if (!dir) {
 368        return NULL;
 369    }
 370
 371    full_name = g_strconcat(dir, filename, NULL);
 372    g_free(dir);
 373    return full_name;
 374}
 375
 376char *bdrv_get_full_backing_filename(BlockDriverState *bs, Error **errp)
 377{
 378    return bdrv_make_absolute_filename(bs, bs->backing_file, errp);
 379}
 380
 381void bdrv_register(BlockDriver *bdrv)
 382{
 383    assert(bdrv->format_name);
 384    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
 385}
 386
 387BlockDriverState *bdrv_new(void)
 388{
 389    BlockDriverState *bs;
 390    int i;
 391
 392    bs = g_new0(BlockDriverState, 1);
 393    QLIST_INIT(&bs->dirty_bitmaps);
 394    for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
 395        QLIST_INIT(&bs->op_blockers[i]);
 396    }
 397    qemu_co_mutex_init(&bs->reqs_lock);
 398    qemu_mutex_init(&bs->dirty_bitmap_mutex);
 399    bs->refcnt = 1;
 400    bs->aio_context = qemu_get_aio_context();
 401
 402    qemu_co_queue_init(&bs->flush_queue);
 403
 404    for (i = 0; i < bdrv_drain_all_count; i++) {
 405        bdrv_drained_begin(bs);
 406    }
 407
 408    QTAILQ_INSERT_TAIL(&all_bdrv_states, bs, bs_list);
 409
 410    return bs;
 411}
 412
 413static BlockDriver *bdrv_do_find_format(const char *format_name)
 414{
 415    BlockDriver *drv1;
 416
 417    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
 418        if (!strcmp(drv1->format_name, format_name)) {
 419            return drv1;
 420        }
 421    }
 422
 423    return NULL;
 424}
 425
 426BlockDriver *bdrv_find_format(const char *format_name)
 427{
 428    BlockDriver *drv1;
 429    int i;
 430
 431    drv1 = bdrv_do_find_format(format_name);
 432    if (drv1) {
 433        return drv1;
 434    }
 435
 436    /* The driver isn't registered, maybe we need to load a module */
 437    for (i = 0; i < (int)ARRAY_SIZE(block_driver_modules); ++i) {
 438        if (!strcmp(block_driver_modules[i].format_name, format_name)) {
 439            block_module_load_one(block_driver_modules[i].library_name);
 440            break;
 441        }
 442    }
 443
 444    return bdrv_do_find_format(format_name);
 445}
 446
 447static int bdrv_format_is_whitelisted(const char *format_name, bool read_only)
 448{
 449    static const char *whitelist_rw[] = {
 450        CONFIG_BDRV_RW_WHITELIST
 451        NULL
 452    };
 453    static const char *whitelist_ro[] = {
 454        CONFIG_BDRV_RO_WHITELIST
 455        NULL
 456    };
 457    const char **p;
 458
 459    if (!whitelist_rw[0] && !whitelist_ro[0]) {
 460        return 1;               /* no whitelist, anything goes */
 461    }
 462
 463    for (p = whitelist_rw; *p; p++) {
 464        if (!strcmp(format_name, *p)) {
 465            return 1;
 466        }
 467    }
 468    if (read_only) {
 469        for (p = whitelist_ro; *p; p++) {
 470            if (!strcmp(format_name, *p)) {
 471                return 1;
 472            }
 473        }
 474    }
 475    return 0;
 476}
 477
 478int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
 479{
 480    return bdrv_format_is_whitelisted(drv->format_name, read_only);
 481}
 482
 483bool bdrv_uses_whitelist(void)
 484{
 485    return use_bdrv_whitelist;
 486}
 487
/* State shared between bdrv_create() and bdrv_create_co_entry() */
typedef struct CreateCo {
    BlockDriver *drv;   /* driver whose bdrv_co_create_opts() is invoked */
    char *filename;     /* copy of the image name; freed by bdrv_create() */
    QemuOpts *opts;     /* creation options handed to the driver */
    int ret;            /* NOT_DONE until the driver's result is stored */
    Error *err;         /* error object filled in by the driver, if any */
} CreateCo;
 495
 496static void coroutine_fn bdrv_create_co_entry(void *opaque)
 497{
 498    Error *local_err = NULL;
 499    int ret;
 500
 501    CreateCo *cco = opaque;
 502    assert(cco->drv);
 503
 504    ret = cco->drv->bdrv_co_create_opts(cco->drv,
 505                                        cco->filename, cco->opts, &local_err);
 506    error_propagate(&cco->err, local_err);
 507    cco->ret = ret;
 508}
 509
 510int bdrv_create(BlockDriver *drv, const char* filename,
 511                QemuOpts *opts, Error **errp)
 512{
 513    int ret;
 514
 515    Coroutine *co;
 516    CreateCo cco = {
 517        .drv = drv,
 518        .filename = g_strdup(filename),
 519        .opts = opts,
 520        .ret = NOT_DONE,
 521        .err = NULL,
 522    };
 523
 524    if (!drv->bdrv_co_create_opts) {
 525        error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
 526        ret = -ENOTSUP;
 527        goto out;
 528    }
 529
 530    if (qemu_in_coroutine()) {
 531        /* Fast-path if already in coroutine context */
 532        bdrv_create_co_entry(&cco);
 533    } else {
 534        co = qemu_coroutine_create(bdrv_create_co_entry, &cco);
 535        qemu_coroutine_enter(co);
 536        while (cco.ret == NOT_DONE) {
 537            aio_poll(qemu_get_aio_context(), true);
 538        }
 539    }
 540
 541    ret = cco.ret;
 542    if (ret < 0) {
 543        if (cco.err) {
 544            error_propagate(errp, cco.err);
 545        } else {
 546            error_setg_errno(errp, -ret, "Could not create image");
 547        }
 548    }
 549
 550out:
 551    g_free(cco.filename);
 552    return ret;
 553}
 554
 555/**
 556 * Helper function for bdrv_create_file_fallback(): Resize @blk to at
 557 * least the given @minimum_size.
 558 *
 559 * On success, return @blk's actual length.
 560 * Otherwise, return -errno.
 561 */
 562static int64_t create_file_fallback_truncate(BlockBackend *blk,
 563                                             int64_t minimum_size, Error **errp)
 564{
 565    Error *local_err = NULL;
 566    int64_t size;
 567    int ret;
 568
 569    ret = blk_truncate(blk, minimum_size, false, PREALLOC_MODE_OFF, 0,
 570                       &local_err);
 571    if (ret < 0 && ret != -ENOTSUP) {
 572        error_propagate(errp, local_err);
 573        return ret;
 574    }
 575
 576    size = blk_getlength(blk);
 577    if (size < 0) {
 578        error_free(local_err);
 579        error_setg_errno(errp, -size,
 580                         "Failed to inquire the new image file's length");
 581        return size;
 582    }
 583
 584    if (size < minimum_size) {
 585        /* Need to grow the image, but we failed to do that */
 586        error_propagate(errp, local_err);
 587        return -ENOTSUP;
 588    }
 589
 590    error_free(local_err);
 591    local_err = NULL;
 592
 593    return size;
 594}
 595
 596/**
 597 * Helper function for bdrv_create_file_fallback(): Zero the first
 598 * sector to remove any potentially pre-existing image header.
 599 */
 600static int create_file_fallback_zero_first_sector(BlockBackend *blk,
 601                                                  int64_t current_size,
 602                                                  Error **errp)
 603{
 604    int64_t bytes_to_clear;
 605    int ret;
 606
 607    bytes_to_clear = MIN(current_size, BDRV_SECTOR_SIZE);
 608    if (bytes_to_clear) {
 609        ret = blk_pwrite_zeroes(blk, 0, bytes_to_clear, BDRV_REQ_MAY_UNMAP);
 610        if (ret < 0) {
 611            error_setg_errno(errp, -ret,
 612                             "Failed to clear the new image's first sector");
 613            return ret;
 614        }
 615    }
 616
 617    return 0;
 618}
 619
 620/**
 621 * Simple implementation of bdrv_co_create_opts for protocol drivers
 622 * which only support creation via opening a file
 623 * (usually existing raw storage device)
 624 */
 625int coroutine_fn bdrv_co_create_opts_simple(BlockDriver *drv,
 626                                            const char *filename,
 627                                            QemuOpts *opts,
 628                                            Error **errp)
 629{
 630    BlockBackend *blk;
 631    QDict *options;
 632    int64_t size = 0;
 633    char *buf = NULL;
 634    PreallocMode prealloc;
 635    Error *local_err = NULL;
 636    int ret;
 637
 638    size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
 639    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
 640    prealloc = qapi_enum_parse(&PreallocMode_lookup, buf,
 641                               PREALLOC_MODE_OFF, &local_err);
 642    g_free(buf);
 643    if (local_err) {
 644        error_propagate(errp, local_err);
 645        return -EINVAL;
 646    }
 647
 648    if (prealloc != PREALLOC_MODE_OFF) {
 649        error_setg(errp, "Unsupported preallocation mode '%s'",
 650                   PreallocMode_str(prealloc));
 651        return -ENOTSUP;
 652    }
 653
 654    options = qdict_new();
 655    qdict_put_str(options, "driver", drv->format_name);
 656
 657    blk = blk_new_open(filename, NULL, options,
 658                       BDRV_O_RDWR | BDRV_O_RESIZE, errp);
 659    if (!blk) {
 660        error_prepend(errp, "Protocol driver '%s' does not support image "
 661                      "creation, and opening the image failed: ",
 662                      drv->format_name);
 663        return -EINVAL;
 664    }
 665
 666    size = create_file_fallback_truncate(blk, size, errp);
 667    if (size < 0) {
 668        ret = size;
 669        goto out;
 670    }
 671
 672    ret = create_file_fallback_zero_first_sector(blk, size, errp);
 673    if (ret < 0) {
 674        goto out;
 675    }
 676
 677    ret = 0;
 678out:
 679    blk_unref(blk);
 680    return ret;
 681}
 682
 683int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
 684{
 685    QemuOpts *protocol_opts;
 686    BlockDriver *drv;
 687    QDict *qdict;
 688    int ret;
 689
 690    drv = bdrv_find_protocol(filename, true, errp);
 691    if (drv == NULL) {
 692        return -ENOENT;
 693    }
 694
 695    if (!drv->create_opts) {
 696        error_setg(errp, "Driver '%s' does not support image creation",
 697                   drv->format_name);
 698        return -ENOTSUP;
 699    }
 700
 701    /*
 702     * 'opts' contains a QemuOptsList with a combination of format and protocol
 703     * default values.
 704     *
 705     * The format properly removes its options, but the default values remain
 706     * in 'opts->list'.  So if the protocol has options with the same name
 707     * (e.g. rbd has 'cluster_size' as qcow2), it will see the default values
 708     * of the format, since for overlapping options, the format wins.
 709     *
 710     * To avoid this issue, lets convert QemuOpts to QDict, in this way we take
 711     * only the set options, and then convert it back to QemuOpts, using the
 712     * create_opts of the protocol. So the new QemuOpts, will contain only the
 713     * protocol defaults.
 714     */
 715    qdict = qemu_opts_to_qdict(opts, NULL);
 716    protocol_opts = qemu_opts_from_qdict(drv->create_opts, qdict, errp);
 717    if (protocol_opts == NULL) {
 718        ret = -EINVAL;
 719        goto out;
 720    }
 721
 722    ret = bdrv_create(drv, filename, protocol_opts, errp);
 723out:
 724    qemu_opts_del(protocol_opts);
 725    qobject_unref(qdict);
 726    return ret;
 727}
 728
 729int coroutine_fn bdrv_co_delete_file(BlockDriverState *bs, Error **errp)
 730{
 731    Error *local_err = NULL;
 732    int ret;
 733
 734    assert(bs != NULL);
 735
 736    if (!bs->drv) {
 737        error_setg(errp, "Block node '%s' is not opened", bs->filename);
 738        return -ENOMEDIUM;
 739    }
 740
 741    if (!bs->drv->bdrv_co_delete_file) {
 742        error_setg(errp, "Driver '%s' does not support image deletion",
 743                   bs->drv->format_name);
 744        return -ENOTSUP;
 745    }
 746
 747    ret = bs->drv->bdrv_co_delete_file(bs, &local_err);
 748    if (ret < 0) {
 749        error_propagate(errp, local_err);
 750    }
 751
 752    return ret;
 753}
 754
 755void coroutine_fn bdrv_co_delete_file_noerr(BlockDriverState *bs)
 756{
 757    Error *local_err = NULL;
 758    int ret;
 759
 760    if (!bs) {
 761        return;
 762    }
 763
 764    ret = bdrv_co_delete_file(bs, &local_err);
 765    /*
 766     * ENOTSUP will happen if the block driver doesn't support
 767     * the 'bdrv_co_delete_file' interface. This is a predictable
 768     * scenario and shouldn't be reported back to the user.
 769     */
 770    if (ret == -ENOTSUP) {
 771        error_free(local_err);
 772    } else if (ret < 0) {
 773        error_report_err(local_err);
 774    }
 775}
 776
 777/**
 778 * Try to get @bs's logical and physical block size.
 779 * On success, store them in @bsz struct and return 0.
 780 * On failure return -errno.
 781 * @bs must not be empty.
 782 */
 783int bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
 784{
 785    BlockDriver *drv = bs->drv;
 786    BlockDriverState *filtered = bdrv_filter_bs(bs);
 787
 788    if (drv && drv->bdrv_probe_blocksizes) {
 789        return drv->bdrv_probe_blocksizes(bs, bsz);
 790    } else if (filtered) {
 791        return bdrv_probe_blocksizes(filtered, bsz);
 792    }
 793
 794    return -ENOTSUP;
 795}
 796
 797/**
 798 * Try to get @bs's geometry (cyls, heads, sectors).
 799 * On success, store them in @geo struct and return 0.
 800 * On failure return -errno.
 801 * @bs must not be empty.
 802 */
 803int bdrv_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
 804{
 805    BlockDriver *drv = bs->drv;
 806    BlockDriverState *filtered = bdrv_filter_bs(bs);
 807
 808    if (drv && drv->bdrv_probe_geometry) {
 809        return drv->bdrv_probe_geometry(bs, geo);
 810    } else if (filtered) {
 811        return bdrv_probe_geometry(filtered, geo);
 812    }
 813
 814    return -ENOTSUP;
 815}
 816
 817/*
 818 * Create a uniquely-named empty temporary file.
 819 * Return 0 upon success, otherwise a negative errno value.
 820 */
 821int get_tmp_filename(char *filename, int size)
 822{
 823#ifdef _WIN32
 824    char temp_dir[MAX_PATH];
 825    /* GetTempFileName requires that its output buffer (4th param)
 826       have length MAX_PATH or greater.  */
 827    assert(size >= MAX_PATH);
 828    return (GetTempPath(MAX_PATH, temp_dir)
 829            && GetTempFileName(temp_dir, "qem", 0, filename)
 830            ? 0 : -GetLastError());
 831#else
 832    int fd;
 833    const char *tmpdir;
 834    tmpdir = getenv("TMPDIR");
 835    if (!tmpdir) {
 836        tmpdir = "/var/tmp";
 837    }
 838    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
 839        return -EOVERFLOW;
 840    }
 841    fd = mkstemp(filename);
 842    if (fd < 0) {
 843        return -errno;
 844    }
 845    if (close(fd) != 0) {
 846        unlink(filename);
 847        return -errno;
 848    }
 849    return 0;
 850#endif
 851}
 852
 853/*
 854 * Detect host devices. By convention, /dev/cdrom[N] is always
 855 * recognized as a host CDROM.
 856 */
 857static BlockDriver *find_hdev_driver(const char *filename)
 858{
 859    int score_max = 0, score;
 860    BlockDriver *drv = NULL, *d;
 861
 862    QLIST_FOREACH(d, &bdrv_drivers, list) {
 863        if (d->bdrv_probe_device) {
 864            score = d->bdrv_probe_device(filename);
 865            if (score > score_max) {
 866                score_max = score;
 867                drv = d;
 868            }
 869        }
 870    }
 871
 872    return drv;
 873}
 874
 875static BlockDriver *bdrv_do_find_protocol(const char *protocol)
 876{
 877    BlockDriver *drv1;
 878
 879    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
 880        if (drv1->protocol_name && !strcmp(drv1->protocol_name, protocol)) {
 881            return drv1;
 882        }
 883    }
 884
 885    return NULL;
 886}
 887
 888BlockDriver *bdrv_find_protocol(const char *filename,
 889                                bool allow_protocol_prefix,
 890                                Error **errp)
 891{
 892    BlockDriver *drv1;
 893    char protocol[128];
 894    int len;
 895    const char *p;
 896    int i;
 897
 898    /* TODO Drivers without bdrv_file_open must be specified explicitly */
 899
 900    /*
 901     * XXX(hch): we really should not let host device detection
 902     * override an explicit protocol specification, but moving this
 903     * later breaks access to device names with colons in them.
 904     * Thanks to the brain-dead persistent naming schemes on udev-
 905     * based Linux systems those actually are quite common.
 906     */
 907    drv1 = find_hdev_driver(filename);
 908    if (drv1) {
 909        return drv1;
 910    }
 911
 912    if (!path_has_protocol(filename) || !allow_protocol_prefix) {
 913        return &bdrv_file;
 914    }
 915
 916    p = strchr(filename, ':');
 917    assert(p != NULL);
 918    len = p - filename;
 919    if (len > sizeof(protocol) - 1)
 920        len = sizeof(protocol) - 1;
 921    memcpy(protocol, filename, len);
 922    protocol[len] = '\0';
 923
 924    drv1 = bdrv_do_find_protocol(protocol);
 925    if (drv1) {
 926        return drv1;
 927    }
 928
 929    for (i = 0; i < (int)ARRAY_SIZE(block_driver_modules); ++i) {
 930        if (block_driver_modules[i].protocol_name &&
 931            !strcmp(block_driver_modules[i].protocol_name, protocol)) {
 932            block_module_load_one(block_driver_modules[i].library_name);
 933            break;
 934        }
 935    }
 936
 937    drv1 = bdrv_do_find_protocol(protocol);
 938    if (!drv1) {
 939        error_setg(errp, "Unknown protocol '%s'", protocol);
 940    }
 941    return drv1;
 942}
 943
 944/*
 945 * Guess image format by probing its contents.
 946 * This is not a good idea when your image is raw (CVE-2008-2004), but
 947 * we do it anyway for backward compatibility.
 948 *
 949 * @buf         contains the image's first @buf_size bytes.
 950 * @buf_size    is the buffer size in bytes (generally BLOCK_PROBE_BUF_SIZE,
 951 *              but can be smaller if the image file is smaller)
 952 * @filename    is its filename.
 953 *
 954 * For all block drivers, call the bdrv_probe() method to get its
 955 * probing score.
 956 * Return the first block driver with the highest probing score.
 957 */
 958BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
 959                            const char *filename)
 960{
 961    int score_max = 0, score;
 962    BlockDriver *drv = NULL, *d;
 963
 964    QLIST_FOREACH(d, &bdrv_drivers, list) {
 965        if (d->bdrv_probe) {
 966            score = d->bdrv_probe(buf, buf_size, filename);
 967            if (score > score_max) {
 968                score_max = score;
 969                drv = d;
 970            }
 971        }
 972    }
 973
 974    return drv;
 975}
 976
 977static int find_image_format(BlockBackend *file, const char *filename,
 978                             BlockDriver **pdrv, Error **errp)
 979{
 980    BlockDriver *drv;
 981    uint8_t buf[BLOCK_PROBE_BUF_SIZE];
 982    int ret = 0;
 983
 984    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
 985    if (blk_is_sg(file) || !blk_is_inserted(file) || blk_getlength(file) == 0) {
 986        *pdrv = &bdrv_raw;
 987        return ret;
 988    }
 989
 990    ret = blk_pread(file, 0, buf, sizeof(buf));
 991    if (ret < 0) {
 992        error_setg_errno(errp, -ret, "Could not read image for determining its "
 993                         "format");
 994        *pdrv = NULL;
 995        return ret;
 996    }
 997
 998    drv = bdrv_probe_all(buf, ret, filename);
 999    if (!drv) {
1000        error_setg(errp, "Could not determine image format: No compatible "
1001                   "driver found");
1002        ret = -ENOENT;
1003    }
1004    *pdrv = drv;
1005    return ret;
1006}
1007
1008/**
1009 * Set the current 'total_sectors' value
1010 * Return 0 on success, -errno on error.
1011 */
1012int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
1013{
1014    BlockDriver *drv = bs->drv;
1015
1016    if (!drv) {
1017        return -ENOMEDIUM;
1018    }
1019
1020    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
1021    if (bdrv_is_sg(bs))
1022        return 0;
1023
1024    /* query actual device if possible, otherwise just trust the hint */
1025    if (drv->bdrv_getlength) {
1026        int64_t length = drv->bdrv_getlength(bs);
1027        if (length < 0) {
1028            return length;
1029        }
1030        hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
1031    }
1032
1033    bs->total_sectors = hint;
1034
1035    if (bs->total_sectors * BDRV_SECTOR_SIZE > BDRV_MAX_LENGTH) {
1036        return -EFBIG;
1037    }
1038
1039    return 0;
1040}
1041
1042/**
1043 * Combines a QDict of new block driver @options with any missing options taken
1044 * from @old_options, so that leaving out an option defaults to its old value.
1045 */
1046static void bdrv_join_options(BlockDriverState *bs, QDict *options,
1047                              QDict *old_options)
1048{
1049    if (bs->drv && bs->drv->bdrv_join_options) {
1050        bs->drv->bdrv_join_options(options, old_options);
1051    } else {
1052        qdict_join(options, old_options, false);
1053    }
1054}
1055
1056static BlockdevDetectZeroesOptions bdrv_parse_detect_zeroes(QemuOpts *opts,
1057                                                            int open_flags,
1058                                                            Error **errp)
1059{
1060    Error *local_err = NULL;
1061    char *value = qemu_opt_get_del(opts, "detect-zeroes");
1062    BlockdevDetectZeroesOptions detect_zeroes =
1063        qapi_enum_parse(&BlockdevDetectZeroesOptions_lookup, value,
1064                        BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF, &local_err);
1065    g_free(value);
1066    if (local_err) {
1067        error_propagate(errp, local_err);
1068        return detect_zeroes;
1069    }
1070
1071    if (detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP &&
1072        !(open_flags & BDRV_O_UNMAP))
1073    {
1074        error_setg(errp, "setting detect-zeroes to unmap is not allowed "
1075                   "without setting discard operation to unmap");
1076    }
1077
1078    return detect_zeroes;
1079}
1080
1081/**
1082 * Set open flags for aio engine
1083 *
1084 * Return 0 on success, -1 if the engine specified is invalid
1085 */
1086int bdrv_parse_aio(const char *mode, int *flags)
1087{
1088    if (!strcmp(mode, "threads")) {
1089        /* do nothing, default */
1090    } else if (!strcmp(mode, "native")) {
1091        *flags |= BDRV_O_NATIVE_AIO;
1092#ifdef CONFIG_LINUX_IO_URING
1093    } else if (!strcmp(mode, "io_uring")) {
1094        *flags |= BDRV_O_IO_URING;
1095#endif
1096    } else {
1097        return -1;
1098    }
1099
1100    return 0;
1101}
1102
1103/**
1104 * Set open flags for a given discard mode
1105 *
1106 * Return 0 on success, -1 if the discard mode was invalid.
1107 */
1108int bdrv_parse_discard_flags(const char *mode, int *flags)
1109{
1110    *flags &= ~BDRV_O_UNMAP;
1111
1112    if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
1113        /* do nothing */
1114    } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
1115        *flags |= BDRV_O_UNMAP;
1116    } else {
1117        return -1;
1118    }
1119
1120    return 0;
1121}
1122
1123/**
1124 * Set open flags for a given cache mode
1125 *
1126 * Return 0 on success, -1 if the cache mode was invalid.
1127 */
1128int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough)
1129{
1130    *flags &= ~BDRV_O_CACHE_MASK;
1131
1132    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
1133        *writethrough = false;
1134        *flags |= BDRV_O_NOCACHE;
1135    } else if (!strcmp(mode, "directsync")) {
1136        *writethrough = true;
1137        *flags |= BDRV_O_NOCACHE;
1138    } else if (!strcmp(mode, "writeback")) {
1139        *writethrough = false;
1140    } else if (!strcmp(mode, "unsafe")) {
1141        *writethrough = false;
1142        *flags |= BDRV_O_NO_FLUSH;
1143    } else if (!strcmp(mode, "writethrough")) {
1144        *writethrough = true;
1145    } else {
1146        return -1;
1147    }
1148
1149    return 0;
1150}
1151
1152static char *bdrv_child_get_parent_desc(BdrvChild *c)
1153{
1154    BlockDriverState *parent = c->opaque;
1155    return g_strdup_printf("node '%s'", bdrv_get_node_name(parent));
1156}
1157
1158static void bdrv_child_cb_drained_begin(BdrvChild *child)
1159{
1160    BlockDriverState *bs = child->opaque;
1161    bdrv_do_drained_begin_quiesce(bs, NULL, false);
1162}
1163
1164static bool bdrv_child_cb_drained_poll(BdrvChild *child)
1165{
1166    BlockDriverState *bs = child->opaque;
1167    return bdrv_drain_poll(bs, false, NULL, false);
1168}
1169
1170static void bdrv_child_cb_drained_end(BdrvChild *child,
1171                                      int *drained_end_counter)
1172{
1173    BlockDriverState *bs = child->opaque;
1174    bdrv_drained_end_no_poll(bs, drained_end_counter);
1175}
1176
1177static int bdrv_child_cb_inactivate(BdrvChild *child)
1178{
1179    BlockDriverState *bs = child->opaque;
1180    assert(bs->open_flags & BDRV_O_INACTIVE);
1181    return 0;
1182}
1183
1184static bool bdrv_child_cb_can_set_aio_ctx(BdrvChild *child, AioContext *ctx,
1185                                          GSList **ignore, Error **errp)
1186{
1187    BlockDriverState *bs = child->opaque;
1188    return bdrv_can_set_aio_context(bs, ctx, ignore, errp);
1189}
1190
1191static void bdrv_child_cb_set_aio_ctx(BdrvChild *child, AioContext *ctx,
1192                                      GSList **ignore)
1193{
1194    BlockDriverState *bs = child->opaque;
1195    return bdrv_set_aio_context_ignore(bs, ctx, ignore);
1196}
1197
1198/*
1199 * Returns the options and flags that a temporary snapshot should get, based on
1200 * the originally requested flags (the originally requested image will have
1201 * flags like a backing file)
1202 */
1203static void bdrv_temp_snapshot_options(int *child_flags, QDict *child_options,
1204                                       int parent_flags, QDict *parent_options)
1205{
1206    *child_flags = (parent_flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
1207
1208    /* For temporary files, unconditional cache=unsafe is fine */
1209    qdict_set_default_str(child_options, BDRV_OPT_CACHE_DIRECT, "off");
1210    qdict_set_default_str(child_options, BDRV_OPT_CACHE_NO_FLUSH, "on");
1211
1212    /* Copy the read-only and discard options from the parent */
1213    qdict_copy_default(child_options, parent_options, BDRV_OPT_READ_ONLY);
1214    qdict_copy_default(child_options, parent_options, BDRV_OPT_DISCARD);
1215
1216    /* aio=native doesn't work for cache.direct=off, so disable it for the
1217     * temporary snapshot */
1218    *child_flags &= ~BDRV_O_NATIVE_AIO;
1219}
1220
1221static void bdrv_backing_attach(BdrvChild *c)
1222{
1223    BlockDriverState *parent = c->opaque;
1224    BlockDriverState *backing_hd = c->bs;
1225
1226    assert(!parent->backing_blocker);
1227    error_setg(&parent->backing_blocker,
1228               "node is used as backing hd of '%s'",
1229               bdrv_get_device_or_node_name(parent));
1230
1231    bdrv_refresh_filename(backing_hd);
1232
1233    parent->open_flags &= ~BDRV_O_NO_BACKING;
1234
1235    bdrv_op_block_all(backing_hd, parent->backing_blocker);
1236    /* Otherwise we won't be able to commit or stream */
1237    bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET,
1238                    parent->backing_blocker);
1239    bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_STREAM,
1240                    parent->backing_blocker);
1241    /*
1242     * We do backup in 3 ways:
1243     * 1. drive backup
1244     *    The target bs is new opened, and the source is top BDS
1245     * 2. blockdev backup
1246     *    Both the source and the target are top BDSes.
1247     * 3. internal backup(used for block replication)
1248     *    Both the source and the target are backing file
1249     *
1250     * In case 1 and 2, neither the source nor the target is the backing file.
1251     * In case 3, we will block the top BDS, so there is only one block job
1252     * for the top BDS and its backing chain.
1253     */
1254    bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_SOURCE,
1255                    parent->backing_blocker);
1256    bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_TARGET,
1257                    parent->backing_blocker);
1258}
1259
1260static void bdrv_backing_detach(BdrvChild *c)
1261{
1262    BlockDriverState *parent = c->opaque;
1263
1264    assert(parent->backing_blocker);
1265    bdrv_op_unblock_all(c->bs, parent->backing_blocker);
1266    error_free(parent->backing_blocker);
1267    parent->backing_blocker = NULL;
1268}
1269
1270static int bdrv_backing_update_filename(BdrvChild *c, BlockDriverState *base,
1271                                        const char *filename, Error **errp)
1272{
1273    BlockDriverState *parent = c->opaque;
1274    bool read_only = bdrv_is_read_only(parent);
1275    int ret;
1276
1277    if (read_only) {
1278        ret = bdrv_reopen_set_read_only(parent, false, errp);
1279        if (ret < 0) {
1280            return ret;
1281        }
1282    }
1283
1284    ret = bdrv_change_backing_file(parent, filename,
1285                                   base->drv ? base->drv->format_name : "",
1286                                   false);
1287    if (ret < 0) {
1288        error_setg_errno(errp, -ret, "Could not update backing file link");
1289    }
1290
1291    if (read_only) {
1292        bdrv_reopen_set_read_only(parent, true, NULL);
1293    }
1294
1295    return ret;
1296}
1297
1298/*
1299 * Returns the options and flags that a generic child of a BDS should
1300 * get, based on the given options and flags for the parent BDS.
1301 */
1302static void bdrv_inherited_options(BdrvChildRole role, bool parent_is_format,
1303                                   int *child_flags, QDict *child_options,
1304                                   int parent_flags, QDict *parent_options)
1305{
1306    int flags = parent_flags;
1307
1308    /*
1309     * First, decide whether to set, clear, or leave BDRV_O_PROTOCOL.
1310     * Generally, the question to answer is: Should this child be
1311     * format-probed by default?
1312     */
1313
1314    /*
1315     * Pure and non-filtered data children of non-format nodes should
1316     * be probed by default (even when the node itself has BDRV_O_PROTOCOL
1317     * set).  This only affects a very limited set of drivers (namely
1318     * quorum and blkverify when this comment was written).
1319     * Force-clear BDRV_O_PROTOCOL then.
1320     */
1321    if (!parent_is_format &&
1322        (role & BDRV_CHILD_DATA) &&
1323        !(role & (BDRV_CHILD_METADATA | BDRV_CHILD_FILTERED)))
1324    {
1325        flags &= ~BDRV_O_PROTOCOL;
1326    }
1327
1328    /*
1329     * All children of format nodes (except for COW children) and all
1330     * metadata children in general should never be format-probed.
1331     * Force-set BDRV_O_PROTOCOL then.
1332     */
1333    if ((parent_is_format && !(role & BDRV_CHILD_COW)) ||
1334        (role & BDRV_CHILD_METADATA))
1335    {
1336        flags |= BDRV_O_PROTOCOL;
1337    }
1338
1339    /*
1340     * If the cache mode isn't explicitly set, inherit direct and no-flush from
1341     * the parent.
1342     */
1343    qdict_copy_default(child_options, parent_options, BDRV_OPT_CACHE_DIRECT);
1344    qdict_copy_default(child_options, parent_options, BDRV_OPT_CACHE_NO_FLUSH);
1345    qdict_copy_default(child_options, parent_options, BDRV_OPT_FORCE_SHARE);
1346
1347    if (role & BDRV_CHILD_COW) {
1348        /* backing files are opened read-only by default */
1349        qdict_set_default_str(child_options, BDRV_OPT_READ_ONLY, "on");
1350        qdict_set_default_str(child_options, BDRV_OPT_AUTO_READ_ONLY, "off");
1351    } else {
1352        /* Inherit the read-only option from the parent if it's not set */
1353        qdict_copy_default(child_options, parent_options, BDRV_OPT_READ_ONLY);
1354        qdict_copy_default(child_options, parent_options,
1355                           BDRV_OPT_AUTO_READ_ONLY);
1356    }
1357
1358    /*
1359     * bdrv_co_pdiscard() respects unmap policy for the parent, so we
1360     * can default to enable it on lower layers regardless of the
1361     * parent option.
1362     */
1363    qdict_set_default_str(child_options, BDRV_OPT_DISCARD, "unmap");
1364
1365    /* Clear flags that only apply to the top layer */
1366    flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);
1367
1368    if (role & BDRV_CHILD_METADATA) {
1369        flags &= ~BDRV_O_NO_IO;
1370    }
1371    if (role & BDRV_CHILD_COW) {
1372        flags &= ~BDRV_O_TEMPORARY;
1373    }
1374
1375    *child_flags = flags;
1376}
1377
1378static void bdrv_child_cb_attach(BdrvChild *child)
1379{
1380    BlockDriverState *bs = child->opaque;
1381
1382    if (child->role & BDRV_CHILD_COW) {
1383        bdrv_backing_attach(child);
1384    }
1385
1386    bdrv_apply_subtree_drain(child, bs);
1387}
1388
1389static void bdrv_child_cb_detach(BdrvChild *child)
1390{
1391    BlockDriverState *bs = child->opaque;
1392
1393    if (child->role & BDRV_CHILD_COW) {
1394        bdrv_backing_detach(child);
1395    }
1396
1397    bdrv_unapply_subtree_drain(child, bs);
1398}
1399
1400static int bdrv_child_cb_update_filename(BdrvChild *c, BlockDriverState *base,
1401                                         const char *filename, Error **errp)
1402{
1403    if (c->role & BDRV_CHILD_COW) {
1404        return bdrv_backing_update_filename(c, base, filename, errp);
1405    }
1406    return 0;
1407}
1408
1409AioContext *child_of_bds_get_parent_aio_context(BdrvChild *c)
1410{
1411    BlockDriverState *bs = c->opaque;
1412
1413    return bdrv_get_aio_context(bs);
1414}
1415
/*
 * Callback table for children whose parent is a block node
 * (.parent_is_bds is true).  The callbacks installed here read the parent
 * BlockDriverState from BdrvChild.opaque.
 */
const BdrvChildClass child_of_bds = {
    .parent_is_bds   = true,
    .get_parent_desc = bdrv_child_get_parent_desc,
    .inherit_options = bdrv_inherited_options,
    .drained_begin   = bdrv_child_cb_drained_begin,
    .drained_poll    = bdrv_child_cb_drained_poll,
    .drained_end     = bdrv_child_cb_drained_end,
    .attach          = bdrv_child_cb_attach,
    .detach          = bdrv_child_cb_detach,
    .inactivate      = bdrv_child_cb_inactivate,
    .can_set_aio_ctx = bdrv_child_cb_can_set_aio_ctx,
    .set_aio_ctx     = bdrv_child_cb_set_aio_ctx,
    .update_filename = bdrv_child_cb_update_filename,
    .get_parent_aio_context = child_of_bds_get_parent_aio_context,
};
1431
1432AioContext *bdrv_child_get_parent_aio_context(BdrvChild *c)
1433{
1434    return c->klass->get_parent_aio_context(c);
1435}
1436
1437static int bdrv_open_flags(BlockDriverState *bs, int flags)
1438{
1439    int open_flags = flags;
1440
1441    /*
1442     * Clear flags that are internal to the block layer before opening the
1443     * image.
1444     */
1445    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);
1446
1447    return open_flags;
1448}
1449
1450static void update_flags_from_options(int *flags, QemuOpts *opts)
1451{
1452    *flags &= ~(BDRV_O_CACHE_MASK | BDRV_O_RDWR | BDRV_O_AUTO_RDONLY);
1453
1454    if (qemu_opt_get_bool_del(opts, BDRV_OPT_CACHE_NO_FLUSH, false)) {
1455        *flags |= BDRV_O_NO_FLUSH;
1456    }
1457
1458    if (qemu_opt_get_bool_del(opts, BDRV_OPT_CACHE_DIRECT, false)) {
1459        *flags |= BDRV_O_NOCACHE;
1460    }
1461
1462    if (!qemu_opt_get_bool_del(opts, BDRV_OPT_READ_ONLY, false)) {
1463        *flags |= BDRV_O_RDWR;
1464    }
1465
1466    if (qemu_opt_get_bool_del(opts, BDRV_OPT_AUTO_READ_ONLY, false)) {
1467        *flags |= BDRV_O_AUTO_RDONLY;
1468    }
1469}
1470
1471static void update_options_from_flags(QDict *options, int flags)
1472{
1473    if (!qdict_haskey(options, BDRV_OPT_CACHE_DIRECT)) {
1474        qdict_put_bool(options, BDRV_OPT_CACHE_DIRECT, flags & BDRV_O_NOCACHE);
1475    }
1476    if (!qdict_haskey(options, BDRV_OPT_CACHE_NO_FLUSH)) {
1477        qdict_put_bool(options, BDRV_OPT_CACHE_NO_FLUSH,
1478                       flags & BDRV_O_NO_FLUSH);
1479    }
1480    if (!qdict_haskey(options, BDRV_OPT_READ_ONLY)) {
1481        qdict_put_bool(options, BDRV_OPT_READ_ONLY, !(flags & BDRV_O_RDWR));
1482    }
1483    if (!qdict_haskey(options, BDRV_OPT_AUTO_READ_ONLY)) {
1484        qdict_put_bool(options, BDRV_OPT_AUTO_READ_ONLY,
1485                       flags & BDRV_O_AUTO_RDONLY);
1486    }
1487}
1488
1489static void bdrv_assign_node_name(BlockDriverState *bs,
1490                                  const char *node_name,
1491                                  Error **errp)
1492{
1493    char *gen_node_name = NULL;
1494
1495    if (!node_name) {
1496        node_name = gen_node_name = id_generate(ID_BLOCK);
1497    } else if (!id_wellformed(node_name)) {
1498        /*
1499         * Check for empty string or invalid characters, but not if it is
1500         * generated (generated names use characters not available to the user)
1501         */
1502        error_setg(errp, "Invalid node-name: '%s'", node_name);
1503        return;
1504    }
1505
1506    /* takes care of avoiding namespaces collisions */
1507    if (blk_by_name(node_name)) {
1508        error_setg(errp, "node-name=%s is conflicting with a device id",
1509                   node_name);
1510        goto out;
1511    }
1512
1513    /* takes care of avoiding duplicates node names */
1514    if (bdrv_find_node(node_name)) {
1515        error_setg(errp, "Duplicate nodes with node-name='%s'", node_name);
1516        goto out;
1517    }
1518
1519    /* Make sure that the node name isn't truncated */
1520    if (strlen(node_name) >= sizeof(bs->node_name)) {
1521        error_setg(errp, "Node name too long");
1522        goto out;
1523    }
1524
1525    /* copy node name into the bs and insert it into the graph list */
1526    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
1527    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
1528out:
1529    g_free(gen_node_name);
1530}
1531
/*
 * Bind @drv to @bs and open it.
 *
 * Assigns the node name, allocates the driver state (bs->opaque, zeroed,
 * drv->instance_size bytes), calls the driver's bdrv_file_open() or
 * bdrv_open() callback, then refreshes the sector count and the limits.
 *
 * Returns 0 on success and a negative errno with @errp set on failure.
 *
 * Note the asymmetric cleanup: when the driver's open callback fails,
 * bs->drv, bs->file and bs->opaque are reset here (open_failed).  When
 * node-name assignment, refresh_total_sectors() or bdrv_refresh_limits()
 * fails, the function returns directly and leaves the state as it is.
 * NOTE(review): presumably the caller is expected to release @bs in those
 * cases - confirm against the callers.
 */
static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv,
                            const char *node_name, QDict *options,
                            int open_flags, Error **errp)
{
    Error *local_err = NULL;
    int i, ret;

    bdrv_assign_node_name(bs, node_name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    /* bdrv_file_open() takes precedence over bdrv_open() if both exist */
    if (drv->bdrv_file_open) {
        assert(!drv->bdrv_needs_filename || bs->filename[0]);
        ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
    } else if (drv->bdrv_open) {
        ret = drv->bdrv_open(bs, options, open_flags, &local_err);
    } else {
        ret = 0;
    }

    if (ret < 0) {
        /* Prefer the driver's own error; otherwise build a generic one */
        if (local_err) {
            error_propagate(errp, local_err);
        } else if (bs->filename[0]) {
            error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
        } else {
            error_setg_errno(errp, -ret, "Could not open image");
        }
        goto open_failed;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
        return ret;
    }

    bdrv_refresh_limits(bs, NULL, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }

    assert(bdrv_opt_mem_align(bs) != 0);
    assert(bdrv_min_mem_align(bs) != 0);
    assert(is_power_of_2(bs->bl.request_alignment));

    /*
     * Call the driver's drain_begin callback once per level recorded in
     * bs->quiesce_counter.
     */
    for (i = 0; i < bs->quiesce_counter; i++) {
        if (drv->bdrv_co_drain_begin) {
            drv->bdrv_co_drain_begin(bs);
        }
    }

    return 0;
open_failed:
    bs->drv = NULL;
    if (bs->file != NULL) {
        bdrv_unref_child(bs, bs->file);
        bs->file = NULL;
    }
    g_free(bs->opaque);
    bs->opaque = NULL;
    return ret;
}
1601
1602BlockDriverState *bdrv_new_open_driver(BlockDriver *drv, const char *node_name,
1603                                       int flags, Error **errp)
1604{
1605    BlockDriverState *bs;
1606    int ret;
1607
1608    bs = bdrv_new();
1609    bs->open_flags = flags;
1610    bs->explicit_options = qdict_new();
1611    bs->options = qdict_new();
1612    bs->opaque = NULL;
1613
1614    update_options_from_flags(bs->options, flags);
1615
1616    ret = bdrv_open_driver(bs, drv, node_name, bs->options, flags, errp);
1617    if (ret < 0) {
1618        qobject_unref(bs->explicit_options);
1619        bs->explicit_options = NULL;
1620        qobject_unref(bs->options);
1621        bs->options = NULL;
1622        bdrv_unref(bs);
1623        return NULL;
1624    }
1625
1626    return bs;
1627}
1628
/*
 * Options understood by the generic block layer for every node.
 * bdrv_open_common() absorbs them from the options QDict into a QemuOpts
 * created from this list.
 */
QemuOptsList bdrv_runtime_opts = {
    .name = "bdrv_common",
    .head = QTAILQ_HEAD_INITIALIZER(bdrv_runtime_opts.head),
    .desc = {
        {
            .name = "node-name",
            .type = QEMU_OPT_STRING,
            .help = "Node name of the block device node",
        },
        {
            .name = "driver",
            .type = QEMU_OPT_STRING,
            .help = "Block driver to use for the node",
        },
        {
            .name = BDRV_OPT_CACHE_DIRECT,
            .type = QEMU_OPT_BOOL,
            .help = "Bypass software writeback cache on the host",
        },
        {
            .name = BDRV_OPT_CACHE_NO_FLUSH,
            .type = QEMU_OPT_BOOL,
            .help = "Ignore flush requests",
        },
        {
            .name = BDRV_OPT_READ_ONLY,
            .type = QEMU_OPT_BOOL,
            .help = "Node is opened in read-only mode",
        },
        {
            .name = BDRV_OPT_AUTO_READ_ONLY,
            .type = QEMU_OPT_BOOL,
            .help = "Node can become read-only if opening read-write fails",
        },
        {
            /* parsed by bdrv_parse_detect_zeroes() */
            .name = "detect-zeroes",
            .type = QEMU_OPT_STRING,
            .help = "try to optimize zero writes (off, on, unmap)",
        },
        {
            /* parsed by bdrv_parse_discard_flags() */
            .name = BDRV_OPT_DISCARD,
            .type = QEMU_OPT_STRING,
            .help = "discard operation (ignore/off, unmap/on)",
        },
        {
            .name = BDRV_OPT_FORCE_SHARE,
            .type = QEMU_OPT_BOOL,
            .help = "always accept other writers (default: off)",
        },
        { /* end of list */ }
    },
};
1681
/*
 * Minimal set of image creation options: the virtual disk size and a
 * preallocation mode (per the help text, only "off" is allowed).
 * The users of this list are outside this section of the file.
 */
QemuOptsList bdrv_create_opts_simple = {
    .name = "simple-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(bdrv_create_opts_simple.head),
    .desc = {
        {
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size"
        },
        {
            .name = BLOCK_OPT_PREALLOC,
            .type = QEMU_OPT_STRING,
            .help = "Preallocation mode (allowed values: off)"
        },
        { /* end of list */ }
    }
};
1699
/*
 * Common part for opening disk images and files
 *
 * Absorbs the generic block layer options (bdrv_runtime_opts) from
 * @options, derives bs->open_flags and related node state from them,
 * validates the combination, and finally opens the driver via
 * bdrv_open_driver().
 *
 * @bs       node to open; bs->file must be NULL.
 * @file     optional BlockBackend whose node provides the filename; when
 *           NULL, the "filename" entry of @options is used instead.
 * @options  must be non-NULL and must not be bs->options.
 *
 * Removes all processed options from *options.
 *
 * Returns 0 on success, a negative errno with @errp set on failure.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockBackend *file,
                            QDict *options, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *driver_name = NULL;
    const char *node_name = NULL;
    const char *discard;
    QemuOpts *opts;
    BlockDriver *drv;
    Error *local_err = NULL;
    bool ro;

    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    opts = qemu_opts_create(&bdrv_runtime_opts, NULL, 0, &error_abort);
    if (!qemu_opts_absorb_qdict(opts, options, errp)) {
        ret = -EINVAL;
        goto fail_opts;
    }

    update_flags_from_options(&bs->open_flags, opts);

    /* The driver is asserted to be known at this point */
    driver_name = qemu_opt_get(opts, "driver");
    drv = bdrv_find_format(driver_name);
    assert(drv != NULL);

    bs->force_share = qemu_opt_get_bool(opts, BDRV_OPT_FORCE_SHARE, false);

    if (bs->force_share && (bs->open_flags & BDRV_O_RDWR)) {
        error_setg(errp,
                   BDRV_OPT_FORCE_SHARE
                   "=on can only be used with read-only images");
        ret = -EINVAL;
        goto fail_opts;
    }

    if (file != NULL) {
        bdrv_refresh_filename(blk_bs(file));
        filename = blk_bs(file)->filename;
    } else {
        /*
         * Caution: while qdict_get_try_str() is fine, getting
         * non-string types would require more care.  When @options
         * come from -blockdev or blockdev_add, its members are typed
         * according to the QAPI schema, but when they come from
         * -drive, they're all QString.
         */
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && (!filename || !filename[0])) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        ret = -EINVAL;
        goto fail_opts;
    }

    trace_bdrv_open_common(bs, filename ?: "", bs->open_flags,
                           drv->format_name);

    ro = bdrv_is_read_only(bs);

    /*
     * Whitelist check: a driver that is only whitelisted for read-only use
     * may still be opened if the node can be switched to read-only
     * automatically.
     */
    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, ro)) {
        if (!ro && bdrv_is_whitelisted(drv, true)) {
            ret = bdrv_apply_auto_read_only(bs, NULL, NULL);
        } else {
            ret = -ENOTSUP;
        }
        if (ret < 0) {
            error_setg(errp,
                       !ro && bdrv_is_whitelisted(drv, true)
                       ? "Driver '%s' can only be used for read-only devices"
                       : "Driver '%s' is not whitelisted",
                       drv->format_name);
            goto fail_opts;
        }
    }

    /* bdrv_new() and bdrv_close() make it so */
    assert(qatomic_read(&bs->copy_on_read) == 0);

    if (bs->open_flags & BDRV_O_COPY_ON_READ) {
        if (!ro) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            ret = -EINVAL;
            goto fail_opts;
        }
    }

    discard = qemu_opt_get(opts, BDRV_OPT_DISCARD);
    if (discard != NULL) {
        if (bdrv_parse_discard_flags(discard, &bs->open_flags) != 0) {
            error_setg(errp, "Invalid discard option");
            ret = -EINVAL;
            goto fail_opts;
        }
    }

    /* Must come after the discard option: "unmap" requires BDRV_O_UNMAP */
    bs->detect_zeroes =
        bdrv_parse_detect_zeroes(opts, bs->open_flags, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto fail_opts;
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);

    /* Open the image, either directly or using a protocol */
    open_flags = bdrv_open_flags(bs, bs->open_flags);
    node_name = qemu_opt_get(opts, "node-name");

    assert(!drv->bdrv_file_open || file == NULL);
    ret = bdrv_open_driver(bs, drv, node_name, options, open_flags, errp);
    if (ret < 0) {
        goto fail_opts;
    }

    qemu_opts_del(opts);
    return 0;

fail_opts:
    qemu_opts_del(opts);
    return ret;
}
1839
1840static QDict *parse_json_filename(const char *filename, Error **errp)
1841{
1842    QObject *options_obj;
1843    QDict *options;
1844    int ret;
1845
1846    ret = strstart(filename, "json:", &filename);
1847    assert(ret);
1848
1849    options_obj = qobject_from_json(filename, errp);
1850    if (!options_obj) {
1851        error_prepend(errp, "Could not parse the JSON options: ");
1852        return NULL;
1853    }
1854
1855    options = qobject_to(QDict, options_obj);
1856    if (!options) {
1857        qobject_unref(options_obj);
1858        error_setg(errp, "Invalid JSON object given");
1859        return NULL;
1860    }
1861
1862    qdict_flatten(options);
1863
1864    return options;
1865}
1866
1867static void parse_json_protocol(QDict *options, const char **pfilename,
1868                                Error **errp)
1869{
1870    QDict *json_options;
1871    Error *local_err = NULL;
1872
1873    /* Parse json: pseudo-protocol */
1874    if (!*pfilename || !g_str_has_prefix(*pfilename, "json:")) {
1875        return;
1876    }
1877
1878    json_options = parse_json_filename(*pfilename, &local_err);
1879    if (local_err) {
1880        error_propagate(errp, local_err);
1881        return;
1882    }
1883
1884    /* Options given in the filename have lower priority than options
1885     * specified directly */
1886    qdict_join(options, json_options, false);
1887    qobject_unref(json_options);
1888    *pfilename = NULL;
1889}
1890
/*
 * Fills in default options for opening images and converts the legacy
 * filename/flags pair to option QDict entries.
 * The BDRV_O_PROTOCOL flag in *flags will be set or cleared accordingly if a
 * block driver has been specified explicitly.
 *
 * @options   in/out: *options is updated in place.
 * @filename  legacy filename, may be NULL; only used for protocol nodes.
 * @flags     in/out: open flags; only BDRV_O_PROTOCOL is modified here.
 *
 * Returns 0 on success, a negative errno with @errp set on failure.
 */
static int bdrv_fill_options(QDict **options, const char *filename,
                             int *flags, Error **errp)
{
    const char *drvname;
    bool protocol = *flags & BDRV_O_PROTOCOL;
    bool parse_filename = false;
    BlockDriver *drv = NULL;
    Error *local_err = NULL;

    /*
     * Caution: while qdict_get_try_str() is fine, getting non-string
     * types would require more care.  When @options come from
     * -blockdev or blockdev_add, its members are typed according to
     * the QAPI schema, but when they come from -drive, they're all
     * QString.
     */
    drvname = qdict_get_try_str(*options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        if (!drv) {
            error_setg(errp, "Unknown driver '%s'", drvname);
            return -ENOENT;
        }
        /* If the user has explicitly specified the driver, this choice should
         * override the BDRV_O_PROTOCOL flag */
        protocol = drv->bdrv_file_open;
    }

    if (protocol) {
        *flags |= BDRV_O_PROTOCOL;
    } else {
        *flags &= ~BDRV_O_PROTOCOL;
    }

    /* Translate cache options from flags into options */
    update_options_from_flags(*options, *flags);

    /*
     * Store the legacy filename in the options QDict; it is only marked
     * for driver-specific parsing when it came from the legacy argument.
     */
    if (protocol && filename) {
        if (!qdict_haskey(*options, "filename")) {
            qdict_put_str(*options, "filename", filename);
            parse_filename = true;
        } else {
            error_setg(errp, "Can't specify 'file' and 'filename' options at "
                             "the same time");
            return -EINVAL;
        }
    }

    /* Find the right block driver */
    /* See cautionary note on accessing @options above */
    filename = qdict_get_try_str(*options, "filename");

    /* No explicit driver: choose the protocol driver from the filename */
    if (!drvname && protocol) {
        if (filename) {
            drv = bdrv_find_protocol(filename, parse_filename, errp);
            if (!drv) {
                return -EINVAL;
            }

            drvname = drv->format_name;
            qdict_put_str(*options, "driver", drvname);
        } else {
            error_setg(errp, "Must specify either driver or file");
            return -EINVAL;
        }
    }

    assert(drv || !protocol);

    /* Driver-specific filename parsing */
    if (drv && drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        /*
         * The "filename" entry is dropped after parsing unless the driver
         * declares that it needs the filename.
         */
        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        }
    }

    return 0;
}
1982
1983typedef struct BlockReopenQueueEntry {
1984     bool prepared;
1985     bool perms_checked;
1986     BDRVReopenState state;
1987     QTAILQ_ENTRY(BlockReopenQueueEntry) entry;
1988} BlockReopenQueueEntry;
1989
1990/*
1991 * Return the flags that @bs will have after the reopens in @q have
1992 * successfully completed. If @q is NULL (or @bs is not contained in @q),
1993 * return the current flags.
1994 */
1995static int bdrv_reopen_get_flags(BlockReopenQueue *q, BlockDriverState *bs)
1996{
1997    BlockReopenQueueEntry *entry;
1998
1999    if (q != NULL) {
2000        QTAILQ_FOREACH(entry, q, entry) {
2001            if (entry->state.bs == bs) {
2002                return entry->state.flags;
2003            }
2004        }
2005    }
2006
2007    return bs->open_flags;
2008}
2009
2010/* Returns whether the image file can be written to after the reopen queue @q
2011 * has been successfully applied, or right now if @q is NULL. */
2012static bool bdrv_is_writable_after_reopen(BlockDriverState *bs,
2013                                          BlockReopenQueue *q)
2014{
2015    int flags = bdrv_reopen_get_flags(q, bs);
2016
2017    return (flags & (BDRV_O_RDWR | BDRV_O_INACTIVE)) == BDRV_O_RDWR;
2018}
2019
2020/*
2021 * Return whether the BDS can be written to.  This is not necessarily
2022 * the same as !bdrv_is_read_only(bs), as inactivated images may not
2023 * be written to but do not count as read-only images.
2024 */
2025bool bdrv_is_writable(BlockDriverState *bs)
2026{
2027    return bdrv_is_writable_after_reopen(bs, NULL);
2028}
2029
2030static char *bdrv_child_user_desc(BdrvChild *c)
2031{
2032    return c->klass->get_parent_desc(c);
2033}
2034
2035/*
2036 * Check that @a allows everything that @b needs. @a and @b must reference same
2037 * child node.
2038 */
2039static bool bdrv_a_allow_b(BdrvChild *a, BdrvChild *b, Error **errp)
2040{
2041    const char *child_bs_name;
2042    g_autofree char *a_user = NULL;
2043    g_autofree char *b_user = NULL;
2044    g_autofree char *perms = NULL;
2045
2046    assert(a->bs);
2047    assert(a->bs == b->bs);
2048
2049    if ((b->perm & a->shared_perm) == b->perm) {
2050        return true;
2051    }
2052
2053    child_bs_name = bdrv_get_node_name(b->bs);
2054    a_user = bdrv_child_user_desc(a);
2055    b_user = bdrv_child_user_desc(b);
2056    perms = bdrv_perm_names(b->perm & ~a->shared_perm);
2057
2058    error_setg(errp, "Permission conflict on node '%s': permissions '%s' are "
2059               "both required by %s (uses node '%s' as '%s' child) and "
2060               "unshared by %s (uses node '%s' as '%s' child).",
2061               child_bs_name, perms,
2062               b_user, child_bs_name, b->name,
2063               a_user, child_bs_name, a->name);
2064
2065    return false;
2066}
2067
2068static bool bdrv_parent_perms_conflict(BlockDriverState *bs, Error **errp)
2069{
2070    BdrvChild *a, *b;
2071
2072    /*
2073     * During the loop we'll look at each pair twice. That's correct because
2074     * bdrv_a_allow_b() is asymmetric and we should check each pair in both
2075     * directions.
2076     */
2077    QLIST_FOREACH(a, &bs->parents, next_parent) {
2078        QLIST_FOREACH(b, &bs->parents, next_parent) {
2079            if (a == b) {
2080                continue;
2081            }
2082
2083            if (!bdrv_a_allow_b(a, b, errp)) {
2084                return true;
2085            }
2086        }
2087    }
2088
2089    return false;
2090}
2091
2092static void bdrv_child_perm(BlockDriverState *bs, BlockDriverState *child_bs,
2093                            BdrvChild *c, BdrvChildRole role,
2094                            BlockReopenQueue *reopen_queue,
2095                            uint64_t parent_perm, uint64_t parent_shared,
2096                            uint64_t *nperm, uint64_t *nshared)
2097{
2098    assert(bs->drv && bs->drv->bdrv_child_perm);
2099    bs->drv->bdrv_child_perm(bs, c, role, reopen_queue,
2100                             parent_perm, parent_shared,
2101                             nperm, nshared);
2102    /* TODO Take force_share from reopen_queue */
2103    if (child_bs && child_bs->force_share) {
2104        *nshared = BLK_PERM_ALL;
2105    }
2106}
2107
2108/*
2109 * Adds the whole subtree of @bs (including @bs itself) to the @list (except for
2110 * nodes that are already in the @list, of course) so that final list is
2111 * topologically sorted. Return the result (GSList @list object is updated, so
2112 * don't use old reference after function call).
2113 *
2114 * On function start @list must be already topologically sorted and for any node
2115 * in the @list the whole subtree of the node must be in the @list as well. The
2116 * simplest way to satisfy this criteria: use only result of
2117 * bdrv_topological_dfs() or NULL as @list parameter.
2118 */
2119static GSList *bdrv_topological_dfs(GSList *list, GHashTable *found,
2120                                    BlockDriverState *bs)
2121{
2122    BdrvChild *child;
2123    g_autoptr(GHashTable) local_found = NULL;
2124
2125    if (!found) {
2126        assert(!list);
2127        found = local_found = g_hash_table_new(NULL, NULL);
2128    }
2129
2130    if (g_hash_table_contains(found, bs)) {
2131        return list;
2132    }
2133    g_hash_table_add(found, bs);
2134
2135    QLIST_FOREACH(child, &bs->children, next) {
2136        list = bdrv_topological_dfs(list, found, child->bs);
2137    }
2138
2139    return g_slist_prepend(list, bs);
2140}
2141
2142typedef struct BdrvChildSetPermState {
2143    BdrvChild *child;
2144    uint64_t old_perm;
2145    uint64_t old_shared_perm;
2146} BdrvChildSetPermState;
2147
2148static void bdrv_child_set_perm_abort(void *opaque)
2149{
2150    BdrvChildSetPermState *s = opaque;
2151
2152    s->child->perm = s->old_perm;
2153    s->child->shared_perm = s->old_shared_perm;
2154}
2155
2156static TransactionActionDrv bdrv_child_set_pem_drv = {
2157    .abort = bdrv_child_set_perm_abort,
2158    .clean = g_free,
2159};
2160
2161static void bdrv_child_set_perm(BdrvChild *c, uint64_t perm,
2162                                uint64_t shared, Transaction *tran)
2163{
2164    BdrvChildSetPermState *s = g_new(BdrvChildSetPermState, 1);
2165
2166    *s = (BdrvChildSetPermState) {
2167        .child = c,
2168        .old_perm = c->perm,
2169        .old_shared_perm = c->shared_perm,
2170    };
2171
2172    c->perm = perm;
2173    c->shared_perm = shared;
2174
2175    tran_add(tran, &bdrv_child_set_pem_drv, s);
2176}
2177
2178static void bdrv_drv_set_perm_commit(void *opaque)
2179{
2180    BlockDriverState *bs = opaque;
2181    uint64_t cumulative_perms, cumulative_shared_perms;
2182
2183    if (bs->drv->bdrv_set_perm) {
2184        bdrv_get_cumulative_perm(bs, &cumulative_perms,
2185                                 &cumulative_shared_perms);
2186        bs->drv->bdrv_set_perm(bs, cumulative_perms, cumulative_shared_perms);
2187    }
2188}
2189
2190static void bdrv_drv_set_perm_abort(void *opaque)
2191{
2192    BlockDriverState *bs = opaque;
2193
2194    if (bs->drv->bdrv_abort_perm_update) {
2195        bs->drv->bdrv_abort_perm_update(bs);
2196    }
2197}
2198
2199TransactionActionDrv bdrv_drv_set_perm_drv = {
2200    .abort = bdrv_drv_set_perm_abort,
2201    .commit = bdrv_drv_set_perm_commit,
2202};
2203
2204static int bdrv_drv_set_perm(BlockDriverState *bs, uint64_t perm,
2205                             uint64_t shared_perm, Transaction *tran,
2206                             Error **errp)
2207{
2208    if (!bs->drv) {
2209        return 0;
2210    }
2211
2212    if (bs->drv->bdrv_check_perm) {
2213        int ret = bs->drv->bdrv_check_perm(bs, perm, shared_perm, errp);
2214        if (ret < 0) {
2215            return ret;
2216        }
2217    }
2218
2219    if (tran) {
2220        tran_add(tran, &bdrv_drv_set_perm_drv, bs);
2221    }
2222
2223    return 0;
2224}
2225
2226typedef struct BdrvReplaceChildState {
2227    BdrvChild *child;
2228    BlockDriverState *old_bs;
2229} BdrvReplaceChildState;
2230
2231static void bdrv_replace_child_commit(void *opaque)
2232{
2233    BdrvReplaceChildState *s = opaque;
2234
2235    bdrv_unref(s->old_bs);
2236}
2237
2238static void bdrv_replace_child_abort(void *opaque)
2239{
2240    BdrvReplaceChildState *s = opaque;
2241    BlockDriverState *new_bs = s->child->bs;
2242
2243    /* old_bs reference is transparently moved from @s to @s->child */
2244    bdrv_replace_child_noperm(s->child, s->old_bs);
2245    bdrv_unref(new_bs);
2246}
2247
2248static TransactionActionDrv bdrv_replace_child_drv = {
2249    .commit = bdrv_replace_child_commit,
2250    .abort = bdrv_replace_child_abort,
2251    .clean = g_free,
2252};
2253
2254/*
2255 * bdrv_replace_child_tran
2256 *
2257 * Note: real unref of old_bs is done only on commit.
2258 *
2259 * The function doesn't update permissions, caller is responsible for this.
2260 */
2261static void bdrv_replace_child_tran(BdrvChild *child, BlockDriverState *new_bs,
2262                                    Transaction *tran)
2263{
2264    BdrvReplaceChildState *s = g_new(BdrvReplaceChildState, 1);
2265    *s = (BdrvReplaceChildState) {
2266        .child = child,
2267        .old_bs = child->bs,
2268    };
2269    tran_add(tran, &bdrv_replace_child_drv, s);
2270
2271    if (new_bs) {
2272        bdrv_ref(new_bs);
2273    }
2274    bdrv_replace_child_noperm(child, new_bs);
2275    /* old_bs reference is transparently moved from @child to @s */
2276}
2277
2278/*
2279 * Refresh permissions in @bs subtree. The function is intended to be called
2280 * after some graph modification that was done without permission update.
2281 */
2282static int bdrv_node_refresh_perm(BlockDriverState *bs, BlockReopenQueue *q,
2283                                  Transaction *tran, Error **errp)
2284{
2285    BlockDriver *drv = bs->drv;
2286    BdrvChild *c;
2287    int ret;
2288    uint64_t cumulative_perms, cumulative_shared_perms;
2289
2290    bdrv_get_cumulative_perm(bs, &cumulative_perms, &cumulative_shared_perms);
2291
2292    /* Write permissions never work with read-only images */
2293    if ((cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) &&
2294        !bdrv_is_writable_after_reopen(bs, q))
2295    {
2296        if (!bdrv_is_writable_after_reopen(bs, NULL)) {
2297            error_setg(errp, "Block node is read-only");
2298        } else {
2299            error_setg(errp, "Read-only block node '%s' cannot support "
2300                       "read-write users", bdrv_get_node_name(bs));
2301        }
2302
2303        return -EPERM;
2304    }
2305
2306    /*
2307     * Unaligned requests will automatically be aligned to bl.request_alignment
2308     * and without RESIZE we can't extend requests to write to space beyond the
2309     * end of the image, so it's required that the image size is aligned.
2310     */
2311    if ((cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) &&
2312        !(cumulative_perms & BLK_PERM_RESIZE))
2313    {
2314        if ((bs->total_sectors * BDRV_SECTOR_SIZE) % bs->bl.request_alignment) {
2315            error_setg(errp, "Cannot get 'write' permission without 'resize': "
2316                             "Image size is not a multiple of request "
2317                             "alignment");
2318            return -EPERM;
2319        }
2320    }
2321
2322    /* Check this node */
2323    if (!drv) {
2324        return 0;
2325    }
2326
2327    ret = bdrv_drv_set_perm(bs, cumulative_perms, cumulative_shared_perms, tran,
2328                            errp);
2329    if (ret < 0) {
2330        return ret;
2331    }
2332
2333    /* Drivers that never have children can omit .bdrv_child_perm() */
2334    if (!drv->bdrv_child_perm) {
2335        assert(QLIST_EMPTY(&bs->children));
2336        return 0;
2337    }
2338
2339    /* Check all children */
2340    QLIST_FOREACH(c, &bs->children, next) {
2341        uint64_t cur_perm, cur_shared;
2342
2343        bdrv_child_perm(bs, c->bs, c, c->role, q,
2344                        cumulative_perms, cumulative_shared_perms,
2345                        &cur_perm, &cur_shared);
2346        bdrv_child_set_perm(c, cur_perm, cur_shared, tran);
2347    }
2348
2349    return 0;
2350}
2351
2352static int bdrv_list_refresh_perms(GSList *list, BlockReopenQueue *q,
2353                                   Transaction *tran, Error **errp)
2354{
2355    int ret;
2356    BlockDriverState *bs;
2357
2358    for ( ; list; list = list->next) {
2359        bs = list->data;
2360
2361        if (bdrv_parent_perms_conflict(bs, errp)) {
2362            return -EINVAL;
2363        }
2364
2365        ret = bdrv_node_refresh_perm(bs, q, tran, errp);
2366        if (ret < 0) {
2367            return ret;
2368        }
2369    }
2370
2371    return 0;
2372}
2373
2374void bdrv_get_cumulative_perm(BlockDriverState *bs, uint64_t *perm,
2375                              uint64_t *shared_perm)
2376{
2377    BdrvChild *c;
2378    uint64_t cumulative_perms = 0;
2379    uint64_t cumulative_shared_perms = BLK_PERM_ALL;
2380
2381    QLIST_FOREACH(c, &bs->parents, next_parent) {
2382        cumulative_perms |= c->perm;
2383        cumulative_shared_perms &= c->shared_perm;
2384    }
2385
2386    *perm = cumulative_perms;
2387    *shared_perm = cumulative_shared_perms;
2388}
2389
2390char *bdrv_perm_names(uint64_t perm)
2391{
2392    struct perm_name {
2393        uint64_t perm;
2394        const char *name;
2395    } permissions[] = {
2396        { BLK_PERM_CONSISTENT_READ, "consistent read" },
2397        { BLK_PERM_WRITE,           "write" },
2398        { BLK_PERM_WRITE_UNCHANGED, "write unchanged" },
2399        { BLK_PERM_RESIZE,          "resize" },
2400        { BLK_PERM_GRAPH_MOD,       "change children" },
2401        { 0, NULL }
2402    };
2403
2404    GString *result = g_string_sized_new(30);
2405    struct perm_name *p;
2406
2407    for (p = permissions; p->name; p++) {
2408        if (perm & p->perm) {
2409            if (result->len > 0) {
2410                g_string_append(result, ", ");
2411            }
2412            g_string_append(result, p->name);
2413        }
2414    }
2415
2416    return g_string_free(result, FALSE);
2417}
2418
2419
2420static int bdrv_refresh_perms(BlockDriverState *bs, Error **errp)
2421{
2422    int ret;
2423    Transaction *tran = tran_new();
2424    g_autoptr(GSList) list = bdrv_topological_dfs(NULL, NULL, bs);
2425
2426    ret = bdrv_list_refresh_perms(list, NULL, tran, errp);
2427    tran_finalize(tran, ret);
2428
2429    return ret;
2430}
2431
2432int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, uint64_t shared,
2433                            Error **errp)
2434{
2435    Error *local_err = NULL;
2436    Transaction *tran = tran_new();
2437    int ret;
2438
2439    bdrv_child_set_perm(c, perm, shared, tran);
2440
2441    ret = bdrv_refresh_perms(c->bs, &local_err);
2442
2443    tran_finalize(tran, ret);
2444
2445    if (ret < 0) {
2446        if ((perm & ~c->perm) || (c->shared_perm & ~shared)) {
2447            /* tighten permissions */
2448            error_propagate(errp, local_err);
2449        } else {
2450            /*
2451             * Our caller may intend to only loosen restrictions and
2452             * does not expect this function to fail.  Errors are not
2453             * fatal in such a case, so we can just hide them from our
2454             * caller.
2455             */
2456            error_free(local_err);
2457            ret = 0;
2458        }
2459    }
2460
2461    return ret;
2462}
2463
2464int bdrv_child_refresh_perms(BlockDriverState *bs, BdrvChild *c, Error **errp)
2465{
2466    uint64_t parent_perms, parent_shared;
2467    uint64_t perms, shared;
2468
2469    bdrv_get_cumulative_perm(bs, &parent_perms, &parent_shared);
2470    bdrv_child_perm(bs, c->bs, c, c->role, NULL,
2471                    parent_perms, parent_shared, &perms, &shared);
2472
2473    return bdrv_child_try_set_perm(c, perms, shared, errp);
2474}
2475
2476/*
2477 * Default implementation for .bdrv_child_perm() for block filters:
2478 * Forward CONSISTENT_READ, WRITE, WRITE_UNCHANGED, and RESIZE to the
2479 * filtered child.
2480 */
2481static void bdrv_filter_default_perms(BlockDriverState *bs, BdrvChild *c,
2482                                      BdrvChildRole role,
2483                                      BlockReopenQueue *reopen_queue,
2484                                      uint64_t perm, uint64_t shared,
2485                                      uint64_t *nperm, uint64_t *nshared)
2486{
2487    *nperm = perm & DEFAULT_PERM_PASSTHROUGH;
2488    *nshared = (shared & DEFAULT_PERM_PASSTHROUGH) | DEFAULT_PERM_UNCHANGED;
2489}
2490
2491static void bdrv_default_perms_for_cow(BlockDriverState *bs, BdrvChild *c,
2492                                       BdrvChildRole role,
2493                                       BlockReopenQueue *reopen_queue,
2494                                       uint64_t perm, uint64_t shared,
2495                                       uint64_t *nperm, uint64_t *nshared)
2496{
2497    assert(role & BDRV_CHILD_COW);
2498
2499    /*
2500     * We want consistent read from backing files if the parent needs it.
2501     * No other operations are performed on backing files.
2502     */
2503    perm &= BLK_PERM_CONSISTENT_READ;
2504
2505    /*
2506     * If the parent can deal with changing data, we're okay with a
2507     * writable and resizable backing file.
2508     * TODO Require !(perm & BLK_PERM_CONSISTENT_READ), too?
2509     */
2510    if (shared & BLK_PERM_WRITE) {
2511        shared = BLK_PERM_WRITE | BLK_PERM_RESIZE;
2512    } else {
2513        shared = 0;
2514    }
2515
2516    shared |= BLK_PERM_CONSISTENT_READ | BLK_PERM_GRAPH_MOD |
2517              BLK_PERM_WRITE_UNCHANGED;
2518
2519    if (bs->open_flags & BDRV_O_INACTIVE) {
2520        shared |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
2521    }
2522
2523    *nperm = perm;
2524    *nshared = shared;
2525}
2526
2527static void bdrv_default_perms_for_storage(BlockDriverState *bs, BdrvChild *c,
2528                                           BdrvChildRole role,
2529                                           BlockReopenQueue *reopen_queue,
2530                                           uint64_t perm, uint64_t shared,
2531                                           uint64_t *nperm, uint64_t *nshared)
2532{
2533    int flags;
2534
2535    assert(role & (BDRV_CHILD_METADATA | BDRV_CHILD_DATA));
2536
2537    flags = bdrv_reopen_get_flags(reopen_queue, bs);
2538
2539    /*
2540     * Apart from the modifications below, the same permissions are
2541     * forwarded and left alone as for filters
2542     */
2543    bdrv_filter_default_perms(bs, c, role, reopen_queue,
2544                              perm, shared, &perm, &shared);
2545
2546    if (role & BDRV_CHILD_METADATA) {
2547        /* Format drivers may touch metadata even if the guest doesn't write */
2548        if (bdrv_is_writable_after_reopen(bs, reopen_queue)) {
2549            perm |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
2550        }
2551
2552        /*
2553         * bs->file always needs to be consistent because of the
2554         * metadata. We can never allow other users to resize or write
2555         * to it.
2556         */
2557        if (!(flags & BDRV_O_NO_IO)) {
2558            perm |= BLK_PERM_CONSISTENT_READ;
2559        }
2560        shared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
2561    }
2562
2563    if (role & BDRV_CHILD_DATA) {
2564        /*
2565         * Technically, everything in this block is a subset of the
2566         * BDRV_CHILD_METADATA path taken above, and so this could
2567         * be an "else if" branch.  However, that is not obvious, and
2568         * this function is not performance critical, therefore we let
2569         * this be an independent "if".
2570         */
2571
2572        /*
2573         * We cannot allow other users to resize the file because the
2574         * format driver might have some assumptions about the size
2575         * (e.g. because it is stored in metadata, or because the file
2576         * is split into fixed-size data files).
2577         */
2578        shared &= ~BLK_PERM_RESIZE;
2579
2580        /*
2581         * WRITE_UNCHANGED often cannot be performed as such on the
2582         * data file.  For example, the qcow2 driver may still need to
2583         * write copied clusters on copy-on-read.
2584         */
2585        if (perm & BLK_PERM_WRITE_UNCHANGED) {
2586            perm |= BLK_PERM_WRITE;
2587        }
2588
2589        /*
2590         * If the data file is written to, the format driver may
2591         * expect to be able to resize it by writing beyond the EOF.
2592         */
2593        if (perm & BLK_PERM_WRITE) {
2594            perm |= BLK_PERM_RESIZE;
2595        }
2596    }
2597
2598    if (bs->open_flags & BDRV_O_INACTIVE) {
2599        shared |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
2600    }
2601
2602    *nperm = perm;
2603    *nshared = shared;
2604}
2605
2606void bdrv_default_perms(BlockDriverState *bs, BdrvChild *c,
2607                        BdrvChildRole role, BlockReopenQueue *reopen_queue,
2608                        uint64_t perm, uint64_t shared,
2609                        uint64_t *nperm, uint64_t *nshared)
2610{
2611    if (role & BDRV_CHILD_FILTERED) {
2612        assert(!(role & (BDRV_CHILD_DATA | BDRV_CHILD_METADATA |
2613                         BDRV_CHILD_COW)));
2614        bdrv_filter_default_perms(bs, c, role, reopen_queue,
2615                                  perm, shared, nperm, nshared);
2616    } else if (role & BDRV_CHILD_COW) {
2617        assert(!(role & (BDRV_CHILD_DATA | BDRV_CHILD_METADATA)));
2618        bdrv_default_perms_for_cow(bs, c, role, reopen_queue,
2619                                   perm, shared, nperm, nshared);
2620    } else if (role & (BDRV_CHILD_METADATA | BDRV_CHILD_DATA)) {
2621        bdrv_default_perms_for_storage(bs, c, role, reopen_queue,
2622                                       perm, shared, nperm, nshared);
2623    } else {
2624        g_assert_not_reached();
2625    }
2626}
2627
2628uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm)
2629{
2630    static const uint64_t permissions[] = {
2631        [BLOCK_PERMISSION_CONSISTENT_READ]  = BLK_PERM_CONSISTENT_READ,
2632        [BLOCK_PERMISSION_WRITE]            = BLK_PERM_WRITE,
2633        [BLOCK_PERMISSION_WRITE_UNCHANGED]  = BLK_PERM_WRITE_UNCHANGED,
2634        [BLOCK_PERMISSION_RESIZE]           = BLK_PERM_RESIZE,
2635        [BLOCK_PERMISSION_GRAPH_MOD]        = BLK_PERM_GRAPH_MOD,
2636    };
2637
2638    QEMU_BUILD_BUG_ON(ARRAY_SIZE(permissions) != BLOCK_PERMISSION__MAX);
2639    QEMU_BUILD_BUG_ON(1UL << ARRAY_SIZE(permissions) != BLK_PERM_ALL + 1);
2640
2641    assert(qapi_perm < BLOCK_PERMISSION__MAX);
2642
2643    return permissions[qapi_perm];
2644}
2645
2646static void bdrv_replace_child_noperm(BdrvChild *child,
2647                                      BlockDriverState *new_bs)
2648{
2649    BlockDriverState *old_bs = child->bs;
2650    int new_bs_quiesce_counter;
2651    int drain_saldo;
2652
2653    assert(!child->frozen);
2654
2655    if (old_bs && new_bs) {
2656        assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs));
2657    }
2658
2659    new_bs_quiesce_counter = (new_bs ? new_bs->quiesce_counter : 0);
2660    drain_saldo = new_bs_quiesce_counter - child->parent_quiesce_counter;
2661
2662    /*
2663     * If the new child node is drained but the old one was not, flush
2664     * all outstanding requests to the old child node.
2665     */
2666    while (drain_saldo > 0 && child->klass->drained_begin) {
2667        bdrv_parent_drained_begin_single(child, true);
2668        drain_saldo--;
2669    }
2670
2671    if (old_bs) {
2672        /* Detach first so that the recursive drain sections coming from @child
2673         * are already gone and we only end the drain sections that came from
2674         * elsewhere. */
2675        if (child->klass->detach) {
2676            child->klass->detach(child);
2677        }
2678        QLIST_REMOVE(child, next_parent);
2679    }
2680
2681    child->bs = new_bs;
2682
2683    if (new_bs) {
2684        QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);
2685
2686        /*
2687         * Detaching the old node may have led to the new node's
2688         * quiesce_counter having been decreased.  Not a problem, we
2689         * just need to recognize this here and then invoke
2690         * drained_end appropriately more often.
2691         */
2692        assert(new_bs->quiesce_counter <= new_bs_quiesce_counter);
2693        drain_saldo += new_bs->quiesce_counter - new_bs_quiesce_counter;
2694
2695        /* Attach only after starting new drained sections, so that recursive
2696         * drain sections coming from @child don't get an extra .drained_begin
2697         * callback. */
2698        if (child->klass->attach) {
2699            child->klass->attach(child);
2700        }
2701    }
2702
2703    /*
2704     * If the old child node was drained but the new one is not, allow
2705     * requests to come in only after the new node has been attached.
2706     */
2707    while (drain_saldo < 0 && child->klass->drained_end) {
2708        bdrv_parent_drained_end_single(child);
2709        drain_saldo++;
2710    }
2711}
2712
2713static void bdrv_child_free(void *opaque)
2714{
2715    BdrvChild *c = opaque;
2716
2717    g_free(c->name);
2718    g_free(c);
2719}
2720
2721static void bdrv_remove_empty_child(BdrvChild *child)
2722{
2723    assert(!child->bs);
2724    QLIST_SAFE_REMOVE(child, next);
2725    bdrv_child_free(child);
2726}
2727
2728typedef struct BdrvAttachChildCommonState {
2729    BdrvChild **child;
2730    AioContext *old_parent_ctx;
2731    AioContext *old_child_ctx;
2732} BdrvAttachChildCommonState;
2733
2734static void bdrv_attach_child_common_abort(void *opaque)
2735{
2736    BdrvAttachChildCommonState *s = opaque;
2737    BdrvChild *child = *s->child;
2738    BlockDriverState *bs = child->bs;
2739
2740    bdrv_replace_child_noperm(child, NULL);
2741
2742    if (bdrv_get_aio_context(bs) != s->old_child_ctx) {
2743        bdrv_try_set_aio_context(bs, s->old_child_ctx, &error_abort);
2744    }
2745
2746    if (bdrv_child_get_parent_aio_context(child) != s->old_parent_ctx) {
2747        GSList *ignore = g_slist_prepend(NULL, child);
2748
2749        child->klass->can_set_aio_ctx(child, s->old_parent_ctx, &ignore,
2750                                      &error_abort);
2751        g_slist_free(ignore);
2752        ignore = g_slist_prepend(NULL, child);
2753        child->klass->set_aio_ctx(child, s->old_parent_ctx, &ignore);
2754
2755        g_slist_free(ignore);
2756    }
2757
2758    bdrv_unref(bs);
2759    bdrv_remove_empty_child(child);
2760    *s->child = NULL;
2761}
2762
2763static TransactionActionDrv bdrv_attach_child_common_drv = {
2764    .abort = bdrv_attach_child_common_abort,
2765    .clean = g_free,
2766};
2767
2768/*
2769 * Common part of attaching bdrv child to bs or to blk or to job
2770 *
2771 * Resulting new child is returned through @child.
2772 * At start *@child must be NULL.
2773 * @child is saved to a new entry of @tran, so that *@child could be reverted to
2774 * NULL on abort(). So referenced variable must live at least until transaction
2775 * end.
2776 *
2777 * Function doesn't update permissions, caller is responsible for this.
2778 */
2779static int bdrv_attach_child_common(BlockDriverState *child_bs,
2780                                    const char *child_name,
2781                                    const BdrvChildClass *child_class,
2782                                    BdrvChildRole child_role,
2783                                    uint64_t perm, uint64_t shared_perm,
2784                                    void *opaque, BdrvChild **child,
2785                                    Transaction *tran, Error **errp)
2786{
2787    BdrvChild *new_child;
2788    AioContext *parent_ctx;
2789    AioContext *child_ctx = bdrv_get_aio_context(child_bs);
2790
2791    assert(child);
2792    assert(*child == NULL);
2793    assert(child_class->get_parent_desc);
2794
2795    new_child = g_new(BdrvChild, 1);
2796    *new_child = (BdrvChild) {
2797        .bs             = NULL,
2798        .name           = g_strdup(child_name),
2799        .klass          = child_class,
2800        .role           = child_role,
2801        .perm           = perm,
2802        .shared_perm    = shared_perm,
2803        .opaque         = opaque,
2804    };
2805
2806    /*
2807     * If the AioContexts don't match, first try to move the subtree of
2808     * child_bs into the AioContext of the new parent. If this doesn't work,
2809     * try moving the parent into the AioContext of child_bs instead.
2810     */
2811    parent_ctx = bdrv_child_get_parent_aio_context(new_child);
2812    if (child_ctx != parent_ctx) {
2813        Error *local_err = NULL;
2814        int ret = bdrv_try_set_aio_context(child_bs, parent_ctx, &local_err);
2815
2816        if (ret < 0 && child_class->can_set_aio_ctx) {
2817            GSList *ignore = g_slist_prepend(NULL, new_child);
2818            if (child_class->can_set_aio_ctx(new_child, child_ctx, &ignore,
2819                                             NULL))
2820            {
2821                error_free(local_err);
2822                ret = 0;
2823                g_slist_free(ignore);
2824                ignore = g_slist_prepend(NULL, new_child);
2825                child_class->set_aio_ctx(new_child, child_ctx, &ignore);
2826            }
2827            g_slist_free(ignore);
2828        }
2829
2830        if (ret < 0) {
2831            error_propagate(errp, local_err);
2832            bdrv_remove_empty_child(new_child);
2833            return ret;
2834        }
2835    }
2836
2837    bdrv_ref(child_bs);
2838    bdrv_replace_child_noperm(new_child, child_bs);
2839
2840    *child = new_child;
2841
2842    BdrvAttachChildCommonState *s = g_new(BdrvAttachChildCommonState, 1);
2843    *s = (BdrvAttachChildCommonState) {
2844        .child = child,
2845        .old_parent_ctx = parent_ctx,
2846        .old_child_ctx = child_ctx,
2847    };
2848    tran_add(tran, &bdrv_attach_child_common_drv, s);
2849
2850    return 0;
2851}
2852
2853/*
2854 * Variable referenced by @child must live at least until transaction end.
2855 * (see bdrv_attach_child_common() doc for details)
2856 *
2857 * Function doesn't update permissions, caller is responsible for this.
2858 */
2859static int bdrv_attach_child_noperm(BlockDriverState *parent_bs,
2860                                    BlockDriverState *child_bs,
2861                                    const char *child_name,
2862                                    const BdrvChildClass *child_class,
2863                                    BdrvChildRole child_role,
2864                                    BdrvChild **child,
2865                                    Transaction *tran,
2866                                    Error **errp)
2867{
2868    int ret;
2869    uint64_t perm, shared_perm;
2870
2871    assert(parent_bs->drv);
2872
2873    bdrv_get_cumulative_perm(parent_bs, &perm, &shared_perm);
2874    bdrv_child_perm(parent_bs, child_bs, NULL, child_role, NULL,
2875                    perm, shared_perm, &perm, &shared_perm);
2876
2877    ret = bdrv_attach_child_common(child_bs, child_name, child_class,
2878                                   child_role, perm, shared_perm, parent_bs,
2879                                   child, tran, errp);
2880    if (ret < 0) {
2881        return ret;
2882    }
2883
2884    QLIST_INSERT_HEAD(&parent_bs->children, *child, next);
2885    /*
2886     * child is removed in bdrv_attach_child_common_abort(), so don't care to
2887     * abort this change separately.
2888     */
2889
2890    return 0;
2891}
2892
2893static void bdrv_detach_child(BdrvChild *child)
2894{
2895    BlockDriverState *old_bs = child->bs;
2896
2897    bdrv_replace_child_noperm(child, NULL);
2898    bdrv_remove_empty_child(child);
2899
2900    if (old_bs) {
2901        /*
2902         * Update permissions for old node. We're just taking a parent away, so
2903         * we're loosening restrictions. Errors of permission update are not
2904         * fatal in this case, ignore them.
2905         */
2906        bdrv_refresh_perms(old_bs, NULL);
2907
2908        /*
2909         * When the parent requiring a non-default AioContext is removed, the
2910         * node moves back to the main AioContext
2911         */
2912        bdrv_try_set_aio_context(old_bs, qemu_get_aio_context(), NULL);
2913    }
2914}
2915
2916/*
2917 * This function steals the reference to child_bs from the caller.
2918 * That reference is later dropped by bdrv_root_unref_child().
2919 *
2920 * On failure NULL is returned, errp is set and the reference to
2921 * child_bs is also dropped.
2922 *
2923 * The caller must hold the AioContext lock @child_bs, but not that of @ctx
2924 * (unless @child_bs is already in @ctx).
2925 */
2926BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
2927                                  const char *child_name,
2928                                  const BdrvChildClass *child_class,
2929                                  BdrvChildRole child_role,
2930                                  uint64_t perm, uint64_t shared_perm,
2931                                  void *opaque, Error **errp)
2932{
2933    int ret;
2934    BdrvChild *child = NULL;
2935    Transaction *tran = tran_new();
2936
2937    ret = bdrv_attach_child_common(child_bs, child_name, child_class,
2938                                   child_role, perm, shared_perm, opaque,
2939                                   &child, tran, errp);
2940    if (ret < 0) {
2941        goto out;
2942    }
2943
2944    ret = bdrv_refresh_perms(child_bs, errp);
2945
2946out:
2947    tran_finalize(tran, ret);
2948    /* child is unset on failure by bdrv_attach_child_common_abort() */
2949    assert((ret < 0) == !child);
2950
2951    bdrv_unref(child_bs);
2952    return child;
2953}
2954
2955/*
2956 * This function transfers the reference to child_bs from the caller
2957 * to parent_bs. That reference is later dropped by parent_bs on
2958 * bdrv_close() or if someone calls bdrv_unref_child().
2959 *
2960 * On failure NULL is returned, errp is set and the reference to
2961 * child_bs is also dropped.
2962 *
2963 * If @parent_bs and @child_bs are in different AioContexts, the caller must
2964 * hold the AioContext lock for @child_bs, but not for @parent_bs.
2965 */
2966BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
2967                             BlockDriverState *child_bs,
2968                             const char *child_name,
2969                             const BdrvChildClass *child_class,
2970                             BdrvChildRole child_role,
2971                             Error **errp)
2972{
2973    int ret;
2974    BdrvChild *child = NULL;
2975    Transaction *tran = tran_new();
2976
2977    ret = bdrv_attach_child_noperm(parent_bs, child_bs, child_name, child_class,
2978                                   child_role, &child, tran, errp);
2979    if (ret < 0) {
2980        goto out;
2981    }
2982
2983    ret = bdrv_refresh_perms(parent_bs, errp);
2984    if (ret < 0) {
2985        goto out;
2986    }
2987
2988out:
2989    tran_finalize(tran, ret);
2990    /* child is unset on failure by bdrv_attach_child_common_abort() */
2991    assert((ret < 0) == !child);
2992
2993    bdrv_unref(child_bs);
2994
2995    return child;
2996}
2997
2998/* Callers must ensure that child->frozen is false. */
2999void bdrv_root_unref_child(BdrvChild *child)
3000{
3001    BlockDriverState *child_bs;
3002
3003    child_bs = child->bs;
3004    bdrv_detach_child(child);
3005    bdrv_unref(child_bs);
3006}
3007
/*
 * Rollback state recorded by bdrv_set_inherits_from() when it is called with
 * a transaction; consumed by bdrv_set_inherits_from_abort().
 */
typedef struct BdrvSetInheritsFrom {
    /* Node whose inherits_from pointer was changed */
    BlockDriverState *bs;
    /* Value of bs->inherits_from before the change; restored on abort */
    BlockDriverState *old_inherits_from;
} BdrvSetInheritsFrom;
3012
3013static void bdrv_set_inherits_from_abort(void *opaque)
3014{
3015    BdrvSetInheritsFrom *s = opaque;
3016
3017    s->bs->inherits_from = s->old_inherits_from;
3018}
3019
/*
 * Transaction action registered by bdrv_set_inherits_from(): the previous
 * inherits_from pointer is restored in .abort, and the BdrvSetInheritsFrom
 * state is released with g_free() in .clean. No .commit handler is set.
 */
static TransactionActionDrv bdrv_set_inherits_from_drv = {
    .abort = bdrv_set_inherits_from_abort,
    .clean = g_free,
};
3024
3025/* @tran is allowed to be NULL. In this case no rollback is possible */
3026static void bdrv_set_inherits_from(BlockDriverState *bs,
3027                                   BlockDriverState *new_inherits_from,
3028                                   Transaction *tran)
3029{
3030    if (tran) {
3031        BdrvSetInheritsFrom *s = g_new(BdrvSetInheritsFrom, 1);
3032
3033        *s = (BdrvSetInheritsFrom) {
3034            .bs = bs,
3035            .old_inherits_from = bs->inherits_from,
3036        };
3037
3038        tran_add(tran, &bdrv_set_inherits_from_drv, s);
3039    }
3040
3041    bs->inherits_from = new_inherits_from;
3042}
3043
3044/**
3045 * Clear all inherits_from pointers from children and grandchildren of
3046 * @root that point to @root, where necessary.
3047 * @tran is allowed to be NULL. In this case no rollback is possible
3048 */
3049static void bdrv_unset_inherits_from(BlockDriverState *root, BdrvChild *child,
3050                                     Transaction *tran)
3051{
3052    BdrvChild *c;
3053
3054    if (child->bs->inherits_from == root) {
3055        /*
3056         * Remove inherits_from only when the last reference between root and
3057         * child->bs goes away.
3058         */
3059        QLIST_FOREACH(c, &root->children, next) {
3060            if (c != child && c->bs == child->bs) {
3061                break;
3062            }
3063        }
3064        if (c == NULL) {
3065            bdrv_set_inherits_from(child->bs, NULL, tran);
3066        }
3067    }
3068
3069    QLIST_FOREACH(c, &child->bs->children, next) {
3070        bdrv_unset_inherits_from(root, c, tran);
3071    }
3072}
3073
3074/* Callers must ensure that child->frozen is false. */
3075void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child)
3076{
3077    if (child == NULL) {
3078        return;
3079    }
3080
3081    bdrv_unset_inherits_from(parent, child, NULL);
3082    bdrv_root_unref_child(child);
3083}
3084
3085
3086static void bdrv_parent_cb_change_media(BlockDriverState *bs, bool load)
3087{
3088    BdrvChild *c;
3089    QLIST_FOREACH(c, &bs->parents, next_parent) {
3090        if (c->klass->change_media) {
3091            c->klass->change_media(c, load);
3092        }
3093    }
3094}
3095
3096/* Return true if you can reach parent going through child->inherits_from
3097 * recursively. If parent or child are NULL, return false */
3098static bool bdrv_inherits_from_recursive(BlockDriverState *child,
3099                                         BlockDriverState *parent)
3100{
3101    while (child && child != parent) {
3102        child = child->inherits_from;
3103    }
3104
3105    return child != NULL;
3106}
3107
3108/*
3109 * Return the BdrvChildRole for @bs's backing child.  bs->backing is
3110 * mostly used for COW backing children (role = COW), but also for
3111 * filtered children (role = FILTERED | PRIMARY).
3112 */
3113static BdrvChildRole bdrv_backing_role(BlockDriverState *bs)
3114{
3115    if (bs->drv && bs->drv->is_filter) {
3116        return BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY;
3117    } else {
3118        return BDRV_CHILD_COW;
3119    }
3120}
3121
3122/*
3123 * Sets the bs->backing or bs->file link of a BDS. A new reference is created;
3124 * callers which don't need their own reference any more must call bdrv_unref().
3125 *
3126 * Function doesn't update permissions, caller is responsible for this.
3127 */
3128static int bdrv_set_file_or_backing_noperm(BlockDriverState *parent_bs,
3129                                           BlockDriverState *child_bs,
3130                                           bool is_backing,
3131                                           Transaction *tran, Error **errp)
3132{
3133    int ret = 0;
3134    bool update_inherits_from =
3135        bdrv_inherits_from_recursive(child_bs, parent_bs);
3136    BdrvChild *child = is_backing ? parent_bs->backing : parent_bs->file;
3137    BdrvChildRole role;
3138
3139    if (!parent_bs->drv) {
3140        /*
3141         * Node without drv is an object without a class :/. TODO: finally fix
3142         * qcow2 driver to never clear bs->drv and implement format corruption
3143         * handling in other way.
3144         */
3145        error_setg(errp, "Node corrupted");
3146        return -EINVAL;
3147    }
3148
3149    if (child && child->frozen) {
3150        error_setg(errp, "Cannot change frozen '%s' link from '%s' to '%s'",
3151                   child->name, parent_bs->node_name, child->bs->node_name);
3152        return -EPERM;
3153    }
3154
3155    if (is_backing && !parent_bs->drv->is_filter &&
3156        !parent_bs->drv->supports_backing)
3157    {
3158        error_setg(errp, "Driver '%s' of node '%s' does not support backing "
3159                   "files", parent_bs->drv->format_name, parent_bs->node_name);
3160        return -EINVAL;
3161    }
3162
3163    if (parent_bs->drv->is_filter) {
3164        role = BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY;
3165    } else if (is_backing) {
3166        role = BDRV_CHILD_COW;
3167    } else {
3168        /*
3169         * We only can use same role as it is in existing child. We don't have
3170         * infrastructure to determine role of file child in generic way
3171         */
3172        if (!child) {
3173            error_setg(errp, "Cannot set file child to format node without "
3174                       "file child");
3175            return -EINVAL;
3176        }
3177        role = child->role;
3178    }
3179
3180    if (child) {
3181        bdrv_unset_inherits_from(parent_bs, child, tran);
3182        bdrv_remove_file_or_backing_child(parent_bs, child, tran);
3183    }
3184
3185    if (!child_bs) {
3186        goto out;
3187    }
3188
3189    ret = bdrv_attach_child_noperm(parent_bs, child_bs,
3190                                   is_backing ? "backing" : "file",
3191                                   &child_of_bds, role,
3192                                   is_backing ? &parent_bs->backing :
3193                                                &parent_bs->file,
3194                                   tran, errp);
3195    if (ret < 0) {
3196        return ret;
3197    }
3198
3199
3200    /*
3201     * If inherits_from pointed recursively to bs then let's update it to
3202     * point directly to bs (else it will become NULL).
3203     */
3204    if (update_inherits_from) {
3205        bdrv_set_inherits_from(child_bs, parent_bs, tran);
3206    }
3207
3208out:
3209    bdrv_refresh_limits(parent_bs, tran, NULL);
3210
3211    return 0;
3212}
3213
3214static int bdrv_set_backing_noperm(BlockDriverState *bs,
3215                                   BlockDriverState *backing_hd,
3216                                   Transaction *tran, Error **errp)
3217{
3218    return bdrv_set_file_or_backing_noperm(bs, backing_hd, true, tran, errp);
3219}
3220
3221int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd,
3222                        Error **errp)
3223{
3224    int ret;
3225    Transaction *tran = tran_new();
3226
3227    ret = bdrv_set_backing_noperm(bs, backing_hd, tran, errp);
3228    if (ret < 0) {
3229        goto out;
3230    }
3231
3232    ret = bdrv_refresh_perms(bs, errp);
3233out:
3234    tran_finalize(tran, ret);
3235
3236    return ret;
3237}
3238
/*
 * Opens the backing file for a BlockDriverState if not yet open
 *
 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 * itself, all options starting with "${bdref_key}." are considered part of the
 * BlockdevRef.
 *
 * @parent_options may be NULL, which is treated as an empty set of options.
 *
 * Returns 0 on success, and also when there is nothing to do (bs->backing is
 * already set, or neither a reference, a backing file name nor any backing
 * options are present). Returns -EINVAL, with errp set, if the backing
 * filename cannot be determined, the driver does not support backing files or
 * the backing node cannot be opened; otherwise the negative result of
 * bdrv_set_backing_hd().
 *
 * TODO Can this be unified with bdrv_open_image()?
 */
int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
                           const char *bdref_key, Error **errp)
{
    char *backing_filename = NULL;
    char *bdref_key_dot;
    const char *reference = NULL;
    int ret = 0;
    bool implicit_backing = false;
    BlockDriverState *backing_hd;
    QDict *options;
    QDict *tmp_parent_options = NULL;
    Error *local_err = NULL;

    /* A backing child is already attached: nothing to do, ret stays 0 */
    if (bs->backing != NULL) {
        goto free_exit;
    }

    /* NULL means an empty set of options */
    if (parent_options == NULL) {
        tmp_parent_options = qdict_new();
        parent_options = tmp_parent_options;
    }

    /* Cleared for now; set again below if the backing node cannot be opened */
    bs->open_flags &= ~BDRV_O_NO_BACKING;

    /* Pull the "<bdref_key>." entries of the parent out into @options */
    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
    qdict_extract_subqdict(parent_options, &options, bdref_key_dot);
    g_free(bdref_key_dot);

    /*
     * Caution: while qdict_get_try_str() is fine, getting non-string
     * types would require more care.  When @parent_options come from
     * -blockdev or blockdev_add, its members are typed according to
     * the QAPI schema, but when they come from -drive, they're all
     * QString.
     */
    reference = qdict_get_try_str(parent_options, bdref_key);
    if (reference || qdict_haskey(options, "file.filename")) {
        /* keep backing_filename NULL */
    } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
        /* No reference, no backing file name, no options: nothing to open */
        qobject_unref(options);
        goto free_exit;
    } else {
        if (qdict_size(options) == 0) {
            /* If the user specifies options that do not modify the
             * backing file's behavior, we might still consider it the
             * implicit backing file.  But it's easier this way, and
             * just specifying some of the backing BDS's options is
             * only possible with -drive anyway (otherwise the QAPI
             * schema forces the user to specify everything). */
            implicit_backing = !strcmp(bs->auto_backing_file, bs->backing_file);
        }

        backing_filename = bdrv_get_full_backing_filename(bs, &local_err);
        if (local_err) {
            ret = -EINVAL;
            error_propagate(errp, local_err);
            qobject_unref(options);
            goto free_exit;
        }
    }

    if (!bs->drv || !bs->drv->supports_backing) {
        ret = -EINVAL;
        error_setg(errp, "Driver doesn't support backing files");
        qobject_unref(options);
        goto free_exit;
    }

    /* Use the recorded backing format unless a driver was given explicitly */
    if (!reference &&
        bs->backing_format[0] != '\0' && !qdict_haskey(options, "driver")) {
        qdict_put_str(options, "driver", bs->backing_format);
    }

    /*
     * NOTE(review): @options is not released in this function after this
     * call, not even on failure; presumably bdrv_open_inherit() takes over
     * the reference - confirm.
     */
    backing_hd = bdrv_open_inherit(backing_filename, reference, options, 0, bs,
                                   &child_of_bds, bdrv_backing_role(bs), errp);
    if (!backing_hd) {
        bs->open_flags |= BDRV_O_NO_BACKING;
        error_prepend(errp, "Could not open backing file: ");
        ret = -EINVAL;
        goto free_exit;
    }

    if (implicit_backing) {
        /* Record the refreshed filename of the node that was really opened */
        bdrv_refresh_filename(backing_hd);
        pstrcpy(bs->auto_backing_file, sizeof(bs->auto_backing_file),
                backing_hd->filename);
    }

    /* Hook up the backing file link; drop our reference, bs owns the
     * backing_hd reference now */
    ret = bdrv_set_backing_hd(bs, backing_hd, errp);
    bdrv_unref(backing_hd);
    if (ret < 0) {
        goto free_exit;
    }

    /* The backing child is attached; remove its key from the parent options */
    qdict_del(parent_options, bdref_key);

free_exit:
    g_free(backing_filename);
    qobject_unref(tmp_parent_options);
    return ret;
}
3353
3354static BlockDriverState *
3355bdrv_open_child_bs(const char *filename, QDict *options, const char *bdref_key,
3356                   BlockDriverState *parent, const BdrvChildClass *child_class,
3357                   BdrvChildRole child_role, bool allow_none, Error **errp)
3358{
3359    BlockDriverState *bs = NULL;
3360    QDict *image_options;
3361    char *bdref_key_dot;
3362    const char *reference;
3363
3364    assert(child_class != NULL);
3365
3366    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
3367    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
3368    g_free(bdref_key_dot);
3369
3370    /*
3371     * Caution: while qdict_get_try_str() is fine, getting non-string
3372     * types would require more care.  When @options come from
3373     * -blockdev or blockdev_add, its members are typed according to
3374     * the QAPI schema, but when they come from -drive, they're all
3375     * QString.
3376     */
3377    reference = qdict_get_try_str(options, bdref_key);
3378    if (!filename && !reference && !qdict_size(image_options)) {
3379        if (!allow_none) {
3380            error_setg(errp, "A block device must be specified for \"%s\"",
3381                       bdref_key);
3382        }
3383        qobject_unref(image_options);
3384        goto done;
3385    }
3386
3387    bs = bdrv_open_inherit(filename, reference, image_options, 0,
3388                           parent, child_class, child_role, errp);
3389    if (!bs) {
3390        goto done;
3391    }
3392
3393done:
3394    qdict_del(options, bdref_key);
3395    return bs;
3396}
3397
3398/*
3399 * Opens a disk image whose options are given as BlockdevRef in another block
3400 * device's options.
3401 *
3402 * If allow_none is true, no image will be opened if filename is false and no
3403 * BlockdevRef is given. NULL will be returned, but errp remains unset.
3404 *
3405 * bdrev_key specifies the key for the image's BlockdevRef in the options QDict.
3406 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
3407 * itself, all options starting with "${bdref_key}." are considered part of the
3408 * BlockdevRef.
3409 *
3410 * The BlockdevRef will be removed from the options QDict.
3411 */
3412BdrvChild *bdrv_open_child(const char *filename,
3413                           QDict *options, const char *bdref_key,
3414                           BlockDriverState *parent,
3415                           const BdrvChildClass *child_class,
3416                           BdrvChildRole child_role,
3417                           bool allow_none, Error **errp)
3418{
3419    BlockDriverState *bs;
3420
3421    bs = bdrv_open_child_bs(filename, options, bdref_key, parent, child_class,
3422                            child_role, allow_none, errp);
3423    if (bs == NULL) {
3424        return NULL;
3425    }
3426
3427    return bdrv_attach_child(parent, bs, bdref_key, child_class, child_role,
3428                             errp);
3429}
3430
3431/*
3432 * TODO Future callers may need to specify parent/child_class in order for
3433 * option inheritance to work. Existing callers use it for the root node.
3434 */
3435BlockDriverState *bdrv_open_blockdev_ref(BlockdevRef *ref, Error **errp)
3436{
3437    BlockDriverState *bs = NULL;
3438    QObject *obj = NULL;
3439    QDict *qdict = NULL;
3440    const char *reference = NULL;
3441    Visitor *v = NULL;
3442
3443    if (ref->type == QTYPE_QSTRING) {
3444        reference = ref->u.reference;
3445    } else {
3446        BlockdevOptions *options = &ref->u.definition;
3447        assert(ref->type == QTYPE_QDICT);
3448
3449        v = qobject_output_visitor_new(&obj);
3450        visit_type_BlockdevOptions(v, NULL, &options, &error_abort);
3451        visit_complete(v, &obj);
3452
3453        qdict = qobject_to(QDict, obj);
3454        qdict_flatten(qdict);
3455
3456        /* bdrv_open_inherit() defaults to the values in bdrv_flags (for
3457         * compatibility with other callers) rather than what we want as the
3458         * real defaults. Apply the defaults here instead. */
3459        qdict_set_default_str(qdict, BDRV_OPT_CACHE_DIRECT, "off");
3460        qdict_set_default_str(qdict, BDRV_OPT_CACHE_NO_FLUSH, "off");
3461        qdict_set_default_str(qdict, BDRV_OPT_READ_ONLY, "off");
3462        qdict_set_default_str(qdict, BDRV_OPT_AUTO_READ_ONLY, "off");
3463
3464    }
3465
3466    bs = bdrv_open_inherit(NULL, reference, qdict, 0, NULL, NULL, 0, errp);
3467    obj = NULL;
3468    qobject_unref(obj);
3469    visit_free(v);
3470    return bs;
3471}
3472
3473static BlockDriverState *bdrv_append_temp_snapshot(BlockDriverState *bs,
3474                                                   int flags,
3475                                                   QDict *snapshot_options,
3476                                                   Error **errp)
3477{
3478    /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
3479    char *tmp_filename = g_malloc0(PATH_MAX + 1);
3480    int64_t total_size;
3481    QemuOpts *opts = NULL;
3482    BlockDriverState *bs_snapshot = NULL;
3483    int ret;
3484
3485    /* if snapshot, we create a temporary backing file and open it
3486       instead of opening 'filename' directly */
3487
3488    /* Get the required size from the image */
3489    total_size = bdrv_getlength(bs);
3490    if (total_size < 0) {
3491        error_setg_errno(errp, -total_size, "Could not get image size");
3492        goto out;
3493    }
3494
3495    /* Create the temporary image */
3496    ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
3497    if (ret < 0) {
3498        error_setg_errno(errp, -ret, "Could not get temporary filename");
3499        goto out;
3500    }
3501
3502    opts = qemu_opts_create(bdrv_qcow2.create_opts, NULL, 0,
3503                            &error_abort);
3504    qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size, &error_abort);
3505    ret = bdrv_create(&bdrv_qcow2, tmp_filename, opts, errp);
3506    qemu_opts_del(opts);
3507    if (ret < 0) {
3508        error_prepend(errp, "Could not create temporary overlay '%s': ",
3509                      tmp_filename);
3510        goto out;
3511    }
3512
3513    /* Prepare options QDict for the temporary file */
3514    qdict_put_str(snapshot_options, "file.driver", "file");
3515    qdict_put_str(snapshot_options, "file.filename", tmp_filename);
3516    qdict_put_str(snapshot_options, "driver", "qcow2");
3517
3518    bs_snapshot = bdrv_open(NULL, NULL, snapshot_options, flags, errp);
3519    snapshot_options = NULL;
3520    if (!bs_snapshot) {
3521        goto out;
3522    }
3523
3524    ret = bdrv_append(bs_snapshot, bs, errp);
3525    if (ret < 0) {
3526        bs_snapshot = NULL;
3527        goto out;
3528    }
3529
3530out:
3531    qobject_unref(snapshot_options);
3532    g_free(tmp_filename);
3533    return bs_snapshot;
3534}
3535
3536/*
3537 * Opens a disk image (raw, qcow2, vmdk, ...)
3538 *
3539 * options is a QDict of options to pass to the block drivers, or NULL for an
3540 * empty set of options. The reference to the QDict belongs to the block layer
3541 * after the call (even on failure), so if the caller intends to reuse the
3542 * dictionary, it needs to use qobject_ref() before calling bdrv_open.
3543 *
3544 * If *pbs is NULL, a new BDS will be created with a pointer to it stored there.
3545 * If it is not NULL, the referenced BDS will be reused.
3546 *
3547 * The reference parameter may be used to specify an existing block device which
3548 * should be opened. If specified, neither options nor a filename may be given,
3549 * nor can an existing BDS be reused (that is, *pbs has to be NULL).
3550 */
3551static BlockDriverState *bdrv_open_inherit(const char *filename,
3552                                           const char *reference,
3553                                           QDict *options, int flags,
3554                                           BlockDriverState *parent,
3555                                           const BdrvChildClass *child_class,
3556                                           BdrvChildRole child_role,
3557                                           Error **errp)
3558{
3559    int ret;
3560    BlockBackend *file = NULL;
3561    BlockDriverState *bs;
3562    BlockDriver *drv = NULL;
3563    BdrvChild *child;
3564    const char *drvname;
3565    const char *backing;
3566    Error *local_err = NULL;
3567    QDict *snapshot_options = NULL;
3568    int snapshot_flags = 0;
3569
3570    assert(!child_class || !flags);
3571    assert(!child_class == !parent);
3572
3573    if (reference) {
3574        bool options_non_empty = options ? qdict_size(options) : false;
3575        qobject_unref(options);
3576
3577        if (filename || options_non_empty) {
3578            error_setg(errp, "Cannot reference an existing block device with "
3579                       "additional options or a new filename");
3580            return NULL;
3581        }
3582
3583        bs = bdrv_lookup_bs(reference, reference, errp);
3584        if (!bs) {
3585            return NULL;
3586        }
3587
3588        bdrv_ref(bs);
3589        return bs;
3590    }
3591
3592    bs = bdrv_new();
3593
3594    /* NULL means an empty set of options */
3595    if (options == NULL) {
3596        options = qdict_new();
3597    }
3598
3599    /* json: syntax counts as explicit options, as if in the QDict */
3600    parse_json_protocol(options, &filename, &local_err);
3601    if (local_err) {
3602        goto fail;
3603    }
3604
3605    bs->explicit_options = qdict_clone_shallow(options);
3606
3607    if (child_class) {
3608        bool parent_is_format;
3609
3610        if (parent->drv) {
3611            parent_is_format = parent->drv->is_format;
3612        } else {
3613            /*
3614             * parent->drv is not set yet because this node is opened for
3615             * (potential) format probing.  That means that @parent is going
3616             * to be a format node.
3617             */
3618            parent_is_format = true;
3619        }
3620
3621        bs->inherits_from = parent;
3622        child_class->inherit_options(child_role, parent_is_format,
3623                                     &flags, options,
3624                                     parent->open_flags, parent->options);
3625    }
3626
3627    ret = bdrv_fill_options(&options, filename, &flags, &local_err);
3628    if (ret < 0) {
3629        goto fail;
3630    }
3631
3632    /*
3633     * Set the BDRV_O_RDWR and BDRV_O_ALLOW_RDWR flags.
3634     * Caution: getting a boolean member of @options requires care.
3635     * When @options come from -blockdev or blockdev_add, members are
3636     * typed according to the QAPI schema, but when they come from
3637     * -drive, they're all QString.
3638     */
3639    if (g_strcmp0(qdict_get_try_str(options, BDRV_OPT_READ_ONLY), "on") &&
3640        !qdict_get_try_bool(options, BDRV_OPT_READ_ONLY, false)) {
3641        flags |= (BDRV_O_RDWR | BDRV_O_ALLOW_RDWR);
3642    } else {
3643        flags &= ~BDRV_O_RDWR;
3644    }
3645
3646    if (flags & BDRV_O_SNAPSHOT) {
3647        snapshot_options = qdict_new();
3648        bdrv_temp_snapshot_options(&snapshot_flags, snapshot_options,
3649                                   flags, options);
3650        /* Let bdrv_backing_options() override "read-only" */
3651        qdict_del(options, BDRV_OPT_READ_ONLY);
3652        bdrv_inherited_options(BDRV_CHILD_COW, true,
3653                               &flags, options, flags, options);
3654    }
3655
3656    bs->open_flags = flags;
3657    bs->options = options;
3658    options = qdict_clone_shallow(options);
3659
3660    /* Find the right image format driver */
3661    /* See cautionary note on accessing @options above */
3662    drvname = qdict_get_try_str(options, "driver");
3663    if (drvname) {
3664        drv = bdrv_find_format(drvname);
3665        if (!drv) {
3666            error_setg(errp, "Unknown driver: '%s'", drvname);
3667            goto fail;
3668        }
3669    }
3670
3671    assert(drvname || !(flags & BDRV_O_PROTOCOL));
3672
3673    /* See cautionary note on accessing @options above */
3674    backing = qdict_get_try_str(options, "backing");
3675    if (qobject_to(QNull, qdict_get(options, "backing")) != NULL ||
3676        (backing && *backing == '\0'))
3677    {
3678        if (backing) {
3679            warn_report("Use of \"backing\": \"\" is deprecated; "
3680                        "use \"backing\": null instead");
3681        }
3682        flags |= BDRV_O_NO_BACKING;
3683        qdict_del(bs->explicit_options, "backing");
3684        qdict_del(bs->options, "backing");
3685        qdict_del(options, "backing");
3686    }
3687
3688    /* Open image file without format layer. This BlockBackend is only used for
3689     * probing, the block drivers will do their own bdrv_open_child() for the
3690     * same BDS, which is why we put the node name back into options. */
3691    if ((flags & BDRV_O_PROTOCOL) == 0) {
3692        BlockDriverState *file_bs;
3693
3694        file_bs = bdrv_open_child_bs(filename, options, "file", bs,
3695                                     &child_of_bds, BDRV_CHILD_IMAGE,
3696                                     true, &local_err);
3697        if (local_err) {
3698            goto fail;
3699        }
3700        if (file_bs != NULL) {
3701            /* Not requesting BLK_PERM_CONSISTENT_READ because we're only
3702             * looking at the header to guess the image format. This works even
3703             * in cases where a guest would not see a consistent state. */
3704            file = blk_new(bdrv_get_aio_context(file_bs), 0, BLK_PERM_ALL);
3705            blk_insert_bs(file, file_bs, &local_err);
3706            bdrv_unref(file_bs);
3707            if (local_err) {
3708                goto fail;
3709            }
3710
3711            qdict_put_str(options, "file", bdrv_get_node_name(file_bs));
3712        }
3713    }
3714
3715    /* Image format probing */
3716    bs->probed = !drv;
3717    if (!drv && file) {
3718        ret = find_image_format(file, filename, &drv, &local_err);
3719        if (ret < 0) {
3720            goto fail;
3721        }
3722        /*
3723         * This option update would logically belong in bdrv_fill_options(),
3724         * but we first need to open bs->file for the probing to work, while
3725         * opening bs->file already requires the (mostly) final set of options
3726         * so that cache mode etc. can be inherited.
3727         *
3728         * Adding the driver later is somewhat ugly, but it's not an option
3729         * that would ever be inherited, so it's correct. We just need to make
3730         * sure to update both bs->options (which has the full effective
3731         * options for bs) and options (which has file.* already removed).
3732         */
3733        qdict_put_str(bs->options, "driver", drv->format_name);
3734        qdict_put_str(options, "driver", drv->format_name);
3735    } else if (!drv) {
3736        error_setg(errp, "Must specify either driver or file");
3737        goto fail;
3738    }
3739
3740    /* BDRV_O_PROTOCOL must be set iff a protocol BDS is about to be created */
3741    assert(!!(flags & BDRV_O_PROTOCOL) == !!drv->bdrv_file_open);
3742    /* file must be NULL if a protocol BDS is about to be created
3743     * (the inverse results in an error message from bdrv_open_common()) */
3744    assert(!(flags & BDRV_O_PROTOCOL) || !file);
3745
3746    /* Open the image */
3747    ret = bdrv_open_common(bs, file, options, &local_err);
3748    if (ret < 0) {
3749        goto fail;
3750    }
3751
3752    if (file) {
3753        blk_unref(file);
3754        file = NULL;
3755    }
3756
3757    /* If there is a backing file, use it */
3758    if ((flags & BDRV_O_NO_BACKING) == 0) {
3759        ret = bdrv_open_backing_file(bs, options, "backing", &local_err);
3760        if (ret < 0) {
3761            goto close_and_fail;
3762        }
3763    }
3764
3765    /* Remove all children options and references
3766     * from bs->options and bs->explicit_options */
3767    QLIST_FOREACH(child, &bs->children, next) {
3768        char *child_key_dot;
3769        child_key_dot = g_strdup_printf("%s.", child->name);
3770        qdict_extract_subqdict(bs->explicit_options, NULL, child_key_dot);
3771        qdict_extract_subqdict(bs->options, NULL, child_key_dot);
3772        qdict_del(bs->explicit_options, child->name);
3773        qdict_del(bs->options, child->name);
3774        g_free(child_key_dot);
3775    }
3776
3777    /* Check if any unknown options were used */
3778    if (qdict_size(options) != 0) {
3779        const QDictEntry *entry = qdict_first(options);
3780        if (flags & BDRV_O_PROTOCOL) {
3781            error_setg(errp, "Block protocol '%s' doesn't support the option "
3782                       "'%s'", drv->format_name, entry->key);
3783        } else {
3784            error_setg(errp,
3785                       "Block format '%s' does not support the option '%s'",
3786                       drv->format_name, entry->key);
3787        }
3788
3789        goto close_and_fail;
3790    }
3791
3792    bdrv_parent_cb_change_media(bs, true);
3793
3794    qobject_unref(options);
3795    options = NULL;
3796
3797    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
3798     * temporary snapshot afterwards. */
3799    if (snapshot_flags) {
3800        BlockDriverState *snapshot_bs;
3801        snapshot_bs = bdrv_append_temp_snapshot(bs, snapshot_flags,
3802                                                snapshot_options, &local_err);
3803        snapshot_options = NULL;
3804        if (local_err) {
3805            goto close_and_fail;
3806        }
3807        /* We are not going to return bs but the overlay on top of it
3808         * (snapshot_bs); thus, we have to drop the strong reference to bs
3809         * (which we obtained by calling bdrv_new()). bs will not be deleted,
3810         * though, because the overlay still has a reference to it. */
3811        bdrv_unref(bs);
3812        bs = snapshot_bs;
3813    }
3814
3815    return bs;
3816
3817fail:
3818    blk_unref(file);
3819    qobject_unref(snapshot_options);
3820    qobject_unref(bs->explicit_options);
3821    qobject_unref(bs->options);
3822    qobject_unref(options);
3823    bs->options = NULL;
3824    bs->explicit_options = NULL;
3825    bdrv_unref(bs);
3826    error_propagate(errp, local_err);
3827    return NULL;
3828
3829close_and_fail:
3830    bdrv_unref(bs);
3831    qobject_unref(snapshot_options);
3832    qobject_unref(options);
3833    error_propagate(errp, local_err);
3834    return NULL;
3835}
3836
3837BlockDriverState *bdrv_open(const char *filename, const char *reference,
3838                            QDict *options, int flags, Error **errp)
3839{
3840    return bdrv_open_inherit(filename, reference, options, flags, NULL,
3841                             NULL, 0, errp);
3842}
3843
/*
 * Look up @str in the NULL-terminated string array @list.
 *
 * Returns true if some element of @list compares equal to @str.
 * Returns false if there is no match, or if @str or @list is NULL.
 */
static bool is_str_in_list(const char *str, const char *const *list)
{
    const char *const *item;

    if (!str || !list) {
        return false;
    }

    /* Walk the array until the terminating NULL element */
    for (item = list; *item; item++) {
        if (strcmp(str, *item) == 0) {
            return true;
        }
    }

    return false;
}
3857
3858/*
3859 * Check that every option set in @bs->options is also set in
3860 * @new_opts.
3861 *
3862 * Options listed in the common_options list and in
3863 * @bs->drv->mutable_opts are skipped.
3864 *
3865 * Return 0 on success, otherwise return -EINVAL and set @errp.
3866 */
3867static int bdrv_reset_options_allowed(BlockDriverState *bs,
3868                                      const QDict *new_opts, Error **errp)
3869{
3870    const QDictEntry *e;
3871    /* These options are common to all block drivers and are handled
3872     * in bdrv_reopen_prepare() so they can be left out of @new_opts */
3873    const char *const common_options[] = {
3874        "node-name", "discard", "cache.direct", "cache.no-flush",
3875        "read-only", "auto-read-only", "detect-zeroes", NULL
3876    };
3877
3878    for (e = qdict_first(bs->options); e; e = qdict_next(bs->options, e)) {
3879        if (!qdict_haskey(new_opts, e->key) &&
3880            !is_str_in_list(e->key, common_options) &&
3881            !is_str_in_list(e->key, bs->drv->mutable_opts)) {
3882            error_setg(errp, "Option '%s' cannot be reset "
3883                       "to its default value", e->key);
3884            return -EINVAL;
3885        }
3886    }
3887
3888    return 0;
3889}
3890
3891/*
3892 * Returns true if @child can be reached recursively from @bs
3893 */
3894static bool bdrv_recurse_has_child(BlockDriverState *bs,
3895                                   BlockDriverState *child)
3896{
3897    BdrvChild *c;
3898
3899    if (bs == child) {
3900        return true;
3901    }
3902
3903    QLIST_FOREACH(c, &bs->children, next) {
3904        if (bdrv_recurse_has_child(c->bs, child)) {
3905            return true;
3906        }
3907    }
3908
3909    return false;
3910}
3911
/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had QTAILQ_INIT
 * already performed, or alternatively may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue. If bs already has
 * an entry in the queue, that entry is updated rather than a second one
 * being added.
 *
 * options contains the changed options for the associated bs
 * (the BlockReopenQueue takes ownership). NULL is treated like an empty
 * QDict.
 *
 * klass, role, parent_is_format, parent_options and parent_flags describe
 * the parent this call recursed from. They are only used if parent_options
 * is non-NULL: in that case klass->inherit_options() fills in the initial
 * flags and the inherited options. Otherwise the current flags of bs are
 * the starting point.
 *
 * keep_old_opts: if true, options that are not given in options keep the
 * values bs currently has; if false, they are not filled in from bs and
 * the entry records whether 'backing' was left out.
 *
 * Children of bs whose options are inherited from bs are queued
 * recursively.
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 * bs must be drained between bdrv_reopen_queue() and bdrv_reopen_multiple().
 */
static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue,
                                                 BlockDriverState *bs,
                                                 QDict *options,
                                                 const BdrvChildClass *klass,
                                                 BdrvChildRole role,
                                                 bool parent_is_format,
                                                 QDict *parent_options,
                                                 int parent_flags,
                                                 bool keep_old_opts)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    BdrvChild *child;
    QDict *old_options, *explicit_options, *options_copy;
    int flags;
    QemuOpts *opts;

    /* Make sure that the caller remembered to use a drained section. This is
     * important to avoid graph changes between the recursive queuing here and
     * bdrv_reopen_multiple(). */
    assert(bs->quiesce_counter > 0);

    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QTAILQ_INIT(bs_queue);
    }

    if (!options) {
        options = qdict_new();
    }

    /* Check if this BlockDriverState is already in the queue. The code
     * below relies on bs_entry being NULL when no entry matched. */
    QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bs == bs_entry->state.bs) {
            break;
        }
    }

    /*
     * Precedence of options:
     * 1. Explicitly passed in options (highest)
     * 2. Retained from explicitly set options of bs
     * 3. Inherited from parent node
     * 4. Retained from effective options of bs
     */

    /* Old explicitly set values (don't overwrite by inherited value).
     * For a node that is already queued, the explicit options of its queue
     * entry are used instead of those of bs. */
    if (bs_entry || keep_old_opts) {
        old_options = qdict_clone_shallow(bs_entry ?
                                          bs_entry->state.explicit_options :
                                          bs->explicit_options);
        bdrv_join_options(bs, options, old_options);
        qobject_unref(old_options);
    }

    /* Snapshot taken before inherited and old effective values are merged
     * into options below */
    explicit_options = qdict_clone_shallow(options);

    /* Inherit from parent node */
    if (parent_options) {
        flags = 0;
        klass->inherit_options(role, parent_is_format, &flags, options,
                               parent_flags, parent_options);
    } else {
        flags = bdrv_get_flags(bs);
    }

    if (keep_old_opts) {
        /* Old values are used for options that aren't set yet */
        old_options = qdict_clone_shallow(bs->options);
        bdrv_join_options(bs, options, old_options);
        qobject_unref(old_options);
    }

    /* We have the final set of options so let's update the flags.
     * This works on a throwaway copy so that options itself keeps all of
     * its entries. */
    options_copy = qdict_clone_shallow(options);
    opts = qemu_opts_create(&bdrv_runtime_opts, NULL, 0, &error_abort);
    qemu_opts_absorb_qdict(opts, options_copy, NULL);
    update_flags_from_options(&flags, opts);
    qemu_opts_del(opts);
    qobject_unref(options_copy);

    /* bdrv_open_inherit() sets and clears some additional flags internally */
    flags &= ~BDRV_O_PROTOCOL;
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }

    if (!bs_entry) {
        bs_entry = g_new0(BlockReopenQueueEntry, 1);
        QTAILQ_INSERT_TAIL(bs_queue, bs_entry, entry);
    } else {
        /* Reusing an existing entry: drop the option dicts it held so far */
        qobject_unref(bs_entry->state.options);
        qobject_unref(bs_entry->state.explicit_options);
    }

    bs_entry->state.bs = bs;
    bs_entry->state.options = options;
    bs_entry->state.explicit_options = explicit_options;
    bs_entry->state.flags = flags;

    /*
     * If keep_old_opts is false then it means that unspecified
     * options must be reset to their original value. We don't allow
     * resetting 'backing' but we need to know if the option is
     * missing in order to decide if we have to return an error.
     */
    if (!keep_old_opts) {
        bs_entry->state.backing_missing =
            !qdict_haskey(options, "backing") &&
            !qdict_haskey(options, "backing.driver");
    }

    QLIST_FOREACH(child, &bs->children, next) {
        QDict *new_child_options = NULL;
        bool child_keep_old = keep_old_opts;

        /* reopen can only change the options of block devices that were
         * implicitly created and inherited options. For other (referenced)
         * block devices, a syntax like "backing.foo" results in an error. */
        if (child->bs->inherits_from != bs) {
            continue;
        }

        /* Check if the options contain a child reference */
        if (qdict_haskey(options, child->name)) {
            const char *childref = qdict_get_try_str(options, child->name);
            /*
             * The current child must not be reopened if the child
             * reference is null or points to a different node.
             */
            if (g_strcmp0(childref, child->bs->node_name)) {
                continue;
            }
            /*
             * If the child reference points to the current child then
             * reopen it with its existing set of options (note that
             * it can still inherit new options from the parent).
             */
            child_keep_old = true;
        } else {
            /* Extract child options ("child-name.*") */
            char *child_key_dot = g_strdup_printf("%s.", child->name);
            qdict_extract_subqdict(explicit_options, NULL, child_key_dot);
            qdict_extract_subqdict(options, &new_child_options, child_key_dot);
            g_free(child_key_dot);
        }

        /* Recurse with the options and flags just computed for bs acting as
         * the parent's values. new_child_options is still NULL when the
         * child was given by reference. */
        bdrv_reopen_queue_child(bs_queue, child->bs, new_child_options,
                                child->klass, child->role, bs->drv->is_format,
                                options, flags, child_keep_old);
    }

    return bs_queue;
}
4089
4090BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
4091                                    BlockDriverState *bs,
4092                                    QDict *options, bool keep_old_opts)
4093{
4094    return bdrv_reopen_queue_child(bs_queue, bs, options, NULL, 0, false,
4095                                   NULL, 0, keep_old_opts);
4096}
4097
4098void bdrv_reopen_queue_free(BlockReopenQueue *bs_queue)
4099{
4100    if (bs_queue) {
4101        BlockReopenQueueEntry *bs_entry, *next;
4102        QTAILQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
4103            qobject_unref(bs_entry->state.explicit_options);
4104            qobject_unref(bs_entry->state.options);
4105            g_free(bs_entry);
4106        }
4107        g_free(bs_queue);
4108    }
4109}
4110
/*
 * Reopen multiple BlockDriverStates atomically & transactionally.
 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue(). It must not be NULL.
 *
 * Reopens all BDS specified in the queue, with the appropriate
 * flags.  All devices are prepared for reopen, and failure of any
 * device will cause all device changes to be abandoned, and intermediate
 * data cleaned up.
 *
 * If all devices prepare successfully, then the changes are committed
 * to all devices.
 *
 * All affected nodes must be drained between bdrv_reopen_queue() and
 * bdrv_reopen_multiple().
 *
 * To be called from the main thread, with all other AioContexts unlocked.
 *
 * bs_queue is freed before returning, on success as well as on failure.
 *
 * Returns 0 on success. On failure, returns the negative value of the step
 * that failed (flush, prepare or permission refresh) and sets errp.
 */
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
{
    int ret = -1;
    BlockReopenQueueEntry *bs_entry, *next;
    AioContext *ctx;
    Transaction *tran = tran_new();
    g_autoptr(GHashTable) found = NULL;
    g_autoptr(GSList) refresh_list = NULL;

    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    assert(bs_queue != NULL);

    /* Step 1: flush every queued node, each under its own AioContext lock */
    QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
        ctx = bdrv_get_aio_context(bs_entry->state.bs);
        aio_context_acquire(ctx);
        ret = bdrv_flush(bs_entry->state.bs);
        aio_context_release(ctx);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Error flushing drive");
            goto abort;
        }
    }

    /* Step 2: prepare every node. An entry is marked as prepared only after
     * its prepare step succeeded, so that the abort path below skips the
     * entry that failed and all entries after it. */
    QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
        assert(bs_entry->state.bs->quiesce_counter > 0);
        ctx = bdrv_get_aio_context(bs_entry->state.bs);
        aio_context_acquire(ctx);
        ret = bdrv_reopen_prepare(&bs_entry->state, bs_queue, tran, errp);
        aio_context_release(ctx);
        if (ret < 0) {
            goto abort;
        }
        bs_entry->prepared = true;
    }

    /* Step 3: collect the nodes whose permissions have to be refreshed: the
     * queued nodes plus any old backing/file child recorded by the prepare
     * step. */
    found = g_hash_table_new(NULL, NULL);
    QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
        BDRVReopenState *state = &bs_entry->state;

        refresh_list = bdrv_topological_dfs(refresh_list, found, state->bs);
        if (state->old_backing_bs) {
            refresh_list = bdrv_topological_dfs(refresh_list, found,
                                                state->old_backing_bs);
        }
        if (state->old_file_bs) {
            refresh_list = bdrv_topological_dfs(refresh_list, found,
                                                state->old_file_bs);
        }
    }

    /*
     * Note that the file-posix driver relies on the permission update done
     * during reopen (even if no permission changed), because it wants "new"
     * permissions for reconfiguring the fd, and that's why it does it in
     * raw_check_perm(), not in raw_reopen_prepare(), which is called with
     * "old" permissions.
     */
    ret = bdrv_list_refresh_perms(refresh_list, bs_queue, tran, errp);
    if (ret < 0) {
        goto abort;
    }

    /*
     * If we reach this point, we have success and just need to apply the
     * changes.
     *
     * Reverse order is used to accommodate the qcow2 driver: on commit it
     * needs to write the IN_USE flag to the image, to mark bitmaps in the
     * image as invalid. But children usually come after parents in the
     * reopen queue, so go from the last to the first element.
     */
    QTAILQ_FOREACH_REVERSE(bs_entry, bs_queue, entry) {
        ctx = bdrv_get_aio_context(bs_entry->state.bs);
        aio_context_acquire(ctx);
        bdrv_reopen_commit(&bs_entry->state);
        aio_context_release(ctx);
    }

    tran_commit(tran);

    /* Optional per-driver hook, run only after the transaction has been
     * committed */
    QTAILQ_FOREACH_REVERSE(bs_entry, bs_queue, entry) {
        BlockDriverState *bs = bs_entry->state.bs;

        if (bs->drv->bdrv_reopen_commit_post) {
            ctx = bdrv_get_aio_context(bs);
            aio_context_acquire(ctx);
            bs->drv->bdrv_reopen_commit_post(&bs_entry->state);
            aio_context_release(ctx);
        }
    }

    ret = 0;
    goto cleanup;

abort:
    /* Undo the transaction first, then abort every entry that had been
     * prepared successfully */
    tran_abort(tran);
    QTAILQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
        if (bs_entry->prepared) {
            ctx = bdrv_get_aio_context(bs_entry->state.bs);
            aio_context_acquire(ctx);
            bdrv_reopen_abort(&bs_entry->state);
            aio_context_release(ctx);
        }
    }

cleanup:
    /* Shared by the success and the failure path */
    bdrv_reopen_queue_free(bs_queue);

    return ret;
}
4239
4240int bdrv_reopen(BlockDriverState *bs, QDict *opts, bool keep_old_opts,
4241                Error **errp)
4242{
4243    AioContext *ctx = bdrv_get_aio_context(bs);
4244    BlockReopenQueue *queue;
4245    int ret;
4246
4247    bdrv_subtree_drained_begin(bs);
4248    if (ctx != qemu_get_aio_context()) {
4249        aio_context_release(ctx);
4250    }
4251
4252    queue = bdrv_reopen_queue(NULL, bs, opts, keep_old_opts);
4253    ret = bdrv_reopen_multiple(queue, errp);
4254
4255    if (ctx != qemu_get_aio_context()) {
4256        aio_context_acquire(ctx);
4257    }
4258    bdrv_subtree_drained_end(bs);
4259
4260    return ret;
4261}
4262
4263int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only,
4264                              Error **errp)
4265{
4266    QDict *opts = qdict_new();
4267
4268    qdict_put_bool(opts, BDRV_OPT_READ_ONLY, read_only);
4269
4270    return bdrv_reopen(bs, opts, true, errp);
4271}
4272
4273/*
4274 * Take a BDRVReopenState and check if the value of 'backing' in the
4275 * reopen_state->options QDict is valid or not.
4276 *
4277 * If 'backing' is missing from the QDict then return 0.
4278 *
4279 * If 'backing' contains the node name of the backing file of
4280 * reopen_state->bs then return 0.
4281 *
4282 * If 'backing' contains a different node name (or is null) then check
4283 * whether the current backing file can be replaced with the new one.
4284 * If that's the case then reopen_state->replace_backing_bs is set to
4285 * true and reopen_state->new_backing_bs contains a pointer to the new
4286 * backing BlockDriverState (or NULL).
4287 *
4288 * Return 0 on success, otherwise return < 0 and set @errp.
4289 */
4290static int bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state,
4291                                             bool is_backing, Transaction *tran,
4292                                             Error **errp)
4293{
4294    BlockDriverState *bs = reopen_state->bs;
4295    BlockDriverState *new_child_bs;
4296    BlockDriverState *old_child_bs = is_backing ? child_bs(bs->backing) :
4297                                                  child_bs(bs->file);
4298    const char *child_name = is_backing ? "backing" : "file";
4299    QObject *value;
4300    const char *str;
4301
4302    value = qdict_get(reopen_state->options, child_name);
4303    if (value == NULL) {
4304        return 0;
4305    }
4306
4307    switch (qobject_type(value)) {
4308    case QTYPE_QNULL:
4309        assert(is_backing); /* The 'file' option does not allow a null value */
4310        new_child_bs = NULL;
4311        break;
4312    case QTYPE_QSTRING:
4313        str = qstring_get_str(qobject_to(QString, value));
4314        new_child_bs = bdrv_lookup_bs(NULL, str, errp);
4315        if (new_child_bs == NULL) {
4316            return -EINVAL;
4317        } else if (bdrv_recurse_has_child(new_child_bs, bs)) {
4318            error_setg(errp, "Making '%s' a %s child of '%s' would create a "
4319                       "cycle", str, child_name, bs->node_name);
4320            return -EINVAL;
4321        }
4322        break;
4323    default:
4324        /*
4325         * The options QDict has been flattened, so 'backing' and 'file'
4326         * do not allow any other data type here.
4327         */
4328        g_assert_not_reached();
4329    }
4330
4331    if (old_child_bs == new_child_bs) {
4332        return 0;
4333    }
4334
4335    if (old_child_bs) {
4336        if (bdrv_skip_implicit_filters(old_child_bs) == new_child_bs) {
4337            return 0;
4338        }
4339
4340        if (old_child_bs->implicit) {
4341            error_setg(errp, "Cannot replace implicit %s child of %s",
4342                       child_name, bs->node_name);
4343            return -EPERM;
4344        }
4345    }
4346
4347    if (bs->drv->is_filter && !old_child_bs) {
4348        /*
4349         * Filters always have a file or a backing child, so we are trying to
4350         * change wrong child
4351         */
4352        error_setg(errp, "'%s' is a %s filter node that does not support a "
4353                   "%s child", bs->node_name, bs->drv->format_name, child_name);
4354        return -EINVAL;
4355    }
4356
4357    if (is_backing) {
4358        reopen_state->old_backing_bs = old_child_bs;
4359    } else {
4360        reopen_state->old_file_bs = old_child_bs;
4361    }
4362
4363    return bdrv_set_file_or_backing_noperm(bs, new_child_bs, is_backing,
4364                                           tran, errp);
4365}
4366
/*
 * Prepares a BlockDriverState for reopen. All changes are staged in the
 * 'opaque' field of the BDRVReopenState, which is used and allocated by
 * the block driver layer .bdrv_reopen_prepare()
 *
 * reopen_state describes the reopen: reopen_state->bs is the
 *     BlockDriverState to reopen, reopen_state->options are the new
 *     options and reopen_state->flags are the new open flags
 * queue is the reopen queue; it is handed to the driver's
 *     .bdrv_reopen_prepare()
 * change_child_tran is the transaction that collects the changes of the
 *     'backing' and 'file' children
 *
 * Returns 0 on success, non-zero on error.  On error errp will be set
 * as well.
 *
 * If the driver's .bdrv_reopen_prepare() succeeded but a later check in
 * this function fails, the driver's .bdrv_reopen_abort() (if any) is
 * called here before returning.
 * It is the responsibility of the caller to then call the abort() or
 * commit() for any other BDS that have been left in a prepare() state
 *
 */
static int bdrv_reopen_prepare(BDRVReopenState *reopen_state,
                               BlockReopenQueue *queue,
                               Transaction *change_child_tran, Error **errp)
{
    int ret = -1;
    int old_flags;
    Error *local_err = NULL;
    BlockDriver *drv;
    QemuOpts *opts;
    QDict *orig_reopen_opts;
    char *discard = NULL;
    bool read_only;
    bool drv_prepared = false;

    assert(reopen_state != NULL);
    assert(reopen_state->bs->drv != NULL);
    drv = reopen_state->bs->drv;

    /* This function and each driver's bdrv_reopen_prepare() remove
     * entries from reopen_state->options as they are processed, so
     * we need to make a copy of the original QDict. */
    orig_reopen_opts = qdict_clone_shallow(reopen_state->options);

    /* Process generic block layer options */
    opts = qemu_opts_create(&bdrv_runtime_opts, NULL, 0, &error_abort);
    if (!qemu_opts_absorb_qdict(opts, reopen_state->options, errp)) {
        ret = -EINVAL;
        goto error;
    }

    /* This was already called in bdrv_reopen_queue_child() so the flags
     * are up-to-date. This time we simply want to remove the options from
     * QemuOpts in order to indicate that they have been processed. */
    old_flags = reopen_state->flags;
    update_flags_from_options(&reopen_state->flags, opts);
    assert(old_flags == reopen_state->flags);

    /* discard is owned by this function and freed at the end */
    discard = qemu_opt_get_del(opts, BDRV_OPT_DISCARD);
    if (discard != NULL) {
        if (bdrv_parse_discard_flags(discard, &reopen_state->flags) != 0) {
            error_setg(errp, "Invalid discard option");
            ret = -EINVAL;
            goto error;
        }
    }

    reopen_state->detect_zeroes =
        bdrv_parse_detect_zeroes(opts, reopen_state->flags, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto error;
    }

    /* All other options (including node-name and driver) must be unchanged.
     * Put them back into the QDict, so that they are checked at the end
     * of this function. */
    qemu_opts_to_qdict(opts, reopen_state->options);

    /* If we are to stay read-only, do not allow permission change
     * to r/w. Attempting to set to r/w may fail if either BDRV_O_ALLOW_RDWR is
     * not set, or if the BDS still has copy_on_read enabled */
    read_only = !(reopen_state->flags & BDRV_O_RDWR);
    ret = bdrv_can_set_read_only(reopen_state->bs, read_only, true, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        goto error;
    }

    if (drv->bdrv_reopen_prepare) {
        /*
         * If a driver-specific option is missing, it means that we
         * should reset it to its default value.
         * But not all options allow that, so we need to check it first.
         */
        ret = bdrv_reset_options_allowed(reopen_state->bs,
                                         reopen_state->options, errp);
        if (ret) {
            goto error;
        }

        ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
        if (ret) {
            if (local_err != NULL) {
                error_propagate(errp, local_err);
            } else {
                /* The driver failed without setting an error, so report a
                 * generic one naming the image */
                bdrv_refresh_filename(reopen_state->bs);
                error_setg(errp, "failed while preparing to reopen image '%s'",
                           reopen_state->bs->filename);
            }
            goto error;
        }
    } else {
        /* It is currently mandatory to have a bdrv_reopen_prepare()
         * handler for each supported drv. */
        error_setg(errp, "Block format '%s' used by node '%s' "
                   "does not support reopening files", drv->format_name,
                   bdrv_get_device_or_node_name(reopen_state->bs));
        ret = -1;
        goto error;
    }

    /* From here on, a failure has to be reported to the driver through
     * .bdrv_reopen_abort(); see the error label */
    drv_prepared = true;

    /*
     * We must provide the 'backing' option if the BDS has a backing
     * file or if the image file has a backing file name as part of
     * its metadata. Otherwise the 'backing' option can be omitted.
     */
    if (drv->supports_backing && reopen_state->backing_missing &&
        (reopen_state->bs->backing || reopen_state->bs->backing_file[0])) {
        error_setg(errp, "backing is missing for '%s'",
                   reopen_state->bs->node_name);
        ret = -EINVAL;
        goto error;
    }

    /*
     * Allow changing the 'backing' option. The new value can be
     * either a reference to an existing node (using its node name)
     * or NULL to simply detach the current backing file.
     */
    ret = bdrv_reopen_parse_file_or_backing(reopen_state, true,
                                            change_child_tran, errp);
    if (ret < 0) {
        goto error;
    }
    qdict_del(reopen_state->options, "backing");

    /* Allow changing the 'file' option. In this case NULL is not allowed */
    ret = bdrv_reopen_parse_file_or_backing(reopen_state, false,
                                            change_child_tran, errp);
    if (ret < 0) {
        goto error;
    }
    qdict_del(reopen_state->options, "file");

    /* Options that are not handled are only okay if they are unchanged
     * compared to the old state. It is expected that some options are only
     * used for the initial open, but not reopen (e.g. filename) */
    if (qdict_size(reopen_state->options)) {
        const QDictEntry *entry = qdict_first(reopen_state->options);

        do {
            QObject *new = entry->value;
            QObject *old = qdict_get(reopen_state->bs->options, entry->key);

            /* Allow child references (child_name=node_name) as long as they
             * point to the current child (i.e. everything stays the same). */
            if (qobject_type(new) == QTYPE_QSTRING) {
                BdrvChild *child;
                QLIST_FOREACH(child, &reopen_state->bs->children, next) {
                    if (!strcmp(child->name, entry->key)) {
                        break;
                    }
                }

                if (child) {
                    if (!strcmp(child->bs->node_name,
                                qstring_get_str(qobject_to(QString, new)))) {
                        /* 'continue' inside do/while jumps to the loop
                         * condition, so entry still advances */
                        continue; /* Found child with this name, skip option */
                    }
                }
            }

            /*
             * TODO: When using -drive to specify blockdev options, all values
             * will be strings; however, when using -blockdev, blockdev-add or
             * filenames using the json:{} pseudo-protocol, they will be
             * correctly typed.
             * In contrast, reopening options are (currently) always strings
             * (because you can only specify them through qemu-io; all other
             * callers do not specify any options).
             * Therefore, when using anything other than -drive to create a BDS,
             * this cannot detect non-string options as unchanged, because
             * qobject_is_equal() always returns false for objects of different
             * type.  In the future, this should be remedied by correctly typing
             * all options.  For now, this is not too big of an issue because
             * the user can simply omit options which cannot be changed anyway,
             * so they will stay unchanged.
             */
            if (!qobject_is_equal(new, old)) {
                error_setg(errp, "Cannot change the option '%s'", entry->key);
                ret = -EINVAL;
                goto error;
            }
        } while ((entry = qdict_next(reopen_state->options, entry)));
    }

    ret = 0;

    /* Restore the original reopen_state->options QDict */
    qobject_unref(reopen_state->options);
    reopen_state->options = qobject_ref(orig_reopen_opts);

error:
    /* This label is also reached on success (with ret == 0); everything
     * below the abort handling is common cleanup */
    if (ret < 0 && drv_prepared) {
        /* drv->bdrv_reopen_prepare() has succeeded, so we need to
         * call drv->bdrv_reopen_abort() before signaling an error
         * (bdrv_reopen_multiple() will not call bdrv_reopen_abort()
         * when the respective bdrv_reopen_prepare() has failed) */
        if (drv->bdrv_reopen_abort) {
            drv->bdrv_reopen_abort(reopen_state);
        }
    }
    qemu_opts_del(opts);
    qobject_unref(orig_reopen_opts);
    g_free(discard);
    return ret;
}
4594
4595/*
4596 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
4597 * makes them final by swapping the staging BlockDriverState contents into
4598 * the active BlockDriverState contents.
4599 */
4600static void bdrv_reopen_commit(BDRVReopenState *reopen_state)
4601{
4602    BlockDriver *drv;
4603    BlockDriverState *bs;
4604    BdrvChild *child;
4605
4606    assert(reopen_state != NULL);
4607    bs = reopen_state->bs;
4608    drv = bs->drv;
4609    assert(drv != NULL);
4610
4611    /* If there are any driver level actions to take */
4612    if (drv->bdrv_reopen_commit) {
4613        drv->bdrv_reopen_commit(reopen_state);
4614    }
4615
4616    /* set BDS specific flags now */
4617    qobject_unref(bs->explicit_options);
4618    qobject_unref(bs->options);
4619    qobject_ref(reopen_state->explicit_options);
4620    qobject_ref(reopen_state->options);
4621
4622    bs->explicit_options   = reopen_state->explicit_options;
4623    bs->options            = reopen_state->options;
4624    bs->open_flags         = reopen_state->flags;
4625    bs->detect_zeroes      = reopen_state->detect_zeroes;
4626
4627    /* Remove child references from bs->options and bs->explicit_options.
4628     * Child options were already removed in bdrv_reopen_queue_child() */
4629    QLIST_FOREACH(child, &bs->children, next) {
4630        qdict_del(bs->explicit_options, child->name);
4631        qdict_del(bs->options, child->name);
4632    }
4633    /* backing is probably removed, so it's not handled by previous loop */
4634    qdict_del(bs->explicit_options, "backing");
4635    qdict_del(bs->options, "backing");
4636
4637    bdrv_refresh_limits(bs, NULL, NULL);
4638}
4639
4640/*
4641 * Abort the reopen, and delete and free the staged changes in
4642 * reopen_state
4643 */
4644static void bdrv_reopen_abort(BDRVReopenState *reopen_state)
4645{
4646    BlockDriver *drv;
4647
4648    assert(reopen_state != NULL);
4649    drv = reopen_state->bs->drv;
4650    assert(drv != NULL);
4651
4652    if (drv->bdrv_reopen_abort) {
4653        drv->bdrv_reopen_abort(reopen_state);
4654    }
4655}
4656
4657
4658static void bdrv_close(BlockDriverState *bs)
4659{
4660    BdrvAioNotifier *ban, *ban_next;
4661    BdrvChild *child, *next;
4662
4663    assert(!bs->refcnt);
4664
4665    bdrv_drained_begin(bs); /* complete I/O */
4666    bdrv_flush(bs);
4667    bdrv_drain(bs); /* in case flush left pending I/O */
4668
4669    if (bs->drv) {
4670        if (bs->drv->bdrv_close) {
4671            /* Must unfreeze all children, so bdrv_unref_child() works */
4672            bs->drv->bdrv_close(bs);
4673        }
4674        bs->drv = NULL;
4675    }
4676
4677    QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
4678        bdrv_unref_child(bs, child);
4679    }
4680
4681    bs->backing = NULL;
4682    bs->file = NULL;
4683    g_free(bs->opaque);
4684    bs->opaque = NULL;
4685    qatomic_set(&bs->copy_on_read, 0);
4686    bs->backing_file[0] = '\0';
4687    bs->backing_format[0] = '\0';
4688    bs->total_sectors = 0;
4689    bs->encrypted = false;
4690    bs->sg = false;
4691    qobject_unref(bs->options);
4692    qobject_unref(bs->explicit_options);
4693    bs->options = NULL;
4694    bs->explicit_options = NULL;
4695    qobject_unref(bs->full_open_options);
4696    bs->full_open_options = NULL;
4697
4698    bdrv_release_named_dirty_bitmaps(bs);
4699    assert(QLIST_EMPTY(&bs->dirty_bitmaps));
4700
4701    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
4702        g_free(ban);
4703    }
4704    QLIST_INIT(&bs->aio_notifiers);
4705    bdrv_drained_end(bs);
4706
4707    /*
4708     * If we're still inside some bdrv_drain_all_begin()/end() sections, end
4709     * them now since this BDS won't exist anymore when bdrv_drain_all_end()
4710     * gets called.
4711     */
4712    if (bs->quiesce_counter) {
4713        bdrv_drain_all_end_quiesce(bs);
4714    }
4715}
4716
4717void bdrv_close_all(void)
4718{
4719    assert(job_next(NULL) == NULL);
4720
4721    /* Drop references from requests still in flight, such as canceled block
4722     * jobs whose AIO context has not been polled yet */
4723    bdrv_drain_all();
4724
4725    blk_remove_all_bs();
4726    blockdev_close_all_bdrv_states();
4727
4728    assert(QTAILQ_EMPTY(&all_bdrv_states));
4729}
4730
4731static bool should_update_child(BdrvChild *c, BlockDriverState *to)
4732{
4733    GQueue *queue;
4734    GHashTable *found;
4735    bool ret;
4736
4737    if (c->klass->stay_at_node) {
4738        return false;
4739    }
4740
4741    /* If the child @c belongs to the BDS @to, replacing the current
4742     * c->bs by @to would mean to create a loop.
4743     *
4744     * Such a case occurs when appending a BDS to a backing chain.
4745     * For instance, imagine the following chain:
4746     *
4747     *   guest device -> node A -> further backing chain...
4748     *
4749     * Now we create a new BDS B which we want to put on top of this
4750     * chain, so we first attach A as its backing node:
4751     *
4752     *                   node B
4753     *                     |
4754     *                     v
4755     *   guest device -> node A -> further backing chain...
4756     *
4757     * Finally we want to replace A by B.  When doing that, we want to
4758     * replace all pointers to A by pointers to B -- except for the
4759     * pointer from B because (1) that would create a loop, and (2)
4760     * that pointer should simply stay intact:
4761     *
4762     *   guest device -> node B
4763     *                     |
4764     *                     v
4765     *                   node A -> further backing chain...
4766     *
4767     * In general, when replacing a node A (c->bs) by a node B (@to),
4768     * if A is a child of B, that means we cannot replace A by B there
4769     * because that would create a loop.  Silently detaching A from B
4770     * is also not really an option.  So overall just leaving A in
4771     * place there is the most sensible choice.
4772     *
4773     * We would also create a loop in any cases where @c is only
4774     * indirectly referenced by @to. Prevent this by returning false
4775     * if @c is found (by breadth-first search) anywhere in the whole
4776     * subtree of @to.
4777     */
4778
4779    ret = true;
4780    found = g_hash_table_new(NULL, NULL);
4781    g_hash_table_add(found, to);
4782    queue = g_queue_new();
4783    g_queue_push_tail(queue, to);
4784
4785    while (!g_queue_is_empty(queue)) {
4786        BlockDriverState *v = g_queue_pop_head(queue);
4787        BdrvChild *c2;
4788
4789        QLIST_FOREACH(c2, &v->children, next) {
4790            if (c2 == c) {
4791                ret = false;
4792                break;
4793            }
4794
4795            if (g_hash_table_contains(found, c2->bs)) {
4796                continue;
4797            }
4798
4799            g_queue_push_tail(queue, c2->bs);
4800            g_hash_table_add(found, c2->bs);
4801        }
4802    }
4803
4804    g_queue_free(queue);
4805    g_hash_table_destroy(found);
4806
4807    return ret;
4808}
4809
/*
 * Transaction state recorded by bdrv_remove_file_or_backing_child() so that
 * the removal of a child can be finalized on commit or undone on abort.
 */
typedef struct BdrvRemoveFilterOrCowChild {
    /* The child that was unlinked from its parent's children list */
    BdrvChild *child;
    /* True if @child was the parent's .backing link, false if it was .file */
    bool is_backing;
} BdrvRemoveFilterOrCowChild;
4814
4815static void bdrv_remove_filter_or_cow_child_abort(void *opaque)
4816{
4817    BdrvRemoveFilterOrCowChild *s = opaque;
4818    BlockDriverState *parent_bs = s->child->opaque;
4819
4820    QLIST_INSERT_HEAD(&parent_bs->children, s->child, next);
4821    if (s->is_backing) {
4822        parent_bs->backing = s->child;
4823    } else {
4824        parent_bs->file = s->child;
4825    }
4826
4827    /*
4828     * We don't have to restore child->bs here to undo bdrv_replace_child_tran()
4829     * because that function is transactionable and it registered own completion
4830     * entries in @tran, so .abort() for bdrv_replace_child_safe() will be
4831     * called automatically.
4832     */
4833}
4834
4835static void bdrv_remove_filter_or_cow_child_commit(void *opaque)
4836{
4837    BdrvRemoveFilterOrCowChild *s = opaque;
4838
4839    bdrv_child_free(s->child);
4840}
4841
4842static TransactionActionDrv bdrv_remove_filter_or_cow_child_drv = {
4843    .abort = bdrv_remove_filter_or_cow_child_abort,
4844    .commit = bdrv_remove_filter_or_cow_child_commit,
4845    .clean = g_free,
4846};
4847
4848/*
4849 * A function to remove backing or file child of @bs.
4850 * Function doesn't update permissions, caller is responsible for this.
4851 */
4852static void bdrv_remove_file_or_backing_child(BlockDriverState *bs,
4853                                              BdrvChild *child,
4854                                              Transaction *tran)
4855{
4856    BdrvRemoveFilterOrCowChild *s;
4857
4858    assert(child == bs->backing || child == bs->file);
4859
4860    if (!child) {
4861        return;
4862    }
4863
4864    if (child->bs) {
4865        bdrv_replace_child_tran(child, NULL, tran);
4866    }
4867
4868    s = g_new(BdrvRemoveFilterOrCowChild, 1);
4869    *s = (BdrvRemoveFilterOrCowChild) {
4870        .child = child,
4871        .is_backing = (child == bs->backing),
4872    };
4873    tran_add(tran, &bdrv_remove_filter_or_cow_child_drv, s);
4874
4875    QLIST_SAFE_REMOVE(child, next);
4876    if (s->is_backing) {
4877        bs->backing = NULL;
4878    } else {
4879        bs->file = NULL;
4880    }
4881}
4882
4883/*
4884 * A function to remove backing-chain child of @bs if exists: cow child for
4885 * format nodes (always .backing) and filter child for filters (may be .file or
4886 * .backing)
4887 */
4888static void bdrv_remove_filter_or_cow_child(BlockDriverState *bs,
4889                                            Transaction *tran)
4890{
4891    bdrv_remove_file_or_backing_child(bs, bdrv_filter_or_cow_child(bs), tran);
4892}
4893
4894static int bdrv_replace_node_noperm(BlockDriverState *from,
4895                                    BlockDriverState *to,
4896                                    bool auto_skip, Transaction *tran,
4897                                    Error **errp)
4898{
4899    BdrvChild *c, *next;
4900
4901    QLIST_FOREACH_SAFE(c, &from->parents, next_parent, next) {
4902        assert(c->bs == from);
4903        if (!should_update_child(c, to)) {
4904            if (auto_skip) {
4905                continue;
4906            }
4907            error_setg(errp, "Should not change '%s' link to '%s'",
4908                       c->name, from->node_name);
4909            return -EINVAL;
4910        }
4911        if (c->frozen) {
4912            error_setg(errp, "Cannot change '%s' link to '%s'",
4913                       c->name, from->node_name);
4914            return -EPERM;
4915        }
4916        bdrv_replace_child_tran(c, to, tran);
4917    }
4918
4919    return 0;
4920}
4921
4922/*
4923 * With auto_skip=true bdrv_replace_node_common skips updating from parents
4924 * if it creates a parent-child relation loop or if parent is block-job.
4925 *
4926 * With auto_skip=false the error is returned if from has a parent which should
4927 * not be updated.
4928 *
4929 * With @detach_subchain=true @to must be in a backing chain of @from. In this
4930 * case backing link of the cow-parent of @to is removed.
4931 */
4932static int bdrv_replace_node_common(BlockDriverState *from,
4933                                    BlockDriverState *to,
4934                                    bool auto_skip, bool detach_subchain,
4935                                    Error **errp)
4936{
4937    Transaction *tran = tran_new();
4938    g_autoptr(GHashTable) found = NULL;
4939    g_autoptr(GSList) refresh_list = NULL;
4940    BlockDriverState *to_cow_parent = NULL;
4941    int ret;
4942
4943    if (detach_subchain) {
4944        assert(bdrv_chain_contains(from, to));
4945        assert(from != to);
4946        for (to_cow_parent = from;
4947             bdrv_filter_or_cow_bs(to_cow_parent) != to;
4948             to_cow_parent = bdrv_filter_or_cow_bs(to_cow_parent))
4949        {
4950            ;
4951        }
4952    }
4953
4954    /* Make sure that @from doesn't go away until we have successfully attached
4955     * all of its parents to @to. */
4956    bdrv_ref(from);
4957
4958    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
4959    assert(bdrv_get_aio_context(from) == bdrv_get_aio_context(to));
4960    bdrv_drained_begin(from);
4961
4962    /*
4963     * Do the replacement without permission update.
4964     * Replacement may influence the permissions, we should calculate new
4965     * permissions based on new graph. If we fail, we'll roll-back the
4966     * replacement.
4967     */
4968    ret = bdrv_replace_node_noperm(from, to, auto_skip, tran, errp);
4969    if (ret < 0) {
4970        goto out;
4971    }
4972
4973    if (detach_subchain) {
4974        bdrv_remove_filter_or_cow_child(to_cow_parent, tran);
4975    }
4976
4977    found = g_hash_table_new(NULL, NULL);
4978
4979    refresh_list = bdrv_topological_dfs(refresh_list, found, to);
4980    refresh_list = bdrv_topological_dfs(refresh_list, found, from);
4981
4982    ret = bdrv_list_refresh_perms(refresh_list, NULL, tran, errp);
4983    if (ret < 0) {
4984        goto out;
4985    }
4986
4987    ret = 0;
4988
4989out:
4990    tran_finalize(tran, ret);
4991
4992    bdrv_drained_end(from);
4993    bdrv_unref(from);
4994
4995    return ret;
4996}
4997
4998int bdrv_replace_node(BlockDriverState *from, BlockDriverState *to,
4999                      Error **errp)
5000{
5001    return bdrv_replace_node_common(from, to, true, false, errp);
5002}
5003
5004int bdrv_drop_filter(BlockDriverState *bs, Error **errp)
5005{
5006    return bdrv_replace_node_common(bs, bdrv_filter_or_cow_bs(bs), true, true,
5007                                    errp);
5008}
5009
5010/*
5011 * Add new bs contents at the top of an image chain while the chain is
5012 * live, while keeping required fields on the top layer.
5013 *
5014 * This will modify the BlockDriverState fields, and swap contents
5015 * between bs_new and bs_top. Both bs_new and bs_top are modified.
5016 *
5017 * bs_new must not be attached to a BlockBackend and must not have backing
5018 * child.
5019 *
5020 * This function does not create any image files.
5021 */
5022int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
5023                Error **errp)
5024{
5025    int ret;
5026    Transaction *tran = tran_new();
5027
5028    assert(!bs_new->backing);
5029
5030    ret = bdrv_attach_child_noperm(bs_new, bs_top, "backing",
5031                                   &child_of_bds, bdrv_backing_role(bs_new),
5032                                   &bs_new->backing, tran, errp);
5033    if (ret < 0) {
5034        goto out;
5035    }
5036
5037    ret = bdrv_replace_node_noperm(bs_top, bs_new, true, tran, errp);
5038    if (ret < 0) {
5039        goto out;
5040    }
5041
5042    ret = bdrv_refresh_perms(bs_new, errp);
5043out:
5044    tran_finalize(tran, ret);
5045
5046    bdrv_refresh_limits(bs_top, NULL, NULL);
5047
5048    return ret;
5049}
5050
5051static void bdrv_delete(BlockDriverState *bs)
5052{
5053    assert(bdrv_op_blocker_is_empty(bs));
5054    assert(!bs->refcnt);
5055
5056    /* remove from list, if necessary */
5057    if (bs->node_name[0] != '\0') {
5058        QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
5059    }
5060    QTAILQ_REMOVE(&all_bdrv_states, bs, bs_list);
5061
5062    bdrv_close(bs);
5063
5064    g_free(bs);
5065}
5066
5067BlockDriverState *bdrv_insert_node(BlockDriverState *bs, QDict *node_options,
5068                                   int flags, Error **errp)
5069{
5070    BlockDriverState *new_node_bs;
5071    Error *local_err = NULL;
5072
5073    new_node_bs = bdrv_open(NULL, NULL, node_options, flags, errp);
5074    if (new_node_bs == NULL) {
5075        error_prepend(errp, "Could not create node: ");
5076        return NULL;
5077    }
5078
5079    bdrv_drained_begin(bs);
5080    bdrv_replace_node(bs, new_node_bs, &local_err);
5081    bdrv_drained_end(bs);
5082
5083    if (local_err) {
5084        bdrv_unref(new_node_bs);
5085        error_propagate(errp, local_err);
5086        return NULL;
5087    }
5088
5089    return new_node_bs;
5090}
5091
5092/*
5093 * Run consistency checks on an image
5094 *
5095 * Returns 0 if the check could be completed (it doesn't mean that the image is
5096 * free of errors) or -errno when an internal error occurred. The results of the
5097 * check are stored in res.
5098 */
5099int coroutine_fn bdrv_co_check(BlockDriverState *bs,
5100                               BdrvCheckResult *res, BdrvCheckMode fix)
5101{
5102    if (bs->drv == NULL) {
5103        return -ENOMEDIUM;
5104    }
5105    if (bs->drv->bdrv_co_check == NULL) {
5106        return -ENOTSUP;
5107    }
5108
5109    memset(res, 0, sizeof(*res));
5110    return bs->drv->bdrv_co_check(bs, res, fix);
5111}
5112
5113/*
5114 * Return values:
5115 * 0        - success
5116 * -EINVAL  - backing format specified, but no file
5117 * -ENOSPC  - can't update the backing file because no space is left in the
5118 *            image file header
5119 * -ENOTSUP - format driver doesn't support changing the backing file
5120 */
5121int bdrv_change_backing_file(BlockDriverState *bs, const char *backing_file,
5122                             const char *backing_fmt, bool require)
5123{
5124    BlockDriver *drv = bs->drv;
5125    int ret;
5126
5127    if (!drv) {
5128        return -ENOMEDIUM;
5129    }
5130
5131    /* Backing file format doesn't make sense without a backing file */
5132    if (backing_fmt && !backing_file) {
5133        return -EINVAL;
5134    }
5135
5136    if (require && backing_file && !backing_fmt) {
5137        return -EINVAL;
5138    }
5139
5140    if (drv->bdrv_change_backing_file != NULL) {
5141        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
5142    } else {
5143        ret = -ENOTSUP;
5144    }
5145
5146    if (ret == 0) {
5147        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
5148        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
5149        pstrcpy(bs->auto_backing_file, sizeof(bs->auto_backing_file),
5150                backing_file ?: "");
5151    }
5152    return ret;
5153}
5154
5155/*
5156 * Finds the first non-filter node above bs in the chain between
5157 * active and bs.  The returned node is either an immediate parent of
5158 * bs, or there are only filter nodes between the two.
5159 *
5160 * Returns NULL if bs is not found in active's image chain,
5161 * or if active == bs.
5162 *
5163 * Returns the bottommost base image if bs == NULL.
5164 */
5165BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
5166                                    BlockDriverState *bs)
5167{
5168    bs = bdrv_skip_filters(bs);
5169    active = bdrv_skip_filters(active);
5170
5171    while (active) {
5172        BlockDriverState *next = bdrv_backing_chain_next(active);
5173        if (bs == next) {
5174            return active;
5175        }
5176        active = next;
5177    }
5178
5179    return NULL;
5180}
5181
5182/* Given a BDS, searches for the base layer. */
5183BlockDriverState *bdrv_find_base(BlockDriverState *bs)
5184{
5185    return bdrv_find_overlay(bs, NULL);
5186}
5187
5188/*
5189 * Return true if at least one of the COW (backing) and filter links
5190 * between @bs and @base is frozen. @errp is set if that's the case.
5191 * @base must be reachable from @bs, or NULL.
5192 */
5193bool bdrv_is_backing_chain_frozen(BlockDriverState *bs, BlockDriverState *base,
5194                                  Error **errp)
5195{
5196    BlockDriverState *i;
5197    BdrvChild *child;
5198
5199    for (i = bs; i != base; i = child_bs(child)) {
5200        child = bdrv_filter_or_cow_child(i);
5201
5202        if (child && child->frozen) {
5203            error_setg(errp, "Cannot change '%s' link from '%s' to '%s'",
5204                       child->name, i->node_name, child->bs->node_name);
5205            return true;
5206        }
5207    }
5208
5209    return false;
5210}
5211
5212/*
5213 * Freeze all COW (backing) and filter links between @bs and @base.
5214 * If any of the links is already frozen the operation is aborted and
5215 * none of the links are modified.
5216 * @base must be reachable from @bs, or NULL.
5217 * Returns 0 on success. On failure returns < 0 and sets @errp.
5218 */
5219int bdrv_freeze_backing_chain(BlockDriverState *bs, BlockDriverState *base,
5220                              Error **errp)
5221{
5222    BlockDriverState *i;
5223    BdrvChild *child;
5224
5225    if (bdrv_is_backing_chain_frozen(bs, base, errp)) {
5226        return -EPERM;
5227    }
5228
5229    for (i = bs; i != base; i = child_bs(child)) {
5230        child = bdrv_filter_or_cow_child(i);
5231        if (child && child->bs->never_freeze) {
5232            error_setg(errp, "Cannot freeze '%s' link to '%s'",
5233                       child->name, child->bs->node_name);
5234            return -EPERM;
5235        }
5236    }
5237
5238    for (i = bs; i != base; i = child_bs(child)) {
5239        child = bdrv_filter_or_cow_child(i);
5240        if (child) {
5241            child->frozen = true;
5242        }
5243    }
5244
5245    return 0;
5246}
5247
5248/*
5249 * Unfreeze all COW (backing) and filter links between @bs and @base.
5250 * The caller must ensure that all links are frozen before using this
5251 * function.
5252 * @base must be reachable from @bs, or NULL.
5253 */
5254void bdrv_unfreeze_backing_chain(BlockDriverState *bs, BlockDriverState *base)
5255{
5256    BlockDriverState *i;
5257    BdrvChild *child;
5258
5259    for (i = bs; i != base; i = child_bs(child)) {
5260        child = bdrv_filter_or_cow_child(i);
5261        if (child) {
5262            assert(child->frozen);
5263            child->frozen = false;
5264        }
5265    }
5266}
5267
5268/*
5269 * Drops images above 'base' up to and including 'top', and sets the image
5270 * above 'top' to have base as its backing file.
5271 *
5272 * Requires that the overlay to 'top' is opened r/w, so that the backing file
5273 * information in 'bs' can be properly updated.
5274 *
5275 * E.g., this will convert the following chain:
5276 * bottom <- base <- intermediate <- top <- active
5277 *
5278 * to
5279 *
5280 * bottom <- base <- active
5281 *
5282 * It is allowed for bottom==base, in which case it converts:
5283 *
5284 * base <- intermediate <- top <- active
5285 *
5286 * to
5287 *
5288 * base <- active
5289 *
5290 * If backing_file_str is non-NULL, it will be used when modifying top's
5291 * overlay image metadata.
5292 *
5293 * Error conditions:
5294 *  if active == top, that is considered an error
5295 *
5296 */
int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base,
                           const char *backing_file_str)
{
    BlockDriverState *explicit_top = top;
    bool update_inherits_from;
    BdrvChild *c;
    Error *local_err = NULL;
    /* -EIO is what every early failure path below returns */
    int ret = -EIO;
    g_autoptr(GSList) updated_children = NULL;
    GSList *p;

    /* Hold a reference to @top and keep its subtree drained until exit */
    bdrv_ref(top);
    bdrv_subtree_drained_begin(top);

    if (!top->drv || !base->drv) {
        goto exit;
    }

    /* Make sure that base is in the backing chain of top */
    if (!bdrv_chain_contains(top, base)) {
        goto exit;
    }

    /* If 'base' recursively inherits from 'top' then we should set
     * base->inherits_from to top->inherits_from after 'top' and all
     * other intermediate nodes have been dropped.
     * If 'top' is an implicit node (e.g. "commit_top") we should skip
     * it because no one inherits from it. We use explicit_top for that. */
    explicit_top = bdrv_skip_implicit_filters(explicit_top);
    update_inherits_from = bdrv_inherits_from_recursive(base, explicit_top);

    /* success - we can delete the intermediate states, and link top->base */
    /* TODO Check graph modification op blockers (BLK_PERM_GRAPH_MOD) once
     * we've figured out how they should work. */
    if (!backing_file_str) {
        /* No explicit string given: fall back to base's own filename */
        bdrv_refresh_filename(base);
        backing_file_str = base->filename;
    }

    /* Remember the current parents of @top before the graph is changed, so
     * that their filename information can be updated afterwards */
    QLIST_FOREACH(c, &top->parents, next_parent) {
        updated_children = g_slist_prepend(updated_children, c);
    }

    /*
     * It seems correct to pass detach_subchain=true here, but it triggers
     * one more yet not fixed bug, when due to nested aio_poll loop we switch to
     * another drained section, which modify the graph (for example, removing
     * the child, which we keep in updated_children list). So, it's a TODO.
     *
     * Note, bug triggered if pass detach_subchain=true here and run
     * test-bdrv-drain. test_drop_intermediate_poll() test-case will crash.
     * That's a FIXME.
     */
    bdrv_replace_node_common(top, base, false, false, &local_err);
    if (local_err) {
        /* The error is reported here; the caller only sees -EIO */
        error_report_err(local_err);
        goto exit;
    }

    /* Let each former parent of @top that has an update_filename callback
     * record @base / @backing_file_str */
    for (p = updated_children; p; p = p->next) {
        c = p->data;

        if (c->klass->update_filename) {
            ret = c->klass->update_filename(c, base, backing_file_str,
                                            &local_err);
            if (ret < 0) {
                /*
                 * TODO: Actually, we want to rollback all previous iterations
                 * of this loop, and (which is almost impossible) previous
                 * bdrv_replace_node()...
                 *
                 * Note, that c->klass->update_filename may lead to permission
                 * update, so it's a bad idea to call it inside permission
                 * update transaction of bdrv_replace_node.
                 */
                error_report_err(local_err);
                goto exit;
            }
        }
    }

    if (update_inherits_from) {
        base->inherits_from = explicit_top->inherits_from;
    }

    ret = 0;
exit:
    bdrv_subtree_drained_end(top);
    bdrv_unref(top);
    return ret;
}
5388
5389/**
5390 * Implementation of BlockDriver.bdrv_get_allocated_file_size() that
5391 * sums the size of all data-bearing children.  (This excludes backing
5392 * children.)
5393 */
5394static int64_t bdrv_sum_allocated_file_size(BlockDriverState *bs)
5395{
5396    BdrvChild *child;
5397    int64_t child_size, sum = 0;
5398
5399    QLIST_FOREACH(child, &bs->children, next) {
5400        if (child->role & (BDRV_CHILD_DATA | BDRV_CHILD_METADATA |
5401                           BDRV_CHILD_FILTERED))
5402        {
5403            child_size = bdrv_get_allocated_file_size(child->bs);
5404            if (child_size < 0) {
5405                return child_size;
5406            }
5407            sum += child_size;
5408        }
5409    }
5410
5411    return sum;
5412}
5413
5414/**
5415 * Length of a allocated file in bytes. Sparse files are counted by actual
5416 * allocated space. Return < 0 if error or unknown.
5417 */
5418int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
5419{
5420    BlockDriver *drv = bs->drv;
5421    if (!drv) {
5422        return -ENOMEDIUM;
5423    }
5424    if (drv->bdrv_get_allocated_file_size) {
5425        return drv->bdrv_get_allocated_file_size(bs);
5426    }
5427
5428    if (drv->bdrv_file_open) {
5429        /*
5430         * Protocol drivers default to -ENOTSUP (most of their data is
5431         * not stored in any of their children (if they even have any),
5432         * so there is no generic way to figure it out).
5433         */
5434        return -ENOTSUP;
5435    } else if (drv->is_filter) {
5436        /* Filter drivers default to the size of their filtered child */
5437        return bdrv_get_allocated_file_size(bdrv_filter_bs(bs));
5438    } else {
5439        /* Other drivers default to summing their children's sizes */
5440        return bdrv_sum_allocated_file_size(bs);
5441    }
5442}
5443
5444/*
5445 * bdrv_measure:
5446 * @drv: Format driver
5447 * @opts: Creation options for new image
5448 * @in_bs: Existing image containing data for new image (may be NULL)
5449 * @errp: Error object
5450 * Returns: A #BlockMeasureInfo (free using qapi_free_BlockMeasureInfo())
5451 *          or NULL on error
5452 *
5453 * Calculate file size required to create a new image.
5454 *
5455 * If @in_bs is given then space for allocated clusters and zero clusters
5456 * from that image are included in the calculation.  If @opts contains a
5457 * backing file that is shared by @in_bs then backing clusters may be omitted
5458 * from the calculation.
5459 *
5460 * If @in_bs is NULL then the calculation includes no allocated clusters
5461 * unless a preallocation option is given in @opts.
5462 *
5463 * Note that @in_bs may use a different BlockDriver from @drv.
5464 *
5465 * If an error occurs the @errp pointer is set.
5466 */
5467BlockMeasureInfo *bdrv_measure(BlockDriver *drv, QemuOpts *opts,
5468                               BlockDriverState *in_bs, Error **errp)
5469{
5470    if (!drv->bdrv_measure) {
5471        error_setg(errp, "Block driver '%s' does not support size measurement",
5472                   drv->format_name);
5473        return NULL;
5474    }
5475
5476    return drv->bdrv_measure(opts, in_bs, errp);
5477}
5478
5479/**
5480 * Return number of sectors on success, -errno on error.
5481 */
5482int64_t bdrv_nb_sectors(BlockDriverState *bs)
5483{
5484    BlockDriver *drv = bs->drv;
5485
5486    if (!drv)
5487        return -ENOMEDIUM;
5488
5489    if (drv->has_variable_length) {
5490        int ret = refresh_total_sectors(bs, bs->total_sectors);
5491        if (ret < 0) {
5492            return ret;
5493        }
5494    }
5495    return bs->total_sectors;
5496}
5497
5498/**
5499 * Return length in bytes on success, -errno on error.
5500 * The length is always a multiple of BDRV_SECTOR_SIZE.
5501 */
5502int64_t bdrv_getlength(BlockDriverState *bs)
5503{
5504    int64_t ret = bdrv_nb_sectors(bs);
5505
5506    if (ret < 0) {
5507        return ret;
5508    }
5509    if (ret > INT64_MAX / BDRV_SECTOR_SIZE) {
5510        return -EFBIG;
5511    }
5512    return ret * BDRV_SECTOR_SIZE;
5513}
5514
5515/* return 0 as number of sectors if no device present or error */
5516void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
5517{
5518    int64_t nb_sectors = bdrv_nb_sectors(bs);
5519
5520    *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
5521}
5522
5523bool bdrv_is_sg(BlockDriverState *bs)
5524{
5525    return bs->sg;
5526}
5527
5528/**
5529 * Return whether the given node supports compressed writes.
5530 */
5531bool bdrv_supports_compressed_writes(BlockDriverState *bs)
5532{
5533    BlockDriverState *filtered;
5534
5535    if (!bs->drv || !block_driver_can_compress(bs->drv)) {
5536        return false;
5537    }
5538
5539    filtered = bdrv_filter_bs(bs);
5540    if (filtered) {
5541        /*
5542         * Filters can only forward compressed writes, so we have to
5543         * check the child.
5544         */
5545        return bdrv_supports_compressed_writes(filtered);
5546    }
5547
5548    return true;
5549}
5550
5551const char *bdrv_get_format_name(BlockDriverState *bs)
5552{
5553    return bs->drv ? bs->drv->format_name : NULL;
5554}
5555
/*
 * qsort() comparator for an array of C strings: @a and @b each point to a
 * "char *" element, and the pointed-to strings are ordered with strcmp().
 */
static int qsort_strcmp(const void *a, const void *b)
{
    const char *lhs = *(char *const *)a;
    const char *rhs = *(char *const *)b;

    return strcmp(lhs, rhs);
}
5560
5561void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
5562                         void *opaque, bool read_only)
5563{
5564    BlockDriver *drv;
5565    int count = 0;
5566    int i;
5567    const char **formats = NULL;
5568
5569    QLIST_FOREACH(drv, &bdrv_drivers, list) {
5570        if (drv->format_name) {
5571            bool found = false;
5572            int i = count;
5573
5574            if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, read_only)) {
5575                continue;
5576            }
5577
5578            while (formats && i && !found) {
5579                found = !strcmp(formats[--i], drv->format_name);
5580            }
5581
5582            if (!found) {
5583                formats = g_renew(const char *, formats, count + 1);
5584                formats[count++] = drv->format_name;
5585            }
5586        }
5587    }
5588
5589    for (i = 0; i < (int)ARRAY_SIZE(block_driver_modules); i++) {
5590        const char *format_name = block_driver_modules[i].format_name;
5591
5592        if (format_name) {
5593            bool found = false;
5594            int j = count;
5595
5596            if (use_bdrv_whitelist &&
5597                !bdrv_format_is_whitelisted(format_name, read_only)) {
5598                continue;
5599            }
5600
5601            while (formats && j && !found) {
5602                found = !strcmp(formats[--j], format_name);
5603            }
5604
5605            if (!found) {
5606                formats = g_renew(const char *, formats, count + 1);
5607                formats[count++] = format_name;
5608            }
5609        }
5610    }
5611
5612    qsort(formats, count, sizeof(formats[0]), qsort_strcmp);
5613
5614    for (i = 0; i < count; i++) {
5615        it(opaque, formats[i]);
5616    }
5617
5618    g_free(formats);
5619}
5620
5621/* This function is to find a node in the bs graph */
5622BlockDriverState *bdrv_find_node(const char *node_name)
5623{
5624    BlockDriverState *bs;
5625
5626    assert(node_name);
5627
5628    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
5629        if (!strcmp(node_name, bs->node_name)) {
5630            return bs;
5631        }
5632    }
5633    return NULL;
5634}
5635
5636/* Put this QMP function here so it can access the static graph_bdrv_states. */
5637BlockDeviceInfoList *bdrv_named_nodes_list(bool flat,
5638                                           Error **errp)
5639{
5640    BlockDeviceInfoList *list;
5641    BlockDriverState *bs;
5642
5643    list = NULL;
5644    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
5645        BlockDeviceInfo *info = bdrv_block_device_info(NULL, bs, flat, errp);
5646        if (!info) {
5647            qapi_free_BlockDeviceInfoList(list);
5648            return NULL;
5649        }
5650        QAPI_LIST_PREPEND(list, info);
5651    }
5652
5653    return list;
5654}
5655
5656typedef struct XDbgBlockGraphConstructor {
5657    XDbgBlockGraph *graph;
5658    GHashTable *graph_nodes;
5659} XDbgBlockGraphConstructor;
5660
5661static XDbgBlockGraphConstructor *xdbg_graph_new(void)
5662{
5663    XDbgBlockGraphConstructor *gr = g_new(XDbgBlockGraphConstructor, 1);
5664
5665    gr->graph = g_new0(XDbgBlockGraph, 1);
5666    gr->graph_nodes = g_hash_table_new(NULL, NULL);
5667
5668    return gr;
5669}
5670
5671static XDbgBlockGraph *xdbg_graph_finalize(XDbgBlockGraphConstructor *gr)
5672{
5673    XDbgBlockGraph *graph = gr->graph;
5674
5675    g_hash_table_destroy(gr->graph_nodes);
5676    g_free(gr);
5677
5678    return graph;
5679}
5680
5681static uintptr_t xdbg_graph_node_num(XDbgBlockGraphConstructor *gr, void *node)
5682{
5683    uintptr_t ret = (uintptr_t)g_hash_table_lookup(gr->graph_nodes, node);
5684
5685    if (ret != 0) {
5686        return ret;
5687    }
5688
5689    /*
5690     * Start counting from 1, not 0, because 0 interferes with not-found (NULL)
5691     * answer of g_hash_table_lookup.
5692     */
5693    ret = g_hash_table_size(gr->graph_nodes) + 1;
5694    g_hash_table_insert(gr->graph_nodes, node, (void *)ret);
5695
5696    return ret;
5697}
5698
5699static void xdbg_graph_add_node(XDbgBlockGraphConstructor *gr, void *node,
5700                                XDbgBlockGraphNodeType type, const char *name)
5701{
5702    XDbgBlockGraphNode *n;
5703
5704    n = g_new0(XDbgBlockGraphNode, 1);
5705
5706    n->id = xdbg_graph_node_num(gr, node);
5707    n->type = type;
5708    n->name = g_strdup(name);
5709
5710    QAPI_LIST_PREPEND(gr->graph->nodes, n);
5711}
5712
5713static void xdbg_graph_add_edge(XDbgBlockGraphConstructor *gr, void *parent,
5714                                const BdrvChild *child)
5715{
5716    BlockPermission qapi_perm;
5717    XDbgBlockGraphEdge *edge;
5718
5719    edge = g_new0(XDbgBlockGraphEdge, 1);
5720
5721    edge->parent = xdbg_graph_node_num(gr, parent);
5722    edge->child = xdbg_graph_node_num(gr, child->bs);
5723    edge->name = g_strdup(child->name);
5724
5725    for (qapi_perm = 0; qapi_perm < BLOCK_PERMISSION__MAX; qapi_perm++) {
5726        uint64_t flag = bdrv_qapi_perm_to_blk_perm(qapi_perm);
5727
5728        if (flag & child->perm) {
5729            QAPI_LIST_PREPEND(edge->perm, qapi_perm);
5730        }
5731        if (flag & child->shared_perm) {
5732            QAPI_LIST_PREPEND(edge->shared_perm, qapi_perm);
5733        }
5734    }
5735
5736    QAPI_LIST_PREPEND(gr->graph->edges, edge);
5737}
5738
5739
5740XDbgBlockGraph *bdrv_get_xdbg_block_graph(Error **errp)
5741{
5742    BlockBackend *blk;
5743    BlockJob *job;
5744    BlockDriverState *bs;
5745    BdrvChild *child;
5746    XDbgBlockGraphConstructor *gr = xdbg_graph_new();
5747
5748    for (blk = blk_all_next(NULL); blk; blk = blk_all_next(blk)) {
5749        char *allocated_name = NULL;
5750        const char *name = blk_name(blk);
5751
5752        if (!*name) {
5753            name = allocated_name = blk_get_attached_dev_id(blk);
5754        }
5755        xdbg_graph_add_node(gr, blk, X_DBG_BLOCK_GRAPH_NODE_TYPE_BLOCK_BACKEND,
5756                           name);
5757        g_free(allocated_name);
5758        if (blk_root(blk)) {
5759            xdbg_graph_add_edge(gr, blk, blk_root(blk));
5760        }
5761    }
5762
5763    for (job = block_job_next(NULL); job; job = block_job_next(job)) {
5764        GSList *el;
5765
5766        xdbg_graph_add_node(gr, job, X_DBG_BLOCK_GRAPH_NODE_TYPE_BLOCK_JOB,
5767                           job->job.id);
5768        for (el = job->nodes; el; el = el->next) {
5769            xdbg_graph_add_edge(gr, job, (BdrvChild *)el->data);
5770        }
5771    }
5772
5773    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
5774        xdbg_graph_add_node(gr, bs, X_DBG_BLOCK_GRAPH_NODE_TYPE_BLOCK_DRIVER,
5775                           bs->node_name);
5776        QLIST_FOREACH(child, &bs->children, next) {
5777            xdbg_graph_add_edge(gr, bs, child);
5778        }
5779    }
5780
5781    return xdbg_graph_finalize(gr);
5782}
5783
5784BlockDriverState *bdrv_lookup_bs(const char *device,
5785                                 const char *node_name,
5786                                 Error **errp)
5787{
5788    BlockBackend *blk;
5789    BlockDriverState *bs;
5790
5791    if (device) {
5792        blk = blk_by_name(device);
5793
5794        if (blk) {
5795            bs = blk_bs(blk);
5796            if (!bs) {
5797                error_setg(errp, "Device '%s' has no medium", device);
5798            }
5799
5800            return bs;
5801        }
5802    }
5803
5804    if (node_name) {
5805        bs = bdrv_find_node(node_name);
5806
5807        if (bs) {
5808            return bs;
5809        }
5810    }
5811
5812    error_setg(errp, "Cannot find device=\'%s\' nor node-name=\'%s\'",
5813                     device ? device : "",
5814                     node_name ? node_name : "");
5815    return NULL;
5816}
5817
5818/* If 'base' is in the same chain as 'top', return true. Otherwise,
5819 * return false.  If either argument is NULL, return false. */
5820bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
5821{
5822    while (top && top != base) {
5823        top = bdrv_filter_or_cow_bs(top);
5824    }
5825
5826    return top != NULL;
5827}
5828
5829BlockDriverState *bdrv_next_node(BlockDriverState *bs)
5830{
5831    if (!bs) {
5832        return QTAILQ_FIRST(&graph_bdrv_states);
5833    }
5834    return QTAILQ_NEXT(bs, node_list);
5835}
5836
5837BlockDriverState *bdrv_next_all_states(BlockDriverState *bs)
5838{
5839    if (!bs) {
5840        return QTAILQ_FIRST(&all_bdrv_states);
5841    }
5842    return QTAILQ_NEXT(bs, bs_list);
5843}
5844
5845const char *bdrv_get_node_name(const BlockDriverState *bs)
5846{
5847    return bs->node_name;
5848}
5849
5850const char *bdrv_get_parent_name(const BlockDriverState *bs)
5851{
5852    BdrvChild *c;
5853    const char *name;
5854
5855    /* If multiple parents have a name, just pick the first one. */
5856    QLIST_FOREACH(c, &bs->parents, next_parent) {
5857        if (c->klass->get_name) {
5858            name = c->klass->get_name(c);
5859            if (name && *name) {
5860                return name;
5861            }
5862        }
5863    }
5864
5865    return NULL;
5866}
5867
5868/* TODO check what callers really want: bs->node_name or blk_name() */
5869const char *bdrv_get_device_name(const BlockDriverState *bs)
5870{
5871    return bdrv_get_parent_name(bs) ?: "";
5872}
5873
5874/* This can be used to identify nodes that might not have a device
5875 * name associated. Since node and device names live in the same
5876 * namespace, the result is unambiguous. The exception is if both are
5877 * absent, then this returns an empty (non-null) string. */
5878const char *bdrv_get_device_or_node_name(const BlockDriverState *bs)
5879{
5880    return bdrv_get_parent_name(bs) ?: bs->node_name;
5881}
5882
5883int bdrv_get_flags(BlockDriverState *bs)
5884{
5885    return bs->open_flags;
5886}
5887
5888int bdrv_has_zero_init_1(BlockDriverState *bs)
5889{
5890    return 1;
5891}
5892
5893int bdrv_has_zero_init(BlockDriverState *bs)
5894{
5895    BlockDriverState *filtered;
5896
5897    if (!bs->drv) {
5898        return 0;
5899    }
5900
5901    /* If BS is a copy on write image, it is initialized to
5902       the contents of the base image, which may not be zeroes.  */
5903    if (bdrv_cow_child(bs)) {
5904        return 0;
5905    }
5906    if (bs->drv->bdrv_has_zero_init) {
5907        return bs->drv->bdrv_has_zero_init(bs);
5908    }
5909
5910    filtered = bdrv_filter_bs(bs);
5911    if (filtered) {
5912        return bdrv_has_zero_init(filtered);
5913    }
5914
5915    /* safe default */
5916    return 0;
5917}
5918
5919bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
5920{
5921    if (!(bs->open_flags & BDRV_O_UNMAP)) {
5922        return false;
5923    }
5924
5925    return bs->supported_zero_flags & BDRV_REQ_MAY_UNMAP;
5926}
5927
5928void bdrv_get_backing_filename(BlockDriverState *bs,
5929                               char *filename, int filename_size)
5930{
5931    pstrcpy(filename, filename_size, bs->backing_file);
5932}
5933
5934int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
5935{
5936    int ret;
5937    BlockDriver *drv = bs->drv;
5938    /* if bs->drv == NULL, bs is closed, so there's nothing to do here */
5939    if (!drv) {
5940        return -ENOMEDIUM;
5941    }
5942    if (!drv->bdrv_get_info) {
5943        BlockDriverState *filtered = bdrv_filter_bs(bs);
5944        if (filtered) {
5945            return bdrv_get_info(filtered, bdi);
5946        }
5947        return -ENOTSUP;
5948    }
5949    memset(bdi, 0, sizeof(*bdi));
5950    ret = drv->bdrv_get_info(bs, bdi);
5951    if (ret < 0) {
5952        return ret;
5953    }
5954
5955    if (bdi->cluster_size > BDRV_MAX_ALIGNMENT) {
5956        return -EINVAL;
5957    }
5958
5959    return 0;
5960}
5961
5962ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs,
5963                                          Error **errp)
5964{
5965    BlockDriver *drv = bs->drv;
5966    if (drv && drv->bdrv_get_specific_info) {
5967        return drv->bdrv_get_specific_info(bs, errp);
5968    }
5969    return NULL;
5970}
5971
5972BlockStatsSpecific *bdrv_get_specific_stats(BlockDriverState *bs)
5973{
5974    BlockDriver *drv = bs->drv;
5975    if (!drv || !drv->bdrv_get_specific_stats) {
5976        return NULL;
5977    }
5978    return drv->bdrv_get_specific_stats(bs);
5979}
5980
5981void bdrv_debug_event(BlockDriverState *bs, BlkdebugEvent event)
5982{
5983    if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
5984        return;
5985    }
5986
5987    bs->drv->bdrv_debug_event(bs, event);
5988}
5989
5990static BlockDriverState *bdrv_find_debug_node(BlockDriverState *bs)
5991{
5992    while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
5993        bs = bdrv_primary_bs(bs);
5994    }
5995
5996    if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
5997        assert(bs->drv->bdrv_debug_remove_breakpoint);
5998        return bs;
5999    }
6000
6001    return NULL;
6002}
6003
6004int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
6005                          const char *tag)
6006{
6007    bs = bdrv_find_debug_node(bs);
6008    if (bs) {
6009        return bs->drv->bdrv_debug_breakpoint(bs, event, tag);
6010    }
6011
6012    return -ENOTSUP;
6013}
6014
6015int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
6016{
6017    bs = bdrv_find_debug_node(bs);
6018    if (bs) {
6019        return bs->drv->bdrv_debug_remove_breakpoint(bs, tag);
6020    }
6021
6022    return -ENOTSUP;
6023}
6024
6025int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
6026{
6027    while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
6028        bs = bdrv_primary_bs(bs);
6029    }
6030
6031    if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
6032        return bs->drv->bdrv_debug_resume(bs, tag);
6033    }
6034
6035    return -ENOTSUP;
6036}
6037
6038bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
6039{
6040    while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
6041        bs = bdrv_primary_bs(bs);
6042    }
6043
6044    if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
6045        return bs->drv->bdrv_debug_is_suspended(bs, tag);
6046    }
6047
6048    return false;
6049}
6050
6051/* backing_file can either be relative, or absolute, or a protocol.  If it is
6052 * relative, it must be relative to the chain.  So, passing in bs->filename
6053 * from a BDS as backing_file should not be done, as that may be relative to
6054 * the CWD rather than the chain. */
6055BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
6056        const char *backing_file)
6057{
6058    char *filename_full = NULL;
6059    char *backing_file_full = NULL;
6060    char *filename_tmp = NULL;
6061    int is_protocol = 0;
6062    bool filenames_refreshed = false;
6063    BlockDriverState *curr_bs = NULL;
6064    BlockDriverState *retval = NULL;
6065    BlockDriverState *bs_below;
6066
6067    if (!bs || !bs->drv || !backing_file) {
6068        return NULL;
6069    }
6070
6071    filename_full     = g_malloc(PATH_MAX);
6072    backing_file_full = g_malloc(PATH_MAX);
6073
6074    is_protocol = path_has_protocol(backing_file);
6075
6076    /*
6077     * Being largely a legacy function, skip any filters here
6078     * (because filters do not have normal filenames, so they cannot
6079     * match anyway; and allowing json:{} filenames is a bit out of
6080     * scope).
6081     */
6082    for (curr_bs = bdrv_skip_filters(bs);
6083         bdrv_cow_child(curr_bs) != NULL;
6084         curr_bs = bs_below)
6085    {
6086        bs_below = bdrv_backing_chain_next(curr_bs);
6087
6088        if (bdrv_backing_overridden(curr_bs)) {
6089            /*
6090             * If the backing file was overridden, we can only compare
6091             * directly against the backing node's filename.
6092             */
6093
6094            if (!filenames_refreshed) {
6095                /*
6096                 * This will automatically refresh all of the
6097                 * filenames in the rest of the backing chain, so we
6098                 * only need to do this once.
6099                 */
6100                bdrv_refresh_filename(bs_below);
6101                filenames_refreshed = true;
6102            }
6103
6104            if (strcmp(backing_file, bs_below->filename) == 0) {
6105                retval = bs_below;
6106                break;
6107            }
6108        } else if (is_protocol || path_has_protocol(curr_bs->backing_file)) {
6109            /*
6110             * If either of the filename paths is actually a protocol, then
6111             * compare unmodified paths; otherwise make paths relative.
6112             */
6113            char *backing_file_full_ret;
6114
6115            if (strcmp(backing_file, curr_bs->backing_file) == 0) {
6116                retval = bs_below;
6117                break;
6118            }
6119            /* Also check against the full backing filename for the image */
6120            backing_file_full_ret = bdrv_get_full_backing_filename(curr_bs,
6121                                                                   NULL);
6122            if (backing_file_full_ret) {
6123                bool equal = strcmp(backing_file, backing_file_full_ret) == 0;
6124                g_free(backing_file_full_ret);
6125                if (equal) {
6126                    retval = bs_below;
6127                    break;
6128                }
6129            }
6130        } else {
6131            /* If not an absolute filename path, make it relative to the current
6132             * image's filename path */
6133            filename_tmp = bdrv_make_absolute_filename(curr_bs, backing_file,
6134                                                       NULL);
6135            /* We are going to compare canonicalized absolute pathnames */
6136            if (!filename_tmp || !realpath(filename_tmp, filename_full)) {
6137                g_free(filename_tmp);
6138                continue;
6139            }
6140            g_free(filename_tmp);
6141
6142            /* We need to make sure the backing filename we are comparing against
6143             * is relative to the current image filename (or absolute) */
6144            filename_tmp = bdrv_get_full_backing_filename(curr_bs, NULL);
6145            if (!filename_tmp || !realpath(filename_tmp, backing_file_full)) {
6146                g_free(filename_tmp);
6147                continue;
6148            }
6149            g_free(filename_tmp);
6150
6151            if (strcmp(backing_file_full, filename_full) == 0) {
6152                retval = bs_below;
6153                break;
6154            }
6155        }
6156    }
6157
6158    g_free(filename_full);
6159    g_free(backing_file_full);
6160    return retval;
6161}
6162
6163void bdrv_init(void)
6164{
6165#ifdef CONFIG_BDRV_WHITELIST_TOOLS
6166    use_bdrv_whitelist = 1;
6167#endif
6168    module_call_init(MODULE_INIT_BLOCK);
6169}
6170
6171void bdrv_init_with_whitelist(void)
6172{
6173    use_bdrv_whitelist = 1;
6174    bdrv_init();
6175}
6176
6177int coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, Error **errp)
6178{
6179    BdrvChild *child, *parent;
6180    Error *local_err = NULL;
6181    int ret;
6182    BdrvDirtyBitmap *bm;
6183
6184    if (!bs->drv)  {
6185        return -ENOMEDIUM;
6186    }
6187
6188    QLIST_FOREACH(child, &bs->children, next) {
6189        bdrv_co_invalidate_cache(child->bs, &local_err);
6190        if (local_err) {
6191            error_propagate(errp, local_err);
6192            return -EINVAL;
6193        }
6194    }
6195
6196    /*
6197     * Update permissions, they may differ for inactive nodes.
6198     *
6199     * Note that the required permissions of inactive images are always a
6200     * subset of the permissions required after activating the image. This
6201     * allows us to just get the permissions upfront without restricting
6202     * drv->bdrv_invalidate_cache().
6203     *
6204     * It also means that in error cases, we don't have to try and revert to
6205     * the old permissions (which is an operation that could fail, too). We can
6206     * just keep the extended permissions for the next time that an activation
6207     * of the image is tried.
6208     */
6209    if (bs->open_flags & BDRV_O_INACTIVE) {
6210        bs->open_flags &= ~BDRV_O_INACTIVE;
6211        ret = bdrv_refresh_perms(bs, errp);
6212        if (ret < 0) {
6213            bs->open_flags |= BDRV_O_INACTIVE;
6214            return ret;
6215        }
6216
6217        if (bs->drv->bdrv_co_invalidate_cache) {
6218            bs->drv->bdrv_co_invalidate_cache(bs, &local_err);
6219            if (local_err) {
6220                bs->open_flags |= BDRV_O_INACTIVE;
6221                error_propagate(errp, local_err);
6222                return -EINVAL;
6223            }
6224        }
6225
6226        FOR_EACH_DIRTY_BITMAP(bs, bm) {
6227            bdrv_dirty_bitmap_skip_store(bm, false);
6228        }
6229
6230        ret = refresh_total_sectors(bs, bs->total_sectors);
6231        if (ret < 0) {
6232            bs->open_flags |= BDRV_O_INACTIVE;
6233            error_setg_errno(errp, -ret, "Could not refresh total sector count");
6234            return ret;
6235        }
6236    }
6237
6238    QLIST_FOREACH(parent, &bs->parents, next_parent) {
6239        if (parent->klass->activate) {
6240            parent->klass->activate(parent, &local_err);
6241            if (local_err) {
6242                bs->open_flags |= BDRV_O_INACTIVE;
6243                error_propagate(errp, local_err);
6244                return -EINVAL;
6245            }
6246        }
6247    }
6248
6249    return 0;
6250}
6251
6252void bdrv_invalidate_cache_all(Error **errp)
6253{
6254    BlockDriverState *bs;
6255    BdrvNextIterator it;
6256
6257    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
6258        AioContext *aio_context = bdrv_get_aio_context(bs);
6259        int ret;
6260
6261        aio_context_acquire(aio_context);
6262        ret = bdrv_invalidate_cache(bs, errp);
6263        aio_context_release(aio_context);
6264        if (ret < 0) {
6265            bdrv_next_cleanup(&it);
6266            return;
6267        }
6268    }
6269}
6270
6271static bool bdrv_has_bds_parent(BlockDriverState *bs, bool only_active)
6272{
6273    BdrvChild *parent;
6274
6275    QLIST_FOREACH(parent, &bs->parents, next_parent) {
6276        if (parent->klass->parent_is_bds) {
6277            BlockDriverState *parent_bs = parent->opaque;
6278            if (!only_active || !(parent_bs->open_flags & BDRV_O_INACTIVE)) {
6279                return true;
6280            }
6281        }
6282    }
6283
6284    return false;
6285}
6286
6287static int bdrv_inactivate_recurse(BlockDriverState *bs)
6288{
6289    BdrvChild *child, *parent;
6290    int ret;
6291
6292    if (!bs->drv) {
6293        return -ENOMEDIUM;
6294    }
6295
6296    /* Make sure that we don't inactivate a child before its parent.
6297     * It will be covered by recursion from the yet active parent. */
6298    if (bdrv_has_bds_parent(bs, true)) {
6299        return 0;
6300    }
6301
6302    assert(!(bs->open_flags & BDRV_O_INACTIVE));
6303
6304    /* Inactivate this node */
6305    if (bs->drv->bdrv_inactivate) {
6306        ret = bs->drv->bdrv_inactivate(bs);
6307        if (ret < 0) {
6308            return ret;
6309        }
6310    }
6311
6312    QLIST_FOREACH(parent, &bs->parents, next_parent) {
6313        if (parent->klass->inactivate) {
6314            ret = parent->klass->inactivate(parent);
6315            if (ret < 0) {
6316                return ret;
6317            }
6318        }
6319    }
6320
6321    bs->open_flags |= BDRV_O_INACTIVE;
6322
6323    /*
6324     * Update permissions, they may differ for inactive nodes.
6325     * We only tried to loosen restrictions, so errors are not fatal, ignore
6326     * them.
6327     */
6328    bdrv_refresh_perms(bs, NULL);
6329
6330    /* Recursively inactivate children */
6331    QLIST_FOREACH(child, &bs->children, next) {
6332        ret = bdrv_inactivate_recurse(child->bs);
6333        if (ret < 0) {
6334            return ret;
6335        }
6336    }
6337
6338    return 0;
6339}
6340
6341int bdrv_inactivate_all(void)
6342{
6343    BlockDriverState *bs = NULL;
6344    BdrvNextIterator it;
6345    int ret = 0;
6346    GSList *aio_ctxs = NULL, *ctx;
6347
6348    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
6349        AioContext *aio_context = bdrv_get_aio_context(bs);
6350
6351        if (!g_slist_find(aio_ctx