qemu/block/preallocate.c
<<
>>
Prefs
   1/*
   2 * preallocate filter driver
   3 *
   4 * The driver performs preallocate operation: it is injected above
   5 * some node, and before each write over EOF it does additional preallocating
   6 * write-zeroes request.
   7 *
   8 * Copyright (c) 2020 Virtuozzo International GmbH.
   9 *
  10 * Author:
  11 *  Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
  12 *
  13 * This program is free software; you can redistribute it and/or modify
  14 * it under the terms of the GNU General Public License as published by
  15 * the Free Software Foundation; either version 2 of the License, or
  16 * (at your option) any later version.
  17 *
  18 * This program is distributed in the hope that it will be useful,
  19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 * GNU General Public License for more details.
  22 *
  23 * You should have received a copy of the GNU General Public License
  24 * along with this program. If not, see <http://www.gnu.org/licenses/>.
  25 */
  26
  27#include "qemu/osdep.h"
  28
  29#include "qapi/error.h"
  30#include "qemu/module.h"
  31#include "qemu/option.h"
  32#include "qemu/units.h"
  33#include "block/block_int.h"
  34
  35
  36typedef struct PreallocateOpts {
  37    int64_t prealloc_size;
  38    int64_t prealloc_align;
  39} PreallocateOpts;
  40
  41typedef struct BDRVPreallocateState {
  42    PreallocateOpts opts;
  43
  44    /*
  45     * Track real data end, to crop preallocation on close. If < 0 the status is
  46     * unknown.
  47     *
  48     * @data_end is a maximum of file size on open (or when we get write/resize
  49     * permissions) and all write request ends after it. So it's safe to
  50     * truncate to data_end if it is valid.
  51     */
  52    int64_t data_end;
  53
  54    /*
  55     * Start of trailing preallocated area which reads as zero. May be smaller
  56     * than data_end, if user does over-EOF write zero operation. If < 0 the
  57     * status is unknown.
  58     *
  59     * If both @zero_start and @file_end are valid, the region
  60     * [@zero_start, @file_end) is known to be preallocated zeroes. If @file_end
  61     * is not valid, @zero_start doesn't make much sense.
  62     */
  63    int64_t zero_start;
  64
  65    /*
  66     * Real end of file. Actually the cache for bdrv_getlength(bs->file->bs),
  67     * to avoid extra lseek() calls on each write operation. If < 0 the status
  68     * is unknown.
  69     */
  70    int64_t file_end;
  71
  72    /*
  73     * All three states @data_end, @zero_start and @file_end are guaranteed to
  74     * be invalid (< 0) when we don't have both exclusive BLK_PERM_RESIZE and
  75     * BLK_PERM_WRITE permissions on file child.
  76     */
  77} BDRVPreallocateState;
  78
  79#define PREALLOCATE_OPT_PREALLOC_ALIGN "prealloc-align"
  80#define PREALLOCATE_OPT_PREALLOC_SIZE "prealloc-size"
  81static QemuOptsList runtime_opts = {
  82    .name = "preallocate",
  83    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
  84    .desc = {
  85        {
  86            .name = PREALLOCATE_OPT_PREALLOC_ALIGN,
  87            .type = QEMU_OPT_SIZE,
  88            .help = "on preallocation, align file length to this number, "
  89                "default 1M",
  90        },
  91        {
  92            .name = PREALLOCATE_OPT_PREALLOC_SIZE,
  93            .type = QEMU_OPT_SIZE,
  94            .help = "how much to preallocate, default 128M",
  95        },
  96        { /* end of list */ }
  97    },
  98};
  99
 100static bool preallocate_absorb_opts(PreallocateOpts *dest, QDict *options,
 101                                    BlockDriverState *child_bs, Error **errp)
 102{
 103    QemuOpts *opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
 104
 105    if (!qemu_opts_absorb_qdict(opts, options, errp)) {
 106        return false;
 107    }
 108
 109    dest->prealloc_align =
 110        qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_ALIGN, 1 * MiB);
 111    dest->prealloc_size =
 112        qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_SIZE, 128 * MiB);
 113
 114    qemu_opts_del(opts);
 115
 116    if (!QEMU_IS_ALIGNED(dest->prealloc_align, BDRV_SECTOR_SIZE)) {
 117        error_setg(errp, "prealloc-align parameter of preallocate filter "
 118                   "is not aligned to %llu", BDRV_SECTOR_SIZE);
 119        return false;
 120    }
 121
 122    if (!QEMU_IS_ALIGNED(dest->prealloc_align,
 123                         child_bs->bl.request_alignment)) {
 124        error_setg(errp, "prealloc-align parameter of preallocate filter "
 125                   "is not aligned to underlying node request alignment "
 126                   "(%" PRIi32 ")", child_bs->bl.request_alignment);
 127        return false;
 128    }
 129
 130    return true;
 131}
 132
 133static int preallocate_open(BlockDriverState *bs, QDict *options, int flags,
 134                            Error **errp)
 135{
 136    BDRVPreallocateState *s = bs->opaque;
 137
 138    /*
 139     * s->data_end and friends should be initialized on permission update.
 140     * For this to work, mark them invalid.
 141     */
 142    s->file_end = s->zero_start = s->data_end = -EINVAL;
 143
 144    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
 145                               BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
 146                               false, errp);
 147    if (!bs->file) {
 148        return -EINVAL;
 149    }
 150
 151    if (!preallocate_absorb_opts(&s->opts, options, bs->file->bs, errp)) {
 152        return -EINVAL;
 153    }
 154
 155    bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
 156        (BDRV_REQ_FUA & bs->file->bs->supported_write_flags);
 157
 158    bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
 159        ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
 160            bs->file->bs->supported_zero_flags);
 161
 162    return 0;
 163}
 164
 165static void preallocate_close(BlockDriverState *bs)
 166{
 167    int ret;
 168    BDRVPreallocateState *s = bs->opaque;
 169
 170    if (s->data_end < 0) {
 171        return;
 172    }
 173
 174    if (s->file_end < 0) {
 175        s->file_end = bdrv_getlength(bs->file->bs);
 176        if (s->file_end < 0) {
 177            return;
 178        }
 179    }
 180
 181    if (s->data_end < s->file_end) {
 182        ret = bdrv_truncate(bs->file, s->data_end, true, PREALLOC_MODE_OFF, 0,
 183                            NULL);
 184        s->file_end = ret < 0 ? ret : s->data_end;
 185    }
 186}
 187
 188
/*
 * Handle reopen.
 *
 * We must implement reopen handlers, otherwise reopen simply doesn't work.
 * Handle new options and don't care about preallocation state, as it is
 * handled in the set/check permission handlers.
 */
 196
 197static int preallocate_reopen_prepare(BDRVReopenState *reopen_state,
 198                                      BlockReopenQueue *queue, Error **errp)
 199{
 200    PreallocateOpts *opts = g_new0(PreallocateOpts, 1);
 201
 202    if (!preallocate_absorb_opts(opts, reopen_state->options,
 203                                 reopen_state->bs->file->bs, errp)) {
 204        g_free(opts);
 205        return -EINVAL;
 206    }
 207
 208    reopen_state->opaque = opts;
 209
 210    return 0;
 211}
 212
 213static void preallocate_reopen_commit(BDRVReopenState *state)
 214{
 215    BDRVPreallocateState *s = state->bs->opaque;
 216
 217    s->opts = *(PreallocateOpts *)state->opaque;
 218
 219    g_free(state->opaque);
 220    state->opaque = NULL;
 221}
 222
 223static void preallocate_reopen_abort(BDRVReopenState *state)
 224{
 225    g_free(state->opaque);
 226    state->opaque = NULL;
 227}
 228
 229static coroutine_fn int preallocate_co_preadv_part(
 230        BlockDriverState *bs, int64_t offset, int64_t bytes,
 231        QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags)
 232{
 233    return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset,
 234                               flags);
 235}
 236
 237static int coroutine_fn preallocate_co_pdiscard(BlockDriverState *bs,
 238                                               int64_t offset, int64_t bytes)
 239{
 240    return bdrv_co_pdiscard(bs->file, offset, bytes);
 241}
 242
 243static bool can_write_resize(uint64_t perm)
 244{
 245    return (perm & BLK_PERM_WRITE) && (perm & BLK_PERM_RESIZE);
 246}
 247
 248static bool has_prealloc_perms(BlockDriverState *bs)
 249{
 250    BDRVPreallocateState *s = bs->opaque;
 251
 252    if (can_write_resize(bs->file->perm)) {
 253        assert(!(bs->file->shared_perm & BLK_PERM_WRITE));
 254        assert(!(bs->file->shared_perm & BLK_PERM_RESIZE));
 255        return true;
 256    }
 257
 258    assert(s->data_end < 0);
 259    assert(s->zero_start < 0);
 260    assert(s->file_end < 0);
 261    return false;
 262}
 263
 264/*
 265 * Call on each write. Returns true if @want_merge_zero is true and the region
 266 * [offset, offset + bytes) is zeroed (as a result of this call or earlier
 267 * preallocation).
 268 *
 269 * want_merge_zero is used to merge write-zero request with preallocation in
 270 * one bdrv_co_pwrite_zeroes() call.
 271 */
 272static bool coroutine_fn handle_write(BlockDriverState *bs, int64_t offset,
 273                                      int64_t bytes, bool want_merge_zero)
 274{
 275    BDRVPreallocateState *s = bs->opaque;
 276    int64_t end = offset + bytes;
 277    int64_t prealloc_start, prealloc_end;
 278    int ret;
 279
 280    if (!has_prealloc_perms(bs)) {
 281        /* We don't have state neither should try to recover it */
 282        return false;
 283    }
 284
 285    if (s->data_end < 0) {
 286        s->data_end = bdrv_getlength(bs->file->bs);
 287        if (s->data_end < 0) {
 288            return false;
 289        }
 290
 291        if (s->file_end < 0) {
 292            s->file_end = s->data_end;
 293        }
 294    }
 295
 296    if (end <= s->data_end) {
 297        return false;
 298    }
 299
 300    /* We have valid s->data_end, and request writes beyond it. */
 301
 302    s->data_end = end;
 303    if (s->zero_start < 0 || !want_merge_zero) {
 304        s->zero_start = end;
 305    }
 306
 307    if (s->file_end < 0) {
 308        s->file_end = bdrv_getlength(bs->file->bs);
 309        if (s->file_end < 0) {
 310            return false;
 311        }
 312    }
 313
 314    /* Now s->data_end, s->zero_start and s->file_end are valid. */
 315
 316    if (end <= s->file_end) {
 317        /* No preallocation needed. */
 318        return want_merge_zero && offset >= s->zero_start;
 319    }
 320
 321    /* Now we want new preallocation, as request writes beyond s->file_end. */
 322
 323    prealloc_start = want_merge_zero ? MIN(offset, s->file_end) : s->file_end;
 324    prealloc_end = QEMU_ALIGN_UP(end + s->opts.prealloc_size,
 325                                 s->opts.prealloc_align);
 326
 327    ret = bdrv_co_pwrite_zeroes(
 328            bs->file, prealloc_start, prealloc_end - prealloc_start,
 329            BDRV_REQ_NO_FALLBACK | BDRV_REQ_SERIALISING | BDRV_REQ_NO_WAIT);
 330    if (ret < 0) {
 331        s->file_end = ret;
 332        return false;
 333    }
 334
 335    s->file_end = prealloc_end;
 336    return want_merge_zero;
 337}
 338
 339static int coroutine_fn preallocate_co_pwrite_zeroes(BlockDriverState *bs,
 340        int64_t offset, int64_t bytes, BdrvRequestFlags flags)
 341{
 342    bool want_merge_zero =
 343        !(flags & ~(BDRV_REQ_ZERO_WRITE | BDRV_REQ_NO_FALLBACK));
 344    if (handle_write(bs, offset, bytes, want_merge_zero)) {
 345        return 0;
 346    }
 347
 348    return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
 349}
 350
 351static coroutine_fn int preallocate_co_pwritev_part(BlockDriverState *bs,
 352                                                    int64_t offset,
 353                                                    int64_t bytes,
 354                                                    QEMUIOVector *qiov,
 355                                                    size_t qiov_offset,
 356                                                    BdrvRequestFlags flags)
 357{
 358    handle_write(bs, offset, bytes, false);
 359
 360    return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset,
 361                                flags);
 362}
 363
 364static int coroutine_fn
 365preallocate_co_truncate(BlockDriverState *bs, int64_t offset,
 366                        bool exact, PreallocMode prealloc,
 367                        BdrvRequestFlags flags, Error **errp)
 368{
 369    ERRP_GUARD();
 370    BDRVPreallocateState *s = bs->opaque;
 371    int ret;
 372
 373    if (s->data_end >= 0 && offset > s->data_end) {
 374        if (s->file_end < 0) {
 375            s->file_end = bdrv_getlength(bs->file->bs);
 376            if (s->file_end < 0) {
 377                error_setg(errp, "failed to get file length");
 378                return s->file_end;
 379            }
 380        }
 381
 382        if (prealloc == PREALLOC_MODE_FALLOC) {
 383            /*
 384             * If offset <= s->file_end, the task is already done, just
 385             * update s->data_end, to move part of "filter preallocation"
 386             * to "preallocation requested by user".
 387             * Otherwise just proceed to preallocate missing part.
 388             */
 389            if (offset <= s->file_end) {
 390                s->data_end = offset;
 391                return 0;
 392            }
 393        } else {
 394            /*
 395             * We have to drop our preallocation, to
 396             * - avoid "Cannot use preallocation for shrinking files" in
 397             *   case of offset < file_end
 398             * - give PREALLOC_MODE_OFF a chance to keep small disk
 399             *   usage
 400             * - give PREALLOC_MODE_FULL a chance to actually write the
 401             *   whole region as user expects
 402             */
 403            if (s->file_end > s->data_end) {
 404                ret = bdrv_co_truncate(bs->file, s->data_end, true,
 405                                       PREALLOC_MODE_OFF, 0, errp);
 406                if (ret < 0) {
 407                    s->file_end = ret;
 408                    error_prepend(errp, "preallocate-filter: failed to drop "
 409                                  "write-zero preallocation: ");
 410                    return ret;
 411                }
 412                s->file_end = s->data_end;
 413            }
 414        }
 415
 416        s->data_end = offset;
 417    }
 418
 419    ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp);
 420    if (ret < 0) {
 421        s->file_end = s->zero_start = s->data_end = ret;
 422        return ret;
 423    }
 424
 425    if (has_prealloc_perms(bs)) {
 426        s->file_end = s->zero_start = s->data_end = offset;
 427    }
 428    return 0;
 429}
 430
 431static int coroutine_fn preallocate_co_flush(BlockDriverState *bs)
 432{
 433    return bdrv_co_flush(bs->file->bs);
 434}
 435
 436static int64_t preallocate_getlength(BlockDriverState *bs)
 437{
 438    int64_t ret;
 439    BDRVPreallocateState *s = bs->opaque;
 440
 441    if (s->data_end >= 0) {
 442        return s->data_end;
 443    }
 444
 445    ret = bdrv_getlength(bs->file->bs);
 446
 447    if (has_prealloc_perms(bs)) {
 448        s->file_end = s->zero_start = s->data_end = ret;
 449    }
 450
 451    return ret;
 452}
 453
 454static int preallocate_check_perm(BlockDriverState *bs,
 455                                  uint64_t perm, uint64_t shared, Error **errp)
 456{
 457    BDRVPreallocateState *s = bs->opaque;
 458
 459    if (s->data_end >= 0 && !can_write_resize(perm)) {
 460        /*
 461         * Lose permissions.
 462         * We should truncate in check_perm, as in set_perm bs->file->perm will
 463         * be already changed, and we should not violate it.
 464         */
 465        if (s->file_end < 0) {
 466            s->file_end = bdrv_getlength(bs->file->bs);
 467            if (s->file_end < 0) {
 468                error_setg(errp, "Failed to get file length");
 469                return s->file_end;
 470            }
 471        }
 472
 473        if (s->data_end < s->file_end) {
 474            int ret = bdrv_truncate(bs->file, s->data_end, true,
 475                                    PREALLOC_MODE_OFF, 0, NULL);
 476            if (ret < 0) {
 477                error_setg(errp, "Failed to drop preallocation");
 478                s->file_end = ret;
 479                return ret;
 480            }
 481            s->file_end = s->data_end;
 482        }
 483    }
 484
 485    return 0;
 486}
 487
 488static void preallocate_set_perm(BlockDriverState *bs,
 489                                 uint64_t perm, uint64_t shared)
 490{
 491    BDRVPreallocateState *s = bs->opaque;
 492
 493    if (can_write_resize(perm)) {
 494        if (s->data_end < 0) {
 495            s->data_end = s->file_end = s->zero_start =
 496                bdrv_getlength(bs->file->bs);
 497        }
 498    } else {
 499        /*
 500         * We drop our permissions, as well as allow shared
 501         * permissions (see preallocate_child_perm), anyone will be able to
 502         * change the child, so mark all states invalid. We'll regain control if
 503         * get good permissions back.
 504         */
 505        s->data_end = s->file_end = s->zero_start = -EINVAL;
 506    }
 507}
 508
 509static void preallocate_child_perm(BlockDriverState *bs, BdrvChild *c,
 510    BdrvChildRole role, BlockReopenQueue *reopen_queue,
 511    uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared)
 512{
 513    bdrv_default_perms(bs, c, role, reopen_queue, perm, shared, nperm, nshared);
 514
 515    if (can_write_resize(perm)) {
 516        /* This should come by default, but let's enforce: */
 517        *nperm |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
 518
 519        /*
 520         * Don't share, to keep our states s->file_end, s->data_end and
 521         * s->zero_start valid.
 522         */
 523        *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
 524    }
 525}
 526
 527BlockDriver bdrv_preallocate_filter = {
 528    .format_name = "preallocate",
 529    .instance_size = sizeof(BDRVPreallocateState),
 530
 531    .bdrv_getlength = preallocate_getlength,
 532    .bdrv_open = preallocate_open,
 533    .bdrv_close = preallocate_close,
 534
 535    .bdrv_reopen_prepare  = preallocate_reopen_prepare,
 536    .bdrv_reopen_commit   = preallocate_reopen_commit,
 537    .bdrv_reopen_abort    = preallocate_reopen_abort,
 538
 539    .bdrv_co_preadv_part = preallocate_co_preadv_part,
 540    .bdrv_co_pwritev_part = preallocate_co_pwritev_part,
 541    .bdrv_co_pwrite_zeroes = preallocate_co_pwrite_zeroes,
 542    .bdrv_co_pdiscard = preallocate_co_pdiscard,
 543    .bdrv_co_flush = preallocate_co_flush,
 544    .bdrv_co_truncate = preallocate_co_truncate,
 545
 546    .bdrv_check_perm = preallocate_check_perm,
 547    .bdrv_set_perm = preallocate_set_perm,
 548    .bdrv_child_perm = preallocate_child_perm,
 549
 550    .has_variable_length = true,
 551    .is_filter = true,
 552};
 553
 554static void bdrv_preallocate_init(void)
 555{
 556    bdrv_register(&bdrv_preallocate_filter);
 557}
 558
 559block_init(bdrv_preallocate_init);
 560