linux/fs/nfs/blocklayout/extent_tree.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2014-2016 Christoph Hellwig.
   3 */
   4
   5#include <linux/vmalloc.h>
   6
   7#include "blocklayout.h"
   8
   9#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
  10
  11static inline struct pnfs_block_extent *
  12ext_node(struct rb_node *node)
  13{
  14        return rb_entry(node, struct pnfs_block_extent, be_node);
  15}
  16
  17static struct pnfs_block_extent *
  18ext_tree_first(struct rb_root *root)
  19{
  20        struct rb_node *node = rb_first(root);
  21        return node ? ext_node(node) : NULL;
  22}
  23
  24static struct pnfs_block_extent *
  25ext_tree_prev(struct pnfs_block_extent *be)
  26{
  27        struct rb_node *node = rb_prev(&be->be_node);
  28        return node ? ext_node(node) : NULL;
  29}
  30
  31static struct pnfs_block_extent *
  32ext_tree_next(struct pnfs_block_extent *be)
  33{
  34        struct rb_node *node = rb_next(&be->be_node);
  35        return node ? ext_node(node) : NULL;
  36}
  37
  38static inline sector_t
  39ext_f_end(struct pnfs_block_extent *be)
  40{
  41        return be->be_f_offset + be->be_length;
  42}
  43
  44static struct pnfs_block_extent *
  45__ext_tree_search(struct rb_root *root, sector_t start)
  46{
  47        struct rb_node *node = root->rb_node;
  48        struct pnfs_block_extent *be = NULL;
  49
  50        while (node) {
  51                be = ext_node(node);
  52                if (start < be->be_f_offset)
  53                        node = node->rb_left;
  54                else if (start >= ext_f_end(be))
  55                        node = node->rb_right;
  56                else
  57                        return be;
  58        }
  59
  60        if (be) {
  61                if (start < be->be_f_offset)
  62                        return be;
  63
  64                if (start >= ext_f_end(be))
  65                        return ext_tree_next(be);
  66        }
  67
  68        return NULL;
  69}
  70
  71static bool
  72ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2)
  73{
  74        if (be1->be_state != be2->be_state)
  75                return false;
  76        if (be1->be_device != be2->be_device)
  77                return false;
  78
  79        if (be1->be_f_offset + be1->be_length != be2->be_f_offset)
  80                return false;
  81
  82        if (be1->be_state != PNFS_BLOCK_NONE_DATA &&
  83            (be1->be_v_offset + be1->be_length != be2->be_v_offset))
  84                return false;
  85
  86        if (be1->be_state == PNFS_BLOCK_INVALID_DATA &&
  87            be1->be_tag != be2->be_tag)
  88                return false;
  89
  90        return true;
  91}
  92
  93static struct pnfs_block_extent *
  94ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be)
  95{
  96        struct pnfs_block_extent *left = ext_tree_prev(be);
  97
  98        if (left && ext_can_merge(left, be)) {
  99                left->be_length += be->be_length;
 100                rb_erase(&be->be_node, root);
 101                nfs4_put_deviceid_node(be->be_device);
 102                kfree(be);
 103                return left;
 104        }
 105
 106        return be;
 107}
 108
 109static struct pnfs_block_extent *
 110ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
 111{
 112        struct pnfs_block_extent *right = ext_tree_next(be);
 113
 114        if (right && ext_can_merge(be, right)) {
 115                be->be_length += right->be_length;
 116                rb_erase(&right->be_node, root);
 117                nfs4_put_deviceid_node(right->be_device);
 118                kfree(right);
 119        }
 120
 121        return be;
 122}
 123
 124static void
 125__ext_tree_insert(struct rb_root *root,
 126                struct pnfs_block_extent *new, bool merge_ok)
 127{
 128        struct rb_node **p = &root->rb_node, *parent = NULL;
 129        struct pnfs_block_extent *be;
 130
 131        while (*p) {
 132                parent = *p;
 133                be = ext_node(parent);
 134
 135                if (new->be_f_offset < be->be_f_offset) {
 136                        if (merge_ok && ext_can_merge(new, be)) {
 137                                be->be_f_offset = new->be_f_offset;
 138                                if (be->be_state != PNFS_BLOCK_NONE_DATA)
 139                                        be->be_v_offset = new->be_v_offset;
 140                                be->be_length += new->be_length;
 141                                be = ext_try_to_merge_left(root, be);
 142                                goto free_new;
 143                        }
 144                        p = &(*p)->rb_left;
 145                } else if (new->be_f_offset >= ext_f_end(be)) {
 146                        if (merge_ok && ext_can_merge(be, new)) {
 147                                be->be_length += new->be_length;
 148                                be = ext_try_to_merge_right(root, be);
 149                                goto free_new;
 150                        }
 151                        p = &(*p)->rb_right;
 152                } else {
 153                        BUG();
 154                }
 155        }
 156
 157        rb_link_node(&new->be_node, parent, p);
 158        rb_insert_color(&new->be_node, root);
 159        return;
 160free_new:
 161        nfs4_put_deviceid_node(new->be_device);
 162        kfree(new);
 163}
 164
 165static int
 166__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
 167{
 168        struct pnfs_block_extent *be;
 169        sector_t len1 = 0, len2 = 0;
 170        sector_t orig_v_offset;
 171        sector_t orig_len;
 172
 173        be = __ext_tree_search(root, start);
 174        if (!be)
 175                return 0;
 176        if (be->be_f_offset >= end)
 177                return 0;
 178
 179        orig_v_offset = be->be_v_offset;
 180        orig_len = be->be_length;
 181
 182        if (start > be->be_f_offset)
 183                len1 = start - be->be_f_offset;
 184        if (ext_f_end(be) > end)
 185                len2 = ext_f_end(be) - end;
 186
 187        if (len2 > 0) {
 188                if (len1 > 0) {
 189                        struct pnfs_block_extent *new;
 190
 191                        new = kzalloc(sizeof(*new), GFP_ATOMIC);
 192                        if (!new)
 193                                return -ENOMEM;
 194
 195                        be->be_length = len1;
 196
 197                        new->be_f_offset = end;
 198                        if (be->be_state != PNFS_BLOCK_NONE_DATA) {
 199                                new->be_v_offset =
 200                                        orig_v_offset + orig_len - len2;
 201                        }
 202                        new->be_length = len2;
 203                        new->be_state = be->be_state;
 204                        new->be_tag = be->be_tag;
 205                        new->be_device = nfs4_get_deviceid(be->be_device);
 206
 207                        __ext_tree_insert(root, new, true);
 208                } else {
 209                        be->be_f_offset = end;
 210                        if (be->be_state != PNFS_BLOCK_NONE_DATA) {
 211                                be->be_v_offset =
 212                                        orig_v_offset + orig_len - len2;
 213                        }
 214                        be->be_length = len2;
 215                }
 216        } else {
 217                if (len1 > 0) {
 218                        be->be_length = len1;
 219                        be = ext_tree_next(be);
 220                }
 221
 222                while (be && ext_f_end(be) <= end) {
 223                        struct pnfs_block_extent *next = ext_tree_next(be);
 224
 225                        rb_erase(&be->be_node, root);
 226                        nfs4_put_deviceid_node(be->be_device);
 227                        kfree(be);
 228                        be = next;
 229                }
 230
 231                if (be && be->be_f_offset < end) {
 232                        len1 = ext_f_end(be) - end;
 233                        be->be_f_offset = end;
 234                        if (be->be_state != PNFS_BLOCK_NONE_DATA)
 235                                be->be_v_offset += be->be_length - len1;
 236                        be->be_length = len1;
 237                }
 238        }
 239
 240        return 0;
 241}
 242
 243int
 244ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new)
 245{
 246        struct pnfs_block_extent *be;
 247        struct rb_root *root;
 248        int err = 0;
 249
 250        switch (new->be_state) {
 251        case PNFS_BLOCK_READWRITE_DATA:
 252        case PNFS_BLOCK_INVALID_DATA:
 253                root = &bl->bl_ext_rw;
 254                break;
 255        case PNFS_BLOCK_READ_DATA:
 256        case PNFS_BLOCK_NONE_DATA:
 257                root = &bl->bl_ext_ro;
 258                break;
 259        default:
 260                dprintk("invalid extent type\n");
 261                return -EINVAL;
 262        }
 263
 264        spin_lock(&bl->bl_ext_lock);
 265retry:
 266        be = __ext_tree_search(root, new->be_f_offset);
 267        if (!be || be->be_f_offset >= ext_f_end(new)) {
 268                __ext_tree_insert(root, new, true);
 269        } else if (new->be_f_offset >= be->be_f_offset) {
 270                if (ext_f_end(new) <= ext_f_end(be)) {
 271                        nfs4_put_deviceid_node(new->be_device);
 272                        kfree(new);
 273                } else {
 274                        sector_t new_len = ext_f_end(new) - ext_f_end(be);
 275                        sector_t diff = new->be_length - new_len;
 276
 277                        new->be_f_offset += diff;
 278                        new->be_v_offset += diff;
 279                        new->be_length = new_len;
 280                        goto retry;
 281                }
 282        } else if (ext_f_end(new) <= ext_f_end(be)) {
 283                new->be_length = be->be_f_offset - new->be_f_offset;
 284                __ext_tree_insert(root, new, true);
 285        } else {
 286                struct pnfs_block_extent *split;
 287                sector_t new_len = ext_f_end(new) - ext_f_end(be);
 288                sector_t diff = new->be_length - new_len;
 289
 290                split = kmemdup(new, sizeof(*new), GFP_ATOMIC);
 291                if (!split) {
 292                        err = -EINVAL;
 293                        goto out;
 294                }
 295
 296                split->be_length = be->be_f_offset - split->be_f_offset;
 297                split->be_device = nfs4_get_deviceid(new->be_device);
 298                __ext_tree_insert(root, split, true);
 299
 300                new->be_f_offset += diff;
 301                new->be_v_offset += diff;
 302                new->be_length = new_len;
 303                goto retry;
 304        }
 305out:
 306        spin_unlock(&bl->bl_ext_lock);
 307        return err;
 308}
 309
 310static bool
 311__ext_tree_lookup(struct rb_root *root, sector_t isect,
 312                struct pnfs_block_extent *ret)
 313{
 314        struct rb_node *node;
 315        struct pnfs_block_extent *be;
 316
 317        node = root->rb_node;
 318        while (node) {
 319                be = ext_node(node);
 320                if (isect < be->be_f_offset)
 321                        node = node->rb_left;
 322                else if (isect >= ext_f_end(be))
 323                        node = node->rb_right;
 324                else {
 325                        *ret = *be;
 326                        return true;
 327                }
 328        }
 329
 330        return false;
 331}
 332
 333bool
 334ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
 335            struct pnfs_block_extent *ret, bool rw)
 336{
 337        bool found = false;
 338
 339        spin_lock(&bl->bl_ext_lock);
 340        if (!rw)
 341                found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret);
 342        if (!found)
 343                found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret);
 344        spin_unlock(&bl->bl_ext_lock);
 345
 346        return found;
 347}
 348
 349int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
 350                sector_t start, sector_t end)
 351{
 352        int err, err2;
 353
 354        spin_lock(&bl->bl_ext_lock);
 355        err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
 356        if (rw) {
 357                err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end);
 358                if (!err)
 359                        err = err2;
 360        }
 361        spin_unlock(&bl->bl_ext_lock);
 362
 363        return err;
 364}
 365
 366static int
 367ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
 368                sector_t split)
 369{
 370        struct pnfs_block_extent *new;
 371        sector_t orig_len = be->be_length;
 372
 373        new = kzalloc(sizeof(*new), GFP_ATOMIC);
 374        if (!new)
 375                return -ENOMEM;
 376
 377        be->be_length = split - be->be_f_offset;
 378
 379        new->be_f_offset = split;
 380        if (be->be_state != PNFS_BLOCK_NONE_DATA)
 381                new->be_v_offset = be->be_v_offset + be->be_length;
 382        new->be_length = orig_len - be->be_length;
 383        new->be_state = be->be_state;
 384        new->be_tag = be->be_tag;
 385        new->be_device = nfs4_get_deviceid(be->be_device);
 386
 387        __ext_tree_insert(root, new, false);
 388        return 0;
 389}
 390
 391int
 392ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
 393                sector_t len)
 394{
 395        struct rb_root *root = &bl->bl_ext_rw;
 396        sector_t end = start + len;
 397        struct pnfs_block_extent *be;
 398        int err = 0;
 399
 400        spin_lock(&bl->bl_ext_lock);
 401        /*
 402         * First remove all COW extents or holes from written to range.
 403         */
 404        err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
 405        if (err)
 406                goto out;
 407
 408        /*
 409         * Then mark all invalid extents in the range as written to.
 410         */
 411        for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) {
 412                if (be->be_f_offset >= end)
 413                        break;
 414
 415                if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag)
 416                        continue;
 417
 418                if (be->be_f_offset < start) {
 419                        struct pnfs_block_extent *left = ext_tree_prev(be);
 420
 421                        if (left && ext_can_merge(left, be)) {
 422                                sector_t diff = start - be->be_f_offset;
 423
 424                                left->be_length += diff;
 425
 426                                be->be_f_offset += diff;
 427                                be->be_v_offset += diff;
 428                                be->be_length -= diff;
 429                        } else {
 430                                err = ext_tree_split(root, be, start);
 431                                if (err)
 432                                        goto out;
 433                        }
 434                }
 435
 436                if (ext_f_end(be) > end) {
 437                        struct pnfs_block_extent *right = ext_tree_next(be);
 438
 439                        if (right && ext_can_merge(be, right)) {
 440                                sector_t diff = end - be->be_f_offset;
 441
 442                                be->be_length -= diff;
 443
 444                                right->be_f_offset -= diff;
 445                                right->be_v_offset -= diff;
 446                                right->be_length += diff;
 447                        } else {
 448                                err = ext_tree_split(root, be, end);
 449                                if (err)
 450                                        goto out;
 451                        }
 452                }
 453
 454                if (be->be_f_offset >= start && ext_f_end(be) <= end) {
 455                        be->be_tag = EXTENT_WRITTEN;
 456                        be = ext_try_to_merge_left(root, be);
 457                        be = ext_try_to_merge_right(root, be);
 458                }
 459        }
 460out:
 461        spin_unlock(&bl->bl_ext_lock);
 462        return err;
 463}
 464
 465static size_t ext_tree_layoutupdate_size(struct pnfs_block_layout *bl, size_t count)
 466{
 467        if (bl->bl_scsi_layout)
 468                return sizeof(__be32) + PNFS_SCSI_RANGE_SIZE * count;
 469        else
 470                return sizeof(__be32) + PNFS_BLOCK_EXTENT_SIZE * count;
 471}
 472
 473static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
 474                size_t buffer_size)
 475{
 476        if (arg->layoutupdate_pages != &arg->layoutupdate_page) {
 477                int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i;
 478
 479                for (i = 0; i < nr_pages; i++)
 480                        put_page(arg->layoutupdate_pages[i]);
 481                vfree(arg->start_p);
 482                kfree(arg->layoutupdate_pages);
 483        } else {
 484                put_page(arg->layoutupdate_page);
 485        }
 486}
 487
 488static __be32 *encode_block_extent(struct pnfs_block_extent *be, __be32 *p)
 489{
 490        p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
 491                        NFS4_DEVICEID4_SIZE);
 492        p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
 493        p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
 494        p = xdr_encode_hyper(p, 0LL);
 495        *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
 496        return p;
 497}
 498
 499static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
 500{
 501        p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
 502        return xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
 503}
 504
 505static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
 506                size_t buffer_size, size_t *count)
 507{
 508        struct pnfs_block_extent *be;
 509        int ret = 0;
 510
 511        spin_lock(&bl->bl_ext_lock);
 512        for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
 513                if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
 514                    be->be_tag != EXTENT_WRITTEN)
 515                        continue;
 516
 517                (*count)++;
 518                if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) {
 519                        /* keep counting.. */
 520                        ret = -ENOSPC;
 521                        continue;
 522                }
 523
 524                if (bl->bl_scsi_layout)
 525                        p = encode_scsi_range(be, p);
 526                else
 527                        p = encode_block_extent(be, p);
 528                be->be_tag = EXTENT_COMMITTING;
 529        }
 530        spin_unlock(&bl->bl_ext_lock);
 531
 532        return ret;
 533}
 534
 535int
 536ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
 537{
 538        struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
 539        size_t count = 0, buffer_size = PAGE_SIZE;
 540        __be32 *start_p;
 541        int ret;
 542
 543        dprintk("%s enter\n", __func__);
 544
 545        arg->layoutupdate_page = alloc_page(GFP_NOFS);
 546        if (!arg->layoutupdate_page)
 547                return -ENOMEM;
 548        start_p = page_address(arg->layoutupdate_page);
 549        arg->layoutupdate_pages = &arg->layoutupdate_page;
 550
 551retry:
 552        ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count);
 553        if (unlikely(ret)) {
 554                ext_tree_free_commitdata(arg, buffer_size);
 555
 556                buffer_size = ext_tree_layoutupdate_size(bl, count);
 557                count = 0;
 558
 559                arg->layoutupdate_pages =
 560                        kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE),
 561                                sizeof(struct page *), GFP_NOFS);
 562                if (!arg->layoutupdate_pages)
 563                        return -ENOMEM;
 564
 565                start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL);
 566                if (!start_p) {
 567                        kfree(arg->layoutupdate_pages);
 568                        return -ENOMEM;
 569                }
 570
 571                goto retry;
 572        }
 573
 574        *start_p = cpu_to_be32(count);
 575        arg->layoutupdate_len = ext_tree_layoutupdate_size(bl, count);
 576
 577        if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
 578                void *p = start_p, *end = p + arg->layoutupdate_len;
 579                struct page *page = NULL;
 580                int i = 0;
 581
 582                arg->start_p = start_p;
 583                for ( ; p < end; p += PAGE_SIZE) {
 584                        page = vmalloc_to_page(p);
 585                        arg->layoutupdate_pages[i++] = page;
 586                        get_page(page);
 587                }
 588        }
 589
 590        dprintk("%s found %zu ranges\n", __func__, count);
 591        return 0;
 592}
 593
 594void
 595ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status)
 596{
 597        struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
 598        struct rb_root *root = &bl->bl_ext_rw;
 599        struct pnfs_block_extent *be;
 600
 601        dprintk("%s status %d\n", __func__, status);
 602
 603        ext_tree_free_commitdata(arg, arg->layoutupdate_len);
 604
 605        spin_lock(&bl->bl_ext_lock);
 606        for (be = ext_tree_first(root); be; be = ext_tree_next(be)) {
 607                if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
 608                    be->be_tag != EXTENT_COMMITTING)
 609                        continue;
 610
 611                if (status) {
 612                        /*
 613                         * Mark as written and try again.
 614                         *
 615                         * XXX: some real error handling here wouldn't hurt..
 616                         */
 617                        be->be_tag = EXTENT_WRITTEN;
 618                } else {
 619                        be->be_state = PNFS_BLOCK_READWRITE_DATA;
 620                        be->be_tag = 0;
 621                }
 622
 623                be = ext_try_to_merge_left(root, be);
 624                be = ext_try_to_merge_right(root, be);
 625        }
 626        spin_unlock(&bl->bl_ext_lock);
 627}
 628