linux/fs/afs/rotate.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/* Handle fileserver selection and rotation.
   3 *
   4 * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
   5 * Written by David Howells (dhowells@redhat.com)
   6 */
   7
   8#include <linux/kernel.h>
   9#include <linux/slab.h>
  10#include <linux/fs.h>
  11#include <linux/sched.h>
  12#include <linux/delay.h>
  13#include <linux/sched/signal.h>
  14#include "internal.h"
  15#include "afs_fs.h"
  16
  17/*
  18 * Begin an operation on the fileserver.
  19 *
  20 * Fileserver operations are serialised on the server by vnode, so we serialise
  21 * them here also using the io_lock.
  22 */
  23bool afs_begin_vnode_operation(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
  24                               struct key *key, bool intr)
  25{
  26        memset(fc, 0, sizeof(*fc));
  27        fc->vnode = vnode;
  28        fc->key = key;
  29        fc->ac.error = SHRT_MAX;
  30        fc->error = -EDESTADDRREQ;
  31
  32        if (intr) {
  33                fc->flags |= AFS_FS_CURSOR_INTR;
  34                if (mutex_lock_interruptible(&vnode->io_lock) < 0) {
  35                        fc->error = -EINTR;
  36                        fc->flags |= AFS_FS_CURSOR_STOP;
  37                        return false;
  38                }
  39        } else {
  40                mutex_lock(&vnode->io_lock);
  41        }
  42
  43        if (vnode->lock_state != AFS_VNODE_LOCK_NONE)
  44                fc->flags |= AFS_FS_CURSOR_CUR_ONLY;
  45        return true;
  46}
  47
  48/*
  49 * Begin iteration through a server list, starting with the vnode's last used
  50 * server if possible, or the last recorded good server if not.
  51 */
  52static bool afs_start_fs_iteration(struct afs_fs_cursor *fc,
  53                                   struct afs_vnode *vnode)
  54{
  55        struct afs_cb_interest *cbi;
  56        int i;
  57
  58        read_lock(&vnode->volume->servers_lock);
  59        fc->server_list = afs_get_serverlist(vnode->volume->servers);
  60        read_unlock(&vnode->volume->servers_lock);
  61
  62        fc->untried = (1UL << fc->server_list->nr_servers) - 1;
  63        fc->index = READ_ONCE(fc->server_list->preferred);
  64
  65        cbi = rcu_dereference_protected(vnode->cb_interest,
  66                                        lockdep_is_held(&vnode->io_lock));
  67        if (cbi) {
  68                /* See if the vnode's preferred record is still available */
  69                for (i = 0; i < fc->server_list->nr_servers; i++) {
  70                        if (fc->server_list->servers[i].cb_interest == cbi) {
  71                                fc->index = i;
  72                                goto found_interest;
  73                        }
  74                }
  75
  76                /* If we have a lock outstanding on a server that's no longer
  77                 * serving this vnode, then we can't switch to another server
  78                 * and have to return an error.
  79                 */
  80                if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
  81                        fc->error = -ESTALE;
  82                        return false;
  83                }
  84
  85                /* Note that the callback promise is effectively broken */
  86                write_seqlock(&vnode->cb_lock);
  87                ASSERTCMP(cbi, ==, rcu_access_pointer(vnode->cb_interest));
  88                rcu_assign_pointer(vnode->cb_interest, NULL);
  89                if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags))
  90                        vnode->cb_break++;
  91                write_sequnlock(&vnode->cb_lock);
  92
  93                afs_put_cb_interest(afs_v2net(vnode), cbi);
  94                cbi = NULL;
  95        }
  96
  97found_interest:
  98        return true;
  99}
 100
 101/*
 102 * Post volume busy note.
 103 */
 104static void afs_busy(struct afs_volume *volume, u32 abort_code)
 105{
 106        const char *m;
 107
 108        switch (abort_code) {
 109        case VOFFLINE:          m = "offline";          break;
 110        case VRESTARTING:       m = "restarting";       break;
 111        case VSALVAGING:        m = "being salvaged";   break;
 112        default:                m = "busy";             break;
 113        }
 114
 115        pr_notice("kAFS: Volume %llu '%s' is %s\n", volume->vid, volume->name, m);
 116}
 117
 118/*
 119 * Sleep and retry the operation to the same fileserver.
 120 */
 121static bool afs_sleep_and_retry(struct afs_fs_cursor *fc)
 122{
 123        if (fc->flags & AFS_FS_CURSOR_INTR) {
 124                msleep_interruptible(1000);
 125                if (signal_pending(current)) {
 126                        fc->error = -ERESTARTSYS;
 127                        return false;
 128                }
 129        } else {
 130                msleep(1000);
 131        }
 132
 133        return true;
 134}
 135
 136/*
 137 * Select the fileserver to use.  May be called multiple times to rotate
 138 * through the fileservers.
 139 */
 140bool afs_select_fileserver(struct afs_fs_cursor *fc)
 141{
 142        struct afs_addr_list *alist;
 143        struct afs_server *server;
 144        struct afs_vnode *vnode = fc->vnode;
 145        struct afs_error e;
 146        u32 rtt;
 147        int error = fc->ac.error, i;
 148
 149        _enter("%lx[%d],%lx[%d],%d,%d",
 150               fc->untried, fc->index,
 151               fc->ac.tried, fc->ac.index,
 152               error, fc->ac.abort_code);
 153
 154        if (fc->flags & AFS_FS_CURSOR_STOP) {
 155                _leave(" = f [stopped]");
 156                return false;
 157        }
 158
 159        fc->nr_iterations++;
 160
 161        /* Evaluate the result of the previous operation, if there was one. */
 162        switch (error) {
 163        case SHRT_MAX:
 164                goto start;
 165
 166        case 0:
 167        default:
 168                /* Success or local failure.  Stop. */
 169                fc->error = error;
 170                fc->flags |= AFS_FS_CURSOR_STOP;
 171                _leave(" = f [okay/local %d]", error);
 172                return false;
 173
 174        case -ECONNABORTED:
 175                /* The far side rejected the operation on some grounds.  This
 176                 * might involve the server being busy or the volume having been moved.
 177                 */
 178                switch (fc->ac.abort_code) {
 179                case VNOVOL:
 180                        /* This fileserver doesn't know about the volume.
 181                         * - May indicate that the VL is wrong - retry once and compare
 182                         *   the results.
 183                         * - May indicate that the fileserver couldn't attach to the vol.
 184                         */
 185                        if (fc->flags & AFS_FS_CURSOR_VNOVOL) {
 186                                fc->error = -EREMOTEIO;
 187                                goto next_server;
 188                        }
 189
 190                        write_lock(&vnode->volume->servers_lock);
 191                        fc->server_list->vnovol_mask |= 1 << fc->index;
 192                        write_unlock(&vnode->volume->servers_lock);
 193
 194                        set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags);
 195                        error = afs_check_volume_status(vnode->volume, fc->key);
 196                        if (error < 0)
 197                                goto failed_set_error;
 198
 199                        if (test_bit(AFS_VOLUME_DELETED, &vnode->volume->flags)) {
 200                                fc->error = -ENOMEDIUM;
 201                                goto failed;
 202                        }
 203
 204                        /* If the server list didn't change, then assume that
 205                         * it's the fileserver having trouble.
 206                         */
 207                        if (vnode->volume->servers == fc->server_list) {
 208                                fc->error = -EREMOTEIO;
 209                                goto next_server;
 210                        }
 211
 212                        /* Try again */
 213                        fc->flags |= AFS_FS_CURSOR_VNOVOL;
 214                        _leave(" = t [vnovol]");
 215                        return true;
 216
 217                case VSALVAGE: /* TODO: Should this return an error or iterate? */
 218                case VVOLEXISTS:
 219                case VNOSERVICE:
 220                case VONLINE:
 221                case VDISKFULL:
 222                case VOVERQUOTA:
 223                        fc->error = afs_abort_to_error(fc->ac.abort_code);
 224                        goto next_server;
 225
 226                case VOFFLINE:
 227                        if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags)) {
 228                                afs_busy(vnode->volume, fc->ac.abort_code);
 229                                clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags);
 230                        }
 231                        if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) {
 232                                fc->error = -EADV;
 233                                goto failed;
 234                        }
 235                        if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
 236                                fc->error = -ESTALE;
 237                                goto failed;
 238                        }
 239                        goto busy;
 240
 241                case VSALVAGING:
 242                case VRESTARTING:
 243                case VBUSY:
 244                        /* Retry after going round all the servers unless we
 245                         * have a file lock we need to maintain.
 246                         */
 247                        if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) {
 248                                fc->error = -EBUSY;
 249                                goto failed;
 250                        }
 251                        if (!test_and_set_bit(AFS_VOLUME_BUSY, &vnode->volume->flags)) {
 252                                afs_busy(vnode->volume, fc->ac.abort_code);
 253                                clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags);
 254                        }
 255                busy:
 256                        if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
 257                                if (!afs_sleep_and_retry(fc))
 258                                        goto failed;
 259
 260                                 /* Retry with same server & address */
 261                                _leave(" = t [vbusy]");
 262                                return true;
 263                        }
 264
 265                        fc->flags |= AFS_FS_CURSOR_VBUSY;
 266                        goto next_server;
 267
 268                case VMOVED:
 269                        /* The volume migrated to another server.  We consider
 270                         * consider all locks and callbacks broken and request
 271                         * an update from the VLDB.
 272                         *
 273                         * We also limit the number of VMOVED hops we will
 274                         * honour, just in case someone sets up a loop.
 275                         */
 276                        if (fc->flags & AFS_FS_CURSOR_VMOVED) {
 277                                fc->error = -EREMOTEIO;
 278                                goto failed;
 279                        }
 280                        fc->flags |= AFS_FS_CURSOR_VMOVED;
 281
 282                        set_bit(AFS_VOLUME_WAIT, &vnode->volume->flags);
 283                        set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags);
 284                        error = afs_check_volume_status(vnode->volume, fc->key);
 285                        if (error < 0)
 286                                goto failed_set_error;
 287
 288                        /* If the server list didn't change, then the VLDB is
 289                         * out of sync with the fileservers.  This is hopefully
 290                         * a temporary condition, however, so we don't want to
 291                         * permanently block access to the file.
 292                         *
 293                         * TODO: Try other fileservers if we can.
 294                         *
 295                         * TODO: Retry a few times with sleeps.
 296                         */
 297                        if (vnode->volume->servers == fc->server_list) {
 298                                fc->error = -ENOMEDIUM;
 299                                goto failed;
 300                        }
 301
 302                        goto restart_from_beginning;
 303
 304                default:
 305                        clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags);
 306                        clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags);
 307                        fc->error = afs_abort_to_error(fc->ac.abort_code);
 308                        goto failed;
 309                }
 310
 311        case -ETIMEDOUT:
 312        case -ETIME:
 313                if (fc->error != -EDESTADDRREQ)
 314                        goto iterate_address;
 315                /* Fall through */
 316        case -ERFKILL:
 317        case -EADDRNOTAVAIL:
 318        case -ENETUNREACH:
 319        case -EHOSTUNREACH:
 320        case -EHOSTDOWN:
 321        case -ECONNREFUSED:
 322                _debug("no conn");
 323                fc->error = error;
 324                goto iterate_address;
 325
 326        case -ECONNRESET:
 327                _debug("call reset");
 328                fc->error = error;
 329                goto failed;
 330        }
 331
 332restart_from_beginning:
 333        _debug("restart");
 334        afs_end_cursor(&fc->ac);
 335        afs_put_cb_interest(afs_v2net(vnode), fc->cbi);
 336        fc->cbi = NULL;
 337        afs_put_serverlist(afs_v2net(vnode), fc->server_list);
 338        fc->server_list = NULL;
 339start:
 340        _debug("start");
 341        /* See if we need to do an update of the volume record.  Note that the
 342         * volume may have moved or even have been deleted.
 343         */
 344        error = afs_check_volume_status(vnode->volume, fc->key);
 345        if (error < 0)
 346                goto failed_set_error;
 347
 348        if (!afs_start_fs_iteration(fc, vnode))
 349                goto failed;
 350
 351        _debug("__ VOL %llx __", vnode->volume->vid);
 352        error = afs_probe_fileservers(afs_v2net(vnode), fc->key, fc->server_list);
 353        if (error < 0)
 354                goto failed_set_error;
 355
 356pick_server:
 357        _debug("pick [%lx]", fc->untried);
 358
 359        error = afs_wait_for_fs_probes(fc->server_list, fc->untried);
 360        if (error < 0)
 361                goto failed_set_error;
 362
 363        /* Pick the untried server with the lowest RTT.  If we have outstanding
 364         * callbacks, we stick with the server we're already using if we can.
 365         */
 366        if (fc->cbi) {
 367                _debug("cbi %u", fc->index);
 368                if (test_bit(fc->index, &fc->untried))
 369                        goto selected_server;
 370                afs_put_cb_interest(afs_v2net(vnode), fc->cbi);
 371                fc->cbi = NULL;
 372                _debug("nocbi");
 373        }
 374
 375        fc->index = -1;
 376        rtt = U32_MAX;
 377        for (i = 0; i < fc->server_list->nr_servers; i++) {
 378                struct afs_server *s = fc->server_list->servers[i].server;
 379
 380                if (!test_bit(i, &fc->untried) || !s->probe.responded)
 381                        continue;
 382                if (s->probe.rtt < rtt) {
 383                        fc->index = i;
 384                        rtt = s->probe.rtt;
 385                }
 386        }
 387
 388        if (fc->index == -1)
 389                goto no_more_servers;
 390
 391selected_server:
 392        _debug("use %d", fc->index);
 393        __clear_bit(fc->index, &fc->untried);
 394
 395        /* We're starting on a different fileserver from the list.  We need to
 396         * check it, create a callback intercept, find its address list and
 397         * probe its capabilities before we use it.
 398         */
 399        ASSERTCMP(fc->ac.alist, ==, NULL);
 400        server = fc->server_list->servers[fc->index].server;
 401
 402        if (!afs_check_server_record(fc, server))
 403                goto failed;
 404
 405        _debug("USING SERVER: %pU", &server->uuid);
 406
 407        /* Make sure we've got a callback interest record for this server.  We
 408         * have to link it in before we send the request as we can be sent a
 409         * break request before we've finished decoding the reply and
 410         * installing the vnode.
 411         */
 412        error = afs_register_server_cb_interest(vnode, fc->server_list,
 413                                                fc->index);
 414        if (error < 0)
 415                goto failed_set_error;
 416
 417        fc->cbi = afs_get_cb_interest(
 418                rcu_dereference_protected(vnode->cb_interest,
 419                                          lockdep_is_held(&vnode->io_lock)));
 420
 421        read_lock(&server->fs_lock);
 422        alist = rcu_dereference_protected(server->addresses,
 423                                          lockdep_is_held(&server->fs_lock));
 424        afs_get_addrlist(alist);
 425        read_unlock(&server->fs_lock);
 426
 427        memset(&fc->ac, 0, sizeof(fc->ac));
 428
 429        if (!fc->ac.alist)
 430                fc->ac.alist = alist;
 431        else
 432                afs_put_addrlist(alist);
 433
 434        fc->ac.index = -1;
 435
 436iterate_address:
 437        ASSERT(fc->ac.alist);
 438        /* Iterate over the current server's address list to try and find an
 439         * address on which it will respond to us.
 440         */
 441        if (!afs_iterate_addresses(&fc->ac))
 442                goto next_server;
 443
 444        _debug("address [%u] %u/%u", fc->index, fc->ac.index, fc->ac.alist->nr_addrs);
 445
 446        _leave(" = t");
 447        return true;
 448
 449next_server:
 450        _debug("next");
 451        afs_end_cursor(&fc->ac);
 452        goto pick_server;
 453
 454no_more_servers:
 455        /* That's all the servers poked to no good effect.  Try again if some
 456         * of them were busy.
 457         */
 458        if (fc->flags & AFS_FS_CURSOR_VBUSY)
 459                goto restart_from_beginning;
 460
 461        e.error = -EDESTADDRREQ;
 462        e.responded = false;
 463        for (i = 0; i < fc->server_list->nr_servers; i++) {
 464                struct afs_server *s = fc->server_list->servers[i].server;
 465
 466                afs_prioritise_error(&e, READ_ONCE(s->probe.error),
 467                                     s->probe.abort_code);
 468        }
 469
 470        error = e.error;
 471
 472failed_set_error:
 473        fc->error = error;
 474failed:
 475        fc->flags |= AFS_FS_CURSOR_STOP;
 476        afs_end_cursor(&fc->ac);
 477        _leave(" = f [failed %d]", fc->error);
 478        return false;
 479}
 480
 481/*
 482 * Select the same fileserver we used for a vnode before and only that
 483 * fileserver.  We use this when we have a lock on that file, which is backed
 484 * only by the fileserver we obtained it from.
 485 */
 486bool afs_select_current_fileserver(struct afs_fs_cursor *fc)
 487{
 488        struct afs_vnode *vnode = fc->vnode;
 489        struct afs_cb_interest *cbi;
 490        struct afs_addr_list *alist;
 491        int error = fc->ac.error;
 492
 493        _enter("");
 494
 495        cbi = rcu_dereference_protected(vnode->cb_interest,
 496                                        lockdep_is_held(&vnode->io_lock));
 497
 498        switch (error) {
 499        case SHRT_MAX:
 500                if (!cbi) {
 501                        fc->error = -ESTALE;
 502                        fc->flags |= AFS_FS_CURSOR_STOP;
 503                        return false;
 504                }
 505
 506                fc->cbi = afs_get_cb_interest(cbi);
 507
 508                read_lock(&cbi->server->fs_lock);
 509                alist = rcu_dereference_protected(cbi->server->addresses,
 510                                                  lockdep_is_held(&cbi->server->fs_lock));
 511                afs_get_addrlist(alist);
 512                read_unlock(&cbi->server->fs_lock);
 513                if (!alist) {
 514                        fc->error = -ESTALE;
 515                        fc->flags |= AFS_FS_CURSOR_STOP;
 516                        return false;
 517                }
 518
 519                memset(&fc->ac, 0, sizeof(fc->ac));
 520                fc->ac.alist = alist;
 521                fc->ac.index = -1;
 522                goto iterate_address;
 523
 524        case 0:
 525        default:
 526                /* Success or local failure.  Stop. */
 527                fc->error = error;
 528                fc->flags |= AFS_FS_CURSOR_STOP;
 529                _leave(" = f [okay/local %d]", error);
 530                return false;
 531
 532        case -ECONNABORTED:
 533                fc->error = afs_abort_to_error(fc->ac.abort_code);
 534                fc->flags |= AFS_FS_CURSOR_STOP;
 535                _leave(" = f [abort]");
 536                return false;
 537
 538        case -ERFKILL:
 539        case -EADDRNOTAVAIL:
 540        case -ENETUNREACH:
 541        case -EHOSTUNREACH:
 542        case -EHOSTDOWN:
 543        case -ECONNREFUSED:
 544        case -ETIMEDOUT:
 545        case -ETIME:
 546                _debug("no conn");
 547                fc->error = error;
 548                goto iterate_address;
 549        }
 550
 551iterate_address:
 552        /* Iterate over the current server's address list to try and find an
 553         * address on which it will respond to us.
 554         */
 555        if (afs_iterate_addresses(&fc->ac)) {
 556                _leave(" = t");
 557                return true;
 558        }
 559
 560        afs_end_cursor(&fc->ac);
 561        return false;
 562}
 563
 564/*
 565 * Dump cursor state in the case of the error being EDESTADDRREQ.
 566 */
 567static void afs_dump_edestaddrreq(const struct afs_fs_cursor *fc)
 568{
 569        static int count;
 570        int i;
 571
 572        if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3)
 573                return;
 574        count++;
 575
 576        rcu_read_lock();
 577
 578        pr_notice("EDESTADDR occurred\n");
 579        pr_notice("FC: cbb=%x cbb2=%x fl=%hx err=%hd\n",
 580                  fc->cb_break, fc->cb_break_2, fc->flags, fc->error);
 581        pr_notice("FC: ut=%lx ix=%d ni=%u\n",
 582                  fc->untried, fc->index, fc->nr_iterations);
 583
 584        if (fc->server_list) {
 585                const struct afs_server_list *sl = fc->server_list;
 586                pr_notice("FC: SL nr=%u pr=%u vnov=%hx\n",
 587                          sl->nr_servers, sl->preferred, sl->vnovol_mask);
 588                for (i = 0; i < sl->nr_servers; i++) {
 589                        const struct afs_server *s = sl->servers[i].server;
 590                        pr_notice("FC: server fl=%lx av=%u %pU\n",
 591                                  s->flags, s->addr_version, &s->uuid);
 592                        if (s->addresses) {
 593                                const struct afs_addr_list *a =
 594                                        rcu_dereference(s->addresses);
 595                                pr_notice("FC:  - av=%u nr=%u/%u/%u pr=%u\n",
 596                                          a->version,
 597                                          a->nr_ipv4, a->nr_addrs, a->max_addrs,
 598                                          a->preferred);
 599                                pr_notice("FC:  - pr=%lx R=%lx F=%lx\n",
 600                                          a->probed, a->responded, a->failed);
 601                                if (a == fc->ac.alist)
 602                                        pr_notice("FC:  - current\n");
 603                        }
 604                }
 605        }
 606
 607        pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n",
 608                  fc->ac.tried, fc->ac.index, fc->ac.abort_code, fc->ac.error,
 609                  fc->ac.responded, fc->ac.nr_iterations);
 610        rcu_read_unlock();
 611}
 612
 613/*
 614 * Tidy up a filesystem cursor and unlock the vnode.
 615 */
 616int afs_end_vnode_operation(struct afs_fs_cursor *fc)
 617{
 618        struct afs_net *net = afs_v2net(fc->vnode);
 619
 620        if (fc->error == -EDESTADDRREQ ||
 621            fc->error == -EADDRNOTAVAIL ||
 622            fc->error == -ENETUNREACH ||
 623            fc->error == -EHOSTUNREACH)
 624                afs_dump_edestaddrreq(fc);
 625
 626        mutex_unlock(&fc->vnode->io_lock);
 627
 628        afs_end_cursor(&fc->ac);
 629        afs_put_cb_interest(net, fc->cbi);
 630        afs_put_serverlist(net, fc->server_list);
 631
 632        if (fc->error == -ECONNABORTED)
 633                fc->error = afs_abort_to_error(fc->ac.abort_code);
 634
 635        return fc->error;
 636}
 637