linux/fs/afs/rotate.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/* Handle fileserver selection and rotation.
   3 *
   4 * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
   5 * Written by David Howells (dhowells@redhat.com)
   6 */
   7
   8#include <linux/kernel.h>
   9#include <linux/slab.h>
  10#include <linux/fs.h>
  11#include <linux/sched.h>
  12#include <linux/delay.h>
  13#include <linux/sched/signal.h>
  14#include "internal.h"
  15#include "afs_fs.h"
  16
  17/*
  18 * Begin iteration through a server list, starting with the vnode's last used
  19 * server if possible, or the last recorded good server if not.
  20 */
  21static bool afs_start_fs_iteration(struct afs_operation *op,
  22                                   struct afs_vnode *vnode)
  23{
  24        struct afs_server *server;
  25        void *cb_server;
  26        int i;
  27
  28        read_lock(&op->volume->servers_lock);
  29        op->server_list = afs_get_serverlist(
  30                rcu_dereference_protected(op->volume->servers,
  31                                          lockdep_is_held(&op->volume->servers_lock)));
  32        read_unlock(&op->volume->servers_lock);
  33
  34        op->untried = (1UL << op->server_list->nr_servers) - 1;
  35        op->index = READ_ONCE(op->server_list->preferred);
  36
  37        cb_server = vnode->cb_server;
  38        if (cb_server) {
  39                /* See if the vnode's preferred record is still available */
  40                for (i = 0; i < op->server_list->nr_servers; i++) {
  41                        server = op->server_list->servers[i].server;
  42                        if (server == cb_server) {
  43                                op->index = i;
  44                                goto found_interest;
  45                        }
  46                }
  47
  48                /* If we have a lock outstanding on a server that's no longer
  49                 * serving this vnode, then we can't switch to another server
  50                 * and have to return an error.
  51                 */
  52                if (op->flags & AFS_OPERATION_CUR_ONLY) {
  53                        op->error = -ESTALE;
  54                        return false;
  55                }
  56
  57                /* Note that the callback promise is effectively broken */
  58                write_seqlock(&vnode->cb_lock);
  59                ASSERTCMP(cb_server, ==, vnode->cb_server);
  60                vnode->cb_server = NULL;
  61                if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags))
  62                        vnode->cb_break++;
  63                write_sequnlock(&vnode->cb_lock);
  64        }
  65
  66found_interest:
  67        return true;
  68}
  69
  70/*
  71 * Post volume busy note.
  72 */
  73static void afs_busy(struct afs_volume *volume, u32 abort_code)
  74{
  75        const char *m;
  76
  77        switch (abort_code) {
  78        case VOFFLINE:          m = "offline";          break;
  79        case VRESTARTING:       m = "restarting";       break;
  80        case VSALVAGING:        m = "being salvaged";   break;
  81        default:                m = "busy";             break;
  82        }
  83
  84        pr_notice("kAFS: Volume %llu '%s' is %s\n", volume->vid, volume->name, m);
  85}
  86
  87/*
  88 * Sleep and retry the operation to the same fileserver.
  89 */
  90static bool afs_sleep_and_retry(struct afs_operation *op)
  91{
  92        if (!(op->flags & AFS_OPERATION_UNINTR)) {
  93                msleep_interruptible(1000);
  94                if (signal_pending(current)) {
  95                        op->error = -ERESTARTSYS;
  96                        return false;
  97                }
  98        } else {
  99                msleep(1000);
 100        }
 101
 102        return true;
 103}
 104
 105/*
 106 * Select the fileserver to use.  May be called multiple times to rotate
 107 * through the fileservers.
 108 */
 109bool afs_select_fileserver(struct afs_operation *op)
 110{
 111        struct afs_addr_list *alist;
 112        struct afs_server *server;
 113        struct afs_vnode *vnode = op->file[0].vnode;
 114        struct afs_error e;
 115        u32 rtt;
 116        int error = op->ac.error, i;
 117
 118        _enter("%lx[%d],%lx[%d],%d,%d",
 119               op->untried, op->index,
 120               op->ac.tried, op->ac.index,
 121               error, op->ac.abort_code);
 122
 123        if (op->flags & AFS_OPERATION_STOP) {
 124                _leave(" = f [stopped]");
 125                return false;
 126        }
 127
 128        op->nr_iterations++;
 129
 130        /* Evaluate the result of the previous operation, if there was one. */
 131        switch (error) {
 132        case SHRT_MAX:
 133                goto start;
 134
 135        case 0:
 136        default:
 137                /* Success or local failure.  Stop. */
 138                op->error = error;
 139                op->flags |= AFS_OPERATION_STOP;
 140                _leave(" = f [okay/local %d]", error);
 141                return false;
 142
 143        case -ECONNABORTED:
 144                /* The far side rejected the operation on some grounds.  This
 145                 * might involve the server being busy or the volume having been moved.
 146                 */
 147                switch (op->ac.abort_code) {
 148                case VNOVOL:
 149                        /* This fileserver doesn't know about the volume.
 150                         * - May indicate that the VL is wrong - retry once and compare
 151                         *   the results.
 152                         * - May indicate that the fileserver couldn't attach to the vol.
 153                         */
 154                        if (op->flags & AFS_OPERATION_VNOVOL) {
 155                                op->error = -EREMOTEIO;
 156                                goto next_server;
 157                        }
 158
 159                        write_lock(&op->volume->servers_lock);
 160                        op->server_list->vnovol_mask |= 1 << op->index;
 161                        write_unlock(&op->volume->servers_lock);
 162
 163                        set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags);
 164                        error = afs_check_volume_status(op->volume, op);
 165                        if (error < 0)
 166                                goto failed_set_error;
 167
 168                        if (test_bit(AFS_VOLUME_DELETED, &op->volume->flags)) {
 169                                op->error = -ENOMEDIUM;
 170                                goto failed;
 171                        }
 172
 173                        /* If the server list didn't change, then assume that
 174                         * it's the fileserver having trouble.
 175                         */
 176                        if (rcu_access_pointer(op->volume->servers) == op->server_list) {
 177                                op->error = -EREMOTEIO;
 178                                goto next_server;
 179                        }
 180
 181                        /* Try again */
 182                        op->flags |= AFS_OPERATION_VNOVOL;
 183                        _leave(" = t [vnovol]");
 184                        return true;
 185
 186                case VSALVAGE: /* TODO: Should this return an error or iterate? */
 187                case VVOLEXISTS:
 188                case VNOSERVICE:
 189                case VONLINE:
 190                case VDISKFULL:
 191                case VOVERQUOTA:
 192                        op->error = afs_abort_to_error(op->ac.abort_code);
 193                        goto next_server;
 194
 195                case VOFFLINE:
 196                        if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &op->volume->flags)) {
 197                                afs_busy(op->volume, op->ac.abort_code);
 198                                clear_bit(AFS_VOLUME_BUSY, &op->volume->flags);
 199                        }
 200                        if (op->flags & AFS_OPERATION_NO_VSLEEP) {
 201                                op->error = -EADV;
 202                                goto failed;
 203                        }
 204                        if (op->flags & AFS_OPERATION_CUR_ONLY) {
 205                                op->error = -ESTALE;
 206                                goto failed;
 207                        }
 208                        goto busy;
 209
 210                case VSALVAGING:
 211                case VRESTARTING:
 212                case VBUSY:
 213                        /* Retry after going round all the servers unless we
 214                         * have a file lock we need to maintain.
 215                         */
 216                        if (op->flags & AFS_OPERATION_NO_VSLEEP) {
 217                                op->error = -EBUSY;
 218                                goto failed;
 219                        }
 220                        if (!test_and_set_bit(AFS_VOLUME_BUSY, &op->volume->flags)) {
 221                                afs_busy(op->volume, op->ac.abort_code);
 222                                clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags);
 223                        }
 224                busy:
 225                        if (op->flags & AFS_OPERATION_CUR_ONLY) {
 226                                if (!afs_sleep_and_retry(op))
 227                                        goto failed;
 228
 229                                 /* Retry with same server & address */
 230                                _leave(" = t [vbusy]");
 231                                return true;
 232                        }
 233
 234                        op->flags |= AFS_OPERATION_VBUSY;
 235                        goto next_server;
 236
 237                case VMOVED:
 238                        /* The volume migrated to another server.  We consider
 239                         * consider all locks and callbacks broken and request
 240                         * an update from the VLDB.
 241                         *
 242                         * We also limit the number of VMOVED hops we will
 243                         * honour, just in case someone sets up a loop.
 244                         */
 245                        if (op->flags & AFS_OPERATION_VMOVED) {
 246                                op->error = -EREMOTEIO;
 247                                goto failed;
 248                        }
 249                        op->flags |= AFS_OPERATION_VMOVED;
 250
 251                        set_bit(AFS_VOLUME_WAIT, &op->volume->flags);
 252                        set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags);
 253                        error = afs_check_volume_status(op->volume, op);
 254                        if (error < 0)
 255                                goto failed_set_error;
 256
 257                        /* If the server list didn't change, then the VLDB is
 258                         * out of sync with the fileservers.  This is hopefully
 259                         * a temporary condition, however, so we don't want to
 260                         * permanently block access to the file.
 261                         *
 262                         * TODO: Try other fileservers if we can.
 263                         *
 264                         * TODO: Retry a few times with sleeps.
 265                         */
 266                        if (rcu_access_pointer(op->volume->servers) == op->server_list) {
 267                                op->error = -ENOMEDIUM;
 268                                goto failed;
 269                        }
 270
 271                        goto restart_from_beginning;
 272
 273                default:
 274                        clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags);
 275                        clear_bit(AFS_VOLUME_BUSY, &op->volume->flags);
 276                        op->error = afs_abort_to_error(op->ac.abort_code);
 277                        goto failed;
 278                }
 279
 280        case -ETIMEDOUT:
 281        case -ETIME:
 282                if (op->error != -EDESTADDRREQ)
 283                        goto iterate_address;
 284                fallthrough;
 285        case -ERFKILL:
 286        case -EADDRNOTAVAIL:
 287        case -ENETUNREACH:
 288        case -EHOSTUNREACH:
 289        case -EHOSTDOWN:
 290        case -ECONNREFUSED:
 291                _debug("no conn");
 292                op->error = error;
 293                goto iterate_address;
 294
 295        case -ECONNRESET:
 296                _debug("call reset");
 297                op->error = error;
 298                goto failed;
 299        }
 300
 301restart_from_beginning:
 302        _debug("restart");
 303        afs_end_cursor(&op->ac);
 304        op->server = NULL;
 305        afs_put_serverlist(op->net, op->server_list);
 306        op->server_list = NULL;
 307start:
 308        _debug("start");
 309        /* See if we need to do an update of the volume record.  Note that the
 310         * volume may have moved or even have been deleted.
 311         */
 312        error = afs_check_volume_status(op->volume, op);
 313        if (error < 0)
 314                goto failed_set_error;
 315
 316        if (!afs_start_fs_iteration(op, vnode))
 317                goto failed;
 318
 319        _debug("__ VOL %llx __", op->volume->vid);
 320
 321pick_server:
 322        _debug("pick [%lx]", op->untried);
 323
 324        error = afs_wait_for_fs_probes(op->server_list, op->untried);
 325        if (error < 0)
 326                goto failed_set_error;
 327
 328        /* Pick the untried server with the lowest RTT.  If we have outstanding
 329         * callbacks, we stick with the server we're already using if we can.
 330         */
 331        if (op->server) {
 332                _debug("server %u", op->index);
 333                if (test_bit(op->index, &op->untried))
 334                        goto selected_server;
 335                op->server = NULL;
 336                _debug("no server");
 337        }
 338
 339        op->index = -1;
 340        rtt = U32_MAX;
 341        for (i = 0; i < op->server_list->nr_servers; i++) {
 342                struct afs_server *s = op->server_list->servers[i].server;
 343
 344                if (!test_bit(i, &op->untried) ||
 345                    !test_bit(AFS_SERVER_FL_RESPONDING, &s->flags))
 346                        continue;
 347                if (s->probe.rtt < rtt) {
 348                        op->index = i;
 349                        rtt = s->probe.rtt;
 350                }
 351        }
 352
 353        if (op->index == -1)
 354                goto no_more_servers;
 355
 356selected_server:
 357        _debug("use %d", op->index);
 358        __clear_bit(op->index, &op->untried);
 359
 360        /* We're starting on a different fileserver from the list.  We need to
 361         * check it, create a callback intercept, find its address list and
 362         * probe its capabilities before we use it.
 363         */
 364        ASSERTCMP(op->ac.alist, ==, NULL);
 365        server = op->server_list->servers[op->index].server;
 366
 367        if (!afs_check_server_record(op, server))
 368                goto failed;
 369
 370        _debug("USING SERVER: %pU", &server->uuid);
 371
 372        op->flags |= AFS_OPERATION_RETRY_SERVER;
 373        op->server = server;
 374        if (vnode->cb_server != server) {
 375                vnode->cb_server = server;
 376                vnode->cb_s_break = server->cb_s_break;
 377                vnode->cb_v_break = vnode->volume->cb_v_break;
 378                clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
 379        }
 380
 381        read_lock(&server->fs_lock);
 382        alist = rcu_dereference_protected(server->addresses,
 383                                          lockdep_is_held(&server->fs_lock));
 384        afs_get_addrlist(alist);
 385        read_unlock(&server->fs_lock);
 386
 387retry_server:
 388        memset(&op->ac, 0, sizeof(op->ac));
 389
 390        if (!op->ac.alist)
 391                op->ac.alist = alist;
 392        else
 393                afs_put_addrlist(alist);
 394
 395        op->ac.index = -1;
 396
 397iterate_address:
 398        ASSERT(op->ac.alist);
 399        /* Iterate over the current server's address list to try and find an
 400         * address on which it will respond to us.
 401         */
 402        if (!afs_iterate_addresses(&op->ac))
 403                goto out_of_addresses;
 404
 405        _debug("address [%u] %u/%u %pISp",
 406               op->index, op->ac.index, op->ac.alist->nr_addrs,
 407               &op->ac.alist->addrs[op->ac.index].transport);
 408
 409        _leave(" = t");
 410        return true;
 411
 412out_of_addresses:
 413        /* We've now had a failure to respond on all of a server's addresses -
 414         * immediately probe them again and consider retrying the server.
 415         */
 416        afs_probe_fileserver(op->net, op->server);
 417        if (op->flags & AFS_OPERATION_RETRY_SERVER) {
 418                alist = op->ac.alist;
 419                error = afs_wait_for_one_fs_probe(
 420                        op->server, !(op->flags & AFS_OPERATION_UNINTR));
 421                switch (error) {
 422                case 0:
 423                        op->flags &= ~AFS_OPERATION_RETRY_SERVER;
 424                        goto retry_server;
 425                case -ERESTARTSYS:
 426                        goto failed_set_error;
 427                case -ETIME:
 428                case -EDESTADDRREQ:
 429                        goto next_server;
 430                }
 431        }
 432
 433next_server:
 434        _debug("next");
 435        afs_end_cursor(&op->ac);
 436        goto pick_server;
 437
 438no_more_servers:
 439        /* That's all the servers poked to no good effect.  Try again if some
 440         * of them were busy.
 441         */
 442        if (op->flags & AFS_OPERATION_VBUSY)
 443                goto restart_from_beginning;
 444
 445        e.error = -EDESTADDRREQ;
 446        e.responded = false;
 447        for (i = 0; i < op->server_list->nr_servers; i++) {
 448                struct afs_server *s = op->server_list->servers[i].server;
 449
 450                afs_prioritise_error(&e, READ_ONCE(s->probe.error),
 451                                     s->probe.abort_code);
 452        }
 453
 454        error = e.error;
 455
 456failed_set_error:
 457        op->error = error;
 458failed:
 459        op->flags |= AFS_OPERATION_STOP;
 460        afs_end_cursor(&op->ac);
 461        _leave(" = f [failed %d]", op->error);
 462        return false;
 463}
 464
 465/*
 466 * Dump cursor state in the case of the error being EDESTADDRREQ.
 467 */
 468void afs_dump_edestaddrreq(const struct afs_operation *op)
 469{
 470        static int count;
 471        int i;
 472
 473        if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3)
 474                return;
 475        count++;
 476
 477        rcu_read_lock();
 478
 479        pr_notice("EDESTADDR occurred\n");
 480        pr_notice("FC: cbb=%x cbb2=%x fl=%x err=%hd\n",
 481                  op->file[0].cb_break_before,
 482                  op->file[1].cb_break_before, op->flags, op->error);
 483        pr_notice("FC: ut=%lx ix=%d ni=%u\n",
 484                  op->untried, op->index, op->nr_iterations);
 485
 486        if (op->server_list) {
 487                const struct afs_server_list *sl = op->server_list;
 488                pr_notice("FC: SL nr=%u pr=%u vnov=%hx\n",
 489                          sl->nr_servers, sl->preferred, sl->vnovol_mask);
 490                for (i = 0; i < sl->nr_servers; i++) {
 491                        const struct afs_server *s = sl->servers[i].server;
 492                        pr_notice("FC: server fl=%lx av=%u %pU\n",
 493                                  s->flags, s->addr_version, &s->uuid);
 494                        if (s->addresses) {
 495                                const struct afs_addr_list *a =
 496                                        rcu_dereference(s->addresses);
 497                                pr_notice("FC:  - av=%u nr=%u/%u/%u pr=%u\n",
 498                                          a->version,
 499                                          a->nr_ipv4, a->nr_addrs, a->max_addrs,
 500                                          a->preferred);
 501                                pr_notice("FC:  - R=%lx F=%lx\n",
 502                                          a->responded, a->failed);
 503                                if (a == op->ac.alist)
 504                                        pr_notice("FC:  - current\n");
 505                        }
 506                }
 507        }
 508
 509        pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n",
 510                  op->ac.tried, op->ac.index, op->ac.abort_code, op->ac.error,
 511                  op->ac.responded, op->ac.nr_iterations);
 512        rcu_read_unlock();
 513}
 514