linux/fs/ceph/mdsmap.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2#include <linux/ceph/ceph_debug.h>
   3
   4#include <linux/bug.h>
   5#include <linux/err.h>
   6#include <linux/random.h>
   7#include <linux/slab.h>
   8#include <linux/types.h>
   9
  10#include <linux/ceph/mdsmap.h>
  11#include <linux/ceph/messenger.h>
  12#include <linux/ceph/decode.h>
  13
  14#include "super.h"
  15
  16
  17/*
  18 * choose a random mds that is "up" (i.e. has a state > 0), or -1.
  19 */
  20int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
  21{
  22        int n = 0;
  23        int i;
  24
  25        /* special case for one mds */
  26        if (1 == m->m_num_mds && m->m_info[0].state > 0)
  27                return 0;
  28
  29        /* count */
  30        for (i = 0; i < m->m_num_mds; i++)
  31                if (m->m_info[i].state > 0)
  32                        n++;
  33        if (n == 0)
  34                return -1;
  35
  36        /* pick */
  37        n = prandom_u32() % n;
  38        i = 0;
  39        for (i = 0; n > 0; i++, n--)
  40                while (m->m_info[i].state <= 0)
  41                        i++;
  42
  43        return i;
  44}
  45
  46#define __decode_and_drop_type(p, end, type, bad)               \
  47        do {                                                    \
  48                if (*p + sizeof(type) > end)                    \
  49                        goto bad;                               \
  50                *p += sizeof(type);                             \
  51        } while (0)
  52
  53#define __decode_and_drop_set(p, end, type, bad)                \
  54        do {                                                    \
  55                u32 n;                                          \
  56                size_t need;                                    \
  57                ceph_decode_32_safe(p, end, n, bad);            \
  58                need = sizeof(type) * n;                        \
  59                ceph_decode_need(p, end, need, bad);            \
  60                *p += need;                                     \
  61        } while (0)
  62
  63#define __decode_and_drop_map(p, end, ktype, vtype, bad)        \
  64        do {                                                    \
  65                u32 n;                                          \
  66                size_t need;                                    \
  67                ceph_decode_32_safe(p, end, n, bad);            \
  68                need = (sizeof(ktype) + sizeof(vtype)) * n;     \
  69                ceph_decode_need(p, end, need, bad);            \
  70                *p += need;                                     \
  71        } while (0)
  72
  73
  74static int __decode_and_drop_compat_set(void **p, void* end)
  75{
  76        int i;
  77        /* compat, ro_compat, incompat*/
  78        for (i = 0; i < 3; i++) {
  79                u32 n;
  80                ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
  81                /* mask */
  82                *p += sizeof(u64);
  83                /* names (map<u64, string>) */
  84                n = ceph_decode_32(p);
  85                while (n-- > 0) {
  86                        u32 len;
  87                        ceph_decode_need(p, end, sizeof(u64) + sizeof(u32),
  88                                         bad);
  89                        *p += sizeof(u64);
  90                        len = ceph_decode_32(p);
  91                        ceph_decode_need(p, end, len, bad);
  92                        *p += len;
  93                }
  94        }
  95        return 0;
  96bad:
  97        return -1;
  98}
  99
 100/*
 101 * Decode an MDS map
 102 *
 103 * Ignore any fields we don't care about (there are quite a few of
 104 * them).
 105 */
 106struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 107{
 108        struct ceph_mdsmap *m;
 109        const void *start = *p;
 110        int i, j, n;
 111        int err = -EINVAL;
 112        u8 mdsmap_v, mdsmap_cv;
 113        u16 mdsmap_ev;
 114
 115        m = kzalloc(sizeof(*m), GFP_NOFS);
 116        if (!m)
 117                return ERR_PTR(-ENOMEM);
 118
 119        ceph_decode_need(p, end, 1 + 1, bad);
 120        mdsmap_v = ceph_decode_8(p);
 121        mdsmap_cv = ceph_decode_8(p);
 122        if (mdsmap_v >= 4) {
 123               u32 mdsmap_len;
 124               ceph_decode_32_safe(p, end, mdsmap_len, bad);
 125               if (end < *p + mdsmap_len)
 126                       goto bad;
 127               end = *p + mdsmap_len;
 128        }
 129
 130        ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
 131        m->m_epoch = ceph_decode_32(p);
 132        m->m_client_epoch = ceph_decode_32(p);
 133        m->m_last_failure = ceph_decode_32(p);
 134        m->m_root = ceph_decode_32(p);
 135        m->m_session_timeout = ceph_decode_32(p);
 136        m->m_session_autoclose = ceph_decode_32(p);
 137        m->m_max_file_size = ceph_decode_64(p);
 138        m->m_max_mds = ceph_decode_32(p);
 139        m->m_num_mds = m->m_max_mds;
 140
 141        m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS);
 142        if (!m->m_info)
 143                goto nomem;
 144
 145        /* pick out active nodes from mds_info (state > 0) */
 146        n = ceph_decode_32(p);
 147        for (i = 0; i < n; i++) {
 148                u64 global_id;
 149                u32 namelen;
 150                s32 mds, inc, state;
 151                u64 state_seq;
 152                u8 info_v;
 153                void *info_end = NULL;
 154                struct ceph_entity_addr addr;
 155                u32 num_export_targets;
 156                void *pexport_targets = NULL;
 157                struct ceph_timespec laggy_since;
 158                struct ceph_mds_info *info;
 159
 160                ceph_decode_need(p, end, sizeof(u64) + 1, bad);
 161                global_id = ceph_decode_64(p);
 162                info_v= ceph_decode_8(p);
 163                if (info_v >= 4) {
 164                        u32 info_len;
 165                        u8 info_cv;
 166                        ceph_decode_need(p, end, 1 + sizeof(u32), bad);
 167                        info_cv = ceph_decode_8(p);
 168                        info_len = ceph_decode_32(p);
 169                        info_end = *p + info_len;
 170                        if (info_end > end)
 171                                goto bad;
 172                }
 173
 174                ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
 175                *p += sizeof(u64);
 176                namelen = ceph_decode_32(p);  /* skip mds name */
 177                *p += namelen;
 178
 179                ceph_decode_need(p, end,
 180                                 4*sizeof(u32) + sizeof(u64) +
 181                                 sizeof(addr) + sizeof(struct ceph_timespec),
 182                                 bad);
 183                mds = ceph_decode_32(p);
 184                inc = ceph_decode_32(p);
 185                state = ceph_decode_32(p);
 186                state_seq = ceph_decode_64(p);
 187                ceph_decode_copy(p, &addr, sizeof(addr));
 188                ceph_decode_addr(&addr);
 189                ceph_decode_copy(p, &laggy_since, sizeof(laggy_since));
 190                *p += sizeof(u32);
 191                ceph_decode_32_safe(p, end, namelen, bad);
 192                *p += namelen;
 193                if (info_v >= 2) {
 194                        ceph_decode_32_safe(p, end, num_export_targets, bad);
 195                        pexport_targets = *p;
 196                        *p += num_export_targets * sizeof(u32);
 197                } else {
 198                        num_export_targets = 0;
 199                }
 200
 201                if (info_end && *p != info_end) {
 202                        if (*p > info_end)
 203                                goto bad;
 204                        *p = info_end;
 205                }
 206
 207                dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
 208                     i+1, n, global_id, mds, inc,
 209                     ceph_pr_addr(&addr.in_addr),
 210                     ceph_mds_state_name(state));
 211
 212                if (mds < 0 || state <= 0)
 213                        continue;
 214
 215                if (mds >= m->m_num_mds) {
 216                        int new_num = max(mds + 1, m->m_num_mds * 2);
 217                        void *new_m_info = krealloc(m->m_info,
 218                                                new_num * sizeof(*m->m_info),
 219                                                GFP_NOFS | __GFP_ZERO);
 220                        if (!new_m_info)
 221                                goto nomem;
 222                        m->m_info = new_m_info;
 223                        m->m_num_mds = new_num;
 224                }
 225
 226                info = &m->m_info[mds];
 227                info->global_id = global_id;
 228                info->state = state;
 229                info->addr = addr;
 230                info->laggy = (laggy_since.tv_sec != 0 ||
 231                               laggy_since.tv_nsec != 0);
 232                info->num_export_targets = num_export_targets;
 233                if (num_export_targets) {
 234                        info->export_targets = kcalloc(num_export_targets,
 235                                                       sizeof(u32), GFP_NOFS);
 236                        if (!info->export_targets)
 237                                goto nomem;
 238                        for (j = 0; j < num_export_targets; j++)
 239                                info->export_targets[j] =
 240                                       ceph_decode_32(&pexport_targets);
 241                } else {
 242                        info->export_targets = NULL;
 243                }
 244        }
 245        if (m->m_num_mds > m->m_max_mds) {
 246                /* find max up mds */
 247                for (i = m->m_num_mds; i >= m->m_max_mds; i--) {
 248                        if (i == 0 || m->m_info[i-1].state > 0)
 249                                break;
 250                }
 251                m->m_num_mds = i;
 252        }
 253
 254        /* pg_pools */
 255        ceph_decode_32_safe(p, end, n, bad);
 256        m->m_num_data_pg_pools = n;
 257        m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS);
 258        if (!m->m_data_pg_pools)
 259                goto nomem;
 260        ceph_decode_need(p, end, sizeof(u64)*(n+1), bad);
 261        for (i = 0; i < n; i++)
 262                m->m_data_pg_pools[i] = ceph_decode_64(p);
 263        m->m_cas_pg_pool = ceph_decode_64(p);
 264        m->m_enabled = m->m_epoch > 1;
 265
 266        mdsmap_ev = 1;
 267        if (mdsmap_v >= 2) {
 268                ceph_decode_16_safe(p, end, mdsmap_ev, bad_ext);
 269        }
 270        if (mdsmap_ev >= 3) {
 271                if (__decode_and_drop_compat_set(p, end) < 0)
 272                        goto bad_ext;
 273        }
 274        /* metadata_pool */
 275        if (mdsmap_ev < 5) {
 276                __decode_and_drop_type(p, end, u32, bad_ext);
 277        } else {
 278                __decode_and_drop_type(p, end, u64, bad_ext);
 279        }
 280
 281        /* created + modified + tableserver */
 282        __decode_and_drop_type(p, end, struct ceph_timespec, bad_ext);
 283        __decode_and_drop_type(p, end, struct ceph_timespec, bad_ext);
 284        __decode_and_drop_type(p, end, u32, bad_ext);
 285
 286        /* in */
 287        {
 288                int num_laggy = 0;
 289                ceph_decode_32_safe(p, end, n, bad_ext);
 290                ceph_decode_need(p, end, sizeof(u32) * n, bad_ext);
 291
 292                for (i = 0; i < n; i++) {
 293                        s32 mds = ceph_decode_32(p);
 294                        if (mds >= 0 && mds < m->m_num_mds) {
 295                                if (m->m_info[mds].laggy)
 296                                        num_laggy++;
 297                        }
 298                }
 299                m->m_num_laggy = num_laggy;
 300
 301                if (n > m->m_num_mds) {
 302                        void *new_m_info = krealloc(m->m_info,
 303                                                    n * sizeof(*m->m_info),
 304                                                    GFP_NOFS | __GFP_ZERO);
 305                        if (!new_m_info)
 306                                goto nomem;
 307                        m->m_info = new_m_info;
 308                }
 309                m->m_num_mds = n;
 310        }
 311
 312        /* inc */
 313        __decode_and_drop_map(p, end, u32, u32, bad_ext);
 314        /* up */
 315        __decode_and_drop_map(p, end, u32, u64, bad_ext);
 316        /* failed */
 317        __decode_and_drop_set(p, end, u32, bad_ext);
 318        /* stopped */
 319        __decode_and_drop_set(p, end, u32, bad_ext);
 320
 321        if (mdsmap_ev >= 4) {
 322                /* last_failure_osd_epoch */
 323                __decode_and_drop_type(p, end, u32, bad_ext);
 324        }
 325        if (mdsmap_ev >= 6) {
 326                /* ever_allowed_snaps */
 327                __decode_and_drop_type(p, end, u8, bad_ext);
 328                /* explicitly_allowed_snaps */
 329                __decode_and_drop_type(p, end, u8, bad_ext);
 330        }
 331        if (mdsmap_ev >= 7) {
 332                /* inline_data_enabled */
 333                __decode_and_drop_type(p, end, u8, bad_ext);
 334        }
 335        if (mdsmap_ev >= 8) {
 336                u32 name_len;
 337                /* enabled */
 338                ceph_decode_8_safe(p, end, m->m_enabled, bad_ext);
 339                ceph_decode_32_safe(p, end, name_len, bad_ext);
 340                ceph_decode_need(p, end, name_len, bad_ext);
 341                *p += name_len;
 342        }
 343        /* damaged */
 344        if (mdsmap_ev >= 9) {
 345                size_t need;
 346                ceph_decode_32_safe(p, end, n, bad_ext);
 347                need = sizeof(u32) * n;
 348                ceph_decode_need(p, end, need, bad_ext);
 349                *p += need;
 350                m->m_damaged = n > 0;
 351        } else {
 352                m->m_damaged = false;
 353        }
 354bad_ext:
 355        *p = end;
 356        dout("mdsmap_decode success epoch %u\n", m->m_epoch);
 357        return m;
 358nomem:
 359        err = -ENOMEM;
 360        goto out_err;
 361bad:
 362        pr_err("corrupt mdsmap\n");
 363        print_hex_dump(KERN_DEBUG, "mdsmap: ",
 364                       DUMP_PREFIX_OFFSET, 16, 1,
 365                       start, end - start, true);
 366out_err:
 367        ceph_mdsmap_destroy(m);
 368        return ERR_PTR(err);
 369}
 370
 371void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
 372{
 373        int i;
 374
 375        for (i = 0; i < m->m_num_mds; i++)
 376                kfree(m->m_info[i].export_targets);
 377        kfree(m->m_info);
 378        kfree(m->m_data_pg_pools);
 379        kfree(m);
 380}
 381
 382bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m)
 383{
 384        int i, nr_active = 0;
 385        if (!m->m_enabled)
 386                return false;
 387        if (m->m_damaged)
 388                return false;
 389        if (m->m_num_laggy > 0)
 390                return false;
 391        for (i = 0; i < m->m_num_mds; i++) {
 392                if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE)
 393                        nr_active++;
 394        }
 395        return nr_active > 0;
 396}
 397