linux/fs/ceph/mdsmap.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2#include <linux/ceph/ceph_debug.h>
   3
   4#include <linux/bug.h>
   5#include <linux/err.h>
   6#include <linux/random.h>
   7#include <linux/slab.h>
   8#include <linux/types.h>
   9
  10#include <linux/ceph/mdsmap.h>
  11#include <linux/ceph/messenger.h>
  12#include <linux/ceph/decode.h>
  13
  14#include "super.h"
  15
  16
  17/*
  18 * choose a random mds that is "up" (i.e. has a state > 0), or -1.
  19 */
  20int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
  21{
  22        int n = 0;
  23        int i;
  24
  25        /* special case for one mds */
  26        if (1 == m->m_num_mds && m->m_info[0].state > 0)
  27                return 0;
  28
  29        /* count */
  30        for (i = 0; i < m->m_num_mds; i++)
  31                if (m->m_info[i].state > 0)
  32                        n++;
  33        if (n == 0)
  34                return -1;
  35
  36        /* pick */
  37        n = prandom_u32() % n;
  38        for (i = 0; n > 0; i++, n--)
  39                while (m->m_info[i].state <= 0)
  40                        i++;
  41
  42        return i;
  43}
  44
  45#define __decode_and_drop_type(p, end, type, bad)               \
  46        do {                                                    \
  47                if (*p + sizeof(type) > end)                    \
  48                        goto bad;                               \
  49                *p += sizeof(type);                             \
  50        } while (0)
  51
  52#define __decode_and_drop_set(p, end, type, bad)                \
  53        do {                                                    \
  54                u32 n;                                          \
  55                size_t need;                                    \
  56                ceph_decode_32_safe(p, end, n, bad);            \
  57                need = sizeof(type) * n;                        \
  58                ceph_decode_need(p, end, need, bad);            \
  59                *p += need;                                     \
  60        } while (0)
  61
  62#define __decode_and_drop_map(p, end, ktype, vtype, bad)        \
  63        do {                                                    \
  64                u32 n;                                          \
  65                size_t need;                                    \
  66                ceph_decode_32_safe(p, end, n, bad);            \
  67                need = (sizeof(ktype) + sizeof(vtype)) * n;     \
  68                ceph_decode_need(p, end, need, bad);            \
  69                *p += need;                                     \
  70        } while (0)
  71
  72
  73static int __decode_and_drop_compat_set(void **p, void* end)
  74{
  75        int i;
  76        /* compat, ro_compat, incompat*/
  77        for (i = 0; i < 3; i++) {
  78                u32 n;
  79                ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
  80                /* mask */
  81                *p += sizeof(u64);
  82                /* names (map<u64, string>) */
  83                n = ceph_decode_32(p);
  84                while (n-- > 0) {
  85                        u32 len;
  86                        ceph_decode_need(p, end, sizeof(u64) + sizeof(u32),
  87                                         bad);
  88                        *p += sizeof(u64);
  89                        len = ceph_decode_32(p);
  90                        ceph_decode_need(p, end, len, bad);
  91                        *p += len;
  92                }
  93        }
  94        return 0;
  95bad:
  96        return -1;
  97}
  98
  99/*
 100 * Decode an MDS map
 101 *
 102 * Ignore any fields we don't care about (there are quite a few of
 103 * them).
 104 */
 105struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 106{
 107        struct ceph_mdsmap *m;
 108        const void *start = *p;
 109        int i, j, n;
 110        int err;
 111        u8 mdsmap_v, mdsmap_cv;
 112        u16 mdsmap_ev;
 113
 114        m = kzalloc(sizeof(*m), GFP_NOFS);
 115        if (!m)
 116                return ERR_PTR(-ENOMEM);
 117
 118        ceph_decode_need(p, end, 1 + 1, bad);
 119        mdsmap_v = ceph_decode_8(p);
 120        mdsmap_cv = ceph_decode_8(p);
 121        if (mdsmap_v >= 4) {
 122               u32 mdsmap_len;
 123               ceph_decode_32_safe(p, end, mdsmap_len, bad);
 124               if (end < *p + mdsmap_len)
 125                       goto bad;
 126               end = *p + mdsmap_len;
 127        }
 128
 129        ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
 130        m->m_epoch = ceph_decode_32(p);
 131        m->m_client_epoch = ceph_decode_32(p);
 132        m->m_last_failure = ceph_decode_32(p);
 133        m->m_root = ceph_decode_32(p);
 134        m->m_session_timeout = ceph_decode_32(p);
 135        m->m_session_autoclose = ceph_decode_32(p);
 136        m->m_max_file_size = ceph_decode_64(p);
 137        m->m_max_mds = ceph_decode_32(p);
 138        m->m_num_mds = m->m_max_mds;
 139
 140        m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS);
 141        if (!m->m_info)
 142                goto nomem;
 143
 144        /* pick out active nodes from mds_info (state > 0) */
 145        n = ceph_decode_32(p);
 146        for (i = 0; i < n; i++) {
 147                u64 global_id;
 148                u32 namelen;
 149                s32 mds, inc, state;
 150                u64 state_seq;
 151                u8 info_v;
 152                void *info_end = NULL;
 153                struct ceph_entity_addr addr;
 154                u32 num_export_targets;
 155                void *pexport_targets = NULL;
 156                struct ceph_timespec laggy_since;
 157                struct ceph_mds_info *info;
 158
 159                ceph_decode_need(p, end, sizeof(u64) + 1, bad);
 160                global_id = ceph_decode_64(p);
 161                info_v= ceph_decode_8(p);
 162                if (info_v >= 4) {
 163                        u32 info_len;
 164                        u8 info_cv;
 165                        ceph_decode_need(p, end, 1 + sizeof(u32), bad);
 166                        info_cv = ceph_decode_8(p);
 167                        info_len = ceph_decode_32(p);
 168                        info_end = *p + info_len;
 169                        if (info_end > end)
 170                                goto bad;
 171                }
 172
 173                ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
 174                *p += sizeof(u64);
 175                namelen = ceph_decode_32(p);  /* skip mds name */
 176                *p += namelen;
 177
 178                ceph_decode_need(p, end,
 179                                 4*sizeof(u32) + sizeof(u64) +
 180                                 sizeof(addr) + sizeof(struct ceph_timespec),
 181                                 bad);
 182                mds = ceph_decode_32(p);
 183                inc = ceph_decode_32(p);
 184                state = ceph_decode_32(p);
 185                state_seq = ceph_decode_64(p);
 186                err = ceph_decode_entity_addr(p, end, &addr);
 187                if (err)
 188                        goto corrupt;
 189                ceph_decode_copy(p, &laggy_since, sizeof(laggy_since));
 190                *p += sizeof(u32);
 191                ceph_decode_32_safe(p, end, namelen, bad);
 192                *p += namelen;
 193                if (info_v >= 2) {
 194                        ceph_decode_32_safe(p, end, num_export_targets, bad);
 195                        pexport_targets = *p;
 196                        *p += num_export_targets * sizeof(u32);
 197                } else {
 198                        num_export_targets = 0;
 199                }
 200
 201                if (info_end && *p != info_end) {
 202                        if (*p > info_end)
 203                                goto bad;
 204                        *p = info_end;
 205                }
 206
 207                dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
 208                     i+1, n, global_id, mds, inc,
 209                     ceph_pr_addr(&addr),
 210                     ceph_mds_state_name(state));
 211
 212                if (mds < 0 || state <= 0)
 213                        continue;
 214
 215                if (mds >= m->m_num_mds) {
 216                        int new_num = max(mds + 1, m->m_num_mds * 2);
 217                        void *new_m_info = krealloc(m->m_info,
 218                                                new_num * sizeof(*m->m_info),
 219                                                GFP_NOFS | __GFP_ZERO);
 220                        if (!new_m_info)
 221                                goto nomem;
 222                        m->m_info = new_m_info;
 223                        m->m_num_mds = new_num;
 224                }
 225
 226                info = &m->m_info[mds];
 227                info->global_id = global_id;
 228                info->state = state;
 229                info->addr = addr;
 230                info->laggy = (laggy_since.tv_sec != 0 ||
 231                               laggy_since.tv_nsec != 0);
 232                info->num_export_targets = num_export_targets;
 233                if (num_export_targets) {
 234                        info->export_targets = kcalloc(num_export_targets,
 235                                                       sizeof(u32), GFP_NOFS);
 236                        if (!info->export_targets)
 237                                goto nomem;
 238                        for (j = 0; j < num_export_targets; j++)
 239                                info->export_targets[j] =
 240                                       ceph_decode_32(&pexport_targets);
 241                } else {
 242                        info->export_targets = NULL;
 243                }
 244        }
 245        if (m->m_num_mds > m->m_max_mds) {
 246                /* find max up mds */
 247                for (i = m->m_num_mds; i >= m->m_max_mds; i--) {
 248                        if (i == 0 || m->m_info[i-1].state > 0)
 249                                break;
 250                }
 251                m->m_num_mds = i;
 252        }
 253
 254        /* pg_pools */
 255        ceph_decode_32_safe(p, end, n, bad);
 256        m->m_num_data_pg_pools = n;
 257        m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS);
 258        if (!m->m_data_pg_pools)
 259                goto nomem;
 260        ceph_decode_need(p, end, sizeof(u64)*(n+1), bad);
 261        for (i = 0; i < n; i++)
 262                m->m_data_pg_pools[i] = ceph_decode_64(p);
 263        m->m_cas_pg_pool = ceph_decode_64(p);
 264        m->m_enabled = m->m_epoch > 1;
 265
 266        mdsmap_ev = 1;
 267        if (mdsmap_v >= 2) {
 268                ceph_decode_16_safe(p, end, mdsmap_ev, bad_ext);
 269        }
 270        if (mdsmap_ev >= 3) {
 271                if (__decode_and_drop_compat_set(p, end) < 0)
 272                        goto bad_ext;
 273        }
 274        /* metadata_pool */
 275        if (mdsmap_ev < 5) {
 276                __decode_and_drop_type(p, end, u32, bad_ext);
 277        } else {
 278                __decode_and_drop_type(p, end, u64, bad_ext);
 279        }
 280
 281        /* created + modified + tableserver */
 282        __decode_and_drop_type(p, end, struct ceph_timespec, bad_ext);
 283        __decode_and_drop_type(p, end, struct ceph_timespec, bad_ext);
 284        __decode_and_drop_type(p, end, u32, bad_ext);
 285
 286        /* in */
 287        {
 288                int num_laggy = 0;
 289                ceph_decode_32_safe(p, end, n, bad_ext);
 290                ceph_decode_need(p, end, sizeof(u32) * n, bad_ext);
 291
 292                for (i = 0; i < n; i++) {
 293                        s32 mds = ceph_decode_32(p);
 294                        if (mds >= 0 && mds < m->m_num_mds) {
 295                                if (m->m_info[mds].laggy)
 296                                        num_laggy++;
 297                        }
 298                }
 299                m->m_num_laggy = num_laggy;
 300
 301                if (n > m->m_num_mds) {
 302                        void *new_m_info = krealloc(m->m_info,
 303                                                    n * sizeof(*m->m_info),
 304                                                    GFP_NOFS | __GFP_ZERO);
 305                        if (!new_m_info)
 306                                goto nomem;
 307                        m->m_info = new_m_info;
 308                }
 309                m->m_num_mds = n;
 310        }
 311
 312        /* inc */
 313        __decode_and_drop_map(p, end, u32, u32, bad_ext);
 314        /* up */
 315        __decode_and_drop_map(p, end, u32, u64, bad_ext);
 316        /* failed */
 317        __decode_and_drop_set(p, end, u32, bad_ext);
 318        /* stopped */
 319        __decode_and_drop_set(p, end, u32, bad_ext);
 320
 321        if (mdsmap_ev >= 4) {
 322                /* last_failure_osd_epoch */
 323                __decode_and_drop_type(p, end, u32, bad_ext);
 324        }
 325        if (mdsmap_ev >= 6) {
 326                /* ever_allowed_snaps */
 327                __decode_and_drop_type(p, end, u8, bad_ext);
 328                /* explicitly_allowed_snaps */
 329                __decode_and_drop_type(p, end, u8, bad_ext);
 330        }
 331        if (mdsmap_ev >= 7) {
 332                /* inline_data_enabled */
 333                __decode_and_drop_type(p, end, u8, bad_ext);
 334        }
 335        if (mdsmap_ev >= 8) {
 336                u32 name_len;
 337                /* enabled */
 338                ceph_decode_8_safe(p, end, m->m_enabled, bad_ext);
 339                ceph_decode_32_safe(p, end, name_len, bad_ext);
 340                ceph_decode_need(p, end, name_len, bad_ext);
 341                *p += name_len;
 342        }
 343        /* damaged */
 344        if (mdsmap_ev >= 9) {
 345                size_t need;
 346                ceph_decode_32_safe(p, end, n, bad_ext);
 347                need = sizeof(u32) * n;
 348                ceph_decode_need(p, end, need, bad_ext);
 349                *p += need;
 350                m->m_damaged = n > 0;
 351        } else {
 352                m->m_damaged = false;
 353        }
 354bad_ext:
 355        *p = end;
 356        dout("mdsmap_decode success epoch %u\n", m->m_epoch);
 357        return m;
 358nomem:
 359        err = -ENOMEM;
 360        goto out_err;
 361corrupt:
 362        pr_err("corrupt mdsmap\n");
 363        print_hex_dump(KERN_DEBUG, "mdsmap: ",
 364                       DUMP_PREFIX_OFFSET, 16, 1,
 365                       start, end - start, true);
 366out_err:
 367        ceph_mdsmap_destroy(m);
 368        return ERR_PTR(err);
 369bad:
 370        err = -EINVAL;
 371        goto corrupt;
 372}
 373
 374void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
 375{
 376        int i;
 377
 378        for (i = 0; i < m->m_num_mds; i++)
 379                kfree(m->m_info[i].export_targets);
 380        kfree(m->m_info);
 381        kfree(m->m_data_pg_pools);
 382        kfree(m);
 383}
 384
 385bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m)
 386{
 387        int i, nr_active = 0;
 388        if (!m->m_enabled)
 389                return false;
 390        if (m->m_damaged)
 391                return false;
 392        if (m->m_num_laggy > 0)
 393                return false;
 394        for (i = 0; i < m->m_num_mds; i++) {
 395                if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE)
 396                        nr_active++;
 397        }
 398        return nr_active > 0;
 399}
 400