LXR linux/fs/ceph/super.c

   1
   2#include <linux/ceph/ceph_debug.h>
   3
   4#include <linux/backing-dev.h>
   5#include <linux/ctype.h>
   6#include <linux/fs.h>
   7#include <linux/inet.h>
   8#include <linux/in6.h>
   9#include <linux/module.h>
  10#include <linux/mount.h>
  11#include <linux/parser.h>
  12#include <linux/sched.h>
  13#include <linux/seq_file.h>
  14#include <linux/slab.h>
  15#include <linux/statfs.h>
  16#include <linux/string.h>
  17
  18#include "super.h"
  19#include "mds_client.h"
  20#include "cache.h"
  21
  22#include <linux/ceph/ceph_features.h>
  23#include <linux/ceph/decode.h>
  24#include <linux/ceph/mon_client.h>
  25#include <linux/ceph/auth.h>
  26#include <linux/ceph/debugfs.h>
  27
  28/*
  29 * Ceph superblock operations
  30 *
  31 * Handle the basics of mounting, unmounting.
  32 */
  33
  34/*
  35 * super ops
  36 */
  37static void ceph_put_super(struct super_block *s)
  38{
  39        struct ceph_fs_client *fsc = ceph_sb_to_client(s);
  40
  41        dout("put_super\n");
  42        ceph_mdsc_close_sessions(fsc->mdsc);
  43}
  44
  45static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
  46{
  47        struct ceph_fs_client *fsc = ceph_inode_to_client(d_inode(dentry));
  48        struct ceph_monmap *monmap = fsc->client->monc.monmap;
  49        struct ceph_statfs st;
  50        u64 fsid;
  51        int err;
  52
  53        dout("statfs\n");
  54        err = ceph_monc_do_statfs(&fsc->client->monc, &st);
  55        if (err < 0)
  56                return err;
  57
  58        /* fill in kstatfs */
  59        buf->f_type = CEPH_SUPER_MAGIC;  /* ?? */
  60
  61        /*
  62         * express utilization in terms of large blocks to avoid
  63         * overflow on 32-bit machines.
  64         *
  65         * NOTE: for the time being, we make bsize == frsize to humor
  66         * not-yet-ancient versions of glibc that are broken.
  67         * Someday, we will probably want to report a real block
  68         * size...  whatever that may mean for a network file system!
  69         */
  70        buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
  71        buf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
  72        buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
  73        buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
  74        buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
  75
  76        buf->f_files = le64_to_cpu(st.num_objects);
  77        buf->f_ffree = -1;
  78        buf->f_namelen = NAME_MAX;
  79
  80        /* leave fsid little-endian, regardless of host endianness */
  81        fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
  82        buf->f_fsid.val[0] = fsid & 0xffffffff;
  83        buf->f_fsid.val[1] = fsid >> 32;
  84
  85        return 0;
  86}
  87
  88
  89static int ceph_sync_fs(struct super_block *sb, int wait)
  90{
  91        struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
  92
  93        if (!wait) {
  94                dout("sync_fs (non-blocking)\n");
  95                ceph_flush_dirty_caps(fsc->mdsc);
  96                dout("sync_fs (non-blocking) done\n");
  97                return 0;
  98        }
  99
 100        dout("sync_fs (blocking)\n");
 101        ceph_osdc_sync(&fsc->client->osdc);
 102        ceph_mdsc_sync(fsc->mdsc);
 103        dout("sync_fs (blocking) done\n");
 104        return 0;
 105}
 106
 107/*
 108 * mount options
 109 */
/*
 * Token ids for the ceph-fs mount options matched by fsopt_tokens.
 *
 * The order matters: parse_fsopt_token() classifies a token by comparing
 * it against the Opt_last_int and Opt_last_string markers, so integer
 * options must stay above Opt_last_int and string options between the
 * two markers.
 */
enum {
        Opt_wsize,
        Opt_rsize,
        Opt_rasize,
        Opt_caps_wanted_delay_min,
        Opt_caps_wanted_delay_max,
        Opt_cap_release_safety,
        Opt_readdir_max_entries,
        Opt_readdir_max_bytes,
        Opt_congestion_kb,
        Opt_last_int,           /* marker, not a real option */
        /* int args above */
        Opt_snapdirname,
        Opt_last_string,        /* marker, not a real option */
        /* string args above */
        /* everything below is a flag that takes no argument */
        Opt_dirstat,
        Opt_nodirstat,
        Opt_rbytes,
        Opt_norbytes,
        Opt_asyncreaddir,
        Opt_noasyncreaddir,
        Opt_dcache,
        Opt_nodcache,
        Opt_ino32,
        Opt_noino32,
        Opt_fscache,
        Opt_nofscache,
        Opt_poolperm,
        Opt_nopoolperm,
#ifdef CONFIG_CEPH_FS_POSIX_ACL
        /* "acl" is only recognised when POSIX ACL support is built in */
        Opt_acl,
#endif
        Opt_noacl,
};
 144
/*
 * Pattern table handed to match_token() by parse_fsopt_token().  Each
 * entry maps an option pattern to its Opt_* id; the {-1, NULL} entry
 * terminates the table.
 */
static match_table_t fsopt_tokens = {
        {Opt_wsize, "wsize=%d"},
        {Opt_rsize, "rsize=%d"},
        {Opt_rasize, "rasize=%d"},
        {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
        {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
        {Opt_cap_release_safety, "cap_release_safety=%d"},
        {Opt_readdir_max_entries, "readdir_max_entries=%d"},
        {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
        /* note: the user-visible name differs from the enum name */
        {Opt_congestion_kb, "write_congestion_kb=%d"},
        /* int args above */
        {Opt_snapdirname, "snapdirname=%s"},
        /* string args above */
        {Opt_dirstat, "dirstat"},
        {Opt_nodirstat, "nodirstat"},
        {Opt_rbytes, "rbytes"},
        {Opt_norbytes, "norbytes"},
        {Opt_asyncreaddir, "asyncreaddir"},
        {Opt_noasyncreaddir, "noasyncreaddir"},
        {Opt_dcache, "dcache"},
        {Opt_nodcache, "nodcache"},
        {Opt_ino32, "ino32"},
        {Opt_noino32, "noino32"},
        {Opt_fscache, "fsc"},
        {Opt_nofscache, "nofsc"},
        {Opt_poolperm, "poolperm"},
        {Opt_nopoolperm, "nopoolperm"},
#ifdef CONFIG_CEPH_FS_POSIX_ACL
        {Opt_acl, "acl"},
#endif
        {Opt_noacl, "noacl"},
        {-1, NULL}
};
 178
 179static int parse_fsopt_token(char *c, void *private)
 180{
 181        struct ceph_mount_options *fsopt = private;
 182        substring_t argstr[MAX_OPT_ARGS];
 183        int token, intval, ret;
 184
 185        token = match_token((char *)c, fsopt_tokens, argstr);
 186        if (token < 0)
 187                return -EINVAL;
 188
 189        if (token < Opt_last_int) {
 190                ret = match_int(&argstr[0], &intval);
 191                if (ret < 0) {
 192                        pr_err("bad mount option arg (not int) "
 193                               "at '%s'\n", c);
 194                        return ret;
 195                }
 196                dout("got int token %d val %d\n", token, intval);
 197        } else if (token > Opt_last_int && token < Opt_last_string) {
 198                dout("got string token %d val %s\n", token,
 199                     argstr[0].from);
 200        } else {
 201                dout("got token %d\n", token);
 202        }
 203
 204        switch (token) {
 205        case Opt_snapdirname:
 206                kfree(fsopt->snapdir_name);
 207                fsopt->snapdir_name = kstrndup(argstr[0].from,
 208                                               argstr[0].to-argstr[0].from,
 209                                               GFP_KERNEL);
 210                if (!fsopt->snapdir_name)
 211                        return -ENOMEM;
 212                break;
 213
 214                /* misc */
 215        case Opt_wsize:
 216                fsopt->wsize = intval;
 217                break;
 218        case Opt_rsize:
 219                fsopt->rsize = intval;
 220                break;
 221        case Opt_rasize:
 222                fsopt->rasize = intval;
 223                break;
 224        case Opt_caps_wanted_delay_min:
 225                fsopt->caps_wanted_delay_min = intval;
 226                break;
 227        case Opt_caps_wanted_delay_max:
 228                fsopt->caps_wanted_delay_max = intval;
 229                break;
 230        case Opt_readdir_max_entries:
 231                fsopt->max_readdir = intval;
 232                break;
 233        case Opt_readdir_max_bytes:
 234                fsopt->max_readdir_bytes = intval;
 235                break;
 236        case Opt_congestion_kb:
 237                fsopt->congestion_kb = intval;
 238                break;
 239        case Opt_dirstat:
 240                fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT;
 241                break;
 242        case Opt_nodirstat:
 243                fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT;
 244                break;
 245        case Opt_rbytes:
 246                fsopt->flags |= CEPH_MOUNT_OPT_RBYTES;
 247                break;
 248        case Opt_norbytes:
 249                fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
 250                break;
 251        case Opt_asyncreaddir:
 252                fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR;
 253                break;
 254        case Opt_noasyncreaddir:
 255                fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
 256                break;
 257        case Opt_dcache:
 258                fsopt->flags |= CEPH_MOUNT_OPT_DCACHE;
 259                break;
 260        case Opt_nodcache:
 261                fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE;
 262                break;
 263        case Opt_ino32:
 264                fsopt->flags |= CEPH_MOUNT_OPT_INO32;
 265                break;
 266        case Opt_noino32:
 267                fsopt->flags &= ~CEPH_MOUNT_OPT_INO32;
 268                break;
 269        case Opt_fscache:
 270                fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE;
 271                break;
 272        case Opt_nofscache:
 273                fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE;
 274                break;
 275        case Opt_poolperm:
 276                fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM;
 277                printk ("pool perm");
 278                break;
 279        case Opt_nopoolperm:
 280                fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM;
 281                break;
 282#ifdef CONFIG_CEPH_FS_POSIX_ACL
 283        case Opt_acl:
 284                fsopt->sb_flags |= MS_POSIXACL;
 285                break;
 286#endif
 287        case Opt_noacl:
 288                fsopt->sb_flags &= ~MS_POSIXACL;
 289                break;
 290        default:
 291                BUG_ON(token);
 292        }
 293        return 0;
 294}
 295
/*
 * Free a ceph_mount_options together with the snapdir name string it
 * owns.  @args must not be NULL: it is dereferenced unconditionally.
 */
static void destroy_mount_options(struct ceph_mount_options *args)
{
        dout("destroy_mount_options %p\n", args);
        kfree(args->snapdir_name);
        kfree(args);
}
 302
 303static int strcmp_null(const char *s1, const char *s2)
 304{
 305        if (!s1 && !s2)
 306                return 0;
 307        if (s1 && !s2)
 308                return -1;
 309        if (!s1 && s2)
 310                return 1;
 311        return strcmp(s1, s2);
 312}
 313
 314static int compare_mount_options(struct ceph_mount_options *new_fsopt,
 315                                 struct ceph_options *new_opt,
 316                                 struct ceph_fs_client *fsc)
 317{
 318        struct ceph_mount_options *fsopt1 = new_fsopt;
 319        struct ceph_mount_options *fsopt2 = fsc->mount_options;
 320        int ofs = offsetof(struct ceph_mount_options, snapdir_name);
 321        int ret;
 322
 323        ret = memcmp(fsopt1, fsopt2, ofs);
 324        if (ret)
 325                return ret;
 326
 327        ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
 328        if (ret)
 329                return ret;
 330
 331        return ceph_compare_options(new_opt, fsc->client);
 332}
 333
 334static int parse_mount_options(struct ceph_mount_options **pfsopt,
 335                               struct ceph_options **popt,
 336                               int flags, char *options,
 337                               const char *dev_name,
 338                               const char **path)
 339{
 340        struct ceph_mount_options *fsopt;
 341        const char *dev_name_end;
 342        int err;
 343
 344        if (!dev_name || !*dev_name)
 345                return -EINVAL;
 346
 347        fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);
 348        if (!fsopt)
 349                return -ENOMEM;
 350
 351        dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name);
 352
 353        fsopt->sb_flags = flags;
 354        fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
 355
 356        fsopt->rsize = CEPH_RSIZE_DEFAULT;
 357        fsopt->rasize = CEPH_RASIZE_DEFAULT;
 358        fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
 359        if (!fsopt->snapdir_name) {
 360                err = -ENOMEM;
 361                goto out;
 362        }
 363
 364        fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
 365        fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
 366        fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
 367        fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
 368        fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
 369        fsopt->congestion_kb = default_congestion_kb();
 370
 371        /*
 372         * Distinguish the server list from the path in "dev_name".
 373         * Internally we do not include the leading '/' in the path.
 374         *
 375         * "dev_name" will look like:
 376         *     <server_spec>[,<server_spec>...]:[<path>]
 377         * where
 378         *     <server_spec> is <ip>[:<port>]
 379         *     <path> is optional, but if present must begin with '/'
 380         */
 381        dev_name_end = strchr(dev_name, '/');
 382        if (dev_name_end) {
 383                /* skip over leading '/' for path */
 384                *path = dev_name_end + 1;
 385        } else {
 386                /* path is empty */
 387                dev_name_end = dev_name + strlen(dev_name);
 388                *path = dev_name_end;
 389        }
 390        err = -EINVAL;
 391        dev_name_end--;         /* back up to ':' separator */
 392        if (dev_name_end < dev_name || *dev_name_end != ':') {
 393                pr_err("device name is missing path (no : separator in %s)\n",
 394                                dev_name);
 395                goto out;
 396        }
 397        dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
 398        dout("server path '%s'\n", *path);
 399
 400        *popt = ceph_parse_options(options, dev_name, dev_name_end,
 401                                 parse_fsopt_token, (void *)fsopt);
 402        if (IS_ERR(*popt)) {
 403                err = PTR_ERR(*popt);
 404                goto out;
 405        }
 406
 407        /* success */
 408        *pfsopt = fsopt;
 409        return 0;
 410
 411out:
 412        destroy_mount_options(fsopt);
 413        return err;
 414}
 415
 416/**
 417 * ceph_show_options - Show mount options in /proc/mounts
 418 * @m: seq_file to write to
 419 * @root: root of that (sub)tree
 420 */
 421static int ceph_show_options(struct seq_file *m, struct dentry *root)
 422{
 423        struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb);
 424        struct ceph_mount_options *fsopt = fsc->mount_options;
 425        size_t pos;
 426        int ret;
 427
 428        /* a comma between MNT/MS and client options */
 429        seq_putc(m, ',');
 430        pos = m->count;
 431
 432        ret = ceph_print_client_options(m, fsc->client);
 433        if (ret)
 434                return ret;
 435
 436        /* retract our comma if no client options */
 437        if (m->count == pos)
 438                m->count--;
 439
 440        if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
 441                seq_puts(m, ",dirstat");
 442        if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES))
 443                seq_puts(m, ",rbytes");
 444        if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
 445                seq_puts(m, ",noasyncreaddir");
 446        if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0)
 447                seq_puts(m, ",nodcache");
 448        if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE)
 449                seq_puts(m, ",fsc");
 450        if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM)
 451                seq_puts(m, ",nopoolperm");
 452
 453#ifdef CONFIG_CEPH_FS_POSIX_ACL
 454        if (fsopt->sb_flags & MS_POSIXACL)
 455                seq_puts(m, ",acl");
 456        else
 457                seq_puts(m, ",noacl");
 458#endif
 459
 460        if (fsopt->wsize)
 461                seq_printf(m, ",wsize=%d", fsopt->wsize);
 462        if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
 463                seq_printf(m, ",rsize=%d", fsopt->rsize);
 464        if (fsopt->rasize != CEPH_RASIZE_DEFAULT)
 465                seq_printf(m, ",rasize=%d", fsopt->rasize);
 466        if (fsopt->congestion_kb != default_congestion_kb())
 467                seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
 468        if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
 469                seq_printf(m, ",caps_wanted_delay_min=%d",
 470                         fsopt->caps_wanted_delay_min);
 471        if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
 472                seq_printf(m, ",caps_wanted_delay_max=%d",
 473                           fsopt->caps_wanted_delay_max);
 474        if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
 475                seq_printf(m, ",cap_release_safety=%d",
 476                           fsopt->cap_release_safety);
 477        if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT)
 478                seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir);
 479        if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
 480                seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
 481        if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
 482                seq_show_option(m, "snapdirname", fsopt->snapdir_name);
 483
 484        return 0;
 485}
 486
 487/*
 488 * handle any mon messages the standard library doesn't understand.
 489 * return error if we don't either.
 490 */
 491static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
 492{
 493        struct ceph_fs_client *fsc = client->private;
 494        int type = le16_to_cpu(msg->hdr.type);
 495
 496        switch (type) {
 497        case CEPH_MSG_MDS_MAP:
 498                ceph_mdsc_handle_map(fsc->mdsc, msg);
 499                return 0;
 500
 501        default:
 502                return -1;
 503        }
 504}
 505
 506/*
 507 * create a new fs client
 508 */
 509static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
 510                                        struct ceph_options *opt)
 511{
 512        struct ceph_fs_client *fsc;
 513        const u64 supported_features =
 514                CEPH_FEATURE_FLOCK |
 515                CEPH_FEATURE_DIRLAYOUTHASH |
 516                CEPH_FEATURE_MDS_INLINE_DATA;
 517        const u64 required_features = 0;
 518        int page_count;
 519        size_t size;
 520        int err = -ENOMEM;
 521
 522        fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
 523        if (!fsc)
 524                return ERR_PTR(-ENOMEM);
 525
 526        fsc->client = ceph_create_client(opt, fsc, supported_features,
 527                                         required_features);
 528        if (IS_ERR(fsc->client)) {
 529                err = PTR_ERR(fsc->client);
 530                goto fail;
 531        }
 532        fsc->client->extra_mon_dispatch = extra_mon_dispatch;
 533        ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true);
 534
 535        fsc->mount_options = fsopt;
 536
 537        fsc->sb = NULL;
 538        fsc->mount_state = CEPH_MOUNT_MOUNTING;
 539
 540        atomic_long_set(&fsc->writeback_count, 0);
 541
 542        err = bdi_init(&fsc->backing_dev_info);
 543        if (err < 0)
 544                goto fail_client;
 545
 546        err = -ENOMEM;
 547        /*
 548         * The number of concurrent works can be high but they don't need
 549         * to be processed in parallel, limit concurrency.
 550         */
 551        fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1);
 552        if (fsc->wb_wq == NULL)
 553                goto fail_bdi;
 554        fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1);
 555        if (fsc->pg_inv_wq == NULL)
 556                goto fail_wb_wq;
 557        fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1);
 558        if (fsc->trunc_wq == NULL)
 559                goto fail_pg_inv_wq;
 560
 561        /* set up mempools */
 562        err = -ENOMEM;
 563        page_count = fsc->mount_options->wsize >> PAGE_SHIFT;
 564        size = sizeof (struct page *) * (page_count ? page_count : 1);
 565        fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size);
 566        if (!fsc->wb_pagevec_pool)
 567                goto fail_trunc_wq;
 568
 569        /* setup fscache */
 570        if ((fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) &&
 571            (ceph_fscache_register_fs(fsc) != 0))
 572                goto fail_fscache;
 573
 574        /* caps */
 575        fsc->min_caps = fsopt->max_readdir;
 576
 577        return fsc;
 578
 579fail_fscache:
 580        ceph_fscache_unregister_fs(fsc);
 581fail_trunc_wq:
 582        destroy_workqueue(fsc->trunc_wq);
 583fail_pg_inv_wq:
 584        destroy_workqueue(fsc->pg_inv_wq);
 585fail_wb_wq:
 586        destroy_workqueue(fsc->wb_wq);
 587fail_bdi:
 588        bdi_destroy(&fsc->backing_dev_info);
 589fail_client:
 590        ceph_destroy_client(fsc->client);
 591fail:
 592        kfree(fsc);
 593        return ERR_PTR(err);
 594}
 595
/*
 * Tear down everything create_fs_client() set up, plus the debugfs
 * entries, and free @fsc.  The mount options stored in
 * fsc->mount_options are freed here as well.
 */
static void destroy_fs_client(struct ceph_fs_client *fsc)
{
        dout("destroy_fs_client %p\n", fsc);

        ceph_fscache_unregister_fs(fsc);

        destroy_workqueue(fsc->wb_wq);
        destroy_workqueue(fsc->pg_inv_wq);
        destroy_workqueue(fsc->trunc_wq);

        bdi_destroy(&fsc->backing_dev_info);

        mempool_destroy(fsc->wb_pagevec_pool);

        destroy_mount_options(fsc->mount_options);

        ceph_fs_debugfs_cleanup(fsc);

        ceph_destroy_client(fsc->client);

        kfree(fsc);
        /* only the pointer value is printed; fsc is not dereferenced */
        dout("destroy_fs_client %p done\n", fsc);
}
 619
/*
 * caches
 *
 * Slab caches for the ceph-fs object types.  They are created in
 * init_caches() and destroyed in destroy_caches().  The definitions are
 * not static, so they are visible outside this file.
 */
struct kmem_cache *ceph_inode_cachep;
struct kmem_cache *ceph_cap_cachep;
struct kmem_cache *ceph_cap_flush_cachep;
struct kmem_cache *ceph_dentry_cachep;
struct kmem_cache *ceph_file_cachep;
 628
 629static void ceph_inode_init_once(void *foo)
 630{
 631        struct ceph_inode_info *ci = foo;
 632        inode_init_once(&ci->vfs_inode);
 633}
 634
 635static int __init init_caches(void)
 636{
 637        int error = -ENOMEM;
 638
 639        ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
 640                                      sizeof(struct ceph_inode_info),
 641                                      __alignof__(struct ceph_inode_info),
 642                                      SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
 643                                      SLAB_ACCOUNT, ceph_inode_init_once);
 644        if (ceph_inode_cachep == NULL)
 645                return -ENOMEM;
 646
 647        ceph_cap_cachep = KMEM_CACHE(ceph_cap,
 648                                     SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
 649        if (ceph_cap_cachep == NULL)
 650                goto bad_cap;
 651        ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush,
 652                                           SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
 653        if (ceph_cap_flush_cachep == NULL)
 654                goto bad_cap_flush;
 655
 656        ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
 657                                        SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
 658        if (ceph_dentry_cachep == NULL)
 659                goto bad_dentry;
 660
 661        ceph_file_cachep = KMEM_CACHE(ceph_file_info,
 662                                      SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
 663        if (ceph_file_cachep == NULL)
 664                goto bad_file;
 665
 666        if ((error = ceph_fscache_register()))
 667                goto bad_file;
 668
 669        return 0;
 670bad_file:
 671        kmem_cache_destroy(ceph_dentry_cachep);
 672bad_dentry:
 673        kmem_cache_destroy(ceph_cap_flush_cachep);
 674bad_cap_flush:
 675        kmem_cache_destroy(ceph_cap_cachep);
 676bad_cap:
 677        kmem_cache_destroy(ceph_inode_cachep);
 678        return error;
 679}
 680
/*
 * Destroy the slab caches created by init_caches() and unregister from
 * fscache.
 */
static void destroy_caches(void)
{
        /*
         * Make sure all delayed rcu free inodes are flushed before we
         * destroy cache.
         */
        rcu_barrier();

        kmem_cache_destroy(ceph_inode_cachep);
        kmem_cache_destroy(ceph_cap_cachep);
        kmem_cache_destroy(ceph_cap_flush_cachep);
        kmem_cache_destroy(ceph_dentry_cachep);
        kmem_cache_destroy(ceph_file_cachep);

        ceph_fscache_unregister();
}
 697
 698
 699/*
 700 * ceph_umount_begin - initiate forced umount.  Tear down down the
 701 * mount, skipping steps that may hang while waiting for server(s).
 702 */
 703static void ceph_umount_begin(struct super_block *sb)
 704{
 705        struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
 706
 707        dout("ceph_umount_begin - starting forced umount\n");
 708        if (!fsc)
 709                return;
 710        fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
 711        ceph_mdsc_force_umount(fsc->mdsc);
 712        return;
 713}
 714
/*
 * Superblock operations; installed on each ceph superblock by
 * ceph_set_super().  The inode callbacks (alloc/destroy/write/drop)
 * are not defined in this part of the file.
 */
static const struct super_operations ceph_super_ops = {
        .alloc_inode    = ceph_alloc_inode,
        .destroy_inode  = ceph_destroy_inode,
        .write_inode    = ceph_write_inode,
        .drop_inode     = ceph_drop_inode,
        .sync_fs        = ceph_sync_fs,
        .put_super      = ceph_put_super,
        .show_options   = ceph_show_options,
        .statfs         = ceph_statfs,
        .umount_begin   = ceph_umount_begin,
};
 726
 727/*
 728 * Bootstrap mount by opening the root directory.  Note the mount
 729 * @started time from caller, and time out if this takes too long.
 730 */
 731static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
 732                                       const char *path,
 733                                       unsigned long started)
 734{
 735        struct ceph_mds_client *mdsc = fsc->mdsc;
 736        struct ceph_mds_request *req = NULL;
 737        int err;
 738        struct dentry *root;
 739
 740        /* open dir */
 741        dout("open_root_inode opening '%s'\n", path);
 742        req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
 743        if (IS_ERR(req))
 744                return ERR_CAST(req);
 745        req->r_path1 = kstrdup(path, GFP_NOFS);
 746        if (!req->r_path1) {
 747                root = ERR_PTR(-ENOMEM);
 748                goto out;
 749        }
 750
 751        req->r_ino1.ino = CEPH_INO_ROOT;
 752        req->r_ino1.snap = CEPH_NOSNAP;
 753        req->r_started = started;
 754        req->r_timeout = fsc->client->options->mount_timeout;
 755        req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
 756        req->r_num_caps = 2;
 757        err = ceph_mdsc_do_request(mdsc, NULL, req);
 758        if (err == 0) {
 759                struct inode *inode = req->r_target_inode;
 760                req->r_target_inode = NULL;
 761                dout("open_root_inode success\n");
 762                if (ceph_ino(inode) == CEPH_INO_ROOT &&
 763                    fsc->sb->s_root == NULL) {
 764                        root = d_make_root(inode);
 765                        if (!root) {
 766                                root = ERR_PTR(-ENOMEM);
 767                                goto out;
 768                        }
 769                } else {
 770                        root = d_obtain_root(inode);
 771                }
 772                ceph_init_dentry(root);
 773                dout("open_root_inode success, root dentry is %p\n", root);
 774        } else {
 775                root = ERR_PTR(err);
 776        }
 777out:
 778        ceph_mdsc_put_request(req);
 779        return root;
 780}
 781
 782
 783
 784
 785/*
 786 * mount: join the ceph cluster, and open root directory.
 787 */
 788static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
 789                      const char *path)
 790{
 791        int err;
 792        unsigned long started = jiffies;  /* note the start time */
 793        struct dentry *root;
 794        int first = 0;   /* first vfsmount for this super_block */
 795
 796        dout("mount start %p\n", fsc);
 797        mutex_lock(&fsc->client->mount_mutex);
 798
 799        if (!fsc->sb->s_root) {
 800                err = __ceph_open_session(fsc->client, started);
 801                if (err < 0)
 802                        goto out;
 803
 804                dout("mount opening root\n");
 805                root = open_root_dentry(fsc, "", started);
 806                if (IS_ERR(root)) {
 807                        err = PTR_ERR(root);
 808                        goto out;
 809                }
 810                fsc->sb->s_root = root;
 811                first = 1;
 812
 813                err = ceph_fs_debugfs_init(fsc);
 814                if (err < 0)
 815                        goto fail;
 816        }
 817
 818        if (path[0] == 0) {
 819                root = fsc->sb->s_root;
 820                dget(root);
 821        } else {
 822                dout("mount opening base mountpoint\n");
 823                root = open_root_dentry(fsc, path, started);
 824                if (IS_ERR(root)) {
 825                        err = PTR_ERR(root);
 826                        goto fail;
 827                }
 828        }
 829
 830        fsc->mount_state = CEPH_MOUNT_MOUNTED;
 831        dout("mount success\n");
 832        mutex_unlock(&fsc->client->mount_mutex);
 833        return root;
 834
 835fail:
 836        if (first) {
 837                dput(fsc->sb->s_root);
 838                fsc->sb->s_root = NULL;
 839        }
 840out:
 841        mutex_unlock(&fsc->client->mount_mutex);
 842        return ERR_PTR(err);
 843}
 844
 845static int ceph_set_super(struct super_block *s, void *data)
 846{
 847        struct ceph_fs_client *fsc = data;
 848        int ret;
 849
 850        dout("set_super %p data %p\n", s, data);
 851
 852        s->s_flags = fsc->mount_options->sb_flags;
 853        s->s_maxbytes = 1ULL << 40;  /* temp value until we get mdsmap */
 854
 855        s->s_xattr = ceph_xattr_handlers;
 856        s->s_fs_info = fsc;
 857        fsc->sb = s;
 858
 859        s->s_op = &ceph_super_ops;
 860        s->s_export_op = &ceph_export_ops;
 861
 862        s->s_time_gran = 1000;  /* 1000 ns == 1 us */
 863
 864        ret = set_anon_super(s, NULL);  /* what is that second arg for? */
 865        if (ret != 0)
 866                goto fail;
 867
 868        return ret;
 869
 870fail:
 871        s->s_fs_info = NULL;
 872        fsc->sb = NULL;
 873        return ret;
 874}
 875
 876/*
 877 * share superblock if same fs AND options
 878 */
 879static int ceph_compare_super(struct super_block *sb, void *data)
 880{
 881        struct ceph_fs_client *new = data;
 882        struct ceph_mount_options *fsopt = new->mount_options;
 883        struct ceph_options *opt = new->client->options;
 884        struct ceph_fs_client *other = ceph_sb_to_client(sb);
 885
 886        dout("ceph_compare_super %p\n", sb);
 887
 888        if (compare_mount_options(fsopt, opt, other)) {
 889                dout("monitor(s)/mount options don't match\n");
 890                return 0;
 891        }
 892        if ((opt->flags & CEPH_OPT_FSID) &&
 893            ceph_fsid_compare(&opt->fsid, &other->client->fsid)) {
 894                dout("fsid doesn't match\n");
 895                return 0;
 896        }
 897        if (fsopt->sb_flags != other->mount_options->sb_flags) {
 898                dout("flags differ\n");
 899                return 0;
 900        }
 901        return 1;
 902}
 903
 904/*
 905 * construct our own bdi so we can control readahead, etc.
 906 */
 907static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
 908
 909static int ceph_register_bdi(struct super_block *sb,
 910                             struct ceph_fs_client *fsc)
 911{
 912        int err;
 913
 914        /* set ra_pages based on rasize mount option? */
 915        if (fsc->mount_options->rasize >= PAGE_SIZE)
 916                fsc->backing_dev_info.ra_pages =
 917                        (fsc->mount_options->rasize + PAGE_SIZE - 1)
 918                        >> PAGE_SHIFT;
 919        else
 920                fsc->backing_dev_info.ra_pages =
 921                        VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
 922
 923        err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
 924                           atomic_long_inc_return(&bdi_seq));
 925        if (!err)
 926                sb->s_bdi = &fsc->backing_dev_info;
 927        return err;
 928}
 929
 930static struct dentry *ceph_mount(struct file_system_type *fs_type,
 931                       int flags, const char *dev_name, void *data)
 932{
 933        struct super_block *sb;
 934        struct ceph_fs_client *fsc;
 935        struct dentry *res;
 936        int err;
 937        int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
 938        const char *path = NULL;
 939        struct ceph_mount_options *fsopt = NULL;
 940        struct ceph_options *opt = NULL;
 941
 942        dout("ceph_mount\n");
 943
 944#ifdef CONFIG_CEPH_FS_POSIX_ACL
 945        flags |= MS_POSIXACL;
 946#endif
 947        err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
 948        if (err < 0) {
 949                res = ERR_PTR(err);
 950                goto out_final;
 951        }
 952
 953        /* create client (which we may/may not use) */
 954        fsc = create_fs_client(fsopt, opt);
 955        if (IS_ERR(fsc)) {
 956                res = ERR_CAST(fsc);
 957                destroy_mount_options(fsopt);
 958                ceph_destroy_options(opt);
 959                goto out_final;
 960        }
 961
 962        err = ceph_mdsc_init(fsc);
 963        if (err < 0) {
 964                res = ERR_PTR(err);
 965                goto out;
 966        }
 967
 968        if (ceph_test_opt(fsc->client, NOSHARE))
 969                compare_super = NULL;
 970        sb = sget(fs_type, compare_super, ceph_set_super, flags, fsc);
 971        if (IS_ERR(sb)) {
 972                res = ERR_CAST(sb);
 973                goto out;
 974        }
 975
 976        if (ceph_sb_to_client(sb) != fsc) {
 977                ceph_mdsc_destroy(fsc);
 978                destroy_fs_client(fsc);
 979                fsc = ceph_sb_to_client(sb);
 980                dout("get_sb got existing client %p\n", fsc);
 981        } else {
 982                dout("get_sb using new client %p\n", fsc);
 983                err = ceph_register_bdi(sb, fsc);
 984                if (err < 0) {
 985                        res = ERR_PTR(err);
 986                        goto out_splat;
 987                }
 988        }
 989
 990        res = ceph_real_mount(fsc, path);
 991        if (IS_ERR(res))
 992                goto out_splat;
 993        dout("root %p inode %p ino %llx.%llx\n", res,
 994             d_inode(res), ceph_vinop(d_inode(res)));
 995        return res;
 996
 997out_splat:
 998        ceph_mdsc_close_sessions(fsc->mdsc);
 999        deactivate_locked_super(sb);
1000        goto out_final;

1001
1002out:
1003        ceph_mdsc_destroy(fsc);
1004        destroy_fs_client(fsc);
1005out_final:
1006        dout("ceph_mount fail %ld\n", PTR_ERR(res));
1007        return res;
1008}
1009
1010static void ceph_kill_sb(struct super_block *s)
1011{
1012        struct ceph_fs_client *fsc = ceph_sb_to_client(s);
1013        dev_t dev = s->s_dev;
1014
1015        dout("kill_sb %p\n", s);
1016
1017        ceph_mdsc_pre_umount(fsc->mdsc);
1018        generic_shutdown_super(s);
1019        ceph_mdsc_destroy(fsc);
1020
1021        destroy_fs_client(fsc);
1022        free_anon_bdev(dev);
1023}
1024
1025static struct file_system_type ceph_fs_type = {
1026        .owner          = THIS_MODULE,
1027        .name           = "ceph",
1028        .mount          = ceph_mount,
1029        .kill_sb        = ceph_kill_sb,
1030        .fs_flags       = FS_RENAME_DOES_D_MOVE,
1031};
1032MODULE_ALIAS_FS("ceph");
1033
1034static int __init init_ceph(void)
1035{
1036        int ret = init_caches();
1037        if (ret)
1038                goto out;
1039
1040        ceph_flock_init();
1041        ceph_xattr_init();
1042        ret = register_filesystem(&ceph_fs_type);
1043        if (ret)
1044                goto out_xattr;
1045
1046        pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
1047
1048        return 0;
1049
1050out_xattr:
1051        ceph_xattr_exit();
1052        destroy_caches();
1053out:
1054        return ret;
1055}
1056
1057static void __exit exit_ceph(void)
1058{
1059        dout("exit_ceph\n");
1060        unregister_filesystem(&ceph_fs_type);
1061        ceph_xattr_exit();
1062        destroy_caches();
1063}
1064
1065module_init(init_ceph);
1066module_exit(exit_ceph);
1067
1068MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
1069MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
1070MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
1071MODULE_DESCRIPTION("Ceph filesystem for Linux");
1072MODULE_LICENSE("GPL");
1073