linux/fs/ocfs2/dlmfs/dlmfs.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * dlmfs.c
   4 *
   5 * Code which implements the kernel side of a minimal userspace
   6 * interface to our DLM. This file handles the virtual file system
   7 * used for communication with userspace. Credit should go to ramfs,
   8 * which was a template for the fs side of this module.
   9 *
  10 * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
  11 */
  12
  13/* Simple VFS hooks based on: */
  14/*
  15 * Resizable simple ram filesystem for Linux.
  16 *
  17 * Copyright (C) 2000 Linus Torvalds.
  18 *               2000 Transmeta Corp.
  19 */
  20
  21#include <linux/module.h>
  22#include <linux/fs.h>
  23#include <linux/pagemap.h>
  24#include <linux/types.h>
  25#include <linux/slab.h>
  26#include <linux/highmem.h>
  27#include <linux/init.h>
  28#include <linux/string.h>
  29#include <linux/backing-dev.h>
  30#include <linux/poll.h>
  31
  32#include <linux/uaccess.h>
  33
  34#include "../stackglue.h"
  35#include "userdlm.h"
  36
  37#define MLOG_MASK_PREFIX ML_DLMFS
  38#include "../cluster/masklog.h"
  39
  40
  41static const struct super_operations dlmfs_ops;
  42static const struct file_operations dlmfs_file_operations;
  43static const struct inode_operations dlmfs_dir_inode_operations;
  44static const struct inode_operations dlmfs_root_inode_operations;
  45static const struct inode_operations dlmfs_file_inode_operations;
  46static struct kmem_cache *dlmfs_inode_cache;
  47
  48struct workqueue_struct *user_dlm_worker;
  49
  50
  51
  52/*
  53 * These are the ABI capabilities of dlmfs.
  54 *
  55 * Over time, dlmfs has added some features that were not part of the
  56 * initial ABI.  Unfortunately, some of these features are not detectable
  57 * via standard usage.  For example, Linux's default poll always returns
  58 * EPOLLIN, so there is no way for a caller of poll(2) to know when dlmfs
  59 * added poll support.  Instead, we provide this list of new capabilities.
  60 *
  61 * Capabilities is a read-only attribute.  We do it as a module parameter
  62 * so we can discover it whether dlmfs is built in, loaded, or even not
  63 * loaded.
  64 *
  65 * The ABI features are local to this machine's dlmfs mount.  This is
  66 * distinct from the locking protocol, which is concerned with inter-node
  67 * interaction.
  68 *
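 * Capabilities:
 * - bast       : EPOLLIN against the file descriptor of a held lock
 *                signifies a bast fired on the lock.
 * - stackglue  : advertised in DLMFS_CAPABILITIES below but not described
 *                here; presumably signals that dlmfs reaches the DLM through
 *                the ocfs2 stack glue (../stackglue.h) -- TODO confirm.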
  72 */
  73#define DLMFS_CAPABILITIES "bast stackglue"
  74static int param_set_dlmfs_capabilities(const char *val,
  75                                        const struct kernel_param *kp)
  76{
  77        printk(KERN_ERR "%s: readonly parameter\n", kp->name);
  78        return -EINVAL;
  79}
  80static int param_get_dlmfs_capabilities(char *buffer,
  81                                        const struct kernel_param *kp)
  82{
  83        return strlcpy(buffer, DLMFS_CAPABILITIES,
  84                       strlen(DLMFS_CAPABILITIES) + 1);
  85}
  86module_param_call(capabilities, param_set_dlmfs_capabilities,
  87                  param_get_dlmfs_capabilities, NULL, 0444);
  88MODULE_PARM_DESC(capabilities, DLMFS_CAPABILITIES);
  89
  90
  91/*
  92 * decodes a set of open flags into a valid lock level and a set of flags.
  93 * returns < 0 if we have invalid flags
  94 * flags which mean something to us:
  95 * O_RDONLY -> PRMODE level
  96 * O_WRONLY -> EXMODE level
  97 *
  98 * O_NONBLOCK -> NOQUEUE
  99 */
 100static int dlmfs_decode_open_flags(int open_flags,
 101                                   int *level,
 102                                   int *flags)
 103{
 104        if (open_flags & (O_WRONLY|O_RDWR))
 105                *level = DLM_LOCK_EX;
 106        else
 107                *level = DLM_LOCK_PR;
 108
 109        *flags = 0;
 110        if (open_flags & O_NONBLOCK)
 111                *flags |= DLM_LKF_NOQUEUE;
 112
 113        return 0;
 114}
 115
 116static int dlmfs_file_open(struct inode *inode,
 117                           struct file *file)
 118{
 119        int status, level, flags;
 120        struct dlmfs_filp_private *fp = NULL;
 121        struct dlmfs_inode_private *ip;
 122
 123        if (S_ISDIR(inode->i_mode))
 124                BUG();
 125
 126        mlog(0, "open called on inode %lu, flags 0x%x\n", inode->i_ino,
 127                file->f_flags);
 128
 129        status = dlmfs_decode_open_flags(file->f_flags, &level, &flags);
 130        if (status < 0)
 131                goto bail;
 132
 133        /* We don't want to honor O_APPEND at read/write time as it
 134         * doesn't make sense for LVB writes. */
 135        file->f_flags &= ~O_APPEND;
 136
 137        fp = kmalloc(sizeof(*fp), GFP_NOFS);
 138        if (!fp) {
 139                status = -ENOMEM;
 140                goto bail;
 141        }
 142        fp->fp_lock_level = level;
 143
 144        ip = DLMFS_I(inode);
 145
 146        status = user_dlm_cluster_lock(&ip->ip_lockres, level, flags);
 147        if (status < 0) {
 148                /* this is a strange error to return here but I want
 149                 * to be able userspace to be able to distinguish a
 150                 * valid lock request from one that simply couldn't be
 151                 * granted. */
 152                if (flags & DLM_LKF_NOQUEUE && status == -EAGAIN)
 153                        status = -ETXTBSY;
 154                kfree(fp);
 155                goto bail;
 156        }
 157
 158        file->private_data = fp;
 159bail:
 160        return status;
 161}
 162
 163static int dlmfs_file_release(struct inode *inode,
 164                              struct file *file)
 165{
 166        int level;
 167        struct dlmfs_inode_private *ip = DLMFS_I(inode);
 168        struct dlmfs_filp_private *fp = file->private_data;
 169
 170        if (S_ISDIR(inode->i_mode))
 171                BUG();
 172
 173        mlog(0, "close called on inode %lu\n", inode->i_ino);
 174
 175        if (fp) {
 176                level = fp->fp_lock_level;
 177                if (level != DLM_LOCK_IV)
 178                        user_dlm_cluster_unlock(&ip->ip_lockres, level);
 179
 180                kfree(fp);
 181                file->private_data = NULL;
 182        }
 183
 184        return 0;
 185}
 186
 187/*
 188 * We do ->setattr() just to override size changes.  Our size is the size
 189 * of the LVB and nothing else.
 190 */
 191static int dlmfs_file_setattr(struct user_namespace *mnt_userns,
 192                              struct dentry *dentry, struct iattr *attr)
 193{
 194        int error;
 195        struct inode *inode = d_inode(dentry);
 196
 197        attr->ia_valid &= ~ATTR_SIZE;
 198        error = setattr_prepare(&init_user_ns, dentry, attr);
 199        if (error)
 200                return error;
 201
 202        setattr_copy(&init_user_ns, inode, attr);
 203        mark_inode_dirty(inode);
 204        return 0;
 205}
 206
 207static __poll_t dlmfs_file_poll(struct file *file, poll_table *wait)
 208{
 209        __poll_t event = 0;
 210        struct inode *inode = file_inode(file);
 211        struct dlmfs_inode_private *ip = DLMFS_I(inode);
 212
 213        poll_wait(file, &ip->ip_lockres.l_event, wait);
 214
 215        spin_lock(&ip->ip_lockres.l_lock);
 216        if (ip->ip_lockres.l_flags & USER_LOCK_BLOCKED)
 217                event = EPOLLIN | EPOLLRDNORM;
 218        spin_unlock(&ip->ip_lockres.l_lock);
 219
 220        return event;
 221}
 222
 223static ssize_t dlmfs_file_read(struct file *file,
 224                               char __user *buf,
 225                               size_t count,
 226                               loff_t *ppos)
 227{
 228        char lvb[DLM_LVB_LEN];
 229
 230        if (!user_dlm_read_lvb(file_inode(file), lvb))
 231                return 0;
 232
 233        return simple_read_from_buffer(buf, count, ppos, lvb, sizeof(lvb));
 234}
 235
 236static ssize_t dlmfs_file_write(struct file *filp,
 237                                const char __user *buf,
 238                                size_t count,
 239                                loff_t *ppos)
 240{
 241        char lvb_buf[DLM_LVB_LEN];
 242        int bytes_left;
 243        struct inode *inode = file_inode(filp);
 244
 245        mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
 246                inode->i_ino, count, *ppos);
 247
 248        if (*ppos >= DLM_LVB_LEN)
 249                return -ENOSPC;
 250
 251        /* don't write past the lvb */
 252        if (count > DLM_LVB_LEN - *ppos)
 253                count = DLM_LVB_LEN - *ppos;
 254
 255        if (!count)
 256                return 0;
 257
 258        bytes_left = copy_from_user(lvb_buf, buf, count);
 259        count -= bytes_left;
 260        if (count)
 261                user_dlm_write_lvb(inode, lvb_buf, count);
 262
 263        *ppos = *ppos + count;
 264        mlog(0, "wrote %zu bytes\n", count);
 265        return count;
 266}
 267
 268static void dlmfs_init_once(void *foo)
 269{
 270        struct dlmfs_inode_private *ip =
 271                (struct dlmfs_inode_private *) foo;
 272
 273        ip->ip_conn = NULL;
 274        ip->ip_parent = NULL;
 275
 276        inode_init_once(&ip->ip_vfs_inode);
 277}
 278
 279static struct inode *dlmfs_alloc_inode(struct super_block *sb)
 280{
 281        struct dlmfs_inode_private *ip;
 282
 283        ip = kmem_cache_alloc(dlmfs_inode_cache, GFP_NOFS);
 284        if (!ip)
 285                return NULL;
 286
 287        return &ip->ip_vfs_inode;
 288}
 289
 290static void dlmfs_free_inode(struct inode *inode)
 291{
 292        kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
 293}
 294
 295static void dlmfs_evict_inode(struct inode *inode)
 296{
 297        int status;
 298        struct dlmfs_inode_private *ip;
 299
 300        clear_inode(inode);
 301
 302        mlog(0, "inode %lu\n", inode->i_ino);
 303
 304        ip = DLMFS_I(inode);
 305
 306        if (S_ISREG(inode->i_mode)) {
 307                status = user_dlm_destroy_lock(&ip->ip_lockres);
 308                if (status < 0)
 309                        mlog_errno(status);
 310                iput(ip->ip_parent);
 311                goto clear_fields;
 312        }
 313
 314        mlog(0, "we're a directory, ip->ip_conn = 0x%p\n", ip->ip_conn);
 315        /* we must be a directory. If required, lets unregister the
 316         * dlm context now. */
 317        if (ip->ip_conn)
 318                user_dlm_unregister(ip->ip_conn);
 319clear_fields:
 320        ip->ip_parent = NULL;
 321        ip->ip_conn = NULL;
 322}
 323
 324static struct inode *dlmfs_get_root_inode(struct super_block *sb)
 325{
 326        struct inode *inode = new_inode(sb);
 327        umode_t mode = S_IFDIR | 0755;
 328
 329        if (inode) {
 330                inode->i_ino = get_next_ino();
 331                inode_init_owner(&init_user_ns, inode, NULL, mode);
 332                inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
 333                inc_nlink(inode);
 334
 335                inode->i_fop = &simple_dir_operations;
 336                inode->i_op = &dlmfs_root_inode_operations;
 337        }
 338
 339        return inode;
 340}
 341
 342static struct inode *dlmfs_get_inode(struct inode *parent,
 343                                     struct dentry *dentry,
 344                                     umode_t mode)
 345{
 346        struct super_block *sb = parent->i_sb;
 347        struct inode * inode = new_inode(sb);
 348        struct dlmfs_inode_private *ip;
 349
 350        if (!inode)
 351                return NULL;
 352
 353        inode->i_ino = get_next_ino();
 354        inode_init_owner(&init_user_ns, inode, parent, mode);
 355        inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
 356
 357        ip = DLMFS_I(inode);
 358        ip->ip_conn = DLMFS_I(parent)->ip_conn;
 359
 360        switch (mode & S_IFMT) {
 361        default:
 362                /* for now we don't support anything other than
 363                 * directories and regular files. */
 364                BUG();
 365                break;
 366        case S_IFREG:
 367                inode->i_op = &dlmfs_file_inode_operations;
 368                inode->i_fop = &dlmfs_file_operations;
 369
 370                i_size_write(inode,  DLM_LVB_LEN);
 371
 372                user_dlm_lock_res_init(&ip->ip_lockres, dentry);
 373
 374                /* released at clear_inode time, this insures that we
 375                 * get to drop the dlm reference on each lock *before*
 376                 * we call the unregister code for releasing parent
 377                 * directories. */
 378                ip->ip_parent = igrab(parent);
 379                BUG_ON(!ip->ip_parent);
 380                break;
 381        case S_IFDIR:
 382                inode->i_op = &dlmfs_dir_inode_operations;
 383                inode->i_fop = &simple_dir_operations;
 384
 385                /* directory inodes start off with i_nlink ==
 386                 * 2 (for "." entry) */
 387                inc_nlink(inode);
 388                break;
 389        }
 390        return inode;
 391}
 392
 393/*
 394 * File creation. Allocate an inode, and we're done..
 395 */
 396/* SMP-safe */
 397static int dlmfs_mkdir(struct user_namespace * mnt_userns,
 398                       struct inode * dir,
 399                       struct dentry * dentry,
 400                       umode_t mode)
 401{
 402        int status;
 403        struct inode *inode = NULL;
 404        const struct qstr *domain = &dentry->d_name;
 405        struct dlmfs_inode_private *ip;
 406        struct ocfs2_cluster_connection *conn;
 407
 408        mlog(0, "mkdir %.*s\n", domain->len, domain->name);
 409
 410        /* verify that we have a proper domain */
 411        if (domain->len >= GROUP_NAME_MAX) {
 412                status = -EINVAL;
 413                mlog(ML_ERROR, "invalid domain name for directory.\n");
 414                goto bail;
 415        }
 416
 417        inode = dlmfs_get_inode(dir, dentry, mode | S_IFDIR);
 418        if (!inode) {
 419                status = -ENOMEM;
 420                mlog_errno(status);
 421                goto bail;
 422        }
 423
 424        ip = DLMFS_I(inode);
 425
 426        conn = user_dlm_register(domain);
 427        if (IS_ERR(conn)) {
 428                status = PTR_ERR(conn);
 429                mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n",
 430                     status, domain->len, domain->name);
 431                goto bail;
 432        }
 433        ip->ip_conn = conn;
 434
 435        inc_nlink(dir);
 436        d_instantiate(dentry, inode);
 437        dget(dentry);   /* Extra count - pin the dentry in core */
 438
 439        status = 0;
 440bail:
 441        if (status < 0)
 442                iput(inode);
 443        return status;
 444}
 445
 446static int dlmfs_create(struct user_namespace *mnt_userns,
 447                        struct inode *dir,
 448                        struct dentry *dentry,
 449                        umode_t mode,
 450                        bool excl)
 451{
 452        int status = 0;
 453        struct inode *inode;
 454        const struct qstr *name = &dentry->d_name;
 455
 456        mlog(0, "create %.*s\n", name->len, name->name);
 457
 458        /* verify name is valid and doesn't contain any dlm reserved
 459         * characters */
 460        if (name->len >= USER_DLM_LOCK_ID_MAX_LEN ||
 461            name->name[0] == '$') {
 462                status = -EINVAL;
 463                mlog(ML_ERROR, "invalid lock name, %.*s\n", name->len,
 464                     name->name);
 465                goto bail;
 466        }
 467
 468        inode = dlmfs_get_inode(dir, dentry, mode | S_IFREG);
 469        if (!inode) {
 470                status = -ENOMEM;
 471                mlog_errno(status);
 472                goto bail;
 473        }
 474
 475        d_instantiate(dentry, inode);
 476        dget(dentry);   /* Extra count - pin the dentry in core */
 477bail:
 478        return status;
 479}
 480
 481static int dlmfs_unlink(struct inode *dir,
 482                        struct dentry *dentry)
 483{
 484        int status;
 485        struct inode *inode = d_inode(dentry);
 486
 487        mlog(0, "unlink inode %lu\n", inode->i_ino);
 488
 489        /* if there are no current holders, or none that are waiting
 490         * to acquire a lock, this basically destroys our lockres. */
 491        status = user_dlm_destroy_lock(&DLMFS_I(inode)->ip_lockres);
 492        if (status < 0) {
 493                mlog(ML_ERROR, "unlink %pd, error %d from destroy\n",
 494                     dentry, status);
 495                goto bail;
 496        }
 497        status = simple_unlink(dir, dentry);
 498bail:
 499        return status;
 500}
 501
 502static int dlmfs_fill_super(struct super_block * sb,
 503                            void * data,
 504                            int silent)
 505{
 506        sb->s_maxbytes = MAX_LFS_FILESIZE;
 507        sb->s_blocksize = PAGE_SIZE;
 508        sb->s_blocksize_bits = PAGE_SHIFT;
 509        sb->s_magic = DLMFS_MAGIC;
 510        sb->s_op = &dlmfs_ops;
 511        sb->s_root = d_make_root(dlmfs_get_root_inode(sb));
 512        if (!sb->s_root)
 513                return -ENOMEM;
 514        return 0;
 515}
 516
 517static const struct file_operations dlmfs_file_operations = {
 518        .open           = dlmfs_file_open,
 519        .release        = dlmfs_file_release,
 520        .poll           = dlmfs_file_poll,
 521        .read           = dlmfs_file_read,
 522        .write          = dlmfs_file_write,
 523        .llseek         = default_llseek,
 524};
 525
 526static const struct inode_operations dlmfs_dir_inode_operations = {
 527        .create         = dlmfs_create,
 528        .lookup         = simple_lookup,
 529        .unlink         = dlmfs_unlink,
 530};
 531
 532/* this way we can restrict mkdir to only the toplevel of the fs. */
 533static const struct inode_operations dlmfs_root_inode_operations = {
 534        .lookup         = simple_lookup,
 535        .mkdir          = dlmfs_mkdir,
 536        .rmdir          = simple_rmdir,
 537};
 538
 539static const struct super_operations dlmfs_ops = {
 540        .statfs         = simple_statfs,
 541        .alloc_inode    = dlmfs_alloc_inode,
 542        .free_inode     = dlmfs_free_inode,
 543        .evict_inode    = dlmfs_evict_inode,
 544        .drop_inode     = generic_delete_inode,
 545};
 546
 547static const struct inode_operations dlmfs_file_inode_operations = {
 548        .getattr        = simple_getattr,
 549        .setattr        = dlmfs_file_setattr,
 550};
 551
 552static struct dentry *dlmfs_mount(struct file_system_type *fs_type,
 553        int flags, const char *dev_name, void *data)
 554{
 555        return mount_nodev(fs_type, flags, data, dlmfs_fill_super);
 556}
 557
 558static struct file_system_type dlmfs_fs_type = {
 559        .owner          = THIS_MODULE,
 560        .name           = "ocfs2_dlmfs",
 561        .mount          = dlmfs_mount,
 562        .kill_sb        = kill_litter_super,
 563};
 564MODULE_ALIAS_FS("ocfs2_dlmfs");
 565
 566static int __init init_dlmfs_fs(void)
 567{
 568        int status;
 569        int cleanup_inode = 0, cleanup_worker = 0;
 570
 571        dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
 572                                sizeof(struct dlmfs_inode_private),
 573                                0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
 574                                        SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 575                                dlmfs_init_once);
 576        if (!dlmfs_inode_cache) {
 577                status = -ENOMEM;
 578                goto bail;
 579        }
 580        cleanup_inode = 1;
 581
 582        user_dlm_worker = alloc_workqueue("user_dlm", WQ_MEM_RECLAIM, 0);
 583        if (!user_dlm_worker) {
 584                status = -ENOMEM;
 585                goto bail;
 586        }
 587        cleanup_worker = 1;
 588
 589        user_dlm_set_locking_protocol();
 590        status = register_filesystem(&dlmfs_fs_type);
 591bail:
 592        if (status) {
 593                if (cleanup_inode)
 594                        kmem_cache_destroy(dlmfs_inode_cache);
 595                if (cleanup_worker)
 596                        destroy_workqueue(user_dlm_worker);
 597        } else
 598                printk("OCFS2 User DLM kernel interface loaded\n");
 599        return status;
 600}
 601
 602static void __exit exit_dlmfs_fs(void)
 603{
 604        unregister_filesystem(&dlmfs_fs_type);
 605
 606        destroy_workqueue(user_dlm_worker);
 607
 608        /*
 609         * Make sure all delayed rcu free inodes are flushed before we
 610         * destroy cache.
 611         */
 612        rcu_barrier();
 613        kmem_cache_destroy(dlmfs_inode_cache);
 614
 615}
 616
 617MODULE_AUTHOR("Oracle");
 618MODULE_LICENSE("GPL");
 619MODULE_DESCRIPTION("OCFS2 DLM-Filesystem");
 620
 621module_init(init_dlmfs_fs)
 622module_exit(exit_dlmfs_fs)
 623