linux/fs/ceph/mds_client.h
<<
>>
Prefs
   1/* SPDX-License-Identifier: GPL-2.0 */
   2#ifndef _FS_CEPH_MDS_CLIENT_H
   3#define _FS_CEPH_MDS_CLIENT_H
   4
   5#include <linux/completion.h>
   6#include <linux/kref.h>
   7#include <linux/list.h>
   8#include <linux/mutex.h>
   9#include <linux/rbtree.h>
  10#include <linux/spinlock.h>
  11#include <linux/refcount.h>
  12#include <linux/utsname.h>
  13
  14#include <linux/ceph/types.h>
  15#include <linux/ceph/messenger.h>
  16#include <linux/ceph/mdsmap.h>
  17#include <linux/ceph/auth.h>
  18
  19/* The first 8 bits are reserved for old ceph releases */
  20#define CEPHFS_FEATURE_MIMIC            8
  21#define CEPHFS_FEATURE_REPLY_ENCODING   9
  22#define CEPHFS_FEATURE_RECLAIM_CLIENT   10
  23#define CEPHFS_FEATURE_LAZY_CAP_WANTED  11
  24#define CEPHFS_FEATURE_MULTI_RECONNECT  12
  25
  26#define CEPHFS_FEATURES_CLIENT_SUPPORTED {      \
  27        0, 1, 2, 3, 4, 5, 6, 7,                 \
  28        CEPHFS_FEATURE_MIMIC,                   \
  29        CEPHFS_FEATURE_REPLY_ENCODING,          \
  30        CEPHFS_FEATURE_LAZY_CAP_WANTED,         \
  31        CEPHFS_FEATURE_MULTI_RECONNECT,         \
  32}
  33#define CEPHFS_FEATURES_CLIENT_REQUIRED {}
  34
  35
  36/*
  37 * Some lock dependencies:
  38 *
  39 * session->s_mutex
  40 *         mdsc->mutex
  41 *
  42 *         mdsc->snap_rwsem
  43 *
  44 *         ci->i_ceph_lock
  45 *                 mdsc->snap_flush_lock
  46 *                 mdsc->cap_delay_lock
  47 *
  48 */
  49
  50struct ceph_fs_client;
  51struct ceph_cap;
  52
  53/*
  54 * parsed info about a single inode.  pointers are into the encoded
  55 * on-wire structures within the mds reply message payload.
  56 */
  57struct ceph_mds_reply_info_in {
  58        struct ceph_mds_reply_inode *in;
  59        struct ceph_dir_layout dir_layout;
  60        u32 symlink_len;
  61        char *symlink;
  62        u32 xattr_len;
  63        char *xattr_data;
  64        u64 inline_version;
  65        u32 inline_len;
  66        char *inline_data;
  67        u32 pool_ns_len;
  68        char *pool_ns_data;
  69        u64 max_bytes;
  70        u64 max_files;
  71        s32 dir_pin;
  72        struct ceph_timespec btime;
  73        struct ceph_timespec snap_btime;
  74        u64 change_attr;
  75};
  76
  77struct ceph_mds_reply_dir_entry {
  78        char                          *name;
  79        u32                           name_len;
  80        struct ceph_mds_reply_lease   *lease;
  81        struct ceph_mds_reply_info_in inode;
  82        loff_t                        offset;
  83};
  84
  85/*
  86 * parsed info about an mds reply, including information about
  87 * either: 1) the target inode and/or its parent directory and dentry,
  88 * and directory contents (for readdir results), or
  89 * 2) the file range lock info (for fcntl F_GETLK results).
  90 */
  91struct ceph_mds_reply_info_parsed {
  92        struct ceph_mds_reply_head    *head;
  93
  94        /* trace */
  95        struct ceph_mds_reply_info_in diri, targeti;
  96        struct ceph_mds_reply_dirfrag *dirfrag;
  97        char                          *dname;
  98        u32                           dname_len;
  99        struct ceph_mds_reply_lease   *dlease;
 100
 101        /* extra */
 102        union {
 103                /* for fcntl F_GETLK results */
 104                struct ceph_filelock *filelock_reply;
 105
 106                /* for readdir results */
 107                struct {
 108                        struct ceph_mds_reply_dirfrag *dir_dir;
 109                        size_t                        dir_buf_size;
 110                        int                           dir_nr;
 111                        bool                          dir_end;
 112                        bool                          dir_complete;
 113                        bool                          hash_order;
 114                        bool                          offset_hash;
 115                        struct ceph_mds_reply_dir_entry  *dir_entries;
 116                };
 117
 118                /* for create results */
 119                struct {
 120                        bool has_create_ino;
 121                        u64 ino;
 122                };
 123        };
 124
 125        /* encoded blob describing snapshot contexts for certain
 126           operations (e.g., open) */
 127        void *snapblob;
 128        int snapblob_len;
 129};
 130
 131
 132/*
 133 * cap releases are batched and sent to the MDS en masse.
 134 *
 135 * Account for per-message overhead of mds_cap_release header
 136 * and __le32 for osd epoch barrier trailing field.
 137 */
 138#define CEPH_CAPS_PER_RELEASE ((PAGE_SIZE - sizeof(u32) -               \
 139                                sizeof(struct ceph_mds_cap_release)) /  \
 140                                sizeof(struct ceph_mds_cap_item))
 141
 142
 143/*
 144 * state associated with each MDS<->client session
 145 */
 146enum {
 147        CEPH_MDS_SESSION_NEW = 1,
 148        CEPH_MDS_SESSION_OPENING = 2,
 149        CEPH_MDS_SESSION_OPEN = 3,
 150        CEPH_MDS_SESSION_HUNG = 4,
 151        CEPH_MDS_SESSION_CLOSING = 5,
 152        CEPH_MDS_SESSION_RESTARTING = 6,
 153        CEPH_MDS_SESSION_RECONNECTING = 7,
 154        CEPH_MDS_SESSION_REJECTED = 8,
 155};
 156
 157struct ceph_mds_session {
 158        struct ceph_mds_client *s_mdsc;
 159        int               s_mds;
 160        int               s_state;
 161        unsigned long     s_ttl;      /* time until mds kills us */
 162        unsigned long     s_features;
 163        u64               s_seq;      /* incoming msg seq # */
 164        struct mutex      s_mutex;    /* serialize session messages */
 165
 166        struct ceph_connection s_con;
 167
 168        struct ceph_auth_handshake s_auth;
 169
 170        /* protected by s_gen_ttl_lock */
 171        spinlock_t        s_gen_ttl_lock;
 172        u32               s_cap_gen;  /* inc each time we get mds stale msg */
 173        unsigned long     s_cap_ttl;  /* when session caps expire */
 174
 175        /* protected by s_cap_lock */
 176        spinlock_t        s_cap_lock;
 177        struct list_head  s_caps;     /* all caps issued by this session */
 178        struct ceph_cap  *s_cap_iterator;
 179        int               s_nr_caps, s_trim_caps;
 180        int               s_num_cap_releases;
 181        int               s_cap_reconnect;
 182        int               s_readonly;
 183        struct list_head  s_cap_releases; /* waiting cap_release messages */
 184        struct work_struct s_cap_release_work;
 185
 186        /* protected by mutex */
 187        struct list_head  s_cap_flushing;     /* inodes w/ flushing caps */
 188        unsigned long     s_renew_requested; /* last time we sent a renew req */
 189        u64               s_renew_seq;
 190
 191        refcount_t        s_ref;
 192        struct list_head  s_waiting;  /* waiting requests */
 193        struct list_head  s_unsafe;   /* unsafe requests */
 194};
 195
 196/*
 197 * modes of choosing which MDS to send a request to
 198 */
 199enum {
 200        USE_ANY_MDS,
 201        USE_RANDOM_MDS,
 202        USE_AUTH_MDS,   /* prefer authoritative mds for this metadata item */
 203};
 204
 205struct ceph_mds_request;
 206struct ceph_mds_client;
 207
 208/*
 209 * request completion callback
 210 */
 211typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
 212                                             struct ceph_mds_request *req);
 213/*
 214 * wait for request completion callback
 215 */
 216typedef int (*ceph_mds_request_wait_callback_t) (struct ceph_mds_client *mdsc,
 217                                                 struct ceph_mds_request *req);
 218
 219/*
 220 * an in-flight mds request
 221 */
 222struct ceph_mds_request {
 223        u64 r_tid;                   /* transaction id */
 224        struct rb_node r_node;
 225        struct ceph_mds_client *r_mdsc;
 226
 227        int r_op;                    /* mds op code */
 228
 229        /* operation on what? */
 230        struct inode *r_inode;              /* arg1 */
 231        struct dentry *r_dentry;            /* arg1 */
 232        struct dentry *r_old_dentry;        /* arg2: rename from or link from */
 233        struct inode *r_old_dentry_dir;     /* arg2: old dentry's parent dir */
 234        char *r_path1, *r_path2;
 235        struct ceph_vino r_ino1, r_ino2;
 236
 237        struct inode *r_parent;             /* parent dir inode */
 238        struct inode *r_target_inode;       /* resulting inode */
 239
 240#define CEPH_MDS_R_DIRECT_IS_HASH       (1) /* r_direct_hash is valid */
 241#define CEPH_MDS_R_ABORTED              (2) /* call was aborted */
 242#define CEPH_MDS_R_GOT_UNSAFE           (3) /* got an unsafe reply */
 243#define CEPH_MDS_R_GOT_SAFE             (4) /* got a safe reply */
 244#define CEPH_MDS_R_GOT_RESULT           (5) /* got a result */
 245#define CEPH_MDS_R_DID_PREPOPULATE      (6) /* prepopulated readdir */
 246#define CEPH_MDS_R_PARENT_LOCKED        (7) /* is r_parent->i_rwsem wlocked? */
 247        unsigned long   r_req_flags;
 248
 249        struct mutex r_fill_mutex;
 250
 251        union ceph_mds_request_args r_args;
 252        int r_fmode;        /* file mode, if expecting cap */
 253        kuid_t r_uid;
 254        kgid_t r_gid;
 255        struct timespec64 r_stamp;
 256
 257        /* for choosing which mds to send this request to */
 258        int r_direct_mode;
 259        u32 r_direct_hash;      /* choose dir frag based on this dentry hash */
 260
 261        /* data payload is used for xattr ops */
 262        struct ceph_pagelist *r_pagelist;
 263
 264        /* what caps shall we drop? */
 265        int r_inode_drop, r_inode_unless;
 266        int r_dentry_drop, r_dentry_unless;
 267        int r_old_dentry_drop, r_old_dentry_unless;
 268        struct inode *r_old_inode;
 269        int r_old_inode_drop, r_old_inode_unless;
 270
 271        struct ceph_msg  *r_request;  /* original request */
 272        int r_request_release_offset;
 273        struct ceph_msg  *r_reply;
 274        struct ceph_mds_reply_info_parsed r_reply_info;
 275        struct page *r_locked_page;
 276        int r_err;
 277
 278        unsigned long r_timeout;  /* optional.  jiffies, 0 is "wait forever" */
 279        unsigned long r_started;  /* start time to measure timeout against */
 280        unsigned long r_request_started; /* start time for mds request only,
 281                                            used to measure lease durations */
 282
 283        /* link unsafe requests to parent directory, for fsync */
 284        struct inode    *r_unsafe_dir;
 285        struct list_head r_unsafe_dir_item;
 286
 287        /* unsafe requests that modify the target inode */
 288        struct list_head r_unsafe_target_item;
 289
 290        struct ceph_mds_session *r_session;
 291
 292        int               r_attempts;   /* resend attempts */
 293        int               r_num_fwd;    /* number of forward attempts */
 294        int               r_resend_mds; /* mds to resend to next, if any*/
 295        u32               r_sent_on_mseq; /* cap mseq request was sent at*/
 296
 297        struct kref       r_kref;
 298        struct list_head  r_wait;
 299        struct completion r_completion;
 300        struct completion r_safe_completion;
 301        ceph_mds_request_callback_t r_callback;
 302        ceph_mds_request_wait_callback_t r_wait_for_completion;
 303        struct list_head  r_unsafe_item;  /* per-session unsafe list item */
 304
 305        long long         r_dir_release_cnt;
 306        long long         r_dir_ordered_cnt;
 307        int               r_readdir_cache_idx;
 308        u32               r_readdir_offset;
 309
 310        struct ceph_cap_reservation r_caps_reservation;
 311        int r_num_caps;
 312};
 313
 314struct ceph_pool_perm {
 315        struct rb_node node;
 316        int perm;
 317        s64 pool;
 318        size_t pool_ns_len;
 319        char pool_ns[];
 320};
 321
 322struct ceph_snapid_map {
 323        struct rb_node node;
 324        struct list_head lru;
 325        atomic_t ref;
 326        u64 snap;
 327        dev_t dev;
 328        unsigned long last_used;
 329};
 330
 331/*
 332 * node for list of quotarealm inodes that are not visible from the filesystem
 333 * mountpoint, but required to handle, e.g. quotas.
 334 */
 335struct ceph_quotarealm_inode {
 336        struct rb_node node;
 337        u64 ino;
 338        unsigned long timeout; /* last time a lookup failed for this inode */
 339        struct mutex mutex;
 340        struct inode *inode;
 341};
 342
 343/*
 344 * mds client state
 345 */
 346struct ceph_mds_client {
 347        struct ceph_fs_client  *fsc;
 348        struct mutex            mutex;         /* all nested structures */
 349
 350        struct ceph_mdsmap      *mdsmap;
 351        struct completion       safe_umount_waiters;
 352        wait_queue_head_t       session_close_wq;
 353        struct list_head        waiting_for_map;
 354        int                     mdsmap_err;
 355
 356        struct ceph_mds_session **sessions;    /* NULL for mds if no session */
 357        atomic_t                num_sessions;
 358        int                     max_sessions;  /* len of s_mds_sessions */
 359        int                     stopping;      /* true if shutting down */
 360
 361        atomic64_t              quotarealms_count; /* # realms with quota */
 362        /*
 363         * We keep a list of inodes we don't see in the mountpoint but that we
 364         * need to track quota realms.
 365         */
 366        struct rb_root          quotarealms_inodes;
 367        struct mutex            quotarealms_inodes_mutex;
 368
 369        /*
 370         * snap_rwsem will cover cap linkage into snaprealms, and
 371         * realm snap contexts.  (later, we can do per-realm snap
 372         * contexts locks..)  the empty list contains realms with no
 373         * references (implying they contain no inodes with caps) that
 374         * should be destroyed.
 375         */
 376        u64                     last_snap_seq;
 377        struct rw_semaphore     snap_rwsem;
 378        struct rb_root          snap_realms;
 379        struct list_head        snap_empty;
 380        int                     num_snap_realms;
 381        spinlock_t              snap_empty_lock;  /* protect snap_empty */
 382
 383        u64                    last_tid;      /* most recent mds request */
 384        u64                    oldest_tid;    /* oldest incomplete mds request,
 385                                                 excluding setfilelock requests */
 386        struct rb_root         request_tree;  /* pending mds requests */
 387        struct delayed_work    delayed_work;  /* delayed work */
 388        unsigned long    last_renew_caps;  /* last time we renewed our caps */
 389        struct list_head cap_delay_list;   /* caps with delayed release */
 390        spinlock_t       cap_delay_lock;   /* protects cap_delay_list */
 391        struct list_head snap_flush_list;  /* cap_snaps ready to flush */
 392        spinlock_t       snap_flush_lock;
 393
 394        u64               last_cap_flush_tid;
 395        struct list_head  cap_flush_list;
 396        struct list_head  cap_dirty;        /* inodes with dirty caps */
 397        struct list_head  cap_dirty_migrating; /* ...that are migration... */
 398        int               num_cap_flushing; /* # caps we are flushing */
 399        spinlock_t        cap_dirty_lock;   /* protects above items */
 400        wait_queue_head_t cap_flushing_wq;
 401
 402        struct work_struct cap_reclaim_work;
 403        atomic_t           cap_reclaim_pending;
 404
 405        /*
 406         * Cap reservations
 407         *
 408         * Maintain a global pool of preallocated struct ceph_caps, referenced
 409         * by struct ceph_caps_reservations.  This ensures that we preallocate
 410         * memory needed to successfully process an MDS response.  (If an MDS
 411         * sends us cap information and we fail to process it, we will have
 412         * problems due to the client and MDS being out of sync.)
 413         *
 414         * Reservations are 'owned' by a ceph_cap_reservation context.
 415         */
 416        spinlock_t      caps_list_lock;
 417        struct          list_head caps_list; /* unused (reserved or
 418                                                unreserved) */
 419        int             caps_total_count;    /* total caps allocated */
 420        int             caps_use_count;      /* in use */
 421        int             caps_use_max;        /* max used caps */
 422        int             caps_reserve_count;  /* unused, reserved */
 423        int             caps_avail_count;    /* unused, unreserved */
 424        int             caps_min_count;      /* keep at least this many
 425                                                (unreserved) */
 426        spinlock_t        dentry_list_lock;
 427        struct list_head  dentry_leases;     /* fifo list */
 428        struct list_head  dentry_dir_leases; /* lru list */
 429
 430        spinlock_t              snapid_map_lock;
 431        struct rb_root          snapid_map_tree;
 432        struct list_head        snapid_map_lru;
 433
 434        struct rw_semaphore     pool_perm_rwsem;
 435        struct rb_root          pool_perm_tree;
 436
 437        char nodename[__NEW_UTS_LEN + 1];
 438};
 439
 440extern const char *ceph_mds_op_name(int op);
 441
 442extern struct ceph_mds_session *
 443__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
 444
 445static inline struct ceph_mds_session *
 446ceph_get_mds_session(struct ceph_mds_session *s)
 447{
 448        refcount_inc(&s->s_ref);
 449        return s;
 450}
 451
 452extern const char *ceph_session_state_name(int s);
 453
 454extern void ceph_put_mds_session(struct ceph_mds_session *s);
 455
 456extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
 457                             struct ceph_msg *msg, int mds);
 458
 459extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
 460extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
 461extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc);
 462extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
 463
 464extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
 465
 466extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
 467extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
 468                                           struct inode *dir);
 469extern struct ceph_mds_request *
 470ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
 471extern int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
 472                                    struct inode *dir,
 473                                    struct ceph_mds_request *req);
 474extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
 475                                struct inode *dir,
 476                                struct ceph_mds_request *req);
 477static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
 478{
 479        kref_get(&req->r_kref);
 480}
 481extern void ceph_mdsc_release_request(struct kref *kref);
 482static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
 483{
 484        kref_put(&req->r_kref, ceph_mdsc_release_request);
 485}
 486
 487extern void __ceph_queue_cap_release(struct ceph_mds_session *session,
 488                                    struct ceph_cap *cap);
 489extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
 490                                    struct ceph_mds_session *session);
 491extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc);
 492extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr);
 493extern int ceph_iterate_session_caps(struct ceph_mds_session *session,
 494                                     int (*cb)(struct inode *,
 495                                               struct ceph_cap *, void *),
 496                                     void *arg);
 497extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
 498
 499static inline void ceph_mdsc_free_path(char *path, int len)
 500{
 501        if (path)
 502                __putname(path - (PATH_MAX - 1 - len));
 503}
 504
 505extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
 506                                  int stop_on_nosnap);
 507
 508extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
 509extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
 510                                     struct dentry *dentry, char action,
 511                                     u32 seq);
 512
 513extern void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc,
 514                                    struct ceph_msg *msg);
 515extern void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc,
 516                                   struct ceph_msg *msg);
 517
 518extern struct ceph_mds_session *
 519ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target);
 520extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
 521                                          struct ceph_mds_session *session);
 522
 523extern int ceph_trim_caps(struct ceph_mds_client *mdsc,
 524                          struct ceph_mds_session *session,
 525                          int max_caps);
 526#endif
 527