linux/drivers/staging/zcache/tmem.c
/*
 * In-kernel transcendent memory (generic implementation)
 *
 * Copyright (c) 2009-2012, Dan Magenheimer, Oracle Corp.
 *
 * The primary purpose of Transcendent Memory ("tmem") is to map object-oriented
 * "handles" (triples containing a pool id, an object id, and an index) to
 * pages in a page-accessible memory (PAM).  Tmem references the PAM pages via
 * an abstract "pampd" (PAM page-descriptor), which can be operated on by a
 * set of functions (pamops).  Each pampd contains some representation of
 * PAGE_SIZE bytes worth of data. For those familiar with key-value stores,
 * the tmem handle is a three-level hierarchical key, and the value is always
 * reconstituted (but not necessarily stored) as PAGE_SIZE bytes and is
 * referenced in the datastore by the pampd.  The hierarchy is required
 * to ensure that certain invalidation functions can be performed efficiently
 * (i.e. flush all indexes associated with this object_id, or
 * flush all objects associated with this pool).
 *
 * Tmem must support potentially millions of pages and must be able to insert,
 * find, and delete these pages at a potential frequency of thousands per
 * second, concurrently across many CPUs (and, if used with KVM, across many
 * vcpus across many guests).  Tmem is tracked with a hierarchy of data
 * structures, organized by the elements in the handle-tuple: pool_id,
 * object_id, and page index.  One or more "clients" (e.g. guests) each
 * provide one or more tmem_pools.  Each pool contains a hash table of
 * rb_trees of tmem_objs.  Each tmem_obj contains a radix-tree-like tree
 * of pointers, with intermediate nodes called tmem_objnodes.  Each leaf
 * pointer in this tree points to a pampd, which is accessible only through
 * a small set of callbacks registered by the PAM implementation (see
 * tmem_register_pamops). Tmem only needs memory allocation for objs
 * and objnodes and this is done via a set of callbacks that must be
 * registered by the tmem host implementation (e.g. see tmem_register_hostops).
 */

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/export.h>
#if defined(CONFIG_RAMSTER) || defined(CONFIG_RAMSTER_MODULE)
#include <linux/delay.h>
#endif

#include "tmem.h"

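/*
 * Illustrative sketch (not part of the build): how a hypothetical frontend
 * might address tmem with the three-part handle described above.  The pool,
 * the oid value, and the pampd shown here are assumptions for illustration
 * only; real callers (e.g. zcache) obtain the pool from their own client
 * bookkeeping and build the pampd through the registered pamops.  The oid
 * initializer assumes the three-word layout declared in tmem.h.
 */
#if 0
static void tmem_handle_example(struct tmem_pool *pool, void *pampd)
{
        struct tmem_oid oid = { .oid = { 17, 0, 0 } }; /* hypothetical oid */
        uint32_t index = 3;             /* page index within the object */

        /* associate the pampd with the handle (pool, oid, index) */
        (void)tmem_put(pool, &oid, index, false, pampd);

        /* the same triple later invalidates exactly that one page... */
        (void)tmem_flush_page(pool, &oid, index);

        /* ...while flushing the oid drops every index under that object */
        (void)tmem_flush_object(pool, &oid);
}
#endif
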
/* data structure sentinels used for debugging... see tmem.h */
#define POOL_SENTINEL 0x87658765
#define OBJ_SENTINEL 0x12345678
#define OBJNODE_SENTINEL 0xfedcba09

/*
 * A tmem host implementation must use this function to register callbacks
 * for memory allocation.
 */
static struct tmem_hostops tmem_hostops;

static void tmem_objnode_tree_init(void);

void tmem_register_hostops(struct tmem_hostops *m)
{
        tmem_objnode_tree_init();
        tmem_hostops = *m;
}

/*
 * A tmem host implementation must use this function to register
 * callbacks for a page-accessible memory (PAM) implementation.
 */
static struct tmem_pamops tmem_pamops;

void tmem_register_pamops(struct tmem_pamops *m)
{
        tmem_pamops = *m;
}

/*
 * Oids are potentially very sparse and tmem_objs may have an indeterminately
 * short life, being added and deleted at a relatively high frequency.
 * So an rb_tree is an ideal data structure to manage tmem_objs.  But because
 * of the potentially huge number of tmem_objs, each pool manages a hashtable
 * of rb_trees to reduce search, insert, delete, and rebalancing time.
 * Each hashbucket also has a lock to manage concurrent access and no
 * searches, inserts, or deletions can be performed unless the lock is held.
 * As a result, care must be taken to ensure tmem routines are not called
 * recursively; the vast majority of the time, a recursive call may work,
 * but a deadlock will occur a small fraction of the time due to the
 * hashbucket lock.
 *
 * The following routines manage tmem_objs.  In all of these routines,
 * the hashbucket lock is already held.
 */

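/*
 * Illustrative sketch (not part of the build): the locking/lookup pattern
 * the comment above describes.  An oid hashes to exactly one bucket, and
 * that bucket's rb_tree is only walked with the bucket lock held (the
 * helpers used here are defined below).
 */
#if 0
static struct tmem_obj *tmem_lookup_example(struct tmem_pool *pool,
                                            struct tmem_oid *oidp)
{
        struct tmem_hashbucket *hb = &pool->hashbucket[tmem_oid_hash(oidp)];
        struct tmem_obj *obj;

        spin_lock(&hb->lock);           /* no search without the bucket lock */
        obj = tmem_obj_find(hb, oidp);  /* rb_tree walk, see below */
        spin_unlock(&hb->lock);
        return obj;                     /* only stable while the lock is held */
}
#endif
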
/* Search for object==oid in pool, returns object if found. */
static struct tmem_obj *__tmem_obj_find(struct tmem_hashbucket *hb,
                                        struct tmem_oid *oidp,
                                        struct rb_node **parent,
                                        struct rb_node ***link)
{
        struct rb_node *_parent = NULL, **rbnode;
        struct tmem_obj *obj = NULL;

        rbnode = &hb->obj_rb_root.rb_node;
        while (*rbnode) {
                BUG_ON(RB_EMPTY_NODE(*rbnode));
                _parent = *rbnode;
                obj = rb_entry(*rbnode, struct tmem_obj,
                               rb_tree_node);
                switch (tmem_oid_compare(oidp, &obj->oid)) {
                case 0: /* equal */
                        goto out;
                case -1:
                        rbnode = &(*rbnode)->rb_left;
                        break;
                case 1:
                        rbnode = &(*rbnode)->rb_right;
                        break;
                }
        }

        if (parent)
                *parent = _parent;
        if (link)
                *link = rbnode;
        obj = NULL;
out:
        return obj;
}

static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb,
                                        struct tmem_oid *oidp)
{
        return __tmem_obj_find(hb, oidp, NULL, NULL);
}

static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *, bool);

/* Free an object that has no more pampds in it. */
static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb)
{
        struct tmem_pool *pool;

        BUG_ON(obj == NULL);
        ASSERT_SENTINEL(obj, OBJ);
        BUG_ON(obj->pampd_count > 0);
        pool = obj->pool;
        BUG_ON(pool == NULL);
        if (obj->objnode_tree_root != NULL) /* may be "stump" with no leaves */
                tmem_pampd_destroy_all_in_obj(obj, false);
        BUG_ON(obj->objnode_tree_root != NULL);
        BUG_ON((long)obj->objnode_count != 0);
        atomic_dec(&pool->obj_count);
        BUG_ON(atomic_read(&pool->obj_count) < 0);
        INVERT_SENTINEL(obj, OBJ);
        obj->pool = NULL;
        tmem_oid_set_invalid(&obj->oid);
        rb_erase(&obj->rb_tree_node, &hb->obj_rb_root);
}

/*
 * Initialize and insert a tmem_object_root (called only if find failed).
 */
static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb,
                                        struct tmem_pool *pool,
                                        struct tmem_oid *oidp)
{
        struct rb_root *root = &hb->obj_rb_root;
        struct rb_node **new = NULL, *parent = NULL;

        BUG_ON(pool == NULL);
        atomic_inc(&pool->obj_count);
        obj->objnode_tree_height = 0;
        obj->objnode_tree_root = NULL;
        obj->pool = pool;
        obj->oid = *oidp;
        obj->objnode_count = 0;
        obj->pampd_count = 0;
#ifdef CONFIG_RAMSTER
        if (tmem_pamops.new_obj != NULL)
                (*tmem_pamops.new_obj)(obj);
#endif
        SET_SENTINEL(obj, OBJ);

        if (__tmem_obj_find(hb, oidp, &parent, &new))
                BUG();

        rb_link_node(&obj->rb_tree_node, parent, new);
        rb_insert_color(&obj->rb_tree_node, root);
}

/*
 * Tmem is managed as a set of tmem_pools with certain attributes, such as
 * "ephemeral" vs "persistent".  These attributes apply to all tmem_objs
 * and all pampds that belong to a tmem_pool.  A tmem_pool is created
 * or deleted relatively rarely (for example, when a filesystem is
 * mounted or unmounted).
 */

/* flush all data from a pool and, optionally, free it */
static void tmem_pool_flush(struct tmem_pool *pool, bool destroy)
{
        struct rb_node *rbnode;
        struct tmem_obj *obj;
        struct tmem_hashbucket *hb = &pool->hashbucket[0];
        int i;

        BUG_ON(pool == NULL);
        for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
                spin_lock(&hb->lock);
                rbnode = rb_first(&hb->obj_rb_root);
                while (rbnode != NULL) {
                        obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
                        rbnode = rb_next(rbnode);
                        tmem_pampd_destroy_all_in_obj(obj, true);
                        tmem_obj_free(obj, hb);
                        (*tmem_hostops.obj_free)(obj, pool);
                }
                spin_unlock(&hb->lock);
        }
        if (destroy)
                list_del(&pool->pool_list);
}

/*
 * A tmem_obj contains a radix-tree-like tree in which the intermediate
 * nodes are called tmem_objnodes.  (The kernel lib/radix-tree.c implementation
 * is very specialized and tuned for specific uses and is not particularly
 * suited for use from this code, though some code from the core algorithms has
 * been reused, thus the copyright notices below).  Each tmem_objnode contains
 * a set of pointers which point to either a set of intermediate tmem_objnodes
 * or a set of pampds.
 *
 * Portions Copyright (C) 2001 Momchil Velikov
 * Portions Copyright (C) 2001 Christoph Hellwig
 * Portions Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
 */

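/*
 * Illustrative sketch (not part of the build): how a 32-bit page index is
 * consumed by the tree walks below.  Each level of the tree uses
 * OBJNODE_TREE_MAP_SHIFT bits of the index as a slot offset, most
 * significant chunk first, exactly as __tmem_pampd_lookup_in_obj and
 * tmem_pampd_add_to_obj do.
 */
#if 0
static void tmem_index_decompose_example(uint32_t index, unsigned int height)
{
        unsigned int shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT;

        while (height > 0) {
                unsigned int offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;

                pr_debug("level %u -> slot %u\n", height, offset);
                shift -= OBJNODE_TREE_MAP_SHIFT;
                height--;
        }
}
#endif
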
struct tmem_objnode_tree_path {
        struct tmem_objnode *objnode;
        int offset;
};

/* objnode height_to_maxindex translation */
static unsigned long tmem_objnode_tree_h2max[OBJNODE_TREE_MAX_PATH + 1];

static void tmem_objnode_tree_init(void)
{
        unsigned int ht, tmp;

        for (ht = 0; ht < ARRAY_SIZE(tmem_objnode_tree_h2max); ht++) {
                tmp = ht * OBJNODE_TREE_MAP_SHIFT;
                if (tmp >= OBJNODE_TREE_INDEX_BITS)
                        tmem_objnode_tree_h2max[ht] = ~0UL;
                else
                        tmem_objnode_tree_h2max[ht] =
                            (~0UL >> (OBJNODE_TREE_INDEX_BITS - tmp - 1)) >> 1;
        }
}

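/*
 * Worked example (assuming OBJNODE_TREE_MAP_SHIFT is 6 and unsigned long is
 * 64 bits wide; both values actually come from tmem.h and the architecture):
 * a tree of height h can address indexes 0 .. 2^(6*h) - 1, so the table
 * computed above becomes h2max[0] = 0, h2max[1] = 63, h2max[2] = 4095, and
 * so on, until the full index width is covered and the entries saturate
 * at ~0UL.
 */
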
static struct tmem_objnode *tmem_objnode_alloc(struct tmem_obj *obj)
{
        struct tmem_objnode *objnode;

        ASSERT_SENTINEL(obj, OBJ);
        BUG_ON(obj->pool == NULL);
        ASSERT_SENTINEL(obj->pool, POOL);
        objnode = (*tmem_hostops.objnode_alloc)(obj->pool);
        if (unlikely(objnode == NULL))
                goto out;
        objnode->obj = obj;
        SET_SENTINEL(objnode, OBJNODE);
        memset(&objnode->slots, 0, sizeof(objnode->slots));
        objnode->slots_in_use = 0;
        obj->objnode_count++;
out:
        return objnode;
}

static void tmem_objnode_free(struct tmem_objnode *objnode)
{
        struct tmem_pool *pool;
        int i;

        BUG_ON(objnode == NULL);
        for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++)
                BUG_ON(objnode->slots[i] != NULL);
        ASSERT_SENTINEL(objnode, OBJNODE);
        INVERT_SENTINEL(objnode, OBJNODE);
        BUG_ON(objnode->obj == NULL);
        ASSERT_SENTINEL(objnode->obj, OBJ);
        pool = objnode->obj->pool;
        BUG_ON(pool == NULL);
        ASSERT_SENTINEL(pool, POOL);
        objnode->obj->objnode_count--;
        objnode->obj = NULL;
        (*tmem_hostops.objnode_free)(objnode, pool);
}

/*
 * Lookup index in object and return associated pampd (or NULL if not found).
 */
static void **__tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
{
        unsigned int height, shift;
        struct tmem_objnode **slot = NULL;

        BUG_ON(obj == NULL);
        ASSERT_SENTINEL(obj, OBJ);
        BUG_ON(obj->pool == NULL);
        ASSERT_SENTINEL(obj->pool, POOL);

        height = obj->objnode_tree_height;
        if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height])
                goto out;
        if (height == 0 && obj->objnode_tree_root) {
                slot = &obj->objnode_tree_root;
                goto out;
        }
        shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
        slot = &obj->objnode_tree_root;
        while (height > 0) {
                if (*slot == NULL)
                        goto out;
                slot = (struct tmem_objnode **)
                        ((*slot)->slots +
                         ((index >> shift) & OBJNODE_TREE_MAP_MASK));
                shift -= OBJNODE_TREE_MAP_SHIFT;
                height--;
        }
out:
        return slot != NULL ? (void **)slot : NULL;
}

static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
{
        struct tmem_objnode **slot;

        slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
        return slot != NULL ? *slot : NULL;
}

#ifdef CONFIG_RAMSTER
static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index,
                                        void *new_pampd, bool no_free)
{
        struct tmem_objnode **slot;
        void *ret = NULL;

        slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
        if ((slot != NULL) && (*slot != NULL)) {
                void *old_pampd = *(void **)slot;
                *(void **)slot = new_pampd;
                if (!no_free)
                        (*tmem_pamops.free)(old_pampd, obj->pool,
                                                NULL, 0, false);
                ret = new_pampd;
        }
        return ret;
}
#endif

static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index,
                                        void *pampd)
{
        int ret = 0;
        struct tmem_objnode *objnode = NULL, *newnode, *slot;
        unsigned int height, shift;
        int offset = 0;

        /* if necessary, extend the tree to be higher */
        if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) {
                height = obj->objnode_tree_height + 1;
                if (index > tmem_objnode_tree_h2max[height])
                        while (index > tmem_objnode_tree_h2max[height])
                                height++;
                if (obj->objnode_tree_root == NULL) {
                        obj->objnode_tree_height = height;
                        goto insert;
                }
                do {
                        newnode = tmem_objnode_alloc(obj);
                        if (!newnode) {
                                ret = -ENOMEM;
                                goto out;
                        }
                        newnode->slots[0] = obj->objnode_tree_root;
                        newnode->slots_in_use = 1;
                        obj->objnode_tree_root = newnode;
                        obj->objnode_tree_height++;
                } while (height > obj->objnode_tree_height);
        }
insert:
        slot = obj->objnode_tree_root;
        height = obj->objnode_tree_height;
        shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
        while (height > 0) {
                if (slot == NULL) {
                        /* add a child objnode */
                        slot = tmem_objnode_alloc(obj);
                        if (!slot) {
                                ret = -ENOMEM;
                                goto out;
                        }
                        if (objnode) {
                                objnode->slots[offset] = slot;
                                objnode->slots_in_use++;
                        } else
                                obj->objnode_tree_root = slot;
                }
                /* go down a level */
                offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
                objnode = slot;
                slot = objnode->slots[offset];
                shift -= OBJNODE_TREE_MAP_SHIFT;
                height--;
        }
        BUG_ON(slot != NULL);
        if (objnode) {
                objnode->slots_in_use++;
                objnode->slots[offset] = pampd;
        } else
                obj->objnode_tree_root = pampd;
        obj->pampd_count++;
out:
        return ret;
}

static void *tmem_pampd_delete_from_obj(struct tmem_obj *obj, uint32_t index)
{
        struct tmem_objnode_tree_path path[OBJNODE_TREE_MAX_PATH + 1];
        struct tmem_objnode_tree_path *pathp = path;
        struct tmem_objnode *slot = NULL;
        unsigned int height, shift;
        int offset;

        BUG_ON(obj == NULL);
        ASSERT_SENTINEL(obj, OBJ);
        BUG_ON(obj->pool == NULL);
        ASSERT_SENTINEL(obj->pool, POOL);
        height = obj->objnode_tree_height;
        if (index > tmem_objnode_tree_h2max[height])
                goto out;
        slot = obj->objnode_tree_root;
        if (height == 0 && obj->objnode_tree_root) {
                obj->objnode_tree_root = NULL;
                goto out;
        }
        shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT;
        pathp->objnode = NULL;
        do {
                if (slot == NULL)
                        goto out;
                pathp++;
                offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
                pathp->offset = offset;
                pathp->objnode = slot;
                slot = slot->slots[offset];
                shift -= OBJNODE_TREE_MAP_SHIFT;
                height--;
        } while (height > 0);
        if (slot == NULL)
                goto out;
        while (pathp->objnode) {
                pathp->objnode->slots[pathp->offset] = NULL;
                pathp->objnode->slots_in_use--;
                if (pathp->objnode->slots_in_use) {
                        if (pathp->objnode == obj->objnode_tree_root) {
                                while (obj->objnode_tree_height > 0 &&
                                  obj->objnode_tree_root->slots_in_use == 1 &&
                                  obj->objnode_tree_root->slots[0]) {
                                        struct tmem_objnode *to_free =
                                                obj->objnode_tree_root;

                                        obj->objnode_tree_root =
                                                        to_free->slots[0];
                                        obj->objnode_tree_height--;
                                        to_free->slots[0] = NULL;
                                        to_free->slots_in_use = 0;
                                        tmem_objnode_free(to_free);
                                }
                        }
                        goto out;
                }
                tmem_objnode_free(pathp->objnode); /* 0 slots used, free it */
                pathp--;
        }
        obj->objnode_tree_height = 0;
        obj->objnode_tree_root = NULL;

out:
        if (slot != NULL)
                obj->pampd_count--;
        BUG_ON(obj->pampd_count < 0);
        return slot;
}

/* Recursively walk the objnode_tree destroying pampds and objnodes. */
static void tmem_objnode_node_destroy(struct tmem_obj *obj,
                                        struct tmem_objnode *objnode,
                                        unsigned int ht)
{
        int i;

        if (ht == 0)
                return;
        for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) {
                if (objnode->slots[i]) {
                        if (ht == 1) {
                                obj->pampd_count--;
                                (*tmem_pamops.free)(objnode->slots[i],
                                                obj->pool, NULL, 0, true);
                                objnode->slots[i] = NULL;
                                continue;
                        }
                        tmem_objnode_node_destroy(obj, objnode->slots[i], ht-1);
                        tmem_objnode_free(objnode->slots[i]);
                        objnode->slots[i] = NULL;
                }
        }
}

static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj,
                                                bool pool_destroy)
{
        if (obj->objnode_tree_root == NULL)
                return;
        if (obj->objnode_tree_height == 0) {
                obj->pampd_count--;
                (*tmem_pamops.free)(obj->objnode_tree_root,
                                        obj->pool, NULL, 0, true);
        } else {
                tmem_objnode_node_destroy(obj, obj->objnode_tree_root,
                                        obj->objnode_tree_height);
                tmem_objnode_free(obj->objnode_tree_root);
                obj->objnode_tree_height = 0;
        }
        obj->objnode_tree_root = NULL;
#ifdef CONFIG_RAMSTER
        if (tmem_pamops.free_obj != NULL)
                (*tmem_pamops.free_obj)(obj->pool, obj, pool_destroy);
#endif
}

/*
 * Tmem is operated on by a set of well-defined actions:
 * "put", "get", "flush", "flush_object", "new pool" and "destroy pool".
 * (The tmem ABI allows for subpages and exchanges but these operations
 * are not included in this implementation.)
 *
 * These "tmem core" operations are implemented in the following functions.
 */

/*
 * "Put" a page, e.g. associate the passed pampd with the passed handle.
 * Tmem_put is complicated by a corner case: What if a page with matching
 * handle already exists in tmem?  To guarantee coherency, one of two
 * actions is necessary: Either the data for the page must be overwritten,
 * or the page must be "flushed" so that the data is not accessible to a
 * subsequent "get".  Since these "duplicate puts" are relatively rare,
 * this implementation always flushes for simplicity.
 */
int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
                bool raw, void *pampd_to_use)
{
        struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL;
        void *pampd = NULL, *pampd_del = NULL;
        int ret = -ENOMEM;
        struct tmem_hashbucket *hb;

        hb = &pool->hashbucket[tmem_oid_hash(oidp)];
        spin_lock(&hb->lock);
        obj = objfound = tmem_obj_find(hb, oidp);
        if (obj != NULL) {
                pampd = tmem_pampd_lookup_in_obj(objfound, index);
                if (pampd != NULL) {
                        /* if found, is a dup put, flush the old one */
                        pampd_del = tmem_pampd_delete_from_obj(obj, index);
                        BUG_ON(pampd_del != pampd);
                        (*tmem_pamops.free)(pampd, pool, oidp, index, true);
                        if (obj->pampd_count == 0) {
                                objnew = obj;
                                objfound = NULL;
                        }
                        pampd = NULL;
                }
        } else {
                obj = objnew = (*tmem_hostops.obj_alloc)(pool);
                if (unlikely(obj == NULL)) {
                        ret = -ENOMEM;
                        goto out;
                }
                tmem_obj_init(obj, hb, pool, oidp);
        }
        BUG_ON(obj == NULL);
        BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound));
        pampd = pampd_to_use;
        BUG_ON(pampd_to_use == NULL);
        ret = tmem_pampd_add_to_obj(obj, index, pampd);
        if (unlikely(ret == -ENOMEM))
                /* may have partially built objnode tree ("stump") */
                goto delete_and_free;
        (*tmem_pamops.create_finish)(pampd, is_ephemeral(pool));
        goto out;

delete_and_free:
        (void)tmem_pampd_delete_from_obj(obj, index);
        if (pampd)
                (*tmem_pamops.free)(pampd, pool, NULL, 0, true);
        if (objnew) {
                tmem_obj_free(objnew, hb);
                (*tmem_hostops.obj_free)(objnew, pool);
        }
out:
        spin_unlock(&hb->lock);
        return ret;
}
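
/*
 * Illustrative sketch (not part of the build): the put path as seen from a
 * hypothetical frontend.  The my_pamops_create() helper named here is a
 * placeholder for whatever the host's registered pamops actually provide.
 */
#if 0
static int frontend_put_example(struct tmem_pool *pool, struct tmem_oid *oidp,
                                uint32_t index, struct page *page)
{
        /* host builds a pampd for the page via its registered pamops... */
        void *pampd = my_pamops_create(page, pool, oidp, index);

        if (pampd == NULL)
                return -ENOMEM;
        /*
         * ...then hands it to tmem.  A second put to the same handle first
         * flushes the old pampd, per the comment above tmem_put().
         */
        return tmem_put(pool, oidp, index, false, pampd);
}
#endif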

#ifdef CONFIG_RAMSTER
/*
 * For ramster only:  The following routines provide a two-step sequence
 * to allow the caller to replace a pampd in the tmem data structures with
 * another pampd. Here, we look up the passed handle and, if found, return the
 * associated pampd and object, leaving the hashbucket locked and returning
 * a reference to it.  The caller is expected to immediately call the
 * matching tmem_localify_finish routine, which handles the replacement
 * and unlocks the hashbucket.
 */
void *tmem_localify_get_pampd(struct tmem_pool *pool, struct tmem_oid *oidp,
                                uint32_t index, struct tmem_obj **ret_obj,
                                void **saved_hb)
{
        struct tmem_hashbucket *hb;
        struct tmem_obj *obj = NULL;
        void *pampd = NULL;

        hb = &pool->hashbucket[tmem_oid_hash(oidp)];
        spin_lock(&hb->lock);
        obj = tmem_obj_find(hb, oidp);
        if (likely(obj != NULL))
                pampd = tmem_pampd_lookup_in_obj(obj, index);
        *ret_obj = obj;
        *saved_hb = (void *)hb;
        /* note, hashbucket remains locked */
        return pampd;
}
EXPORT_SYMBOL_GPL(tmem_localify_get_pampd);

void tmem_localify_finish(struct tmem_obj *obj, uint32_t index,
                          void *pampd, void *saved_hb, bool delete)
{
        struct tmem_hashbucket *hb = (struct tmem_hashbucket *)saved_hb;

        BUG_ON(!spin_is_locked(&hb->lock));
        if (pampd != NULL) {
                BUG_ON(obj == NULL);
                (void)tmem_pampd_replace_in_obj(obj, index, pampd, 1);
                (*tmem_pamops.create_finish)(pampd, is_ephemeral(obj->pool));
        } else if (delete) {
                BUG_ON(obj == NULL);
                (void)tmem_pampd_delete_from_obj(obj, index);
        }
        spin_unlock(&hb->lock);
}
EXPORT_SYMBOL_GPL(tmem_localify_finish);

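/*
 * Illustrative sketch (not part of the build): one plausible pairing of the
 * two calls above.  The local_pampd argument is a placeholder; ramster's
 * real callers build it from data received over the network, and their
 * decision of when to replace or delete is more involved than shown here.
 */
#if 0
static void localify_example(struct tmem_pool *pool, struct tmem_oid *oidp,
                             uint32_t index, void *local_pampd)
{
        struct tmem_obj *obj;
        void *saved_hb;
        void *remote_pampd;

        /* step 1: look up the handle; the hashbucket lock is left held */
        remote_pampd = tmem_localify_get_pampd(pool, oidp, index,
                                               &obj, &saved_hb);
        /*
         * step 2: always finish, so the bucket lock is dropped; passing a
         * NULL pampd with delete=false makes the finish a no-op replacement.
         */
        if (remote_pampd != NULL)
                tmem_localify_finish(obj, index, local_pampd, saved_hb, false);
        else
                tmem_localify_finish(obj, index, NULL, saved_hb, false);
}
#endif
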
/*
 * For ramster only.  Helper function to support asynchronous tmem_get.
 */
static int tmem_repatriate(void **ppampd, struct tmem_hashbucket *hb,
                                struct tmem_pool *pool, struct tmem_oid *oidp,
                                uint32_t index, bool free, char *data)
{
        void *old_pampd = *ppampd, *new_pampd = NULL;
        bool intransit = false;
        int ret = 0;

        if (!is_ephemeral(pool))
                new_pampd = (*tmem_pamops.repatriate_preload)(
                                old_pampd, pool, oidp, index, &intransit);
        if (intransit)
                ret = -EAGAIN;
        else if (new_pampd != NULL)
                *ppampd = new_pampd;
        /* must release the hb->lock else repatriate can't sleep */
        spin_unlock(&hb->lock);
        if (!intransit)
                ret = (*tmem_pamops.repatriate)(old_pampd, new_pampd, pool,
                                                oidp, index, free, data);
        if (ret == -EAGAIN) {
                /* rare I think, but should cond_resched()??? */
                usleep_range(10, 1000);
        } else if (ret == -ENOTCONN || ret == -EHOSTDOWN) {
                ret = -1;
        } else if (ret != 0 && ret != -ENOENT) {
                ret = -1;
        }
        /* note hb->lock has now been unlocked */
        return ret;
}

/*
 * For ramster only.  If a page in tmem matches the handle, replace the
 * page so that any subsequent "get" gets the new page.  Returns 0 if
 * there was a page to replace, else returns -1.
 */
int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp,
                        uint32_t index, void *new_pampd)
{
        struct tmem_obj *obj;
        int ret = -1;
        struct tmem_hashbucket *hb;

        hb = &pool->hashbucket[tmem_oid_hash(oidp)];
        spin_lock(&hb->lock);
        obj = tmem_obj_find(hb, oidp);
        if (obj == NULL)
                goto out;
        new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd, 0);
        /* if we bug here, pamops wasn't properly set up for ramster */
        BUG_ON(tmem_pamops.replace_in_obj == NULL);
        ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj);
out:
        spin_unlock(&hb->lock);
        return ret;
}
EXPORT_SYMBOL_GPL(tmem_replace);
#endif

/*
 * "Get" a page, e.g. if a pampd can be found matching the passed handle,
 * use a pamops callback to recreate the page from the pampd with the
 * matching handle.  By tmem definition, when a "get" is successful on
 * an ephemeral page, the page is "flushed", and when a "get" is successful
 * on a persistent page, the page is retained in tmem.  Note that to preserve
 * coherency, "get" can never be skipped if tmem contains the data.
 * That is, if a get is done with a certain handle and fails, any
 * subsequent "get" must also fail (unless of course there is a
 * "put" done with the same handle).
 */
int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
                char *data, size_t *sizep, bool raw, int get_and_free)
{
        struct tmem_obj *obj;
        void *pampd = NULL;
        bool ephemeral = is_ephemeral(pool);
        int ret = -1;
        struct tmem_hashbucket *hb;
        bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral);
        bool lock_held = false;
        void **ppampd;

        do {
                hb = &pool->hashbucket[tmem_oid_hash(oidp)];
                spin_lock(&hb->lock);
                lock_held = true;
                obj = tmem_obj_find(hb, oidp);
                if (obj == NULL)
                        goto out;
                ppampd = __tmem_pampd_lookup_in_obj(obj, index);
                if (ppampd == NULL)
                        goto out;
#ifdef CONFIG_RAMSTER
                if ((tmem_pamops.is_remote != NULL) &&
                     tmem_pamops.is_remote(*ppampd)) {
                        ret = tmem_repatriate(ppampd, hb, pool, oidp,
                                                index, free, data);
                        /* tmem_repatriate releases hb->lock */
                        lock_held = false;
                        *sizep = PAGE_SIZE;
                        if (ret != -EAGAIN)
                                goto out;
                }
#endif
        } while (ret == -EAGAIN);
        if (free)
                pampd = tmem_pampd_delete_from_obj(obj, index);
        else
                pampd = tmem_pampd_lookup_in_obj(obj, index);
        if (pampd == NULL)
                goto out;
        if (free) {
                if (obj->pampd_count == 0) {
                        tmem_obj_free(obj, hb);
                        (*tmem_hostops.obj_free)(obj, pool);
                        obj = NULL;
                }
        }
        if (free)
                ret = (*tmem_pamops.get_data_and_free)(
                                data, sizep, raw, pampd, pool, oidp, index);
        else
                ret = (*tmem_pamops.get_data)(
                                data, sizep, raw, pampd, pool, oidp, index);
        if (ret < 0)
                goto out;
        ret = 0;
out:
        if (lock_held)
                spin_unlock(&hb->lock);
        return ret;
}

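/*
 * Illustrative sketch (not part of the build): how a hypothetical frontend
 * might drive tmem_get.  The get_and_free encoding follows the code above:
 * 1 always removes the pampd, 0 removes it only for ephemeral pools, and
 * any other value leaves it in place.  Using page_address() here assumes a
 * lowmem page; it is only a sketch.
 */
#if 0
static int frontend_get_example(struct tmem_pool *pool, struct tmem_oid *oidp,
                                uint32_t index, struct page *page)
{
        size_t size = PAGE_SIZE;

        return tmem_get(pool, oidp, index, page_address(page), &size,
                        false, 0 /* free iff the pool is ephemeral */);
}
#endif
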
/*
 * If a page in tmem matches the handle, "flush" this page from tmem such
 * that any subsequent "get" does not succeed (unless, of course, there
 * was another "put" with the same handle).
 */
int tmem_flush_page(struct tmem_pool *pool,
                                struct tmem_oid *oidp, uint32_t index)
{
        struct tmem_obj *obj;
        void *pampd;
        int ret = -1;
        struct tmem_hashbucket *hb;

        hb = &pool->hashbucket[tmem_oid_hash(oidp)];
        spin_lock(&hb->lock);
        obj = tmem_obj_find(hb, oidp);
        if (obj == NULL)
                goto out;
        pampd = tmem_pampd_delete_from_obj(obj, index);
        if (pampd == NULL)
                goto out;
        (*tmem_pamops.free)(pampd, pool, oidp, index, true);
        if (obj->pampd_count == 0) {
                tmem_obj_free(obj, hb);
                (*tmem_hostops.obj_free)(obj, pool);
        }
        ret = 0;

out:
        spin_unlock(&hb->lock);
        return ret;
}

/*
 * "Flush" all pages in tmem matching this oid.
 */
int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp)
{
        struct tmem_obj *obj;
        struct tmem_hashbucket *hb;
        int ret = -1;

        hb = &pool->hashbucket[tmem_oid_hash(oidp)];
        spin_lock(&hb->lock);
        obj = tmem_obj_find(hb, oidp);
        if (obj == NULL)
                goto out;
        tmem_pampd_destroy_all_in_obj(obj, false);
        tmem_obj_free(obj, hb);
        (*tmem_hostops.obj_free)(obj, pool);
        ret = 0;

out:
        spin_unlock(&hb->lock);
        return ret;
}

/*
 * "Flush" all pages (and tmem_objs) from this tmem_pool and disable
 * all subsequent access to this tmem_pool.
 */
int tmem_destroy_pool(struct tmem_pool *pool)
{
        int ret = -1;

        if (pool == NULL)
                goto out;
        tmem_pool_flush(pool, 1);
        ret = 0;
out:
        return ret;
}

static LIST_HEAD(tmem_global_pool_list);

/*
 * Initialize a new tmem_pool with the provided flags; the pool id itself
 * is chosen and tracked by the tmem host implementation.
 */
void tmem_new_pool(struct tmem_pool *pool, uint32_t flags)
{
        int persistent = flags & TMEM_POOL_PERSIST;
        int shared = flags & TMEM_POOL_SHARED;
        struct tmem_hashbucket *hb = &pool->hashbucket[0];
        int i;

        for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
                hb->obj_rb_root = RB_ROOT;
                spin_lock_init(&hb->lock);
        }
        INIT_LIST_HEAD(&pool->pool_list);
        atomic_set(&pool->obj_count, 0);
        SET_SENTINEL(pool, POOL);
        list_add_tail(&pool->pool_list, &tmem_global_pool_list);
        pool->persistent = persistent;
        pool->shared = shared;
}