linux/net/rds/ib_rdma.c
/*
 * Copyright (c) 2006 Oracle.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/rculist.h>

#include "rds.h"
#include "ib.h"
#include "xlist.h"

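/*
 * clean_list_grace provides a lockless grace period for the pool's
 * clean_list: a CPU sets its CLEAN_LIST_BUSY_BIT while it is popping an
 * entry in rds_ib_reuse_fmr(), and wait_clean_list_grace() spins until
 * every CPU has cleared its bit before recycled MRs are put back onto
 * the clean list.
 */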
static DEFINE_PER_CPU(unsigned long, clean_list_grace);
#define CLEAN_LIST_BUSY_BIT 0

/*
 * This is stored as mr->r_trans_private.
 */
struct rds_ib_mr {
        struct rds_ib_device    *device;
        struct rds_ib_mr_pool   *pool;
        struct ib_fmr           *fmr;

        struct xlist_head       xlist;

        /* unmap_list is for freeing */
        struct list_head        unmap_list;
        unsigned int            remap_count;

        struct scatterlist      *sg;
        unsigned int            sg_len;
        u64                     *dma;
        int                     sg_dma_len;
};

/*
 * Our own little FMR pool
 */
struct rds_ib_mr_pool {
        struct mutex            flush_lock;             /* serialize fmr invalidate */
        struct delayed_work     flush_worker;           /* flush worker */

        atomic_t                item_count;             /* total # of MRs */
        atomic_t                dirty_count;            /* # of dirty MRs */

        struct xlist_head       drop_list;              /* MRs that have reached their max_maps limit */
        struct xlist_head       free_list;              /* unused MRs */
        struct xlist_head       clean_list;             /* global unused & unmapped MRs */
        wait_queue_head_t       flush_wait;

        atomic_t                free_pinned;            /* memory pinned by free MRs */
        unsigned long           max_items;
        unsigned long           max_items_soft;
        unsigned long           max_free_pinned;
        struct ib_fmr_attr      fmr_attr;
};

static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **);
static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr);
static void rds_ib_mr_pool_flush_worker(struct work_struct *work);

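/*
 * Look up the rds_ib_device that owns the given IP address and take a
 * reference on it.  Returns NULL if no device has that address.
 */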
static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
{
        struct rds_ib_device *rds_ibdev;
        struct rds_ib_ipaddr *i_ipaddr;

        rcu_read_lock();
        list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) {
                list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
                        if (i_ipaddr->ipaddr == ipaddr) {
                                atomic_inc(&rds_ibdev->refcount);
                                rcu_read_unlock();
                                return rds_ibdev;
                        }
                }
        }
        rcu_read_unlock();

        return NULL;
}

static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
{
        struct rds_ib_ipaddr *i_ipaddr;

        i_ipaddr = kmalloc(sizeof *i_ipaddr, GFP_KERNEL);
        if (!i_ipaddr)
                return -ENOMEM;

        i_ipaddr->ipaddr = ipaddr;

        spin_lock_irq(&rds_ibdev->spinlock);
        list_add_tail_rcu(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
        spin_unlock_irq(&rds_ibdev->spinlock);

        return 0;
}

static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
{
        struct rds_ib_ipaddr *i_ipaddr;
        struct rds_ib_ipaddr *to_free = NULL;

        spin_lock_irq(&rds_ibdev->spinlock);
        list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
                if (i_ipaddr->ipaddr == ipaddr) {
                        list_del_rcu(&i_ipaddr->list);
                        to_free = i_ipaddr;
                        break;
                }
        }
        spin_unlock_irq(&rds_ibdev->spinlock);

        if (to_free) {
                synchronize_rcu();
                kfree(to_free);
        }
}

int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
{
        struct rds_ib_device *rds_ibdev_old;

        rds_ibdev_old = rds_ib_get_device(ipaddr);
        if (rds_ibdev_old) {
                rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr);
                rds_ib_dev_put(rds_ibdev_old);
        }

        return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
}

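/*
 * Move a connection off the global nodev list and onto the list of the
 * IB device it is now bound to, taking a reference on that device.
 */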
void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
{
        struct rds_ib_connection *ic = conn->c_transport_data;

        /* conn was previously on the nodev_conns_list */
        spin_lock_irq(&ib_nodev_conns_lock);
        BUG_ON(list_empty(&ib_nodev_conns));
        BUG_ON(list_empty(&ic->ib_node));
        list_del(&ic->ib_node);

        spin_lock(&rds_ibdev->spinlock);
        list_add_tail(&ic->ib_node, &rds_ibdev->conn_list);
        spin_unlock(&rds_ibdev->spinlock);
        spin_unlock_irq(&ib_nodev_conns_lock);

        ic->rds_ibdev = rds_ibdev;
        atomic_inc(&rds_ibdev->refcount);
}

void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
{
        struct rds_ib_connection *ic = conn->c_transport_data;

        /* place conn on nodev_conns_list */
        spin_lock(&ib_nodev_conns_lock);

        spin_lock_irq(&rds_ibdev->spinlock);
        BUG_ON(list_empty(&ic->ib_node));
        list_del(&ic->ib_node);
        spin_unlock_irq(&rds_ibdev->spinlock);

        list_add_tail(&ic->ib_node, &ib_nodev_conns);

        spin_unlock(&ib_nodev_conns_lock);

        ic->rds_ibdev = NULL;
        rds_ib_dev_put(rds_ibdev);
}

void rds_ib_destroy_nodev_conns(void)
{
        struct rds_ib_connection *ic, *_ic;
        LIST_HEAD(tmp_list);

        /* avoid calling conn_destroy with irqs off */
        spin_lock_irq(&ib_nodev_conns_lock);
        list_splice(&ib_nodev_conns, &tmp_list);
        spin_unlock_irq(&ib_nodev_conns_lock);

        list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node)
                rds_conn_destroy(ic->conn);
}

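/*
 * Allocate and initialize the per-device FMR pool.  The pool starts
 * empty; MRs are created lazily in rds_ib_alloc_fmr() up to max_items.
 */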
struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev)
{
        struct rds_ib_mr_pool *pool;

        pool = kzalloc(sizeof(*pool), GFP_KERNEL);
        if (!pool)
                return ERR_PTR(-ENOMEM);

        INIT_XLIST_HEAD(&pool->free_list);
        INIT_XLIST_HEAD(&pool->drop_list);
        INIT_XLIST_HEAD(&pool->clean_list);
        mutex_init(&pool->flush_lock);
        init_waitqueue_head(&pool->flush_wait);
        INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);

        pool->fmr_attr.max_pages = fmr_message_size;
        pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
        pool->fmr_attr.page_shift = PAGE_SHIFT;
        pool->max_free_pinned = rds_ibdev->max_fmrs * fmr_message_size / 4;

        /* We never allow more than max_items MRs to be allocated.
         * Once we exceed max_items_soft, we start freeing
         * items more aggressively.
         * Make sure that max_items > max_items_soft > max_items / 2
         */
        pool->max_items_soft = rds_ibdev->max_fmrs * 3 / 4;
        pool->max_items = rds_ibdev->max_fmrs;

        return pool;
}

void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo)
{
        struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;

        iinfo->rdma_mr_max = pool->max_items;
        iinfo->rdma_mr_size = pool->fmr_attr.max_pages;
}

void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
{
        cancel_delayed_work_sync(&pool->flush_worker);
        rds_ib_flush_mr_pool(pool, 1, NULL);
        WARN_ON(atomic_read(&pool->item_count));
        WARN_ON(atomic_read(&pool->free_pinned));
        kfree(pool);
}

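/*
 * Pop one MR off the private xlist built by the flush path and hand it
 * directly to the caller that is waiting for an MR.
 */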
static void refill_local(struct rds_ib_mr_pool *pool, struct xlist_head *xl,
                         struct rds_ib_mr **ibmr_ret)
{
        struct xlist_head *ibmr_xl;
        ibmr_xl = xlist_del_head_fast(xl);
        *ibmr_ret = list_entry(ibmr_xl, struct rds_ib_mr, xlist);
}

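/*
 * Try to grab an already-unmapped MR off the clean list without taking
 * any locks.  The per-cpu CLEAN_LIST_BUSY_BIT marks this CPU as being
 * inside xlist_del_head() so the flush path will not recycle entries
 * onto the clean list underneath us.
 */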
static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool)
{
        struct rds_ib_mr *ibmr = NULL;
        struct xlist_head *ret;
        unsigned long *flag;

        preempt_disable();
        flag = &__get_cpu_var(clean_list_grace);
        set_bit(CLEAN_LIST_BUSY_BIT, flag);
        ret = xlist_del_head(&pool->clean_list);
        if (ret)
                ibmr = list_entry(ret, struct rds_ib_mr, xlist);

        clear_bit(CLEAN_LIST_BUSY_BIT, flag);
        preempt_enable();
        return ibmr;
}

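/*
 * Spin until no CPU has its CLEAN_LIST_BUSY_BIT set, i.e. nobody is
 * still walking the clean list in rds_ib_reuse_fmr().
 */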
static inline void wait_clean_list_grace(void)
{
        int cpu;
        unsigned long *flag;

        for_each_online_cpu(cpu) {
                flag = &per_cpu(clean_list_grace, cpu);
                while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
                        cpu_relax();
        }
}

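/*
 * Get an MR for use by a connection: reuse a clean one if possible,
 * otherwise allocate a fresh FMR up to the pool limit, falling back to
 * a synchronous pool flush when the pool is depleted.
 */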
static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
{
        struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
        struct rds_ib_mr *ibmr = NULL;
        int err = 0, iter = 0;

        if (atomic_read(&pool->dirty_count) >= pool->max_items / 10)
                schedule_delayed_work(&pool->flush_worker, 10);

        while (1) {
                ibmr = rds_ib_reuse_fmr(pool);
                if (ibmr)
                        return ibmr;

                /* No clean MRs - now we have the choice of either
                 * allocating a fresh MR up to the limit imposed by the
                 * driver, or flushing any dirty unused MRs.
                 * We try to avoid stalling in the send path if possible,
                 * so we allocate as long as we're allowed to.
                 *
                 * We're fussy with enforcing the FMR limit, though. If the driver
                 * tells us we can't use more than N fmrs, we shouldn't start
                 * arguing with it */
                if (atomic_inc_return(&pool->item_count) <= pool->max_items)
                        break;

                atomic_dec(&pool->item_count);

                if (++iter > 2) {
                        rds_ib_stats_inc(s_ib_rdma_mr_pool_depleted);
                        return ERR_PTR(-EAGAIN);
                }

                /* We do have some empty MRs. Flush them out. */
                rds_ib_stats_inc(s_ib_rdma_mr_pool_wait);
                rds_ib_flush_mr_pool(pool, 0, &ibmr);
                if (ibmr)
                        return ibmr;
        }

        ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev));
        if (!ibmr) {
                err = -ENOMEM;
                goto out_no_cigar;
        }

        ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
                        (IB_ACCESS_LOCAL_WRITE |
                         IB_ACCESS_REMOTE_READ |
                         IB_ACCESS_REMOTE_WRITE |
                         IB_ACCESS_REMOTE_ATOMIC),
                        &pool->fmr_attr);
        if (IS_ERR(ibmr->fmr)) {
                err = PTR_ERR(ibmr->fmr);
                ibmr->fmr = NULL;
                printk(KERN_WARNING "RDS/IB: ib_alloc_fmr failed (err=%d)\n", err);
                goto out_no_cigar;
        }

        rds_ib_stats_inc(s_ib_rdma_mr_alloc);
        return ibmr;

out_no_cigar:
        if (ibmr) {
                if (ibmr->fmr)
                        ib_dealloc_fmr(ibmr->fmr);
                kfree(ibmr);
        }
        atomic_dec(&pool->item_count);
        return ERR_PTR(err);
}

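/*
 * DMA-map the scatterlist and map the resulting pages through the FMR.
 * Only the first entry may start, and only the last entry may end, off
 * a page boundary; otherwise the region cannot be expressed as a flat
 * list of FMR pages and we bail out with -EINVAL.
 */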
static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr,
               struct scatterlist *sg, unsigned int nents)
{
        struct ib_device *dev = rds_ibdev->dev;
        struct scatterlist *scat = sg;
        u64 io_addr = 0;
        u64 *dma_pages = NULL;
        u32 len;
        int page_cnt, sg_dma_len;
        int i, j;
        int ret;

        sg_dma_len = ib_dma_map_sg(dev, sg, nents,
                                 DMA_BIDIRECTIONAL);
        if (unlikely(!sg_dma_len)) {
                printk(KERN_WARNING "RDS/IB: dma_map_sg failed!\n");
                return -EBUSY;
        }

        len = 0;
        page_cnt = 0;

        for (i = 0; i < sg_dma_len; ++i) {
                unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
                u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);

                if (dma_addr & ~PAGE_MASK) {
                        if (i > 0) {
                                ret = -EINVAL;
                                goto out;
                        } else {
                                ++page_cnt;
                        }
                }
                if ((dma_addr + dma_len) & ~PAGE_MASK) {
                        if (i < sg_dma_len - 1) {
                                ret = -EINVAL;
                                goto out;
                        } else {
                                ++page_cnt;
                        }
                }

                len += dma_len;
        }

        page_cnt += len >> PAGE_SHIFT;
        if (page_cnt > fmr_message_size) {
                ret = -EINVAL;
                goto out;
        }

        dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC,
                                 rdsibdev_to_node(rds_ibdev));
        if (!dma_pages) {
                ret = -ENOMEM;
                goto out;
        }

        page_cnt = 0;
        for (i = 0; i < sg_dma_len; ++i) {
                unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
                u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);

                for (j = 0; j < dma_len; j += PAGE_SIZE)
                        dma_pages[page_cnt++] =
                                (dma_addr & PAGE_MASK) + j;
        }

        ret = ib_map_phys_fmr(ibmr->fmr,
                                   dma_pages, page_cnt, io_addr);
        if (ret)
                goto out;

        /* Success - the MR is remapped, so we can safely tear down
         * the old mapping. */
        rds_ib_teardown_mr(ibmr);

        ibmr->sg = scat;
        ibmr->sg_len = nents;
        ibmr->sg_dma_len = sg_dma_len;
        ibmr->remap_count++;

        rds_ib_stats_inc(s_ib_rdma_mr_used);
        ret = 0;

out:
        kfree(dma_pages);
        /* Don't leak the DMA mapping on failure; on success it is owned
         * by the ibmr and released in __rds_ib_teardown_mr(). */
        if (ret)
                ib_dma_unmap_sg(dev, sg, nents, DMA_BIDIRECTIONAL);

        return ret;
}

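/*
 * Sync the MR's pages for CPU or device access, depending on the
 * direction of the pending RDMA transfer.
 */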
void rds_ib_sync_mr(void *trans_private, int direction)
{
        struct rds_ib_mr *ibmr = trans_private;
        struct rds_ib_device *rds_ibdev = ibmr->device;

        switch (direction) {
        case DMA_FROM_DEVICE:
                ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg,
                        ibmr->sg_dma_len, DMA_BIDIRECTIONAL);
                break;
        case DMA_TO_DEVICE:
                ib_dma_sync_sg_for_device(rds_ibdev->dev, ibmr->sg,
                        ibmr->sg_dma_len, DMA_BIDIRECTIONAL);
                break;
        }
}

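/*
 * Undo the DMA mapping and unpin (and dirty) the pages backing an MR.
 * The FMR itself is not touched here; the caller is responsible for
 * unmapping or deallocating it.
 */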
static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
{
        struct rds_ib_device *rds_ibdev = ibmr->device;

        if (ibmr->sg_dma_len) {
                ib_dma_unmap_sg(rds_ibdev->dev,
                                ibmr->sg, ibmr->sg_len,
                                DMA_BIDIRECTIONAL);
                ibmr->sg_dma_len = 0;
        }

        /* Release the s/g list */
        if (ibmr->sg_len) {
                unsigned int i;

                for (i = 0; i < ibmr->sg_len; ++i) {
                        struct page *page = sg_page(&ibmr->sg[i]);

                        /* FIXME we need a way to tell a r/w MR
                         * from a r/o MR */
                        BUG_ON(irqs_disabled());
                        set_page_dirty(page);
                        put_page(page);
                }
                kfree(ibmr->sg);

                ibmr->sg = NULL;
                ibmr->sg_len = 0;
        }
}

static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
{
        unsigned int pinned = ibmr->sg_len;

        __rds_ib_teardown_mr(ibmr);
        if (pinned) {
                struct rds_ib_device *rds_ibdev = ibmr->device;
                struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;

                atomic_sub(pinned, &pool->free_pinned);
        }
}

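/*
 * How many MRs the flush should try to free outright: everything when
 * tearing the pool down (free_all), nothing otherwise - in the normal
 * case MRs are only freed once they hit their remap limit.
 */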
static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int free_all)
{
        unsigned int item_count;

        item_count = atomic_read(&pool->item_count);
        if (free_all)
                return item_count;

        return 0;
}

/*
 * given an xlist of mrs, put them all into the list_head for more processing
 */
static void xlist_append_to_list(struct xlist_head *xlist, struct list_head *list)
{
        struct rds_ib_mr *ibmr;
        struct xlist_head splice;
        struct xlist_head *cur;
        struct xlist_head *next;

        splice.next = NULL;
        xlist_splice(xlist, &splice);
        cur = splice.next;
        while (cur) {
                next = cur->next;
                ibmr = list_entry(cur, struct rds_ib_mr, xlist);
                list_add_tail(&ibmr->unmap_list, list);
                cur = next;
        }
}

/*
 * this takes a list_head of mrs and chains their xlist entries together,
 * returning the tail so the caller can splice the whole chain back onto
 * the clean list in one go.
 */
static void list_append_to_xlist(struct rds_ib_mr_pool *pool,
                                struct list_head *list, struct xlist_head *xlist,
                                struct xlist_head **tail_ret)
{
        struct rds_ib_mr *ibmr;
        struct xlist_head *cur_mr = xlist;
        struct xlist_head *tail_mr = NULL;

        list_for_each_entry(ibmr, list, unmap_list) {
                tail_mr = &ibmr->xlist;
                tail_mr->next = NULL;
                cur_mr->next = tail_mr;
                cur_mr = tail_mr;
        }
        *tail_ret = tail_mr;
}

/*
 * Flush our pool of MRs.
 * At a minimum, all currently unused MRs are unmapped.
 * If the number of MRs allocated exceeds the limit, we also try
 * to free as many MRs as needed to get back to this limit.
 */
static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
                                int free_all, struct rds_ib_mr **ibmr_ret)
{
        struct rds_ib_mr *ibmr, *next;
        struct xlist_head clean_xlist;
        struct xlist_head *clean_tail;
        LIST_HEAD(unmap_list);
        LIST_HEAD(fmr_list);
        unsigned long unpinned = 0;
        unsigned int nfreed = 0, ncleaned = 0, free_goal;
        int ret = 0;

        rds_ib_stats_inc(s_ib_rdma_mr_pool_flush);

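        /* If the caller just needs one MR and someone else already holds
         * flush_lock, don't pile up behind them: keep trying to poach a
         * clean MR, sleeping on flush_wait until the flusher wakes us. */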
        if (ibmr_ret) {
                DEFINE_WAIT(wait);
                while (!mutex_trylock(&pool->flush_lock)) {
                        ibmr = rds_ib_reuse_fmr(pool);
                        if (ibmr) {
                                *ibmr_ret = ibmr;
                                finish_wait(&pool->flush_wait, &wait);
                                goto out_nolock;
                        }

                        prepare_to_wait(&pool->flush_wait, &wait,
                                        TASK_UNINTERRUPTIBLE);
                        if (xlist_empty(&pool->clean_list))
                                schedule();

                        ibmr = rds_ib_reuse_fmr(pool);
                        if (ibmr) {
                                *ibmr_ret = ibmr;
                                finish_wait(&pool->flush_wait, &wait);
                                goto out_nolock;
                        }
                }
                finish_wait(&pool->flush_wait, &wait);
        } else
                mutex_lock(&pool->flush_lock);

        if (ibmr_ret) {
                ibmr = rds_ib_reuse_fmr(pool);
                if (ibmr) {
                        *ibmr_ret = ibmr;
                        goto out;
                }
        }

        /* Get the list of all MRs to be dropped. Ordering matters -
         * we want to put drop_list ahead of free_list.
         */
        xlist_append_to_list(&pool->drop_list, &unmap_list);
        xlist_append_to_list(&pool->free_list, &unmap_list);
        if (free_all)
                xlist_append_to_list(&pool->clean_list, &unmap_list);

        free_goal = rds_ib_flush_goal(pool, free_all);

        if (list_empty(&unmap_list))
                goto out;

        /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
        list_for_each_entry(ibmr, &unmap_list, unmap_list)
                list_add(&ibmr->fmr->list, &fmr_list);

        ret = ib_unmap_fmr(&fmr_list);
        if (ret)
                printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret);

        /* Now we can destroy the DMA mapping and unpin any pages */
        list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) {
                unpinned += ibmr->sg_len;
                __rds_ib_teardown_mr(ibmr);
                if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) {
                        rds_ib_stats_inc(s_ib_rdma_mr_free);
                        list_del(&ibmr->unmap_list);
                        ib_dealloc_fmr(ibmr->fmr);
                        kfree(ibmr);
                        nfreed++;
                }
                ncleaned++;
        }

        if (!list_empty(&unmap_list)) {
                /* we have to make sure that none of the things we're about
                 * to put on the clean list would race with other cpus trying
                 * to pull items off.  The xlist would explode if we managed to
                 * remove something from the clean list and then add it back again
                 * while another CPU was spinning on that same item in xlist_del_head.
                 *
                 * This is pretty unlikely, but just in case wait for an xlist grace period
                 * here before adding anything back into the clean list.
                 */
                wait_clean_list_grace();

                list_append_to_xlist(pool, &unmap_list, &clean_xlist, &clean_tail);
                if (ibmr_ret)
                        refill_local(pool, &clean_xlist, ibmr_ret);

                /* refill_local may have emptied our list */
                if (!xlist_empty(&clean_xlist))
                        xlist_add(clean_xlist.next, clean_tail, &pool->clean_list);

        }

        atomic_sub(unpinned, &pool->free_pinned);
        atomic_sub(ncleaned, &pool->dirty_count);
        atomic_sub(nfreed, &pool->item_count);

out:
        mutex_unlock(&pool->flush_lock);
        if (waitqueue_active(&pool->flush_wait))
                wake_up(&pool->flush_wait);
out_nolock:
        return ret;
}

static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
{
        struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work);

        rds_ib_flush_mr_pool(pool, 0, NULL);
}

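/*
 * Called when the rds_rdma code releases an MR: queue the MR on the
 * pool's drop or free list for lazy unmapping, and kick the flush
 * worker if too much memory is pinned or too many MRs are dirty.
 */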
void rds_ib_free_mr(void *trans_private, int invalidate)
{
        struct rds_ib_mr *ibmr = trans_private;
        struct rds_ib_device *rds_ibdev = ibmr->device;
        struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;

        rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);

        /* Return it to the pool's free list */
        if (ibmr->remap_count >= pool->fmr_attr.max_maps)
                xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->drop_list);
        else
                xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->free_list);

        atomic_add(ibmr->sg_len, &pool->free_pinned);
        atomic_inc(&pool->dirty_count);

        /* If we've pinned too many pages, request a flush */
        if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
            atomic_read(&pool->dirty_count) >= pool->max_items / 10)
                schedule_delayed_work(&pool->flush_worker, 10);

        if (invalidate) {
                if (likely(!in_interrupt())) {
                        rds_ib_flush_mr_pool(pool, 0, NULL);
                } else {
                        /* We get here if the user created an MR marked
                         * as use_once and invalidate at the same time. */
                        schedule_delayed_work(&pool->flush_worker, 10);
                }
        }

        rds_ib_dev_put(rds_ibdev);
}

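/*
 * Flush the MR pool of every IB device we know about.
 */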
void rds_ib_flush_mrs(void)
{
        struct rds_ib_device *rds_ibdev;

        down_read(&rds_ib_devices_lock);
        list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
                struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;

                if (pool)
                        rds_ib_flush_mr_pool(pool, 0, NULL);
        }
        up_read(&rds_ib_devices_lock);
}

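/*
 * Transport entry point for registering an MR on behalf of a socket:
 * find the device for the socket's bound address, allocate an FMR and
 * map the caller's pages, returning the rkey through key_ret.
 */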
void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
                    struct rds_sock *rs, u32 *key_ret)
{
        struct rds_ib_device *rds_ibdev;
        struct rds_ib_mr *ibmr = NULL;
        int ret;

        rds_ibdev = rds_ib_get_device(rs->rs_bound_addr);
        if (!rds_ibdev) {
                ret = -ENODEV;
                goto out;
        }

        if (!rds_ibdev->mr_pool) {
                ret = -ENODEV;
                goto out;
        }

        ibmr = rds_ib_alloc_fmr(rds_ibdev);
        if (IS_ERR(ibmr)) {
                /* Don't leak the device reference taken above */
                rds_ib_dev_put(rds_ibdev);
                return ibmr;
        }

        ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents);
        if (ret == 0)
                *key_ret = ibmr->fmr->rkey;
        else
                printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret);

        ibmr->device = rds_ibdev;
        rds_ibdev = NULL;

 out:
        if (ret) {
                if (ibmr)
                        rds_ib_free_mr(ibmr, 0);
                ibmr = ERR_PTR(ret);
        }
        if (rds_ibdev)
                rds_ib_dev_put(rds_ibdev);
        return ibmr;
}