linux/drivers/infiniband/hw/mlx5/mr.c
   1/*
   2 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
   3 *
   4 * This software is available to you under a choice of one of two
   5 * licenses.  You may choose to be licensed under the terms of the GNU
   6 * General Public License (GPL) Version 2, available from the file
   7 * COPYING in the main directory of this source tree, or the
   8 * OpenIB.org BSD license below:
   9 *
  10 *     Redistribution and use in source and binary forms, with or
  11 *     without modification, are permitted provided that the following
  12 *     conditions are met:
  13 *
  14 *      - Redistributions of source code must retain the above
  15 *        copyright notice, this list of conditions and the following
  16 *        disclaimer.
  17 *
  18 *      - Redistributions in binary form must reproduce the above
  19 *        copyright notice, this list of conditions and the following
  20 *        disclaimer in the documentation and/or other materials
  21 *        provided with the distribution.
  22 *
  23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  30 * SOFTWARE.
  31 */
  32
  33
  34#include <linux/kref.h>
  35#include <linux/random.h>
  36#include <linux/debugfs.h>
  37#include <linux/export.h>
  38#include <linux/delay.h>
  39#include <rdma/ib_umem.h>
  40#include <rdma/ib_umem_odp.h>
  41#include <rdma/ib_verbs.h>
  42#include "mlx5_ib.h"
  43
  44enum {
  45        MAX_PENDING_REG_MR = 8,
  46};
  47
  48#define MLX5_UMR_ALIGN 2048
  49#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
  50static __be64 mlx5_ib_update_mtt_emergency_buffer[
  51                MLX5_UMR_MTT_MIN_CHUNK_SIZE/sizeof(__be64)]
  52        __aligned(MLX5_UMR_ALIGN);
  53static DEFINE_MUTEX(mlx5_ib_update_mtt_emergency_buffer_mutex);
  54#endif
  55
  56static int clean_mr(struct mlx5_ib_mr *mr);
  57
  58static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
  59{
  60        int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
  61
  62#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
  63        /* Wait until all page fault handlers using the mr complete. */
  64        synchronize_srcu(&dev->mr_srcu);
  65#endif
  66
  67        return err;
  68}
  69
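/*
 * Map a page order to its MR cache bucket index; orders below the
 * smallest cached order map to bucket 0.
 */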
  70static int order2idx(struct mlx5_ib_dev *dev, int order)
  71{
  72        struct mlx5_mr_cache *cache = &dev->cache;
  73
  74        if (order < cache->ent[0].order)
  75                return 0;
  76        else
  77                return order - cache->ent[0].order;
  78}
  79
  80static bool use_umr_mtt_update(struct mlx5_ib_mr *mr, u64 start, u64 length)
  81{
  82        return ((u64)1 << mr->order) * MLX5_ADAPTER_PAGE_SIZE >=
  83                length + (start & (MLX5_ADAPTER_PAGE_SIZE - 1));
  84}
  85
  86#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
  87static void update_odp_mr(struct mlx5_ib_mr *mr)
  88{
  89        if (mr->umem->odp_data) {
   90                /*
   91                 * This barrier prevents the compiler from moving the
   92                 * setting of umem->odp_data->private to point to our
   93                 * MR before reg_umr has finished, ensuring that MR
   94                 * initialization has finished before we start
   95                 * handling invalidations.
   96                 */
  97                smp_wmb();
  98                mr->umem->odp_data->private = mr;
   99                /*
  100                 * Make sure the new umem->odp_data->private value is
  101                 * visible in the invalidation routines before page
  102                 * faults can hit the MR. Page faults can happen once
  103                 * we put the MR in the tree, below this line. Without
  104                 * the barrier, a page fault could be handled and an
  105                 * invalidation could run before
  106                 * umem->odp_data->private == mr is visible to the
  107                 * invalidation handler.
  108                 */
 109                smp_wmb();
 110        }
 111}
 112#endif
 113
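/*
 * Completion callback for the asynchronous mkey creation issued by
 * add_keys(): on success, mix in a variable key byte, add the MR to its
 * cache bucket and insert the mkey into the device's mkey radix tree.
 */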
 114static void reg_mr_callback(int status, void *context)
 115{
 116        struct mlx5_ib_mr *mr = context;
 117        struct mlx5_ib_dev *dev = mr->dev;
 118        struct mlx5_mr_cache *cache = &dev->cache;
 119        int c = order2idx(dev, mr->order);
 120        struct mlx5_cache_ent *ent = &cache->ent[c];
 121        u8 key;
 122        unsigned long flags;
 123        struct mlx5_mkey_table *table = &dev->mdev->priv.mkey_table;
 124        int err;
 125
 126        spin_lock_irqsave(&ent->lock, flags);
 127        ent->pending--;
 128        spin_unlock_irqrestore(&ent->lock, flags);
 129        if (status) {
 130                mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
 131                kfree(mr);
 132                dev->fill_delay = 1;
 133                mod_timer(&dev->delay_timer, jiffies + HZ);
 134                return;
 135        }
 136
 137        spin_lock_irqsave(&dev->mdev->priv.mkey_lock, flags);
 138        key = dev->mdev->priv.mkey_key++;
 139        spin_unlock_irqrestore(&dev->mdev->priv.mkey_lock, flags);
 140        mr->mmkey.key = mlx5_idx_to_mkey(MLX5_GET(create_mkey_out, mr->out, mkey_index)) | key;
 141
 142        cache->last_add = jiffies;
 143
 144        spin_lock_irqsave(&ent->lock, flags);
 145        list_add_tail(&mr->list, &ent->head);
 146        ent->cur++;
 147        ent->size++;
 148        spin_unlock_irqrestore(&ent->lock, flags);
 149
 150        write_lock_irqsave(&table->lock, flags);
 151        err = radix_tree_insert(&table->tree, mlx5_base_mkey(mr->mmkey.key),
 152                                &mr->mmkey);
 153        if (err)
 154                pr_err("Error inserting to mkey tree. 0x%x\n", -err);
 155        write_unlock_irqrestore(&table->lock, flags);
 156}
 157
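/*
 * Asynchronously create up to 'num' mkeys for cache entry 'c', bounded
 * by MAX_PENDING_REG_MR outstanding requests per entry; bookkeeping is
 * completed in reg_mr_callback().
 */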
 158static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
 159{
 160        struct mlx5_mr_cache *cache = &dev->cache;
 161        struct mlx5_cache_ent *ent = &cache->ent[c];
 162        int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
 163        struct mlx5_ib_mr *mr;
 164        int npages = 1 << ent->order;
 165        void *mkc;
 166        u32 *in;
 167        int err = 0;
 168        int i;
 169
 170        in = kzalloc(inlen, GFP_KERNEL);
 171        if (!in)
 172                return -ENOMEM;
 173
 174        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 175        for (i = 0; i < num; i++) {
 176                if (ent->pending >= MAX_PENDING_REG_MR) {
 177                        err = -EAGAIN;
 178                        break;
 179                }
 180
 181                mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 182                if (!mr) {
 183                        err = -ENOMEM;
 184                        break;
 185                }
 186                mr->order = ent->order;
 187                mr->umred = 1;
 188                mr->dev = dev;
 189
 190                MLX5_SET(mkc, mkc, free, 1);
 191                MLX5_SET(mkc, mkc, umr_en, 1);
 192                MLX5_SET(mkc, mkc, access_mode, MLX5_MKC_ACCESS_MODE_MTT);
 193
 194                MLX5_SET(mkc, mkc, qpn, 0xffffff);
 195                MLX5_SET(mkc, mkc, translations_octword_size, (npages + 1) / 2);
 196                MLX5_SET(mkc, mkc, log_page_size, 12);
 197
 198                spin_lock_irq(&ent->lock);
 199                ent->pending++;
 200                spin_unlock_irq(&ent->lock);
 201                err = mlx5_core_create_mkey_cb(dev->mdev, &mr->mmkey,
 202                                               in, inlen,
 203                                               mr->out, sizeof(mr->out),
 204                                               reg_mr_callback, mr);
 205                if (err) {
 206                        spin_lock_irq(&ent->lock);
 207                        ent->pending--;
 208                        spin_unlock_irq(&ent->lock);
 209                        mlx5_ib_warn(dev, "create mkey failed %d\n", err);
 210                        kfree(mr);
 211                        break;
 212                }
 213        }
 214
 215        kfree(in);
 216        return err;
 217}
 218
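/* Destroy up to 'num' mkeys taken from the head of cache entry 'c'. */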
 219static void remove_keys(struct mlx5_ib_dev *dev, int c, int num)
 220{
 221        struct mlx5_mr_cache *cache = &dev->cache;
 222        struct mlx5_cache_ent *ent = &cache->ent[c];
 223        struct mlx5_ib_mr *mr;
 224        int err;
 225        int i;
 226
 227        for (i = 0; i < num; i++) {
 228                spin_lock_irq(&ent->lock);
 229                if (list_empty(&ent->head)) {
 230                        spin_unlock_irq(&ent->lock);
 231                        return;
 232                }
 233                mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
 234                list_del(&mr->list);
 235                ent->cur--;
 236                ent->size--;
 237                spin_unlock_irq(&ent->lock);
 238                err = destroy_mkey(dev, mr);
 239                if (err)
 240                        mlx5_ib_warn(dev, "failed destroy mkey\n");
 241                else
 242                        kfree(mr);
 243        }
 244}
 245
 246static ssize_t size_write(struct file *filp, const char __user *buf,
 247                          size_t count, loff_t *pos)
 248{
 249        struct mlx5_cache_ent *ent = filp->private_data;
 250        struct mlx5_ib_dev *dev = ent->dev;
 251        char lbuf[20];
 252        u32 var;
 253        int err;
 254        int c;
 255
 256        if (copy_from_user(lbuf, buf, sizeof(lbuf)))
 257                return -EFAULT;
 258
 259        c = order2idx(dev, ent->order);
 260        lbuf[sizeof(lbuf) - 1] = 0;
 261
 262        if (sscanf(lbuf, "%u", &var) != 1)
 263                return -EINVAL;
 264
 265        if (var < ent->limit)
 266                return -EINVAL;
 267
 268        if (var > ent->size) {
 269                do {
 270                        err = add_keys(dev, c, var - ent->size);
 271                        if (err && err != -EAGAIN)
 272                                return err;
 273
 274                        usleep_range(3000, 5000);
 275                } while (err);
 276        } else if (var < ent->size) {
 277                remove_keys(dev, c, ent->size - var);
 278        }
 279
 280        return count;
 281}
 282
 283static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
 284                         loff_t *pos)
 285{
 286        struct mlx5_cache_ent *ent = filp->private_data;
 287        char lbuf[20];
 288        int err;
 289
 290        if (*pos)
 291                return 0;
 292
 293        err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->size);
 294        if (err < 0)
 295                return err;
 296
 297        if (copy_to_user(buf, lbuf, err))
 298                return -EFAULT;
 299
 300        *pos += err;
 301
 302        return err;
 303}
 304
 305static const struct file_operations size_fops = {
 306        .owner  = THIS_MODULE,
 307        .open   = simple_open,
 308        .write  = size_write,
 309        .read   = size_read,
 310};
 311
 312static ssize_t limit_write(struct file *filp, const char __user *buf,
 313                           size_t count, loff_t *pos)
 314{
 315        struct mlx5_cache_ent *ent = filp->private_data;
 316        struct mlx5_ib_dev *dev = ent->dev;
 317        char lbuf[20];
 318        u32 var;
 319        int err;
 320        int c;
 321
 322        if (copy_from_user(lbuf, buf, sizeof(lbuf)))
 323                return -EFAULT;
 324
 325        c = order2idx(dev, ent->order);
 326        lbuf[sizeof(lbuf) - 1] = 0;
 327
 328        if (sscanf(lbuf, "%u", &var) != 1)
 329                return -EINVAL;
 330
 331        if (var > ent->size)
 332                return -EINVAL;
 333
 334        ent->limit = var;
 335
 336        if (ent->cur < ent->limit) {
 337                err = add_keys(dev, c, 2 * ent->limit - ent->cur);
 338                if (err)
 339                        return err;
 340        }
 341
 342        return count;
 343}
 344
 345static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
 346                          loff_t *pos)
 347{
 348        struct mlx5_cache_ent *ent = filp->private_data;
 349        char lbuf[20];
 350        int err;
 351
 352        if (*pos)
 353                return 0;
 354
 355        err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
 356        if (err < 0)
 357                return err;
 358
 359        if (copy_to_user(buf, lbuf, err))
 360                return -EFAULT;
 361
 362        *pos += err;
 363
 364        return err;
 365}
 366
 367static const struct file_operations limit_fops = {
 368        .owner  = THIS_MODULE,
 369        .open   = simple_open,
 370        .write  = limit_write,
 371        .read   = limit_read,
 372};
 373
 374static int someone_adding(struct mlx5_mr_cache *cache)
 375{
 376        int i;
 377
 378        for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
 379                if (cache->ent[i].cur < cache->ent[i].limit)
 380                        return 1;
 381        }
 382
 383        return 0;
 384}
 385
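/*
 * Keep a cache entry around its watermarks: add keys while below
 * 2 * ent->limit (unless fill_delay is set), and garbage-collect surplus
 * keys once the entry has been idle for long enough.
 */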
 386static void __cache_work_func(struct mlx5_cache_ent *ent)
 387{
 388        struct mlx5_ib_dev *dev = ent->dev;
 389        struct mlx5_mr_cache *cache = &dev->cache;
 390        int i = order2idx(dev, ent->order);
 391        int err;
 392
 393        if (cache->stopped)
 394                return;
 395
 396        ent = &dev->cache.ent[i];
 397        if (ent->cur < 2 * ent->limit && !dev->fill_delay) {
 398                err = add_keys(dev, i, 1);
 399                if (ent->cur < 2 * ent->limit) {
 400                        if (err == -EAGAIN) {
 401                                mlx5_ib_dbg(dev, "returned eagain, order %d\n",
 402                                            i + 2);
 403                                queue_delayed_work(cache->wq, &ent->dwork,
 404                                                   msecs_to_jiffies(3));
 405                        } else if (err) {
 406                                mlx5_ib_warn(dev, "command failed order %d, err %d\n",
 407                                             i + 2, err);
 408                                queue_delayed_work(cache->wq, &ent->dwork,
 409                                                   msecs_to_jiffies(1000));
 410                        } else {
 411                                queue_work(cache->wq, &ent->work);
 412                        }
 413                }
 414        } else if (ent->cur > 2 * ent->limit) {
  415                /*
  416                 * The remove_keys() logic is performed as a garbage
  417                 * collection task. Such a task is intended to run when
  418                 * no other active processes are running.
  419                 *
  420                 * need_resched() returns TRUE if there are user tasks
  421                 * to be activated in the near future.
  422                 *
  423                 * In that case we don't execute remove_keys() and
  424                 * postpone the garbage collection work to the next
  425                 * cycle, in order to free CPU resources for other tasks.
  426                 */
 427                if (!need_resched() && !someone_adding(cache) &&
 428                    time_after(jiffies, cache->last_add + 300 * HZ)) {
 429                        remove_keys(dev, i, 1);
 430                        if (ent->cur > ent->limit)
 431                                queue_work(cache->wq, &ent->work);
 432                } else {
 433                        queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
 434                }
 435        }
 436}
 437
 438static void delayed_cache_work_func(struct work_struct *work)
 439{
 440        struct mlx5_cache_ent *ent;
 441
 442        ent = container_of(work, struct mlx5_cache_ent, dwork.work);
 443        __cache_work_func(ent);
 444}
 445
 446static void cache_work_func(struct work_struct *work)
 447{
 448        struct mlx5_cache_ent *ent;
 449
 450        ent = container_of(work, struct mlx5_cache_ent, work);
 451        __cache_work_func(ent);
 452}
 453
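/*
 * Take an MR from the smallest cache bucket that covers at least 'order'
 * pages; refill work is queued for buckets that are found empty or fall
 * below their limit.
 */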
 454static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order)
 455{
 456        struct mlx5_mr_cache *cache = &dev->cache;
 457        struct mlx5_ib_mr *mr = NULL;
 458        struct mlx5_cache_ent *ent;
 459        int c;
 460        int i;
 461
 462        c = order2idx(dev, order);
 463        if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) {
 464                mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c);
 465                return NULL;
 466        }
 467
 468        for (i = c; i < MAX_MR_CACHE_ENTRIES; i++) {
 469                ent = &cache->ent[i];
 470
 471                mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i);
 472
 473                spin_lock_irq(&ent->lock);
 474                if (!list_empty(&ent->head)) {
 475                        mr = list_first_entry(&ent->head, struct mlx5_ib_mr,
 476                                              list);
 477                        list_del(&mr->list);
 478                        ent->cur--;
 479                        spin_unlock_irq(&ent->lock);
 480                        if (ent->cur < ent->limit)
 481                                queue_work(cache->wq, &ent->work);
 482                        break;
 483                }
 484                spin_unlock_irq(&ent->lock);
 485
 486                queue_work(cache->wq, &ent->work);
 487        }
 488
 489        if (!mr)
 490                cache->ent[c].miss++;
 491
 492        return mr;
 493}
 494
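/*
 * Return a cache-allocated MR to its bucket; schedule shrink work if the
 * bucket now holds more than twice its limit.
 */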
 495static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 496{
 497        struct mlx5_mr_cache *cache = &dev->cache;
 498        struct mlx5_cache_ent *ent;
 499        int shrink = 0;
 500        int c;
 501
 502        c = order2idx(dev, mr->order);
 503        if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) {
 504                mlx5_ib_warn(dev, "order %d, cache index %d\n", mr->order, c);
 505                return;
 506        }
 507        ent = &cache->ent[c];
 508        spin_lock_irq(&ent->lock);
 509        list_add_tail(&mr->list, &ent->head);
 510        ent->cur++;
 511        if (ent->cur > 2 * ent->limit)
 512                shrink = 1;
 513        spin_unlock_irq(&ent->lock);
 514
 515        if (shrink)
 516                queue_work(cache->wq, &ent->work);
 517}
 518
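/* Destroy every mkey held by cache entry 'c'; used during cache cleanup. */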
 519static void clean_keys(struct mlx5_ib_dev *dev, int c)
 520{
 521        struct mlx5_mr_cache *cache = &dev->cache;
 522        struct mlx5_cache_ent *ent = &cache->ent[c];
 523        struct mlx5_ib_mr *mr;
 524        int err;
 525
 526        cancel_delayed_work(&ent->dwork);
 527        while (1) {
 528                spin_lock_irq(&ent->lock);
 529                if (list_empty(&ent->head)) {
 530                        spin_unlock_irq(&ent->lock);
 531                        return;
 532                }
 533                mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
 534                list_del(&mr->list);
 535                ent->cur--;
 536                ent->size--;
 537                spin_unlock_irq(&ent->lock);
 538                err = destroy_mkey(dev, mr);
 539                if (err)
 540                        mlx5_ib_warn(dev, "failed destroy mkey\n");
 541                else
 542                        kfree(mr);
 543        }
 544}
 545
 546static int mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
 547{
 548        struct mlx5_mr_cache *cache = &dev->cache;
 549        struct mlx5_cache_ent *ent;
 550        int i;
 551
 552        if (!mlx5_debugfs_root)
 553                return 0;
 554
 555        cache->root = debugfs_create_dir("mr_cache", dev->mdev->priv.dbg_root);
 556        if (!cache->root)
 557                return -ENOMEM;
 558
 559        for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
 560                ent = &cache->ent[i];
 561                sprintf(ent->name, "%d", ent->order);
 562                ent->dir = debugfs_create_dir(ent->name,  cache->root);
 563                if (!ent->dir)
 564                        return -ENOMEM;
 565
 566                ent->fsize = debugfs_create_file("size", 0600, ent->dir, ent,
 567                                                 &size_fops);
 568                if (!ent->fsize)
 569                        return -ENOMEM;
 570
 571                ent->flimit = debugfs_create_file("limit", 0600, ent->dir, ent,
 572                                                  &limit_fops);
 573                if (!ent->flimit)
 574                        return -ENOMEM;
 575
 576                ent->fcur = debugfs_create_u32("cur", 0400, ent->dir,
 577                                               &ent->cur);
 578                if (!ent->fcur)
 579                        return -ENOMEM;
 580
 581                ent->fmiss = debugfs_create_u32("miss", 0600, ent->dir,
 582                                                &ent->miss);
 583                if (!ent->fmiss)
 584                        return -ENOMEM;
 585        }
 586
 587        return 0;
 588}
 589
 590static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
 591{
 592        if (!mlx5_debugfs_root)
 593                return;
 594
 595        debugfs_remove_recursive(dev->cache.root);
 596}
 597
 598static void delay_time_func(unsigned long ctx)
 599{
 600        struct mlx5_ib_dev *dev = (struct mlx5_ib_dev *)ctx;
 601
 602        dev->fill_delay = 0;
 603}
 604
 605int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 606{
 607        struct mlx5_mr_cache *cache = &dev->cache;
 608        struct mlx5_cache_ent *ent;
 609        int limit;
 610        int err;
 611        int i;
 612
 613        mutex_init(&dev->slow_path_mutex);
 614        cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
 615        if (!cache->wq) {
 616                mlx5_ib_warn(dev, "failed to create work queue\n");
 617                return -ENOMEM;
 618        }
 619
 620        setup_timer(&dev->delay_timer, delay_time_func, (unsigned long)dev);
 621        for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
 622                INIT_LIST_HEAD(&cache->ent[i].head);
 623                spin_lock_init(&cache->ent[i].lock);
 624
 625                ent = &cache->ent[i];
 626                INIT_LIST_HEAD(&ent->head);
 627                spin_lock_init(&ent->lock);
 628                ent->order = i + 2;
 629                ent->dev = dev;
 630
 631                if (dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE)
 632                        limit = dev->mdev->profile->mr_cache[i].limit;
 633                else
 634                        limit = 0;
 635
 636                INIT_WORK(&ent->work, cache_work_func);
 637                INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
 638                ent->limit = limit;
 639                queue_work(cache->wq, &ent->work);
 640        }
 641
 642        err = mlx5_mr_cache_debugfs_init(dev);
 643        if (err)
 644                mlx5_ib_warn(dev, "cache debugfs failure\n");
 645
 646        return 0;
 647}
 648
 649int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
 650{
 651        int i;
 652
 653        dev->cache.stopped = 1;
 654        flush_workqueue(dev->cache.wq);
 655
 656        mlx5_mr_cache_debugfs_cleanup(dev);
 657
 658        for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)
 659                clean_keys(dev, i);
 660
 661        destroy_workqueue(dev->cache.wq);
 662        del_timer_sync(&dev->delay_timer);
 663
 664        return 0;
 665}
 666
 667struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
 668{
 669        struct mlx5_ib_dev *dev = to_mdev(pd->device);
 670        int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
 671        struct mlx5_core_dev *mdev = dev->mdev;
 672        struct mlx5_ib_mr *mr;
 673        void *mkc;
 674        u32 *in;
 675        int err;
 676
 677        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 678        if (!mr)
 679                return ERR_PTR(-ENOMEM);
 680
 681        in = kzalloc(inlen, GFP_KERNEL);
 682        if (!in) {
 683                err = -ENOMEM;
 684                goto err_free;
 685        }
 686
 687        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 688
 689        MLX5_SET(mkc, mkc, access_mode, MLX5_MKC_ACCESS_MODE_PA);
 690        MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
 691        MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
 692        MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
 693        MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
 694        MLX5_SET(mkc, mkc, lr, 1);
 695
 696        MLX5_SET(mkc, mkc, length64, 1);
 697        MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
 698        MLX5_SET(mkc, mkc, qpn, 0xffffff);
 699        MLX5_SET64(mkc, mkc, start_addr, 0);
 700
 701        err = mlx5_core_create_mkey(mdev, &mr->mmkey, in, inlen);
 702        if (err)
 703                goto err_in;
 704
 705        kfree(in);
 706        mr->ibmr.lkey = mr->mmkey.key;
 707        mr->ibmr.rkey = mr->mmkey.key;
 708        mr->umem = NULL;
 709
 710        return &mr->ibmr;
 711
 712err_in:
 713        kfree(in);
 714
 715err_free:
 716        kfree(mr);
 717
 718        return ERR_PTR(err);
 719}
 720
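/*
 * Number of 16-byte octowords needed for the MTT entries that cover
 * 'len' bytes starting at 'addr' with the given page size.
 */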
 721static int get_octo_len(u64 addr, u64 len, int page_size)
 722{
 723        u64 offset;
 724        int npages;
 725
 726        offset = addr & (page_size - 1);
 727        npages = ALIGN(len + offset, page_size) >> ilog2(page_size);
 728        return (npages + 1) / 2;
 729}
 730
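/*
 * Registrations of up to 2^MLX5_MAX_UMR_SHIFT pages can use the UMR
 * fast path; anything larger goes through reg_create().
 */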
 731static int use_umr(int order)
 732{
 733        return order <= MLX5_MAX_UMR_SHIFT;
 734}
 735
 736static int dma_map_mr_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
 737                          int npages, int page_shift, int *size,
 738                          __be64 **mr_pas, dma_addr_t *dma)
 739{
 740        __be64 *pas;
 741        struct device *ddev = dev->ib_dev.dma_device;
 742
 743        /*
 744         * UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes.
 745         * To avoid copying garbage after the pas array, we allocate
 746         * a little more.
 747         */
 748        *size = ALIGN(sizeof(u64) * npages, MLX5_UMR_MTT_ALIGNMENT);
 749        *mr_pas = kmalloc(*size + MLX5_UMR_ALIGN - 1, GFP_KERNEL);
 750        if (!(*mr_pas))
 751                return -ENOMEM;
 752
 753        pas = PTR_ALIGN(*mr_pas, MLX5_UMR_ALIGN);
 754        mlx5_ib_populate_pas(dev, umem, page_shift, pas, MLX5_IB_MTT_PRESENT);
 755        /* Clear padding after the actual pages. */
 756        memset(pas + npages, 0, *size - npages * sizeof(u64));
 757
 758        *dma = dma_map_single(ddev, pas, *size, DMA_TO_DEVICE);
 759        if (dma_mapping_error(ddev, *dma)) {
 760                kfree(*mr_pas);
 761                return -ENOMEM;
 762        }
 763
 764        return 0;
 765}
 766
 767static void prep_umr_wqe_common(struct ib_pd *pd, struct ib_send_wr *wr,
 768                                struct ib_sge *sg, u64 dma, int n, u32 key,
 769                                int page_shift)
 770{
 771        struct mlx5_ib_dev *dev = to_mdev(pd->device);
 772        struct mlx5_umr_wr *umrwr = umr_wr(wr);
 773
 774        sg->addr = dma;
 775        sg->length = ALIGN(sizeof(u64) * n, 64);
 776        sg->lkey = dev->umrc.pd->local_dma_lkey;
 777
 778        wr->next = NULL;
 779        wr->sg_list = sg;
 780        if (n)
 781                wr->num_sge = 1;
 782        else
 783                wr->num_sge = 0;
 784
 785        wr->opcode = MLX5_IB_WR_UMR;
 786
 787        umrwr->npages = n;
 788        umrwr->page_shift = page_shift;
 789        umrwr->mkey = key;
 790}
 791
 792static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr,
 793                             struct ib_sge *sg, u64 dma, int n, u32 key,
 794                             int page_shift, u64 virt_addr, u64 len,
 795                             int access_flags)
 796{
 797        struct mlx5_umr_wr *umrwr = umr_wr(wr);
 798
 799        prep_umr_wqe_common(pd, wr, sg, dma, n, key, page_shift);
 800
 801        wr->send_flags = 0;
 802
 803        umrwr->target.virt_addr = virt_addr;
 804        umrwr->length = len;
 805        umrwr->access_flags = access_flags;
 806        umrwr->pd = pd;
 807}
 808
 809static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev,
 810                               struct ib_send_wr *wr, u32 key)
 811{
 812        struct mlx5_umr_wr *umrwr = umr_wr(wr);
 813
 814        wr->send_flags = MLX5_IB_SEND_UMR_UNREG | MLX5_IB_SEND_UMR_FAIL_IF_FREE;
 815        wr->opcode = MLX5_IB_WR_UMR;
 816        umrwr->mkey = key;
 817}
 818
 819static struct ib_umem *mr_umem_get(struct ib_pd *pd, u64 start, u64 length,
 820                                   int access_flags, int *npages,
 821                                   int *page_shift, int *ncont, int *order)
 822{
 823        struct mlx5_ib_dev *dev = to_mdev(pd->device);
 824        struct ib_umem *umem = ib_umem_get(pd->uobject->context, start, length,
 825                                           access_flags, 0);
 826        if (IS_ERR(umem)) {
 827                mlx5_ib_err(dev, "umem get failed (%ld)\n", PTR_ERR(umem));
 828                return (void *)umem;
 829        }
 830
 831        mlx5_ib_cont_pages(umem, start, npages, page_shift, ncont, order);
 832        if (!*npages) {
 833                mlx5_ib_warn(dev, "avoid zero region\n");
 834                ib_umem_release(umem);
 835                return ERR_PTR(-EINVAL);
 836        }
 837
 838        mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n",
 839                    *npages, *ncont, *order, *page_shift);
 840
 841        return umem;
 842}
 843
 844static void mlx5_ib_umr_done(struct ib_cq *cq, struct ib_wc *wc)
 845{
 846        struct mlx5_ib_umr_context *context =
 847                container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe);
 848
 849        context->status = wc->status;
 850        complete(&context->done);
 851}
 852
 853static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context)
 854{
 855        context->cqe.done = mlx5_ib_umr_done;
 856        context->status = -1;
 857        init_completion(&context->done);
 858}
 859
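/*
 * Fast-path registration: take a pre-created mkey from the MR cache and
 * program its translation and attributes with a UMR work request posted
 * on the dedicated UMR QP.
 */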
 860static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem,
 861                                  u64 virt_addr, u64 len, int npages,
 862                                  int page_shift, int order, int access_flags)
 863{
 864        struct mlx5_ib_dev *dev = to_mdev(pd->device);
 865        struct device *ddev = dev->ib_dev.dma_device;
 866        struct umr_common *umrc = &dev->umrc;
 867        struct mlx5_ib_umr_context umr_context;
 868        struct mlx5_umr_wr umrwr = {};
 869        struct ib_send_wr *bad;
 870        struct mlx5_ib_mr *mr;
 871        struct ib_sge sg;
 872        int size;
 873        __be64 *mr_pas;
 874        dma_addr_t dma;
 875        int err = 0;
 876        int i;
 877
 878        for (i = 0; i < 1; i++) {
 879                mr = alloc_cached_mr(dev, order);
 880                if (mr)
 881                        break;
 882
 883                err = add_keys(dev, order2idx(dev, order), 1);
 884                if (err && err != -EAGAIN) {
 885                        mlx5_ib_warn(dev, "add_keys failed, err %d\n", err);
 886                        break;
 887                }
 888        }
 889
 890        if (!mr)
 891                return ERR_PTR(-EAGAIN);
 892
 893        err = dma_map_mr_pas(dev, umem, npages, page_shift, &size, &mr_pas,
 894                             &dma);
 895        if (err)
 896                goto free_mr;
 897
 898        mlx5_ib_init_umr_context(&umr_context);
 899
 900        umrwr.wr.wr_cqe = &umr_context.cqe;
 901        prep_umr_reg_wqe(pd, &umrwr.wr, &sg, dma, npages, mr->mmkey.key,
 902                         page_shift, virt_addr, len, access_flags);
 903
 904        down(&umrc->sem);
 905        err = ib_post_send(umrc->qp, &umrwr.wr, &bad);
 906        if (err) {
 907                mlx5_ib_warn(dev, "post send failed, err %d\n", err);
 908                goto unmap_dma;
 909        } else {
 910                wait_for_completion(&umr_context.done);
 911                if (umr_context.status != IB_WC_SUCCESS) {
 912                        mlx5_ib_warn(dev, "reg umr failed\n");
 913                        err = -EFAULT;
 914                }
 915        }
 916
 917        mr->mmkey.iova = virt_addr;
 918        mr->mmkey.size = len;
 919        mr->mmkey.pd = to_mpd(pd)->pdn;
 920
 921        mr->live = 1;
 922
 923unmap_dma:
 924        up(&umrc->sem);
 925        dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);
 926
 927        kfree(mr_pas);
 928
 929free_mr:
 930        if (err) {
 931                free_cached_mr(dev, mr);
 932                return ERR_PTR(err);
 933        }
 934
 935        return mr;
 936}
 937
 938#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 939int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages,
 940                       int zap)
 941{
 942        struct mlx5_ib_dev *dev = mr->dev;
 943        struct device *ddev = dev->ib_dev.dma_device;
 944        struct umr_common *umrc = &dev->umrc;
 945        struct mlx5_ib_umr_context umr_context;
 946        struct ib_umem *umem = mr->umem;
 947        int size;
 948        __be64 *pas;
 949        dma_addr_t dma;
 950        struct ib_send_wr *bad;
 951        struct mlx5_umr_wr wr;
 952        struct ib_sge sg;
 953        int err = 0;
 954        const int page_index_alignment = MLX5_UMR_MTT_ALIGNMENT / sizeof(u64);
 955        const int page_index_mask = page_index_alignment - 1;
 956        size_t pages_mapped = 0;
 957        size_t pages_to_map = 0;
 958        size_t pages_iter = 0;
 959        int use_emergency_buf = 0;
 960
 961        /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes,
 962         * so we need to align the offset and length accordingly */
 963        if (start_page_index & page_index_mask) {
 964                npages += start_page_index & page_index_mask;
 965                start_page_index &= ~page_index_mask;
 966        }
 967
 968        pages_to_map = ALIGN(npages, page_index_alignment);
 969
 970        if (start_page_index + pages_to_map > MLX5_MAX_UMR_PAGES)
 971                return -EINVAL;
 972
 973        size = sizeof(u64) * pages_to_map;
 974        size = min_t(int, PAGE_SIZE, size);
  975        /* We allocate with GFP_ATOMIC to avoid recursion into the
  976         * page-reclaim code when we are called from an invalidation.
  977         * The pas buffer must be 2k-aligned for Connect-IB. */
 978        pas = (__be64 *)get_zeroed_page(GFP_ATOMIC);
 979        if (!pas) {
 980                mlx5_ib_warn(dev, "unable to allocate memory during MTT update, falling back to slower chunked mechanism.\n");
 981                pas = mlx5_ib_update_mtt_emergency_buffer;
 982                size = MLX5_UMR_MTT_MIN_CHUNK_SIZE;
 983                use_emergency_buf = 1;
 984                mutex_lock(&mlx5_ib_update_mtt_emergency_buffer_mutex);
 985                memset(pas, 0, size);
 986        }
 987        pages_iter = size / sizeof(u64);
 988        dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE);
 989        if (dma_mapping_error(ddev, dma)) {
 990                mlx5_ib_err(dev, "unable to map DMA during MTT update.\n");
 991                err = -ENOMEM;
 992                goto free_pas;
 993        }
 994
 995        for (pages_mapped = 0;
 996             pages_mapped < pages_to_map && !err;
 997             pages_mapped += pages_iter, start_page_index += pages_iter) {
 998                dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE);
 999
1000                npages = min_t(size_t,
1001                               pages_iter,
1002                               ib_umem_num_pages(umem) - start_page_index);
1003
1004                if (!zap) {
1005                        __mlx5_ib_populate_pas(dev, umem, PAGE_SHIFT,
1006                                               start_page_index, npages, pas,
1007                                               MLX5_IB_MTT_PRESENT);
1008                        /* Clear padding after the pages brought from the
1009                         * umem. */
1010                        memset(pas + npages, 0, size - npages * sizeof(u64));
1011                }
1012
1013                dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE);
1014
1015                mlx5_ib_init_umr_context(&umr_context);
1016
1017                memset(&wr, 0, sizeof(wr));
1018                wr.wr.wr_cqe = &umr_context.cqe;
1019
1020                sg.addr = dma;
1021                sg.length = ALIGN(npages * sizeof(u64),
1022                                MLX5_UMR_MTT_ALIGNMENT);
1023                sg.lkey = dev->umrc.pd->local_dma_lkey;
1024
1025                wr.wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE |
1026                                MLX5_IB_SEND_UMR_UPDATE_MTT;
1027                wr.wr.sg_list = &sg;
1028                wr.wr.num_sge = 1;
1029                wr.wr.opcode = MLX5_IB_WR_UMR;
1030                wr.npages = sg.length / sizeof(u64);
1031                wr.page_shift = PAGE_SHIFT;
1032                wr.mkey = mr->mmkey.key;
1033                wr.target.offset = start_page_index;
1034
1035                down(&umrc->sem);
1036                err = ib_post_send(umrc->qp, &wr.wr, &bad);
1037                if (err) {
1038                        mlx5_ib_err(dev, "UMR post send failed, err %d\n", err);
1039                } else {
1040                        wait_for_completion(&umr_context.done);
1041                        if (umr_context.status != IB_WC_SUCCESS) {
1042                                mlx5_ib_err(dev, "UMR completion failed, code %d\n",
1043                                            umr_context.status);
1044                                err = -EFAULT;
1045                        }
1046                }
1047                up(&umrc->sem);
1048        }
1049        dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);
1050
1051free_pas:
1052        if (!use_emergency_buf)
1053                free_page((unsigned long)pas);
1054        else
1055                mutex_unlock(&mlx5_ib_update_mtt_emergency_buffer_mutex);
1056
1057        return err;
1058}
1059#endif
1060
 1061/*
 1062 * If ibmr is NULL, a new MR is allocated by reg_create().
 1063 * Otherwise, the given ibmr is used.
 1064 */
1065static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd,
1066                                     u64 virt_addr, u64 length,
1067                                     struct ib_umem *umem, int npages,
1068                                     int page_shift, int access_flags)
1069{
1070        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1071        struct mlx5_ib_mr *mr;
1072        __be64 *pas;
1073        void *mkc;
1074        int inlen;
1075        u32 *in;
1076        int err;
1077        bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));
1078
1079        mr = ibmr ? to_mmr(ibmr) : kzalloc(sizeof(*mr), GFP_KERNEL);
1080        if (!mr)
1081                return ERR_PTR(-ENOMEM);
1082
1083        inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
1084                sizeof(*pas) * ((npages + 1) / 2) * 2;
1085        in = mlx5_vzalloc(inlen);
1086        if (!in) {
1087                err = -ENOMEM;
1088                goto err_1;
1089        }
1090        pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
1091        mlx5_ib_populate_pas(dev, umem, page_shift, pas,
1092                             pg_cap ? MLX5_IB_MTT_PRESENT : 0);
1093
1094        /* The pg_access bit allows setting the access flags
1095         * in the page list submitted with the command. */
1096        MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));
1097
1098        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1099        MLX5_SET(mkc, mkc, access_mode, MLX5_MKC_ACCESS_MODE_MTT);
1100        MLX5_SET(mkc, mkc, a, !!(access_flags & IB_ACCESS_REMOTE_ATOMIC));
1101        MLX5_SET(mkc, mkc, rw, !!(access_flags & IB_ACCESS_REMOTE_WRITE));
1102        MLX5_SET(mkc, mkc, rr, !!(access_flags & IB_ACCESS_REMOTE_READ));
1103        MLX5_SET(mkc, mkc, lw, !!(access_flags & IB_ACCESS_LOCAL_WRITE));
1104        MLX5_SET(mkc, mkc, lr, 1);
1105
1106        MLX5_SET64(mkc, mkc, start_addr, virt_addr);
1107        MLX5_SET64(mkc, mkc, len, length);
1108        MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
1109        MLX5_SET(mkc, mkc, bsf_octword_size, 0);
1110        MLX5_SET(mkc, mkc, translations_octword_size,
1111                 get_octo_len(virt_addr, length, 1 << page_shift));
1112        MLX5_SET(mkc, mkc, log_page_size, page_shift);
1113        MLX5_SET(mkc, mkc, qpn, 0xffffff);
1114        MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
1115                 get_octo_len(virt_addr, length, 1 << page_shift));
1116
1117        err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen);
1118        if (err) {
1119                mlx5_ib_warn(dev, "create mkey failed\n");
1120                goto err_2;
1121        }
1122        mr->umem = umem;
1123        mr->dev = dev;
1124        mr->live = 1;
1125        kvfree(in);
1126
1127        mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);
1128
1129        return mr;
1130
1131err_2:
1132        kvfree(in);
1133
1134err_1:
1135        if (!ibmr)
1136                kfree(mr);
1137
1138        return ERR_PTR(err);
1139}
1140
 1141static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
1142                          int npages, u64 length, int access_flags)
1143{
1144        mr->npages = npages;
1145        atomic_add(npages, &dev->mdev->priv.reg_pages);
1146        mr->ibmr.lkey = mr->mmkey.key;
1147        mr->ibmr.rkey = mr->mmkey.key;
1148        mr->ibmr.length = length;
1149        mr->access_flags = access_flags;
1150}
1151
1152struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
1153                                  u64 virt_addr, int access_flags,
1154                                  struct ib_udata *udata)
1155{
1156        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1157        struct mlx5_ib_mr *mr = NULL;
1158        struct ib_umem *umem;
1159        int page_shift;
1160        int npages;
1161        int ncont;
1162        int order;
1163        int err;
1164
1165        mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
1166                    start, virt_addr, length, access_flags);
1167        umem = mr_umem_get(pd, start, length, access_flags, &npages,
1168                           &page_shift, &ncont, &order);
1169
1170        if (IS_ERR(umem))
1171                return (void *)umem;
1172
1173        if (use_umr(order)) {
1174                mr = reg_umr(pd, umem, virt_addr, length, ncont, page_shift,
1175                             order, access_flags);
1176                if (PTR_ERR(mr) == -EAGAIN) {
 1177                        mlx5_ib_dbg(dev, "cache empty for order %d\n", order);
1178                        mr = NULL;
1179                }
1180        } else if (access_flags & IB_ACCESS_ON_DEMAND) {
1181                err = -EINVAL;
 1182                pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB\n");
1183                goto error;
1184        }
1185
1186        if (!mr) {
1187                mutex_lock(&dev->slow_path_mutex);
1188                mr = reg_create(NULL, pd, virt_addr, length, umem, ncont,
1189                                page_shift, access_flags);
1190                mutex_unlock(&dev->slow_path_mutex);
1191        }
1192
1193        if (IS_ERR(mr)) {
1194                err = PTR_ERR(mr);
1195                goto error;
1196        }
1197
1198        mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1199
1200        mr->umem = umem;
 1201        set_mr_fields(dev, mr, npages, length, access_flags);
1202
1203#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
1204        update_odp_mr(mr);
1205#endif
1206
1207        return &mr->ibmr;
1208
1209error:
1210        ib_umem_release(umem);
1211        return ERR_PTR(err);
1212}
1213
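/*
 * Return a cached mkey to the free state by posting an unreg UMR WQE.
 * Skipped when the device is in internal error state.
 */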
1214static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
1215{
1216        struct mlx5_core_dev *mdev = dev->mdev;
1217        struct umr_common *umrc = &dev->umrc;
1218        struct mlx5_ib_umr_context umr_context;
1219        struct mlx5_umr_wr umrwr = {};
1220        struct ib_send_wr *bad;
1221        int err;
1222
1223        if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
1224                return 0;
1225
1226        mlx5_ib_init_umr_context(&umr_context);
1227
1228        umrwr.wr.wr_cqe = &umr_context.cqe;
1229        prep_umr_unreg_wqe(dev, &umrwr.wr, mr->mmkey.key);
1230
1231        down(&umrc->sem);
1232        err = ib_post_send(umrc->qp, &umrwr.wr, &bad);
1233        if (err) {
1234                up(&umrc->sem);
1235                mlx5_ib_dbg(dev, "err %d\n", err);
1236                goto error;
1237        } else {
1238                wait_for_completion(&umr_context.done);
1239                up(&umrc->sem);
1240        }
1241        if (umr_context.status != IB_WC_SUCCESS) {
1242                mlx5_ib_warn(dev, "unreg umr failed\n");
1243                err = -EFAULT;
1244                goto error;
1245        }
1246        return 0;
1247
1248error:
1249        return err;
1250}
1251
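/*
 * Modify an existing UMR-capable mkey in place: update its translation,
 * PD and/or access flags, as selected by 'flags', with a single UMR WQE.
 */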
1252static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr, u64 virt_addr,
1253                     u64 length, int npages, int page_shift, int order,
1254                     int access_flags, int flags)
1255{
1256        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1257        struct device *ddev = dev->ib_dev.dma_device;
1258        struct mlx5_ib_umr_context umr_context;
1259        struct ib_send_wr *bad;
1260        struct mlx5_umr_wr umrwr = {};
1261        struct ib_sge sg;
1262        struct umr_common *umrc = &dev->umrc;
1263        dma_addr_t dma = 0;
1264        __be64 *mr_pas = NULL;
1265        int size;
1266        int err;
1267
1268        mlx5_ib_init_umr_context(&umr_context);
1269
1270        umrwr.wr.wr_cqe = &umr_context.cqe;
1271        umrwr.wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE;
1272
1273        if (flags & IB_MR_REREG_TRANS) {
1274                err = dma_map_mr_pas(dev, mr->umem, npages, page_shift, &size,
1275                                     &mr_pas, &dma);
1276                if (err)
1277                        return err;
1278
1279                umrwr.target.virt_addr = virt_addr;
1280                umrwr.length = length;
1281                umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
1282        }
1283
1284        prep_umr_wqe_common(pd, &umrwr.wr, &sg, dma, npages, mr->mmkey.key,
1285                            page_shift);
1286
1287        if (flags & IB_MR_REREG_PD) {
1288                umrwr.pd = pd;
1289                umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_PD;
1290        }
1291
1292        if (flags & IB_MR_REREG_ACCESS) {
1293                umrwr.access_flags = access_flags;
1294                umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_ACCESS;
1295        }
1296
1297        /* post send request to UMR QP */
1298        down(&umrc->sem);
1299        err = ib_post_send(umrc->qp, &umrwr.wr, &bad);
1300
1301        if (err) {
1302                mlx5_ib_warn(dev, "post send failed, err %d\n", err);
1303        } else {
1304                wait_for_completion(&umr_context.done);
1305                if (umr_context.status != IB_WC_SUCCESS) {
1306                        mlx5_ib_warn(dev, "reg umr failed (%u)\n",
1307                                     umr_context.status);
1308                        err = -EFAULT;
1309                }
1310        }
1311
1312        up(&umrc->sem);
1313        if (flags & IB_MR_REREG_TRANS) {
1314                dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);
1315                kfree(mr_pas);
1316        }
1317        return err;
1318}
1319
1320int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
1321                          u64 length, u64 virt_addr, int new_access_flags,
1322                          struct ib_pd *new_pd, struct ib_udata *udata)
1323{
1324        struct mlx5_ib_dev *dev = to_mdev(ib_mr->device);
1325        struct mlx5_ib_mr *mr = to_mmr(ib_mr);
1326        struct ib_pd *pd = (flags & IB_MR_REREG_PD) ? new_pd : ib_mr->pd;
1327        int access_flags = flags & IB_MR_REREG_ACCESS ?
1328                            new_access_flags :
1329                            mr->access_flags;
1330        u64 addr = (flags & IB_MR_REREG_TRANS) ? virt_addr : mr->umem->address;
1331        u64 len = (flags & IB_MR_REREG_TRANS) ? length : mr->umem->length;
1332        int page_shift = 0;
1333        int npages = 0;
1334        int ncont = 0;
1335        int order = 0;
1336        int err;
1337
1338        mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
1339                    start, virt_addr, length, access_flags);
1340
1341        if (flags != IB_MR_REREG_PD) {
1342                /*
1343                 * Replace umem. This needs to be done whether or not UMR is
1344                 * used.
1345                 */
1346                flags |= IB_MR_REREG_TRANS;
1347                ib_umem_release(mr->umem);
1348                mr->umem = mr_umem_get(pd, addr, len, access_flags, &npages,
1349                                       &page_shift, &ncont, &order);
1350                if (IS_ERR(mr->umem)) {
1351                        err = PTR_ERR(mr->umem);
1352                        mr->umem = NULL;
1353                        return err;
1354                }
1355        }
1356
1357        if (flags & IB_MR_REREG_TRANS && !use_umr_mtt_update(mr, addr, len)) {
1358                /*
1359                 * UMR can't be used - MKey needs to be replaced.
1360                 */
1361                if (mr->umred) {
1362                        err = unreg_umr(dev, mr);
1363                        if (err)
1364                                mlx5_ib_warn(dev, "Failed to unregister MR\n");
1365                } else {
1366                        err = destroy_mkey(dev, mr);
1367                        if (err)
1368                                mlx5_ib_warn(dev, "Failed to destroy MKey\n");
1369                }
1370                if (err)
1371                        return err;
1372
1373                mr = reg_create(ib_mr, pd, addr, len, mr->umem, ncont,
1374                                page_shift, access_flags);
1375
1376                if (IS_ERR(mr))
1377                        return PTR_ERR(mr);
1378
1379                mr->umred = 0;
1380        } else {
1381                /*
1382                 * Send a UMR WQE
1383                 */
1384                err = rereg_umr(pd, mr, addr, len, npages, page_shift,
1385                                order, access_flags, flags);
1386                if (err) {
1387                        mlx5_ib_warn(dev, "Failed to rereg UMR\n");
1388                        return err;
1389                }
1390        }
1391
1392        if (flags & IB_MR_REREG_PD) {
1393                ib_mr->pd = pd;
1394                mr->mmkey.pd = to_mpd(pd)->pdn;
1395        }
1396
1397        if (flags & IB_MR_REREG_ACCESS)
1398                mr->access_flags = access_flags;
1399
1400        if (flags & IB_MR_REREG_TRANS) {
1401                atomic_sub(mr->npages, &dev->mdev->priv.reg_pages);
 1402                set_mr_fields(dev, mr, npages, len, access_flags);
1403                mr->mmkey.iova = addr;
1404                mr->mmkey.size = len;
1405        }
1406#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
1407        update_odp_mr(mr);
1408#endif
1409
1410        return 0;
1411}
1412
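/*
 * Allocate and DMA-map the descriptor list used by this MR's work
 * requests; the buffer is over-allocated so the descriptors can be
 * aligned to MLX5_UMR_ALIGN.
 */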
1413static int
1414mlx5_alloc_priv_descs(struct ib_device *device,
1415                      struct mlx5_ib_mr *mr,
1416                      int ndescs,
1417                      int desc_size)
1418{
1419        int size = ndescs * desc_size;
1420        int add_size;
1421        int ret;
1422
1423        add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0);
1424
1425        mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
1426        if (!mr->descs_alloc)
1427                return -ENOMEM;
1428
1429        mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);
1430
1431        mr->desc_map = dma_map_single(device->dma_device, mr->descs,
1432                                      size, DMA_TO_DEVICE);
1433        if (dma_mapping_error(device->dma_device, mr->desc_map)) {
1434                ret = -ENOMEM;
1435                goto err;
1436        }
1437
1438        return 0;
1439err:
1440        kfree(mr->descs_alloc);
1441
1442        return ret;
1443}
1444
1445static void
1446mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
1447{
1448        if (mr->descs) {
1449                struct ib_device *device = mr->ibmr.device;
1450                int size = mr->max_descs * mr->desc_size;
1451
1452                dma_unmap_single(device->dma_device, mr->desc_map,
1453                                 size, DMA_TO_DEVICE);
1454                kfree(mr->descs_alloc);
1455                mr->descs = NULL;
1456        }
1457}
1458
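/*
 * Destroy the HW resources behind an MR: signature PSVs, private
 * descriptors and the mkey itself; cache-allocated mkeys are returned
 * to the MR cache instead of being destroyed.
 */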
1459static int clean_mr(struct mlx5_ib_mr *mr)
1460{
1461        struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1462        int umred = mr->umred;
1463        int err;
1464
1465        if (mr->sig) {
1466                if (mlx5_core_destroy_psv(dev->mdev,
1467                                          mr->sig->psv_memory.psv_idx))
1468                        mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
1469                                     mr->sig->psv_memory.psv_idx);
1470                if (mlx5_core_destroy_psv(dev->mdev,
1471                                          mr->sig->psv_wire.psv_idx))
1472                        mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
1473                                     mr->sig->psv_wire.psv_idx);
1474                kfree(mr->sig);
1475                mr->sig = NULL;
1476        }
1477
1478        mlx5_free_priv_descs(mr);
1479
1480        if (!umred) {
1481                err = destroy_mkey(dev, mr);
1482                if (err) {
1483                        mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n",
1484                                     mr->mmkey.key, err);
1485                        return err;
1486                }
1487        } else {
1488                err = unreg_umr(dev, mr);
1489                if (err) {
1490                        mlx5_ib_warn(dev, "failed unregister\n");
1491                        return err;
1492                }
1493                free_cached_mr(dev, mr);
1494        }
1495
1496        if (!umred)
1497                kfree(mr);
1498
1499        return 0;
1500}
1501
1502int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
1503{
1504        struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
1505        struct mlx5_ib_mr *mr = to_mmr(ibmr);
1506        int npages = mr->npages;
1507        struct ib_umem *umem = mr->umem;
1508
1509#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
1510        if (umem && umem->odp_data) {
1511                /* Prevent new page faults from succeeding */
1512                mr->live = 0;
1513                /* Wait for all running page-fault handlers to finish. */
1514                synchronize_srcu(&dev->mr_srcu);
1515                /* Destroy all page mappings */
1516                mlx5_ib_invalidate_range(umem, ib_umem_start(umem),
1517                                         ib_umem_end(umem));
 1518                /*
 1519                 * For ODP we release the umem before the MR, so that
 1520                 * no invalidations still in flight can be looking at
 1521                 * the *mr struct.
 1522                 */
1523                ib_umem_release(umem);
1524                atomic_sub(npages, &dev->mdev->priv.reg_pages);
1525
1526                /* Avoid double-freeing the umem. */
1527                umem = NULL;
1528        }
1529#endif
1530
1531        clean_mr(mr);
1532
1533        if (umem) {
1534                ib_umem_release(umem);
1535                atomic_sub(npages, &dev->mdev->priv.reg_pages);
1536        }
1537
1538        return 0;
1539}
1540
1541struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd,
1542                               enum ib_mr_type mr_type,
1543                               u32 max_num_sg)
1544{
1545        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1546        int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1547        int ndescs = ALIGN(max_num_sg, 4);
1548        struct mlx5_ib_mr *mr;
1549        void *mkc;
1550        u32 *in;
1551        int err;
1552
1553        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1554        if (!mr)
1555                return ERR_PTR(-ENOMEM);
1556
1557        in = kzalloc(inlen, GFP_KERNEL);
1558        if (!in) {
1559                err = -ENOMEM;
1560                goto err_free;
1561        }
1562
1563        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1564        MLX5_SET(mkc, mkc, free, 1);
1565        MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
1566        MLX5_SET(mkc, mkc, qpn, 0xffffff);
1567        MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
1568
1569        if (mr_type == IB_MR_TYPE_MEM_REG) {
1570                mr->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
1571                MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
1572                err = mlx5_alloc_priv_descs(pd->device, mr,
1573                                            ndescs, sizeof(u64));
1574                if (err)
1575                        goto err_free_in;
1576
1577                mr->desc_size = sizeof(u64);
1578                mr->max_descs = ndescs;
1579        } else if (mr_type == IB_MR_TYPE_SG_GAPS) {
1580                mr->access_mode = MLX5_MKC_ACCESS_MODE_KLMS;
1581
1582                err = mlx5_alloc_priv_descs(pd->device, mr,
1583                                            ndescs, sizeof(struct mlx5_klm));
1584                if (err)
1585                        goto err_free_in;
1586                mr->desc_size = sizeof(struct mlx5_klm);
1587                mr->max_descs = ndescs;
1588        } else if (mr_type == IB_MR_TYPE_SIGNATURE) {
1589                u32 psv_index[2];
1590
1591                MLX5_SET(mkc, mkc, bsf_en, 1);
1592                MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);
1593                mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
1594                if (!mr->sig) {
1595                        err = -ENOMEM;
1596                        goto err_free_in;
1597                }
1598
1599                /* create mem & wire PSVs */
1600                err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn,
1601                                           2, psv_index);
1602                if (err)
1603                        goto err_free_sig;
1604
1605                mr->access_mode = MLX5_MKC_ACCESS_MODE_KLMS;
1606                mr->sig->psv_memory.psv_idx = psv_index[0];
1607                mr->sig->psv_wire.psv_idx = psv_index[1];
1608
1609                mr->sig->sig_status_checked = true;
1610                mr->sig->sig_err_exists = false;
1611                /* Next UMR, Arm SIGERR */
1612                ++mr->sig->sigerr_count;
1613        } else {
1614                mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
1615                err = -EINVAL;
1616                goto err_free_in;
1617        }
1618
1619        MLX5_SET(mkc, mkc, access_mode, mr->access_mode);
1620        MLX5_SET(mkc, mkc, umr_en, 1);
1621
1622        err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen);
1623        if (err)
1624                goto err_destroy_psv;
1625
1626        mr->ibmr.lkey = mr->mmkey.key;
1627        mr->ibmr.rkey = mr->mmkey.key;
1628        mr->umem = NULL;
1629        kfree(in);
1630
1631        return &mr->ibmr;
1632
1633err_destroy_psv:
1634        if (mr->sig) {
1635                if (mlx5_core_destroy_psv(dev->mdev,
1636                                          mr->sig->psv_memory.psv_idx))
1637                        mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
1638                                     mr->sig->psv_memory.psv_idx);
1639                if (mlx5_core_destroy_psv(dev->mdev,
1640                                          mr->sig->psv_wire.psv_idx))
1641                        mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
1642                                     mr->sig->psv_wire.psv_idx);
1643        }
1644        mlx5_free_priv_descs(mr);
1645err_free_sig:
1646        kfree(mr->sig);
1647err_free_in:
1648        kfree(in);
1649err_free:
1650        kfree(mr);
1651        return ERR_PTR(err);
1652}
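/*
 * Illustrative sketch only (not part of this driver): how an upper-layer
 * driver might obtain and later release one of the MR types handled
 * above through the core verbs API.  The function name, the max_num_sg
 * value of 16 and the error handling are assumptions for the example.
 */
static int example_alloc_fastreg_mr(struct ib_pd *pd)
{
        struct ib_mr *mr;

        /* IB_MR_TYPE_MEM_REG takes the MTT branch above; IB_MR_TYPE_SG_GAPS
         * and IB_MR_TYPE_SIGNATURE take the KLM branches. */
        mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 16);
        if (IS_ERR(mr))
                return PTR_ERR(mr);     /* e.g. -ENOMEM or -EINVAL */

        /* ... map a page list with ib_map_mr_sg() and post a REG_MR WR ... */

        return ib_dereg_mr(mr);         /* lands in mlx5_ib_dereg_mr() */
}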
1653
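/*
 * Memory-window allocation.  The udata handling below follows the usual
 * extensible-ABI pattern: reject a request that sets comp_mask or the
 * reserved fields, reject trailing input bytes that are not zeroed, and
 * truncate the response to the output buffer the caller provided.
 */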
1654struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
1655                               struct ib_udata *udata)
1656{
1657        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1658        int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1659        struct mlx5_ib_mw *mw = NULL;
1660        u32 *in = NULL;
1661        void *mkc;
1662        int ndescs;
1663        int err;
1664        struct mlx5_ib_alloc_mw req = {};
1665        struct {
1666                __u32   comp_mask;
1667                __u32   response_length;
1668        } resp = {};
1669
1670        err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
1671        if (err)
1672                return ERR_PTR(err);
1673
1674        if (req.comp_mask || req.reserved1 || req.reserved2)
1675                return ERR_PTR(-EOPNOTSUPP);
1676
1677        if (udata->inlen > sizeof(req) &&
1678            !ib_is_udata_cleared(udata, sizeof(req),
1679                                 udata->inlen - sizeof(req)))
1680                return ERR_PTR(-EOPNOTSUPP);
1681
1682        ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);
1683
1684        mw = kzalloc(sizeof(*mw), GFP_KERNEL);
1685        in = kzalloc(inlen, GFP_KERNEL);
1686        if (!mw || !in) {
1687                err = -ENOMEM;
1688                goto free;
1689        }
1690
1691        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1692
1693        MLX5_SET(mkc, mkc, free, 1);
1694        MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
1695        MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
1696        MLX5_SET(mkc, mkc, umr_en, 1);
1697        MLX5_SET(mkc, mkc, lr, 1);
1698        MLX5_SET(mkc, mkc, access_mode, MLX5_MKC_ACCESS_MODE_KLMS);
1699        MLX5_SET(mkc, mkc, en_rinval, !!(type == IB_MW_TYPE_2));
1700        MLX5_SET(mkc, mkc, qpn, 0xffffff);
1701
1702        err = mlx5_core_create_mkey(dev->mdev, &mw->mmkey, in, inlen);
1703        if (err)
1704                goto free;
1705
1706        mw->ibmw.rkey = mw->mmkey.key;
1707
1708        resp.response_length = min(offsetof(typeof(resp), response_length) +
1709                                   sizeof(resp.response_length), udata->outlen);
1710        if (resp.response_length) {
1711                err = ib_copy_to_udata(udata, &resp, resp.response_length);
1712                if (err) {
1713                        mlx5_core_destroy_mkey(dev->mdev, &mw->mmkey);
1714                        goto free;
1715                }
1716        }
1717
1718        kfree(in);
1719        return &mw->ibmw;
1720
1721free:
1722        kfree(mw);
1723        kfree(in);
1724        return ERR_PTR(err);
1725}
1726
1727int mlx5_ib_dealloc_mw(struct ib_mw *mw)
1728{
1729        struct mlx5_ib_mw *mmw = to_mmw(mw);
1730        int err;
1731
1732        err = mlx5_core_destroy_mkey(to_mdev(mw->device)->mdev,
1733                                     &mmw->mmkey);
1734        if (!err)
1735                kfree(mmw);
1736        return err;
1737}
1738
1739int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
1740                            struct ib_mr_status *mr_status)
1741{
1742        struct mlx5_ib_mr *mmr = to_mmr(ibmr);
1743        int ret = 0;
1744
1745        if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
1746                pr_err("Invalid status check mask\n");
1747                ret = -EINVAL;
1748                goto done;
1749        }
1750
1751        mr_status->fail_status = 0;
1752        if (check_mask & IB_MR_CHECK_SIG_STATUS) {
1753                if (!mmr->sig) {
1754                        ret = -EINVAL;
1755                        pr_err("signature status check requested on a non-signature enabled MR\n");
1756                        goto done;
1757                }
1758
1759                mmr->sig->sig_status_checked = true;
1760                if (!mmr->sig->sig_err_exists)
1761                        goto done;
1762
1763                if (ibmr->lkey == mmr->sig->err_item.key) {
1764                        memcpy(&mr_status->sig_err, &mmr->sig->err_item,
1765                               sizeof(mr_status->sig_err));
1766                } else {
1767                        mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
1768                        mr_status->sig_err.sig_err_offset = 0;
1769                        mr_status->sig_err.key = mmr->sig->err_item.key;
1770                }
1771
1772                mmr->sig->sig_err_exists = false;
1773                mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
1774        }
1775
1776done:
1777        return ret;
1778}
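/*
 * Illustrative sketch only (not part of this driver): how a consumer of
 * signature MRs would invoke the check implemented above once I/O has
 * completed.  The function name and the log text are assumptions for
 * the example.
 */
static int example_check_sig_status(struct ib_mr *sig_mr)
{
        struct ib_mr_status mr_status;
        int ret;

        ret = ib_check_mr_status(sig_mr, IB_MR_CHECK_SIG_STATUS, &mr_status);
        if (ret)
                return ret;

        if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS)
                pr_err("signature error type %d at offset %llu, key 0x%x\n",
                       mr_status.sig_err.err_type,
                       mr_status.sig_err.sig_err_offset,
                       mr_status.sig_err.key);

        return 0;
}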
1779
1780static int
1781mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
1782                   struct scatterlist *sgl,
1783                   unsigned short sg_nents,
1784                   unsigned int *sg_offset_p)
1785{
1786        struct scatterlist *sg = sgl;
1787        struct mlx5_klm *klms = mr->descs;
1788        unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
1789        u32 lkey = mr->ibmr.pd->local_dma_lkey;
1790        int i;
1791
1792        mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
1793        mr->ibmr.length = 0;
1794
1795        for_each_sg(sgl, sg, sg_nents, i) {
1796                if (unlikely(i >= mr->max_descs))
1797                        break;
1798                klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
1799                klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
1800                klms[i].key = cpu_to_be32(lkey);
1801                mr->ibmr.length += sg_dma_len(sg) - sg_offset;
1802
1803                sg_offset = 0;
1804        }
1805        mr->ndescs = i;
1806
1807        if (sg_offset_p)
1808                *sg_offset_p = sg_offset;
1809
1810        return i;
1811}
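/*
 * Worked example for mlx5_ib_sg_to_klms() above (addresses are made up
 * for illustration): with sg_offset = 0x100 and two DMA-mapped entries
 * of 8 KB each at bus addresses 0x10000 and 0x30000, the loop produces
 *
 *   klms[0] = { .va = 0x10100, .bcount = 0x1f00 }
 *   klms[1] = { .va = 0x30000, .bcount = 0x2000 }
 *
 * with ibmr.iova = 0x10100 and ibmr.length = 0x3f00, i.e. the sum of
 * the byte counts.
 */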
1812
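/*
 * ib_sg_to_pages() callback used by mlx5_ib_map_mr_sg() below for
 * MTT-based MRs: store one page address in the descriptor array with
 * MLX5_EN_RD | MLX5_EN_WR ORed in, or return -ENOMEM once all
 * max_descs entries are in use.
 */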
1813static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
1814{
1815        struct mlx5_ib_mr *mr = to_mmr(ibmr);
1816        __be64 *descs;
1817
1818        if (unlikely(mr->ndescs == mr->max_descs))
1819                return -ENOMEM;
1820
1821        descs = mr->descs;
1822        descs[mr->ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
1823
1824        return 0;
1825}
1826
1827int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
1828                      unsigned int *sg_offset)
1829{
1830        struct mlx5_ib_mr *mr = to_mmr(ibmr);
1831        int n;
1832
1833        mr->ndescs = 0;
1834
1835        ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
1836                                   mr->desc_size * mr->max_descs,
1837                                   DMA_TO_DEVICE);
1838
1839        if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
1840                n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset);
1841        else
1842                n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
1843                                mlx5_set_page);
1844
1845        ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
1846                                      mr->desc_size * mr->max_descs,
1847                                      DMA_TO_DEVICE);
1848
1849        return n;
1850}
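/*
 * Illustrative sketch only (not part of this driver): how a ULP maps an
 * S/G list onto an MR from this driver before posting a REG_MR work
 * request.  The function name, the sgl/nents parameters and the use of
 * PAGE_SIZE are assumptions for the example.
 */
static int example_map_mr(struct ib_mr *mr, struct scatterlist *sgl,
                          int nents)
{
        unsigned int sg_offset = 0;
        int n;

        /* Lands in mlx5_ib_map_mr_sg() above: a KLM-based MR copies the
         * S/G list as-is, an MTT-based MR needs it expressible in blocks
         * of the given page size. */
        n = ib_map_mr_sg(mr, sgl, nents, &sg_offset, PAGE_SIZE);
        if (n != nents)
                return n < 0 ? n : -EINVAL;     /* not every entry fit */

        return 0;
}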
1851