linux/drivers/infiniband/hw/mlx5/mr.c
   1/*
   2 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
   3 * Copyright (c) 2020, Intel Corporation. All rights reserved.
   4 *
   5 * This software is available to you under a choice of one of two
   6 * licenses.  You may choose to be licensed under the terms of the GNU
   7 * General Public License (GPL) Version 2, available from the file
   8 * COPYING in the main directory of this source tree, or the
   9 * OpenIB.org BSD license below:
  10 *
  11 *     Redistribution and use in source and binary forms, with or
  12 *     without modification, are permitted provided that the following
  13 *     conditions are met:
  14 *
  15 *      - Redistributions of source code must retain the above
  16 *        copyright notice, this list of conditions and the following
  17 *        disclaimer.
  18 *
  19 *      - Redistributions in binary form must reproduce the above
  20 *        copyright notice, this list of conditions and the following
  21 *        disclaimer in the documentation and/or other materials
  22 *        provided with the distribution.
  23 *
  24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  31 * SOFTWARE.
  32 */
  33
  34
  35#include <linux/kref.h>
  36#include <linux/random.h>
  37#include <linux/debugfs.h>
  38#include <linux/export.h>
  39#include <linux/delay.h>
  40#include <linux/dma-buf.h>
  41#include <linux/dma-resv.h>
  42#include <rdma/ib_umem.h>
  43#include <rdma/ib_umem_odp.h>
  44#include <rdma/ib_verbs.h>
  45#include "dm.h"
  46#include "mlx5_ib.h"
  47
  48/*
  49 * We can't use a static array for xlt_emergency_page because dma_map_single()
  50 * doesn't work on kernel module memory (it lives outside the linear mapping).
  51 */
  52void *xlt_emergency_page;
  53static DEFINE_MUTEX(xlt_emergency_page_mutex);
  54
  55enum {
  56        MAX_PENDING_REG_MR = 8,
  57};
  58
  59#define MLX5_UMR_ALIGN 2048
  60
  61static void
  62create_mkey_callback(int status, struct mlx5_async_work *context);
  63static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
  64                                     u64 iova, int access_flags,
  65                                     unsigned int page_size, bool populate);
  66
  67static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
  68                                          struct ib_pd *pd)
  69{
  70        struct mlx5_ib_dev *dev = to_mdev(pd->device);
  71        bool ro_pci_enabled = pcie_relaxed_ordering_enabled(dev->mdev->pdev);
  72
  73        MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
  74        MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
  75        MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
  76        MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
  77        MLX5_SET(mkc, mkc, lr, 1);
  78
  79        if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
  80                MLX5_SET(mkc, mkc, relaxed_ordering_write,
  81                         (acc & IB_ACCESS_RELAXED_ORDERING) && ro_pci_enabled);
  82        if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read))
  83                MLX5_SET(mkc, mkc, relaxed_ordering_read,
  84                         (acc & IB_ACCESS_RELAXED_ORDERING) && ro_pci_enabled);
  85
  86        MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
  87        MLX5_SET(mkc, mkc, qpn, 0xffffff);
  88        MLX5_SET64(mkc, mkc, start_addr, start_addr);
  89}
  90
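    /*
     * The low byte of an mkey (mkey_7_0) is chosen by software. A rolling
     * variant is assigned here and combined with the firmware-assigned mkey
     * index later (see create_mkey_callback()), so a reused index does not
     * reproduce the same key value.
     */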
  91static void
  92assign_mkey_variant(struct mlx5_ib_dev *dev, struct mlx5_core_mkey *mkey,
  93                    u32 *in)
  94{
  95        u8 key = atomic_inc_return(&dev->mkey_var);
  96        void *mkc;
  97
  98        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
  99        MLX5_SET(mkc, mkc, mkey_7_0, key);
 100        mkey->key = key;
 101}
 102
 103static int
 104mlx5_ib_create_mkey(struct mlx5_ib_dev *dev, struct mlx5_core_mkey *mkey,
 105                    u32 *in, int inlen)
 106{
 107        assign_mkey_variant(dev, mkey, in);
 108        return mlx5_core_create_mkey(dev->mdev, mkey, in, inlen);
 109}
 110
 111static int
 112mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev,
 113                       struct mlx5_core_mkey *mkey,
 114                       struct mlx5_async_ctx *async_ctx,
 115                       u32 *in, int inlen, u32 *out, int outlen,
 116                       struct mlx5_async_work *context)
 117{
 118        MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY);
 119        assign_mkey_variant(dev, mkey, in);
 120        return mlx5_cmd_exec_cb(async_ctx, in, inlen, out, outlen,
 121                                create_mkey_callback, context);
 122}
 123
 124static int mr_cache_max_order(struct mlx5_ib_dev *dev);
 125static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);
 126
 127static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
 128{
 129        return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled);
 130}
 131
 132static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 133{
 134        WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));
 135
 136        return mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
 137}
 138
 139static void create_mkey_callback(int status, struct mlx5_async_work *context)
 140{
 141        struct mlx5_ib_mr *mr =
 142                container_of(context, struct mlx5_ib_mr, cb_work);
 143        struct mlx5_cache_ent *ent = mr->cache_ent;
 144        struct mlx5_ib_dev *dev = ent->dev;
 145        unsigned long flags;
 146
 147        if (status) {
 148                mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
 149                kfree(mr);
 150                spin_lock_irqsave(&ent->lock, flags);
 151                ent->pending--;
 152                WRITE_ONCE(dev->fill_delay, 1);
 153                spin_unlock_irqrestore(&ent->lock, flags);
 154                mod_timer(&dev->delay_timer, jiffies + HZ);
 155                return;
 156        }
 157
 158        mr->mmkey.type = MLX5_MKEY_MR;
 159        mr->mmkey.key |= mlx5_idx_to_mkey(
 160                MLX5_GET(create_mkey_out, mr->out, mkey_index));
 161        init_waitqueue_head(&mr->mmkey.wait);
 162
 163        WRITE_ONCE(dev->cache.last_add, jiffies);
 164
 165        spin_lock_irqsave(&ent->lock, flags);
 166        list_add_tail(&mr->list, &ent->head);
 167        ent->available_mrs++;
 168        ent->total_mrs++;
 169        /* If we are doing fill_to_high_water then keep going. */
 170        queue_adjust_cache_locked(ent);
 171        ent->pending--;
 172        spin_unlock_irqrestore(&ent->lock, flags);
 173}
 174
 175static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc)
 176{
 177        struct mlx5_ib_mr *mr;
 178
 179        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 180        if (!mr)
 181                return NULL;
 182        mr->cache_ent = ent;
 183
 184        set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd);
 185        MLX5_SET(mkc, mkc, free, 1);
 186        MLX5_SET(mkc, mkc, umr_en, 1);
 187        MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3);
 188        MLX5_SET(mkc, mkc, access_mode_4_2, (ent->access_mode >> 2) & 0x7);
 189
 190        MLX5_SET(mkc, mkc, translations_octword_size, ent->xlt);
 191        MLX5_SET(mkc, mkc, log_page_size, ent->page);
 192        return mr;
 193}
 194
 195/* Asynchronously schedule new MRs to be populated in the cache. */
 196static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
 197{
 198        size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
 199        struct mlx5_ib_mr *mr;
 200        void *mkc;
 201        u32 *in;
 202        int err = 0;
 203        int i;
 204
 205        in = kzalloc(inlen, GFP_KERNEL);
 206        if (!in)
 207                return -ENOMEM;
 208
 209        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 210        for (i = 0; i < num; i++) {
 211                mr = alloc_cache_mr(ent, mkc);
 212                if (!mr) {
 213                        err = -ENOMEM;
 214                        break;
 215                }
 216                spin_lock_irq(&ent->lock);
 217                if (ent->pending >= MAX_PENDING_REG_MR) {
 218                        err = -EAGAIN;
 219                        spin_unlock_irq(&ent->lock);
 220                        kfree(mr);
 221                        break;
 222                }
 223                ent->pending++;
 224                spin_unlock_irq(&ent->lock);
 225                err = mlx5_ib_create_mkey_cb(ent->dev, &mr->mmkey,
 226                                             &ent->dev->async_ctx, in, inlen,
 227                                             mr->out, sizeof(mr->out),
 228                                             &mr->cb_work);
 229                if (err) {
 230                        spin_lock_irq(&ent->lock);
 231                        ent->pending--;
 232                        spin_unlock_irq(&ent->lock);
 233                        mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
 234                        kfree(mr);
 235                        break;
 236                }
 237        }
 238
 239        kfree(in);
 240        return err;
 241}
 242
 243/* Synchronously create a MR in the cache */
 244static struct mlx5_ib_mr *create_cache_mr(struct mlx5_cache_ent *ent)
 245{
 246        size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
 247        struct mlx5_ib_mr *mr;
 248        void *mkc;
 249        u32 *in;
 250        int err;
 251
 252        in = kzalloc(inlen, GFP_KERNEL);
 253        if (!in)
 254                return ERR_PTR(-ENOMEM);
 255        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 256
 257        mr = alloc_cache_mr(ent, mkc);
 258        if (!mr) {
 259                err = -ENOMEM;
 260                goto free_in;
 261        }
 262
 263        err = mlx5_core_create_mkey(ent->dev->mdev, &mr->mmkey, in, inlen);
 264        if (err)
 265                goto free_mr;
 266
 267        mr->mmkey.type = MLX5_MKEY_MR;
 268        WRITE_ONCE(ent->dev->cache.last_add, jiffies);
 269        spin_lock_irq(&ent->lock);
 270        ent->total_mrs++;
 271        spin_unlock_irq(&ent->lock);
 272        kfree(in);
 273        return mr;
 274free_mr:
 275        kfree(mr);
 276free_in:
 277        kfree(in);
 278        return ERR_PTR(err);
 279}
 280
 281static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
 282{
 283        struct mlx5_ib_mr *mr;
 284
 285        lockdep_assert_held(&ent->lock);
 286        if (list_empty(&ent->head))
 287                return;
 288        mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
 289        list_del(&mr->list);
 290        ent->available_mrs--;
 291        ent->total_mrs--;
 292        spin_unlock_irq(&ent->lock);
 293        mlx5_core_destroy_mkey(ent->dev->mdev, &mr->mmkey);
 294        kfree(mr);
 295        spin_lock_irq(&ent->lock);
 296}
 297
 298static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
 299                                bool limit_fill)
 300{
 301        int err;
 302
 303        lockdep_assert_held(&ent->lock);
 304
 305        while (true) {
 306                if (limit_fill)
 307                        target = ent->limit * 2;
 308                if (target == ent->available_mrs + ent->pending)
 309                        return 0;
 310                if (target > ent->available_mrs + ent->pending) {
 311                        u32 todo = target - (ent->available_mrs + ent->pending);
 312
 313                        spin_unlock_irq(&ent->lock);
 314                        err = add_keys(ent, todo);
 315                        if (err == -EAGAIN)
 316                                usleep_range(3000, 5000);
 317                        spin_lock_irq(&ent->lock);
 318                        if (err) {
 319                                if (err != -EAGAIN)
 320                                        return err;
 321                        } else
 322                                return 0;
 323                } else {
 324                        remove_cache_mr_locked(ent);
 325                }
 326        }
 327}
 328
 329static ssize_t size_write(struct file *filp, const char __user *buf,
 330                          size_t count, loff_t *pos)
 331{
 332        struct mlx5_cache_ent *ent = filp->private_data;
 333        u32 target;
 334        int err;
 335
 336        err = kstrtou32_from_user(buf, count, 0, &target);
 337        if (err)
 338                return err;
 339
 340        /*
 341         * Target is the new value of total_mrs the user requests; however, we
 342         * cannot free MRs that are in use. Compute the target value for
 343         * available_mrs.
 344         */
 345        spin_lock_irq(&ent->lock);
 346        if (target < ent->total_mrs - ent->available_mrs) {
 347                err = -EINVAL;
 348                goto err_unlock;
 349        }
 350        target = target - (ent->total_mrs - ent->available_mrs);
 351        if (target < ent->limit || target > ent->limit*2) {
 352                err = -EINVAL;
 353                goto err_unlock;
 354        }
 355        err = resize_available_mrs(ent, target, false);
 356        if (err)
 357                goto err_unlock;
 358        spin_unlock_irq(&ent->lock);
 359
 360        return count;
 361
 362err_unlock:
 363        spin_unlock_irq(&ent->lock);
 364        return err;
 365}
 366
 367static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
 368                         loff_t *pos)
 369{
 370        struct mlx5_cache_ent *ent = filp->private_data;
 371        char lbuf[20];
 372        int err;
 373
 374        err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->total_mrs);
 375        if (err < 0)
 376                return err;
 377
 378        return simple_read_from_buffer(buf, count, pos, lbuf, err);
 379}
 380
 381static const struct file_operations size_fops = {
 382        .owner  = THIS_MODULE,
 383        .open   = simple_open,
 384        .write  = size_write,
 385        .read   = size_read,
 386};
 387
 388static ssize_t limit_write(struct file *filp, const char __user *buf,
 389                           size_t count, loff_t *pos)
 390{
 391        struct mlx5_cache_ent *ent = filp->private_data;
 392        u32 var;
 393        int err;
 394
 395        err = kstrtou32_from_user(buf, count, 0, &var);
 396        if (err)
 397                return err;
 398
 399        /*
 400         * Upon set, we immediately fill the cache to the high water mark
 401         * implied by the limit.
 402         */
 403        spin_lock_irq(&ent->lock);
 404        ent->limit = var;
 405        err = resize_available_mrs(ent, 0, true);
 406        spin_unlock_irq(&ent->lock);
 407        if (err)
 408                return err;
 409        return count;
 410}
 411
 412static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
 413                          loff_t *pos)
 414{
 415        struct mlx5_cache_ent *ent = filp->private_data;
 416        char lbuf[20];
 417        int err;
 418
 419        err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
 420        if (err < 0)
 421                return err;
 422
 423        return simple_read_from_buffer(buf, count, pos, lbuf, err);
 424}
 425
 426static const struct file_operations limit_fops = {
 427        .owner  = THIS_MODULE,
 428        .open   = simple_open,
 429        .write  = limit_write,
 430        .read   = limit_read,
 431};
 432
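    /* True if any cache bucket is currently below its low water mark. */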
 433static bool someone_adding(struct mlx5_mr_cache *cache)
 434{
 435        unsigned int i;
 436
 437        for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
 438                struct mlx5_cache_ent *ent = &cache->ent[i];
 439                bool ret;
 440
 441                spin_lock_irq(&ent->lock);
 442                ret = ent->available_mrs < ent->limit;
 443                spin_unlock_irq(&ent->lock);
 444                if (ret)
 445                        return true;
 446        }
 447        return false;
 448}
 449
 450/*
 451 * Check if the bucket is outside the high/low water marks and schedule an
 452 * async update. The cache refill has hysteresis: once the low water mark is
 453 * hit, the bucket is refilled up to the high mark.
 454 */
 455static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
 456{
 457        lockdep_assert_held(&ent->lock);
 458
 459        if (ent->disabled || READ_ONCE(ent->dev->fill_delay))
 460                return;
 461        if (ent->available_mrs < ent->limit) {
 462                ent->fill_to_high_water = true;
 463                queue_work(ent->dev->cache.wq, &ent->work);
 464        } else if (ent->fill_to_high_water &&
 465                   ent->available_mrs + ent->pending < 2 * ent->limit) {
 466                /*
 467                 * Once we start populating due to hitting a low water mark,
 468                 * continue until we pass the high water mark.
 469                 */
 470                queue_work(ent->dev->cache.wq, &ent->work);
 471        } else if (ent->available_mrs == 2 * ent->limit) {
 472                ent->fill_to_high_water = false;
 473        } else if (ent->available_mrs > 2 * ent->limit) {
 474                /* Queue deletion of excess entries */
 475                ent->fill_to_high_water = false;
 476                if (ent->pending)
 477                        queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
 478                                           msecs_to_jiffies(1000));
 479                else
 480                        queue_work(ent->dev->cache.wq, &ent->work);
 481        }
 482}
 483
 484static void __cache_work_func(struct mlx5_cache_ent *ent)
 485{
 486        struct mlx5_ib_dev *dev = ent->dev;
 487        struct mlx5_mr_cache *cache = &dev->cache;
 488        int err;
 489
 490        spin_lock_irq(&ent->lock);
 491        if (ent->disabled)
 492                goto out;
 493
 494        if (ent->fill_to_high_water &&
 495            ent->available_mrs + ent->pending < 2 * ent->limit &&
 496            !READ_ONCE(dev->fill_delay)) {
 497                spin_unlock_irq(&ent->lock);
 498                err = add_keys(ent, 1);
 499                spin_lock_irq(&ent->lock);
 500                if (ent->disabled)
 501                        goto out;
 502                if (err) {
 503                        /*
 504                         * EAGAIN only happens if pending is positive, so we
 505                         * will be rescheduled from create_mkey_callback().
 506                         * The only failure path here is ENOMEM.
 507                         */
 508                        if (err != -EAGAIN) {
 509                                mlx5_ib_warn(
 510                                        dev,
 511                                        "command failed order %d, err %d\n",
 512                                        ent->order, err);
 513                                queue_delayed_work(cache->wq, &ent->dwork,
 514                                                   msecs_to_jiffies(1000));
 515                        }
 516                }
 517        } else if (ent->available_mrs > 2 * ent->limit) {
 518                bool need_delay;
 519
 520                /*
 521                 * The remove_cache_mr_locked() logic runs as a garbage
 522                 * collection task, which is intended to run only when no
 523                 * other active processes are running.
 524                 *
 525                 * need_resched() returns TRUE if there are user tasks to be
 526                 * activated in the near future.
 527                 *
 528                 * In that case, we don't call remove_cache_mr_locked() and
 529                 * postpone the garbage collection work to the next cycle, in
 530                 * order to free CPU resources for other tasks.
 531                 */
 532                spin_unlock_irq(&ent->lock);
 533                need_delay = need_resched() || someone_adding(cache) ||
 534                             !time_after(jiffies,
 535                                         READ_ONCE(cache->last_add) + 300 * HZ);
 536                spin_lock_irq(&ent->lock);
 537                if (ent->disabled)
 538                        goto out;
 539                if (need_delay)
 540                        queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
 541                remove_cache_mr_locked(ent);
 542                queue_adjust_cache_locked(ent);
 543        }
 544out:
 545        spin_unlock_irq(&ent->lock);
 546}
 547
 548static void delayed_cache_work_func(struct work_struct *work)
 549{
 550        struct mlx5_cache_ent *ent;
 551
 552        ent = container_of(work, struct mlx5_cache_ent, dwork.work);
 553        __cache_work_func(ent);
 554}
 555
 556static void cache_work_func(struct work_struct *work)
 557{
 558        struct mlx5_cache_ent *ent;
 559
 560        ent = container_of(work, struct mlx5_cache_ent, work);
 561        __cache_work_func(ent);
 562}
 563
 564/* Allocate a special entry from the cache */
 565struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
 566                                       unsigned int entry, int access_flags)
 567{
 568        struct mlx5_mr_cache *cache = &dev->cache;
 569        struct mlx5_cache_ent *ent;
 570        struct mlx5_ib_mr *mr;
 571
 572        if (WARN_ON(entry <= MR_CACHE_LAST_STD_ENTRY ||
 573                    entry >= ARRAY_SIZE(cache->ent)))
 574                return ERR_PTR(-EINVAL);
 575
 576        /* Matches access in alloc_cache_mr() */
 577        if (!mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags))
 578                return ERR_PTR(-EOPNOTSUPP);
 579
 580        ent = &cache->ent[entry];
 581        spin_lock_irq(&ent->lock);
 582        if (list_empty(&ent->head)) {
 583                spin_unlock_irq(&ent->lock);
 584                mr = create_cache_mr(ent);
 585                if (IS_ERR(mr))
 586                        return mr;
 587        } else {
 588                mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
 589                list_del(&mr->list);
 590                ent->available_mrs--;
 591                queue_adjust_cache_locked(ent);
 592                spin_unlock_irq(&ent->lock);
 593
 594                mlx5_clear_mr(mr);
 595        }
 596        mr->access_flags = access_flags;
 597        return mr;
 598}
 599
 600/* Return a MR already available in the cache */
 601static struct mlx5_ib_mr *get_cache_mr(struct mlx5_cache_ent *req_ent)
 602{
 603        struct mlx5_ib_dev *dev = req_ent->dev;
 604        struct mlx5_ib_mr *mr = NULL;
 605        struct mlx5_cache_ent *ent = req_ent;
 606
 607        /* Try larger MR pools from the cache to satisfy the allocation */
 608        for (; ent != &dev->cache.ent[MR_CACHE_LAST_STD_ENTRY + 1]; ent++) {
 609                mlx5_ib_dbg(dev, "order %u, cache index %zu\n", ent->order,
 610                            ent - dev->cache.ent);
 611
 612                spin_lock_irq(&ent->lock);
 613                if (!list_empty(&ent->head)) {
 614                        mr = list_first_entry(&ent->head, struct mlx5_ib_mr,
 615                                              list);
 616                        list_del(&mr->list);
 617                        ent->available_mrs--;
 618                        queue_adjust_cache_locked(ent);
 619                        spin_unlock_irq(&ent->lock);
 620                        mlx5_clear_mr(mr);
 621                        return mr;
 622                }
 623                queue_adjust_cache_locked(ent);
 624                spin_unlock_irq(&ent->lock);
 625        }
 626        req_ent->miss++;
 627        return NULL;
 628}
 629
 630static void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 631{
 632        struct mlx5_cache_ent *ent = mr->cache_ent;
 633
 634        spin_lock_irq(&ent->lock);
 635        list_add_tail(&mr->list, &ent->head);
 636        ent->available_mrs++;
 637        queue_adjust_cache_locked(ent);
 638        spin_unlock_irq(&ent->lock);
 639}
 640
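    /*
     * Drain cache bucket @c: unlink each cached MR under the lock, destroy
     * its mkey with the lock dropped, then free the tracking structures.
     */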
 641static void clean_keys(struct mlx5_ib_dev *dev, int c)
 642{
 643        struct mlx5_mr_cache *cache = &dev->cache;
 644        struct mlx5_cache_ent *ent = &cache->ent[c];
 645        struct mlx5_ib_mr *tmp_mr;
 646        struct mlx5_ib_mr *mr;
 647        LIST_HEAD(del_list);
 648
 649        cancel_delayed_work(&ent->dwork);
 650        while (1) {
 651                spin_lock_irq(&ent->lock);
 652                if (list_empty(&ent->head)) {
 653                        spin_unlock_irq(&ent->lock);
 654                        break;
 655                }
 656                mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
 657                list_move(&mr->list, &del_list);
 658                ent->available_mrs--;
 659                ent->total_mrs--;
 660                spin_unlock_irq(&ent->lock);
 661                mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
 662        }
 663
 664        list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
 665                list_del(&mr->list);
 666                kfree(mr);
 667        }
 668}
 669
 670static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
 671{
 672        if (!mlx5_debugfs_root || dev->is_rep)
 673                return;
 674
 675        debugfs_remove_recursive(dev->cache.root);
 676        dev->cache.root = NULL;
 677}
 678
 679static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
 680{
 681        struct mlx5_mr_cache *cache = &dev->cache;
 682        struct mlx5_cache_ent *ent;
 683        struct dentry *dir;
 684        int i;
 685
 686        if (!mlx5_debugfs_root || dev->is_rep)
 687                return;
 688
 689        cache->root = debugfs_create_dir("mr_cache", dev->mdev->priv.dbg_root);
 690
 691        for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
 692                ent = &cache->ent[i];
 693                sprintf(ent->name, "%d", ent->order);
 694                dir = debugfs_create_dir(ent->name, cache->root);
 695                debugfs_create_file("size", 0600, dir, ent, &size_fops);
 696                debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
 697                debugfs_create_u32("cur", 0400, dir, &ent->available_mrs);
 698                debugfs_create_u32("miss", 0600, dir, &ent->miss);
 699        }
 700}
 701
 702static void delay_time_func(struct timer_list *t)
 703{
 704        struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer);
 705
 706        WRITE_ONCE(dev->fill_delay, 0);
 707}
 708
 709int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 710{
 711        struct mlx5_mr_cache *cache = &dev->cache;
 712        struct mlx5_cache_ent *ent;
 713        int i;
 714
 715        mutex_init(&dev->slow_path_mutex);
 716        cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
 717        if (!cache->wq) {
 718                mlx5_ib_warn(dev, "failed to create work queue\n");
 719                return -ENOMEM;
 720        }
 721
 722        mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
 723        timer_setup(&dev->delay_timer, delay_time_func, 0);
 724        for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
 725                ent = &cache->ent[i];
 726                INIT_LIST_HEAD(&ent->head);
 727                spin_lock_init(&ent->lock);
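                    /* Bucket i holds mkeys with room for 2^(i + 2) MTT entries. */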
 728                ent->order = i + 2;
 729                ent->dev = dev;
 730                ent->limit = 0;
 731
 732                INIT_WORK(&ent->work, cache_work_func);
 733                INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
 734
 735                if (i > MR_CACHE_LAST_STD_ENTRY) {
 736                        mlx5_odp_init_mr_cache_entry(ent);
 737                        continue;
 738                }
 739
 740                if (ent->order > mr_cache_max_order(dev))
 741                        continue;
 742
 743                ent->page = PAGE_SHIFT;
 744                ent->xlt = (1 << ent->order) * sizeof(struct mlx5_mtt) /
 745                           MLX5_IB_UMR_OCTOWORD;
 746                ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
 747                if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) &&
 748                    !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
 749                    mlx5_ib_can_load_pas_with_umr(dev, 0))
 750                        ent->limit = dev->mdev->profile.mr_cache[i].limit;
 751                else
 752                        ent->limit = 0;
 753                spin_lock_irq(&ent->lock);
 754                queue_adjust_cache_locked(ent);
 755                spin_unlock_irq(&ent->lock);
 756        }
 757
 758        mlx5_mr_cache_debugfs_init(dev);
 759
 760        return 0;
 761}
 762
 763int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
 764{
 765        unsigned int i;
 766
 767        if (!dev->cache.wq)
 768                return 0;
 769
 770        for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
 771                struct mlx5_cache_ent *ent = &dev->cache.ent[i];
 772
 773                spin_lock_irq(&ent->lock);
 774                ent->disabled = true;
 775                spin_unlock_irq(&ent->lock);
 776                cancel_work_sync(&ent->work);
 777                cancel_delayed_work_sync(&ent->dwork);
 778        }
 779
 780        mlx5_mr_cache_debugfs_cleanup(dev);
 781        mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
 782
 783        for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)
 784                clean_keys(dev, i);
 785
 786        destroy_workqueue(dev->cache.wq);
 787        del_timer_sync(&dev->delay_timer);
 788
 789        return 0;
 790}
 791
 792struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
 793{
 794        struct mlx5_ib_dev *dev = to_mdev(pd->device);
 795        int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
 796        struct mlx5_ib_mr *mr;
 797        void *mkc;
 798        u32 *in;
 799        int err;
 800
 801        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 802        if (!mr)
 803                return ERR_PTR(-ENOMEM);
 804
 805        in = kzalloc(inlen, GFP_KERNEL);
 806        if (!in) {
 807                err = -ENOMEM;
 808                goto err_free;
 809        }
 810
 811        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 812
 813        MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
 814        MLX5_SET(mkc, mkc, length64, 1);
 815        set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0,
 816                                      pd);
 817
 818        err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
 819        if (err)
 820                goto err_in;
 821
 822        kfree(in);
 823        mr->mmkey.type = MLX5_MKEY_MR;
 824        mr->ibmr.lkey = mr->mmkey.key;
 825        mr->ibmr.rkey = mr->mmkey.key;
 826        mr->umem = NULL;
 827
 828        return &mr->ibmr;
 829
 830err_in:
 831        kfree(in);
 832
 833err_free:
 834        kfree(mr);
 835
 836        return ERR_PTR(err);
 837}
 838
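    /*
     * Number of 16-byte octowords needed to hold one 8-byte MTT per page of
     * [addr, addr + len): two MTTs fit in each octoword, rounded up.
     */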
 839static int get_octo_len(u64 addr, u64 len, int page_shift)
 840{
 841        u64 page_size = 1ULL << page_shift;
 842        u64 offset;
 843        int npages;
 844
 845        offset = addr & (page_size - 1);
 846        npages = ALIGN(len + offset, page_size) >> page_shift;
 847        return (npages + 1) / 2;
 848}
 849
 850static int mr_cache_max_order(struct mlx5_ib_dev *dev)
 851{
 852        if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
 853                return MR_CACHE_LAST_STD_ENTRY + 2;
 854        return MLX5_MAX_UMR_SHIFT;
 855}
 856
 857static void mlx5_ib_umr_done(struct ib_cq *cq, struct ib_wc *wc)
 858{
 859        struct mlx5_ib_umr_context *context =
 860                container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe);
 861
 862        context->status = wc->status;
 863        complete(&context->done);
 864}
 865
 866static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context)
 867{
 868        context->cqe.done = mlx5_ib_umr_done;
 869        context->status = -1;
 870        init_completion(&context->done);
 871}
 872
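    /*
     * Post a UMR work request and wait for its completion. The umrc
     * semaphore bounds how many UMR requests may be in flight at once.
     */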
 873static int mlx5_ib_post_send_wait(struct mlx5_ib_dev *dev,
 874                                  struct mlx5_umr_wr *umrwr)
 875{
 876        struct umr_common *umrc = &dev->umrc;
 877        const struct ib_send_wr *bad;
 878        int err;
 879        struct mlx5_ib_umr_context umr_context;
 880
 881        mlx5_ib_init_umr_context(&umr_context);
 882        umrwr->wr.wr_cqe = &umr_context.cqe;
 883
 884        down(&umrc->sem);
 885        err = ib_post_send(umrc->qp, &umrwr->wr, &bad);
 886        if (err) {
 887                mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err);
 888        } else {
 889                wait_for_completion(&umr_context.done);
 890                if (umr_context.status != IB_WC_SUCCESS) {
 891                        mlx5_ib_warn(dev, "reg umr failed (%u)\n",
 892                                     umr_context.status);
 893                        err = -EFAULT;
 894                }
 895        }
 896        up(&umrc->sem);
 897        return err;
 898}
 899
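    /*
     * Map a registration order (log2 of the number of pages) to the cache
     * bucket that serves it. Orders below the smallest bucket use bucket 0;
     * orders above the last standard entry are not cached.
     */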
 900static struct mlx5_cache_ent *mr_cache_ent_from_order(struct mlx5_ib_dev *dev,
 901                                                      unsigned int order)
 902{
 903        struct mlx5_mr_cache *cache = &dev->cache;
 904
 905        if (order < cache->ent[0].order)
 906                return &cache->ent[0];
 907        order = order - cache->ent[0].order;
 908        if (order > MR_CACHE_LAST_STD_ENTRY)
 909                return NULL;
 910        return &cache->ent[order];
 911}
 912
 913static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
 914                          u64 length, int access_flags)
 915{
 916        mr->ibmr.lkey = mr->mmkey.key;
 917        mr->ibmr.rkey = mr->mmkey.key;
 918        mr->ibmr.length = length;
 919        mr->ibmr.device = &dev->ib_dev;
 920        mr->access_flags = access_flags;
 921}
 922
 923static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
 924                                                  u64 iova)
 925{
 926        /*
 927         * The alignment of iova has already been checked upon entering
 928         * UVERBS_METHOD_REG_DMABUF_MR
 929         */
 930        umem->iova = iova;
 931        return PAGE_SIZE;
 932}
 933
 934static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
 935                                             struct ib_umem *umem, u64 iova,
 936                                             int access_flags)
 937{
 938        struct mlx5_ib_dev *dev = to_mdev(pd->device);
 939        struct mlx5_cache_ent *ent;
 940        struct mlx5_ib_mr *mr;
 941        unsigned int page_size;
 942
 943        if (umem->is_dmabuf)
 944                page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
 945        else
 946                page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size,
 947                                                     0, iova);
 948        if (WARN_ON(!page_size))
 949                return ERR_PTR(-EINVAL);
 950        ent = mr_cache_ent_from_order(
 951                dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size)));
 952        /*
 953         * Matches access in alloc_cache_mr(). If the MR can't come from the
 954         * cache then synchronously create an uncached one.
 955         */
 956        if (!ent || ent->limit == 0 ||
 957            !mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags)) {
 958                mutex_lock(&dev->slow_path_mutex);
 959                mr = reg_create(pd, umem, iova, access_flags, page_size, false);
 960                mutex_unlock(&dev->slow_path_mutex);
 961                return mr;
 962        }
 963
 964        mr = get_cache_mr(ent);
 965        if (!mr) {
 966                mr = create_cache_mr(ent);
 967                /*
 968                 * The above already tried to do the same stuff as reg_create(),
 969                 * no reason to try it again.
 970                 */
 971                if (IS_ERR(mr))
 972                        return mr;
 973        }
 974
 975        mr->ibmr.pd = pd;
 976        mr->umem = umem;
 977        mr->mmkey.iova = iova;
 978        mr->mmkey.size = umem->length;
 979        mr->mmkey.pd = to_mpd(pd)->pdn;
 980        mr->page_shift = order_base_2(page_size);
 981        set_mr_fields(dev, mr, umem->length, access_flags);
 982
 983        return mr;
 984}
 985
 986#define MLX5_MAX_UMR_CHUNK ((1 << (MLX5_MAX_UMR_SHIFT + 4)) - \
 987                            MLX5_UMR_MTT_ALIGNMENT)
 988#define MLX5_SPARE_UMR_CHUNK 0x10000
 989
 990/*
 991 * Allocate a temporary buffer to hold the per-page information to transfer to
 992 * HW. For efficiency this should be as large as it can be, but buffer
 993 * allocation failure is not allowed, so try smaller sizes.
 994 */
 995static void *mlx5_ib_alloc_xlt(size_t *nents, size_t ent_size, gfp_t gfp_mask)
 996{
 997        const size_t xlt_chunk_align =
 998                MLX5_UMR_MTT_ALIGNMENT / ent_size;
 999        size_t size;
1000        void *res = NULL;
1001
1002        static_assert(PAGE_SIZE % MLX5_UMR_MTT_ALIGNMENT == 0);
1003
1004        /*
1005         * MLX5_IB_UPD_XLT_ATOMIC doesn't signal an atomic context, only that
1006         * the allocation can't trigger any kind of reclaim.
1007         */
1008        might_sleep();
1009
1010        gfp_mask |= __GFP_ZERO | __GFP_NORETRY;
1011
1012        /*
1013         * If the system already has a suitable high order page then just use
1014         * that, but don't try hard to create one. This max is about 1M, so a
1015         * free x86 huge page will satisfy it.
1016         */
1017        size = min_t(size_t, ent_size * ALIGN(*nents, xlt_chunk_align),
1018                     MLX5_MAX_UMR_CHUNK);
1019        *nents = size / ent_size;
1020        res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
1021                                       get_order(size));
1022        if (res)
1023                return res;
1024
1025        if (size > MLX5_SPARE_UMR_CHUNK) {
1026                size = MLX5_SPARE_UMR_CHUNK;
1027                *nents = size / ent_size;
1028                res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
1029                                               get_order(size));
1030                if (res)
1031                        return res;
1032        }
1033
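            /*
             * Last resorts: a single page, then the preallocated emergency
             * page.
             */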
1034        *nents = PAGE_SIZE / ent_size;
1035        res = (void *)__get_free_page(gfp_mask);
1036        if (res)
1037                return res;
1038
1039        mutex_lock(&xlt_emergency_page_mutex);
1040        memset(xlt_emergency_page, 0, PAGE_SIZE);
1041        return xlt_emergency_page;
1042}
1043
1044static void mlx5_ib_free_xlt(void *xlt, size_t length)
1045{
1046        if (xlt == xlt_emergency_page) {
1047                mutex_unlock(&xlt_emergency_page_mutex);
1048                return;
1049        }
1050
1051        free_pages((unsigned long)xlt, get_order(length));
1052}
1053
1054/*
1055 * Create a MLX5_IB_SEND_UMR_UPDATE_XLT work request and XLT buffer ready for
1056 * submission.
1057 */
1058static void *mlx5_ib_create_xlt_wr(struct mlx5_ib_mr *mr,
1059                                   struct mlx5_umr_wr *wr, struct ib_sge *sg,
1060                                   size_t nents, size_t ent_size,
1061                                   unsigned int flags)
1062{
1063        struct mlx5_ib_dev *dev = mr_to_mdev(mr);
1064        struct device *ddev = &dev->mdev->pdev->dev;
1065        dma_addr_t dma;
1066        void *xlt;
1067
1068        xlt = mlx5_ib_alloc_xlt(&nents, ent_size,
1069                                flags & MLX5_IB_UPD_XLT_ATOMIC ? GFP_ATOMIC :
1070                                                                 GFP_KERNEL);
1071        sg->length = nents * ent_size;
1072        dma = dma_map_single(ddev, xlt, sg->length, DMA_TO_DEVICE);
1073        if (dma_mapping_error(ddev, dma)) {
1074                mlx5_ib_err(dev, "unable to map DMA during XLT update.\n");
1075                mlx5_ib_free_xlt(xlt, sg->length);
1076                return NULL;
1077        }
1078        sg->addr = dma;
1079        sg->lkey = dev->umrc.pd->local_dma_lkey;
1080
1081        memset(wr, 0, sizeof(*wr));
1082        wr->wr.send_flags = MLX5_IB_SEND_UMR_UPDATE_XLT;
1083        if (!(flags & MLX5_IB_UPD_XLT_ENABLE))
1084                wr->wr.send_flags |= MLX5_IB_SEND_UMR_FAIL_IF_FREE;
1085        wr->wr.sg_list = sg;
1086        wr->wr.num_sge = 1;
1087        wr->wr.opcode = MLX5_IB_WR_UMR;
1088        wr->pd = mr->ibmr.pd;
1089        wr->mkey = mr->mmkey.key;
1090        wr->length = mr->mmkey.size;
1091        wr->virt_addr = mr->mmkey.iova;
1092        wr->access_flags = mr->access_flags;
1093        wr->page_shift = mr->page_shift;
1094        wr->xlt_size = sg->length;
1095        return xlt;
1096}
1097
1098static void mlx5_ib_unmap_free_xlt(struct mlx5_ib_dev *dev, void *xlt,
1099                                   struct ib_sge *sg)
1100{
1101        struct device *ddev = &dev->mdev->pdev->dev;
1102
1103        dma_unmap_single(ddev, sg->addr, sg->length, DMA_TO_DEVICE);
1104        mlx5_ib_free_xlt(xlt, sg->length);
1105}
1106
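    /*
     * Send flags that must be added to the last UMR work request of a
     * multi-post XLT update, derived from the update flags.
     */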
1107static unsigned int xlt_wr_final_send_flags(unsigned int flags)
1108{
1109        unsigned int res = 0;
1110
1111        if (flags & MLX5_IB_UPD_XLT_ENABLE)
1112                res |= MLX5_IB_SEND_UMR_ENABLE_MR |
1113                       MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS |
1114                       MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
1115        if (flags & MLX5_IB_UPD_XLT_PD || flags & MLX5_IB_UPD_XLT_ACCESS)
1116                res |= MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
1117        if (flags & MLX5_IB_UPD_XLT_ADDR)
1118                res |= MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
1119        return res;
1120}
1121
1122int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
1123                       int page_shift, int flags)
1124{
1125        struct mlx5_ib_dev *dev = mr_to_mdev(mr);
1126        struct device *ddev = &dev->mdev->pdev->dev;
1127        void *xlt;
1128        struct mlx5_umr_wr wr;
1129        struct ib_sge sg;
1130        int err = 0;
1131        int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT)
1132                               ? sizeof(struct mlx5_klm)
1133                               : sizeof(struct mlx5_mtt);
1134        const int page_align = MLX5_UMR_MTT_ALIGNMENT / desc_size;
1135        const int page_mask = page_align - 1;
1136        size_t pages_mapped = 0;
1137        size_t pages_to_map = 0;
1138        size_t pages_iter;
1139        size_t size_to_map = 0;
1140        size_t orig_sg_length;
1141
1142        if ((flags & MLX5_IB_UPD_XLT_INDIRECT) &&
1143            !umr_can_use_indirect_mkey(dev))
1144                return -EPERM;
1145
1146        if (WARN_ON(!mr->umem->is_odp))
1147                return -EINVAL;
1148
1149        /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes,
1150         * so we need to align the offset and length accordingly
1151         */
1152        if (idx & page_mask) {
1153                npages += idx & page_mask;
1154                idx &= ~page_mask;
1155        }
1156        pages_to_map = ALIGN(npages, page_align);
1157
1158        xlt = mlx5_ib_create_xlt_wr(mr, &wr, &sg, npages, desc_size, flags);
1159        if (!xlt)
1160                return -ENOMEM;
1161        pages_iter = sg.length / desc_size;
1162        orig_sg_length = sg.length;
1163
1164        if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) {
1165                struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
1166                size_t max_pages = ib_umem_odp_num_pages(odp) - idx;
1167
1168                pages_to_map = min_t(size_t, pages_to_map, max_pages);
1169        }
1170
1171        wr.page_shift = page_shift;
1172
1173        for (pages_mapped = 0;
1174             pages_mapped < pages_to_map && !err;
1175             pages_mapped += pages_iter, idx += pages_iter) {
1176                npages = min_t(int, pages_iter, pages_to_map - pages_mapped);
1177                size_to_map = npages * desc_size;
1178                dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
1179                                        DMA_TO_DEVICE);
1180                mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
1181                dma_sync_single_for_device(ddev, sg.addr, sg.length,
1182                                           DMA_TO_DEVICE);
1183
1184                sg.length = ALIGN(size_to_map, MLX5_UMR_MTT_ALIGNMENT);
1185
1186                if (pages_mapped + pages_iter >= pages_to_map)
1187                        wr.wr.send_flags |= xlt_wr_final_send_flags(flags);
1188
1189                wr.offset = idx * desc_size;
1190                wr.xlt_size = sg.length;
1191
1192                err = mlx5_ib_post_send_wait(dev, &wr);
1193        }
1194        sg.length = orig_sg_length;
1195        mlx5_ib_unmap_free_xlt(dev, xlt, &sg);
1196        return err;
1197}
1198
1199/*
1200 * Send the DMA list to the HW for a normal MR using UMR.
1201 * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP
1202 * flag may be used.
1203 */
1204int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
1205{
1206        struct mlx5_ib_dev *dev = mr_to_mdev(mr);
1207        struct device *ddev = &dev->mdev->pdev->dev;
1208        struct ib_block_iter biter;
1209        struct mlx5_mtt *cur_mtt;
1210        struct mlx5_umr_wr wr;
1211        size_t orig_sg_length;
1212        struct mlx5_mtt *mtt;
1213        size_t final_size;
1214        struct ib_sge sg;
1215        int err = 0;
1216
1217        if (WARN_ON(mr->umem->is_odp))
1218                return -EINVAL;
1219
1220        mtt = mlx5_ib_create_xlt_wr(mr, &wr, &sg,
1221                                    ib_umem_num_dma_blocks(mr->umem,
1222                                                           1 << mr->page_shift),
1223                                    sizeof(*mtt), flags);
1224        if (!mtt)
1225                return -ENOMEM;
1226        orig_sg_length = sg.length;
1227
1228        cur_mtt = mtt;
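            /*
             * Fill the XLT buffer in sg.length-sized chunks; each time it
             * fills up, post it via UMR at the current offset and restart
             * filling from the start of the buffer.
             */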
1229        rdma_for_each_block (mr->umem->sg_head.sgl, &biter, mr->umem->nmap,
1230                             BIT(mr->page_shift)) {
1231                if (cur_mtt == (void *)mtt + sg.length) {
1232                        dma_sync_single_for_device(ddev, sg.addr, sg.length,
1233                                                   DMA_TO_DEVICE);
1234                        err = mlx5_ib_post_send_wait(dev, &wr);
1235                        if (err)
1236                                goto err;
1237                        dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
1238                                                DMA_TO_DEVICE);
1239                        wr.offset += sg.length;
1240                        cur_mtt = mtt;
1241                }
1242
1243                cur_mtt->ptag =
1244                        cpu_to_be64(rdma_block_iter_dma_address(&biter) |
1245                                    MLX5_IB_MTT_PRESENT);
1246
1247                if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP))
1248                        cur_mtt->ptag = 0;
1249
1250                cur_mtt++;
1251        }
1252
1253        final_size = (void *)cur_mtt - (void *)mtt;
1254        sg.length = ALIGN(final_size, MLX5_UMR_MTT_ALIGNMENT);
1255        memset(cur_mtt, 0, sg.length - final_size);
1256        wr.wr.send_flags |= xlt_wr_final_send_flags(flags);
1257        wr.xlt_size = sg.length;
1258
1259        dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE);
1260        err = mlx5_ib_post_send_wait(dev, &wr);
1261
1262err:
1263        sg.length = orig_sg_length;
1264        mlx5_ib_unmap_free_xlt(dev, mtt, &sg);
1265        return err;
1266}
1267
1268/*
1269 * Create an mkey without using the MR cache. If @populate is set, the page
1270 * list is written at creation time; otherwise the mkey is left free for UMR.
1271 */
1272static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
1273                                     u64 iova, int access_flags,
1274                                     unsigned int page_size, bool populate)
1275{
1276        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1277        struct mlx5_ib_mr *mr;
1278        __be64 *pas;
1279        void *mkc;
1280        int inlen;
1281        u32 *in;
1282        int err;
1283        bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));
1284
1285        if (!page_size)
1286                return ERR_PTR(-EINVAL);
1287        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1288        if (!mr)
1289                return ERR_PTR(-ENOMEM);
1290
1291        mr->ibmr.pd = pd;
1292        mr->access_flags = access_flags;
1293        mr->page_shift = order_base_2(page_size);
1294
1295        inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1296        if (populate)
1297                inlen += sizeof(*pas) *
1298                         roundup(ib_umem_num_dma_blocks(umem, page_size), 2);
1299        in = kvzalloc(inlen, GFP_KERNEL);
1300        if (!in) {
1301                err = -ENOMEM;
1302                goto err_1;
1303        }
1304        pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
1305        if (populate) {
1306                if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) {
1307                        err = -EINVAL;
1308                        goto err_2;
1309                }
1310                mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas,
1311                                     pg_cap ? MLX5_IB_MTT_PRESENT : 0);
1312        }
1313
1314        /* The pg_access bit allows setting the access flags
1315         * in the page list submitted with the command. */
1316        MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));
1317
1318        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1319        set_mkc_access_pd_addr_fields(mkc, access_flags, iova,
1320                                      populate ? pd : dev->umrc.pd);
1321        MLX5_SET(mkc, mkc, free, !populate);
1322        MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
1323        MLX5_SET(mkc, mkc, umr_en, 1);
1324
1325        MLX5_SET64(mkc, mkc, len, umem->length);
1326        MLX5_SET(mkc, mkc, bsf_octword_size, 0);
1327        MLX5_SET(mkc, mkc, translations_octword_size,
1328                 get_octo_len(iova, umem->length, mr->page_shift));
1329        MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
1330        if (populate) {
1331                MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
1332                         get_octo_len(iova, umem->length, mr->page_shift));
1333        }
1334
1335        err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1336        if (err) {
1337                mlx5_ib_warn(dev, "create mkey failed\n");
1338                goto err_2;
1339        }
1340        mr->mmkey.type = MLX5_MKEY_MR;
1341        mr->desc_size = sizeof(struct mlx5_mtt);
1342        mr->umem = umem;
1343        set_mr_fields(dev, mr, umem->length, access_flags);
1344        kvfree(in);
1345
1346        mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);
1347
1348        return mr;
1349
1350err_2:
1351        kvfree(in);
1352err_1:
1353        kfree(mr);
1354        return ERR_PTR(err);
1355}
1356
1357static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr,
1358                                       u64 length, int acc, int mode)
1359{
1360        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1361        int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1362        struct mlx5_ib_mr *mr;
1363        void *mkc;
1364        u32 *in;
1365        int err;
1366
1367        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1368        if (!mr)
1369                return ERR_PTR(-ENOMEM);
1370
1371        in = kzalloc(inlen, GFP_KERNEL);
1372        if (!in) {
1373                err = -ENOMEM;
1374                goto err_free;
1375        }
1376
1377        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1378
1379        MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
1380        MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
1381        MLX5_SET64(mkc, mkc, len, length);
1382        set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd);
1383
1384        err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1385        if (err)
1386                goto err_in;
1387
1388        kfree(in);
1389
1390        set_mr_fields(dev, mr, length, acc);
1391
1392        return &mr->ibmr;
1393
1394err_in:
1395        kfree(in);
1396
1397err_free:
1398        kfree(mr);
1399
1400        return ERR_PTR(err);
1401}
1402
1403int mlx5_ib_advise_mr(struct ib_pd *pd,
1404                      enum ib_uverbs_advise_mr_advice advice,
1405                      u32 flags,
1406                      struct ib_sge *sg_list,
1407                      u32 num_sge,
1408                      struct uverbs_attr_bundle *attrs)
1409{
1410        if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
1411            advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
1412            advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
1413                return -EOPNOTSUPP;
1414
1415        return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
1416                                         sg_list, num_sge);
1417}
1418
1419struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
1420                                struct ib_dm_mr_attr *attr,
1421                                struct uverbs_attr_bundle *attrs)
1422{
1423        struct mlx5_ib_dm *mdm = to_mdm(dm);
1424        struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev;
1425        u64 start_addr = mdm->dev_addr + attr->offset;
1426        int mode;
1427
1428        switch (mdm->type) {
1429        case MLX5_IB_UAPI_DM_TYPE_MEMIC:
1430                if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS)
1431                        return ERR_PTR(-EINVAL);
1432
1433                mode = MLX5_MKC_ACCESS_MODE_MEMIC;
1434                start_addr -= pci_resource_start(dev->pdev, 0);
1435                break;
1436        case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
1437        case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
1438                if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
1439                        return ERR_PTR(-EINVAL);
1440
1441                mode = MLX5_MKC_ACCESS_MODE_SW_ICM;
1442                break;
1443        default:
1444                return ERR_PTR(-EINVAL);
1445        }
1446
1447        return mlx5_ib_get_dm_mr(pd, start_addr, attr->length,
1448                                 attr->access_flags, mode);
1449}
1450
1451static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
1452                                    u64 iova, int access_flags)
1453{
1454        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1455        struct mlx5_ib_mr *mr = NULL;
1456        bool xlt_with_umr;
1457        int err;
1458
1459        xlt_with_umr = mlx5_ib_can_load_pas_with_umr(dev, umem->length);
1460        if (xlt_with_umr) {
1461                mr = alloc_cacheable_mr(pd, umem, iova, access_flags);
1462        } else {
1463                unsigned int page_size = mlx5_umem_find_best_pgsz(
1464                        umem, mkc, log_page_size, 0, iova);
1465
1466                mutex_lock(&dev->slow_path_mutex);
1467                mr = reg_create(pd, umem, iova, access_flags, page_size, true);
1468                mutex_unlock(&dev->slow_path_mutex);
1469        }
1470        if (IS_ERR(mr)) {
1471                ib_umem_release(umem);
1472                return ERR_CAST(mr);
1473        }
1474
1475        mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1476
1477        atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);
1478
1479        if (xlt_with_umr) {
1480                /*
1481                 * If the MR was created with reg_create then it will be
1482                 * configured properly but left disabled. It is safe to go ahead
1483                 * and configure it again via UMR while enabling it.
1484                 */
1485                err = mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE);
1486                if (err) {
1487                        mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1488                        return ERR_PTR(err);
1489                }
1490        }
1491        return &mr->ibmr;
1492}
1493
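/*
 * Register an ODP (on-demand paging) MR. A zero start with length U64_MAX
 * requests an implicit MR covering the whole address space; otherwise an
 * explicit ODP umem is created and its pages are brought in on demand via
 * the page-fault machinery.
 */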
1494static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
1495                                        u64 iova, int access_flags,
1496                                        struct ib_udata *udata)
1497{
1498        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1499        struct ib_umem_odp *odp;
1500        struct mlx5_ib_mr *mr;
1501        int err;
1502
1503        if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
1504                return ERR_PTR(-EOPNOTSUPP);
1505
1506        err = mlx5r_odp_create_eq(dev, &dev->odp_pf_eq);
1507        if (err)
1508                return ERR_PTR(err);
1509        if (!start && length == U64_MAX) {
1510                if (iova != 0)
1511                        return ERR_PTR(-EINVAL);
1512                if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
1513                        return ERR_PTR(-EINVAL);
1514
1515                mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags);
1516                if (IS_ERR(mr))
1517                        return ERR_CAST(mr);
1518                return &mr->ibmr;
1519        }
1520
1521        /* ODP requires xlt update via umr to work. */
1522        if (!mlx5_ib_can_load_pas_with_umr(dev, length))
1523                return ERR_PTR(-EINVAL);
1524
1525        odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags,
1526                              &mlx5_mn_ops);
1527        if (IS_ERR(odp))
1528                return ERR_CAST(odp);
1529
1530        mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags);
1531        if (IS_ERR(mr)) {
1532                ib_umem_release(&odp->umem);
1533                return ERR_CAST(mr);
1534        }
1535
1536        odp->private = mr;
1537        err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
1538        if (err)
1539                goto err_dereg_mr;
1540
1541        err = mlx5_ib_init_odp_mr(mr);
1542        if (err)
1543                goto err_dereg_mr;
1544        return &mr->ibmr;
1545
1546err_dereg_mr:
1547        mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1548        return ERR_PTR(err);
1549}
1550
1551struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
1552                                  u64 iova, int access_flags,
1553                                  struct ib_udata *udata)
1554{
1555        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1556        struct ib_umem *umem;
1557
1558        if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
1559                return ERR_PTR(-EOPNOTSUPP);
1560
1561        mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
1562                    start, iova, length, access_flags);
1563
1564        if (access_flags & IB_ACCESS_ON_DEMAND)
1565                return create_user_odp_mr(pd, start, length, iova, access_flags,
1566                                          udata);
1567        umem = ib_umem_get(&dev->ib_dev, start, length, access_flags);
1568        if (IS_ERR(umem))
1569                return ERR_CAST(umem);
1570        return create_real_mr(pd, umem, iova, access_flags);
1571}
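
/*
 * Illustration only, not used by the driver: this entry point is reached
 * through the uverbs MR registration path, e.g. from userspace with
 * libibverbs:
 *
 *	struct ibv_mr *mr = ibv_reg_mr(pd, buf, len,
 *				       IBV_ACCESS_LOCAL_WRITE |
 *				       IBV_ACCESS_REMOTE_READ);
 *
 * Requesting IBV_ACCESS_ON_DEMAND takes the ODP branch above instead.
 */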
1572
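/*
 * dma-buf move_notify callback: the exporter is about to move the buffer, so
 * zap the mkey's translation entries and unmap the current pages. The pages
 * are brought back on the next page fault.
 */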
1573static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
1574{
1575        struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
1576        struct mlx5_ib_mr *mr = umem_dmabuf->private;
1577
1578        dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
1579
1580        if (!umem_dmabuf->sgt)
1581                return;
1582
1583        mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP);
1584        ib_umem_dmabuf_unmap_pages(umem_dmabuf);
1585}
1586
1587static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
1588        .allow_peer2peer = 1,
1589        .move_notify = mlx5_ib_dmabuf_invalidate_cb,
1590};
1591
1592struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
1593                                         u64 length, u64 virt_addr,
1594                                         int fd, int access_flags,
1595                                         struct ib_udata *udata)
1596{
1597        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1598        struct mlx5_ib_mr *mr = NULL;
1599        struct ib_umem_dmabuf *umem_dmabuf;
1600        int err;
1601
1602        if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
1603            !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
1604                return ERR_PTR(-EOPNOTSUPP);
1605
1606        mlx5_ib_dbg(dev,
1607                    "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n",
1608                    offset, virt_addr, length, fd, access_flags);
1609
1610        /* dmabuf requires xlt update via umr to work. */
1611        if (!mlx5_ib_can_load_pas_with_umr(dev, length))
1612                return ERR_PTR(-EINVAL);
1613
1614        umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd,
1615                                         access_flags,
1616                                         &mlx5_ib_dmabuf_attach_ops);
1617        if (IS_ERR(umem_dmabuf)) {
1618                mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n",
1619                            PTR_ERR(umem_dmabuf));
1620                return ERR_CAST(umem_dmabuf);
1621        }
1622
1623        mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
1624                                access_flags);
1625        if (IS_ERR(mr)) {
1626                ib_umem_release(&umem_dmabuf->umem);
1627                return ERR_CAST(mr);
1628        }
1629
1630        mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1631
1632        atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
1633        umem_dmabuf->private = mr;
1634        err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
1635        if (err)
1636                goto err_dereg_mr;
1637
1638        err = mlx5_ib_init_dmabuf_mr(mr);
1639        if (err)
1640                goto err_dereg_mr;
1641        return &mr->ibmr;
1642
1643err_dereg_mr:
1644        mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1645        return ERR_PTR(err);
1646}
1647
1648/**
1649 * revoke_mr - Fence all DMA on the MR
1650 * @mr: The MR to fence
1651 *
1652 * Upon return the NIC will not be doing any DMA to the pages under the MR,
1653 * and any DMA that was in progress will have completed. Failure of this
1654 * function indicates the HW has failed catastrophically.
1655 */
1656static int revoke_mr(struct mlx5_ib_mr *mr)
1657{
1658        struct mlx5_umr_wr umrwr = {};
1659
1660        if (mr_to_mdev(mr)->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
1661                return 0;
1662
1663        umrwr.wr.send_flags = MLX5_IB_SEND_UMR_DISABLE_MR |
1664                              MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
1665        umrwr.wr.opcode = MLX5_IB_WR_UMR;
1666        umrwr.pd = mr_to_mdev(mr)->umrc.pd;
1667        umrwr.mkey = mr->mmkey.key;
1668        umrwr.ignore_free_state = 1;
1669
1670        return mlx5_ib_post_send_wait(mr_to_mdev(mr), &umrwr);
1671}
1672
1673/*
1674 * True if the change in access flags can be done via UMR; only some access
1675 * flags can be updated this way.
1676 */
1677static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev,
1678                                     unsigned int current_access_flags,
1679                                     unsigned int target_access_flags)
1680{
1681        unsigned int diffs = current_access_flags ^ target_access_flags;
1682
1683        if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
1684                      IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING))
1685                return false;
1686        return mlx5_ib_can_reconfig_with_umr(dev, current_access_flags,
1687                                             target_access_flags);
1688}
1689
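/*
 * Update only the PD and the access flags of an existing mkey with a single
 * UMR; the translation (PAS list) is left untouched.
 */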
1690static int umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd,
1691                               int access_flags)
1692{
1693        struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1694        struct mlx5_umr_wr umrwr = {
1695                .wr = {
1696                        .send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE |
1697                                      MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS,
1698                        .opcode = MLX5_IB_WR_UMR,
1699                },
1700                .mkey = mr->mmkey.key,
1701                .pd = pd,
1702                .access_flags = access_flags,
1703        };
1704        int err;
1705
1706        err = mlx5_ib_post_send_wait(dev, &umrwr);
1707        if (err)
1708                return err;
1709
1710        mr->access_flags = access_flags;
1711        mr->mmkey.pd = to_mpd(pd)->pdn;
1712        return 0;
1713}
1714
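/*
 * A PAS-only rereg through UMR is possible when the MR came from the cache
 * (so its allocated translation size is known), the new umem can be loaded
 * with a UMR, and the cache entry is big enough to hold the number of DMA
 * blocks of the new umem at the chosen page size.
 */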
1715static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
1716                                  struct ib_umem *new_umem,
1717                                  int new_access_flags, u64 iova,
1718                                  unsigned long *page_size)
1719{
1720        struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1721
1722        /* We only track the allocated sizes of MRs from the cache */
1723        if (!mr->cache_ent)
1724                return false;
1725        if (!mlx5_ib_can_load_pas_with_umr(dev, new_umem->length))
1726                return false;
1727
1728        *page_size =
1729                mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova);
1730        if (WARN_ON(!*page_size))
1731                return false;
1732        return (1ULL << mr->cache_ent->order) >=
1733               ib_umem_num_dma_blocks(new_umem, *page_size);
1734}
1735
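/*
 * Replace the umem behind an existing mkey: revoke the mkey, switch the
 * driver state over to the new umem, then reprogram and re-enable the
 * translation with a UMR.
 */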
1736static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd,
1737                         int access_flags, int flags, struct ib_umem *new_umem,
1738                         u64 iova, unsigned long page_size)
1739{
1740        struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1741        int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE;
1742        struct ib_umem *old_umem = mr->umem;
1743        int err;
1744
1745        /*
1746         * To keep everything simple the MR is revoked before we start to mess
1747         * with it. This ensures the change is atomic relative to any use of
1748         * the MR.
1749         */
1750        err = revoke_mr(mr);
1751        if (err)
1752                return err;
1753
1754        if (flags & IB_MR_REREG_PD) {
1755                mr->ibmr.pd = pd;
1756                mr->mmkey.pd = to_mpd(pd)->pdn;
1757                upd_flags |= MLX5_IB_UPD_XLT_PD;
1758        }
1759        if (flags & IB_MR_REREG_ACCESS) {
1760                mr->access_flags = access_flags;
1761                upd_flags |= MLX5_IB_UPD_XLT_ACCESS;
1762        }
1763
1764        mr->ibmr.length = new_umem->length;
1765        mr->mmkey.iova = iova;
1766        mr->mmkey.size = new_umem->length;
1767        mr->page_shift = order_base_2(page_size);
1768        mr->umem = new_umem;
1769        err = mlx5_ib_update_mr_pas(mr, upd_flags);
1770        if (err) {
1771                /*
1772                 * The MR is revoked at this point, so it is safe for the caller
1773                 * to free new_umem.
1774                 */
1775                mr->umem = old_umem;
1776                return err;
1777        }
1778
1779        atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages);
1780        ib_umem_release(old_umem);
1781        atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages);
1782        return 0;
1783}
1784
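/*
 * ib_rereg_user_mr entry point. Returns NULL when the existing MR was
 * updated in place, a new ib_mr when a replacement MR had to be created, or
 * an ERR_PTR on failure.
 */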
1785struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
1786                                    u64 length, u64 iova, int new_access_flags,
1787                                    struct ib_pd *new_pd,
1788                                    struct ib_udata *udata)
1789{
1790        struct mlx5_ib_dev *dev = to_mdev(ib_mr->device);
1791        struct mlx5_ib_mr *mr = to_mmr(ib_mr);
1792        int err;
1793
1794        if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
1795                return ERR_PTR(-EOPNOTSUPP);
1796
1797        mlx5_ib_dbg(
1798                dev,
1799                "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
1800                start, iova, length, new_access_flags);
1801
1802        if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS))
1803                return ERR_PTR(-EOPNOTSUPP);
1804
1805        if (!(flags & IB_MR_REREG_ACCESS))
1806                new_access_flags = mr->access_flags;
1807        if (!(flags & IB_MR_REREG_PD))
1808                new_pd = ib_mr->pd;
1809
1810        if (!(flags & IB_MR_REREG_TRANS)) {
1811                struct ib_umem *umem;
1812
1813                /* Fast path for PD/access change */
1814                if (can_use_umr_rereg_access(dev, mr->access_flags,
1815                                             new_access_flags)) {
1816                        err = umr_rereg_pd_access(mr, new_pd, new_access_flags);
1817                        if (err)
1818                                return ERR_PTR(err);
1819                        return NULL;
1820                }
1821                /* DM or ODP MRs don't have a normal umem, so we can't re-use it */
1822                if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
1823                        goto recreate;
1824
1825                /*
1826                 * Only one active MR can refer to a umem at one time; revoke
1827                 * the old MR before assigning the umem to the new one.
1828                 */
1829                err = revoke_mr(mr);
1830                if (err)
1831                        return ERR_PTR(err);
1832                umem = mr->umem;
1833                mr->umem = NULL;
1834                atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);
1835
1836                return create_real_mr(new_pd, umem, mr->mmkey.iova,
1837                                      new_access_flags);
1838        }
1839
1840        /*
1841         * DM doesn't have a PAS list, so we can't re-use it; ODP/dmabuf MRs
1842         * do, but the logic around releasing the umem is different.
1843         */
1844        if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
1845                goto recreate;
1846
1847        if (!(new_access_flags & IB_ACCESS_ON_DEMAND) &&
1848            can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) {
1849                struct ib_umem *new_umem;
1850                unsigned long page_size;
1851
1852                new_umem = ib_umem_get(&dev->ib_dev, start, length,
1853                                       new_access_flags);
1854                if (IS_ERR(new_umem))
1855                        return ERR_CAST(new_umem);
1856
1857                /* Fast path for PAS change */
1858                if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova,
1859                                          &page_size)) {
1860                        err = umr_rereg_pas(mr, new_pd, new_access_flags, flags,
1861                                            new_umem, iova, page_size);
1862                        if (err) {
1863                                ib_umem_release(new_umem);
1864                                return ERR_PTR(err);
1865                        }
1866                        return NULL;
1867                }
1868                return create_real_mr(new_pd, new_umem, iova, new_access_flags);
1869        }
1870
1871        /*
1872         * Everything else has no state we can preserve, so just create a new
1873         * MR from scratch.
1874         */
1875recreate:
1876        return mlx5_ib_reg_user_mr(new_pd, start, length, iova,
1877                                   new_access_flags, udata);
1878}
1879
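/*
 * Allocate and DMA-map the descriptor buffer (MTT or KLM entries) used by
 * the map_mr_sg paths. The allocation is padded so the descriptors can be
 * aligned to MLX5_UMR_ALIGN.
 */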
1880static int
1881mlx5_alloc_priv_descs(struct ib_device *device,
1882                      struct mlx5_ib_mr *mr,
1883                      int ndescs,
1884                      int desc_size)
1885{
1886        struct mlx5_ib_dev *dev = to_mdev(device);
1887        struct device *ddev = &dev->mdev->pdev->dev;
1888        int size = ndescs * desc_size;
1889        int add_size;
1890        int ret;
1891
1892        add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0);
1893
1894        mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
1895        if (!mr->descs_alloc)
1896                return -ENOMEM;
1897
1898        mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);
1899
1900        mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE);
1901        if (dma_mapping_error(ddev, mr->desc_map)) {
1902                ret = -ENOMEM;
1903                goto err;
1904        }
1905
1906        return 0;
1907err:
1908        kfree(mr->descs_alloc);
1909
1910        return ret;
1911}
1912
1913static void
1914mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
1915{
1916        if (!mr->umem && mr->descs) {
1917                struct ib_device *device = mr->ibmr.device;
1918                int size = mr->max_descs * mr->desc_size;
1919                struct mlx5_ib_dev *dev = to_mdev(device);
1920
1921                dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size,
1922                                 DMA_TO_DEVICE);
1923                kfree(mr->descs_alloc);
1924                mr->descs = NULL;
1925        }
1926}
1927
1928int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
1929{
1930        struct mlx5_ib_mr *mr = to_mmr(ibmr);
1931        struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
1932        int rc;
1933
1934        /*
1935         * Any async use of the MR must hold the refcount; once the refcount
1936         * goes to zero, no other thread (such as ODP page faults, prefetch or
1937         * any UMR activity) can touch the mkey, so it is safe to destroy it.
1938         */
1939        if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
1940            refcount_read(&mr->mmkey.usecount) != 0 &&
1941            xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)))
1942                mlx5r_deref_wait_odp_mkey(&mr->mmkey);
1943
1944        if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
1945                xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
1946                           mr->sig, NULL, GFP_KERNEL);
1947
1948                if (mr->mtt_mr) {
1949                        rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
1950                        if (rc)
1951                                return rc;
1952                        mr->mtt_mr = NULL;
1953                }
1954                if (mr->klm_mr) {
1955                        rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
1956                        if (rc)
1957                                return rc;
1958                        mr->klm_mr = NULL;
1959                }
1960
1961                if (mlx5_core_destroy_psv(dev->mdev,
1962                                          mr->sig->psv_memory.psv_idx))
1963                        mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
1964                                     mr->sig->psv_memory.psv_idx);
1965                if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
1966                        mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
1967                                     mr->sig->psv_wire.psv_idx);
1968                kfree(mr->sig);
1969                mr->sig = NULL;
1970        }
1971
1972        /* Stop DMA */
1973        if (mr->cache_ent) {
1974                if (revoke_mr(mr)) {
1975                        spin_lock_irq(&mr->cache_ent->lock);
1976                        mr->cache_ent->total_mrs--;
1977                        spin_unlock_irq(&mr->cache_ent->lock);
1978                        mr->cache_ent = NULL;
1979                }
1980        }
1981        if (!mr->cache_ent) {
1982                rc = destroy_mkey(to_mdev(mr->ibmr.device), mr);
1983                if (rc)
1984                        return rc;
1985        }
1986
1987        if (mr->umem) {
1988                bool is_odp = is_odp_mr(mr);
1989
1990                if (!is_odp)
1991                        atomic_sub(ib_umem_num_pages(mr->umem),
1992                                   &dev->mdev->priv.reg_pages);
1993                ib_umem_release(mr->umem);
1994                if (is_odp)
1995                        mlx5_ib_free_odp_mr(mr);
1996        }
1997
1998        if (mr->cache_ent) {
1999                mlx5_mr_cache_free(dev, mr);
2000        } else {
2001                mlx5_free_priv_descs(mr);
2002                kfree(mr);
2003        }
2004        return 0;
2005}
2006
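/*
 * Fill the mkey context for a kernel-owned, UMR-enabled mkey created in the
 * free state; its translation is populated later when the MR is used for a
 * fast registration.
 */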
2007static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
2008                                   int access_mode, int page_shift)
2009{
2010        void *mkc;
2011
2012        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2013
2014        /* This is only used from the kernel, so setting the PD is OK. */
2015        set_mkc_access_pd_addr_fields(mkc, IB_ACCESS_RELAXED_ORDERING, 0, pd);
2016        MLX5_SET(mkc, mkc, free, 1);
2017        MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
2018        MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
2019        MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
2020        MLX5_SET(mkc, mkc, umr_en, 1);
2021        MLX5_SET(mkc, mkc, log_page_size, page_shift);
2022}
2023
2024static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2025                                  int ndescs, int desc_size, int page_shift,
2026                                  int access_mode, u32 *in, int inlen)
2027{
2028        struct mlx5_ib_dev *dev = to_mdev(pd->device);
2029        int err;
2030
2031        mr->access_mode = access_mode;
2032        mr->desc_size = desc_size;
2033        mr->max_descs = ndescs;
2034
2035        err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size);
2036        if (err)
2037                return err;
2038
2039        mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);
2040
2041        err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
2042        if (err)
2043                goto err_free_descs;
2044
2045        mr->mmkey.type = MLX5_MKEY_MR;
2046        mr->ibmr.lkey = mr->mmkey.key;
2047        mr->ibmr.rkey = mr->mmkey.key;
2048
2049        return 0;
2050
2051err_free_descs:
2052        mlx5_free_priv_descs(mr);
2053        return err;
2054}
2055
2056static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
2057                                u32 max_num_sg, u32 max_num_meta_sg,
2058                                int desc_size, int access_mode)
2059{
2060        int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2061        int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
2062        int page_shift = 0;
2063        struct mlx5_ib_mr *mr;
2064        u32 *in;
2065        int err;
2066
2067        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
2068        if (!mr)
2069                return ERR_PTR(-ENOMEM);
2070
2071        mr->ibmr.pd = pd;
2072        mr->ibmr.device = pd->device;
2073
2074        in = kzalloc(inlen, GFP_KERNEL);
2075        if (!in) {
2076                err = -ENOMEM;
2077                goto err_free;
2078        }
2079
2080        if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
2081                page_shift = PAGE_SHIFT;
2082
2083        err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift,
2084                                     access_mode, in, inlen);
2085        if (err)
2086                goto err_free_in;
2087
2088        mr->umem = NULL;
2089        kfree(in);
2090
2091        return mr;
2092
2093err_free_in:
2094        kfree(in);
2095err_free:
2096        kfree(mr);
2097        return ERR_PTR(err);
2098}
2099
2100static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2101                                    int ndescs, u32 *in, int inlen)
2102{
2103        return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt),
2104                                      PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in,
2105                                      inlen);
2106}
2107
2108static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2109                                    int ndescs, u32 *in, int inlen)
2110{
2111        return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm),
2112                                      0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
2113}
2114
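/*
 * Set up everything an IB_MR_TYPE_INTEGRITY MR needs: the memory and wire
 * PSVs, the internal KLM and MTT pi MRs that map the data/metadata scatter
 * lists, and the BSF-enabled signature mkey itself, which is tracked in
 * dev->sig_mrs.
 */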
2115static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2116                                      int max_num_sg, int max_num_meta_sg,
2117                                      u32 *in, int inlen)
2118{
2119        struct mlx5_ib_dev *dev = to_mdev(pd->device);
2120        u32 psv_index[2];
2121        void *mkc;
2122        int err;
2123
2124        mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
2125        if (!mr->sig)
2126                return -ENOMEM;
2127
2128        /* create mem & wire PSVs */
2129        err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
2130        if (err)
2131                goto err_free_sig;
2132
2133        mr->sig->psv_memory.psv_idx = psv_index[0];
2134        mr->sig->psv_wire.psv_idx = psv_index[1];
2135
2136        mr->sig->sig_status_checked = true;
2137        mr->sig->sig_err_exists = false;
2138        /* Next UMR, Arm SIGERR */
2139        ++mr->sig->sigerr_count;
2140        mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
2141                                         sizeof(struct mlx5_klm),
2142                                         MLX5_MKC_ACCESS_MODE_KLMS);
2143        if (IS_ERR(mr->klm_mr)) {
2144                err = PTR_ERR(mr->klm_mr);
2145                goto err_destroy_psv;
2146        }
2147        mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
2148                                         sizeof(struct mlx5_mtt),
2149                                         MLX5_MKC_ACCESS_MODE_MTT);
2150        if (IS_ERR(mr->mtt_mr)) {
2151                err = PTR_ERR(mr->mtt_mr);
2152                goto err_free_klm_mr;
2153        }
2154
2155        /* Set bsf descriptors for mkey */
2156        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2157        MLX5_SET(mkc, mkc, bsf_en, 1);
2158        MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);
2159
2160        err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
2161                                     MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
2162        if (err)
2163                goto err_free_mtt_mr;
2164
2165        err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
2166                              mr->sig, GFP_KERNEL));
2167        if (err)
2168                goto err_free_descs;
2169        return 0;
2170
2171err_free_descs:
2172        destroy_mkey(dev, mr);
2173        mlx5_free_priv_descs(mr);
2174err_free_mtt_mr:
2175        mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
2176        mr->mtt_mr = NULL;
2177err_free_klm_mr:
2178        mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
2179        mr->klm_mr = NULL;
2180err_destroy_psv:
2181        if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
2182                mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
2183                             mr->sig->psv_memory.psv_idx);
2184        if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
2185                mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
2186                             mr->sig->psv_wire.psv_idx);
2187err_free_sig:
2188        kfree(mr->sig);
2189
2190        return err;
2191}
2192
2193static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
2194                                        enum ib_mr_type mr_type, u32 max_num_sg,
2195                                        u32 max_num_meta_sg)
2196{
2197        struct mlx5_ib_dev *dev = to_mdev(pd->device);
2198        int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2199        int ndescs = ALIGN(max_num_sg, 4);
2200        struct mlx5_ib_mr *mr;
2201        u32 *in;
2202        int err;
2203
2204        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
2205        if (!mr)
2206                return ERR_PTR(-ENOMEM);
2207
2208        in = kzalloc(inlen, GFP_KERNEL);
2209        if (!in) {
2210                err = -ENOMEM;
2211                goto err_free;
2212        }
2213
2214        mr->ibmr.device = pd->device;
2215        mr->umem = NULL;
2216
2217        switch (mr_type) {
2218        case IB_MR_TYPE_MEM_REG:
2219                err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
2220                break;
2221        case IB_MR_TYPE_SG_GAPS:
2222                err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
2223                break;
2224        case IB_MR_TYPE_INTEGRITY:
2225                err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
2226                                                 max_num_meta_sg, in, inlen);
2227                break;
2228        default:
2229                mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
2230                err = -EINVAL;
2231        }
2232
2233        if (err)
2234                goto err_free_in;
2235
2236        kfree(in);
2237
2238        return &mr->ibmr;
2239
2240err_free_in:
2241        kfree(in);
2242err_free:
2243        kfree(mr);
2244        return ERR_PTR(err);
2245}
2246
2247struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
2248                               u32 max_num_sg)
2249{
2250        return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
2251}
2252
2253struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
2254                                         u32 max_num_sg, u32 max_num_meta_sg)
2255{
2256        return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
2257                                  max_num_meta_sg);
2258}
2259
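/*
 * Allocate a memory window: a free, UMR-enabled KLM mkey on the user's PD.
 * Type-2 windows additionally enable remote invalidation (en_rinval).
 * Illustration only: userspace typically obtains one via libibverbs'
 * ibv_alloc_mw(pd, IBV_MW_TYPE_2) and binds it to an MR afterwards.
 */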
2260int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
2261{
2262        struct mlx5_ib_dev *dev = to_mdev(ibmw->device);
2263        int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2264        struct mlx5_ib_mw *mw = to_mmw(ibmw);
2265        u32 *in = NULL;
2266        void *mkc;
2267        int ndescs;
2268        int err;
2269        struct mlx5_ib_alloc_mw req = {};
2270        struct {
2271                __u32   comp_mask;
2272                __u32   response_length;
2273        } resp = {};
2274
2275        err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
2276        if (err)
2277                return err;
2278
2279        if (req.comp_mask || req.reserved1 || req.reserved2)
2280                return -EOPNOTSUPP;
2281
2282        if (udata->inlen > sizeof(req) &&
2283            !ib_is_udata_cleared(udata, sizeof(req),
2284                                 udata->inlen - sizeof(req)))
2285                return -EOPNOTSUPP;
2286
2287        ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);
2288
2289        in = kzalloc(inlen, GFP_KERNEL);
2290        if (!in) {
2291                err = -ENOMEM;
2292                goto free;
2293        }
2294
2295        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2296
2297        MLX5_SET(mkc, mkc, free, 1);
2298        MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
2299        MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn);
2300        MLX5_SET(mkc, mkc, umr_en, 1);
2301        MLX5_SET(mkc, mkc, lr, 1);
2302        MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
2303        MLX5_SET(mkc, mkc, en_rinval, !!((ibmw->type == IB_MW_TYPE_2)));
2304        MLX5_SET(mkc, mkc, qpn, 0xffffff);
2305
2306        err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen);
2307        if (err)
2308                goto free;
2309
2310        mw->mmkey.type = MLX5_MKEY_MW;
2311        ibmw->rkey = mw->mmkey.key;
2312        mw->ndescs = ndescs;
2313
2314        resp.response_length =
2315                min(offsetofend(typeof(resp), response_length), udata->outlen);
2316        if (resp.response_length) {
2317                err = ib_copy_to_udata(udata, &resp, resp.response_length);
2318                if (err)
2319                        goto free_mkey;
2320        }
2321
2322        if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
2323                err = mlx5r_store_odp_mkey(dev, &mw->mmkey);
2324                if (err)
2325                        goto free_mkey;
2326        }
2327
2328        kfree(in);
2329        return 0;
2330
2331free_mkey:
2332        mlx5_core_destroy_mkey(dev->mdev, &mw->mmkey);
2333free:
2334        kfree(in);
2335        return err;
2336}
2337
2338int mlx5_ib_dealloc_mw(struct ib_mw *mw)
2339{
2340        struct mlx5_ib_dev *dev = to_mdev(mw->device);
2341        struct mlx5_ib_mw *mmw = to_mmw(mw);
2342
2343        if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
2344            xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key)))
2345                /*
2346                 * pagefault_single_data_segment() may be accessing mmw
2347                 * if the user bound an ODP MR to this MW.
2348                 */
2349                mlx5r_deref_wait_odp_mkey(&mmw->mmkey);
2350
2351        return mlx5_core_destroy_mkey(dev->mdev, &mmw->mmkey);
2352}
2353
2354int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
2355                            struct ib_mr_status *mr_status)
2356{
2357        struct mlx5_ib_mr *mmr = to_mmr(ibmr);
2358        int ret = 0;
2359
2360        if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
2361                pr_err("Invalid status check mask\n");
2362                ret = -EINVAL;
2363                goto done;
2364        }
2365
2366        mr_status->fail_status = 0;
2367        if (check_mask & IB_MR_CHECK_SIG_STATUS) {
2368                if (!mmr->sig) {
2369                        ret = -EINVAL;
2370                        pr_err("signature status check requested on a non-signature enabled MR\n");
2371                        goto done;
2372                }
2373
2374                mmr->sig->sig_status_checked = true;
2375                if (!mmr->sig->sig_err_exists)
2376                        goto done;
2377
2378                if (ibmr->lkey == mmr->sig->err_item.key)
2379                        memcpy(&mr_status->sig_err, &mmr->sig->err_item,
2380                               sizeof(mr_status->sig_err));
2381                else {
2382                        mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
2383                        mr_status->sig_err.sig_err_offset = 0;
2384                        mr_status->sig_err.key = mmr->sig->err_item.key;
2385                }
2386
2387                mmr->sig->sig_err_exists = false;
2388                mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
2389        }
2390
2391done:
2392        return ret;
2393}
2394
2395static int
2396mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2397                        int data_sg_nents, unsigned int *data_sg_offset,
2398                        struct scatterlist *meta_sg, int meta_sg_nents,
2399                        unsigned int *meta_sg_offset)
2400{
2401        struct mlx5_ib_mr *mr = to_mmr(ibmr);
2402        unsigned int sg_offset = 0;
2403        int n = 0;
2404
2405        mr->meta_length = 0;
2406        if (data_sg_nents == 1) {
2407                n++;
2408                mr->ndescs = 1;
2409                if (data_sg_offset)
2410                        sg_offset = *data_sg_offset;
2411                mr->data_length = sg_dma_len(data_sg) - sg_offset;
2412                mr->data_iova = sg_dma_address(data_sg) + sg_offset;
2413                if (meta_sg_nents == 1) {
2414                        n++;
2415                        mr->meta_ndescs = 1;
2416                        if (meta_sg_offset)
2417                                sg_offset = *meta_sg_offset;
2418                        else
2419                                sg_offset = 0;
2420                        mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
2421                        mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
2422                }
2423                ibmr->length = mr->data_length + mr->meta_length;
2424        }
2425
2426        return n;
2427}
2428
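/*
 * Translate the data (and optional metadata) scatterlists into KLM
 * descriptors, one KLM per SG entry, all referencing the PD's
 * local_dma_lkey.
 */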
2429static int
2430mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
2431                   struct scatterlist *sgl,
2432                   unsigned short sg_nents,
2433                   unsigned int *sg_offset_p,
2434                   struct scatterlist *meta_sgl,
2435                   unsigned short meta_sg_nents,
2436                   unsigned int *meta_sg_offset_p)
2437{
2438        struct scatterlist *sg = sgl;
2439        struct mlx5_klm *klms = mr->descs;
2440        unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
2441        u32 lkey = mr->ibmr.pd->local_dma_lkey;
2442        int i, j = 0;
2443
2444        mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
2445        mr->ibmr.length = 0;
2446
2447        for_each_sg(sgl, sg, sg_nents, i) {
2448                if (unlikely(i >= mr->max_descs))
2449                        break;
2450                klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
2451                klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
2452                klms[i].key = cpu_to_be32(lkey);
2453                mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2454
2455                sg_offset = 0;
2456        }
2457
2458        if (sg_offset_p)
2459                *sg_offset_p = sg_offset;
2460
2461        mr->ndescs = i;
2462        mr->data_length = mr->ibmr.length;
2463
2464        if (meta_sg_nents) {
2465                sg = meta_sgl;
2466                sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
2467                for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
2468                        if (unlikely(i + j >= mr->max_descs))
2469                                break;
2470                        klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
2471                                                     sg_offset);
2472                        klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
2473                                                         sg_offset);
2474                        klms[i + j].key = cpu_to_be32(lkey);
2475                        mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2476
2477                        sg_offset = 0;
2478                }
2479                if (meta_sg_offset_p)
2480                        *meta_sg_offset_p = sg_offset;
2481
2482                mr->meta_ndescs = j;
2483                mr->meta_length = mr->ibmr.length - mr->data_length;
2484        }
2485
2486        return i + j;
2487}
2488
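/*
 * ib_sg_to_pages() callback: append one DMA page address (with RD/WR enable
 * bits) to the descriptor list.
 */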
2489static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
2490{
2491        struct mlx5_ib_mr *mr = to_mmr(ibmr);
2492        __be64 *descs;
2493
2494        if (unlikely(mr->ndescs == mr->max_descs))
2495                return -ENOMEM;
2496
2497        descs = mr->descs;
2498        descs[mr->ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2499
2500        return 0;
2501}
2502
2503static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
2504{
2505        struct mlx5_ib_mr *mr = to_mmr(ibmr);
2506        __be64 *descs;
2507
2508        if (unlikely(mr->ndescs + mr->meta_ndescs == mr->max_descs))
2509                return -ENOMEM;
2510
2511        descs = mr->descs;
2512        descs[mr->ndescs + mr->meta_ndescs++] =
2513                cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2514
2515        return 0;
2516}
2517
2518static int
2519mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2520                         int data_sg_nents, unsigned int *data_sg_offset,
2521                         struct scatterlist *meta_sg, int meta_sg_nents,
2522                         unsigned int *meta_sg_offset)
2523{
2524        struct mlx5_ib_mr *mr = to_mmr(ibmr);
2525        struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
2526        int n;
2527
2528        pi_mr->ndescs = 0;
2529        pi_mr->meta_ndescs = 0;
2530        pi_mr->meta_length = 0;
2531
2532        ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2533                                   pi_mr->desc_size * pi_mr->max_descs,
2534                                   DMA_TO_DEVICE);
2535
2536        pi_mr->ibmr.page_size = ibmr->page_size;
2537        n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
2538                           mlx5_set_page);
2539        if (n != data_sg_nents)
2540                return n;
2541
2542        pi_mr->data_iova = pi_mr->ibmr.iova;
2543        pi_mr->data_length = pi_mr->ibmr.length;
2544        pi_mr->ibmr.length = pi_mr->data_length;
2545        ibmr->length = pi_mr->data_length;
2546
2547        if (meta_sg_nents) {
2548                u64 page_mask = ~((u64)ibmr->page_size - 1);
2549                u64 iova = pi_mr->data_iova;
2550
2551                n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
2552                                    meta_sg_offset, mlx5_set_page_pi);
2553
2554                pi_mr->meta_length = pi_mr->ibmr.length;
2555                /*
2556                 * The PI address for the HW is where the metadata starts within
2557                 * the pi MR's IOVA space. It equals the first data page address
2558                 * + the size of the data pages + the metadata offset within the
2559                 * first metadata page.
2560                 */
2561                pi_mr->pi_iova = (iova & page_mask) +
2562                                 pi_mr->ndescs * ibmr->page_size +
2563                                 (pi_mr->ibmr.iova & ~page_mask);
2564                /*
2565                 * In order to use one MTT MR for data and metadata, we also
2566                 * register the gap between the end of the data and the start of
2567                 * the metadata (the sig MR verifies that the HW accesses the
2568                 * right addresses). This mapping is safe because an internal
2569                 * mkey is used for the registration.
2570                 */
2571                pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
2572                pi_mr->ibmr.iova = iova;
2573                ibmr->length += pi_mr->meta_length;
2574        }
2575
2576        ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2577                                      pi_mr->desc_size * pi_mr->max_descs,
2578                                      DMA_TO_DEVICE);
2579
2580        return n;
2581}
2582
2583static int
2584mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2585                         int data_sg_nents, unsigned int *data_sg_offset,
2586                         struct scatterlist *meta_sg, int meta_sg_nents,
2587                         unsigned int *meta_sg_offset)
2588{
2589        struct mlx5_ib_mr *mr = to_mmr(ibmr);
2590        struct mlx5_ib_mr *pi_mr = mr->klm_mr;
2591        int n;
2592
2593        pi_mr->ndescs = 0;
2594        pi_mr->meta_ndescs = 0;
2595        pi_mr->meta_length = 0;
2596
2597        ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2598                                   pi_mr->desc_size * pi_mr->max_descs,
2599                                   DMA_TO_DEVICE);
2600
2601        n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
2602                               meta_sg, meta_sg_nents, meta_sg_offset);
2603
2604        ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2605                                      pi_mr->desc_size * pi_mr->max_descs,
2606                                      DMA_TO_DEVICE);
2607
2608        /* This is a zero-based memory region */
2609        pi_mr->data_iova = 0;
2610        pi_mr->ibmr.iova = 0;
2611        pi_mr->pi_iova = pi_mr->data_length;
2612        ibmr->length = pi_mr->ibmr.length;
2613
2614        return n;
2615}
2616
2617int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2618                         int data_sg_nents, unsigned int *data_sg_offset,
2619                         struct scatterlist *meta_sg, int meta_sg_nents,
2620                         unsigned int *meta_sg_offset)
2621{
2622        struct mlx5_ib_mr *mr = to_mmr(ibmr);
2623        struct mlx5_ib_mr *pi_mr = NULL;
2624        int n;
2625
2626        WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);
2627
2628        mr->ndescs = 0;
2629        mr->data_length = 0;
2630        mr->data_iova = 0;
2631        mr->meta_ndescs = 0;
2632        mr->pi_iova = 0;
2633        /*
2634         * As a performance optimization, avoid the UMR operation for
2635         * registering the data/metadata buffers when possible: first try to
2636         * map the sg lists to PA descriptors with local_dma_lkey and fall
2637         * back to UMR only in case of a failure.
2638         */
2639        n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2640                                    data_sg_offset, meta_sg, meta_sg_nents,
2641                                    meta_sg_offset);
2642        if (n == data_sg_nents + meta_sg_nents)
2643                goto out;
2644        /*
2645         * As a performance optimization, avoid mapping the sg lists to KLM
2646         * descriptors when possible: first try to map them to MTT descriptors
2647         * and fall back to KLM only in case of a failure.
2648         * It's more efficient for the HW to work with MTT descriptors
2649         * (especially under high load), so KLM (indirect access) is used
2650         * only when it is mandatory.
2651         */
2652        pi_mr = mr->mtt_mr;
2653        n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2654                                     data_sg_offset, meta_sg, meta_sg_nents,
2655                                     meta_sg_offset);
2656        if (n == data_sg_nents + meta_sg_nents)
2657                goto out;
2658
2659        pi_mr = mr->klm_mr;
2660        n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2661                                     data_sg_offset, meta_sg, meta_sg_nents,
2662                                     meta_sg_offset);
2663        if (unlikely(n != data_sg_nents + meta_sg_nents))
2664                return -ENOMEM;
2665
2666out:
2667        /* This is a zero-based memory region */
2668        ibmr->iova = 0;
2669        mr->pi_mr = pi_mr;
2670        if (pi_mr)
2671                ibmr->sig_attrs->meta_length = pi_mr->meta_length;
2672        else
2673                ibmr->sig_attrs->meta_length = mr->meta_length;
2674
2675        return 0;
2676}
2677
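/*
 * ib_map_mr_sg entry point: build MTT descriptors via ib_sg_to_pages() for
 * MTT-mode MRs, or KLM descriptors for KLM-mode (SG_GAPS) MRs.
 */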
2678int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
2679                      unsigned int *sg_offset)
2680{
2681        struct mlx5_ib_mr *mr = to_mmr(ibmr);
2682        int n;
2683
2684        mr->ndescs = 0;
2685
2686        ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
2687                                   mr->desc_size * mr->max_descs,
2688                                   DMA_TO_DEVICE);
2689
2690        if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
2691                n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
2692                                       NULL);
2693        else
2694                n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
2695                                mlx5_set_page);
2696
2697        ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
2698                                      mr->desc_size * mr->max_descs,
2699                                      DMA_TO_DEVICE);
2700
2701        return n;
2702}
2703