linux/drivers/infiniband/hw/mlx5/mr.c
   1/*
   2 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
   3 * Copyright (c) 2020, Intel Corporation. All rights reserved.
   4 *
   5 * This software is available to you under a choice of one of two
   6 * licenses.  You may choose to be licensed under the terms of the GNU
   7 * General Public License (GPL) Version 2, available from the file
   8 * COPYING in the main directory of this source tree, or the
   9 * OpenIB.org BSD license below:
  10 *
  11 *     Redistribution and use in source and binary forms, with or
  12 *     without modification, are permitted provided that the following
  13 *     conditions are met:
  14 *
  15 *      - Redistributions of source code must retain the above
  16 *        copyright notice, this list of conditions and the following
  17 *        disclaimer.
  18 *
  19 *      - Redistributions in binary form must reproduce the above
  20 *        copyright notice, this list of conditions and the following
  21 *        disclaimer in the documentation and/or other materials
  22 *        provided with the distribution.
  23 *
  24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  31 * SOFTWARE.
  32 */
  33
  34
  35#include <linux/kref.h>
  36#include <linux/random.h>
  37#include <linux/debugfs.h>
  38#include <linux/export.h>
  39#include <linux/delay.h>
  40#include <linux/dma-buf.h>
  41#include <linux/dma-resv.h>
  42#include <rdma/ib_umem.h>
  43#include <rdma/ib_umem_odp.h>
  44#include <rdma/ib_verbs.h>
  45#include "dm.h"
  46#include "mlx5_ib.h"
  47
  48/*
  49 * We can't use an array for xlt_emergency_page because dma_map_single doesn't
   50 * work on kernel module memory
  51 */
  52void *xlt_emergency_page;
  53static DEFINE_MUTEX(xlt_emergency_page_mutex);
  54
  55enum {
  56        MAX_PENDING_REG_MR = 8,
  57};
  58
  59#define MLX5_UMR_ALIGN 2048
  60
  61static void
  62create_mkey_callback(int status, struct mlx5_async_work *context);
  63static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
  64                                     u64 iova, int access_flags,
  65                                     unsigned int page_size, bool populate);
  66
  67static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
  68                                          struct ib_pd *pd)
  69{
  70        struct mlx5_ib_dev *dev = to_mdev(pd->device);
  71        bool ro_pci_enabled = pcie_relaxed_ordering_enabled(dev->mdev->pdev);
  72
  73        MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
  74        MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
  75        MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
  76        MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
  77        MLX5_SET(mkc, mkc, lr, 1);
  78
  79        if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
  80                MLX5_SET(mkc, mkc, relaxed_ordering_write,
  81                         (acc & IB_ACCESS_RELAXED_ORDERING) && ro_pci_enabled);
  82        if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read))
  83                MLX5_SET(mkc, mkc, relaxed_ordering_read,
  84                         (acc & IB_ACCESS_RELAXED_ORDERING) && ro_pci_enabled);
  85
  86        MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
  87        MLX5_SET(mkc, mkc, qpn, 0xffffff);
  88        MLX5_SET64(mkc, mkc, start_addr, start_addr);
  89}
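/*
 * Example of the flag translation above: registering with
 * IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING
 * sets lr, lw and rr in the mkey context, and sets the
 * relaxed_ordering_read/write bits only when the HCA reports the capability
 * and PCIe relaxed ordering is enabled on the device.
 */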
  90
  91static void
  92assign_mkey_variant(struct mlx5_ib_dev *dev, struct mlx5_core_mkey *mkey,
  93                    u32 *in)
  94{
  95        u8 key = atomic_inc_return(&dev->mkey_var);
  96        void *mkc;
  97
  98        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
  99        MLX5_SET(mkc, mkc, mkey_7_0, key);
 100        mkey->key = key;
 101}
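/*
 * Sketch of how the final mkey value ends up being composed (assuming the
 * usual mlx5 layout where mlx5_idx_to_mkey() shifts the index by 8):
 *
 *	key = (mkey_index_from_FW << 8) | (mkey_var & 0xff)
 *
 * assign_mkey_variant() provides the low 8 bits here; the FW-assigned index
 * is OR-ed in later (see create_mkey_callback()). Cycling the variant means
 * a recycled index yields a different key, which helps catch stale mkeys.
 */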
 102
 103static int
 104mlx5_ib_create_mkey(struct mlx5_ib_dev *dev, struct mlx5_core_mkey *mkey,
 105                    u32 *in, int inlen)
 106{
 107        assign_mkey_variant(dev, mkey, in);
 108        return mlx5_core_create_mkey(dev->mdev, mkey, in, inlen);
 109}
 110
 111static int
 112mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev,
 113                       struct mlx5_core_mkey *mkey,
 114                       struct mlx5_async_ctx *async_ctx,
 115                       u32 *in, int inlen, u32 *out, int outlen,
 116                       struct mlx5_async_work *context)
 117{
 118        MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY);
 119        assign_mkey_variant(dev, mkey, in);
 120        return mlx5_cmd_exec_cb(async_ctx, in, inlen, out, outlen,
 121                                create_mkey_callback, context);
 122}
 123
 124static int mr_cache_max_order(struct mlx5_ib_dev *dev);
 125static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);
 126
 127static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
 128{
 129        return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled);
 130}
 131
 132static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 133{
 134        WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));
 135
 136        return mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
 137}
 138
 139static void create_mkey_callback(int status, struct mlx5_async_work *context)
 140{
 141        struct mlx5_ib_mr *mr =
 142                container_of(context, struct mlx5_ib_mr, cb_work);
 143        struct mlx5_cache_ent *ent = mr->cache_ent;
 144        struct mlx5_ib_dev *dev = ent->dev;
 145        unsigned long flags;
 146
 147        if (status) {
 148                mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
 149                kfree(mr);
 150                spin_lock_irqsave(&ent->lock, flags);
 151                ent->pending--;
 152                WRITE_ONCE(dev->fill_delay, 1);
 153                spin_unlock_irqrestore(&ent->lock, flags);
 154                mod_timer(&dev->delay_timer, jiffies + HZ);
 155                return;
 156        }
 157
 158        mr->mmkey.type = MLX5_MKEY_MR;
 159        mr->mmkey.key |= mlx5_idx_to_mkey(
 160                MLX5_GET(create_mkey_out, mr->out, mkey_index));
 161        init_waitqueue_head(&mr->mmkey.wait);
 162
 163        WRITE_ONCE(dev->cache.last_add, jiffies);
 164
 165        spin_lock_irqsave(&ent->lock, flags);
 166        list_add_tail(&mr->list, &ent->head);
 167        ent->available_mrs++;
 168        ent->total_mrs++;
 169        /* If we are doing fill_to_high_water then keep going. */
 170        queue_adjust_cache_locked(ent);
 171        ent->pending--;
 172        spin_unlock_irqrestore(&ent->lock, flags);
 173}
 174
 175static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc)
 176{
 177        struct mlx5_ib_mr *mr;
 178
 179        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 180        if (!mr)
 181                return NULL;
 182        mr->cache_ent = ent;
 183
 184        set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd);
 185        MLX5_SET(mkc, mkc, free, 1);
 186        MLX5_SET(mkc, mkc, umr_en, 1);
 187        MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3);
 188        MLX5_SET(mkc, mkc, access_mode_4_2, (ent->access_mode >> 2) & 0x7);
 189
 190        MLX5_SET(mkc, mkc, translations_octword_size, ent->xlt);
 191        MLX5_SET(mkc, mkc, log_page_size, ent->page);
 192        return mr;
 193}
 194
 195/* Asynchronously schedule new MRs to be populated in the cache. */
 196static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
 197{
 198        size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
 199        struct mlx5_ib_mr *mr;
 200        void *mkc;
 201        u32 *in;
 202        int err = 0;
 203        int i;
 204
 205        in = kzalloc(inlen, GFP_KERNEL);
 206        if (!in)
 207                return -ENOMEM;
 208
 209        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 210        for (i = 0; i < num; i++) {
 211                mr = alloc_cache_mr(ent, mkc);
 212                if (!mr) {
 213                        err = -ENOMEM;
 214                        break;
 215                }
 216                spin_lock_irq(&ent->lock);
 217                if (ent->pending >= MAX_PENDING_REG_MR) {
 218                        err = -EAGAIN;
 219                        spin_unlock_irq(&ent->lock);
 220                        kfree(mr);
 221                        break;
 222                }
 223                ent->pending++;
 224                spin_unlock_irq(&ent->lock);
 225                err = mlx5_ib_create_mkey_cb(ent->dev, &mr->mmkey,
 226                                             &ent->dev->async_ctx, in, inlen,
 227                                             mr->out, sizeof(mr->out),
 228                                             &mr->cb_work);
 229                if (err) {
 230                        spin_lock_irq(&ent->lock);
 231                        ent->pending--;
 232                        spin_unlock_irq(&ent->lock);
 233                        mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
 234                        kfree(mr);
 235                        break;
 236                }
 237        }
 238
 239        kfree(in);
 240        return err;
 241}
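/*
 * Throttling example for the -EAGAIN path above: with MAX_PENDING_REG_MR == 8
 * a request to add 32 keys queues at most 8 asynchronous CREATE_MKEY commands
 * and then returns -EAGAIN; resize_available_mrs() sleeps briefly and calls
 * add_keys() again once completions have drained the pending count.
 */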
 242
  243/* Synchronously create an MR in the cache */
 244static struct mlx5_ib_mr *create_cache_mr(struct mlx5_cache_ent *ent)
 245{
 246        size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
 247        struct mlx5_ib_mr *mr;
 248        void *mkc;
 249        u32 *in;
 250        int err;
 251
 252        in = kzalloc(inlen, GFP_KERNEL);
 253        if (!in)
 254                return ERR_PTR(-ENOMEM);
 255        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 256
 257        mr = alloc_cache_mr(ent, mkc);
 258        if (!mr) {
 259                err = -ENOMEM;
 260                goto free_in;
 261        }
 262
 263        err = mlx5_core_create_mkey(ent->dev->mdev, &mr->mmkey, in, inlen);
 264        if (err)
 265                goto free_mr;
 266
 267        mr->mmkey.type = MLX5_MKEY_MR;
 268        WRITE_ONCE(ent->dev->cache.last_add, jiffies);
 269        spin_lock_irq(&ent->lock);
 270        ent->total_mrs++;
 271        spin_unlock_irq(&ent->lock);
 272        kfree(in);
 273        return mr;
 274free_mr:
 275        kfree(mr);
 276free_in:
 277        kfree(in);
 278        return ERR_PTR(err);
 279}
 280
 281static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
 282{
 283        struct mlx5_ib_mr *mr;
 284
 285        lockdep_assert_held(&ent->lock);
 286        if (list_empty(&ent->head))
 287                return;
 288        mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
 289        list_del(&mr->list);
 290        ent->available_mrs--;
 291        ent->total_mrs--;
 292        spin_unlock_irq(&ent->lock);
 293        mlx5_core_destroy_mkey(ent->dev->mdev, &mr->mmkey);
 294        kfree(mr);
 295        spin_lock_irq(&ent->lock);
 296}
 297
 298static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
 299                                bool limit_fill)
 300{
 301        int err;
 302
 303        lockdep_assert_held(&ent->lock);
 304
 305        while (true) {
 306                if (limit_fill)
 307                        target = ent->limit * 2;
 308                if (target == ent->available_mrs + ent->pending)
 309                        return 0;
 310                if (target > ent->available_mrs + ent->pending) {
 311                        u32 todo = target - (ent->available_mrs + ent->pending);
 312
 313                        spin_unlock_irq(&ent->lock);
 314                        err = add_keys(ent, todo);
 315                        if (err == -EAGAIN)
 316                                usleep_range(3000, 5000);
 317                        spin_lock_irq(&ent->lock);
 318                        if (err) {
 319                                if (err != -EAGAIN)
 320                                        return err;
 321                        } else
 322                                return 0;
 323                } else {
 324                        remove_cache_mr_locked(ent);
 325                }
 326        }
 327}
 328
 329static ssize_t size_write(struct file *filp, const char __user *buf,
 330                          size_t count, loff_t *pos)
 331{
 332        struct mlx5_cache_ent *ent = filp->private_data;
 333        u32 target;
 334        int err;
 335
 336        err = kstrtou32_from_user(buf, count, 0, &target);
 337        if (err)
 338                return err;
 339
 340        /*
  341         * Target is the new value of total_mrs the user requests; however, we
 342         * cannot free MRs that are in use. Compute the target value for
 343         * available_mrs.
 344         */
 345        spin_lock_irq(&ent->lock);
 346        if (target < ent->total_mrs - ent->available_mrs) {
 347                err = -EINVAL;
 348                goto err_unlock;
 349        }
 350        target = target - (ent->total_mrs - ent->available_mrs);
 351        if (target < ent->limit || target > ent->limit*2) {
 352                err = -EINVAL;
 353                goto err_unlock;
 354        }
 355        err = resize_available_mrs(ent, target, false);
 356        if (err)
 357                goto err_unlock;
 358        spin_unlock_irq(&ent->lock);
 359
 360        return count;
 361
 362err_unlock:
 363        spin_unlock_irq(&ent->lock);
 364        return err;
 365}
 366
 367static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
 368                         loff_t *pos)
 369{
 370        struct mlx5_cache_ent *ent = filp->private_data;
 371        char lbuf[20];
 372        int err;
 373
 374        err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->total_mrs);
 375        if (err < 0)
 376                return err;
 377
 378        return simple_read_from_buffer(buf, count, pos, lbuf, err);
 379}
 380
 381static const struct file_operations size_fops = {
 382        .owner  = THIS_MODULE,
 383        .open   = simple_open,
 384        .write  = size_write,
 385        .read   = size_read,
 386};
 387
 388static ssize_t limit_write(struct file *filp, const char __user *buf,
 389                           size_t count, loff_t *pos)
 390{
 391        struct mlx5_cache_ent *ent = filp->private_data;
 392        u32 var;
 393        int err;
 394
 395        err = kstrtou32_from_user(buf, count, 0, &var);
 396        if (err)
 397                return err;
 398
 399        /*
  400         * Upon set, we immediately fill the cache to the high water mark
  401         * implied by the limit.
 402         */
 403        spin_lock_irq(&ent->lock);
 404        ent->limit = var;
 405        err = resize_available_mrs(ent, 0, true);
 406        spin_unlock_irq(&ent->lock);
 407        if (err)
 408                return err;
 409        return count;
 410}
 411
 412static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
 413                          loff_t *pos)
 414{
 415        struct mlx5_cache_ent *ent = filp->private_data;
 416        char lbuf[20];
 417        int err;
 418
 419        err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
 420        if (err < 0)
 421                return err;
 422
 423        return simple_read_from_buffer(buf, count, pos, lbuf, err);
 424}
 425
 426static const struct file_operations limit_fops = {
 427        .owner  = THIS_MODULE,
 428        .open   = simple_open,
 429        .write  = limit_write,
 430        .read   = limit_read,
 431};
 432
 433static bool someone_adding(struct mlx5_mr_cache *cache)
 434{
 435        unsigned int i;
 436
 437        for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
 438                struct mlx5_cache_ent *ent = &cache->ent[i];
 439                bool ret;
 440
 441                spin_lock_irq(&ent->lock);
 442                ret = ent->available_mrs < ent->limit;
 443                spin_unlock_irq(&ent->lock);
 444                if (ret)
 445                        return true;
 446        }
 447        return false;
 448}
 449
 450/*
 451 * Check if the bucket is outside the high/low water mark and schedule an async
  452 * update. The cache refill has hysteresis: once the low water mark is hit,
  453 * it is refilled up to the high water mark.
 454 */
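/*
 * Worked example of the thresholds, assuming ent->limit == 16:
 *   available_mrs < 16   -> start filling, set fill_to_high_water
 *   filling continues while available_mrs + pending < 32
 *   available_mrs == 32  -> stop filling (high water mark reached)
 *   available_mrs > 32   -> queue work to free the excess MRs
 */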
 455static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
 456{
 457        lockdep_assert_held(&ent->lock);
 458
 459        if (ent->disabled || READ_ONCE(ent->dev->fill_delay))
 460                return;
 461        if (ent->available_mrs < ent->limit) {
 462                ent->fill_to_high_water = true;
 463                queue_work(ent->dev->cache.wq, &ent->work);
 464        } else if (ent->fill_to_high_water &&
 465                   ent->available_mrs + ent->pending < 2 * ent->limit) {
 466                /*
  467                 * Once we start populating due to hitting a low water mark,
 468                 * continue until we pass the high water mark.
 469                 */
 470                queue_work(ent->dev->cache.wq, &ent->work);
 471        } else if (ent->available_mrs == 2 * ent->limit) {
 472                ent->fill_to_high_water = false;
 473        } else if (ent->available_mrs > 2 * ent->limit) {
 474                /* Queue deletion of excess entries */
 475                ent->fill_to_high_water = false;
 476                if (ent->pending)
 477                        queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
 478                                           msecs_to_jiffies(1000));
 479                else
 480                        queue_work(ent->dev->cache.wq, &ent->work);
 481        }
 482}
 483
 484static void __cache_work_func(struct mlx5_cache_ent *ent)
 485{
 486        struct mlx5_ib_dev *dev = ent->dev;
 487        struct mlx5_mr_cache *cache = &dev->cache;
 488        int err;
 489
 490        spin_lock_irq(&ent->lock);
 491        if (ent->disabled)
 492                goto out;
 493
 494        if (ent->fill_to_high_water &&
 495            ent->available_mrs + ent->pending < 2 * ent->limit &&
 496            !READ_ONCE(dev->fill_delay)) {
 497                spin_unlock_irq(&ent->lock);
 498                err = add_keys(ent, 1);
 499                spin_lock_irq(&ent->lock);
 500                if (ent->disabled)
 501                        goto out;
 502                if (err) {
 503                        /*
 504                         * EAGAIN only happens if pending is positive, so we
 505                         * will be rescheduled from reg_mr_callback(). The only
 506                         * failure path here is ENOMEM.
 507                         */
 508                        if (err != -EAGAIN) {
 509                                mlx5_ib_warn(
 510                                        dev,
 511                                        "command failed order %d, err %d\n",
 512                                        ent->order, err);
 513                                queue_delayed_work(cache->wq, &ent->dwork,
 514                                                   msecs_to_jiffies(1000));
 515                        }
 516                }
 517        } else if (ent->available_mrs > 2 * ent->limit) {
 518                bool need_delay;
 519
 520                /*
  521                 * The remove_cache_mr() logic is performed as a garbage
  522                 * collection task. Such a task is intended to run when no
  523                 * other active processes are running.
  524                 *
  525                 * need_resched() will return true if there are user tasks
  526                 * to be activated in the near future.
  527                 *
  528                 * In that case, we don't execute remove_cache_mr() and
  529                 * postpone the garbage collection work to the next cycle,
  530                 * in order to free CPU resources for other tasks.
 531                 */
 532                spin_unlock_irq(&ent->lock);
 533                need_delay = need_resched() || someone_adding(cache) ||
 534                             !time_after(jiffies,
 535                                         READ_ONCE(cache->last_add) + 300 * HZ);
 536                spin_lock_irq(&ent->lock);
 537                if (ent->disabled)
 538                        goto out;
 539                if (need_delay)
 540                        queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
 541                remove_cache_mr_locked(ent);
 542                queue_adjust_cache_locked(ent);
 543        }
 544out:
 545        spin_unlock_irq(&ent->lock);
 546}
 547
 548static void delayed_cache_work_func(struct work_struct *work)
 549{
 550        struct mlx5_cache_ent *ent;
 551
 552        ent = container_of(work, struct mlx5_cache_ent, dwork.work);
 553        __cache_work_func(ent);
 554}
 555
 556static void cache_work_func(struct work_struct *work)
 557{
 558        struct mlx5_cache_ent *ent;
 559
 560        ent = container_of(work, struct mlx5_cache_ent, work);
 561        __cache_work_func(ent);
 562}
 563
 564/* Allocate a special entry from the cache */
 565struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
 566                                       unsigned int entry, int access_flags)
 567{
 568        struct mlx5_mr_cache *cache = &dev->cache;
 569        struct mlx5_cache_ent *ent;
 570        struct mlx5_ib_mr *mr;
 571
 572        if (WARN_ON(entry <= MR_CACHE_LAST_STD_ENTRY ||
 573                    entry >= ARRAY_SIZE(cache->ent)))
 574                return ERR_PTR(-EINVAL);
 575
 576        /* Matches access in alloc_cache_mr() */
 577        if (!mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags))
 578                return ERR_PTR(-EOPNOTSUPP);
 579
 580        ent = &cache->ent[entry];
 581        spin_lock_irq(&ent->lock);
 582        if (list_empty(&ent->head)) {
 583                spin_unlock_irq(&ent->lock);
 584                mr = create_cache_mr(ent);
 585                if (IS_ERR(mr))
 586                        return mr;
 587        } else {
 588                mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
 589                list_del(&mr->list);
 590                ent->available_mrs--;
 591                queue_adjust_cache_locked(ent);
 592                spin_unlock_irq(&ent->lock);
 593
 594                mlx5_clear_mr(mr);
 595        }
 596        mr->access_flags = access_flags;
 597        return mr;
 598}
 599
  600/* Return an MR already available in the cache */
 601static struct mlx5_ib_mr *get_cache_mr(struct mlx5_cache_ent *req_ent)
 602{
 603        struct mlx5_ib_dev *dev = req_ent->dev;
 604        struct mlx5_ib_mr *mr = NULL;
 605        struct mlx5_cache_ent *ent = req_ent;
 606
 607        /* Try larger MR pools from the cache to satisfy the allocation */
 608        for (; ent != &dev->cache.ent[MR_CACHE_LAST_STD_ENTRY + 1]; ent++) {
 609                mlx5_ib_dbg(dev, "order %u, cache index %zu\n", ent->order,
 610                            ent - dev->cache.ent);
 611
 612                spin_lock_irq(&ent->lock);
 613                if (!list_empty(&ent->head)) {
 614                        mr = list_first_entry(&ent->head, struct mlx5_ib_mr,
 615                                              list);
 616                        list_del(&mr->list);
 617                        ent->available_mrs--;
 618                        queue_adjust_cache_locked(ent);
 619                        spin_unlock_irq(&ent->lock);
 620                        mlx5_clear_mr(mr);
 621                        return mr;
 622                }
 623                queue_adjust_cache_locked(ent);
 624                spin_unlock_irq(&ent->lock);
 625        }
 626        req_ent->miss++;
 627        return NULL;
 628}
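/*
 * Example of the fallback above: if the order-7 bucket is empty, an MR is
 * taken from the order-8 (or larger) bucket instead; it simply carries more
 * translation entries than strictly needed. The miss counter of the
 * requested bucket is bumped only when every larger bucket is empty as well.
 */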
 629
 630static void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 631{
 632        struct mlx5_cache_ent *ent = mr->cache_ent;
 633
 634        spin_lock_irq(&ent->lock);
 635        list_add_tail(&mr->list, &ent->head);
 636        ent->available_mrs++;
 637        queue_adjust_cache_locked(ent);
 638        spin_unlock_irq(&ent->lock);
 639}
 640
 641static void clean_keys(struct mlx5_ib_dev *dev, int c)
 642{
 643        struct mlx5_mr_cache *cache = &dev->cache;
 644        struct mlx5_cache_ent *ent = &cache->ent[c];
 645        struct mlx5_ib_mr *tmp_mr;
 646        struct mlx5_ib_mr *mr;
 647        LIST_HEAD(del_list);
 648
 649        cancel_delayed_work(&ent->dwork);
 650        while (1) {
 651                spin_lock_irq(&ent->lock);
 652                if (list_empty(&ent->head)) {
 653                        spin_unlock_irq(&ent->lock);
 654                        break;
 655                }
 656                mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
 657                list_move(&mr->list, &del_list);
 658                ent->available_mrs--;
 659                ent->total_mrs--;
 660                spin_unlock_irq(&ent->lock);
 661                mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
 662        }
 663
 664        list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
 665                list_del(&mr->list);
 666                kfree(mr);
 667        }
 668}
 669
 670static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
 671{
 672        if (!mlx5_debugfs_root || dev->is_rep)
 673                return;
 674
 675        debugfs_remove_recursive(dev->cache.root);
 676        dev->cache.root = NULL;
 677}
 678
 679static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
 680{
 681        struct mlx5_mr_cache *cache = &dev->cache;
 682        struct mlx5_cache_ent *ent;
 683        struct dentry *dir;
 684        int i;
 685
 686        if (!mlx5_debugfs_root || dev->is_rep)
 687                return;
 688
 689        cache->root = debugfs_create_dir("mr_cache", dev->mdev->priv.dbg_root);
 690
 691        for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
 692                ent = &cache->ent[i];
 693                sprintf(ent->name, "%d", ent->order);
 694                dir = debugfs_create_dir(ent->name, cache->root);
 695                debugfs_create_file("size", 0600, dir, ent, &size_fops);
 696                debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
 697                debugfs_create_u32("cur", 0400, dir, &ent->available_mrs);
 698                debugfs_create_u32("miss", 0600, dir, &ent->miss);
 699        }
 700}
 701
 702static void delay_time_func(struct timer_list *t)
 703{
 704        struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer);
 705
 706        WRITE_ONCE(dev->fill_delay, 0);
 707}
 708
 709int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 710{
 711        struct mlx5_mr_cache *cache = &dev->cache;
 712        struct mlx5_cache_ent *ent;
 713        int i;
 714
 715        mutex_init(&dev->slow_path_mutex);
 716        cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
 717        if (!cache->wq) {
 718                mlx5_ib_warn(dev, "failed to create work queue\n");
 719                return -ENOMEM;
 720        }
 721
 722        mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
 723        timer_setup(&dev->delay_timer, delay_time_func, 0);
 724        for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
 725                ent = &cache->ent[i];
 726                INIT_LIST_HEAD(&ent->head);
 727                spin_lock_init(&ent->lock);
 728                ent->order = i + 2;
 729                ent->dev = dev;
 730                ent->limit = 0;
 731
 732                INIT_WORK(&ent->work, cache_work_func);
 733                INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
 734
 735                if (i > MR_CACHE_LAST_STD_ENTRY) {
 736                        mlx5_odp_init_mr_cache_entry(ent);
 737                        continue;
 738                }
 739
 740                if (ent->order > mr_cache_max_order(dev))
 741                        continue;
 742
 743                ent->page = PAGE_SHIFT;
 744                ent->xlt = (1 << ent->order) * sizeof(struct mlx5_mtt) /
 745                           MLX5_IB_UMR_OCTOWORD;
 746                ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
 747                if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) &&
 748                    !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
 749                    mlx5_ib_can_load_pas_with_umr(dev, 0))
 750                        ent->limit = dev->mdev->profile.mr_cache[i].limit;
 751                else
 752                        ent->limit = 0;
 753                spin_lock_irq(&ent->lock);
 754                queue_adjust_cache_locked(ent);
 755                spin_unlock_irq(&ent->lock);
 756        }
 757
 758        mlx5_mr_cache_debugfs_init(dev);
 759
 760        return 0;
 761}
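/*
 * Sizing example for a standard entry set up in mlx5_mr_cache_init() above,
 * assuming 8-byte MTT descriptors and MLX5_IB_UMR_OCTOWORD == 16: for i == 6
 * the entry has order 8 (256 pages) and ent->xlt = 256 * 8 / 16 = 128
 * octowords of translation space per cached mkey.
 */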
 762
 763int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
 764{
 765        unsigned int i;
 766
 767        if (!dev->cache.wq)
 768                return 0;
 769
 770        for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
 771                struct mlx5_cache_ent *ent = &dev->cache.ent[i];
 772
 773                spin_lock_irq(&ent->lock);
 774                ent->disabled = true;
 775                spin_unlock_irq(&ent->lock);
 776                cancel_work_sync(&ent->work);
 777                cancel_delayed_work_sync(&ent->dwork);
 778        }
 779
 780        mlx5_mr_cache_debugfs_cleanup(dev);
 781        mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
 782
 783        for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)
 784                clean_keys(dev, i);
 785
 786        destroy_workqueue(dev->cache.wq);
 787        del_timer_sync(&dev->delay_timer);
 788
 789        return 0;
 790}
 791
 792struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
 793{
 794        struct mlx5_ib_dev *dev = to_mdev(pd->device);
 795        int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
 796        struct mlx5_ib_mr *mr;
 797        void *mkc;
 798        u32 *in;
 799        int err;
 800
 801        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 802        if (!mr)
 803                return ERR_PTR(-ENOMEM);
 804
 805        in = kzalloc(inlen, GFP_KERNEL);
 806        if (!in) {
 807                err = -ENOMEM;
 808                goto err_free;
 809        }
 810
 811        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 812
 813        MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
 814        MLX5_SET(mkc, mkc, length64, 1);
 815        set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0,
 816                                      pd);
 817
 818        err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
 819        if (err)
 820                goto err_in;
 821
 822        kfree(in);
 823        mr->mmkey.type = MLX5_MKEY_MR;
 824        mr->ibmr.lkey = mr->mmkey.key;
 825        mr->ibmr.rkey = mr->mmkey.key;
 826        mr->umem = NULL;
 827
 828        return &mr->ibmr;
 829
 830err_in:
 831        kfree(in);
 832
 833err_free:
 834        kfree(mr);
 835
 836        return ERR_PTR(err);
 837}
 838
 839static int get_octo_len(u64 addr, u64 len, int page_shift)
 840{
 841        u64 page_size = 1ULL << page_shift;
 842        u64 offset;
 843        int npages;
 844
 845        offset = addr & (page_size - 1);
 846        npages = ALIGN(len + offset, page_size) >> page_shift;
 847        return (npages + 1) / 2;
 848}
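/*
 * Example: addr = 0x1080, len = 0x3000 and page_shift = 12 give
 * offset = 0x80 and npages = ALIGN(0x3080, 0x1000) >> 12 = 4, so the
 * function returns (4 + 1) / 2 = 2: two 8-byte MTT entries fit in each
 * 16-byte octoword, hence the rounded-up division by two.
 */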
 849
 850static int mr_cache_max_order(struct mlx5_ib_dev *dev)
 851{
 852        if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
 853                return MR_CACHE_LAST_STD_ENTRY + 2;
 854        return MLX5_MAX_UMR_SHIFT;
 855}
 856
 857static void mlx5_ib_umr_done(struct ib_cq *cq, struct ib_wc *wc)
 858{
 859        struct mlx5_ib_umr_context *context =
 860                container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe);
 861
 862        context->status = wc->status;
 863        complete(&context->done);
 864}
 865
 866static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context)
 867{
 868        context->cqe.done = mlx5_ib_umr_done;
 869        context->status = -1;
 870        init_completion(&context->done);
 871}
 872
 873static int mlx5_ib_post_send_wait(struct mlx5_ib_dev *dev,
 874                                  struct mlx5_umr_wr *umrwr)
 875{
 876        struct umr_common *umrc = &dev->umrc;
 877        const struct ib_send_wr *bad;
 878        int err;
 879        struct mlx5_ib_umr_context umr_context;
 880
 881        mlx5_ib_init_umr_context(&umr_context);
 882        umrwr->wr.wr_cqe = &umr_context.cqe;
 883
 884        down(&umrc->sem);
 885        err = ib_post_send(umrc->qp, &umrwr->wr, &bad);
 886        if (err) {
 887                mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err);
 888        } else {
 889                wait_for_completion(&umr_context.done);
 890                if (umr_context.status != IB_WC_SUCCESS) {
 891                        mlx5_ib_warn(dev, "reg umr failed (%u)\n",
 892                                     umr_context.status);
 893                        err = -EFAULT;
 894                }
 895        }
 896        up(&umrc->sem);
 897        return err;
 898}
 899
 900static struct mlx5_cache_ent *mr_cache_ent_from_order(struct mlx5_ib_dev *dev,
 901                                                      unsigned int order)
 902{
 903        struct mlx5_mr_cache *cache = &dev->cache;
 904
 905        if (order < cache->ent[0].order)
 906                return &cache->ent[0];
 907        order = order - cache->ent[0].order;
 908        if (order > MR_CACHE_LAST_STD_ENTRY)
 909                return NULL;
 910        return &cache->ent[order];
 911}
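/*
 * Example mapping, given that mlx5_mr_cache_init() sets ent[i].order = i + 2:
 * order 2 (or less) -> ent[0], order 9 -> ent[7], and any order above
 * MR_CACHE_LAST_STD_ENTRY + 2 returns NULL (no suitable bucket).
 */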
 912
 913static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
 914                          u64 length, int access_flags)
 915{
 916        mr->ibmr.lkey = mr->mmkey.key;
 917        mr->ibmr.rkey = mr->mmkey.key;
 918        mr->ibmr.length = length;
 919        mr->ibmr.device = &dev->ib_dev;
 920        mr->access_flags = access_flags;
 921}
 922
 923static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
 924                                                  u64 iova)
 925{
 926        /*
 927         * The alignment of iova has already been checked upon entering
 928         * UVERBS_METHOD_REG_DMABUF_MR
 929         */
 930        umem->iova = iova;
 931        return PAGE_SIZE;
 932}
 933
 934static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
 935                                             struct ib_umem *umem, u64 iova,
 936                                             int access_flags)
 937{
 938        struct mlx5_ib_dev *dev = to_mdev(pd->device);
 939        struct mlx5_cache_ent *ent;
 940        struct mlx5_ib_mr *mr;
 941        unsigned int page_size;
 942
 943        if (umem->is_dmabuf)
 944                page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
 945        else
 946                page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size,
 947                                                     0, iova);
 948        if (WARN_ON(!page_size))
 949                return ERR_PTR(-EINVAL);
 950        ent = mr_cache_ent_from_order(
 951                dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size)));
 952        /*
 953         * Matches access in alloc_cache_mr(). If the MR can't come from the
 954         * cache then synchronously create an uncached one.
 955         */
 956        if (!ent || ent->limit == 0 ||
 957            !mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags)) {
 958                mutex_lock(&dev->slow_path_mutex);
 959                mr = reg_create(pd, umem, iova, access_flags, page_size, false);
 960                mutex_unlock(&dev->slow_path_mutex);
 961                return mr;
 962        }
 963
 964        mr = get_cache_mr(ent);
 965        if (!mr) {
 966                mr = create_cache_mr(ent);
 967                /*
 968                 * The above already tried to do the same stuff as reg_create(),
 969                 * no reason to try it again.
 970                 */
 971                if (IS_ERR(mr))
 972                        return mr;
 973        }
 974
 975        mr->ibmr.pd = pd;
 976        mr->umem = umem;
 977        mr->mmkey.iova = iova;
 978        mr->mmkey.size = umem->length;
 979        mr->mmkey.pd = to_mpd(pd)->pdn;
 980        mr->page_shift = order_base_2(page_size);
 981        set_mr_fields(dev, mr, umem->length, access_flags);
 982
 983        return mr;
 984}
 985
 986#define MLX5_MAX_UMR_CHUNK ((1 << (MLX5_MAX_UMR_SHIFT + 4)) - \
 987                            MLX5_UMR_MTT_ALIGNMENT)
 988#define MLX5_SPARE_UMR_CHUNK 0x10000
 989
 990/*
 991 * Allocate a temporary buffer to hold the per-page information to transfer to
 992 * HW. For efficiency this should be as large as it can be, but buffer
 993 * allocation failure is not allowed, so try smaller sizes.
 994 */
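/*
 * The allocation below falls back through progressively smaller sizes,
 * roughly (assuming MLX5_MAX_UMR_SHIFT == 16, which puts MLX5_MAX_UMR_CHUNK
 * just under 1 MiB):
 *
 *	~1 MiB chunk -> MLX5_SPARE_UMR_CHUNK (64 KiB) -> a single page ->
 *	the preallocated xlt_emergency_page, serialized by its mutex
 */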
 995static void *mlx5_ib_alloc_xlt(size_t *nents, size_t ent_size, gfp_t gfp_mask)
 996{
 997        const size_t xlt_chunk_align =
 998                MLX5_UMR_MTT_ALIGNMENT / ent_size;
 999        size_t size;
1000        void *res = NULL;
1001
1002        static_assert(PAGE_SIZE % MLX5_UMR_MTT_ALIGNMENT == 0);
1003
1004        /*
 1005         * MLX5_IB_UPD_XLT_ATOMIC doesn't signal an atomic context, just that the
1006         * allocation can't trigger any kind of reclaim.
1007         */
1008        might_sleep();
1009
1010        gfp_mask |= __GFP_ZERO | __GFP_NORETRY;
1011
1012        /*
1013         * If the system already has a suitable high order page then just use
1014         * that, but don't try hard to create one. This max is about 1M, so a
1015         * free x86 huge page will satisfy it.
1016         */
1017        size = min_t(size_t, ent_size * ALIGN(*nents, xlt_chunk_align),
1018                     MLX5_MAX_UMR_CHUNK);
1019        *nents = size / ent_size;
1020        res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
1021                                       get_order(size));
1022        if (res)
1023                return res;
1024
1025        if (size > MLX5_SPARE_UMR_CHUNK) {
1026                size = MLX5_SPARE_UMR_CHUNK;
1027                *nents = size / ent_size;
1028                res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
1029                                               get_order(size));
1030                if (res)
1031                        return res;
1032        }
1033
1034        *nents = PAGE_SIZE / ent_size;
1035        res = (void *)__get_free_page(gfp_mask);
1036        if (res)
1037                return res;
1038
1039        mutex_lock(&xlt_emergency_page_mutex);
1040        memset(xlt_emergency_page, 0, PAGE_SIZE);
1041        return xlt_emergency_page;
1042}
1043
1044static void mlx5_ib_free_xlt(void *xlt, size_t length)
1045{
1046        if (xlt == xlt_emergency_page) {
1047                mutex_unlock(&xlt_emergency_page_mutex);
1048                return;
1049        }
1050
1051        free_pages((unsigned long)xlt, get_order(length));
1052}
1053
1054/*
1055 * Create a MLX5_IB_SEND_UMR_UPDATE_XLT work request and XLT buffer ready for
1056 * submission.
1057 */
1058static void *mlx5_ib_create_xlt_wr(struct mlx5_ib_mr *mr,
1059                                   struct mlx5_umr_wr *wr, struct ib_sge *sg,
1060                                   size_t nents, size_t ent_size,
1061                                   unsigned int flags)
1062{
1063        struct mlx5_ib_dev *dev = mr_to_mdev(mr);
1064        struct device *ddev = &dev->mdev->pdev->dev;
1065        dma_addr_t dma;
1066        void *xlt;
1067
1068        xlt = mlx5_ib_alloc_xlt(&nents, ent_size,
1069                                flags & MLX5_IB_UPD_XLT_ATOMIC ? GFP_ATOMIC :
1070                                                                 GFP_KERNEL);
1071        sg->length = nents * ent_size;
1072        dma = dma_map_single(ddev, xlt, sg->length, DMA_TO_DEVICE);
1073        if (dma_mapping_error(ddev, dma)) {
1074                mlx5_ib_err(dev, "unable to map DMA during XLT update.\n");
1075                mlx5_ib_free_xlt(xlt, sg->length);
1076                return NULL;
1077        }
1078        sg->addr = dma;
1079        sg->lkey = dev->umrc.pd->local_dma_lkey;
1080
1081        memset(wr, 0, sizeof(*wr));
1082        wr->wr.send_flags = MLX5_IB_SEND_UMR_UPDATE_XLT;
1083        if (!(flags & MLX5_IB_UPD_XLT_ENABLE))
1084                wr->wr.send_flags |= MLX5_IB_SEND_UMR_FAIL_IF_FREE;
1085        wr->wr.sg_list = sg;
1086        wr->wr.num_sge = 1;
1087        wr->wr.opcode = MLX5_IB_WR_UMR;
1088        wr->pd = mr->ibmr.pd;
1089        wr->mkey = mr->mmkey.key;
1090        wr->length = mr->mmkey.size;
1091        wr->virt_addr = mr->mmkey.iova;
1092        wr->access_flags = mr->access_flags;
1093        wr->page_shift = mr->page_shift;
1094        wr->xlt_size = sg->length;
1095        return xlt;
1096}
1097
1098static void mlx5_ib_unmap_free_xlt(struct mlx5_ib_dev *dev, void *xlt,
1099                                   struct ib_sge *sg)
1100{
1101        struct device *ddev = &dev->mdev->pdev->dev;
1102
1103        dma_unmap_single(ddev, sg->addr, sg->length, DMA_TO_DEVICE);
1104        mlx5_ib_free_xlt(xlt, sg->length);
1105}
1106
1107static unsigned int xlt_wr_final_send_flags(unsigned int flags)
1108{
1109        unsigned int res = 0;
1110
1111        if (flags & MLX5_IB_UPD_XLT_ENABLE)
1112                res |= MLX5_IB_SEND_UMR_ENABLE_MR |
1113                       MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS |
1114                       MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
1115        if (flags & MLX5_IB_UPD_XLT_PD || flags & MLX5_IB_UPD_XLT_ACCESS)
1116                res |= MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
1117        if (flags & MLX5_IB_UPD_XLT_ADDR)
1118                res |= MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
1119        return res;
1120}
1121
1122int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
1123                       int page_shift, int flags)
1124{
1125        struct mlx5_ib_dev *dev = mr_to_mdev(mr);
1126        struct device *ddev = &dev->mdev->pdev->dev;
1127        void *xlt;
1128        struct mlx5_umr_wr wr;
1129        struct ib_sge sg;
1130        int err = 0;
1131        int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT)
1132                               ? sizeof(struct mlx5_klm)
1133                               : sizeof(struct mlx5_mtt);
1134        const int page_align = MLX5_UMR_MTT_ALIGNMENT / desc_size;
1135        const int page_mask = page_align - 1;
1136        size_t pages_mapped = 0;
1137        size_t pages_to_map = 0;
1138        size_t pages_iter;
1139        size_t size_to_map = 0;
1140        size_t orig_sg_length;
1141
1142        if ((flags & MLX5_IB_UPD_XLT_INDIRECT) &&
1143            !umr_can_use_indirect_mkey(dev))
1144                return -EPERM;
1145
1146        if (WARN_ON(!mr->umem->is_odp))
1147                return -EINVAL;
1148
1149        /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes,
1150         * so we need to align the offset and length accordingly
1151         */
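        /*
         * Worked example for MTT descriptors, assuming 8-byte entries and
         * MLX5_UMR_MTT_ALIGNMENT == 64: page_align == 8 and page_mask == 7,
         * so an update starting at idx 13 for 3 pages becomes idx 8 for 8
         * pages after the alignment below.
         */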
1152        if (idx & page_mask) {
1153                npages += idx & page_mask;
1154                idx &= ~page_mask;
1155        }
1156        pages_to_map = ALIGN(npages, page_align);
1157
1158        xlt = mlx5_ib_create_xlt_wr(mr, &wr, &sg, npages, desc_size, flags);
1159        if (!xlt)
1160                return -ENOMEM;
1161        pages_iter = sg.length / desc_size;
1162        orig_sg_length = sg.length;
1163
1164        if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) {
1165                struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
1166                size_t max_pages = ib_umem_odp_num_pages(odp) - idx;
1167
1168                pages_to_map = min_t(size_t, pages_to_map, max_pages);
1169        }
1170
1171        wr.page_shift = page_shift;
1172
1173        for (pages_mapped = 0;
1174             pages_mapped < pages_to_map && !err;
1175             pages_mapped += pages_iter, idx += pages_iter) {
1176                npages = min_t(int, pages_iter, pages_to_map - pages_mapped);
1177                size_to_map = npages * desc_size;
1178                dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
1179                                        DMA_TO_DEVICE);
1180                mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
1181                dma_sync_single_for_device(ddev, sg.addr, sg.length,
1182                                           DMA_TO_DEVICE);
1183
1184                sg.length = ALIGN(size_to_map, MLX5_UMR_MTT_ALIGNMENT);
1185
1186                if (pages_mapped + pages_iter >= pages_to_map)
1187                        wr.wr.send_flags |= xlt_wr_final_send_flags(flags);
1188
1189                wr.offset = idx * desc_size;
1190                wr.xlt_size = sg.length;
1191
1192                err = mlx5_ib_post_send_wait(dev, &wr);
1193        }
1194        sg.length = orig_sg_length;
1195        mlx5_ib_unmap_free_xlt(dev, xlt, &sg);
1196        return err;
1197}
1198
1199/*
1200 * Send the DMA list to the HW for a normal MR using UMR.
 1201 * A dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP
1202 * flag may be used.
1203 */
1204int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
1205{
1206        struct mlx5_ib_dev *dev = mr_to_mdev(mr);
1207        struct device *ddev = &dev->mdev->pdev->dev;
1208        struct ib_block_iter biter;
1209        struct mlx5_mtt *cur_mtt;
1210        struct mlx5_umr_wr wr;
1211        size_t orig_sg_length;
1212        struct mlx5_mtt *mtt;
1213        size_t final_size;
1214        struct ib_sge sg;
1215        int err = 0;
1216
1217        if (WARN_ON(mr->umem->is_odp))
1218                return -EINVAL;
1219
1220        mtt = mlx5_ib_create_xlt_wr(mr, &wr, &sg,
1221                                    ib_umem_num_dma_blocks(mr->umem,
1222                                                           1 << mr->page_shift),
1223                                    sizeof(*mtt), flags);
1224        if (!mtt)
1225                return -ENOMEM;
1226        orig_sg_length = sg.length;
1227
1228        cur_mtt = mtt;
1229        rdma_for_each_block (mr->umem->sgt_append.sgt.sgl, &biter,
1230                             mr->umem->sgt_append.sgt.nents,
1231                             BIT(mr->page_shift)) {
1232                if (cur_mtt == (void *)mtt + sg.length) {
1233                        dma_sync_single_for_device(ddev, sg.addr, sg.length,
1234                                                   DMA_TO_DEVICE);
1235                        err = mlx5_ib_post_send_wait(dev, &wr);
1236                        if (err)
1237                                goto err;
1238                        dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
1239                                                DMA_TO_DEVICE);
1240                        wr.offset += sg.length;
1241                        cur_mtt = mtt;
1242                }
1243
1244                cur_mtt->ptag =
1245                        cpu_to_be64(rdma_block_iter_dma_address(&biter) |
1246                                    MLX5_IB_MTT_PRESENT);
1247
1248                if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP))
1249                        cur_mtt->ptag = 0;
1250
1251                cur_mtt++;
1252        }
1253
1254        final_size = (void *)cur_mtt - (void *)mtt;
1255        sg.length = ALIGN(final_size, MLX5_UMR_MTT_ALIGNMENT);
1256        memset(cur_mtt, 0, sg.length - final_size);
1257        wr.wr.send_flags |= xlt_wr_final_send_flags(flags);
1258        wr.xlt_size = sg.length;
1259
1260        dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE);
1261        err = mlx5_ib_post_send_wait(dev, &wr);
1262
1263err:
1264        sg.length = orig_sg_length;
1265        mlx5_ib_unmap_free_xlt(dev, mtt, &sg);
1266        return err;
1267}
1268
1269/*
 1270 * reg_create() builds an MR with a direct CREATE_MKEY command (the non-UMR
 1271 * slow path), optionally populating the translation list in the same command.
1272 */
1273static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
1274                                     u64 iova, int access_flags,
1275                                     unsigned int page_size, bool populate)
1276{
1277        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1278        struct mlx5_ib_mr *mr;
1279        __be64 *pas;
1280        void *mkc;
1281        int inlen;
1282        u32 *in;
1283        int err;
1284        bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));
1285
1286        if (!page_size)
1287                return ERR_PTR(-EINVAL);
1288        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1289        if (!mr)
1290                return ERR_PTR(-ENOMEM);
1291
1292        mr->ibmr.pd = pd;
1293        mr->access_flags = access_flags;
1294        mr->page_shift = order_base_2(page_size);
1295
1296        inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1297        if (populate)
1298                inlen += sizeof(*pas) *
1299                         roundup(ib_umem_num_dma_blocks(umem, page_size), 2);
1300        in = kvzalloc(inlen, GFP_KERNEL);
1301        if (!in) {
1302                err = -ENOMEM;
1303                goto err_1;
1304        }
1305        pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
1306        if (populate) {
1307                if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) {
1308                        err = -EINVAL;
1309                        goto err_2;
1310                }
1311                mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas,
1312                                     pg_cap ? MLX5_IB_MTT_PRESENT : 0);
1313        }
1314
1315        /* The pg_access bit allows setting the access flags
1316         * in the page list submitted with the command. */
1317        MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));
1318
1319        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1320        set_mkc_access_pd_addr_fields(mkc, access_flags, iova,
1321                                      populate ? pd : dev->umrc.pd);
1322        MLX5_SET(mkc, mkc, free, !populate);
1323        MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
1324        MLX5_SET(mkc, mkc, umr_en, 1);
1325
1326        MLX5_SET64(mkc, mkc, len, umem->length);
1327        MLX5_SET(mkc, mkc, bsf_octword_size, 0);
1328        MLX5_SET(mkc, mkc, translations_octword_size,
1329                 get_octo_len(iova, umem->length, mr->page_shift));
1330        MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
1331        if (populate) {
1332                MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
1333                         get_octo_len(iova, umem->length, mr->page_shift));
1334        }
1335
1336        err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1337        if (err) {
1338                mlx5_ib_warn(dev, "create mkey failed\n");
1339                goto err_2;
1340        }
1341        mr->mmkey.type = MLX5_MKEY_MR;
1342        mr->umem = umem;
1343        set_mr_fields(dev, mr, umem->length, access_flags);
1344        kvfree(in);
1345
1346        mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);
1347
1348        return mr;
1349
1350err_2:
1351        kvfree(in);
1352err_1:
1353        kfree(mr);
1354        return ERR_PTR(err);
1355}
1356
1357static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr,
1358                                       u64 length, int acc, int mode)
1359{
1360        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1361        int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1362        struct mlx5_ib_mr *mr;
1363        void *mkc;
1364        u32 *in;
1365        int err;
1366
1367        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1368        if (!mr)
1369                return ERR_PTR(-ENOMEM);
1370
1371        in = kzalloc(inlen, GFP_KERNEL);
1372        if (!in) {
1373                err = -ENOMEM;
1374                goto err_free;
1375        }
1376
1377        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1378
1379        MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
1380        MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
1381        MLX5_SET64(mkc, mkc, len, length);
1382        set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd);
1383
1384        err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1385        if (err)
1386                goto err_in;
1387
1388        kfree(in);
1389
1390        set_mr_fields(dev, mr, length, acc);
1391
1392        return &mr->ibmr;
1393
1394err_in:
1395        kfree(in);
1396
1397err_free:
1398        kfree(mr);
1399
1400        return ERR_PTR(err);
1401}
1402
1403int mlx5_ib_advise_mr(struct ib_pd *pd,
1404                      enum ib_uverbs_advise_mr_advice advice,
1405                      u32 flags,
1406                      struct ib_sge *sg_list,
1407                      u32 num_sge,
1408                      struct uverbs_attr_bundle *attrs)
1409{
1410        if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
1411            advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
1412            advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
1413                return -EOPNOTSUPP;
1414
1415        return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
1416                                         sg_list, num_sge);
1417}
1418
1419struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
1420                                struct ib_dm_mr_attr *attr,
1421                                struct uverbs_attr_bundle *attrs)
1422{
1423        struct mlx5_ib_dm *mdm = to_mdm(dm);
1424        struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev;
1425        u64 start_addr = mdm->dev_addr + attr->offset;
1426        int mode;
1427
1428        switch (mdm->type) {
1429        case MLX5_IB_UAPI_DM_TYPE_MEMIC:
1430                if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS)
1431                        return ERR_PTR(-EINVAL);
1432
1433                mode = MLX5_MKC_ACCESS_MODE_MEMIC;
1434                start_addr -= pci_resource_start(dev->pdev, 0);
1435                break;
1436        case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
1437        case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
1438                if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
1439                        return ERR_PTR(-EINVAL);
1440
1441                mode = MLX5_MKC_ACCESS_MODE_SW_ICM;
1442                break;
1443        default:
1444                return ERR_PTR(-EINVAL);
1445        }
1446
1447        return mlx5_ib_get_dm_mr(pd, start_addr, attr->length,
1448                                 attr->access_flags, mode);
1449}
1450
1451static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
1452                                    u64 iova, int access_flags)
1453{
1454        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1455        struct mlx5_ib_mr *mr = NULL;
1456        bool xlt_with_umr;
1457        int err;
1458
1459        xlt_with_umr = mlx5_ib_can_load_pas_with_umr(dev, umem->length);
1460        if (xlt_with_umr) {
1461                mr = alloc_cacheable_mr(pd, umem, iova, access_flags);
1462        } else {
1463                unsigned int page_size = mlx5_umem_find_best_pgsz(
1464                        umem, mkc, log_page_size, 0, iova);
1465
1466                mutex_lock(&dev->slow_path_mutex);
1467                mr = reg_create(pd, umem, iova, access_flags, page_size, true);
1468                mutex_unlock(&dev->slow_path_mutex);
1469        }
1470        if (IS_ERR(mr)) {
1471                ib_umem_release(umem);
1472                return ERR_CAST(mr);
1473        }
1474
1475        mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1476
1477        atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);
1478
1479        if (xlt_with_umr) {
1480                /*
1481                 * If the MR was created with reg_create then it will be
1482                 * configured properly but left disabled. It is safe to go ahead
1483                 * and configure it again via UMR while enabling it.
1484                 */
1485                err = mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE);
1486                if (err) {
1487                        mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1488                        return ERR_PTR(err);
1489                }
1490        }
1491        return &mr->ibmr;
1492}
1493
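/*
 * Register an ODP (on-demand paging) user MR. A zero start with a length
 * of U64_MAX requests an implicit MR covering the whole address space;
 * otherwise an explicit ODP umem is created, the mkey is registered for
 * page-fault lookup and mlx5_ib_init_odp_mr() finishes the setup.
 */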
1494static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
1495                                        u64 iova, int access_flags,
1496                                        struct ib_udata *udata)
1497{
1498        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1499        struct ib_umem_odp *odp;
1500        struct mlx5_ib_mr *mr;
1501        int err;
1502
1503        if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
1504                return ERR_PTR(-EOPNOTSUPP);
1505
1506        err = mlx5r_odp_create_eq(dev, &dev->odp_pf_eq);
1507        if (err)
1508                return ERR_PTR(err);
1509        if (!start && length == U64_MAX) {
1510                if (iova != 0)
1511                        return ERR_PTR(-EINVAL);
1512                if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
1513                        return ERR_PTR(-EINVAL);
1514
1515                mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags);
1516                if (IS_ERR(mr))
1517                        return ERR_CAST(mr);
1518                return &mr->ibmr;
1519        }
1520
1521        /* ODP requires xlt update via umr to work. */
1522        if (!mlx5_ib_can_load_pas_with_umr(dev, length))
1523                return ERR_PTR(-EINVAL);
1524
1525        odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags,
1526                              &mlx5_mn_ops);
1527        if (IS_ERR(odp))
1528                return ERR_CAST(odp);
1529
1530        mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags);
1531        if (IS_ERR(mr)) {
1532                ib_umem_release(&odp->umem);
1533                return ERR_CAST(mr);
1534        }
1535        xa_init(&mr->implicit_children);
1536
1537        odp->private = mr;
1538        err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
1539        if (err)
1540                goto err_dereg_mr;
1541
1542        err = mlx5_ib_init_odp_mr(mr);
1543        if (err)
1544                goto err_dereg_mr;
1545        return &mr->ibmr;
1546
1547err_dereg_mr:
1548        mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1549        return ERR_PTR(err);
1550}
1551
1552struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
1553                                  u64 iova, int access_flags,
1554                                  struct ib_udata *udata)
1555{
1556        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1557        struct ib_umem *umem;
1558
1559        if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
1560                return ERR_PTR(-EOPNOTSUPP);
1561
1562        mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
1563                    start, iova, length, access_flags);
1564
1565        if (access_flags & IB_ACCESS_ON_DEMAND)
1566                return create_user_odp_mr(pd, start, length, iova, access_flags,
1567                                          udata);
1568        umem = ib_umem_get(&dev->ib_dev, start, length, access_flags);
1569        if (IS_ERR(umem))
1570                return ERR_CAST(umem);
1571        return create_real_mr(pd, umem, iova, access_flags);
1572}
1573
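/*
 * dma-buf move_notify callback, invoked by the exporter with the dma-resv
 * lock held when the buffer is about to be moved. Zap the MR's translation
 * entries via UMR and unmap the current page list so the buffer can be
 * re-mapped later.
 */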
1574static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
1575{
1576        struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
1577        struct mlx5_ib_mr *mr = umem_dmabuf->private;
1578
1579        dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
1580
1581        if (!umem_dmabuf->sgt)
1582                return;
1583
1584        mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP);
1585        ib_umem_dmabuf_unmap_pages(umem_dmabuf);
1586}
1587
1588static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
1589        .allow_peer2peer = 1,
1590        .move_notify = mlx5_ib_dmabuf_invalidate_cb,
1591};
1592
1593struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
1594                                         u64 length, u64 virt_addr,
1595                                         int fd, int access_flags,
1596                                         struct ib_udata *udata)
1597{
1598        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1599        struct mlx5_ib_mr *mr = NULL;
1600        struct ib_umem_dmabuf *umem_dmabuf;
1601        int err;
1602
1603        if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
1604            !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
1605                return ERR_PTR(-EOPNOTSUPP);
1606
1607        mlx5_ib_dbg(dev,
1608                    "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n",
1609                    offset, virt_addr, length, fd, access_flags);
1610
1611        /* dmabuf requires xlt update via umr to work. */
1612        if (!mlx5_ib_can_load_pas_with_umr(dev, length))
1613                return ERR_PTR(-EINVAL);
1614
1615        umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd,
1616                                         access_flags,
1617                                         &mlx5_ib_dmabuf_attach_ops);
1618        if (IS_ERR(umem_dmabuf)) {
1619                mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n",
1620                            PTR_ERR(umem_dmabuf));
1621                return ERR_CAST(umem_dmabuf);
1622        }
1623
1624        mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
1625                                access_flags);
1626        if (IS_ERR(mr)) {
1627                ib_umem_release(&umem_dmabuf->umem);
1628                return ERR_CAST(mr);
1629        }
1630
1631        mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1632
1633        atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
1634        umem_dmabuf->private = mr;
1635        err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
1636        if (err)
1637                goto err_dereg_mr;
1638
1639        err = mlx5_ib_init_dmabuf_mr(mr);
1640        if (err)
1641                goto err_dereg_mr;
1642        return &mr->ibmr;
1643
1644err_dereg_mr:
1645        mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1646        return ERR_PTR(err);
1647}
1648
1649/**
1650 * revoke_mr - Fence all DMA on the MR
1651 * @mr: The MR to fence
1652 *
1653 * Upon return the NIC will not be doing any DMA to the pages under the MR,
1654 * and any DMA in progress will be completed. Failure of this function
1655 * indicates the HW has failed catastrophically.
1656 */
1657static int revoke_mr(struct mlx5_ib_mr *mr)
1658{
1659        struct mlx5_umr_wr umrwr = {};
1660
1661        if (mr_to_mdev(mr)->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
1662                return 0;
1663
1664        umrwr.wr.send_flags = MLX5_IB_SEND_UMR_DISABLE_MR |
1665                              MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
1666        umrwr.wr.opcode = MLX5_IB_WR_UMR;
1667        umrwr.pd = mr_to_mdev(mr)->umrc.pd;
1668        umrwr.mkey = mr->mmkey.key;
1669        umrwr.ignore_free_state = 1;
1670
1671        return mlx5_ib_post_send_wait(mr_to_mdev(mr), &umrwr);
1672}
1673
1674/*
1675 * True if the change in access flags can be done via UMR; only some access
1676 * flags can be updated.
1677 */
1678static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev,
1679                                     unsigned int current_access_flags,
1680                                     unsigned int target_access_flags)
1681{
1682        unsigned int diffs = current_access_flags ^ target_access_flags;
1683
1684        if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
1685                      IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING))
1686                return false;
1687        return mlx5_ib_can_reconfig_with_umr(dev, current_access_flags,
1688                                             target_access_flags);
1689}
1690
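/*
 * Update only the PD and access flags of an existing mkey with a UMR work
 * request; the translation entries are left untouched.
 */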
1691static int umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd,
1692                               int access_flags)
1693{
1694        struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1695        struct mlx5_umr_wr umrwr = {
1696                .wr = {
1697                        .send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE |
1698                                      MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS,
1699                        .opcode = MLX5_IB_WR_UMR,
1700                },
1701                .mkey = mr->mmkey.key,
1702                .pd = pd,
1703                .access_flags = access_flags,
1704        };
1705        int err;
1706
1707        err = mlx5_ib_post_send_wait(dev, &umrwr);
1708        if (err)
1709                return err;
1710
1711        mr->access_flags = access_flags;
1712        mr->mmkey.pd = to_mpd(pd)->pdn;
1713        return 0;
1714}
1715
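/*
 * True if the PAS of the existing cache mkey can be reloaded with UMR for
 * the new umem: the MR must come from the cache, UMR must be able to load
 * the new length, and the new umem must fit within the translation entries
 * allocated for the cache entry's order.
 */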
1716static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
1717                                  struct ib_umem *new_umem,
1718                                  int new_access_flags, u64 iova,
1719                                  unsigned long *page_size)
1720{
1721        struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1722
1723        /* We only track the allocated sizes of MRs from the cache */
1724        if (!mr->cache_ent)
1725                return false;
1726        if (!mlx5_ib_can_load_pas_with_umr(dev, new_umem->length))
1727                return false;
1728
1729        *page_size =
1730                mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova);
1731        if (WARN_ON(!*page_size))
1732                return false;
1733        return (1ULL << mr->cache_ent->order) >=
1734               ib_umem_num_dma_blocks(new_umem, *page_size);
1735}
1736
1737static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd,
1738                         int access_flags, int flags, struct ib_umem *new_umem,
1739                         u64 iova, unsigned long page_size)
1740{
1741        struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1742        int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE;
1743        struct ib_umem *old_umem = mr->umem;
1744        int err;
1745
1746        /*
1747         * To keep everything simple the MR is revoked before we start to mess
1748         * with it. This ensures the change is atomic relative to any use of the
1749         * MR.
1750         */
1751        err = revoke_mr(mr);
1752        if (err)
1753                return err;
1754
1755        if (flags & IB_MR_REREG_PD) {
1756                mr->ibmr.pd = pd;
1757                mr->mmkey.pd = to_mpd(pd)->pdn;
1758                upd_flags |= MLX5_IB_UPD_XLT_PD;
1759        }
1760        if (flags & IB_MR_REREG_ACCESS) {
1761                mr->access_flags = access_flags;
1762                upd_flags |= MLX5_IB_UPD_XLT_ACCESS;
1763        }
1764
1765        mr->ibmr.length = new_umem->length;
1766        mr->mmkey.iova = iova;
1767        mr->mmkey.size = new_umem->length;
1768        mr->page_shift = order_base_2(page_size);
1769        mr->umem = new_umem;
1770        err = mlx5_ib_update_mr_pas(mr, upd_flags);
1771        if (err) {
1772                /*
1773                 * The MR is revoked at this point, so it is safe to free
1774                 * new_umem.
1775                 */
1776                mr->umem = old_umem;
1777                return err;
1778        }
1779
1780        atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages);
1781        ib_umem_release(old_umem);
1782        atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages);
1783        return 0;
1784}
1785
1786struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
1787                                    u64 length, u64 iova, int new_access_flags,
1788                                    struct ib_pd *new_pd,
1789                                    struct ib_udata *udata)
1790{
1791        struct mlx5_ib_dev *dev = to_mdev(ib_mr->device);
1792        struct mlx5_ib_mr *mr = to_mmr(ib_mr);
1793        int err;
1794
1795        if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
1796                return ERR_PTR(-EOPNOTSUPP);
1797
1798        mlx5_ib_dbg(
1799                dev,
1800                "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
1801                start, iova, length, new_access_flags);
1802
1803        if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS))
1804                return ERR_PTR(-EOPNOTSUPP);
1805
1806        if (!(flags & IB_MR_REREG_ACCESS))
1807                new_access_flags = mr->access_flags;
1808        if (!(flags & IB_MR_REREG_PD))
1809                new_pd = ib_mr->pd;
1810
1811        if (!(flags & IB_MR_REREG_TRANS)) {
1812                struct ib_umem *umem;
1813
1814                /* Fast path for PD/access change */
1815                if (can_use_umr_rereg_access(dev, mr->access_flags,
1816                                             new_access_flags)) {
1817                        err = umr_rereg_pd_access(mr, new_pd, new_access_flags);
1818                        if (err)
1819                                return ERR_PTR(err);
1820                        return NULL;
1821                }
1822                /* DM, ODP or dmabuf MRs don't have a normal umem so we can't re-use it */
1823                if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
1824                        goto recreate;
1825
1826                /*
1827                 * Only one active MR can refer to a umem at one time; revoke
1828                 * the old MR before assigning the umem to the new one.
1829                 */
1830                err = revoke_mr(mr);
1831                if (err)
1832                        return ERR_PTR(err);
1833                umem = mr->umem;
1834                mr->umem = NULL;
1835                atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);
1836
1837                return create_real_mr(new_pd, umem, mr->mmkey.iova,
1838                                      new_access_flags);
1839        }
1840
1841        /*
1842         * DM doesn't have a PAS list, so we can't re-use it; ODP/dmabuf do,
1843         * but the logic around releasing the umem is different.
1844         */
1845        if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
1846                goto recreate;
1847
1848        if (!(new_access_flags & IB_ACCESS_ON_DEMAND) &&
1849            can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) {
1850                struct ib_umem *new_umem;
1851                unsigned long page_size;
1852
1853                new_umem = ib_umem_get(&dev->ib_dev, start, length,
1854                                       new_access_flags);
1855                if (IS_ERR(new_umem))
1856                        return ERR_CAST(new_umem);
1857
1858                /* Fast path for PAS change */
1859                if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova,
1860                                          &page_size)) {
1861                        err = umr_rereg_pas(mr, new_pd, new_access_flags, flags,
1862                                            new_umem, iova, page_size);
1863                        if (err) {
1864                                ib_umem_release(new_umem);
1865                                return ERR_PTR(err);
1866                        }
1867                        return NULL;
1868                }
1869                return create_real_mr(new_pd, new_umem, iova, new_access_flags);
1870        }
1871
1872        /*
1873         * Everything else has no state we can preserve; just create a new MR
1874         * from scratch.
1875         */
1876recreate:
1877        return mlx5_ib_reg_user_mr(new_pd, start, length, iova,
1878                                   new_access_flags, udata);
1879}
1880
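/*
 * Allocate and DMA-map the private descriptor buffer (MTTs or KLMs) used
 * by kernel MRs. The allocation is padded so that the descriptor array can
 * be aligned to MLX5_UMR_ALIGN.
 */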
1881static int
1882mlx5_alloc_priv_descs(struct ib_device *device,
1883                      struct mlx5_ib_mr *mr,
1884                      int ndescs,
1885                      int desc_size)
1886{
1887        struct mlx5_ib_dev *dev = to_mdev(device);
1888        struct device *ddev = &dev->mdev->pdev->dev;
1889        int size = ndescs * desc_size;
1890        int add_size;
1891        int ret;
1892
1893        add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0);
1894
1895        mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
1896        if (!mr->descs_alloc)
1897                return -ENOMEM;
1898
1899        mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);
1900
1901        mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE);
1902        if (dma_mapping_error(ddev, mr->desc_map)) {
1903                ret = -ENOMEM;
1904                goto err;
1905        }
1906
1907        return 0;
1908err:
1909        kfree(mr->descs_alloc);
1910
1911        return ret;
1912}
1913
1914static void
1915mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
1916{
1917        if (!mr->umem && mr->descs) {
1918                struct ib_device *device = mr->ibmr.device;
1919                int size = mr->max_descs * mr->desc_size;
1920                struct mlx5_ib_dev *dev = to_mdev(device);
1921
1922                dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size,
1923                                 DMA_TO_DEVICE);
1924                kfree(mr->descs_alloc);
1925                mr->descs = NULL;
1926        }
1927}
1928
1929int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
1930{
1931        struct mlx5_ib_mr *mr = to_mmr(ibmr);
1932        struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
1933        int rc;
1934
1935        /*
1936         * Any async use of the mr must hold the refcount, once the refcount
1937         * goes to zero no other thread, such as ODP page faults, prefetch, any
1938         * UMR activity, etc can touch the mkey. Thus it is safe to destroy it.
1939         */
1940        if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
1941            refcount_read(&mr->mmkey.usecount) != 0 &&
1942            xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)))
1943                mlx5r_deref_wait_odp_mkey(&mr->mmkey);
1944
1945        if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
1946                xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
1947                           mr->sig, NULL, GFP_KERNEL);
1948
1949                if (mr->mtt_mr) {
1950                        rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
1951                        if (rc)
1952                                return rc;
1953                        mr->mtt_mr = NULL;
1954                }
1955                if (mr->klm_mr) {
1956                        rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
1957                        if (rc)
1958                                return rc;
1959                        mr->klm_mr = NULL;
1960                }
1961
1962                if (mlx5_core_destroy_psv(dev->mdev,
1963                                          mr->sig->psv_memory.psv_idx))
1964                        mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
1965                                     mr->sig->psv_memory.psv_idx);
1966                if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
1967                        mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
1968                                     mr->sig->psv_wire.psv_idx);
1969                kfree(mr->sig);
1970                mr->sig = NULL;
1971        }
1972
1973        /* Stop DMA */
1974        if (mr->cache_ent) {
1975                if (revoke_mr(mr)) {
1976                        spin_lock_irq(&mr->cache_ent->lock);
1977                        mr->cache_ent->total_mrs--;
1978                        spin_unlock_irq(&mr->cache_ent->lock);
1979                        mr->cache_ent = NULL;
1980                }
1981        }
1982        if (!mr->cache_ent) {
1983                rc = destroy_mkey(to_mdev(mr->ibmr.device), mr);
1984                if (rc)
1985                        return rc;
1986        }
1987
1988        if (mr->umem) {
1989                bool is_odp = is_odp_mr(mr);
1990
1991                if (!is_odp)
1992                        atomic_sub(ib_umem_num_pages(mr->umem),
1993                                   &dev->mdev->priv.reg_pages);
1994                ib_umem_release(mr->umem);
1995                if (is_odp)
1996                        mlx5_ib_free_odp_mr(mr);
1997        }
1998
1999        if (mr->cache_ent) {
2000                mlx5_mr_cache_free(dev, mr);
2001        } else {
2002                mlx5_free_priv_descs(mr);
2003                kfree(mr);
2004        }
2005        return 0;
2006}
2007
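/*
 * Fill the mkey context for a kernel MR that is created in the free state
 * and enabled later through UMR: translation size, access mode and log
 * page size are set here; the descriptor buffer is filled separately by
 * the map_mr_sg path.
 */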
2008static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
2009                                   int access_mode, int page_shift)
2010{
2011        void *mkc;
2012
2013        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2014
2015        /* This is only used from the kernel, so setting the PD is OK. */
2016        set_mkc_access_pd_addr_fields(mkc, IB_ACCESS_RELAXED_ORDERING, 0, pd);
2017        MLX5_SET(mkc, mkc, free, 1);
2018        MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
2019        MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
2020        MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
2021        MLX5_SET(mkc, mkc, umr_en, 1);
2022        MLX5_SET(mkc, mkc, log_page_size, page_shift);
2023}
2024
2025static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2026                                  int ndescs, int desc_size, int page_shift,
2027                                  int access_mode, u32 *in, int inlen)
2028{
2029        struct mlx5_ib_dev *dev = to_mdev(pd->device);
2030        int err;
2031
2032        mr->access_mode = access_mode;
2033        mr->desc_size = desc_size;
2034        mr->max_descs = ndescs;
2035
2036        err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size);
2037        if (err)
2038                return err;
2039
2040        mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);
2041
2042        err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
2043        if (err)
2044                goto err_free_descs;
2045
2046        mr->mmkey.type = MLX5_MKEY_MR;
2047        mr->ibmr.lkey = mr->mmkey.key;
2048        mr->ibmr.rkey = mr->mmkey.key;
2049
2050        return 0;
2051
2052err_free_descs:
2053        mlx5_free_priv_descs(mr);
2054        return err;
2055}
2056
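/*
 * Allocate an internal MR (MTT or KLM based) used as part of an integrity
 * MR to map the data and protection-information scatterlists.
 */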
2057static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
2058                                u32 max_num_sg, u32 max_num_meta_sg,
2059                                int desc_size, int access_mode)
2060{
2061        int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2062        int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
2063        int page_shift = 0;
2064        struct mlx5_ib_mr *mr;
2065        u32 *in;
2066        int err;
2067
2068        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
2069        if (!mr)
2070                return ERR_PTR(-ENOMEM);
2071
2072        mr->ibmr.pd = pd;
2073        mr->ibmr.device = pd->device;
2074
2075        in = kzalloc(inlen, GFP_KERNEL);
2076        if (!in) {
2077                err = -ENOMEM;
2078                goto err_free;
2079        }
2080
2081        if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
2082                page_shift = PAGE_SHIFT;
2083
2084        err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift,
2085                                     access_mode, in, inlen);
2086        if (err)
2087                goto err_free_in;
2088
2089        mr->umem = NULL;
2090        kfree(in);
2091
2092        return mr;
2093
2094err_free_in:
2095        kfree(in);
2096err_free:
2097        kfree(mr);
2098        return ERR_PTR(err);
2099}
2100
2101static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2102                                    int ndescs, u32 *in, int inlen)
2103{
2104        return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt),
2105                                      PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in,
2106                                      inlen);
2107}
2108
2109static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2110                                    int ndescs, u32 *in, int inlen)
2111{
2112        return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm),
2113                                      0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
2114}
2115
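/*
 * Build the resources behind an IB_MR_TYPE_INTEGRITY MR: memory and wire
 * PSVs, the internal KLM and MTT MRs used to map data/metadata, and the
 * BSF-enabled mkey itself, which is then tracked in the sig_mrs xarray.
 */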
2116static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2117                                      int max_num_sg, int max_num_meta_sg,
2118                                      u32 *in, int inlen)
2119{
2120        struct mlx5_ib_dev *dev = to_mdev(pd->device);
2121        u32 psv_index[2];
2122        void *mkc;
2123        int err;
2124
2125        mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
2126        if (!mr->sig)
2127                return -ENOMEM;
2128
2129        /* create mem & wire PSVs */
2130        err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
2131        if (err)
2132                goto err_free_sig;
2133
2134        mr->sig->psv_memory.psv_idx = psv_index[0];
2135        mr->sig->psv_wire.psv_idx = psv_index[1];
2136
2137        mr->sig->sig_status_checked = true;
2138        mr->sig->sig_err_exists = false;
2139        /* Next UMR, Arm SIGERR */
2140        ++mr->sig->sigerr_count;
2141        mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
2142                                         sizeof(struct mlx5_klm),
2143                                         MLX5_MKC_ACCESS_MODE_KLMS);
2144        if (IS_ERR(mr->klm_mr)) {
2145                err = PTR_ERR(mr->klm_mr);
2146                goto err_destroy_psv;
2147        }
2148        mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
2149                                         sizeof(struct mlx5_mtt),
2150                                         MLX5_MKC_ACCESS_MODE_MTT);
2151        if (IS_ERR(mr->mtt_mr)) {
2152                err = PTR_ERR(mr->mtt_mr);
2153                goto err_free_klm_mr;
2154        }
2155
2156        /* Set bsf descriptors for mkey */
2157        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2158        MLX5_SET(mkc, mkc, bsf_en, 1);
2159        MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);
2160
2161        err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
2162                                     MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
2163        if (err)
2164                goto err_free_mtt_mr;
2165
2166        err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
2167                              mr->sig, GFP_KERNEL));
2168        if (err)
2169                goto err_free_descs;
2170        return 0;
2171
2172err_free_descs:
2173        destroy_mkey(dev, mr);
2174        mlx5_free_priv_descs(mr);
2175err_free_mtt_mr:
2176        mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
2177        mr->mtt_mr = NULL;
2178err_free_klm_mr:
2179        mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
2180        mr->klm_mr = NULL;
2181err_destroy_psv:
2182        if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
2183                mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
2184                             mr->sig->psv_memory.psv_idx);
2185        if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
2186                mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
2187                             mr->sig->psv_wire.psv_idx);
2188err_free_sig:
2189        kfree(mr->sig);
2190
2191        return err;
2192}
2193
2194static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
2195                                        enum ib_mr_type mr_type, u32 max_num_sg,
2196                                        u32 max_num_meta_sg)
2197{
2198        struct mlx5_ib_dev *dev = to_mdev(pd->device);
2199        int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2200        int ndescs = ALIGN(max_num_sg, 4);
2201        struct mlx5_ib_mr *mr;
2202        u32 *in;
2203        int err;
2204
2205        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
2206        if (!mr)
2207                return ERR_PTR(-ENOMEM);
2208
2209        in = kzalloc(inlen, GFP_KERNEL);
2210        if (!in) {
2211                err = -ENOMEM;
2212                goto err_free;
2213        }
2214
2215        mr->ibmr.device = pd->device;
2216        mr->umem = NULL;
2217
2218        switch (mr_type) {
2219        case IB_MR_TYPE_MEM_REG:
2220                err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
2221                break;
2222        case IB_MR_TYPE_SG_GAPS:
2223                err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
2224                break;
2225        case IB_MR_TYPE_INTEGRITY:
2226                err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
2227                                                 max_num_meta_sg, in, inlen);
2228                break;
2229        default:
2230                mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
2231                err = -EINVAL;
2232        }
2233
2234        if (err)
2235                goto err_free_in;
2236
2237        kfree(in);
2238
2239        return &mr->ibmr;
2240
2241err_free_in:
2242        kfree(in);
2243err_free:
2244        kfree(mr);
2245        return ERR_PTR(err);
2246}
2247
2248struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
2249                               u32 max_num_sg)
2250{
2251        return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
2252}
2253
2254struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
2255                                         u32 max_num_sg, u32 max_num_meta_sg)
2256{
2257        return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
2258                                  max_num_meta_sg);
2259}
2260
2261int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
2262{
2263        struct mlx5_ib_dev *dev = to_mdev(ibmw->device);
2264        int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2265        struct mlx5_ib_mw *mw = to_mmw(ibmw);
2266        u32 *in = NULL;
2267        void *mkc;
2268        int ndescs;
2269        int err;
2270        struct mlx5_ib_alloc_mw req = {};
2271        struct {
2272                __u32   comp_mask;
2273                __u32   response_length;
2274        } resp = {};
2275
2276        err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
2277        if (err)
2278                return err;
2279
2280        if (req.comp_mask || req.reserved1 || req.reserved2)
2281                return -EOPNOTSUPP;
2282
2283        if (udata->inlen > sizeof(req) &&
2284            !ib_is_udata_cleared(udata, sizeof(req),
2285                                 udata->inlen - sizeof(req)))
2286                return -EOPNOTSUPP;
2287
2288        ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);
2289
2290        in = kzalloc(inlen, GFP_KERNEL);
2291        if (!in) {
2292                err = -ENOMEM;
2293                goto free;
2294        }
2295
2296        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2297
2298        MLX5_SET(mkc, mkc, free, 1);
2299        MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
2300        MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn);
2301        MLX5_SET(mkc, mkc, umr_en, 1);
2302        MLX5_SET(mkc, mkc, lr, 1);
2303        MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
2304        MLX5_SET(mkc, mkc, en_rinval, !!((ibmw->type == IB_MW_TYPE_2)));
2305        MLX5_SET(mkc, mkc, qpn, 0xffffff);
2306
2307        err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen);
2308        if (err)
2309                goto free;
2310
2311        mw->mmkey.type = MLX5_MKEY_MW;
2312        ibmw->rkey = mw->mmkey.key;
2313        mw->ndescs = ndescs;
2314
2315        resp.response_length =
2316                min(offsetofend(typeof(resp), response_length), udata->outlen);
2317        if (resp.response_length) {
2318                err = ib_copy_to_udata(udata, &resp, resp.response_length);
2319                if (err)
2320                        goto free_mkey;
2321        }
2322
2323        if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
2324                err = mlx5r_store_odp_mkey(dev, &mw->mmkey);
2325                if (err)
2326                        goto free_mkey;
2327        }
2328
2329        kfree(in);
2330        return 0;
2331
2332free_mkey:
2333        mlx5_core_destroy_mkey(dev->mdev, &mw->mmkey);
2334free:
2335        kfree(in);
2336        return err;
2337}
2338
2339int mlx5_ib_dealloc_mw(struct ib_mw *mw)
2340{
2341        struct mlx5_ib_dev *dev = to_mdev(mw->device);
2342        struct mlx5_ib_mw *mmw = to_mmw(mw);
2343
2344        if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
2345            xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key)))
2346                /*
2347                 * pagefault_single_data_segment() may be accessing mmw
2348                 * if the user bound an ODP MR to this MW.
2349                 */
2350                mlx5r_deref_wait_odp_mkey(&mmw->mmkey);
2351
2352        return mlx5_core_destroy_mkey(dev->mdev, &mmw->mmkey);
2353}
2354
2355int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
2356                            struct ib_mr_status *mr_status)
2357{
2358        struct mlx5_ib_mr *mmr = to_mmr(ibmr);
2359        int ret = 0;
2360
2361        if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
2362                pr_err("Invalid status check mask\n");
2363                ret = -EINVAL;
2364                goto done;
2365        }
2366
2367        mr_status->fail_status = 0;
2368        if (check_mask & IB_MR_CHECK_SIG_STATUS) {
2369                if (!mmr->sig) {
2370                        ret = -EINVAL;
2371                        pr_err("signature status check requested on a non-signature enabled MR\n");
2372                        goto done;
2373                }
2374
2375                mmr->sig->sig_status_checked = true;
2376                if (!mmr->sig->sig_err_exists)
2377                        goto done;
2378
2379                if (ibmr->lkey == mmr->sig->err_item.key)
2380                        memcpy(&mr_status->sig_err, &mmr->sig->err_item,
2381                               sizeof(mr_status->sig_err));
2382                else {
2383                        mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
2384                        mr_status->sig_err.sig_err_offset = 0;
2385                        mr_status->sig_err.key = mmr->sig->err_item.key;
2386                }
2387
2388                mmr->sig->sig_err_exists = false;
2389                mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
2390        }
2391
2392done:
2393        return ret;
2394}
2395
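/*
 * Try to map the data and metadata scatterlists directly with physical
 * addresses (no descriptors at all). This only works when each list has a
 * single entry; otherwise fewer than the requested number of entries is
 * returned and the caller falls back to MTT/KLM mapping.
 */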
2396static int
2397mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2398                        int data_sg_nents, unsigned int *data_sg_offset,
2399                        struct scatterlist *meta_sg, int meta_sg_nents,
2400                        unsigned int *meta_sg_offset)
2401{
2402        struct mlx5_ib_mr *mr = to_mmr(ibmr);
2403        unsigned int sg_offset = 0;
2404        int n = 0;
2405
2406        mr->meta_length = 0;
2407        if (data_sg_nents == 1) {
2408                n++;
2409                mr->ndescs = 1;
2410                if (data_sg_offset)
2411                        sg_offset = *data_sg_offset;
2412                mr->data_length = sg_dma_len(data_sg) - sg_offset;
2413                mr->data_iova = sg_dma_address(data_sg) + sg_offset;
2414                if (meta_sg_nents == 1) {
2415                        n++;
2416                        mr->meta_ndescs = 1;
2417                        if (meta_sg_offset)
2418                                sg_offset = *meta_sg_offset;
2419                        else
2420                                sg_offset = 0;
2421                        mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
2422                        mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
2423                }
2424                ibmr->length = mr->data_length + mr->meta_length;
2425        }
2426
2427        return n;
2428}
2429
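/*
 * Translate the data (and optionally metadata) scatterlists into KLM
 * descriptors. Each sg entry becomes one KLM keyed by the PD's
 * local_dma_lkey, so arbitrary lengths are supported without page
 * alignment constraints.
 */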
2430static int
2431mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
2432                   struct scatterlist *sgl,
2433                   unsigned short sg_nents,
2434                   unsigned int *sg_offset_p,
2435                   struct scatterlist *meta_sgl,
2436                   unsigned short meta_sg_nents,
2437                   unsigned int *meta_sg_offset_p)
2438{
2439        struct scatterlist *sg = sgl;
2440        struct mlx5_klm *klms = mr->descs;
2441        unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
2442        u32 lkey = mr->ibmr.pd->local_dma_lkey;
2443        int i, j = 0;
2444
2445        mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
2446        mr->ibmr.length = 0;
2447
2448        for_each_sg(sgl, sg, sg_nents, i) {
2449                if (unlikely(i >= mr->max_descs))
2450                        break;
2451                klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
2452                klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
2453                klms[i].key = cpu_to_be32(lkey);
2454                mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2455
2456                sg_offset = 0;
2457        }
2458
2459        if (sg_offset_p)
2460                *sg_offset_p = sg_offset;
2461
2462        mr->ndescs = i;
2463        mr->data_length = mr->ibmr.length;
2464
2465        if (meta_sg_nents) {
2466                sg = meta_sgl;
2467                sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
2468                for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
2469                        if (unlikely(i + j >= mr->max_descs))
2470                                break;
2471                        klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
2472                                                     sg_offset);
2473                        klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
2474                                                         sg_offset);
2475                        klms[i + j].key = cpu_to_be32(lkey);
2476                        mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2477
2478                        sg_offset = 0;
2479                }
2480                if (meta_sg_offset_p)
2481                        *meta_sg_offset_p = sg_offset;
2482
2483                mr->meta_ndescs = j;
2484                mr->meta_length = mr->ibmr.length - mr->data_length;
2485        }
2486
2487        return i + j;
2488}
2489
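/*
 * ib_sg_to_pages() callback: store one page address as an MTT entry with
 * the read/write enable bits set. mlx5_set_page_pi() below does the same
 * for metadata pages, appending them after the data descriptors.
 */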
2490static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
2491{
2492        struct mlx5_ib_mr *mr = to_mmr(ibmr);
2493        __be64 *descs;
2494
2495        if (unlikely(mr->ndescs == mr->max_descs))
2496                return -ENOMEM;
2497
2498        descs = mr->descs;
2499        descs[mr->ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2500
2501        return 0;
2502}
2503
2504static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
2505{
2506        struct mlx5_ib_mr *mr = to_mmr(ibmr);
2507        __be64 *descs;
2508
2509        if (unlikely(mr->ndescs + mr->meta_ndescs == mr->max_descs))
2510                return -ENOMEM;
2511
2512        descs = mr->descs;
2513        descs[mr->ndescs + mr->meta_ndescs++] =
2514                cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2515
2516        return 0;
2517}
2518
2519static int
2520mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2521                         int data_sg_nents, unsigned int *data_sg_offset,
2522                         struct scatterlist *meta_sg, int meta_sg_nents,
2523                         unsigned int *meta_sg_offset)
2524{
2525        struct mlx5_ib_mr *mr = to_mmr(ibmr);
2526        struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
2527        int n;
2528
2529        pi_mr->ndescs = 0;
2530        pi_mr->meta_ndescs = 0;
2531        pi_mr->meta_length = 0;
2532
2533        ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2534                                   pi_mr->desc_size * pi_mr->max_descs,
2535                                   DMA_TO_DEVICE);
2536
2537        pi_mr->ibmr.page_size = ibmr->page_size;
2538        n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
2539                           mlx5_set_page);
2540        if (n != data_sg_nents)
2541                return n;
2542
2543        pi_mr->data_iova = pi_mr->ibmr.iova;
2544        pi_mr->data_length = pi_mr->ibmr.length;
2545        pi_mr->ibmr.length = pi_mr->data_length;
2546        ibmr->length = pi_mr->data_length;
2547
2548        if (meta_sg_nents) {
2549                u64 page_mask = ~((u64)ibmr->page_size - 1);
2550                u64 iova = pi_mr->data_iova;
2551
2552                n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
2553                                    meta_sg_offset, mlx5_set_page_pi);
2554
2555                pi_mr->meta_length = pi_mr->ibmr.length;
2556                /*
2557                 * PI address for the HW is the offset of the metadata address
2558                 * relative to the first data page address.
2559                 * It equals the first data page address + the size of the data
2560                 * pages + the metadata offset within the first metadata page.
2561                 */
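                /*
                 * Worked example with hypothetical values: page_size = 4096,
                 * data iova = 0x10000200 (so iova & page_mask = 0x10000000),
                 * ndescs = 3 data pages and a metadata offset of 0x80 within
                 * its first page give pi_iova = 0x10000000 + 3 * 4096 + 0x80
                 * = 0x10003080.
                 */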
2562                pi_mr->pi_iova = (iova & page_mask) +
2563                                 pi_mr->ndescs * ibmr->page_size +
2564                                 (pi_mr->ibmr.iova & ~page_mask);
2565                /*
2566                 * In order to use one MTT MR for data and metadata, we also register
2567                 * the gaps between the end of the data and the start of the metadata
2568                 * (the sig MR will verify that the HW accesses the right addresses).
2569                 * This mapping is safe because we use an internal mkey for the
2570                 * registration.
2571                 */
2572                pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
2573                pi_mr->ibmr.iova = iova;
2574                ibmr->length += pi_mr->meta_length;
2575        }
2576
2577        ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2578                                      pi_mr->desc_size * pi_mr->max_descs,
2579                                      DMA_TO_DEVICE);
2580
2581        return n;
2582}
2583
2584static int
2585mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2586                         int data_sg_nents, unsigned int *data_sg_offset,
2587                         struct scatterlist *meta_sg, int meta_sg_nents,
2588                         unsigned int *meta_sg_offset)
2589{
2590        struct mlx5_ib_mr *mr = to_mmr(ibmr);
2591        struct mlx5_ib_mr *pi_mr = mr->klm_mr;
2592        int n;
2593
2594        pi_mr->ndescs = 0;
2595        pi_mr->meta_ndescs = 0;
2596        pi_mr->meta_length = 0;
2597
2598        ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2599                                   pi_mr->desc_size * pi_mr->max_descs,
2600                                   DMA_TO_DEVICE);
2601
2602        n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
2603                               meta_sg, meta_sg_nents, meta_sg_offset);
2604
2605        ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2606                                      pi_mr->desc_size * pi_mr->max_descs,
2607                                      DMA_TO_DEVICE);
2608
2609        /* This is a zero-based memory region */
2610        pi_mr->data_iova = 0;
2611        pi_mr->ibmr.iova = 0;
2612        pi_mr->pi_iova = pi_mr->data_length;
2613        ibmr->length = pi_mr->ibmr.length;
2614
2615        return n;
2616}
2617
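/*
 * Map the data and protection-information scatterlists for an integrity
 * MR. The cheapest representation that covers all entries wins: plain PA
 * mapping first, then the internal MTT MR, and the internal KLM MR as the
 * last resort.
 */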
2618int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2619                         int data_sg_nents, unsigned int *data_sg_offset,
2620                         struct scatterlist *meta_sg, int meta_sg_nents,
2621                         unsigned int *meta_sg_offset)
2622{
2623        struct mlx5_ib_mr *mr = to_mmr(ibmr);
2624        struct mlx5_ib_mr *pi_mr = NULL;
2625        int n;
2626
2627        WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);
2628
2629        mr->ndescs = 0;
2630        mr->data_length = 0;
2631        mr->data_iova = 0;
2632        mr->meta_ndescs = 0;
2633        mr->pi_iova = 0;
2634        /*
2635         * As a performance optimization, if possible, there is no need to
2636         * perform a UMR operation to register the data/metadata buffers.
2637         * First try to map the sg lists to PA descriptors with local_dma_lkey.
2638         * Fall back to UMR only in case of a failure.
2639         */
2640        n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2641                                    data_sg_offset, meta_sg, meta_sg_nents,
2642                                    meta_sg_offset);
2643        if (n == data_sg_nents + meta_sg_nents)
2644                goto out;
2645        /*
2646         * As a performance optimization, if possible, there is no need to map
2647         * the sg lists to KLM descriptors. First try to map the sg lists to MTT
2648         * descriptors and fall back to KLM only in case of a failure.
2649         * It's more efficient for the HW to work with MTT descriptors
2650         * (especially under high load).
2651         * Use KLM (indirect access) only if it's mandatory.
2652         */
2653        pi_mr = mr->mtt_mr;
2654        n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2655                                     data_sg_offset, meta_sg, meta_sg_nents,
2656                                     meta_sg_offset);
2657        if (n == data_sg_nents + meta_sg_nents)
2658                goto out;
2659
2660        pi_mr = mr->klm_mr;
2661        n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2662                                     data_sg_offset, meta_sg, meta_sg_nents,
2663                                     meta_sg_offset);
2664        if (unlikely(n != data_sg_nents + meta_sg_nents))
2665                return -ENOMEM;
2666
2667out:
2668        /* This is a zero-based memory region */
2669        ibmr->iova = 0;
2670        mr->pi_mr = pi_mr;
2671        if (pi_mr)
2672                ibmr->sig_attrs->meta_length = pi_mr->meta_length;
2673        else
2674                ibmr->sig_attrs->meta_length = mr->meta_length;
2675
2676        return 0;
2677}
2678
2679int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
2680                      unsigned int *sg_offset)
2681{
2682        struct mlx5_ib_mr *mr = to_mmr(ibmr);
2683        int n;
2684
2685        mr->ndescs = 0;
2686
2687        ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
2688                                   mr->desc_size * mr->max_descs,
2689                                   DMA_TO_DEVICE);
2690
2691        if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
2692                n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
2693                                       NULL);
2694        else
2695                n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
2696                                mlx5_set_page);
2697
2698        ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
2699                                      mr->desc_size * mr->max_descs,
2700                                      DMA_TO_DEVICE);
2701
2702        return n;
2703}
2704