linux/drivers/infiniband/hw/mlx5/mr.c
   1/*
   2 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
   3 *
   4 * This software is available to you under a choice of one of two
   5 * licenses.  You may choose to be licensed under the terms of the GNU
   6 * General Public License (GPL) Version 2, available from the file
   7 * COPYING in the main directory of this source tree, or the
   8 * OpenIB.org BSD license below:
   9 *
  10 *     Redistribution and use in source and binary forms, with or
  11 *     without modification, are permitted provided that the following
  12 *     conditions are met:
  13 *
  14 *      - Redistributions of source code must retain the above
  15 *        copyright notice, this list of conditions and the following
  16 *        disclaimer.
  17 *
  18 *      - Redistributions in binary form must reproduce the above
  19 *        copyright notice, this list of conditions and the following
  20 *        disclaimer in the documentation and/or other materials
  21 *        provided with the distribution.
  22 *
  23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  30 * SOFTWARE.
  31 */
  32
  33
  34#include <linux/kref.h>
  35#include <linux/random.h>
  36#include <linux/debugfs.h>
  37#include <linux/export.h>
  38#include <linux/delay.h>
  39#include <rdma/ib_umem.h>
  40#include <rdma/ib_umem_odp.h>
  41#include <rdma/ib_verbs.h>
  42#include "mlx5_ib.h"
  43
  44enum {
  45        MAX_PENDING_REG_MR = 8,
  46};
  47
  48#define MLX5_UMR_ALIGN 2048
  49
  50static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
  51static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
  52static int mr_cache_max_order(struct mlx5_ib_dev *dev);
  53static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
  54
  55static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
  56{
  57        return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled);
  58}
  59
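     /*
      * Destroy the HW mkey behind @mr.  When on-demand paging is
      * enabled, wait for an SRCU grace period so that any page-fault
      * handler still dereferencing the mkey has finished before the
      * caller frees @mr.
      */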
  60static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
  61{
  62        int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
  63
  64        if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
  65                /* Wait until all page fault handlers using the mr complete. */
  66                synchronize_srcu(&dev->mr_srcu);
  67
  68        return err;
  69}
  70
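     /*
      * Map an MR order to an index into the MR cache entries.  Entry 0
      * holds the smallest cached order (ent[0].order); orders below
      * that are clamped to index 0.
      */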
  71static int order2idx(struct mlx5_ib_dev *dev, int order)
  72{
  73        struct mlx5_mr_cache *cache = &dev->cache;
  74
  75        if (order < cache->ent[0].order)
  76                return 0;
  77        else
  78                return order - cache->ent[0].order;
  79}
  80
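     /*
      * Check whether the translation table of a cached MR (2^order
      * pages of MLX5_ADAPTER_PAGE_SIZE, i.e. 4KB) can cover the new
      * range, including the offset of @start within its first page.
      * For example, an order-5 MR covers 32 * 4KB = 128KB, so a 126KB
      * range that starts 3KB into a page (129KB in total) does not fit.
      */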
  81static bool use_umr_mtt_update(struct mlx5_ib_mr *mr, u64 start, u64 length)
  82{
  83        return ((u64)1 << mr->order) * MLX5_ADAPTER_PAGE_SIZE >=
  84                length + (start & (MLX5_ADAPTER_PAGE_SIZE - 1));
  85}
  86
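     /*
      * Completion callback for the asynchronous mkey creation issued by
      * add_keys().  On failure the MR is freed and cache filling is
      * delayed for about a second; on success the mkey is built from
      * the returned index plus a per-device variable key byte, the MR
      * is added to its cache entry and the mkey is inserted into the
      * device mkey table.
      */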
  87static void reg_mr_callback(int status, struct mlx5_async_work *context)
  88{
  89        struct mlx5_ib_mr *mr =
  90                container_of(context, struct mlx5_ib_mr, cb_work);
  91        struct mlx5_ib_dev *dev = mr->dev;
  92        struct mlx5_mr_cache *cache = &dev->cache;
  93        int c = order2idx(dev, mr->order);
  94        struct mlx5_cache_ent *ent = &cache->ent[c];
  95        u8 key;
  96        unsigned long flags;
  97        struct xarray *mkeys = &dev->mdev->priv.mkey_table;
  98        int err;
  99
 100        spin_lock_irqsave(&ent->lock, flags);
 101        ent->pending--;
 102        spin_unlock_irqrestore(&ent->lock, flags);
 103        if (status) {
 104                mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
 105                kfree(mr);
 106                dev->fill_delay = 1;
 107                mod_timer(&dev->delay_timer, jiffies + HZ);
 108                return;
 109        }
 110
 111        mr->mmkey.type = MLX5_MKEY_MR;
 112        spin_lock_irqsave(&dev->mdev->priv.mkey_lock, flags);
 113        key = dev->mdev->priv.mkey_key++;
 114        spin_unlock_irqrestore(&dev->mdev->priv.mkey_lock, flags);
 115        mr->mmkey.key = mlx5_idx_to_mkey(MLX5_GET(create_mkey_out, mr->out, mkey_index)) | key;
 116
 117        cache->last_add = jiffies;
 118
 119        spin_lock_irqsave(&ent->lock, flags);
 120        list_add_tail(&mr->list, &ent->head);
 121        ent->cur++;
 122        ent->size++;
 123        spin_unlock_irqrestore(&ent->lock, flags);
 124
 125        xa_lock_irqsave(mkeys, flags);
 126        err = xa_err(__xa_store(mkeys, mlx5_base_mkey(mr->mmkey.key),
 127                                &mr->mmkey, GFP_ATOMIC));
 128        xa_unlock_irqrestore(mkeys, flags);
 129        if (err)
 130                pr_err("Error inserting to mkey tree. 0x%x\n", -err);
 131
 132        if (!completion_done(&ent->compl))
 133                complete(&ent->compl);
 134}
 135
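     /*
      * Asynchronously create up to @num free, UMR-enabled mkeys for
      * cache entry @c.  At most MAX_PENDING_REG_MR creations may be
      * outstanding per entry; beyond that -EAGAIN is returned and the
      * caller is expected to retry later.
      */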
 136static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
 137{
 138        struct mlx5_mr_cache *cache = &dev->cache;
 139        struct mlx5_cache_ent *ent = &cache->ent[c];
 140        int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
 141        struct mlx5_ib_mr *mr;
 142        void *mkc;
 143        u32 *in;
 144        int err = 0;
 145        int i;
 146
 147        in = kzalloc(inlen, GFP_KERNEL);
 148        if (!in)
 149                return -ENOMEM;
 150
 151        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 152        for (i = 0; i < num; i++) {
 153                if (ent->pending >= MAX_PENDING_REG_MR) {
 154                        err = -EAGAIN;
 155                        break;
 156                }
 157
 158                mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 159                if (!mr) {
 160                        err = -ENOMEM;
 161                        break;
 162                }
 163                mr->order = ent->order;
 164                mr->allocated_from_cache = 1;
 165                mr->dev = dev;
 166
 167                MLX5_SET(mkc, mkc, free, 1);
 168                MLX5_SET(mkc, mkc, umr_en, 1);
 169                MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3);
 170                MLX5_SET(mkc, mkc, access_mode_4_2,
 171                         (ent->access_mode >> 2) & 0x7);
 172
 173                MLX5_SET(mkc, mkc, qpn, 0xffffff);
 174                MLX5_SET(mkc, mkc, translations_octword_size, ent->xlt);
 175                MLX5_SET(mkc, mkc, log_page_size, ent->page);
 176
 177                spin_lock_irq(&ent->lock);
 178                ent->pending++;
 179                spin_unlock_irq(&ent->lock);
 180                err = mlx5_core_create_mkey_cb(dev->mdev, &mr->mmkey,
 181                                               &dev->async_ctx, in, inlen,
 182                                               mr->out, sizeof(mr->out),
 183                                               reg_mr_callback, &mr->cb_work);
 184                if (err) {
 185                        spin_lock_irq(&ent->lock);
 186                        ent->pending--;
 187                        spin_unlock_irq(&ent->lock);
 188                        mlx5_ib_warn(dev, "create mkey failed %d\n", err);
 189                        kfree(mr);
 190                        break;
 191                }
 192        }
 193
 194        kfree(in);
 195        return err;
 196}
 197
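     /*
      * Remove up to @num MRs from the head of cache entry @c, destroy
      * their mkeys, wait for an SRCU grace period when ODP is enabled,
      * and only then free the MR structures.
      */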
 198static void remove_keys(struct mlx5_ib_dev *dev, int c, int num)
 199{
 200        struct mlx5_mr_cache *cache = &dev->cache;
 201        struct mlx5_cache_ent *ent = &cache->ent[c];
 202        struct mlx5_ib_mr *tmp_mr;
 203        struct mlx5_ib_mr *mr;
 204        LIST_HEAD(del_list);
 205        int i;
 206
 207        for (i = 0; i < num; i++) {
 208                spin_lock_irq(&ent->lock);
 209                if (list_empty(&ent->head)) {
 210                        spin_unlock_irq(&ent->lock);
 211                        break;
 212                }
 213                mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
 214                list_move(&mr->list, &del_list);
 215                ent->cur--;
 216                ent->size--;
 217                spin_unlock_irq(&ent->lock);
 218                mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
 219        }
 220
 221        if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
 222                synchronize_srcu(&dev->mr_srcu);
 223
 224        list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
 225                list_del(&mr->list);
 226                kfree(mr);
 227        }
 228}
 229
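     /*
      * debugfs "size" write handler: resize a cache entry to the
      * requested number of mkeys, growing via add_keys() or shrinking
      * via remove_keys().  Values below the entry's limit are rejected.
      * The file lives under the "mr_cache" directory created in
      * mlx5_mr_cache_debugfs_init(), typically
      * /sys/kernel/debug/mlx5/<device>/mr_cache/<order>/size, e.g.
      * "echo 32 > .../mr_cache/8/size" (the exact path depends on the
      * mlx5 debugfs root).
      */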
 230static ssize_t size_write(struct file *filp, const char __user *buf,
 231                          size_t count, loff_t *pos)
 232{
 233        struct mlx5_cache_ent *ent = filp->private_data;
 234        struct mlx5_ib_dev *dev = ent->dev;
 235        char lbuf[20] = {0};
 236        u32 var;
 237        int err;
 238        int c;
 239
 240        count = min(count, sizeof(lbuf) - 1);
 241        if (copy_from_user(lbuf, buf, count))
 242                return -EFAULT;
 243
 244        c = order2idx(dev, ent->order);
 245
 246        if (sscanf(lbuf, "%u", &var) != 1)
 247                return -EINVAL;
 248
 249        if (var < ent->limit)
 250                return -EINVAL;
 251
 252        if (var > ent->size) {
 253                do {
 254                        err = add_keys(dev, c, var - ent->size);
 255                        if (err && err != -EAGAIN)
 256                                return err;
 257
 258                        usleep_range(3000, 5000);
 259                } while (err);
 260        } else if (var < ent->size) {
 261                remove_keys(dev, c, ent->size - var);
 262        }
 263
 264        return count;
 265}
 266
 267static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
 268                         loff_t *pos)
 269{
 270        struct mlx5_cache_ent *ent = filp->private_data;
 271        char lbuf[20];
 272        int err;
 273
 274        err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->size);
 275        if (err < 0)
 276                return err;
 277
 278        return simple_read_from_buffer(buf, count, pos, lbuf, err);
 279}
 280
 281static const struct file_operations size_fops = {
 282        .owner  = THIS_MODULE,
 283        .open   = simple_open,
 284        .write  = size_write,
 285        .read   = size_read,
 286};
 287
 288static ssize_t limit_write(struct file *filp, const char __user *buf,
 289                           size_t count, loff_t *pos)
 290{
 291        struct mlx5_cache_ent *ent = filp->private_data;
 292        struct mlx5_ib_dev *dev = ent->dev;
 293        char lbuf[20] = {0};
 294        u32 var;
 295        int err;
 296        int c;
 297
 298        count = min(count, sizeof(lbuf) - 1);
 299        if (copy_from_user(lbuf, buf, count))
 300                return -EFAULT;
 301
 302        c = order2idx(dev, ent->order);
 303
 304        if (sscanf(lbuf, "%u", &var) != 1)
 305                return -EINVAL;
 306
 307        if (var > ent->size)
 308                return -EINVAL;
 309
 310        ent->limit = var;
 311
 312        if (ent->cur < ent->limit) {
 313                err = add_keys(dev, c, 2 * ent->limit - ent->cur);
 314                if (err)
 315                        return err;
 316        }
 317
 318        return count;
 319}
 320
 321static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
 322                          loff_t *pos)
 323{
 324        struct mlx5_cache_ent *ent = filp->private_data;
 325        char lbuf[20];
 326        int err;
 327
 328        err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
 329        if (err < 0)
 330                return err;
 331
 332        return simple_read_from_buffer(buf, count, pos, lbuf, err);
 333}
 334
 335static const struct file_operations limit_fops = {
 336        .owner  = THIS_MODULE,
 337        .open   = simple_open,
 338        .write  = limit_write,
 339        .read   = limit_read,
 340};
 341
 342static int someone_adding(struct mlx5_mr_cache *cache)
 343{
 344        int i;
 345
 346        for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
 347                if (cache->ent[i].cur < cache->ent[i].limit)
 348                        return 1;
 349        }
 350
 351        return 0;
 352}
 353
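     /*
      * Background worker for one cache entry.  It tries to keep the
      * entry populated between limit and 2 * limit: below 2 * limit it
      * adds keys one at a time (paused while fill_delay is set, backing
      * off with delayed work on -EAGAIN or errors), and above 2 * limit
      * it lazily trims one key at a time, but only when the CPU has
      * nothing else to run, no entry needs filling, and nothing has
      * been added to the cache for roughly five minutes.
      */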
 354static void __cache_work_func(struct mlx5_cache_ent *ent)
 355{
 356        struct mlx5_ib_dev *dev = ent->dev;
 357        struct mlx5_mr_cache *cache = &dev->cache;
 358        int i = order2idx(dev, ent->order);
 359        int err;
 360
 361        if (cache->stopped)
 362                return;
 363
 364        ent = &dev->cache.ent[i];
 365        if (ent->cur < 2 * ent->limit && !dev->fill_delay) {
 366                err = add_keys(dev, i, 1);
 367                if (ent->cur < 2 * ent->limit) {
 368                        if (err == -EAGAIN) {
 369                                mlx5_ib_dbg(dev, "returned eagain, order %d\n",
 370                                            i + 2);
 371                                queue_delayed_work(cache->wq, &ent->dwork,
 372                                                   msecs_to_jiffies(3));
 373                        } else if (err) {
 374                                mlx5_ib_warn(dev, "command failed order %d, err %d\n",
 375                                             i + 2, err);
 376                                queue_delayed_work(cache->wq, &ent->dwork,
 377                                                   msecs_to_jiffies(1000));
 378                        } else {
 379                                queue_work(cache->wq, &ent->work);
 380                        }
 381                }
 382        } else if (ent->cur > 2 * ent->limit) {
  383                /*
  384                 * The remove_keys() logic is performed as a garbage collection
  385                 * task. Such a task is intended to run only when no other
  386                 * active processes are running.
  387                 *
  388                 * need_resched() returns TRUE if there are user tasks to be
  389                 * activated in the near future.
  390                 *
  391                 * In that case, we do not execute remove_keys() and instead
  392                 * postpone the garbage collection work to the next cycle, in
  393                 * order to free CPU resources for other tasks.
  394                 */
 395                if (!need_resched() && !someone_adding(cache) &&
 396                    time_after(jiffies, cache->last_add + 300 * HZ)) {
 397                        remove_keys(dev, i, 1);
 398                        if (ent->cur > ent->limit)
 399                                queue_work(cache->wq, &ent->work);
 400                } else {
 401                        queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
 402                }
 403        }
 404}
 405
 406static void delayed_cache_work_func(struct work_struct *work)
 407{
 408        struct mlx5_cache_ent *ent;
 409
 410        ent = container_of(work, struct mlx5_cache_ent, dwork.work);
 411        __cache_work_func(ent);
 412}
 413
 414static void cache_work_func(struct work_struct *work)
 415{
 416        struct mlx5_cache_ent *ent;
 417
 418        ent = container_of(work, struct mlx5_cache_ent, work);
 419        __cache_work_func(ent);
 420}
 421
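     /*
      * Allocate an MR from a specific cache entry.  If the entry is
      * empty, request one more key and sleep on the entry's completion,
      * which reg_mr_callback() signals, then retry.  Returns NULL only
      * for an out-of-range entry index.
      */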
 422struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry)
 423{
 424        struct mlx5_mr_cache *cache = &dev->cache;
 425        struct mlx5_cache_ent *ent;
 426        struct mlx5_ib_mr *mr;
 427        int err;
 428
 429        if (entry < 0 || entry >= MAX_MR_CACHE_ENTRIES) {
 430                mlx5_ib_err(dev, "cache entry %d is out of range\n", entry);
 431                return NULL;
 432        }
 433
 434        ent = &cache->ent[entry];
 435        while (1) {
 436                spin_lock_irq(&ent->lock);
 437                if (list_empty(&ent->head)) {
 438                        spin_unlock_irq(&ent->lock);
 439
 440                        err = add_keys(dev, entry, 1);
 441                        if (err && err != -EAGAIN)
 442                                return ERR_PTR(err);
 443
 444                        wait_for_completion(&ent->compl);
 445                } else {
 446                        mr = list_first_entry(&ent->head, struct mlx5_ib_mr,
 447                                              list);
 448                        list_del(&mr->list);
 449                        ent->cur--;
 450                        spin_unlock_irq(&ent->lock);
 451                        if (ent->cur < ent->limit)
 452                                queue_work(cache->wq, &ent->work);
 453                        return mr;
 454                }
 455        }
 456}
 457
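     /*
      * Try to take an MR from the cache, starting at the entry matching
      * @order and falling back to larger orders up to the last
      * UMR-capable entry.  Every empty entry visited gets its
      * background worker kicked, and a complete miss is accounted in
      * the requested entry.
      */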
 458static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order)
 459{
 460        struct mlx5_mr_cache *cache = &dev->cache;
 461        struct mlx5_ib_mr *mr = NULL;
 462        struct mlx5_cache_ent *ent;
 463        int last_umr_cache_entry;
 464        int c;
 465        int i;
 466
 467        c = order2idx(dev, order);
 468        last_umr_cache_entry = order2idx(dev, mr_cache_max_order(dev));
 469        if (c < 0 || c > last_umr_cache_entry) {
 470                mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c);
 471                return NULL;
 472        }
 473
 474        for (i = c; i <= last_umr_cache_entry; i++) {
 475                ent = &cache->ent[i];
 476
 477                mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i);
 478
 479                spin_lock_irq(&ent->lock);
 480                if (!list_empty(&ent->head)) {
 481                        mr = list_first_entry(&ent->head, struct mlx5_ib_mr,
 482                                              list);
 483                        list_del(&mr->list);
 484                        ent->cur--;
 485                        spin_unlock_irq(&ent->lock);
 486                        if (ent->cur < ent->limit)
 487                                queue_work(cache->wq, &ent->work);
 488                        break;
 489                }
 490                spin_unlock_irq(&ent->lock);
 491
 492                queue_work(cache->wq, &ent->work);
 493        }
 494
 495        if (!mr)
 496                cache->ent[c].miss++;
 497
 498        return mr;
 499}
 500
 501void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 502{
 503        struct mlx5_mr_cache *cache = &dev->cache;
 504        struct mlx5_cache_ent *ent;
 505        int shrink = 0;
 506        int c;
 507
 508        if (!mr->allocated_from_cache)
 509                return;
 510
 511        c = order2idx(dev, mr->order);
 512        WARN_ON(c < 0 || c >= MAX_MR_CACHE_ENTRIES);
 513
 514        if (unreg_umr(dev, mr)) {
 515                mr->allocated_from_cache = false;
 516                destroy_mkey(dev, mr);
 517                ent = &cache->ent[c];
 518                if (ent->cur < ent->limit)
 519                        queue_work(cache->wq, &ent->work);
 520                return;
 521        }
 522
 523        ent = &cache->ent[c];
 524        spin_lock_irq(&ent->lock);
 525        list_add_tail(&mr->list, &ent->head);
 526        ent->cur++;
 527        if (ent->cur > 2 * ent->limit)
 528                shrink = 1;
 529        spin_unlock_irq(&ent->lock);
 530
 531        if (shrink)
 532                queue_work(cache->wq, &ent->work);
 533}
 534
 535static void clean_keys(struct mlx5_ib_dev *dev, int c)
 536{
 537        struct mlx5_mr_cache *cache = &dev->cache;
 538        struct mlx5_cache_ent *ent = &cache->ent[c];
 539        struct mlx5_ib_mr *tmp_mr;
 540        struct mlx5_ib_mr *mr;
 541        LIST_HEAD(del_list);
 542
 543        cancel_delayed_work(&ent->dwork);
 544        while (1) {
 545                spin_lock_irq(&ent->lock);
 546                if (list_empty(&ent->head)) {
 547                        spin_unlock_irq(&ent->lock);
 548                        break;
 549                }
 550                mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
 551                list_move(&mr->list, &del_list);
 552                ent->cur--;
 553                ent->size--;
 554                spin_unlock_irq(&ent->lock);
 555                mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
 556        }
 557
 558#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 559        synchronize_srcu(&dev->mr_srcu);
 560#endif
 561
 562        list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
 563                list_del(&mr->list);
 564                kfree(mr);
 565        }
 566}
 567
 568static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
 569{
 570        if (!mlx5_debugfs_root || dev->is_rep)
 571                return;
 572
 573        debugfs_remove_recursive(dev->cache.root);
 574        dev->cache.root = NULL;
 575}
 576
 577static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
 578{
 579        struct mlx5_mr_cache *cache = &dev->cache;
 580        struct mlx5_cache_ent *ent;
 581        struct dentry *dir;
 582        int i;
 583
 584        if (!mlx5_debugfs_root || dev->is_rep)
 585                return;
 586
 587        cache->root = debugfs_create_dir("mr_cache", dev->mdev->priv.dbg_root);
 588
 589        for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
 590                ent = &cache->ent[i];
 591                sprintf(ent->name, "%d", ent->order);
 592                dir = debugfs_create_dir(ent->name, cache->root);
 593                debugfs_create_file("size", 0600, dir, ent, &size_fops);
 594                debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
 595                debugfs_create_u32("cur", 0400, dir, &ent->cur);
 596                debugfs_create_u32("miss", 0600, dir, &ent->miss);
 597        }
 598}
 599
 600static void delay_time_func(struct timer_list *t)
 601{
 602        struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer);
 603
 604        dev->fill_delay = 0;
 605}
 606
 607int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 608{
 609        struct mlx5_mr_cache *cache = &dev->cache;
 610        struct mlx5_cache_ent *ent;
 611        int i;
 612
 613        mutex_init(&dev->slow_path_mutex);
 614        cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
 615        if (!cache->wq) {
 616                mlx5_ib_warn(dev, "failed to create work queue\n");
 617                return -ENOMEM;
 618        }
 619
 620        mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
 621        timer_setup(&dev->delay_timer, delay_time_func, 0);
 622        for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
 623                ent = &cache->ent[i];
 624                INIT_LIST_HEAD(&ent->head);
 625                spin_lock_init(&ent->lock);
 626                ent->order = i + 2;
 627                ent->dev = dev;
 628                ent->limit = 0;
 629
 630                init_completion(&ent->compl);
 631                INIT_WORK(&ent->work, cache_work_func);
 632                INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
 633
 634                if (i > MR_CACHE_LAST_STD_ENTRY) {
 635                        mlx5_odp_init_mr_cache_entry(ent);
 636                        continue;
 637                }
 638
 639                if (ent->order > mr_cache_max_order(dev))
 640                        continue;
 641
 642                ent->page = PAGE_SHIFT;
 643                ent->xlt = (1 << ent->order) * sizeof(struct mlx5_mtt) /
 644                           MLX5_IB_UMR_OCTOWORD;
 645                ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
 646                if ((dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) &&
 647                    !dev->is_rep &&
 648                    mlx5_core_is_pf(dev->mdev))
 649                        ent->limit = dev->mdev->profile->mr_cache[i].limit;
 650                else
 651                        ent->limit = 0;
 652                queue_work(cache->wq, &ent->work);
 653        }
 654
 655        mlx5_mr_cache_debugfs_init(dev);
 656
 657        return 0;
 658}
 659
 660int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
 661{
 662        int i;
 663
 664        if (!dev->cache.wq)
 665                return 0;
 666
 667        dev->cache.stopped = 1;
 668        flush_workqueue(dev->cache.wq);
 669
 670        mlx5_mr_cache_debugfs_cleanup(dev);
 671        mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
 672
 673        for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)
 674                clean_keys(dev, i);
 675
 676        destroy_workqueue(dev->cache.wq);
 677        del_timer_sync(&dev->delay_timer);
 678
 679        return 0;
 680}
 681
 682struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
 683{
 684        struct mlx5_ib_dev *dev = to_mdev(pd->device);
 685        int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
 686        struct mlx5_core_dev *mdev = dev->mdev;
 687        struct mlx5_ib_mr *mr;
 688        void *mkc;
 689        u32 *in;
 690        int err;
 691
 692        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 693        if (!mr)
 694                return ERR_PTR(-ENOMEM);
 695
 696        in = kzalloc(inlen, GFP_KERNEL);
 697        if (!in) {
 698                err = -ENOMEM;
 699                goto err_free;
 700        }
 701
 702        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 703
 704        MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
 705        MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
 706        MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
 707        MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
 708        MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
 709        MLX5_SET(mkc, mkc, lr, 1);
 710
 711        MLX5_SET(mkc, mkc, length64, 1);
 712        MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
 713        MLX5_SET(mkc, mkc, qpn, 0xffffff);
 714        MLX5_SET64(mkc, mkc, start_addr, 0);
 715
 716        err = mlx5_core_create_mkey(mdev, &mr->mmkey, in, inlen);
 717        if (err)
 718                goto err_in;
 719
 720        kfree(in);
 721        mr->mmkey.type = MLX5_MKEY_MR;
 722        mr->ibmr.lkey = mr->mmkey.key;
 723        mr->ibmr.rkey = mr->mmkey.key;
 724        mr->umem = NULL;
 725
 726        return &mr->ibmr;
 727
 728err_in:
 729        kfree(in);
 730
 731err_free:
 732        kfree(mr);
 733
 734        return ERR_PTR(err);
 735}
 736
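     /*
      * Number of translation octowords needed to map @len bytes
      * starting at @addr: each 16-byte octoword holds two 8-byte MTT
      * entries, i.e. two pages.  For example, 1MB aligned on a 4KB page
      * boundary needs 256 pages and therefore 128 octowords.
      */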
 737static int get_octo_len(u64 addr, u64 len, int page_shift)
 738{
 739        u64 page_size = 1ULL << page_shift;
 740        u64 offset;
 741        int npages;
 742
 743        offset = addr & (page_size - 1);
 744        npages = ALIGN(len + offset, page_size) >> page_shift;
 745        return (npages + 1) / 2;
 746}
 747
 748static int mr_cache_max_order(struct mlx5_ib_dev *dev)
 749{
 750        if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
 751                return MR_CACHE_LAST_STD_ENTRY + 2;
 752        return MLX5_MAX_UMR_SHIFT;
 753}
 754
 755static int mr_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata,
 756                       u64 start, u64 length, int access_flags,
 757                       struct ib_umem **umem, int *npages, int *page_shift,
 758                       int *ncont, int *order)
 759{
 760        struct ib_umem *u;
 761
 762        *umem = NULL;
 763
 764        if (access_flags & IB_ACCESS_ON_DEMAND) {
 765                struct ib_umem_odp *odp;
 766
 767                odp = ib_umem_odp_get(udata, start, length, access_flags);
 768                if (IS_ERR(odp)) {
 769                        mlx5_ib_dbg(dev, "umem get failed (%ld)\n",
 770                                    PTR_ERR(odp));
 771                        return PTR_ERR(odp);
 772                }
 773
 774                u = &odp->umem;
 775
 776                *page_shift = odp->page_shift;
 777                *ncont = ib_umem_odp_num_pages(odp);
 778                *npages = *ncont << (*page_shift - PAGE_SHIFT);
 779                if (order)
 780                        *order = ilog2(roundup_pow_of_two(*ncont));
 781        } else {
 782                u = ib_umem_get(udata, start, length, access_flags, 0);
 783                if (IS_ERR(u)) {
 784                        mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(u));
 785                        return PTR_ERR(u);
 786                }
 787
 788                mlx5_ib_cont_pages(u, start, MLX5_MKEY_PAGE_SHIFT_MASK, npages,
 789                                   page_shift, ncont, order);
 790        }
 791
 792        if (!*npages) {
 793                mlx5_ib_warn(dev, "avoid zero region\n");
 794                ib_umem_release(u);
 795                return -EINVAL;
 796        }
 797
 798        *umem = u;
 799
 800        mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n",
 801                    *npages, *ncont, *order, *page_shift);
 802
 803        return 0;
 804}
 805
 806static void mlx5_ib_umr_done(struct ib_cq *cq, struct ib_wc *wc)
 807{
 808        struct mlx5_ib_umr_context *context =
 809                container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe);
 810
 811        context->status = wc->status;
 812        complete(&context->done);
 813}
 814
 815static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context)
 816{
 817        context->cqe.done = mlx5_ib_umr_done;
 818        context->status = -1;
 819        init_completion(&context->done);
 820}
 821
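     /*
      * Post a UMR work request on the driver's dedicated UMR QP and
      * sleep until its completion arrives.  The umrc semaphore limits
      * the number of UMR operations in flight; a completion status
      * other than IB_WC_SUCCESS is reported as -EFAULT.
      */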
 822static int mlx5_ib_post_send_wait(struct mlx5_ib_dev *dev,
 823                                  struct mlx5_umr_wr *umrwr)
 824{
 825        struct umr_common *umrc = &dev->umrc;
 826        const struct ib_send_wr *bad;
 827        int err;
 828        struct mlx5_ib_umr_context umr_context;
 829
 830        mlx5_ib_init_umr_context(&umr_context);
 831        umrwr->wr.wr_cqe = &umr_context.cqe;
 832
 833        down(&umrc->sem);
 834        err = ib_post_send(umrc->qp, &umrwr->wr, &bad);
 835        if (err) {
 836                mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err);
 837        } else {
 838                wait_for_completion(&umr_context.done);
 839                if (umr_context.status != IB_WC_SUCCESS) {
 840                        mlx5_ib_warn(dev, "reg umr failed (%u)\n",
 841                                     umr_context.status);
 842                        err = -EFAULT;
 843                }
 844        }
 845        up(&umrc->sem);
 846        return err;
 847}
 848
 849static struct mlx5_ib_mr *alloc_mr_from_cache(
 850                                  struct ib_pd *pd, struct ib_umem *umem,
 851                                  u64 virt_addr, u64 len, int npages,
 852                                  int page_shift, int order, int access_flags)
 853{
 854        struct mlx5_ib_dev *dev = to_mdev(pd->device);
 855        struct mlx5_ib_mr *mr;
 856        int err = 0;
 857        int i;
 858
 859        for (i = 0; i < 1; i++) {
 860                mr = alloc_cached_mr(dev, order);
 861                if (mr)
 862                        break;
 863
 864                err = add_keys(dev, order2idx(dev, order), 1);
 865                if (err && err != -EAGAIN) {
 866                        mlx5_ib_warn(dev, "add_keys failed, err %d\n", err);
 867                        break;
 868                }
 869        }
 870
 871        if (!mr)
 872                return ERR_PTR(-EAGAIN);
 873
 874        mr->ibmr.pd = pd;
 875        mr->umem = umem;
 876        mr->access_flags = access_flags;
 877        mr->desc_size = sizeof(struct mlx5_mtt);
 878        mr->mmkey.iova = virt_addr;
 879        mr->mmkey.size = len;
 880        mr->mmkey.pd = to_mpd(pd)->pdn;
 881
 882        return mr;
 883}
 884
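     /*
      * Fill one chunk of the XLT buffer starting at page index @idx:
      * KLMs for indirect (ODP) mkeys, otherwise MTTs copied from the
      * umem with the tail of the chunk cleared; with
      * MLX5_IB_UPD_XLT_ZAP nothing is copied.  Returns the number of
      * entries covered.
      */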
 885static inline int populate_xlt(struct mlx5_ib_mr *mr, int idx, int npages,
 886                               void *xlt, int page_shift, size_t size,
 887                               int flags)
 888{
 889        struct mlx5_ib_dev *dev = mr->dev;
 890        struct ib_umem *umem = mr->umem;
 891
 892        if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
 893                if (!umr_can_use_indirect_mkey(dev))
 894                        return -EPERM;
 895                mlx5_odp_populate_klm(xlt, idx, npages, mr, flags);
 896                return npages;
 897        }
 898
 899        npages = min_t(size_t, npages, ib_umem_num_pages(umem) - idx);
 900
 901        if (!(flags & MLX5_IB_UPD_XLT_ZAP)) {
 902                __mlx5_ib_populate_pas(dev, umem, page_shift,
 903                                       idx, npages, xlt,
 904                                       MLX5_IB_MTT_PRESENT);
 905                /* Clear padding after the pages
 906                 * brought from the umem.
 907                 */
 908                memset(xlt + (npages * sizeof(struct mlx5_mtt)), 0,
 909                       size - npages * sizeof(struct mlx5_mtt));
 910        }
 911
 912        return npages;
 913}
 914
 915#define MLX5_MAX_UMR_CHUNK ((1 << (MLX5_MAX_UMR_SHIFT + 4)) - \
 916                            MLX5_UMR_MTT_ALIGNMENT)
 917#define MLX5_SPARE_UMR_CHUNK 0x10000
 918
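     /*
      * Update a memory key's translation table via UMR in chunks.  The
      * XLT scratch buffer is allocated with a three-step fallback: a
      * buffer of up to MLX5_MAX_UMR_CHUNK, then the smaller
      * MLX5_SPARE_UMR_CHUNK, and finally the single-page emergency
      * buffer.  Each chunk is populated, DMA-synced and posted as a UMR
      * WQE; the final WQE also carries the enable/PD/access/translation
      * flags when requested.
      */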
 919int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
 920                       int page_shift, int flags)
 921{
 922        struct mlx5_ib_dev *dev = mr->dev;
 923        struct device *ddev = dev->ib_dev.dev.parent;
 924        int size;
 925        void *xlt;
 926        dma_addr_t dma;
 927        struct mlx5_umr_wr wr;
 928        struct ib_sge sg;
 929        int err = 0;
 930        int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT)
 931                               ? sizeof(struct mlx5_klm)
 932                               : sizeof(struct mlx5_mtt);
 933        const int page_align = MLX5_UMR_MTT_ALIGNMENT / desc_size;
 934        const int page_mask = page_align - 1;
 935        size_t pages_mapped = 0;
 936        size_t pages_to_map = 0;
 937        size_t pages_iter = 0;
 938        gfp_t gfp;
 939        bool use_emergency_page = false;
 940
 941        if ((flags & MLX5_IB_UPD_XLT_INDIRECT) &&
 942            !umr_can_use_indirect_mkey(dev))
 943                return -EPERM;
 944
 945        /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes,
 946         * so we need to align the offset and length accordingly
 947         */
 948        if (idx & page_mask) {
 949                npages += idx & page_mask;
 950                idx &= ~page_mask;
 951        }
 952
 953        gfp = flags & MLX5_IB_UPD_XLT_ATOMIC ? GFP_ATOMIC : GFP_KERNEL;
 954        gfp |= __GFP_ZERO | __GFP_NOWARN;
 955
 956        pages_to_map = ALIGN(npages, page_align);
 957        size = desc_size * pages_to_map;
 958        size = min_t(int, size, MLX5_MAX_UMR_CHUNK);
 959
 960        xlt = (void *)__get_free_pages(gfp, get_order(size));
 961        if (!xlt && size > MLX5_SPARE_UMR_CHUNK) {
  962                mlx5_ib_dbg(dev, "Failed to allocate %d bytes of order %d. fallback to spare UMR allocation of %d bytes\n",
 963                            size, get_order(size), MLX5_SPARE_UMR_CHUNK);
 964
 965                size = MLX5_SPARE_UMR_CHUNK;
 966                xlt = (void *)__get_free_pages(gfp, get_order(size));
 967        }
 968
 969        if (!xlt) {
 970                mlx5_ib_warn(dev, "Using XLT emergency buffer\n");
 971                xlt = (void *)mlx5_ib_get_xlt_emergency_page();
 972                size = PAGE_SIZE;
 973                memset(xlt, 0, size);
 974                use_emergency_page = true;
 975        }
 976        pages_iter = size / desc_size;
 977        dma = dma_map_single(ddev, xlt, size, DMA_TO_DEVICE);
 978        if (dma_mapping_error(ddev, dma)) {
 979                mlx5_ib_err(dev, "unable to map DMA during XLT update.\n");
 980                err = -ENOMEM;
 981                goto free_xlt;
 982        }
 983
 984        sg.addr = dma;
 985        sg.lkey = dev->umrc.pd->local_dma_lkey;
 986
 987        memset(&wr, 0, sizeof(wr));
 988        wr.wr.send_flags = MLX5_IB_SEND_UMR_UPDATE_XLT;
 989        if (!(flags & MLX5_IB_UPD_XLT_ENABLE))
 990                wr.wr.send_flags |= MLX5_IB_SEND_UMR_FAIL_IF_FREE;
 991        wr.wr.sg_list = &sg;
 992        wr.wr.num_sge = 1;
 993        wr.wr.opcode = MLX5_IB_WR_UMR;
 994
 995        wr.pd = mr->ibmr.pd;
 996        wr.mkey = mr->mmkey.key;
 997        wr.length = mr->mmkey.size;
 998        wr.virt_addr = mr->mmkey.iova;
 999        wr.access_flags = mr->access_flags;
1000        wr.page_shift = page_shift;
1001
1002        for (pages_mapped = 0;
1003             pages_mapped < pages_to_map && !err;
1004             pages_mapped += pages_iter, idx += pages_iter) {
1005                npages = min_t(int, pages_iter, pages_to_map - pages_mapped);
1006                dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE);
1007                npages = populate_xlt(mr, idx, npages, xlt,
1008                                      page_shift, size, flags);
1009
1010                dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE);
1011
1012                sg.length = ALIGN(npages * desc_size,
1013                                  MLX5_UMR_MTT_ALIGNMENT);
1014
1015                if (pages_mapped + pages_iter >= pages_to_map) {
1016                        if (flags & MLX5_IB_UPD_XLT_ENABLE)
1017                                wr.wr.send_flags |=
1018                                        MLX5_IB_SEND_UMR_ENABLE_MR |
1019                                        MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS |
1020                                        MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
1021                        if (flags & MLX5_IB_UPD_XLT_PD ||
1022                            flags & MLX5_IB_UPD_XLT_ACCESS)
1023                                wr.wr.send_flags |=
1024                                        MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
1025                        if (flags & MLX5_IB_UPD_XLT_ADDR)
1026                                wr.wr.send_flags |=
1027                                        MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
1028                }
1029
1030                wr.offset = idx * desc_size;
1031                wr.xlt_size = sg.length;
1032
1033                err = mlx5_ib_post_send_wait(dev, &wr);
1034        }
1035        dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);
1036
1037free_xlt:
1038        if (use_emergency_page)
1039                mlx5_ib_put_xlt_emergency_page();
1040        else
1041                free_pages((unsigned long)xlt, get_order(size));
1042
1043        return err;
1044}
1045
1046/*
 1047 * If ibmr is NULL, it will be allocated by reg_create.
 1048 * Otherwise, the given ibmr will be used.
1049 */
1050static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd,
1051                                     u64 virt_addr, u64 length,
1052                                     struct ib_umem *umem, int npages,
1053                                     int page_shift, int access_flags,
1054                                     bool populate)
1055{
1056        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1057        struct mlx5_ib_mr *mr;
1058        __be64 *pas;
1059        void *mkc;
1060        int inlen;
1061        u32 *in;
1062        int err;
1063        bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));
1064
1065        mr = ibmr ? to_mmr(ibmr) : kzalloc(sizeof(*mr), GFP_KERNEL);
1066        if (!mr)
1067                return ERR_PTR(-ENOMEM);
1068
1069        mr->ibmr.pd = pd;
1070        mr->access_flags = access_flags;
1071
1072        inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1073        if (populate)
1074                inlen += sizeof(*pas) * roundup(npages, 2);
1075        in = kvzalloc(inlen, GFP_KERNEL);
1076        if (!in) {
1077                err = -ENOMEM;
1078                goto err_1;
1079        }
1080        pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
1081        if (populate && !(access_flags & IB_ACCESS_ON_DEMAND))
1082                mlx5_ib_populate_pas(dev, umem, page_shift, pas,
1083                                     pg_cap ? MLX5_IB_MTT_PRESENT : 0);
1084
1085        /* The pg_access bit allows setting the access flags
1086         * in the page list submitted with the command. */
1087        MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));
1088
1089        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1090        MLX5_SET(mkc, mkc, free, !populate);
1091        MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
1092        MLX5_SET(mkc, mkc, a, !!(access_flags & IB_ACCESS_REMOTE_ATOMIC));
1093        MLX5_SET(mkc, mkc, rw, !!(access_flags & IB_ACCESS_REMOTE_WRITE));
1094        MLX5_SET(mkc, mkc, rr, !!(access_flags & IB_ACCESS_REMOTE_READ));
1095        MLX5_SET(mkc, mkc, lw, !!(access_flags & IB_ACCESS_LOCAL_WRITE));
1096        MLX5_SET(mkc, mkc, lr, 1);
1097        MLX5_SET(mkc, mkc, umr_en, 1);
1098
1099        MLX5_SET64(mkc, mkc, start_addr, virt_addr);
1100        MLX5_SET64(mkc, mkc, len, length);
1101        MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
1102        MLX5_SET(mkc, mkc, bsf_octword_size, 0);
1103        MLX5_SET(mkc, mkc, translations_octword_size,
1104                 get_octo_len(virt_addr, length, page_shift));
1105        MLX5_SET(mkc, mkc, log_page_size, page_shift);
1106        MLX5_SET(mkc, mkc, qpn, 0xffffff);
1107        if (populate) {
1108                MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
1109                         get_octo_len(virt_addr, length, page_shift));
1110        }
1111
1112        err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen);
1113        if (err) {
1114                mlx5_ib_warn(dev, "create mkey failed\n");
1115                goto err_2;
1116        }
1117        mr->mmkey.type = MLX5_MKEY_MR;
1118        mr->desc_size = sizeof(struct mlx5_mtt);
1119        mr->dev = dev;
1120        kvfree(in);
1121
1122        mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);
1123
1124        return mr;
1125
1126err_2:
1127        kvfree(in);
1128
1129err_1:
1130        if (!ibmr)
1131                kfree(mr);
1132
1133        return ERR_PTR(err);
1134}
1135
1136static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
1137                          int npages, u64 length, int access_flags)
1138{
1139        mr->npages = npages;
1140        atomic_add(npages, &dev->mdev->priv.reg_pages);
1141        mr->ibmr.lkey = mr->mmkey.key;
1142        mr->ibmr.rkey = mr->mmkey.key;
1143        mr->ibmr.length = length;
1144        mr->access_flags = access_flags;
1145}
1146
1147static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr,
1148                                       u64 length, int acc, int mode)
1149{
1150        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1151        int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1152        struct mlx5_core_dev *mdev = dev->mdev;
1153        struct mlx5_ib_mr *mr;
1154        void *mkc;
1155        u32 *in;
1156        int err;
1157
1158        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1159        if (!mr)
1160                return ERR_PTR(-ENOMEM);
1161
1162        in = kzalloc(inlen, GFP_KERNEL);
1163        if (!in) {
1164                err = -ENOMEM;
1165                goto err_free;
1166        }
1167
1168        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1169
1170        MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
1171        MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
1172        MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
1173        MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
1174        MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
1175        MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
1176        MLX5_SET(mkc, mkc, lr, 1);
1177
1178        MLX5_SET64(mkc, mkc, len, length);
1179        MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
1180        MLX5_SET(mkc, mkc, qpn, 0xffffff);
1181        MLX5_SET64(mkc, mkc, start_addr, start_addr);
1182
1183        err = mlx5_core_create_mkey(mdev, &mr->mmkey, in, inlen);
1184        if (err)
1185                goto err_in;
1186
1187        kfree(in);
1188
1189        mr->umem = NULL;
1190        set_mr_fields(dev, mr, 0, length, acc);
1191
1192        return &mr->ibmr;
1193
1194err_in:
1195        kfree(in);
1196
1197err_free:
1198        kfree(mr);
1199
1200        return ERR_PTR(err);
1201}
1202
1203int mlx5_ib_advise_mr(struct ib_pd *pd,
1204                      enum ib_uverbs_advise_mr_advice advice,
1205                      u32 flags,
1206                      struct ib_sge *sg_list,
1207                      u32 num_sge,
1208                      struct uverbs_attr_bundle *attrs)
1209{
1210        if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
1211            advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE)
1212                return -EOPNOTSUPP;
1213
1214        return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
1215                                         sg_list, num_sge);
1216}
1217
1218struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
1219                                struct ib_dm_mr_attr *attr,
1220                                struct uverbs_attr_bundle *attrs)
1221{
1222        struct mlx5_ib_dm *mdm = to_mdm(dm);
1223        struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev;
1224        u64 start_addr = mdm->dev_addr + attr->offset;
1225        int mode;
1226
1227        switch (mdm->type) {
1228        case MLX5_IB_UAPI_DM_TYPE_MEMIC:
1229                if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS)
1230                        return ERR_PTR(-EINVAL);
1231
1232                mode = MLX5_MKC_ACCESS_MODE_MEMIC;
1233                start_addr -= pci_resource_start(dev->pdev, 0);
1234                break;
1235        case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
1236        case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
1237                if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
1238                        return ERR_PTR(-EINVAL);
1239
1240                mode = MLX5_MKC_ACCESS_MODE_SW_ICM;
1241                break;
1242        default:
1243                return ERR_PTR(-EINVAL);
1244        }
1245
1246        return mlx5_ib_get_dm_mr(pd, start_addr, attr->length,
1247                                 attr->access_flags, mode);
1248}
1249
1250struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
1251                                  u64 virt_addr, int access_flags,
1252                                  struct ib_udata *udata)
1253{
1254        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1255        struct mlx5_ib_mr *mr = NULL;
1256        bool use_umr;
1257        struct ib_umem *umem;
1258        int page_shift;
1259        int npages;
1260        int ncont;
1261        int order;
1262        int err;
1263
1264        if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
1265                return ERR_PTR(-EOPNOTSUPP);
1266
1267        mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
1268                    start, virt_addr, length, access_flags);
1269
1270        if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && !start &&
1271            length == U64_MAX) {
1272                if (!(access_flags & IB_ACCESS_ON_DEMAND) ||
1273                    !(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
1274                        return ERR_PTR(-EINVAL);
1275
1276                mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), udata, access_flags);
1277                if (IS_ERR(mr))
1278                        return ERR_CAST(mr);
1279                return &mr->ibmr;
1280        }
1281
1282        err = mr_umem_get(dev, udata, start, length, access_flags, &umem,
1283                          &npages, &page_shift, &ncont, &order);
1284
1285        if (err < 0)
1286                return ERR_PTR(err);
1287
1288        use_umr = mlx5_ib_can_use_umr(dev, true);
1289
1290        if (order <= mr_cache_max_order(dev) && use_umr) {
1291                mr = alloc_mr_from_cache(pd, umem, virt_addr, length, ncont,
1292                                         page_shift, order, access_flags);
1293                if (PTR_ERR(mr) == -EAGAIN) {
1294                        mlx5_ib_dbg(dev, "cache empty for order %d\n", order);
1295                        mr = NULL;
1296                }
1297        } else if (!MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) {
1298                if (access_flags & IB_ACCESS_ON_DEMAND) {
1299                        err = -EINVAL;
1300                        pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB\n");
1301                        goto error;
1302                }
1303                use_umr = false;
1304        }
1305
1306        if (!mr) {
1307                mutex_lock(&dev->slow_path_mutex);
1308                mr = reg_create(NULL, pd, virt_addr, length, umem, ncont,
1309                                page_shift, access_flags, !use_umr);
1310                mutex_unlock(&dev->slow_path_mutex);
1311        }
1312
1313        if (IS_ERR(mr)) {
1314                err = PTR_ERR(mr);
1315                goto error;
1316        }
1317
1318        mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1319
1320        mr->umem = umem;
1321        set_mr_fields(dev, mr, npages, length, access_flags);
1322
1323        if (use_umr) {
1324                int update_xlt_flags = MLX5_IB_UPD_XLT_ENABLE;
1325
1326                if (access_flags & IB_ACCESS_ON_DEMAND)
1327                        update_xlt_flags |= MLX5_IB_UPD_XLT_ZAP;
1328
1329                err = mlx5_ib_update_xlt(mr, 0, ncont, page_shift,
1330                                         update_xlt_flags);
1331
1332                if (err) {
1333                        dereg_mr(dev, mr);
1334                        return ERR_PTR(err);
1335                }
1336        }
1337
1338        if (is_odp_mr(mr)) {
1339                to_ib_umem_odp(mr->umem)->private = mr;
1340                atomic_set(&mr->num_pending_prefetch, 0);
1341        }
1342        if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
1343                smp_store_release(&mr->live, 1);
1344
1345        return &mr->ibmr;
1346error:
1347        ib_umem_release(umem);
1348        return ERR_PTR(err);
1349}
1350
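     /*
      * Release a cached mkey back to the "free" state with a UMR WQE
      * that disables the MR and resets its PD, so the mkey can be
      * reused from the cache.  Skipped (returns 0) when the device is
      * in internal error state, since no WQEs can complete then.
      */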
1351static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
1352{
1353        struct mlx5_core_dev *mdev = dev->mdev;
1354        struct mlx5_umr_wr umrwr = {};
1355
1356        if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
1357                return 0;
1358
1359        umrwr.wr.send_flags = MLX5_IB_SEND_UMR_DISABLE_MR |
1360                              MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
1361        umrwr.wr.opcode = MLX5_IB_WR_UMR;
1362        umrwr.pd = dev->umrc.pd;
1363        umrwr.mkey = mr->mmkey.key;
1364        umrwr.ignore_free_state = 1;
1365
1366        return mlx5_ib_post_send_wait(dev, &umrwr);
1367}
1368
1369static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr,
1370                     int access_flags, int flags)
1371{
1372        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1373        struct mlx5_umr_wr umrwr = {};
1374        int err;
1375
1376        umrwr.wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE;
1377
1378        umrwr.wr.opcode = MLX5_IB_WR_UMR;
1379        umrwr.mkey = mr->mmkey.key;
1380
1381        if (flags & IB_MR_REREG_PD || flags & IB_MR_REREG_ACCESS) {
1382                umrwr.pd = pd;
1383                umrwr.access_flags = access_flags;
1384                umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
1385        }
1386
1387        err = mlx5_ib_post_send_wait(dev, &umrwr);
1388
1389        return err;
1390}
1391
1392int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
1393                          u64 length, u64 virt_addr, int new_access_flags,
1394                          struct ib_pd *new_pd, struct ib_udata *udata)
1395{
1396        struct mlx5_ib_dev *dev = to_mdev(ib_mr->device);
1397        struct mlx5_ib_mr *mr = to_mmr(ib_mr);
1398        struct ib_pd *pd = (flags & IB_MR_REREG_PD) ? new_pd : ib_mr->pd;
1399        int access_flags = flags & IB_MR_REREG_ACCESS ?
1400                            new_access_flags :
1401                            mr->access_flags;
1402        int page_shift = 0;
1403        int upd_flags = 0;
1404        int npages = 0;
1405        int ncont = 0;
1406        int order = 0;
1407        u64 addr, len;
1408        int err;
1409
1410        mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
1411                    start, virt_addr, length, access_flags);
1412
1413        atomic_sub(mr->npages, &dev->mdev->priv.reg_pages);
1414
1415        if (!mr->umem)
1416                return -EINVAL;
1417
1418        if (is_odp_mr(mr))
1419                return -EOPNOTSUPP;
1420
1421        if (flags & IB_MR_REREG_TRANS) {
1422                addr = virt_addr;
1423                len = length;
1424        } else {
1425                addr = mr->umem->address;
1426                len = mr->umem->length;
1427        }
1428
1429        if (flags != IB_MR_REREG_PD) {
1430                /*
1431                 * Replace umem. This needs to be done whether or not UMR is
1432                 * used.
1433                 */
1434                flags |= IB_MR_REREG_TRANS;
1435                ib_umem_release(mr->umem);
1436                mr->umem = NULL;
1437                err = mr_umem_get(dev, udata, addr, len, access_flags,
1438                                  &mr->umem, &npages, &page_shift, &ncont,
1439                                  &order);
1440                if (err)
1441                        goto err;
1442        }
1443
1444        if (!mlx5_ib_can_use_umr(dev, true) ||
1445            (flags & IB_MR_REREG_TRANS && !use_umr_mtt_update(mr, addr, len))) {
1446                /*
1447                 * UMR can't be used - MKey needs to be replaced.
1448                 */
1449                if (mr->allocated_from_cache)
1450                        err = unreg_umr(dev, mr);
1451                else
1452                        err = destroy_mkey(dev, mr);
1453                if (err)
1454                        goto err;
1455
1456                mr = reg_create(ib_mr, pd, addr, len, mr->umem, ncont,
1457                                page_shift, access_flags, true);
1458
1459                if (IS_ERR(mr)) {
1460                        err = PTR_ERR(mr);
1461                        mr = to_mmr(ib_mr);
1462                        goto err;
1463                }
1464
1465                mr->allocated_from_cache = 0;
1466        } else {
1467                /*
1468                 * Send a UMR WQE
1469                 */
1470                mr->ibmr.pd = pd;
1471                mr->access_flags = access_flags;
1472                mr->mmkey.iova = addr;
1473                mr->mmkey.size = len;
1474                mr->mmkey.pd = to_mpd(pd)->pdn;
1475
1476                if (flags & IB_MR_REREG_TRANS) {
1477                        upd_flags = MLX5_IB_UPD_XLT_ADDR;
1478                        if (flags & IB_MR_REREG_PD)
1479                                upd_flags |= MLX5_IB_UPD_XLT_PD;
1480                        if (flags & IB_MR_REREG_ACCESS)
1481                                upd_flags |= MLX5_IB_UPD_XLT_ACCESS;
1482                        err = mlx5_ib_update_xlt(mr, 0, npages, page_shift,
1483                                                 upd_flags);
1484                } else {
1485                        err = rereg_umr(pd, mr, access_flags, flags);
1486                }
1487
1488                if (err)
1489                        goto err;
1490        }
1491
1492        set_mr_fields(dev, mr, npages, len, access_flags);
1493
1494        return 0;
1495
1496err:
1497        ib_umem_release(mr->umem);
1498        mr->umem = NULL;
1499
1500        clean_mr(dev, mr);
1501        return err;
1502}
1503
1504static int
1505mlx5_alloc_priv_descs(struct ib_device *device,
1506                      struct mlx5_ib_mr *mr,
1507                      int ndescs,
1508                      int desc_size)
1509{
1510        int size = ndescs * desc_size;
1511        int add_size;
1512        int ret;
1513
1514        add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0);
1515
1516        mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
1517        if (!mr->descs_alloc)
1518                return -ENOMEM;
1519
1520        mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);
1521
1522        mr->desc_map = dma_map_single(device->dev.parent, mr->descs,
1523                                      size, DMA_TO_DEVICE);
1524        if (dma_mapping_error(device->dev.parent, mr->desc_map)) {
1525                ret = -ENOMEM;
1526                goto err;
1527        }
1528
1529        return 0;
1530err:
1531        kfree(mr->descs_alloc);
1532
1533        return ret;
1534}
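
/*
 * Worked example (hypothetical numbers) of the alignment above: on an
 * arch where ARCH_KMALLOC_MINALIGN is 8, add_size is 2040, so kzalloc()
 * returns at least MLX5_UMR_ALIGN - 8 spare bytes and PTR_ALIGN() can
 * always round descs_alloc up to the next 2048-byte boundary, e.g. an
 * allocation at ...1010 yields descs = ...1800 with the full 'size'
 * bytes still inside the buffer.
 */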
1535
1536static void
1537mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
1538{
1539        if (mr->descs) {
1540                struct ib_device *device = mr->ibmr.device;
1541                int size = mr->max_descs * mr->desc_size;
1542
1543                dma_unmap_single(device->dev.parent, mr->desc_map,
1544                                 size, DMA_TO_DEVICE);
1545                kfree(mr->descs_alloc);
1546                mr->descs = NULL;
1547        }
1548}
1549
1550static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
1551{
1552        int allocated_from_cache = mr->allocated_from_cache;
1553
1554        if (mr->sig) {
1555                if (mlx5_core_destroy_psv(dev->mdev,
1556                                          mr->sig->psv_memory.psv_idx))
1557                        mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
1558                                     mr->sig->psv_memory.psv_idx);
1559                if (mlx5_core_destroy_psv(dev->mdev,
1560                                          mr->sig->psv_wire.psv_idx))
1561                        mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
1562                                     mr->sig->psv_wire.psv_idx);
1563                kfree(mr->sig);
1564                mr->sig = NULL;
1565        }
1566
1567        if (!allocated_from_cache) {
1568                destroy_mkey(dev, mr);
1569                mlx5_free_priv_descs(mr);
1570        }
1571}
1572
1573static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
1574{
1575        int npages = mr->npages;
1576        struct ib_umem *umem = mr->umem;
1577
1578        if (is_odp_mr(mr)) {
1579                struct ib_umem_odp *umem_odp = to_ib_umem_odp(umem);
1580
1581                /* Prevent new page faults and
1582                 * prefetch requests from succeeding
1583                 */
1584                WRITE_ONCE(mr->live, 0);
1585
1586                /* Wait for all running page-fault handlers to finish. */
1587                synchronize_srcu(&dev->mr_srcu);
1588
1589                /* dequeue pending prefetch requests for the mr */
1590                if (atomic_read(&mr->num_pending_prefetch))
1591                        flush_workqueue(system_unbound_wq);
1592                WARN_ON(atomic_read(&mr->num_pending_prefetch));
1593
1594                /* Destroy all page mappings */
1595                if (!umem_odp->is_implicit_odp)
1596                        mlx5_ib_invalidate_range(umem_odp,
1597                                                 ib_umem_start(umem_odp),
1598                                                 ib_umem_end(umem_odp));
1599                else
1600                        mlx5_ib_free_implicit_mr(mr);
1601                /*
1602                 * For ODP, release the umem before the MR so that no
1603                 * invalidation can still be in flight and looking at
1604                 * the *mr struct.
1605                 */
1606                ib_umem_odp_release(umem_odp);
1607                atomic_sub(npages, &dev->mdev->priv.reg_pages);
1608
1609                /* Avoid double-freeing the umem. */
1610                umem = NULL;
1611        }
1612
1613        clean_mr(dev, mr);
1614
1615        /*
1616         * We should unregister the DMA address from the HCA before
1617         * removing the DMA mapping.
1618         */
1619        mlx5_mr_cache_free(dev, mr);
1620        ib_umem_release(umem);
1621        if (umem)
1622                atomic_sub(npages, &dev->mdev->priv.reg_pages);
1623
1624        if (!mr->allocated_from_cache)
1625                kfree(mr);
1626}
1627
1628int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
1629{
1630        struct mlx5_ib_mr *mmr = to_mmr(ibmr);
1631
1632        if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
1633                dereg_mr(to_mdev(mmr->mtt_mr->ibmr.device), mmr->mtt_mr);
1634                dereg_mr(to_mdev(mmr->klm_mr->ibmr.device), mmr->klm_mr);
1635        }
1636
1637        dereg_mr(to_mdev(ibmr->device), mmr);
1638
1639        return 0;
1640}
1641
1642static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
1643                                   int access_mode, int page_shift)
1644{
1645        void *mkc;
1646
1647        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1648
1649        MLX5_SET(mkc, mkc, free, 1);
1650        MLX5_SET(mkc, mkc, qpn, 0xffffff);
1651        MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
1652        MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
1653        MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
1654        MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
1655        MLX5_SET(mkc, mkc, umr_en, 1);
1656        MLX5_SET(mkc, mkc, log_page_size, page_shift);
1657}
1658
1659static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
1660                                  int ndescs, int desc_size, int page_shift,
1661                                  int access_mode, u32 *in, int inlen)
1662{
1663        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1664        int err;
1665
1666        mr->access_mode = access_mode;
1667        mr->desc_size = desc_size;
1668        mr->max_descs = ndescs;
1669
1670        err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size);
1671        if (err)
1672                return err;
1673
1674        mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);
1675
1676        err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen);
1677        if (err)
1678                goto err_free_descs;
1679
1680        mr->mmkey.type = MLX5_MKEY_MR;
1681        mr->ibmr.lkey = mr->mmkey.key;
1682        mr->ibmr.rkey = mr->mmkey.key;
1683
1684        return 0;
1685
1686err_free_descs:
1687        mlx5_free_priv_descs(mr);
1688        return err;
1689}
1690
1691static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
1692                                u32 max_num_sg, u32 max_num_meta_sg,
1693                                int desc_size, int access_mode)
1694{
1695        int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1696        int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
1697        int page_shift = 0;
1698        struct mlx5_ib_mr *mr;
1699        u32 *in;
1700        int err;
1701
1702        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1703        if (!mr)
1704                return ERR_PTR(-ENOMEM);
1705
1706        mr->ibmr.pd = pd;
1707        mr->ibmr.device = pd->device;
1708
1709        in = kzalloc(inlen, GFP_KERNEL);
1710        if (!in) {
1711                err = -ENOMEM;
1712                goto err_free;
1713        }
1714
1715        if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
1716                page_shift = PAGE_SHIFT;
1717
1718        err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift,
1719                                     access_mode, in, inlen);
1720        if (err)
1721                goto err_free_in;
1722
1723        mr->umem = NULL;
1724        kfree(in);
1725
1726        return mr;
1727
1728err_free_in:
1729        kfree(in);
1730err_free:
1731        kfree(mr);
1732        return ERR_PTR(err);
1733}
1734
1735static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
1736                                    int ndescs, u32 *in, int inlen)
1737{
1738        return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt),
1739                                      PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in,
1740                                      inlen);
1741}
1742
1743static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
1744                                    int ndescs, u32 *in, int inlen)
1745{
1746        return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm),
1747                                      0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
1748}
1749
1750static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
1751                                      int max_num_sg, int max_num_meta_sg,
1752                                      u32 *in, int inlen)
1753{
1754        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1755        u32 psv_index[2];
1756        void *mkc;
1757        int err;
1758
1759        mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
1760        if (!mr->sig)
1761                return -ENOMEM;
1762
1763        /* create mem & wire PSVs */
1764        err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
1765        if (err)
1766                goto err_free_sig;
1767
1768        mr->sig->psv_memory.psv_idx = psv_index[0];
1769        mr->sig->psv_wire.psv_idx = psv_index[1];
1770
1771        mr->sig->sig_status_checked = true;
1772        mr->sig->sig_err_exists = false;
1773        /* Next UMR, Arm SIGERR */
1774        ++mr->sig->sigerr_count;
1775        mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
1776                                         sizeof(struct mlx5_klm),
1777                                         MLX5_MKC_ACCESS_MODE_KLMS);
1778        if (IS_ERR(mr->klm_mr)) {
1779                err = PTR_ERR(mr->klm_mr);
1780                goto err_destroy_psv;
1781        }
1782        mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
1783                                         sizeof(struct mlx5_mtt),
1784                                         MLX5_MKC_ACCESS_MODE_MTT);
1785        if (IS_ERR(mr->mtt_mr)) {
1786                err = PTR_ERR(mr->mtt_mr);
1787                goto err_free_klm_mr;
1788        }
1789
1790        /* Set bsf descriptors for mkey */
1791        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1792        MLX5_SET(mkc, mkc, bsf_en, 1);
1793        MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);
1794
1795        err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
1796                                     MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
1797        if (err)
1798                goto err_free_mtt_mr;
1799
1800        return 0;
1801
1802err_free_mtt_mr:
1803        dereg_mr(to_mdev(mr->mtt_mr->ibmr.device), mr->mtt_mr);
1804        mr->mtt_mr = NULL;
1805err_free_klm_mr:
1806        dereg_mr(to_mdev(mr->klm_mr->ibmr.device), mr->klm_mr);
1807        mr->klm_mr = NULL;
1808err_destroy_psv:
1809        if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
1810                mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
1811                             mr->sig->psv_memory.psv_idx);
1812        if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
1813                mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
1814                             mr->sig->psv_wire.psv_idx);
1815err_free_sig:
1816        kfree(mr->sig);
1817
1818        return err;
1819}
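
/*
 * Summary of what mlx5_alloc_integrity_descs() assembles: a signature
 * context with two PSVs (memory and wire), two internal PI MRs
 * (mr->klm_mr and mr->mtt_mr) that map the data/metadata scatterlists,
 * and the parent mkey itself, created with BSF enabled and KLM access
 * mode.
 */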
1820
1821static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
1822                                        enum ib_mr_type mr_type, u32 max_num_sg,
1823                                        u32 max_num_meta_sg)
1824{
1825        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1826        int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1827        int ndescs = ALIGN(max_num_sg, 4);
1828        struct mlx5_ib_mr *mr;
1829        u32 *in;
1830        int err;
1831
1832        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1833        if (!mr)
1834                return ERR_PTR(-ENOMEM);
1835
1836        in = kzalloc(inlen, GFP_KERNEL);
1837        if (!in) {
1838                err = -ENOMEM;
1839                goto err_free;
1840        }
1841
1842        mr->ibmr.device = pd->device;
1843        mr->umem = NULL;
1844
1845        switch (mr_type) {
1846        case IB_MR_TYPE_MEM_REG:
1847                err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
1848                break;
1849        case IB_MR_TYPE_SG_GAPS:
1850                err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
1851                break;
1852        case IB_MR_TYPE_INTEGRITY:
1853                err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
1854                                                 max_num_meta_sg, in, inlen);
1855                break;
1856        default:
1857                mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
1858                err = -EINVAL;
1859        }
1860
1861        if (err)
1862                goto err_free_in;
1863
1864        kfree(in);
1865
1866        return &mr->ibmr;
1867
1868err_free_in:
1869        kfree(in);
1870err_free:
1871        kfree(mr);
1872        return ERR_PTR(err);
1873}
1874
1875struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
1876                               u32 max_num_sg, struct ib_udata *udata)
1877{
1878        return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
1879}
1880
1881struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
1882                                         u32 max_num_sg, u32 max_num_meta_sg)
1883{
1884        return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
1885                                  max_num_meta_sg);
1886}
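
/*
 * Illustrative sketch (not part of the driver): a ULP obtains an
 * integrity MR through the core verbs helper rather than calling the
 * functions above directly; the SGE counts are hypothetical.
 *
 *        struct ib_mr *mr;
 *
 *        mr = ib_alloc_mr_integrity(pd, 16, 16);
 *        if (IS_ERR(mr))
 *                return PTR_ERR(mr);
 */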
1887
1888struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
1889                               struct ib_udata *udata)
1890{
1891        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1892        int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1893        struct mlx5_ib_mw *mw = NULL;
1894        u32 *in = NULL;
1895        void *mkc;
1896        int ndescs;
1897        int err;
1898        struct mlx5_ib_alloc_mw req = {};
1899        struct {
1900                __u32   comp_mask;
1901                __u32   response_length;
1902        } resp = {};
1903
1904        err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
1905        if (err)
1906                return ERR_PTR(err);
1907
1908        if (req.comp_mask || req.reserved1 || req.reserved2)
1909                return ERR_PTR(-EOPNOTSUPP);
1910
1911        if (udata->inlen > sizeof(req) &&
1912            !ib_is_udata_cleared(udata, sizeof(req),
1913                                 udata->inlen - sizeof(req)))
1914                return ERR_PTR(-EOPNOTSUPP);
1915
1916        ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);
1917
1918        mw = kzalloc(sizeof(*mw), GFP_KERNEL);
1919        in = kzalloc(inlen, GFP_KERNEL);
1920        if (!mw || !in) {
1921                err = -ENOMEM;
1922                goto free;
1923        }
1924
1925        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1926
1927        MLX5_SET(mkc, mkc, free, 1);
1928        MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
1929        MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
1930        MLX5_SET(mkc, mkc, umr_en, 1);
1931        MLX5_SET(mkc, mkc, lr, 1);
1932        MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
1933        MLX5_SET(mkc, mkc, en_rinval, !!((type == IB_MW_TYPE_2)));
1934        MLX5_SET(mkc, mkc, qpn, 0xffffff);
1935
1936        err = mlx5_core_create_mkey(dev->mdev, &mw->mmkey, in, inlen);
1937        if (err)
1938                goto free;
1939
1940        mw->mmkey.type = MLX5_MKEY_MW;
1941        mw->ibmw.rkey = mw->mmkey.key;
1942        mw->ndescs = ndescs;
1943
1944        resp.response_length = min(offsetof(typeof(resp), response_length) +
1945                                   sizeof(resp.response_length), udata->outlen);
1946        if (resp.response_length) {
1947                err = ib_copy_to_udata(udata, &resp, resp.response_length);
1948                if (err) {
1949                        mlx5_core_destroy_mkey(dev->mdev, &mw->mmkey);
1950                        goto free;
1951                }
1952        }
1953
1954        kfree(in);
1955        return &mw->ibmw;
1956
1957free:
1958        kfree(mw);
1959        kfree(in);
1960        return ERR_PTR(err);
1961}
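
/*
 * Illustrative sketch (not part of the driver): user space reaches this
 * entry point through libibverbs, which may pass a requested num_klms in
 * the udata above; if it does not, ndescs defaults to 4. Error handling
 * is omitted.
 *
 *        struct ibv_mw *mw = ibv_alloc_mw(pd, IBV_MW_TYPE_2);
 *        ...
 *        ibv_dealloc_mw(mw);
 */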
1962
1963int mlx5_ib_dealloc_mw(struct ib_mw *mw)
1964{
1965        struct mlx5_ib_dev *dev = to_mdev(mw->device);
1966        struct mlx5_ib_mw *mmw = to_mmw(mw);
1967        int err;
1968
1969        if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
1970                xa_erase_irq(&dev->mdev->priv.mkey_table,
1971                             mlx5_base_mkey(mmw->mmkey.key));
1972                /*
1973                 * pagefault_single_data_segment() may be accessing mmw under
1974                 * SRCU if the user bound an ODP MR to this MW.
1975                 */
1976                synchronize_srcu(&dev->mr_srcu);
1977        }
1978
1979        err = mlx5_core_destroy_mkey(dev->mdev, &mmw->mmkey);
1980        if (err)
1981                return err;
1982        kfree(mmw);
1983        return 0;
1984}
1985
1986int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
1987                            struct ib_mr_status *mr_status)
1988{
1989        struct mlx5_ib_mr *mmr = to_mmr(ibmr);
1990        int ret = 0;
1991
1992        if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
1993                pr_err("Invalid status check mask\n");
1994                ret = -EINVAL;
1995                goto done;
1996        }
1997
1998        mr_status->fail_status = 0;
1999        if (check_mask & IB_MR_CHECK_SIG_STATUS) {
2000                if (!mmr->sig) {
2001                        ret = -EINVAL;
2002                        pr_err("signature status check requested on a non-signature enabled MR\n");
2003                        goto done;
2004                }
2005
2006                mmr->sig->sig_status_checked = true;
2007                if (!mmr->sig->sig_err_exists)
2008                        goto done;
2009
2010                if (ibmr->lkey == mmr->sig->err_item.key)
2011                        memcpy(&mr_status->sig_err, &mmr->sig->err_item,
2012                               sizeof(mr_status->sig_err));
2013                else {
2014                        mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
2015                        mr_status->sig_err.sig_err_offset = 0;
2016                        mr_status->sig_err.key = mmr->sig->err_item.key;
2017                }
2018
2019                mmr->sig->sig_err_exists = false;
2020                mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
2021        }
2022
2023done:
2024        return ret;
2025}
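
/*
 * Illustrative sketch (not part of the driver): after an integrity
 * transfer a ULP queries the signature status through the core helper;
 * 'mr' and the error handling are hypothetical.
 *
 *        struct ib_mr_status mr_status;
 *
 *        if (!ib_check_mr_status(mr, IB_MR_CHECK_SIG_STATUS, &mr_status) &&
 *            (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS))
 *                pr_err("sig error %d at offset %llu\n",
 *                       mr_status.sig_err.err_type,
 *                       (unsigned long long)mr_status.sig_err.sig_err_offset);
 */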
2026
2027static int
2028mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2029                        int data_sg_nents, unsigned int *data_sg_offset,
2030                        struct scatterlist *meta_sg, int meta_sg_nents,
2031                        unsigned int *meta_sg_offset)
2032{
2033        struct mlx5_ib_mr *mr = to_mmr(ibmr);
2034        unsigned int sg_offset = 0;
2035        int n = 0;
2036
2037        mr->meta_length = 0;
2038        if (data_sg_nents == 1) {
2039                n++;
2040                mr->ndescs = 1;
2041                if (data_sg_offset)
2042                        sg_offset = *data_sg_offset;
2043                mr->data_length = sg_dma_len(data_sg) - sg_offset;
2044                mr->data_iova = sg_dma_address(data_sg) + sg_offset;
2045                if (meta_sg_nents == 1) {
2046                        n++;
2047                        mr->meta_ndescs = 1;
2048                        if (meta_sg_offset)
2049                                sg_offset = *meta_sg_offset;
2050                        else
2051                                sg_offset = 0;
2052                        mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
2053                        mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
2054                }
2055                ibmr->length = mr->data_length + mr->meta_length;
2056        }
2057
2058        return n;
2059}
2060
2061static int
2062mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
2063                   struct scatterlist *sgl,
2064                   unsigned short sg_nents,
2065                   unsigned int *sg_offset_p,
2066                   struct scatterlist *meta_sgl,
2067                   unsigned short meta_sg_nents,
2068                   unsigned int *meta_sg_offset_p)
2069{
2070        struct scatterlist *sg = sgl;
2071        struct mlx5_klm *klms = mr->descs;
2072        unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
2073        u32 lkey = mr->ibmr.pd->local_dma_lkey;
2074        int i, j = 0;
2075
2076        mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
2077        mr->ibmr.length = 0;
2078
2079        for_each_sg(sgl, sg, sg_nents, i) {
2080                if (unlikely(i >= mr->max_descs))
2081                        break;
2082                klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
2083                klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
2084                klms[i].key = cpu_to_be32(lkey);
2085                mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2086
2087                sg_offset = 0;
2088        }
2089
2090        if (sg_offset_p)
2091                *sg_offset_p = sg_offset;
2092
2093        mr->ndescs = i;
2094        mr->data_length = mr->ibmr.length;
2095
2096        if (meta_sg_nents) {
2097                sg = meta_sgl;
2098                sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
2099                for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
2100                        if (unlikely(i + j >= mr->max_descs))
2101                                break;
2102                        klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
2103                                                     sg_offset);
2104                        klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
2105                                                         sg_offset);
2106                        klms[i + j].key = cpu_to_be32(lkey);
2107                        mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2108
2109                        sg_offset = 0;
2110                }
2111                if (meta_sg_offset_p)
2112                        *meta_sg_offset_p = sg_offset;
2113
2114                mr->meta_ndescs = j;
2115                mr->meta_length = mr->ibmr.length - mr->data_length;
2116        }
2117
2118        return i + j;
2119}
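
/*
 * Worked example (hypothetical lengths) of the KLM layout built above:
 * for a data SG list of two entries of 0x1000 and 0x200 bytes followed
 * by a single 8-byte metadata entry, descs[] ends up with three KLMs,
 * all carrying the PD's local_dma_lkey:
 *
 *        klms[0] = { .va = data0, .bcount = 0x1000 }
 *        klms[1] = { .va = data1, .bcount = 0x200  }
 *        klms[2] = { .va = meta0, .bcount = 8      }
 *
 * so ndescs = 2, meta_ndescs = 1, data_length = 0x1200, meta_length = 8.
 */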
2120
2121static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
2122{
2123        struct mlx5_ib_mr *mr = to_mmr(ibmr);
2124        __be64 *descs;
2125
2126        if (unlikely(mr->ndescs == mr->max_descs))
2127                return -ENOMEM;
2128
2129        descs = mr->descs;
2130        descs[mr->ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2131
2132        return 0;
2133}
2134
2135static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
2136{
2137        struct mlx5_ib_mr *mr = to_mmr(ibmr);
2138        __be64 *descs;
2139
2140        if (unlikely(mr->ndescs + mr->meta_ndescs == mr->max_descs))
2141                return -ENOMEM;
2142
2143        descs = mr->descs;
2144        descs[mr->ndescs + mr->meta_ndescs++] =
2145                cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2146
2147        return 0;
2148}
2149
2150static int
2151mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2152                         int data_sg_nents, unsigned int *data_sg_offset,
2153                         struct scatterlist *meta_sg, int meta_sg_nents,
2154                         unsigned int *meta_sg_offset)
2155{
2156        struct mlx5_ib_mr *mr = to_mmr(ibmr);
2157        struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
2158        int n;
2159
2160        pi_mr->ndescs = 0;
2161        pi_mr->meta_ndescs = 0;
2162        pi_mr->meta_length = 0;
2163
2164        ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2165                                   pi_mr->desc_size * pi_mr->max_descs,
2166                                   DMA_TO_DEVICE);
2167
2168        pi_mr->ibmr.page_size = ibmr->page_size;
2169        n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
2170                           mlx5_set_page);
2171        if (n != data_sg_nents)
2172                return n;
2173
2174        pi_mr->data_iova = pi_mr->ibmr.iova;
2175        pi_mr->data_length = pi_mr->ibmr.length;
2176        pi_mr->ibmr.length = pi_mr->data_length;
2177        ibmr->length = pi_mr->data_length;
2178
2179        if (meta_sg_nents) {
2180                u64 page_mask = ~((u64)ibmr->page_size - 1);
2181                u64 iova = pi_mr->data_iova;
2182
2183                n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
2184                                    meta_sg_offset, mlx5_set_page_pi);
2185
2186                pi_mr->meta_length = pi_mr->ibmr.length;
2187                /*
2188                 * The PI address for the HW follows the data in the same
2189                 * IOVA space: it equals the (aligned) first data page
2190                 * address + the size of the data pages + the metadata
2191                 * offset within its first metadata page.
2192                 */
2193                pi_mr->pi_iova = (iova & page_mask) +
2194                                 pi_mr->ndescs * ibmr->page_size +
2195                                 (pi_mr->ibmr.iova & ~page_mask);
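                /*
                 * Worked example with hypothetical numbers: page_size =
                 * 0x1000, data iova = 0x10000200, ndescs = 3 data pages
                 * and a metadata iova of 0x20000040 give
                 * pi_iova = 0x10000000 + 3 * 0x1000 + 0x40 = 0x10003040.
                 */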
2196                /*
2197                 * In order to use one MTT MR for data and metadata, we also
2198                 * register the gap between the end of the data and the start
2199                 * of the metadata (the sig MR guarantees that the HW accesses
2200                 * only the right addresses). This mapping is safe because an
2201                 * internal mkey is used for the registration.
2202                 */
2203                pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
2204                pi_mr->ibmr.iova = iova;
2205                ibmr->length += pi_mr->meta_length;
2206        }
2207
2208        ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2209                                      pi_mr->desc_size * pi_mr->max_descs,
2210                                      DMA_TO_DEVICE);
2211
2212        return n;
2213}
2214
2215static int
2216mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2217                         int data_sg_nents, unsigned int *data_sg_offset,
2218                         struct scatterlist *meta_sg, int meta_sg_nents,
2219                         unsigned int *meta_sg_offset)
2220{
2221        struct mlx5_ib_mr *mr = to_mmr(ibmr);
2222        struct mlx5_ib_mr *pi_mr = mr->klm_mr;
2223        int n;
2224
2225        pi_mr->ndescs = 0;
2226        pi_mr->meta_ndescs = 0;
2227        pi_mr->meta_length = 0;
2228
2229        ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2230                                   pi_mr->desc_size * pi_mr->max_descs,
2231                                   DMA_TO_DEVICE);
2232
2233        n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
2234                               meta_sg, meta_sg_nents, meta_sg_offset);
2235
2236        ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2237                                      pi_mr->desc_size * pi_mr->max_descs,
2238                                      DMA_TO_DEVICE);
2239
2240        /* This is a zero-based memory region */
2241        pi_mr->data_iova = 0;
2242        pi_mr->ibmr.iova = 0;
2243        pi_mr->pi_iova = pi_mr->data_length;
2244        ibmr->length = pi_mr->ibmr.length;
2245
2246        return n;
2247}
2248
2249int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2250                         int data_sg_nents, unsigned int *data_sg_offset,
2251                         struct scatterlist *meta_sg, int meta_sg_nents,
2252                         unsigned int *meta_sg_offset)
2253{
2254        struct mlx5_ib_mr *mr = to_mmr(ibmr);
2255        struct mlx5_ib_mr *pi_mr = NULL;
2256        int n;
2257
2258        WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);
2259
2260        mr->ndescs = 0;
2261        mr->data_length = 0;
2262        mr->data_iova = 0;
2263        mr->meta_ndescs = 0;
2264        mr->pi_iova = 0;
2265        /*
2266         * As a performance optimization, avoid a UMR operation to register
2267         * the data/metadata buffers when possible. First try to map the sg
2268         * lists to PA descriptors with local_dma_lkey and fall back to UMR
2269         * only in case of a failure.
2270         */
2271        n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2272                                    data_sg_offset, meta_sg, meta_sg_nents,
2273                                    meta_sg_offset);
2274        if (n == data_sg_nents + meta_sg_nents)
2275                goto out;
2276        /*
2277         * As a performance optimization, avoid mapping the sg lists to KLM
2278         * descriptors when possible. First try to map the sg lists to MTT
2279         * descriptors and fall back to KLM only in case of a failure. The
2280         * HW works more efficiently with MTT descriptors (especially under
2281         * high load), so use KLM (indirect access) only when it is
2282         * mandatory.
2283         */
2284        pi_mr = mr->mtt_mr;
2285        n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2286                                     data_sg_offset, meta_sg, meta_sg_nents,
2287                                     meta_sg_offset);
2288        if (n == data_sg_nents + meta_sg_nents)
2289                goto out;
2290
2291        pi_mr = mr->klm_mr;
2292        n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2293                                     data_sg_offset, meta_sg, meta_sg_nents,
2294                                     meta_sg_offset);
2295        if (unlikely(n != data_sg_nents + meta_sg_nents))
2296                return -ENOMEM;
2297
2298out:
2299        /* This is a zero-based memory region */
2300        ibmr->iova = 0;
2301        mr->pi_mr = pi_mr;
2302        if (pi_mr)
2303                ibmr->sig_attrs->meta_length = pi_mr->meta_length;
2304        else
2305                ibmr->sig_attrs->meta_length = mr->meta_length;
2306
2307        return 0;
2308}
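
/*
 * Illustrative sketch (not part of the driver): a ULP drives the
 * PA -> MTT -> KLM fallback above through the core helper, which returns
 * 0 on success here; everything except ib_map_mr_sg_pi() is hypothetical.
 *
 *        ret = ib_map_mr_sg_pi(mr, data_sg, data_sg_nents, NULL,
 *                              meta_sg, meta_sg_nents, NULL, SZ_4K);
 *        if (unlikely(ret))
 *                goto map_err;
 */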
2309
2310int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
2311                      unsigned int *sg_offset)
2312{
2313        struct mlx5_ib_mr *mr = to_mmr(ibmr);
2314        int n;
2315
2316        mr->ndescs = 0;
2317
2318        ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
2319                                   mr->desc_size * mr->max_descs,
2320                                   DMA_TO_DEVICE);
2321
2322        if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
2323                n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
2324                                       NULL);
2325        else
2326                n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
2327                                mlx5_set_page);
2328
2329        ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
2330                                      mr->desc_size * mr->max_descs,
2331                                      DMA_TO_DEVICE);
2332
2333        return n;
2334}
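
/*
 * Illustrative sketch (not part of the driver): fast-register users call
 * this through ib_map_mr_sg(), which also sets the MR page size; names
 * other than the core API are hypothetical.
 *
 *        int n = ib_map_mr_sg(mr, sgl, sg_nents, NULL, PAGE_SIZE);
 *
 *        if (n != sg_nents)
 *                return n < 0 ? n : -EINVAL;
 */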
2335