linux/drivers/infiniband/hw/mlx5/mr.c
   1/*
   2 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
   3 *
   4 * This software is available to you under a choice of one of two
   5 * licenses.  You may choose to be licensed under the terms of the GNU
   6 * General Public License (GPL) Version 2, available from the file
   7 * COPYING in the main directory of this source tree, or the
   8 * OpenIB.org BSD license below:
   9 *
  10 *     Redistribution and use in source and binary forms, with or
  11 *     without modification, are permitted provided that the following
  12 *     conditions are met:
  13 *
  14 *      - Redistributions of source code must retain the above
  15 *        copyright notice, this list of conditions and the following
  16 *        disclaimer.
  17 *
  18 *      - Redistributions in binary form must reproduce the above
  19 *        copyright notice, this list of conditions and the following
  20 *        disclaimer in the documentation and/or other materials
  21 *        provided with the distribution.
  22 *
  23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  30 * SOFTWARE.
  31 */
  32
  33
  34#include <linux/kref.h>
  35#include <linux/random.h>
  36#include <linux/debugfs.h>
  37#include <linux/export.h>
  38#include <linux/delay.h>
  39#include <rdma/ib_umem.h>
  40#include <rdma/ib_umem_odp.h>
  41#include <rdma/ib_verbs.h>
  42#include "mlx5_ib.h"
  43
  44enum {
  45        MAX_PENDING_REG_MR = 8,
  46};
  47
  48#define MLX5_UMR_ALIGN 2048
  49
  50static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
  51static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
  52static int mr_cache_max_order(struct mlx5_ib_dev *dev);
  53
  54static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
  55{
  56        return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled);
  57}
  58
  59static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
  60{
  61        WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));
  62
  63        return mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
  64}
  65
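    /* Map an MR page order to its index in the MR cache entry array. */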
  66static int order2idx(struct mlx5_ib_dev *dev, int order)
  67{
  68        struct mlx5_mr_cache *cache = &dev->cache;
  69
  70        if (order < cache->ent[0].order)
  71                return 0;
  72        else
  73                return order - cache->ent[0].order;
  74}
  75
  76static bool use_umr_mtt_update(struct mlx5_ib_mr *mr, u64 start, u64 length)
  77{
  78        return ((u64)1 << mr->order) * MLX5_ADAPTER_PAGE_SIZE >=
  79                length + (start & (MLX5_ADAPTER_PAGE_SIZE - 1));
  80}
  81
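    /*
     * Completion callback for the asynchronous mkey creation issued by
     * add_keys(); on success the new MR is appended to its cache entry.
     */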
  82static void reg_mr_callback(int status, struct mlx5_async_work *context)
  83{
  84        struct mlx5_ib_mr *mr =
  85                container_of(context, struct mlx5_ib_mr, cb_work);
  86        struct mlx5_ib_dev *dev = mr->dev;
  87        struct mlx5_mr_cache *cache = &dev->cache;
  88        int c = order2idx(dev, mr->order);
  89        struct mlx5_cache_ent *ent = &cache->ent[c];
  90        u8 key;
  91        unsigned long flags;
  92
  93        spin_lock_irqsave(&ent->lock, flags);
  94        ent->pending--;
  95        spin_unlock_irqrestore(&ent->lock, flags);
  96        if (status) {
  97                mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
  98                kfree(mr);
  99                dev->fill_delay = 1;
 100                mod_timer(&dev->delay_timer, jiffies + HZ);
 101                return;
 102        }
 103
 104        mr->mmkey.type = MLX5_MKEY_MR;
 105        spin_lock_irqsave(&dev->mdev->priv.mkey_lock, flags);
 106        key = dev->mdev->priv.mkey_key++;
 107        spin_unlock_irqrestore(&dev->mdev->priv.mkey_lock, flags);
 108        mr->mmkey.key = mlx5_idx_to_mkey(MLX5_GET(create_mkey_out, mr->out, mkey_index)) | key;
 109
 110        cache->last_add = jiffies;
 111
 112        spin_lock_irqsave(&ent->lock, flags);
 113        list_add_tail(&mr->list, &ent->head);
 114        ent->cur++;
 115        ent->size++;
 116        spin_unlock_irqrestore(&ent->lock, flags);
 117
 118        if (!completion_done(&ent->compl))
 119                complete(&ent->compl);
 120}
 121
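    /*
     * Asynchronously create up to @num mkeys for cache entry @c, limited
     * by MAX_PENDING_REG_MR outstanding requests per entry.
     */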
 122static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
 123{
 124        struct mlx5_mr_cache *cache = &dev->cache;
 125        struct mlx5_cache_ent *ent = &cache->ent[c];
 126        int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
 127        struct mlx5_ib_mr *mr;
 128        void *mkc;
 129        u32 *in;
 130        int err = 0;
 131        int i;
 132
 133        in = kzalloc(inlen, GFP_KERNEL);
 134        if (!in)
 135                return -ENOMEM;
 136
 137        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 138        for (i = 0; i < num; i++) {
 139                if (ent->pending >= MAX_PENDING_REG_MR) {
 140                        err = -EAGAIN;
 141                        break;
 142                }
 143
 144                mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 145                if (!mr) {
 146                        err = -ENOMEM;
 147                        break;
 148                }
 149                mr->order = ent->order;
 150                mr->allocated_from_cache = true;
 151                mr->dev = dev;
 152
 153                MLX5_SET(mkc, mkc, free, 1);
 154                MLX5_SET(mkc, mkc, umr_en, 1);
 155                MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3);
 156                MLX5_SET(mkc, mkc, access_mode_4_2,
 157                         (ent->access_mode >> 2) & 0x7);
 158
 159                MLX5_SET(mkc, mkc, qpn, 0xffffff);
 160                MLX5_SET(mkc, mkc, translations_octword_size, ent->xlt);
 161                MLX5_SET(mkc, mkc, log_page_size, ent->page);
 162
 163                spin_lock_irq(&ent->lock);
 164                ent->pending++;
 165                spin_unlock_irq(&ent->lock);
 166                err = mlx5_core_create_mkey_cb(dev->mdev, &mr->mmkey,
 167                                               &dev->async_ctx, in, inlen,
 168                                               mr->out, sizeof(mr->out),
 169                                               reg_mr_callback, &mr->cb_work);
 170                if (err) {
 171                        spin_lock_irq(&ent->lock);
 172                        ent->pending--;
 173                        spin_unlock_irq(&ent->lock);
 174                        mlx5_ib_warn(dev, "create mkey failed %d\n", err);
 175                        kfree(mr);
 176                        break;
 177                }
 178        }
 179
 180        kfree(in);
 181        return err;
 182}
 183
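    /* Destroy up to @num mkeys taken from the head of cache entry @c. */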
 184static void remove_keys(struct mlx5_ib_dev *dev, int c, int num)
 185{
 186        struct mlx5_mr_cache *cache = &dev->cache;
 187        struct mlx5_cache_ent *ent = &cache->ent[c];
 188        struct mlx5_ib_mr *tmp_mr;
 189        struct mlx5_ib_mr *mr;
 190        LIST_HEAD(del_list);
 191        int i;
 192
 193        for (i = 0; i < num; i++) {
 194                spin_lock_irq(&ent->lock);
 195                if (list_empty(&ent->head)) {
 196                        spin_unlock_irq(&ent->lock);
 197                        break;
 198                }
 199                mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
 200                list_move(&mr->list, &del_list);
 201                ent->cur--;
 202                ent->size--;
 203                spin_unlock_irq(&ent->lock);
 204                mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
 205        }
 206
 207        list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
 208                list_del(&mr->list);
 209                kfree(mr);
 210        }
 211}
 212
 213static ssize_t size_write(struct file *filp, const char __user *buf,
 214                          size_t count, loff_t *pos)
 215{
 216        struct mlx5_cache_ent *ent = filp->private_data;
 217        struct mlx5_ib_dev *dev = ent->dev;
 218        char lbuf[20] = {0};
 219        u32 var;
 220        int err;
 221        int c;
 222
 223        count = min(count, sizeof(lbuf) - 1);
 224        if (copy_from_user(lbuf, buf, count))
 225                return -EFAULT;
 226
 227        c = order2idx(dev, ent->order);
 228
 229        if (sscanf(lbuf, "%u", &var) != 1)
 230                return -EINVAL;
 231
 232        if (var < ent->limit)
 233                return -EINVAL;
 234
 235        if (var > ent->size) {
 236                do {
 237                        err = add_keys(dev, c, var - ent->size);
 238                        if (err && err != -EAGAIN)
 239                                return err;
 240
 241                        usleep_range(3000, 5000);
 242                } while (err);
 243        } else if (var < ent->size) {
 244                remove_keys(dev, c, ent->size - var);
 245        }
 246
 247        return count;
 248}
 249
 250static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
 251                         loff_t *pos)
 252{
 253        struct mlx5_cache_ent *ent = filp->private_data;
 254        char lbuf[20];
 255        int err;
 256
 257        err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->size);
 258        if (err < 0)
 259                return err;
 260
 261        return simple_read_from_buffer(buf, count, pos, lbuf, err);
 262}
 263
 264static const struct file_operations size_fops = {
 265        .owner  = THIS_MODULE,
 266        .open   = simple_open,
 267        .write  = size_write,
 268        .read   = size_read,
 269};
 270
 271static ssize_t limit_write(struct file *filp, const char __user *buf,
 272                           size_t count, loff_t *pos)
 273{
 274        struct mlx5_cache_ent *ent = filp->private_data;
 275        struct mlx5_ib_dev *dev = ent->dev;
 276        char lbuf[20] = {0};
 277        u32 var;
 278        int err;
 279        int c;
 280
 281        count = min(count, sizeof(lbuf) - 1);
 282        if (copy_from_user(lbuf, buf, count))
 283                return -EFAULT;
 284
 285        c = order2idx(dev, ent->order);
 286
 287        if (sscanf(lbuf, "%u", &var) != 1)
 288                return -EINVAL;
 289
 290        if (var > ent->size)
 291                return -EINVAL;
 292
 293        ent->limit = var;
 294
 295        if (ent->cur < ent->limit) {
 296                err = add_keys(dev, c, 2 * ent->limit - ent->cur);
 297                if (err)
 298                        return err;
 299        }
 300
 301        return count;
 302}
 303
 304static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
 305                          loff_t *pos)
 306{
 307        struct mlx5_cache_ent *ent = filp->private_data;
 308        char lbuf[20];
 309        int err;
 310
 311        err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
 312        if (err < 0)
 313                return err;
 314
 315        return simple_read_from_buffer(buf, count, pos, lbuf, err);
 316}
 317
 318static const struct file_operations limit_fops = {
 319        .owner  = THIS_MODULE,
 320        .open   = simple_open,
 321        .write  = limit_write,
 322        .read   = limit_read,
 323};
 324
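    /* Return 1 if any cache entry is still below its fill limit. */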
 325static int someone_adding(struct mlx5_mr_cache *cache)
 326{
 327        int i;
 328
 329        for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
 330                if (cache->ent[i].cur < cache->ent[i].limit)
 331                        return 1;
 332        }
 333
 334        return 0;
 335}
 336
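    /*
     * Background cache maintenance: refill an entry while it is below
     * 2 * limit, or lazily shrink it once it grows beyond that mark.
     */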
 337static void __cache_work_func(struct mlx5_cache_ent *ent)
 338{
 339        struct mlx5_ib_dev *dev = ent->dev;
 340        struct mlx5_mr_cache *cache = &dev->cache;
 341        int i = order2idx(dev, ent->order);
 342        int err;
 343
 344        if (cache->stopped)
 345                return;
 346
 347        ent = &dev->cache.ent[i];
 348        if (ent->cur < 2 * ent->limit && !dev->fill_delay) {
 349                err = add_keys(dev, i, 1);
 350                if (ent->cur < 2 * ent->limit) {
 351                        if (err == -EAGAIN) {
 352                                mlx5_ib_dbg(dev, "returned eagain, order %d\n",
 353                                            i + 2);
 354                                queue_delayed_work(cache->wq, &ent->dwork,
 355                                                   msecs_to_jiffies(3));
 356                        } else if (err) {
 357                                mlx5_ib_warn(dev, "command failed order %d, err %d\n",
 358                                             i + 2, err);
 359                                queue_delayed_work(cache->wq, &ent->dwork,
 360                                                   msecs_to_jiffies(1000));
 361                        } else {
 362                                queue_work(cache->wq, &ent->work);
 363                        }
 364                }
 365        } else if (ent->cur > 2 * ent->limit) {
  366                 /*
  367                  * The remove_keys() logic is performed as a garbage
  368                  * collection task. Such a task is intended to run when no
  369                  * other active processes are running.
  370                  *
  371                  * need_resched() returns TRUE if there are user tasks to
  372                  * be activated in the near future.
  373                  *
  374                  * In that case, we don't execute remove_keys() and
  375                  * postpone the garbage collection work to the next cycle
  376                  * in order to free CPU resources to other tasks.
  377                  */
 378                if (!need_resched() && !someone_adding(cache) &&
 379                    time_after(jiffies, cache->last_add + 300 * HZ)) {
 380                        remove_keys(dev, i, 1);
 381                        if (ent->cur > ent->limit)
 382                                queue_work(cache->wq, &ent->work);
 383                } else {
 384                        queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
 385                }
 386        }
 387}
 388
 389static void delayed_cache_work_func(struct work_struct *work)
 390{
 391        struct mlx5_cache_ent *ent;
 392
 393        ent = container_of(work, struct mlx5_cache_ent, dwork.work);
 394        __cache_work_func(ent);
 395}
 396
 397static void cache_work_func(struct work_struct *work)
 398{
 399        struct mlx5_cache_ent *ent;
 400
 401        ent = container_of(work, struct mlx5_cache_ent, work);
 402        __cache_work_func(ent);
 403}
 404
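    /*
     * Allocate an MR from a specific cache entry, kicking off an async
     * mkey creation and waiting for it if the entry is currently empty.
     */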
 405struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry)
 406{
 407        struct mlx5_mr_cache *cache = &dev->cache;
 408        struct mlx5_cache_ent *ent;
 409        struct mlx5_ib_mr *mr;
 410        int err;
 411
 412        if (entry < 0 || entry >= MAX_MR_CACHE_ENTRIES) {
 413                mlx5_ib_err(dev, "cache entry %d is out of range\n", entry);
 414                return ERR_PTR(-EINVAL);
 415        }
 416
 417        ent = &cache->ent[entry];
 418        while (1) {
 419                spin_lock_irq(&ent->lock);
 420                if (list_empty(&ent->head)) {
 421                        spin_unlock_irq(&ent->lock);
 422
 423                        err = add_keys(dev, entry, 1);
 424                        if (err && err != -EAGAIN)
 425                                return ERR_PTR(err);
 426
 427                        wait_for_completion(&ent->compl);
 428                } else {
 429                        mr = list_first_entry(&ent->head, struct mlx5_ib_mr,
 430                                              list);
 431                        list_del(&mr->list);
 432                        ent->cur--;
 433                        spin_unlock_irq(&ent->lock);
 434                        if (ent->cur < ent->limit)
 435                                queue_work(cache->wq, &ent->work);
 436                        return mr;
 437                }
 438        }
 439}
 440
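    /*
     * Take an MR of at least @order from the cache, falling back to
     * higher-order entries when the exact one is empty.
     */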
 441static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order)
 442{
 443        struct mlx5_mr_cache *cache = &dev->cache;
 444        struct mlx5_ib_mr *mr = NULL;
 445        struct mlx5_cache_ent *ent;
 446        int last_umr_cache_entry;
 447        int c;
 448        int i;
 449
 450        c = order2idx(dev, order);
 451        last_umr_cache_entry = order2idx(dev, mr_cache_max_order(dev));
 452        if (c < 0 || c > last_umr_cache_entry) {
 453                mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c);
 454                return NULL;
 455        }
 456
 457        for (i = c; i <= last_umr_cache_entry; i++) {
 458                ent = &cache->ent[i];
 459
 460                mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i);
 461
 462                spin_lock_irq(&ent->lock);
 463                if (!list_empty(&ent->head)) {
 464                        mr = list_first_entry(&ent->head, struct mlx5_ib_mr,
 465                                              list);
 466                        list_del(&mr->list);
 467                        ent->cur--;
 468                        spin_unlock_irq(&ent->lock);
 469                        if (ent->cur < ent->limit)
 470                                queue_work(cache->wq, &ent->work);
 471                        break;
 472                }
 473                spin_unlock_irq(&ent->lock);
 474
 475                queue_work(cache->wq, &ent->work);
 476        }
 477
 478        if (!mr)
 479                cache->ent[c].miss++;
 480
 481        return mr;
 482}
 483
 484void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 485{
 486        struct mlx5_mr_cache *cache = &dev->cache;
 487        struct mlx5_cache_ent *ent;
 488        int shrink = 0;
 489        int c;
 490
 491        if (!mr->allocated_from_cache)
 492                return;
 493
 494        c = order2idx(dev, mr->order);
 495        WARN_ON(c < 0 || c >= MAX_MR_CACHE_ENTRIES);
 496
 497        if (mlx5_mr_cache_invalidate(mr)) {
 498                mr->allocated_from_cache = false;
 499                destroy_mkey(dev, mr);
 500                ent = &cache->ent[c];
 501                if (ent->cur < ent->limit)
 502                        queue_work(cache->wq, &ent->work);
 503                return;
 504        }
 505
 506        ent = &cache->ent[c];
 507        spin_lock_irq(&ent->lock);
 508        list_add_tail(&mr->list, &ent->head);
 509        ent->cur++;
 510        if (ent->cur > 2 * ent->limit)
 511                shrink = 1;
 512        spin_unlock_irq(&ent->lock);
 513
 514        if (shrink)
 515                queue_work(cache->wq, &ent->work);
 516}
 517
 518static void clean_keys(struct mlx5_ib_dev *dev, int c)
 519{
 520        struct mlx5_mr_cache *cache = &dev->cache;
 521        struct mlx5_cache_ent *ent = &cache->ent[c];
 522        struct mlx5_ib_mr *tmp_mr;
 523        struct mlx5_ib_mr *mr;
 524        LIST_HEAD(del_list);
 525
 526        cancel_delayed_work(&ent->dwork);
 527        while (1) {
 528                spin_lock_irq(&ent->lock);
 529                if (list_empty(&ent->head)) {
 530                        spin_unlock_irq(&ent->lock);
 531                        break;
 532                }
 533                mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
 534                list_move(&mr->list, &del_list);
 535                ent->cur--;
 536                ent->size--;
 537                spin_unlock_irq(&ent->lock);
 538                mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
 539        }
 540
 541        list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
 542                list_del(&mr->list);
 543                kfree(mr);
 544        }
 545}
 546
 547static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
 548{
 549        if (!mlx5_debugfs_root || dev->is_rep)
 550                return;
 551
 552        debugfs_remove_recursive(dev->cache.root);
 553        dev->cache.root = NULL;
 554}
 555
 556static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
 557{
 558        struct mlx5_mr_cache *cache = &dev->cache;
 559        struct mlx5_cache_ent *ent;
 560        struct dentry *dir;
 561        int i;
 562
 563        if (!mlx5_debugfs_root || dev->is_rep)
 564                return;
 565
 566        cache->root = debugfs_create_dir("mr_cache", dev->mdev->priv.dbg_root);
 567
 568        for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
 569                ent = &cache->ent[i];
 570                sprintf(ent->name, "%d", ent->order);
 571                dir = debugfs_create_dir(ent->name, cache->root);
 572                debugfs_create_file("size", 0600, dir, ent, &size_fops);
 573                debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
 574                debugfs_create_u32("cur", 0400, dir, &ent->cur);
 575                debugfs_create_u32("miss", 0600, dir, &ent->miss);
 576        }
 577}
 578
 579static void delay_time_func(struct timer_list *t)
 580{
 581        struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer);
 582
 583        dev->fill_delay = 0;
 584}
 585
 586int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 587{
 588        struct mlx5_mr_cache *cache = &dev->cache;
 589        struct mlx5_cache_ent *ent;
 590        int i;
 591
 592        mutex_init(&dev->slow_path_mutex);
 593        cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
 594        if (!cache->wq) {
 595                mlx5_ib_warn(dev, "failed to create work queue\n");
 596                return -ENOMEM;
 597        }
 598
 599        mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
 600        timer_setup(&dev->delay_timer, delay_time_func, 0);
 601        for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
 602                ent = &cache->ent[i];
 603                INIT_LIST_HEAD(&ent->head);
 604                spin_lock_init(&ent->lock);
 605                ent->order = i + 2;
 606                ent->dev = dev;
 607                ent->limit = 0;
 608
 609                init_completion(&ent->compl);
 610                INIT_WORK(&ent->work, cache_work_func);
 611                INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
 612
 613                if (i > MR_CACHE_LAST_STD_ENTRY) {
 614                        mlx5_odp_init_mr_cache_entry(ent);
 615                        continue;
 616                }
 617
 618                if (ent->order > mr_cache_max_order(dev))
 619                        continue;
 620
 621                ent->page = PAGE_SHIFT;
 622                ent->xlt = (1 << ent->order) * sizeof(struct mlx5_mtt) /
 623                           MLX5_IB_UMR_OCTOWORD;
 624                ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
 625                if ((dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) &&
 626                    !dev->is_rep &&
 627                    mlx5_core_is_pf(dev->mdev))
 628                        ent->limit = dev->mdev->profile->mr_cache[i].limit;
 629                else
 630                        ent->limit = 0;
 631                queue_work(cache->wq, &ent->work);
 632        }
 633
 634        mlx5_mr_cache_debugfs_init(dev);
 635
 636        return 0;
 637}
 638
 639int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
 640{
 641        int i;
 642
 643        if (!dev->cache.wq)
 644                return 0;
 645
 646        dev->cache.stopped = 1;
 647        flush_workqueue(dev->cache.wq);
 648
 649        mlx5_mr_cache_debugfs_cleanup(dev);
 650        mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
 651
 652        for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)
 653                clean_keys(dev, i);
 654
 655        destroy_workqueue(dev->cache.wq);
 656        del_timer_sync(&dev->delay_timer);
 657
 658        return 0;
 659}
 660
 661static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
 662                                          struct ib_pd *pd)
 663{
 664        struct mlx5_ib_dev *dev = to_mdev(pd->device);
 665
 666        MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
 667        MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
 668        MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
 669        MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
 670        MLX5_SET(mkc, mkc, lr, 1);
 671
 672        if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
 673                MLX5_SET(mkc, mkc, relaxed_ordering_write,
 674                         !!(acc & IB_ACCESS_RELAXED_ORDERING));
 675        if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read))
 676                MLX5_SET(mkc, mkc, relaxed_ordering_read,
 677                         !!(acc & IB_ACCESS_RELAXED_ORDERING));
 678
 679        MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
 680        MLX5_SET(mkc, mkc, qpn, 0xffffff);
 681        MLX5_SET64(mkc, mkc, start_addr, start_addr);
 682}
 683
 684struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
 685{
 686        struct mlx5_ib_dev *dev = to_mdev(pd->device);
 687        int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
 688        struct mlx5_core_dev *mdev = dev->mdev;
 689        struct mlx5_ib_mr *mr;
 690        void *mkc;
 691        u32 *in;
 692        int err;
 693
 694        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 695        if (!mr)
 696                return ERR_PTR(-ENOMEM);
 697
 698        in = kzalloc(inlen, GFP_KERNEL);
 699        if (!in) {
 700                err = -ENOMEM;
 701                goto err_free;
 702        }
 703
 704        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 705
 706        MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
 707        MLX5_SET(mkc, mkc, length64, 1);
 708        set_mkc_access_pd_addr_fields(mkc, acc, 0, pd);
 709
 710        err = mlx5_core_create_mkey(mdev, &mr->mmkey, in, inlen);
 711        if (err)
 712                goto err_in;
 713
 714        kfree(in);
 715        mr->mmkey.type = MLX5_MKEY_MR;
 716        mr->ibmr.lkey = mr->mmkey.key;
 717        mr->ibmr.rkey = mr->mmkey.key;
 718        mr->umem = NULL;
 719
 720        return &mr->ibmr;
 721
 722err_in:
 723        kfree(in);
 724
 725err_free:
 726        kfree(mr);
 727
 728        return ERR_PTR(err);
 729}
 730
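    /*
     * Number of octowords (16 bytes, i.e. two MTTs) needed to translate
     * @len bytes starting at @addr with the given page shift.
     */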
 731static int get_octo_len(u64 addr, u64 len, int page_shift)
 732{
 733        u64 page_size = 1ULL << page_shift;
 734        u64 offset;
 735        int npages;
 736
 737        offset = addr & (page_size - 1);
 738        npages = ALIGN(len + offset, page_size) >> page_shift;
 739        return (npages + 1) / 2;
 740}
 741
 742static int mr_cache_max_order(struct mlx5_ib_dev *dev)
 743{
 744        if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
 745                return MR_CACHE_LAST_STD_ENTRY + 2;
 746        return MLX5_MAX_UMR_SHIFT;
 747}
 748
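    /*
     * Acquire the umem (ODP or pinned) for a user registration and report
     * its page layout: npages, page_shift, ncont and order.
     */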
 749static int mr_umem_get(struct mlx5_ib_dev *dev, u64 start, u64 length,
 750                       int access_flags, struct ib_umem **umem, int *npages,
 751                       int *page_shift, int *ncont, int *order)
 752{
 753        struct ib_umem *u;
 754
 755        *umem = NULL;
 756
 757        if (access_flags & IB_ACCESS_ON_DEMAND) {
 758                struct ib_umem_odp *odp;
 759
 760                odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags,
 761                                      &mlx5_mn_ops);
 762                if (IS_ERR(odp)) {
 763                        mlx5_ib_dbg(dev, "umem get failed (%ld)\n",
 764                                    PTR_ERR(odp));
 765                        return PTR_ERR(odp);
 766                }
 767
 768                u = &odp->umem;
 769
 770                *page_shift = odp->page_shift;
 771                *ncont = ib_umem_odp_num_pages(odp);
 772                *npages = *ncont << (*page_shift - PAGE_SHIFT);
 773                if (order)
 774                        *order = ilog2(roundup_pow_of_two(*ncont));
 775        } else {
 776                u = ib_umem_get(&dev->ib_dev, start, length, access_flags);
 777                if (IS_ERR(u)) {
 778                        mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(u));
 779                        return PTR_ERR(u);
 780                }
 781
 782                mlx5_ib_cont_pages(u, start, MLX5_MKEY_PAGE_SHIFT_MASK, npages,
 783                                   page_shift, ncont, order);
 784        }
 785
 786        if (!*npages) {
 787                mlx5_ib_warn(dev, "avoid zero region\n");
 788                ib_umem_release(u);
 789                return -EINVAL;
 790        }
 791
 792        *umem = u;
 793
 794        mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n",
 795                    *npages, *ncont, *order, *page_shift);
 796
 797        return 0;
 798}
 799
 800static void mlx5_ib_umr_done(struct ib_cq *cq, struct ib_wc *wc)
 801{
 802        struct mlx5_ib_umr_context *context =
 803                container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe);
 804
 805        context->status = wc->status;
 806        complete(&context->done);
 807}
 808
 809static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context)
 810{
 811        context->cqe.done = mlx5_ib_umr_done;
 812        context->status = -1;
 813        init_completion(&context->done);
 814}
 815
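    /* Post a UMR work request and sleep until its completion is reported. */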
 816static int mlx5_ib_post_send_wait(struct mlx5_ib_dev *dev,
 817                                  struct mlx5_umr_wr *umrwr)
 818{
 819        struct umr_common *umrc = &dev->umrc;
 820        const struct ib_send_wr *bad;
 821        int err;
 822        struct mlx5_ib_umr_context umr_context;
 823
 824        mlx5_ib_init_umr_context(&umr_context);
 825        umrwr->wr.wr_cqe = &umr_context.cqe;
 826
 827        down(&umrc->sem);
 828        err = ib_post_send(umrc->qp, &umrwr->wr, &bad);
 829        if (err) {
 830                mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err);
 831        } else {
 832                wait_for_completion(&umr_context.done);
 833                if (umr_context.status != IB_WC_SUCCESS) {
 834                        mlx5_ib_warn(dev, "reg umr failed (%u)\n",
 835                                     umr_context.status);
 836                        err = -EFAULT;
 837                }
 838        }
 839        up(&umrc->sem);
 840        return err;
 841}
 842
 843static struct mlx5_ib_mr *alloc_mr_from_cache(
 844                                  struct ib_pd *pd, struct ib_umem *umem,
 845                                  u64 virt_addr, u64 len, int npages,
 846                                  int page_shift, int order, int access_flags)
 847{
 848        struct mlx5_ib_dev *dev = to_mdev(pd->device);
 849        struct mlx5_ib_mr *mr;
 850        int err = 0;
 851        int i;
 852
 853        for (i = 0; i < 1; i++) {
 854                mr = alloc_cached_mr(dev, order);
 855                if (mr)
 856                        break;
 857
 858                err = add_keys(dev, order2idx(dev, order), 1);
 859                if (err && err != -EAGAIN) {
 860                        mlx5_ib_warn(dev, "add_keys failed, err %d\n", err);
 861                        break;
 862                }
 863        }
 864
 865        if (!mr)
 866                return ERR_PTR(-EAGAIN);
 867
 868        mr->ibmr.pd = pd;
 869        mr->umem = umem;
 870        mr->access_flags = access_flags;
 871        mr->desc_size = sizeof(struct mlx5_mtt);
 872        mr->mmkey.iova = virt_addr;
 873        mr->mmkey.size = len;
 874        mr->mmkey.pd = to_mpd(pd)->pdn;
 875
 876        return mr;
 877}
 878
 879#define MLX5_MAX_UMR_CHUNK ((1 << (MLX5_MAX_UMR_SHIFT + 4)) - \
 880                            MLX5_UMR_MTT_ALIGNMENT)
 881#define MLX5_SPARE_UMR_CHUNK 0x10000
 882
 883int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
 884                       int page_shift, int flags)
 885{
 886        struct mlx5_ib_dev *dev = mr->dev;
 887        struct device *ddev = dev->ib_dev.dev.parent;
 888        int size;
 889        void *xlt;
 890        dma_addr_t dma;
 891        struct mlx5_umr_wr wr;
 892        struct ib_sge sg;
 893        int err = 0;
 894        int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT)
 895                               ? sizeof(struct mlx5_klm)
 896                               : sizeof(struct mlx5_mtt);
 897        const int page_align = MLX5_UMR_MTT_ALIGNMENT / desc_size;
 898        const int page_mask = page_align - 1;
 899        size_t pages_mapped = 0;
 900        size_t pages_to_map = 0;
 901        size_t pages_iter = 0;
 902        size_t size_to_map = 0;
 903        gfp_t gfp;
 904        bool use_emergency_page = false;
 905
 906        if ((flags & MLX5_IB_UPD_XLT_INDIRECT) &&
 907            !umr_can_use_indirect_mkey(dev))
 908                return -EPERM;
 909
 910        /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes,
 911         * so we need to align the offset and length accordingly
 912         */
 913        if (idx & page_mask) {
 914                npages += idx & page_mask;
 915                idx &= ~page_mask;
 916        }
 917
 918        gfp = flags & MLX5_IB_UPD_XLT_ATOMIC ? GFP_ATOMIC : GFP_KERNEL;
 919        gfp |= __GFP_ZERO | __GFP_NOWARN;
 920
 921        pages_to_map = ALIGN(npages, page_align);
 922        size = desc_size * pages_to_map;
 923        size = min_t(int, size, MLX5_MAX_UMR_CHUNK);
 924
 925        xlt = (void *)__get_free_pages(gfp, get_order(size));
 926        if (!xlt && size > MLX5_SPARE_UMR_CHUNK) {
  927                 mlx5_ib_dbg(dev, "Failed to allocate %d bytes of order %d, falling back to spare UMR allocation of %d bytes\n",
 928                            size, get_order(size), MLX5_SPARE_UMR_CHUNK);
 929
 930                size = MLX5_SPARE_UMR_CHUNK;
 931                xlt = (void *)__get_free_pages(gfp, get_order(size));
 932        }
 933
 934        if (!xlt) {
 935                mlx5_ib_warn(dev, "Using XLT emergency buffer\n");
 936                xlt = (void *)mlx5_ib_get_xlt_emergency_page();
 937                size = PAGE_SIZE;
 938                memset(xlt, 0, size);
 939                use_emergency_page = true;
 940        }
 941        pages_iter = size / desc_size;
 942        dma = dma_map_single(ddev, xlt, size, DMA_TO_DEVICE);
 943        if (dma_mapping_error(ddev, dma)) {
 944                mlx5_ib_err(dev, "unable to map DMA during XLT update.\n");
 945                err = -ENOMEM;
 946                goto free_xlt;
 947        }
 948
 949        if (mr->umem->is_odp) {
 950                if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) {
 951                        struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
 952                        size_t max_pages = ib_umem_odp_num_pages(odp) - idx;
 953
 954                        pages_to_map = min_t(size_t, pages_to_map, max_pages);
 955                }
 956        }
 957
 958        sg.addr = dma;
 959        sg.lkey = dev->umrc.pd->local_dma_lkey;
 960
 961        memset(&wr, 0, sizeof(wr));
 962        wr.wr.send_flags = MLX5_IB_SEND_UMR_UPDATE_XLT;
 963        if (!(flags & MLX5_IB_UPD_XLT_ENABLE))
 964                wr.wr.send_flags |= MLX5_IB_SEND_UMR_FAIL_IF_FREE;
 965        wr.wr.sg_list = &sg;
 966        wr.wr.num_sge = 1;
 967        wr.wr.opcode = MLX5_IB_WR_UMR;
 968
 969        wr.pd = mr->ibmr.pd;
 970        wr.mkey = mr->mmkey.key;
 971        wr.length = mr->mmkey.size;
 972        wr.virt_addr = mr->mmkey.iova;
 973        wr.access_flags = mr->access_flags;
 974        wr.page_shift = page_shift;
 975
 976        for (pages_mapped = 0;
 977             pages_mapped < pages_to_map && !err;
 978             pages_mapped += pages_iter, idx += pages_iter) {
 979                npages = min_t(int, pages_iter, pages_to_map - pages_mapped);
 980                size_to_map = npages * desc_size;
 981                dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE);
 982                if (mr->umem->is_odp) {
 983                        mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
 984                } else {
 985                        __mlx5_ib_populate_pas(dev, mr->umem, page_shift, idx,
 986                                               npages, xlt,
 987                                               MLX5_IB_MTT_PRESENT);
 988                        /* Clear padding after the pages
 989                         * brought from the umem.
 990                         */
 991                        memset(xlt + size_to_map, 0, size - size_to_map);
 992                }
 993                dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE);
 994
 995                sg.length = ALIGN(size_to_map, MLX5_UMR_MTT_ALIGNMENT);
 996
 997                if (pages_mapped + pages_iter >= pages_to_map) {
 998                        if (flags & MLX5_IB_UPD_XLT_ENABLE)
 999                                wr.wr.send_flags |=
1000                                        MLX5_IB_SEND_UMR_ENABLE_MR |
1001                                        MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS |
1002                                        MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
1003                        if (flags & MLX5_IB_UPD_XLT_PD ||
1004                            flags & MLX5_IB_UPD_XLT_ACCESS)
1005                                wr.wr.send_flags |=
1006                                        MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
1007                        if (flags & MLX5_IB_UPD_XLT_ADDR)
1008                                wr.wr.send_flags |=
1009                                        MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
1010                }
1011
1012                wr.offset = idx * desc_size;
1013                wr.xlt_size = sg.length;
1014
1015                err = mlx5_ib_post_send_wait(dev, &wr);
1016        }
1017        dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);
1018
1019free_xlt:
1020        if (use_emergency_page)
1021                mlx5_ib_put_xlt_emergency_page();
1022        else
1023                free_pages((unsigned long)xlt, get_order(size));
1024
1025        return err;
1026}
1027
 1028/*
 1029 * If ibmr is NULL, reg_create() allocates a new mlx5_ib_mr.
 1030 * Otherwise, the given ibmr is reused.
 1031 */
1032static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd,
1033                                     u64 virt_addr, u64 length,
1034                                     struct ib_umem *umem, int npages,
1035                                     int page_shift, int access_flags,
1036                                     bool populate)
1037{
1038        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1039        struct mlx5_ib_mr *mr;
1040        __be64 *pas;
1041        void *mkc;
1042        int inlen;
1043        u32 *in;
1044        int err;
1045        bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));
1046
1047        mr = ibmr ? to_mmr(ibmr) : kzalloc(sizeof(*mr), GFP_KERNEL);
1048        if (!mr)
1049                return ERR_PTR(-ENOMEM);
1050
1051        mr->ibmr.pd = pd;
1052        mr->access_flags = access_flags;
1053
1054        inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1055        if (populate)
1056                inlen += sizeof(*pas) * roundup(npages, 2);
1057        in = kvzalloc(inlen, GFP_KERNEL);
1058        if (!in) {
1059                err = -ENOMEM;
1060                goto err_1;
1061        }
1062        pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
1063        if (populate && !(access_flags & IB_ACCESS_ON_DEMAND))
1064                mlx5_ib_populate_pas(dev, umem, page_shift, pas,
1065                                     pg_cap ? MLX5_IB_MTT_PRESENT : 0);
1066
1067        /* The pg_access bit allows setting the access flags
1068         * in the page list submitted with the command. */
1069        MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));
1070
1071        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1072        MLX5_SET(mkc, mkc, free, !populate);
1073        MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
1074        if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
1075                MLX5_SET(mkc, mkc, relaxed_ordering_write,
1076                         !!(access_flags & IB_ACCESS_RELAXED_ORDERING));
1077        if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read))
1078                MLX5_SET(mkc, mkc, relaxed_ordering_read,
1079                         !!(access_flags & IB_ACCESS_RELAXED_ORDERING));
1080        MLX5_SET(mkc, mkc, a, !!(access_flags & IB_ACCESS_REMOTE_ATOMIC));
1081        MLX5_SET(mkc, mkc, rw, !!(access_flags & IB_ACCESS_REMOTE_WRITE));
1082        MLX5_SET(mkc, mkc, rr, !!(access_flags & IB_ACCESS_REMOTE_READ));
1083        MLX5_SET(mkc, mkc, lw, !!(access_flags & IB_ACCESS_LOCAL_WRITE));
1084        MLX5_SET(mkc, mkc, lr, 1);
1085        MLX5_SET(mkc, mkc, umr_en, 1);
1086
1087        MLX5_SET64(mkc, mkc, start_addr, virt_addr);
1088        MLX5_SET64(mkc, mkc, len, length);
1089        MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
1090        MLX5_SET(mkc, mkc, bsf_octword_size, 0);
1091        MLX5_SET(mkc, mkc, translations_octword_size,
1092                 get_octo_len(virt_addr, length, page_shift));
1093        MLX5_SET(mkc, mkc, log_page_size, page_shift);
1094        MLX5_SET(mkc, mkc, qpn, 0xffffff);
1095        if (populate) {
1096                MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
1097                         get_octo_len(virt_addr, length, page_shift));
1098        }
1099
1100        err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen);
1101        if (err) {
1102                mlx5_ib_warn(dev, "create mkey failed\n");
1103                goto err_2;
1104        }
1105        mr->mmkey.type = MLX5_MKEY_MR;
1106        mr->desc_size = sizeof(struct mlx5_mtt);
1107        mr->dev = dev;
1108        kvfree(in);
1109
1110        mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);
1111
1112        return mr;
1113
1114err_2:
1115        kvfree(in);
1116
1117err_1:
1118        if (!ibmr)
1119                kfree(mr);
1120
1121        return ERR_PTR(err);
1122}
1123
1124static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
1125                          int npages, u64 length, int access_flags)
1126{
1127        mr->npages = npages;
1128        atomic_add(npages, &dev->mdev->priv.reg_pages);
1129        mr->ibmr.lkey = mr->mmkey.key;
1130        mr->ibmr.rkey = mr->mmkey.key;
1131        mr->ibmr.length = length;
1132        mr->access_flags = access_flags;
1133}
1134
1135static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr,
1136                                       u64 length, int acc, int mode)
1137{
1138        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1139        int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1140        struct mlx5_core_dev *mdev = dev->mdev;
1141        struct mlx5_ib_mr *mr;
1142        void *mkc;
1143        u32 *in;
1144        int err;
1145
1146        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1147        if (!mr)
1148                return ERR_PTR(-ENOMEM);
1149
1150        in = kzalloc(inlen, GFP_KERNEL);
1151        if (!in) {
1152                err = -ENOMEM;
1153                goto err_free;
1154        }
1155
1156        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1157
1158        MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
1159        MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
1160        MLX5_SET64(mkc, mkc, len, length);
1161        set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd);
1162
1163        err = mlx5_core_create_mkey(mdev, &mr->mmkey, in, inlen);
1164        if (err)
1165                goto err_in;
1166
1167        kfree(in);
1168
1169        mr->umem = NULL;
1170        set_mr_fields(dev, mr, 0, length, acc);
1171
1172        return &mr->ibmr;
1173
1174err_in:
1175        kfree(in);
1176
1177err_free:
1178        kfree(mr);
1179
1180        return ERR_PTR(err);
1181}
1182
1183int mlx5_ib_advise_mr(struct ib_pd *pd,
1184                      enum ib_uverbs_advise_mr_advice advice,
1185                      u32 flags,
1186                      struct ib_sge *sg_list,
1187                      u32 num_sge,
1188                      struct uverbs_attr_bundle *attrs)
1189{
1190        if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
1191            advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE)
1192                return -EOPNOTSUPP;
1193
1194        return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
1195                                         sg_list, num_sge);
1196}
1197
1198struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
1199                                struct ib_dm_mr_attr *attr,
1200                                struct uverbs_attr_bundle *attrs)
1201{
1202        struct mlx5_ib_dm *mdm = to_mdm(dm);
1203        struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev;
1204        u64 start_addr = mdm->dev_addr + attr->offset;
1205        int mode;
1206
1207        switch (mdm->type) {
1208        case MLX5_IB_UAPI_DM_TYPE_MEMIC:
1209                if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS)
1210                        return ERR_PTR(-EINVAL);
1211
1212                mode = MLX5_MKC_ACCESS_MODE_MEMIC;
1213                start_addr -= pci_resource_start(dev->pdev, 0);
1214                break;
1215        case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
1216        case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
1217                if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
1218                        return ERR_PTR(-EINVAL);
1219
1220                mode = MLX5_MKC_ACCESS_MODE_SW_ICM;
1221                break;
1222        default:
1223                return ERR_PTR(-EINVAL);
1224        }
1225
1226        return mlx5_ib_get_dm_mr(pd, start_addr, attr->length,
1227                                 attr->access_flags, mode);
1228}
1229
1230struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
1231                                  u64 virt_addr, int access_flags,
1232                                  struct ib_udata *udata)
1233{
1234        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1235        struct mlx5_ib_mr *mr = NULL;
1236        bool use_umr;
1237        struct ib_umem *umem;
1238        int page_shift;
1239        int npages;
1240        int ncont;
1241        int order;
1242        int err;
1243
1244        if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
1245                return ERR_PTR(-EOPNOTSUPP);
1246
1247        mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
1248                    start, virt_addr, length, access_flags);
1249
1250        if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && !start &&
1251            length == U64_MAX) {
1252                if (virt_addr != start)
1253                        return ERR_PTR(-EINVAL);
1254                if (!(access_flags & IB_ACCESS_ON_DEMAND) ||
1255                    !(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
1256                        return ERR_PTR(-EINVAL);
1257
1258                mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), udata, access_flags);
1259                if (IS_ERR(mr))
1260                        return ERR_CAST(mr);
1261                return &mr->ibmr;
1262        }
1263
1264        err = mr_umem_get(dev, start, length, access_flags, &umem,
1265                          &npages, &page_shift, &ncont, &order);
1266
1267        if (err < 0)
1268                return ERR_PTR(err);
1269
1270        use_umr = mlx5_ib_can_use_umr(dev, true, access_flags);
1271
1272        if (order <= mr_cache_max_order(dev) && use_umr) {
1273                mr = alloc_mr_from_cache(pd, umem, virt_addr, length, ncont,
1274                                         page_shift, order, access_flags);
1275                if (PTR_ERR(mr) == -EAGAIN) {
1276                        mlx5_ib_dbg(dev, "cache empty for order %d\n", order);
1277                        mr = NULL;
1278                }
1279        } else if (!MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) {
1280                if (access_flags & IB_ACCESS_ON_DEMAND) {
1281                        err = -EINVAL;
1282                        pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB\n");
1283                        goto error;
1284                }
1285                use_umr = false;
1286        }
1287
1288        if (!mr) {
1289                mutex_lock(&dev->slow_path_mutex);
1290                mr = reg_create(NULL, pd, virt_addr, length, umem, ncont,
1291                                page_shift, access_flags, !use_umr);
1292                mutex_unlock(&dev->slow_path_mutex);
1293        }
1294
1295        if (IS_ERR(mr)) {
1296                err = PTR_ERR(mr);
1297                goto error;
1298        }
1299
1300        mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1301
1302        mr->umem = umem;
1303        set_mr_fields(dev, mr, npages, length, access_flags);
1304
1305        if (use_umr) {
1306                int update_xlt_flags = MLX5_IB_UPD_XLT_ENABLE;
1307
1308                if (access_flags & IB_ACCESS_ON_DEMAND)
1309                        update_xlt_flags |= MLX5_IB_UPD_XLT_ZAP;
1310
1311                err = mlx5_ib_update_xlt(mr, 0, ncont, page_shift,
1312                                         update_xlt_flags);
1313
1314                if (err) {
1315                        dereg_mr(dev, mr);
1316                        return ERR_PTR(err);
1317                }
1318        }
1319
1320        if (is_odp_mr(mr)) {
1321                to_ib_umem_odp(mr->umem)->private = mr;
1322                atomic_set(&mr->num_deferred_work, 0);
1323                err = xa_err(xa_store(&dev->odp_mkeys,
1324                                      mlx5_base_mkey(mr->mmkey.key), &mr->mmkey,
1325                                      GFP_KERNEL));
1326                if (err) {
1327                        dereg_mr(dev, mr);
1328                        return ERR_PTR(err);
1329                }
1330        }
1331
1332        return &mr->ibmr;
1333error:
1334        ib_umem_release(umem);
1335        return ERR_PTR(err);
1336}
1337
1338/**
1339 * mlx5_mr_cache_invalidate - Fence all DMA on the MR
1340 * @mr: The MR to fence
1341 *
1342 * Upon return the NIC will not be doing any DMA to the pages under the MR,
 1343 * and any DMA in progress will be completed. Failure of this function
1344 * indicates the HW has failed catastrophically.
1345 */
1346int mlx5_mr_cache_invalidate(struct mlx5_ib_mr *mr)
1347{
1348        struct mlx5_umr_wr umrwr = {};
1349
1350        if (mr->dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
1351                return 0;
1352
1353        umrwr.wr.send_flags = MLX5_IB_SEND_UMR_DISABLE_MR |
1354                              MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
1355        umrwr.wr.opcode = MLX5_IB_WR_UMR;
1356        umrwr.pd = mr->dev->umrc.pd;
1357        umrwr.mkey = mr->mmkey.key;
1358        umrwr.ignore_free_state = 1;
1359
1360        return mlx5_ib_post_send_wait(mr->dev, &umrwr);
1361}
1362
1363static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr,
1364                     int access_flags, int flags)
1365{
1366        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1367        struct mlx5_umr_wr umrwr = {};
1368        int err;
1369
1370        umrwr.wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE;
1371
1372        umrwr.wr.opcode = MLX5_IB_WR_UMR;
1373        umrwr.mkey = mr->mmkey.key;
1374
1375        if (flags & IB_MR_REREG_PD || flags & IB_MR_REREG_ACCESS) {
1376                umrwr.pd = pd;
1377                umrwr.access_flags = access_flags;
1378                umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
1379        }
1380
1381        err = mlx5_ib_post_send_wait(dev, &umrwr);
1382
1383        return err;
1384}
1385
1386int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
1387                          u64 length, u64 virt_addr, int new_access_flags,
1388                          struct ib_pd *new_pd, struct ib_udata *udata)
1389{
1390        struct mlx5_ib_dev *dev = to_mdev(ib_mr->device);
1391        struct mlx5_ib_mr *mr = to_mmr(ib_mr);
1392        struct ib_pd *pd = (flags & IB_MR_REREG_PD) ? new_pd : ib_mr->pd;
1393        int access_flags = flags & IB_MR_REREG_ACCESS ?
1394                            new_access_flags :
1395                            mr->access_flags;
1396        int page_shift = 0;
1397        int upd_flags = 0;
1398        int npages = 0;
1399        int ncont = 0;
1400        int order = 0;
1401        u64 addr, len;
1402        int err;
1403
1404        mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
1405                    start, virt_addr, length, access_flags);
1406
1407        atomic_sub(mr->npages, &dev->mdev->priv.reg_pages);
1408
1409        if (!mr->umem)
1410                return -EINVAL;
1411
1412        if (is_odp_mr(mr))
1413                return -EOPNOTSUPP;
1414
1415        if (flags & IB_MR_REREG_TRANS) {
1416                addr = virt_addr;
1417                len = length;
1418        } else {
1419                addr = mr->umem->address;
1420                len = mr->umem->length;
1421        }
1422
1423        if (flags != IB_MR_REREG_PD) {
1424                /*
1425                 * Replace umem. This needs to be done whether or not UMR is
1426                 * used.
1427                 */
1428                flags |= IB_MR_REREG_TRANS;
1429                ib_umem_release(mr->umem);
1430                mr->umem = NULL;
1431                err = mr_umem_get(dev, addr, len, access_flags, &mr->umem,
1432                                  &npages, &page_shift, &ncont, &order);
1433                if (err)
1434                        goto err;
1435        }
1436
1437        if (!mlx5_ib_can_use_umr(dev, true, access_flags) ||
1438            (flags & IB_MR_REREG_TRANS && !use_umr_mtt_update(mr, addr, len))) {
1439                /*
1440                 * UMR can't be used - MKey needs to be replaced.
1441                 */
1442                if (mr->allocated_from_cache)
1443                        err = mlx5_mr_cache_invalidate(mr);
1444                else
1445                        err = destroy_mkey(dev, mr);
1446                if (err)
1447                        goto err;
1448
1449                mr = reg_create(ib_mr, pd, addr, len, mr->umem, ncont,
1450                                page_shift, access_flags, true);
1451
1452                if (IS_ERR(mr)) {
1453                        err = PTR_ERR(mr);
1454                        mr = to_mmr(ib_mr);
1455                        goto err;
1456                }
1457
1458                mr->allocated_from_cache = false;
1459        } else {
1460                /*
1461                 * Send a UMR WQE
1462                 */
1463                mr->ibmr.pd = pd;
1464                mr->access_flags = access_flags;
1465                mr->mmkey.iova = addr;
1466                mr->mmkey.size = len;
1467                mr->mmkey.pd = to_mpd(pd)->pdn;
1468
1469                if (flags & IB_MR_REREG_TRANS) {
1470                        upd_flags = MLX5_IB_UPD_XLT_ADDR;
1471                        if (flags & IB_MR_REREG_PD)
1472                                upd_flags |= MLX5_IB_UPD_XLT_PD;
1473                        if (flags & IB_MR_REREG_ACCESS)
1474                                upd_flags |= MLX5_IB_UPD_XLT_ACCESS;
1475                        err = mlx5_ib_update_xlt(mr, 0, npages, page_shift,
1476                                                 upd_flags);
1477                } else {
1478                        err = rereg_umr(pd, mr, access_flags, flags);
1479                }
1480
1481                if (err)
1482                        goto err;
1483        }
1484
1485        set_mr_fields(dev, mr, npages, len, access_flags);
1486
1487        return 0;
1488
1489err:
1490        ib_umem_release(mr->umem);
1491        mr->umem = NULL;
1492
1493        clean_mr(dev, mr);
1494        return err;
1495}
1496
1497static int
1498mlx5_alloc_priv_descs(struct ib_device *device,
1499                      struct mlx5_ib_mr *mr,
1500                      int ndescs,
1501                      int desc_size)
1502{
1503        int size = ndescs * desc_size;
1504        int add_size;
1505        int ret;
1506
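        /*
         * Worked example of the headroom computed below (values assumed for
         * illustration): with MLX5_UMR_ALIGN at 2048 and an arch where
         * ARCH_KMALLOC_MINALIGN is 8, add_size is 2040.  kzalloc() returns a
         * pointer that is at least 8-byte aligned, so PTR_ALIGN() advances it
         * by at most 2040 bytes, leaving the full 'size' bytes usable.
         */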
1507        add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0);
1508
1509        mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
1510        if (!mr->descs_alloc)
1511                return -ENOMEM;
1512
1513        mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);
1514
1515        mr->desc_map = dma_map_single(device->dev.parent, mr->descs,
1516                                      size, DMA_TO_DEVICE);
1517        if (dma_mapping_error(device->dev.parent, mr->desc_map)) {
1518                ret = -ENOMEM;
1519                goto err;
1520        }
1521
1522        return 0;
1523err:
1524        kfree(mr->descs_alloc);
1525
1526        return ret;
1527}
1528
1529static void
1530mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
1531{
1532        if (mr->descs) {
1533                struct ib_device *device = mr->ibmr.device;
1534                int size = mr->max_descs * mr->desc_size;
1535
1536                dma_unmap_single(device->dev.parent, mr->desc_map,
1537                                 size, DMA_TO_DEVICE);
1538                kfree(mr->descs_alloc);
1539                mr->descs = NULL;
1540        }
1541}
1542
1543static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
1544{
1545        int allocated_from_cache = mr->allocated_from_cache;
1546
1547        if (mr->sig) {
1548                if (mlx5_core_destroy_psv(dev->mdev,
1549                                          mr->sig->psv_memory.psv_idx))
1550                        mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
1551                                     mr->sig->psv_memory.psv_idx);
1552                if (mlx5_core_destroy_psv(dev->mdev,
1553                                          mr->sig->psv_wire.psv_idx))
1554                        mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
1555                                     mr->sig->psv_wire.psv_idx);
1556                xa_erase(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key));
1557                kfree(mr->sig);
1558                mr->sig = NULL;
1559        }
1560
1561        if (!allocated_from_cache) {
1562                destroy_mkey(dev, mr);
1563                mlx5_free_priv_descs(mr);
1564        }
1565}
1566
1567static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
1568{
1569        int npages = mr->npages;
1570        struct ib_umem *umem = mr->umem;
1571
1572        /* Stop all DMA */
1573        if (is_odp_mr(mr))
1574                mlx5_ib_fence_odp_mr(mr);
1575        else
1576                clean_mr(dev, mr);
1577
1578        if (mr->allocated_from_cache)
1579                mlx5_mr_cache_free(dev, mr);
1580        else
1581                kfree(mr);
1582
1583        ib_umem_release(umem);
1584        atomic_sub(npages, &dev->mdev->priv.reg_pages);
1585
1586}
1587
1588int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
1589{
1590        struct mlx5_ib_mr *mmr = to_mmr(ibmr);
1591
1592        if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
1593                dereg_mr(to_mdev(mmr->mtt_mr->ibmr.device), mmr->mtt_mr);
1594                dereg_mr(to_mdev(mmr->klm_mr->ibmr.device), mmr->klm_mr);
1595        }
1596
1597        if (is_odp_mr(mmr) && to_ib_umem_odp(mmr->umem)->is_implicit_odp) {
1598                mlx5_ib_free_implicit_mr(mmr);
1599                return 0;
1600        }
1601
1602        dereg_mr(to_mdev(ibmr->device), mmr);
1603
1604        return 0;
1605}
1606
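/*
 * Descriptive note (added commentary): build the mkey context for a
 * UMR-enabled mkey that starts out in the "free" state, i.e. it carries no
 * translation yet and is expected to be populated later by a registration
 * work request.
 */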
1607static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
1608                                   int access_mode, int page_shift)
1609{
1610        void *mkc;
1611
1612        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1613
1614        MLX5_SET(mkc, mkc, free, 1);
1615        MLX5_SET(mkc, mkc, qpn, 0xffffff);
1616        MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
1617        MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
1618        MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
1619        MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
1620        MLX5_SET(mkc, mkc, umr_en, 1);
1621        MLX5_SET(mkc, mkc, log_page_size, page_shift);
1622}
1623
1624static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
1625                                  int ndescs, int desc_size, int page_shift,
1626                                  int access_mode, u32 *in, int inlen)
1627{
1628        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1629        int err;
1630
1631        mr->access_mode = access_mode;
1632        mr->desc_size = desc_size;
1633        mr->max_descs = ndescs;
1634
1635        err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size);
1636        if (err)
1637                return err;
1638
1639        mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);
1640
1641        err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen);
1642        if (err)
1643                goto err_free_descs;
1644
1645        mr->mmkey.type = MLX5_MKEY_MR;
1646        mr->ibmr.lkey = mr->mmkey.key;
1647        mr->ibmr.rkey = mr->mmkey.key;
1648
1649        return 0;
1650
1651err_free_descs:
1652        mlx5_free_priv_descs(mr);
1653        return err;
1654}
1655
1656static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
1657                                u32 max_num_sg, u32 max_num_meta_sg,
1658                                int desc_size, int access_mode)
1659{
1660        int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1661        int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
1662        int page_shift = 0;
1663        struct mlx5_ib_mr *mr;
1664        u32 *in;
1665        int err;
1666
1667        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1668        if (!mr)
1669                return ERR_PTR(-ENOMEM);
1670
1671        mr->ibmr.pd = pd;
1672        mr->ibmr.device = pd->device;
1673
1674        in = kzalloc(inlen, GFP_KERNEL);
1675        if (!in) {
1676                err = -ENOMEM;
1677                goto err_free;
1678        }
1679
1680        if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
1681                page_shift = PAGE_SHIFT;
1682
1683        err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift,
1684                                     access_mode, in, inlen);
1685        if (err)
1686                goto err_free_in;
1687
1688        mr->umem = NULL;
1689        kfree(in);
1690
1691        return mr;
1692
1693err_free_in:
1694        kfree(in);
1695err_free:
1696        kfree(mr);
1697        return ERR_PTR(err);
1698}
1699
1700static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
1701                                    int ndescs, u32 *in, int inlen)
1702{
1703        return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt),
1704                                      PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in,
1705                                      inlen);
1706}
1707
1708static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
1709                                    int ndescs, u32 *in, int inlen)
1710{
1711        return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm),
1712                                      0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
1713}
1714
1715static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
1716                                      int max_num_sg, int max_num_meta_sg,
1717                                      u32 *in, int inlen)
1718{
1719        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1720        u32 psv_index[2];
1721        void *mkc;
1722        int err;
1723
1724        mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
1725        if (!mr->sig)
1726                return -ENOMEM;
1727
1728        /* create mem & wire PSVs */
1729        err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
1730        if (err)
1731                goto err_free_sig;
1732
1733        mr->sig->psv_memory.psv_idx = psv_index[0];
1734        mr->sig->psv_wire.psv_idx = psv_index[1];
1735
1736        mr->sig->sig_status_checked = true;
1737        mr->sig->sig_err_exists = false;
1738        /* Arm the SIGERR check on the next UMR */
1739        ++mr->sig->sigerr_count;
1740        mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
1741                                         sizeof(struct mlx5_klm),
1742                                         MLX5_MKC_ACCESS_MODE_KLMS);
1743        if (IS_ERR(mr->klm_mr)) {
1744                err = PTR_ERR(mr->klm_mr);
1745                goto err_destroy_psv;
1746        }
1747        mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
1748                                         sizeof(struct mlx5_mtt),
1749                                         MLX5_MKC_ACCESS_MODE_MTT);
1750        if (IS_ERR(mr->mtt_mr)) {
1751                err = PTR_ERR(mr->mtt_mr);
1752                goto err_free_klm_mr;
1753        }
1754
1755        /* Set BSF descriptors for the mkey */
1756        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1757        MLX5_SET(mkc, mkc, bsf_en, 1);
1758        MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);
1759
1760        err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
1761                                     MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
1762        if (err)
1763                goto err_free_mtt_mr;
1764
1765        err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
1766                              mr->sig, GFP_KERNEL));
1767        if (err)
1768                goto err_free_descs;
1769        return 0;
1770
1771err_free_descs:
1772        destroy_mkey(dev, mr);
1773        mlx5_free_priv_descs(mr);
1774err_free_mtt_mr:
1775        dereg_mr(to_mdev(mr->mtt_mr->ibmr.device), mr->mtt_mr);
1776        mr->mtt_mr = NULL;
1777err_free_klm_mr:
1778        dereg_mr(to_mdev(mr->klm_mr->ibmr.device), mr->klm_mr);
1779        mr->klm_mr = NULL;
1780err_destroy_psv:
1781        if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
1782                mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
1783                             mr->sig->psv_memory.psv_idx);
1784        if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
1785                mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
1786                             mr->sig->psv_wire.psv_idx);
1787err_free_sig:
1788        kfree(mr->sig);
1789
1790        return err;
1791}
1792
1793static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
1794                                        enum ib_mr_type mr_type, u32 max_num_sg,
1795                                        u32 max_num_meta_sg)
1796{
1797        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1798        int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1799        int ndescs = ALIGN(max_num_sg, 4);
1800        struct mlx5_ib_mr *mr;
1801        u32 *in;
1802        int err;
1803
1804        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1805        if (!mr)
1806                return ERR_PTR(-ENOMEM);
1807
1808        in = kzalloc(inlen, GFP_KERNEL);
1809        if (!in) {
1810                err = -ENOMEM;
1811                goto err_free;
1812        }
1813
1814        mr->ibmr.device = pd->device;
1815        mr->umem = NULL;
1816
1817        switch (mr_type) {
1818        case IB_MR_TYPE_MEM_REG:
1819                err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
1820                break;
1821        case IB_MR_TYPE_SG_GAPS:
1822                err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
1823                break;
1824        case IB_MR_TYPE_INTEGRITY:
1825                err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
1826                                                 max_num_meta_sg, in, inlen);
1827                break;
1828        default:
1829                mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
1830                err = -EINVAL;
1831        }
1832
1833        if (err)
1834                goto err_free_in;
1835
1836        kfree(in);
1837
1838        return &mr->ibmr;
1839
1840err_free_in:
1841        kfree(in);
1842err_free:
1843        kfree(mr);
1844        return ERR_PTR(err);
1845}
1846
1847struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
1848                               u32 max_num_sg, struct ib_udata *udata)
1849{
1850        return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
1851}
1852
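/*
 * Usage sketch (illustrative, not part of this file): an in-kernel ULP
 * typically obtains a fast-registration MR through the core verbs wrapper,
 * which lands in mlx5_ib_alloc_mr() above.  Assuming a valid 'pd' and an
 * arbitrary max_num_sg of 16:
 *
 *	struct ib_mr *mr;
 *
 *	mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 16);
 *	if (IS_ERR(mr))
 *		return PTR_ERR(mr);
 *	...
 *	ib_dereg_mr(mr);
 */
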
1853struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
1854                                         u32 max_num_sg, u32 max_num_meta_sg)
1855{
1856        return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
1857                                  max_num_meta_sg);
1858}
1859
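/*
 * Usage sketch (illustrative, not part of this file): integrity MRs are
 * requested through the core helper, which ends up in
 * mlx5_ib_alloc_mr_integrity() above.  Assuming a valid 'pd' and arbitrary
 * data/metadata SG limits of 16:
 *
 *	struct ib_mr *sig_mr;
 *
 *	sig_mr = ib_alloc_mr_integrity(pd, 16, 16);
 *	if (IS_ERR(sig_mr))
 *		return PTR_ERR(sig_mr);
 *
 * The returned MR is later mapped with ib_map_mr_sg_pi() and checked with
 * ib_check_mr_status() after the I/O completes.
 */
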
1860struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
1861                               struct ib_udata *udata)
1862{
1863        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1864        int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1865        struct mlx5_ib_mw *mw = NULL;
1866        u32 *in = NULL;
1867        void *mkc;
1868        int ndescs;
1869        int err;
1870        struct mlx5_ib_alloc_mw req = {};
1871        struct {
1872                __u32   comp_mask;
1873                __u32   response_length;
1874        } resp = {};
1875
1876        err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
1877        if (err)
1878                return ERR_PTR(err);
1879
1880        if (req.comp_mask || req.reserved1 || req.reserved2)
1881                return ERR_PTR(-EOPNOTSUPP);
1882
1883        if (udata->inlen > sizeof(req) &&
1884            !ib_is_udata_cleared(udata, sizeof(req),
1885                                 udata->inlen - sizeof(req)))
1886                return ERR_PTR(-EOPNOTSUPP);
1887
1888        ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);
1889
1890        mw = kzalloc(sizeof(*mw), GFP_KERNEL);
1891        in = kzalloc(inlen, GFP_KERNEL);
1892        if (!mw || !in) {
1893                err = -ENOMEM;
1894                goto free;
1895        }
1896
1897        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1898
1899        MLX5_SET(mkc, mkc, free, 1);
1900        MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
1901        MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
1902        MLX5_SET(mkc, mkc, umr_en, 1);
1903        MLX5_SET(mkc, mkc, lr, 1);
1904        MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
1905        MLX5_SET(mkc, mkc, en_rinval, !!(type == IB_MW_TYPE_2));
1906        MLX5_SET(mkc, mkc, qpn, 0xffffff);
1907
1908        err = mlx5_core_create_mkey(dev->mdev, &mw->mmkey, in, inlen);
1909        if (err)
1910                goto free;
1911
1912        mw->mmkey.type = MLX5_MKEY_MW;
1913        mw->ibmw.rkey = mw->mmkey.key;
1914        mw->ndescs = ndescs;
1915
1916        resp.response_length = min(offsetof(typeof(resp), response_length) +
1917                                   sizeof(resp.response_length), udata->outlen);
1918        if (resp.response_length) {
1919                err = ib_copy_to_udata(udata, &resp, resp.response_length);
1920                if (err) {
1921                        mlx5_core_destroy_mkey(dev->mdev, &mw->mmkey);
1922                        goto free;
1923                }
1924        }
1925
1926        if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
1927                err = xa_err(xa_store(&dev->odp_mkeys,
1928                                      mlx5_base_mkey(mw->mmkey.key), &mw->mmkey,
1929                                      GFP_KERNEL));
1930                if (err)
1931                        goto free_mkey;
1932        }
1933
1934        kfree(in);
1935        return &mw->ibmw;
1936
1937free_mkey:
1938        mlx5_core_destroy_mkey(dev->mdev, &mw->mmkey);
1939free:
1940        kfree(mw);
1941        kfree(in);
1942        return ERR_PTR(err);
1943}
1944
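/*
 * Usage sketch (illustrative, not part of this file): memory windows created
 * by mlx5_ib_alloc_mw() above are typically consumed from userspace through
 * rdma-core.  Assuming a valid ibv_pd 'pd':
 *
 *	struct ibv_mw *mw;
 *
 *	mw = ibv_alloc_mw(pd, IBV_MW_TYPE_2);
 *	if (!mw)
 *		return -errno;
 *	...
 *	ibv_dealloc_mw(mw);
 *
 * A type 2 MW is subsequently bound to an MR by posting a bind work request
 * on a QP; see the ibv_alloc_mw(3) man page.
 */
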
1945int mlx5_ib_dealloc_mw(struct ib_mw *mw)
1946{
1947        struct mlx5_ib_dev *dev = to_mdev(mw->device);
1948        struct mlx5_ib_mw *mmw = to_mmw(mw);
1949        int err;
1950
1951        if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
1952                xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key));
1953                /*
1954                 * pagefault_single_data_segment() may be accessing mmw under
1955                 * SRCU if the user bound an ODP MR to this MW.
1956                 */
1957                synchronize_srcu(&dev->odp_srcu);
1958        }
1959
1960        err = mlx5_core_destroy_mkey(dev->mdev, &mmw->mmkey);
1961        if (err)
1962                return err;
1963        kfree(mmw);
1964        return 0;
1965}
1966
1967int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
1968                            struct ib_mr_status *mr_status)
1969{
1970        struct mlx5_ib_mr *mmr = to_mmr(ibmr);
1971        int ret = 0;
1972
1973        if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
1974                pr_err("Invalid status check mask\n");
1975                ret = -EINVAL;
1976                goto done;
1977        }
1978
1979        mr_status->fail_status = 0;
1980        if (check_mask & IB_MR_CHECK_SIG_STATUS) {
1981                if (!mmr->sig) {
1982                        ret = -EINVAL;
1983                        pr_err("signature status check requested on a non-signature enabled MR\n");
1984                        goto done;
1985                }
1986
1987                mmr->sig->sig_status_checked = true;
1988                if (!mmr->sig->sig_err_exists)
1989                        goto done;
1990
1991                if (ibmr->lkey == mmr->sig->err_item.key)
1992                        memcpy(&mr_status->sig_err, &mmr->sig->err_item,
1993                               sizeof(mr_status->sig_err));
1994                else {
1995                        mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
1996                        mr_status->sig_err.sig_err_offset = 0;
1997                        mr_status->sig_err.key = mmr->sig->err_item.key;
1998                }
1999
2000                mmr->sig->sig_err_exists = false;
2001                mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
2002        }
2003
2004done:
2005        return ret;
2006}
2007
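/*
 * Usage sketch (illustrative, not part of this file): after an I/O on a
 * signature-enabled MR completes, a ULP would typically poll the protection
 * information status via the core wrapper around mlx5_ib_check_mr_status()
 * above ('sig_mr' is a placeholder for an integrity MR):
 *
 *	struct ib_mr_status mr_status;
 *
 *	if (ib_check_mr_status(sig_mr, IB_MR_CHECK_SIG_STATUS, &mr_status))
 *		return -EIO;
 *	if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS)
 *		pr_err("sig error type %d at offset %llu\n",
 *		       mr_status.sig_err.err_type,
 *		       (unsigned long long)mr_status.sig_err.sig_err_offset);
 */
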
2008static int
2009mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2010                        int data_sg_nents, unsigned int *data_sg_offset,
2011                        struct scatterlist *meta_sg, int meta_sg_nents,
2012                        unsigned int *meta_sg_offset)
2013{
2014        struct mlx5_ib_mr *mr = to_mmr(ibmr);
2015        unsigned int sg_offset = 0;
2016        int n = 0;
2017
2018        mr->meta_length = 0;
2019        if (data_sg_nents == 1) {
2020                n++;
2021                mr->ndescs = 1;
2022                if (data_sg_offset)
2023                        sg_offset = *data_sg_offset;
2024                mr->data_length = sg_dma_len(data_sg) - sg_offset;
2025                mr->data_iova = sg_dma_address(data_sg) + sg_offset;
2026                if (meta_sg_nents == 1) {
2027                        n++;
2028                        mr->meta_ndescs = 1;
2029                        if (meta_sg_offset)
2030                                sg_offset = *meta_sg_offset;
2031                        else
2032                                sg_offset = 0;
2033                        mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
2034                        mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
2035                }
2036                ibmr->length = mr->data_length + mr->meta_length;
2037        }
2038
2039        return n;
2040}
2041
2042static int
2043mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
2044                   struct scatterlist *sgl,
2045                   unsigned short sg_nents,
2046                   unsigned int *sg_offset_p,
2047                   struct scatterlist *meta_sgl,
2048                   unsigned short meta_sg_nents,
2049                   unsigned int *meta_sg_offset_p)
2050{
2051        struct scatterlist *sg = sgl;
2052        struct mlx5_klm *klms = mr->descs;
2053        unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
2054        u32 lkey = mr->ibmr.pd->local_dma_lkey;
2055        int i, j = 0;
2056
2057        mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
2058        mr->ibmr.length = 0;
2059
2060        for_each_sg(sgl, sg, sg_nents, i) {
2061                if (unlikely(i >= mr->max_descs))
2062                        break;
2063                klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
2064                klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
2065                klms[i].key = cpu_to_be32(lkey);
2066                mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2067
2068                sg_offset = 0;
2069        }
2070
2071        if (sg_offset_p)
2072                *sg_offset_p = sg_offset;
2073
2074        mr->ndescs = i;
2075        mr->data_length = mr->ibmr.length;
2076
2077        if (meta_sg_nents) {
2078                sg = meta_sgl;
2079                sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
2080                for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
2081                        if (unlikely(i + j >= mr->max_descs))
2082                                break;
2083                        klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
2084                                                     sg_offset);
2085                        klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
2086                                                         sg_offset);
2087                        klms[i + j].key = cpu_to_be32(lkey);
2088                        mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2089
2090                        sg_offset = 0;
2091                }
2092                if (meta_sg_offset_p)
2093                        *meta_sg_offset_p = sg_offset;
2094
2095                mr->meta_ndescs = j;
2096                mr->meta_length = mr->ibmr.length - mr->data_length;
2097        }
2098
2099        return i + j;
2100}
2101
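/*
 * Descriptive note (added commentary): each KLM entry produced by
 * mlx5_ib_sg_to_klms() above describes one arbitrarily sized, arbitrarily
 * aligned chunk as a (byte count, lkey, virtual address) triple, which is
 * what allows SG_GAPS and metadata mappings that MTT page lists cannot
 * express.
 */
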
2102static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
2103{
2104        struct mlx5_ib_mr *mr = to_mmr(ibmr);
2105        __be64 *descs;
2106
2107        if (unlikely(mr->ndescs == mr->max_descs))
2108                return -ENOMEM;
2109
2110        descs = mr->descs;
2111        descs[mr->ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2112
2113        return 0;
2114}
2115
2116static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
2117{
2118        struct mlx5_ib_mr *mr = to_mmr(ibmr);
2119        __be64 *descs;
2120
2121        if (unlikely(mr->ndescs + mr->meta_ndescs == mr->max_descs))
2122                return -ENOMEM;
2123
2124        descs = mr->descs;
2125        descs[mr->ndescs + mr->meta_ndescs++] =
2126                cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2127
2128        return 0;
2129}
2130
2131static int
2132mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2133                         int data_sg_nents, unsigned int *data_sg_offset,
2134                         struct scatterlist *meta_sg, int meta_sg_nents,
2135                         unsigned int *meta_sg_offset)
2136{
2137        struct mlx5_ib_mr *mr = to_mmr(ibmr);
2138        struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
2139        int n;
2140
2141        pi_mr->ndescs = 0;
2142        pi_mr->meta_ndescs = 0;
2143        pi_mr->meta_length = 0;
2144
2145        ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2146                                   pi_mr->desc_size * pi_mr->max_descs,
2147                                   DMA_TO_DEVICE);
2148
2149        pi_mr->ibmr.page_size = ibmr->page_size;
2150        n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
2151                           mlx5_set_page);
2152        if (n != data_sg_nents)
2153                return n;
2154
2155        pi_mr->data_iova = pi_mr->ibmr.iova;
2156        pi_mr->data_length = pi_mr->ibmr.length;
2157        pi_mr->ibmr.length = pi_mr->data_length;
2158        ibmr->length = pi_mr->data_length;
2159
2160        if (meta_sg_nents) {
2161                u64 page_mask = ~((u64)ibmr->page_size - 1);
2162                u64 iova = pi_mr->data_iova;
2163
2164                n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
2165                                    meta_sg_offset, mlx5_set_page_pi);
2166
2167                pi_mr->meta_length = pi_mr->ibmr.length;
2168                /*
2169                 * The PI address handed to the HW is expressed relative to
2170                 * the first data page: the page-aligned start of the data,
2171                 * plus the size of the mapped data pages, plus the offset
2172                 * of the metadata within its first page.
2173                 */
2174                pi_mr->pi_iova = (iova & page_mask) +
2175                                 pi_mr->ndescs * ibmr->page_size +
2176                                 (pi_mr->ibmr.iova & ~page_mask);
2177                /*
2178                 * To use a single MTT MR for both data and metadata, also
2179                 * register the gap between the end of the data and the
2180                 * start of the metadata (the sig MR ensures that the HW
2181                 * only accesses the right addresses).  This mapping is
2182                 * safe because an internal mkey is used for it.
2183                 */
2184                pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
2185                pi_mr->ibmr.iova = iova;
2186                ibmr->length += pi_mr->meta_length;
2187        }
2188
2189        ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2190                                      pi_mr->desc_size * pi_mr->max_descs,
2191                                      DMA_TO_DEVICE);
2192
2193        return n;
2194}
2195
2196static int
2197mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2198                         int data_sg_nents, unsigned int *data_sg_offset,
2199                         struct scatterlist *meta_sg, int meta_sg_nents,
2200                         unsigned int *meta_sg_offset)
2201{
2202        struct mlx5_ib_mr *mr = to_mmr(ibmr);
2203        struct mlx5_ib_mr *pi_mr = mr->klm_mr;
2204        int n;
2205
2206        pi_mr->ndescs = 0;
2207        pi_mr->meta_ndescs = 0;
2208        pi_mr->meta_length = 0;
2209
2210        ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2211                                   pi_mr->desc_size * pi_mr->max_descs,
2212                                   DMA_TO_DEVICE);
2213
2214        n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
2215                               meta_sg, meta_sg_nents, meta_sg_offset);
2216
2217        ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2218                                      pi_mr->desc_size * pi_mr->max_descs,
2219                                      DMA_TO_DEVICE);
2220
2221        /* This is a zero-based memory region */
2222        pi_mr->data_iova = 0;
2223        pi_mr->ibmr.iova = 0;
2224        pi_mr->pi_iova = pi_mr->data_length;
2225        ibmr->length = pi_mr->ibmr.length;
2226
2227        return n;
2228}
2229
2230int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2231                         int data_sg_nents, unsigned int *data_sg_offset,
2232                         struct scatterlist *meta_sg, int meta_sg_nents,
2233                         unsigned int *meta_sg_offset)
2234{
2235        struct mlx5_ib_mr *mr = to_mmr(ibmr);
2236        struct mlx5_ib_mr *pi_mr = NULL;
2237        int n;
2238
2239        WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);
2240
2241        mr->ndescs = 0;
2242        mr->data_length = 0;
2243        mr->data_iova = 0;
2244        mr->meta_ndescs = 0;
2245        mr->pi_iova = 0;
2246        /*
2247         * As a performance optimization, avoid a UMR operation for
2248         * registering the data/metadata buffers whenever possible:
2249         * first try to map the sg lists to PA descriptors with the
2250         * local_dma_lkey, and fall back to UMR only on failure.
2251         */
2252        n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2253                                    data_sg_offset, meta_sg, meta_sg_nents,
2254                                    meta_sg_offset);
2255        if (n == data_sg_nents + meta_sg_nents)
2256                goto out;
2257        /*
2258         * As a performance optimization, avoid mapping the sg lists to
2259         * KLM descriptors when possible: first try to map them to MTT
2260         * descriptors and fall back to KLM only on failure.
2261         * The HW handles MTT descriptors more efficiently, especially
2262         * under high load, so use KLM (indirect access) only when it
2263         * is mandatory.
2264         */
2265        pi_mr = mr->mtt_mr;
2266        n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2267                                     data_sg_offset, meta_sg, meta_sg_nents,
2268                                     meta_sg_offset);
2269        if (n == data_sg_nents + meta_sg_nents)
2270                goto out;
2271
2272        pi_mr = mr->klm_mr;
2273        n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2274                                     data_sg_offset, meta_sg, meta_sg_nents,
2275                                     meta_sg_offset);
2276        if (unlikely(n != data_sg_nents + meta_sg_nents))
2277                return -ENOMEM;
2278
2279out:
2280        /* This is a zero-based memory region */
2281        ibmr->iova = 0;
2282        mr->pi_mr = pi_mr;
2283        if (pi_mr)
2284                ibmr->sig_attrs->meta_length = pi_mr->meta_length;
2285        else
2286                ibmr->sig_attrs->meta_length = mr->meta_length;
2287
2288        return 0;
2289}
2290
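/*
 * Usage sketch (illustrative, not part of this file): a ULP maps data and
 * protection scatterlists onto an integrity MR through the core wrapper,
 * which calls mlx5_ib_map_mr_sg_pi() above.  The names 'data_sg',
 * 'prot_sg' and their nents are placeholders:
 *
 *	int n;
 *
 *	n = ib_map_mr_sg_pi(sig_mr, data_sg, data_sg_nents, NULL,
 *			    prot_sg, prot_sg_nents, NULL, PAGE_SIZE);
 *	if (n != data_sg_nents + prot_sg_nents)
 *		return n < 0 ? n : -EINVAL;
 *
 * followed by an IB_WR_REG_MR_INTEGRITY work request to make the mapping
 * visible to the HCA.
 */
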
2291int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
2292                      unsigned int *sg_offset)
2293{
2294        struct mlx5_ib_mr *mr = to_mmr(ibmr);
2295        int n;
2296
2297        mr->ndescs = 0;
2298
2299        ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
2300                                   mr->desc_size * mr->max_descs,
2301                                   DMA_TO_DEVICE);
2302
2303        if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
2304                n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
2305                                       NULL);
2306        else
2307                n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
2308                                mlx5_set_page);
2309
2310        ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
2311                                      mr->desc_size * mr->max_descs,
2312                                      DMA_TO_DEVICE);
2313
2314        return n;
2315}
2316
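/*
 * Usage sketch (illustrative, not part of this file): the non-integrity
 * fast-registration path pairs mlx5_ib_map_mr_sg() above with a regular
 * IB_WR_REG_MR work request.  The names 'mr', 'sg' and 'sg_nents' are
 * placeholders:
 *
 *	int n;
 *
 *	n = ib_map_mr_sg(mr, sg, sg_nents, NULL, PAGE_SIZE);
 *	if (n != sg_nents)
 *		return n < 0 ? n : -EINVAL;
 *
 * After the REG_MR WR completes, mr->lkey/mr->rkey refer to the newly
 * mapped range until the MR is invalidated or remapped.
 */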