linux/drivers/infiniband/hw/mlx5/odp.c
   1/*
   2 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
   3 *
   4 * This software is available to you under a choice of one of two
   5 * licenses.  You may choose to be licensed under the terms of the GNU
   6 * General Public License (GPL) Version 2, available from the file
   7 * COPYING in the main directory of this source tree, or the
   8 * OpenIB.org BSD license below:
   9 *
  10 *     Redistribution and use in source and binary forms, with or
  11 *     without modification, are permitted provided that the following
  12 *     conditions are met:
  13 *
  14 *      - Redistributions of source code must retain the above
  15 *        copyright notice, this list of conditions and the following
  16 *        disclaimer.
  17 *
  18 *      - Redistributions in binary form must reproduce the above
  19 *        copyright notice, this list of conditions and the following
  20 *        disclaimer in the documentation and/or other materials
  21 *        provided with the distribution.
  22 *
  23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  30 * SOFTWARE.
  31 */
  32
  33#include <rdma/ib_umem.h>
  34#include <rdma/ib_umem_odp.h>
  35#include <linux/kernel.h>
  36#include <linux/dma-buf.h>
  37#include <linux/dma-resv.h>
  38
  39#include "mlx5_ib.h"
  40#include "cmd.h"
  41#include "qp.h"
  42
  43#include <linux/mlx5/eq.h>
  44
  45/* Contains the details of a pagefault. */
  46struct mlx5_pagefault {
  47        u32                     bytes_committed;
  48        u32                     token;
  49        u8                      event_subtype;
  50        u8                      type;
  51        union {
  52                /* Initiator or send message responder pagefault details. */
  53                struct {
  54                        /* Received packet size, only valid for responders. */
  55                        u32     packet_size;
  56                        /*
   57                         * Number of the resource holding the WQE; its meaning depends on type.
  58                         */
  59                        u32     wq_num;
  60                        /*
  61                         * WQE index. Refers to either the send queue or
  62                         * receive queue, according to event_subtype.
  63                         */
  64                        u16     wqe_index;
  65                } wqe;
  66                /* RDMA responder pagefault details */
  67                struct {
  68                        u32     r_key;
  69                        /*
   70                         * Received packet size; resolving the fault for at least
   71                         * this many bytes is required for forward progress.
  72                         */
  73                        u32     packet_size;
  74                        u32     rdma_op_len;
  75                        u64     rdma_va;
  76                } rdma;
  77        };
  78
  79        struct mlx5_ib_pf_eq    *eq;
  80        struct work_struct      work;
  81};
  82
  83#define MAX_PREFETCH_LEN (4*1024*1024U)
  84
  85/* Timeout in ms to wait for an active mmu notifier to complete when handling
  86 * a pagefault. */
  87#define MMU_NOTIFIER_TIMEOUT 1000
  88
  89#define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT)
  90#define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT)
  91#define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS)
  92#define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT)
  93#define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1))
  94
  95#define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT
  96
  97static u64 mlx5_imr_ksm_entries;
  98
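/*
 * Fill the KLM/KSM entries describing an interval of an implicit MR. Child
 * MRs that are not instantiated (or that are being zapped) point at the
 * device null_mkey, leaving that range non-present so accesses fault.
 */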
  99static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
 100                        struct mlx5_ib_mr *imr, int flags)
 101{
 102        struct mlx5_klm *end = pklm + nentries;
 103
 104        if (flags & MLX5_IB_UPD_XLT_ZAP) {
 105                for (; pklm != end; pklm++, idx++) {
 106                        pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
 107                        pklm->key = cpu_to_be32(mr_to_mdev(imr)->null_mkey);
 108                        pklm->va = 0;
 109                }
 110                return;
 111        }
 112
 113        /*
 114         * The locking here is pretty subtle. Ideally the implicit_children
 115         * xarray would be protected by the umem_mutex, however that is not
 116         * possible. Instead this uses a weaker update-then-lock pattern:
 117         *
 118         *    xa_store()
 119         *    mutex_lock(umem_mutex)
 120         *     mlx5_ib_update_xlt()
 121         *    mutex_unlock(umem_mutex)
 122         *    destroy lkey
 123         *
  124         * i.e. any change to the xarray must be followed by the locked update_xlt
 125         * before destroying.
 126         *
 127         * The umem_mutex provides the acquire/release semantic needed to make
 128         * the xa_store() visible to a racing thread.
 129         */
 130        lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex);
 131
 132        for (; pklm != end; pklm++, idx++) {
 133                struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx);
 134
 135                pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
 136                if (mtt) {
 137                        pklm->key = cpu_to_be32(mtt->ibmr.lkey);
 138                        pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE);
 139                } else {
 140                        pklm->key = cpu_to_be32(mr_to_mdev(imr)->null_mkey);
 141                        pklm->va = 0;
 142                }
 143        }
 144}
 145
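/*
 * Convert an ODP DMA address, which carries the R/W permission flags in its
 * low bits, into an MTT entry with the matching MLX5_IB_MTT_* access bits.
 */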
 146static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
 147{
 148        u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;
 149
 150        if (umem_dma & ODP_READ_ALLOWED_BIT)
 151                mtt_entry |= MLX5_IB_MTT_READ;
 152        if (umem_dma & ODP_WRITE_ALLOWED_BIT)
 153                mtt_entry |= MLX5_IB_MTT_WRITE;
 154
 155        return mtt_entry;
 156}
 157
 158static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
 159                         struct mlx5_ib_mr *mr, int flags)
 160{
 161        struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
 162        dma_addr_t pa;
 163        size_t i;
 164
 165        if (flags & MLX5_IB_UPD_XLT_ZAP)
 166                return;
 167
 168        for (i = 0; i < nentries; i++) {
 169                pa = odp->dma_list[idx + i];
 170                pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
 171        }
 172}
 173
 174void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
 175                           struct mlx5_ib_mr *mr, int flags)
 176{
 177        if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
 178                populate_klm(xlt, idx, nentries, mr, flags);
 179        } else {
 180                populate_mtt(xlt, idx, nentries, mr, flags);
 181        }
 182}
 183
 184/*
 185 * This must be called after the mr has been removed from implicit_children.
  186 * NOTE: The MR is not necessarily empty here; parallel page
  187 * faults could have raced with the free process and added
  188 * pages to it.
 189 */
 190static void free_implicit_child_mr_work(struct work_struct *work)
 191{
 192        struct mlx5_ib_mr *mr =
 193                container_of(work, struct mlx5_ib_mr, odp_destroy.work);
 194        struct mlx5_ib_mr *imr = mr->parent;
 195        struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
 196        struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
 197
 198        mlx5r_deref_wait_odp_mkey(&mr->mmkey);
 199
 200        mutex_lock(&odp_imr->umem_mutex);
 201        mlx5_ib_update_xlt(mr->parent, ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT,
 202                           1, 0,
 203                           MLX5_IB_UPD_XLT_INDIRECT | MLX5_IB_UPD_XLT_ATOMIC);
 204        mutex_unlock(&odp_imr->umem_mutex);
 205        mlx5_ib_dereg_mr(&mr->ibmr, NULL);
 206
 207        mlx5r_deref_odp_mkey(&imr->mmkey);
 208}
 209
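/*
 * Called from the invalidation path with the child's umem_mutex held once
 * the child umem has no pages left. The child is unhooked from the parent's
 * implicit_children here; the sleeping destruction is deferred to a work
 * queue.
 */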
 210static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
 211{
 212        struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
 213        unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
 214        struct mlx5_ib_mr *imr = mr->parent;
 215
 216        if (!refcount_inc_not_zero(&imr->mmkey.usecount))
 217                return;
 218
 219        xa_erase(&imr->implicit_children, idx);
 220
  221        /* Freeing an MR is a sleeping operation, so bounce to a work queue */
 222        INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
 223        queue_work(system_unbound_wq, &mr->odp_destroy.work);
 224}
 225
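/*
 * mmu_interval_notifier callback. Zap the HW translation entries covering
 * the invalidated range (batched into UMR-aligned chunks), then unmap and
 * release the DMA-mapped pages.
 */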
 226static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
 227                                     const struct mmu_notifier_range *range,
 228                                     unsigned long cur_seq)
 229{
 230        struct ib_umem_odp *umem_odp =
 231                container_of(mni, struct ib_umem_odp, notifier);
 232        struct mlx5_ib_mr *mr;
 233        const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT /
 234                                    sizeof(struct mlx5_mtt)) - 1;
 235        u64 idx = 0, blk_start_idx = 0;
 236        u64 invalidations = 0;
 237        unsigned long start;
 238        unsigned long end;
 239        int in_block = 0;
 240        u64 addr;
 241
 242        if (!mmu_notifier_range_blockable(range))
 243                return false;
 244
 245        mutex_lock(&umem_odp->umem_mutex);
 246        mmu_interval_set_seq(mni, cur_seq);
 247        /*
  248         * If npages is zero then umem_odp->private may not be set up yet. This
 249         * does not complete until after the first page is mapped for DMA.
 250         */
 251        if (!umem_odp->npages)
 252                goto out;
 253        mr = umem_odp->private;
 254
 255        start = max_t(u64, ib_umem_start(umem_odp), range->start);
 256        end = min_t(u64, ib_umem_end(umem_odp), range->end);
 257
 258        /*
 259         * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
 260         * while we are doing the invalidation, no page fault will attempt to
  261         * overwrite the same MTTs.  Concurrent invalidations might race us,
 262         * but they will write 0s as well, so no difference in the end result.
 263         */
 264        for (addr = start; addr < end; addr += BIT(umem_odp->page_shift)) {
 265                idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
 266                /*
 267                 * Strive to write the MTTs in chunks, but avoid overwriting
  268                 * non-existing MTTs. The heuristic here can be improved to
  269                 * estimate the cost of another UMR vs. the cost of a bigger
 270                 * UMR.
 271                 */
 272                if (umem_odp->dma_list[idx] &
 273                    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
 274                        if (!in_block) {
 275                                blk_start_idx = idx;
 276                                in_block = 1;
 277                        }
 278
 279                        /* Count page invalidations */
 280                        invalidations += idx - blk_start_idx + 1;
 281                } else {
 282                        u64 umr_offset = idx & umr_block_mask;
 283
 284                        if (in_block && umr_offset == 0) {
 285                                mlx5_ib_update_xlt(mr, blk_start_idx,
 286                                                   idx - blk_start_idx, 0,
 287                                                   MLX5_IB_UPD_XLT_ZAP |
 288                                                   MLX5_IB_UPD_XLT_ATOMIC);
 289                                in_block = 0;
 290                        }
 291                }
 292        }
 293        if (in_block)
 294                mlx5_ib_update_xlt(mr, blk_start_idx,
 295                                   idx - blk_start_idx + 1, 0,
 296                                   MLX5_IB_UPD_XLT_ZAP |
 297                                   MLX5_IB_UPD_XLT_ATOMIC);
 298
 299        mlx5_update_odp_stats(mr, invalidations, invalidations);
 300
 301        /*
 302         * We are now sure that the device will not access the
 303         * memory. We can safely unmap it, and mark it as dirty if
 304         * needed.
 305         */
 306
 307        ib_umem_odp_unmap_dma_pages(umem_odp, start, end);
 308
 309        if (unlikely(!umem_odp->npages && mr->parent))
 310                destroy_unused_implicit_child_mr(mr);
 311out:
 312        mutex_unlock(&umem_odp->umem_mutex);
 313        return true;
 314}
 315
 316const struct mmu_interval_notifier_ops mlx5_mn_ops = {
 317        .invalidate = mlx5_ib_invalidate_range,
 318};
 319
 320static void internal_fill_odp_caps(struct mlx5_ib_dev *dev)
 321{
 322        struct ib_odp_caps *caps = &dev->odp_caps;
 323
 324        memset(caps, 0, sizeof(*caps));
 325
 326        if (!MLX5_CAP_GEN(dev->mdev, pg) ||
 327            !mlx5_ib_can_load_pas_with_umr(dev, 0))
 328                return;
 329
 330        caps->general_caps = IB_ODP_SUPPORT;
 331
 332        if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
 333                dev->odp_max_size = U64_MAX;
 334        else
 335                dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT);
 336
 337        if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send))
 338                caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;
 339
 340        if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.srq_receive))
 341                caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
 342
 343        if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send))
 344                caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;
 345
 346        if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive))
 347                caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV;
 348
 349        if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.write))
 350                caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE;
 351
 352        if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read))
 353                caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;
 354
 355        if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic))
 356                caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;
 357
 358        if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.srq_receive))
 359                caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
 360
 361        if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.send))
 362                caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SEND;
 363
 364        if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.receive))
 365                caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_RECV;
 366
 367        if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.write))
 368                caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_WRITE;
 369
 370        if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.read))
 371                caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_READ;
 372
 373        if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.atomic))
 374                caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;
 375
 376        if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.srq_receive))
 377                caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
 378
 379        if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
 380            MLX5_CAP_GEN(dev->mdev, null_mkey) &&
 381            MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) &&
 382            !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled))
 383                caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;
 384}
 385
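/*
 * Ask the device to resume the WQ that triggered the page fault. If error
 * is set the faulting operation is completed in error rather than retried.
 */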
 386static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
 387                                      struct mlx5_pagefault *pfault,
 388                                      int error)
 389{
 390        int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ?
 391                     pfault->wqe.wq_num : pfault->token;
 392        u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = {};
 393        int err;
 394
 395        MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME);
 396        MLX5_SET(page_fault_resume_in, in, page_fault_type, pfault->type);
 397        MLX5_SET(page_fault_resume_in, in, token, pfault->token);
 398        MLX5_SET(page_fault_resume_in, in, wq_number, wq_num);
 399        MLX5_SET(page_fault_resume_in, in, error, !!error);
 400
 401        err = mlx5_cmd_exec_in(dev->mdev, page_fault_resume, in);
 402        if (err)
 403                mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x err %d\n",
 404                            wq_num, err);
 405}
 406
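/*
 * Create the child MTT MR covering index idx of an implicit MR and publish
 * it in implicit_children. If another thread won the race to create it, the
 * existing child is returned instead. The returned MR carries an extra
 * reference for the caller.
 */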
 407static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
 408                                                unsigned long idx)
 409{
 410        struct ib_umem_odp *odp;
 411        struct mlx5_ib_mr *mr;
 412        struct mlx5_ib_mr *ret;
 413        int err;
 414
 415        odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem),
 416                                      idx * MLX5_IMR_MTT_SIZE,
 417                                      MLX5_IMR_MTT_SIZE, &mlx5_mn_ops);
 418        if (IS_ERR(odp))
 419                return ERR_CAST(odp);
 420
 421        mr = mlx5_mr_cache_alloc(
 422                mr_to_mdev(imr), MLX5_IMR_MTT_CACHE_ENTRY, imr->access_flags);
 423        if (IS_ERR(mr)) {
 424                ib_umem_odp_release(odp);
 425                return mr;
 426        }
 427
 428        mr->ibmr.pd = imr->ibmr.pd;
 429        mr->ibmr.device = &mr_to_mdev(imr)->ib_dev;
 430        mr->umem = &odp->umem;
 431        mr->ibmr.lkey = mr->mmkey.key;
 432        mr->ibmr.rkey = mr->mmkey.key;
 433        mr->mmkey.iova = idx * MLX5_IMR_MTT_SIZE;
 434        mr->parent = imr;
 435        odp->private = mr;
 436
 437        /*
  438         * The first refcount is owned by the xarray and the second refcount
 439         * is returned to the caller.
 440         */
 441        refcount_set(&mr->mmkey.usecount, 2);
 442
 443        err = mlx5_ib_update_xlt(mr, 0,
 444                                 MLX5_IMR_MTT_ENTRIES,
 445                                 PAGE_SHIFT,
 446                                 MLX5_IB_UPD_XLT_ZAP |
 447                                 MLX5_IB_UPD_XLT_ENABLE);
 448        if (err) {
 449                ret = ERR_PTR(err);
 450                goto out_mr;
 451        }
 452
 453        xa_lock(&imr->implicit_children);
 454        ret = __xa_cmpxchg(&imr->implicit_children, idx, NULL, mr,
 455                           GFP_KERNEL);
 456        if (unlikely(ret)) {
 457                if (xa_is_err(ret)) {
 458                        ret = ERR_PTR(xa_err(ret));
 459                        goto out_lock;
 460                }
 461                /*
 462                 * Another thread beat us to creating the child mr, use
 463                 * theirs.
 464                 */
 465                refcount_inc(&ret->mmkey.usecount);
 466                goto out_lock;
 467        }
 468        xa_unlock(&imr->implicit_children);
 469
 470        mlx5_ib_dbg(mr_to_mdev(imr), "key %x mr %p\n", mr->mmkey.key, mr);
 471        return mr;
 472
 473out_lock:
 474        xa_unlock(&imr->implicit_children);
 475out_mr:
 476        mlx5_ib_dereg_mr(&mr->ibmr, NULL);
 477        return ret;
 478}
 479
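/*
 * Create an implicit ODP MR spanning the whole process address space,
 * backed by a KSM whose entries map 1GB child MTT MRs that are instantiated
 * lazily by the page fault handler.
 */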
 480struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 481                                             int access_flags)
 482{
 483        struct mlx5_ib_dev *dev = to_mdev(pd->ibpd.device);
 484        struct ib_umem_odp *umem_odp;
 485        struct mlx5_ib_mr *imr;
 486        int err;
 487
 488        if (!mlx5_ib_can_load_pas_with_umr(dev,
 489                                           MLX5_IMR_MTT_ENTRIES * PAGE_SIZE))
 490                return ERR_PTR(-EOPNOTSUPP);
 491
 492        umem_odp = ib_umem_odp_alloc_implicit(&dev->ib_dev, access_flags);
 493        if (IS_ERR(umem_odp))
 494                return ERR_CAST(umem_odp);
 495
 496        imr = mlx5_mr_cache_alloc(dev, MLX5_IMR_KSM_CACHE_ENTRY, access_flags);
 497        if (IS_ERR(imr)) {
 498                ib_umem_odp_release(umem_odp);
 499                return imr;
 500        }
 501
 502        imr->ibmr.pd = &pd->ibpd;
 503        imr->mmkey.iova = 0;
 504        imr->umem = &umem_odp->umem;
 505        imr->ibmr.lkey = imr->mmkey.key;
 506        imr->ibmr.rkey = imr->mmkey.key;
 507        imr->ibmr.device = &dev->ib_dev;
 509        imr->is_odp_implicit = true;
 510        xa_init(&imr->implicit_children);
 511
 512        err = mlx5_ib_update_xlt(imr, 0,
 513                                 mlx5_imr_ksm_entries,
 514                                 MLX5_KSM_PAGE_SHIFT,
 515                                 MLX5_IB_UPD_XLT_INDIRECT |
 516                                 MLX5_IB_UPD_XLT_ZAP |
 517                                 MLX5_IB_UPD_XLT_ENABLE);
 518        if (err)
 519                goto out_mr;
 520
 521        err = mlx5r_store_odp_mkey(dev, &imr->mmkey);
 522        if (err)
 523                goto out_mr;
 524
 525        mlx5_ib_dbg(dev, "key %x mr %p\n", imr->mmkey.key, imr);
 526        return imr;
 527out_mr:
 528        mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
 529        mlx5_ib_dereg_mr(&imr->ibmr, NULL);
 530        return ERR_PTR(err);
 531}
 532
 533void mlx5_ib_free_odp_mr(struct mlx5_ib_mr *mr)
 534{
 535        struct mlx5_ib_mr *mtt;
 536        unsigned long idx;
 537
 538        /*
  539         * If this is an implicit MR it is already invalidated, so we can just
  540         * delete the child mkeys.
 541         */
 542        xa_for_each(&mr->implicit_children, idx, mtt) {
 543                xa_erase(&mr->implicit_children, idx);
 544                mlx5_ib_dereg_mr(&mtt->ibmr, NULL);
 545        }
 546}
 547
 548#define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
 549#define MLX5_PF_FLAGS_SNAPSHOT BIT(2)
 550#define MLX5_PF_FLAGS_ENABLE BIT(3)
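/*
 * Resolve a page fault on a leaf ODP MR: fault/map the user pages and push
 * the updated MTTs to the device. Returns the number of system pages
 * mapped, or a negative errno.
 */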
 551static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
 552                             u64 user_va, size_t bcnt, u32 *bytes_mapped,
 553                             u32 flags)
 554{
 555        int page_shift, ret, np;
 556        bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
 557        u64 access_mask;
 558        u64 start_idx;
 559        bool fault = !(flags & MLX5_PF_FLAGS_SNAPSHOT);
 560        u32 xlt_flags = MLX5_IB_UPD_XLT_ATOMIC;
 561
 562        if (flags & MLX5_PF_FLAGS_ENABLE)
 563                xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;
 564
 565        page_shift = odp->page_shift;
 566        start_idx = (user_va - ib_umem_start(odp)) >> page_shift;
 567        access_mask = ODP_READ_ALLOWED_BIT;
 568
 569        if (odp->umem.writable && !downgrade)
 570                access_mask |= ODP_WRITE_ALLOWED_BIT;
 571
 572        np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask, fault);
 573        if (np < 0)
 574                return np;
 575
 576        /*
 577         * No need to check whether the MTTs really belong to this MR, since
 578         * ib_umem_odp_map_dma_and_lock already checks this.
 579         */
 580        ret = mlx5_ib_update_xlt(mr, start_idx, np, page_shift, xlt_flags);
 581        mutex_unlock(&odp->umem_mutex);
 582
 583        if (ret < 0) {
 584                if (ret != -EAGAIN)
 585                        mlx5_ib_err(mr_to_mdev(mr),
 586                                    "Failed to update mkey page tables\n");
 587                goto out;
 588        }
 589
 590        if (bytes_mapped) {
 591                u32 new_mappings = (np << page_shift) -
 592                        (user_va - round_down(user_va, 1 << page_shift));
 593
 594                *bytes_mapped += min_t(u32, new_mappings, bcnt);
 595        }
 596
 597        return np << (page_shift - PAGE_SHIFT);
 598
 599out:
 600        return ret;
 601}
 602
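/*
 * Resolve a page fault on an implicit MR by faulting every child MR that
 * intersects the requested range, creating children on demand. If any child
 * was created, the parent's KSM is updated before returning.
 */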
 603static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
 604                                 struct ib_umem_odp *odp_imr, u64 user_va,
 605                                 size_t bcnt, u32 *bytes_mapped, u32 flags)
 606{
 607        unsigned long end_idx = (user_va + bcnt - 1) >> MLX5_IMR_MTT_SHIFT;
 608        unsigned long upd_start_idx = end_idx + 1;
 609        unsigned long upd_len = 0;
 610        unsigned long npages = 0;
 611        int err;
 612        int ret;
 613
 614        if (unlikely(user_va >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE ||
 615                     mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - user_va < bcnt))
 616                return -EFAULT;
 617
 618        /* Fault each child mr that intersects with our interval. */
 619        while (bcnt) {
 620                unsigned long idx = user_va >> MLX5_IMR_MTT_SHIFT;
 621                struct ib_umem_odp *umem_odp;
 622                struct mlx5_ib_mr *mtt;
 623                u64 len;
 624
 625                xa_lock(&imr->implicit_children);
 626                mtt = xa_load(&imr->implicit_children, idx);
 627                if (unlikely(!mtt)) {
 628                        xa_unlock(&imr->implicit_children);
 629                        mtt = implicit_get_child_mr(imr, idx);
 630                        if (IS_ERR(mtt)) {
 631                                ret = PTR_ERR(mtt);
 632                                goto out;
 633                        }
 634                        upd_start_idx = min(upd_start_idx, idx);
 635                        upd_len = idx - upd_start_idx + 1;
 636                } else {
 637                        refcount_inc(&mtt->mmkey.usecount);
 638                        xa_unlock(&imr->implicit_children);
 639                }
 640
 641                umem_odp = to_ib_umem_odp(mtt->umem);
 642                len = min_t(u64, user_va + bcnt, ib_umem_end(umem_odp)) -
 643                      user_va;
 644
 645                ret = pagefault_real_mr(mtt, umem_odp, user_va, len,
 646                                        bytes_mapped, flags);
 647
 648                mlx5r_deref_odp_mkey(&mtt->mmkey);
 649
 650                if (ret < 0)
 651                        goto out;
 652                user_va += len;
 653                bcnt -= len;
 654                npages += ret;
 655        }
 656
 657        ret = npages;
 658
 659        /*
 660         * Any time the implicit_children are changed we must perform an
 661         * update of the xlt before exiting to ensure the HW and the
  662         * implicit_children remain synchronized.
 663         */
 664out:
 665        if (likely(!upd_len))
 666                return ret;
 667
 668        /*
  669         * Notice this is not strictly ordered correctly: the KSM is updated
  670         * after the implicit_children xarray, so a parallel page fault could
  671         * see an MR that is not yet visible in the KSM.  This is similar to a
  672         * parallel page fault seeing an MR that is being concurrently removed
 673         * from the KSM. Both of these improbable situations are resolved
 674         * safely by resuming the HW and then taking another page fault. The
 675         * next pagefault handler will see the new information.
 676         */
 677        mutex_lock(&odp_imr->umem_mutex);
 678        err = mlx5_ib_update_xlt(imr, upd_start_idx, upd_len, 0,
 679                                 MLX5_IB_UPD_XLT_INDIRECT |
 680                                         MLX5_IB_UPD_XLT_ATOMIC);
 681        mutex_unlock(&odp_imr->umem_mutex);
 682        if (err) {
 683                mlx5_ib_err(mr_to_mdev(imr), "Failed to update PAS\n");
 684                return err;
 685        }
 686        return ret;
 687}
 688
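/*
 * (Re)map a dma-buf backed MR under its dma_resv lock and update the MR's
 * page tables. The whole umem is mapped; bcnt is only used to report
 * bytes_mapped.
 */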
 689static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt,
 690                               u32 *bytes_mapped, u32 flags)
 691{
 692        struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
 693        u32 xlt_flags = 0;
 694        int err;
 695        unsigned int page_size;
 696
 697        if (flags & MLX5_PF_FLAGS_ENABLE)
 698                xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;
 699
 700        dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
 701        err = ib_umem_dmabuf_map_pages(umem_dmabuf);
 702        if (err) {
 703                dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
 704                return err;
 705        }
 706
 707        page_size = mlx5_umem_find_best_pgsz(&umem_dmabuf->umem, mkc,
 708                                             log_page_size, 0,
 709                                             umem_dmabuf->umem.iova);
 710        if (unlikely(page_size < PAGE_SIZE)) {
 711                ib_umem_dmabuf_unmap_pages(umem_dmabuf);
 712                err = -EINVAL;
 713        } else {
 714                err = mlx5_ib_update_mr_pas(mr, xlt_flags);
 715        }
 716        dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
 717
 718        if (err)
 719                return err;
 720
 721        if (bytes_mapped)
 722                *bytes_mapped += bcnt;
 723
 724        return ib_umem_num_pages(mr->umem);
 725}
 726
 727/*
 728 * Returns:
 729 *  -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are
 730 *           not accessible, or the MR is no longer valid.
 731 *  -EAGAIN/-ENOMEM: The operation should be retried
 732 *
 733 *  -EINVAL/others: General internal malfunction
 734 *  >0: Number of pages mapped
 735 */
 736static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
 737                        u32 *bytes_mapped, u32 flags)
 738{
 739        struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
 740
 741        if (unlikely(io_virt < mr->mmkey.iova))
 742                return -EFAULT;
 743
 744        if (mr->umem->is_dmabuf)
 745                return pagefault_dmabuf_mr(mr, bcnt, bytes_mapped, flags);
 746
 747        if (!odp->is_implicit_odp) {
 748                u64 user_va;
 749
 750                if (check_add_overflow(io_virt - mr->mmkey.iova,
 751                                       (u64)odp->umem.address, &user_va))
 752                        return -EFAULT;
 753                if (unlikely(user_va >= ib_umem_end(odp) ||
 754                             ib_umem_end(odp) - user_va < bcnt))
 755                        return -EFAULT;
 756                return pagefault_real_mr(mr, odp, user_va, bcnt, bytes_mapped,
 757                                         flags);
 758        }
 759        return pagefault_implicit_mr(mr, odp, io_virt, bcnt, bytes_mapped,
 760                                     flags);
 761}
 762
 763int mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr)
 764{
 765        int ret;
 766
 767        ret = pagefault_real_mr(mr, to_ib_umem_odp(mr->umem), mr->umem->address,
 768                                mr->umem->length, NULL,
 769                                MLX5_PF_FLAGS_SNAPSHOT | MLX5_PF_FLAGS_ENABLE);
 770        return ret >= 0 ? 0 : ret;
 771}
 772
 773int mlx5_ib_init_dmabuf_mr(struct mlx5_ib_mr *mr)
 774{
 775        int ret;
 776
 777        ret = pagefault_dmabuf_mr(mr, mr->umem->length, NULL,
 778                                  MLX5_PF_FLAGS_ENABLE);
 779
 780        return ret >= 0 ? 0 : ret;
 781}
 782
 783struct pf_frame {
 784        struct pf_frame *next;
 785        u32 key;
 786        u64 io_virt;
 787        size_t bcnt;
 788        int depth;
 789};
 790
 791static bool mkey_is_eq(struct mlx5_core_mkey *mmkey, u32 key)
 792{
 793        if (!mmkey)
 794                return false;
 795        if (mmkey->type == MLX5_MKEY_MW)
 796                return mlx5_base_mkey(mmkey->key) == mlx5_base_mkey(key);
 797        return mmkey->key == key;
 798}
 799
 800static int get_indirect_num_descs(struct mlx5_core_mkey *mmkey)
 801{
 802        struct mlx5_ib_mw *mw;
 803        struct mlx5_ib_devx_mr *devx_mr;
 804
 805        if (mmkey->type == MLX5_MKEY_MW) {
 806                mw = container_of(mmkey, struct mlx5_ib_mw, mmkey);
 807                return mw->ndescs;
 808        }
 809
 810        devx_mr = container_of(mmkey, struct mlx5_ib_devx_mr,
 811                               mmkey);
 812        return devx_mr->ndescs;
 813}
 814
 815/*
 816 * Handle a single data segment in a page-fault WQE or RDMA region.
 817 *
 818 * Returns number of OS pages retrieved on success. The caller may continue to
 819 * the next data segment.
 820 * Can return the following error codes:
 821 * -EAGAIN to designate a temporary error. The caller will abort handling the
 822 *  page fault and resolve it.
 823 * -EFAULT when there's an error mapping the requested pages. The caller will
 824 *  abort the page fault handling.
 825 */
 826static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
 827                                         struct ib_pd *pd, u32 key,
 828                                         u64 io_virt, size_t bcnt,
 829                                         u32 *bytes_committed,
 830                                         u32 *bytes_mapped)
 831{
 832        int npages = 0, ret, i, outlen, cur_outlen = 0, depth = 0;
 833        struct pf_frame *head = NULL, *frame;
 834        struct mlx5_core_mkey *mmkey;
 835        struct mlx5_ib_mr *mr;
 836        struct mlx5_klm *pklm;
 837        u32 *out = NULL;
 838        size_t offset;
 839        int ndescs;
 840
 841        io_virt += *bytes_committed;
 842        bcnt -= *bytes_committed;
 843
 844next_mr:
 845        xa_lock(&dev->odp_mkeys);
 846        mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key));
 847        if (!mmkey) {
 848                xa_unlock(&dev->odp_mkeys);
 849                mlx5_ib_dbg(
 850                        dev,
 851                        "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
 852                        key);
 853                if (bytes_mapped)
 854                        *bytes_mapped += bcnt;
 855                /*
  856                  * The user could specify an SGL with multiple lkeys, only
  857                  * some of which are ODP. Treat the non-ODP ones as fully
 858                 * faulted.
 859                 */
 860                ret = 0;
 861                goto end;
 862        }
 863        refcount_inc(&mmkey->usecount);
 864        xa_unlock(&dev->odp_mkeys);
 865
 866        if (!mkey_is_eq(mmkey, key)) {
 867                mlx5_ib_dbg(dev, "failed to find mkey %x\n", key);
 868                ret = -EFAULT;
 869                goto end;
 870        }
 871
 872        switch (mmkey->type) {
 873        case MLX5_MKEY_MR:
 874                mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
 875
 876                ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0);
 877                if (ret < 0)
 878                        goto end;
 879
 880                mlx5_update_odp_stats(mr, faults, ret);
 881
 882                npages += ret;
 883                ret = 0;
 884                break;
 885
 886        case MLX5_MKEY_MW:
 887        case MLX5_MKEY_INDIRECT_DEVX:
 888                ndescs = get_indirect_num_descs(mmkey);
 889
 890                if (depth >= MLX5_CAP_GEN(dev->mdev, max_indirection)) {
 891                        mlx5_ib_dbg(dev, "indirection level exceeded\n");
 892                        ret = -EFAULT;
 893                        goto end;
 894                }
 895
 896                outlen = MLX5_ST_SZ_BYTES(query_mkey_out) +
 897                        sizeof(*pklm) * (ndescs - 2);
 898
 899                if (outlen > cur_outlen) {
 900                        kfree(out);
 901                        out = kzalloc(outlen, GFP_KERNEL);
 902                        if (!out) {
 903                                ret = -ENOMEM;
 904                                goto end;
 905                        }
 906                        cur_outlen = outlen;
 907                }
 908
 909                pklm = (struct mlx5_klm *)MLX5_ADDR_OF(query_mkey_out, out,
 910                                                       bsf0_klm0_pas_mtt0_1);
 911
 912                ret = mlx5_core_query_mkey(dev->mdev, mmkey, out, outlen);
 913                if (ret)
 914                        goto end;
 915
 916                offset = io_virt - MLX5_GET64(query_mkey_out, out,
 917                                              memory_key_mkey_entry.start_addr);
 918
 919                for (i = 0; bcnt && i < ndescs; i++, pklm++) {
 920                        if (offset >= be32_to_cpu(pklm->bcount)) {
 921                                offset -= be32_to_cpu(pklm->bcount);
 922                                continue;
 923                        }
 924
 925                        frame = kzalloc(sizeof(*frame), GFP_KERNEL);
 926                        if (!frame) {
 927                                ret = -ENOMEM;
 928                                goto end;
 929                        }
 930
 931                        frame->key = be32_to_cpu(pklm->key);
 932                        frame->io_virt = be64_to_cpu(pklm->va) + offset;
 933                        frame->bcnt = min_t(size_t, bcnt,
 934                                            be32_to_cpu(pklm->bcount) - offset);
 935                        frame->depth = depth + 1;
 936                        frame->next = head;
 937                        head = frame;
 938
 939                        bcnt -= frame->bcnt;
 940                        offset = 0;
 941                }
 942                break;
 943
 944        default:
 945                mlx5_ib_dbg(dev, "wrong mkey type %d\n", mmkey->type);
 946                ret = -EFAULT;
 947                goto end;
 948        }
 949
 950        if (head) {
 951                frame = head;
 952                head = frame->next;
 953
 954                key = frame->key;
 955                io_virt = frame->io_virt;
 956                bcnt = frame->bcnt;
 957                depth = frame->depth;
 958                kfree(frame);
 959
 960                mlx5r_deref_odp_mkey(mmkey);
 961                goto next_mr;
 962        }
 963
 964end:
 965        if (mmkey)
 966                mlx5r_deref_odp_mkey(mmkey);
 967        while (head) {
 968                frame = head;
 969                head = frame->next;
 970                kfree(frame);
 971        }
 972        kfree(out);
 973
 974        *bytes_committed = 0;
 975        return ret ? ret : npages;
 976}
 977
 978/*
 979 * Parse a series of data segments for page fault handling.
 980 *
 981 * @dev:  Pointer to mlx5 IB device
 982 * @pfault: contains page fault information.
 983 * @wqe: points at the first data segment in the WQE.
 984 * @wqe_end: points after the end of the WQE.
 985 * @bytes_mapped: receives the number of bytes that the function was able to
 986 *                map. This allows the caller to decide intelligently whether
 987 *                enough memory was mapped to resolve the page fault
 988 *                successfully (e.g. enough for the next MTU, or the entire
 989 *                WQE).
 990 * @total_wqe_bytes: receives the total data size of this WQE in bytes (minus
 991 *                   the committed bytes).
  992 * @receive_queue: true if this WQE belongs to a receive queue.
 993 *
 994 * Returns the number of pages loaded if positive, zero for an empty WQE, or a
 995 * negative error code.
 996 */
 997static int pagefault_data_segments(struct mlx5_ib_dev *dev,
 998                                   struct mlx5_pagefault *pfault,
 999                                   void *wqe,
1000                                   void *wqe_end, u32 *bytes_mapped,
1001                                   u32 *total_wqe_bytes, bool receive_queue)
1002{
1003        int ret = 0, npages = 0;
1004        u64 io_virt;
1005        u32 key;
1006        u32 byte_count;
1007        size_t bcnt;
1008        int inline_segment;
1009
1010        if (bytes_mapped)
1011                *bytes_mapped = 0;
1012        if (total_wqe_bytes)
1013                *total_wqe_bytes = 0;
1014
1015        while (wqe < wqe_end) {
1016                struct mlx5_wqe_data_seg *dseg = wqe;
1017
1018                io_virt = be64_to_cpu(dseg->addr);
1019                key = be32_to_cpu(dseg->lkey);
1020                byte_count = be32_to_cpu(dseg->byte_count);
1021                inline_segment = !!(byte_count &  MLX5_INLINE_SEG);
1022                bcnt           = byte_count & ~MLX5_INLINE_SEG;
1023
1024                if (inline_segment) {
1025                        bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK;
1026                        wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt,
1027                                     16);
1028                } else {
1029                        wqe += sizeof(*dseg);
1030                }
1031
1032                /* receive WQE end of sg list. */
1033                if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY &&
1034                    io_virt == 0)
1035                        break;
1036
1037                if (!inline_segment && total_wqe_bytes) {
1038                        *total_wqe_bytes += bcnt - min_t(size_t, bcnt,
1039                                        pfault->bytes_committed);
1040                }
1041
1042                /* A zero length data segment designates a length of 2GB. */
1043                if (bcnt == 0)
1044                        bcnt = 1U << 31;
1045
1046                if (inline_segment || bcnt <= pfault->bytes_committed) {
1047                        pfault->bytes_committed -=
1048                                min_t(size_t, bcnt,
1049                                      pfault->bytes_committed);
1050                        continue;
1051                }
1052
1053                ret = pagefault_single_data_segment(dev, NULL, key,
1054                                                    io_virt, bcnt,
1055                                                    &pfault->bytes_committed,
1056                                                    bytes_mapped);
1057                if (ret < 0)
1058                        break;
1059                npages += ret;
1060        }
1061
1062        return ret < 0 ? ret : npages;
1063}
1064
1065/*
 1066 * Parse an initiator WQE: advance the wqe pointer to point at the
 1067 * scatter-gather list, and set wqe_end to the end of the WQE.
1068 */
1069static int mlx5_ib_mr_initiator_pfault_handler(
1070        struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
1071        struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
1072{
1073        struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
1074        u16 wqe_index = pfault->wqe.wqe_index;
1075        struct mlx5_base_av *av;
1076        unsigned ds, opcode;
1077        u32 qpn = qp->trans_qp.base.mqp.qpn;
1078
1079        ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
1080        if (ds * MLX5_WQE_DS_UNITS > wqe_length) {
1081                mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n",
1082                            ds, wqe_length);
1083                return -EFAULT;
1084        }
1085
1086        if (ds == 0) {
1087                mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n",
1088                            wqe_index, qpn);
1089                return -EFAULT;
1090        }
1091
1092        *wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS;
1093        *wqe += sizeof(*ctrl);
1094
1095        opcode = be32_to_cpu(ctrl->opmod_idx_opcode) &
1096                 MLX5_WQE_CTRL_OPCODE_MASK;
1097
1098        if (qp->type == IB_QPT_XRC_INI)
1099                *wqe += sizeof(struct mlx5_wqe_xrc_seg);
1100
1101        if (qp->type == IB_QPT_UD || qp->type == MLX5_IB_QPT_DCI) {
1102                av = *wqe;
1103                if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV))
1104                        *wqe += sizeof(struct mlx5_av);
1105                else
1106                        *wqe += sizeof(struct mlx5_base_av);
1107        }
1108
1109        switch (opcode) {
1110        case MLX5_OPCODE_RDMA_WRITE:
1111        case MLX5_OPCODE_RDMA_WRITE_IMM:
1112        case MLX5_OPCODE_RDMA_READ:
1113                *wqe += sizeof(struct mlx5_wqe_raddr_seg);
1114                break;
1115        case MLX5_OPCODE_ATOMIC_CS:
1116        case MLX5_OPCODE_ATOMIC_FA:
1117                *wqe += sizeof(struct mlx5_wqe_raddr_seg);
1118                *wqe += sizeof(struct mlx5_wqe_atomic_seg);
1119                break;
1120        }
1121
1122        return 0;
1123}
1124
1125/*
1126 * Parse responder WQE and set wqe_end to the end of the WQE.
1127 */
1128static int mlx5_ib_mr_responder_pfault_handler_srq(struct mlx5_ib_dev *dev,
1129                                                   struct mlx5_ib_srq *srq,
1130                                                   void **wqe, void **wqe_end,
1131                                                   int wqe_length)
1132{
1133        int wqe_size = 1 << srq->msrq.wqe_shift;
1134
1135        if (wqe_size > wqe_length) {
1136                mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
1137                return -EFAULT;
1138        }
1139
1140        *wqe_end = *wqe + wqe_size;
1141        *wqe += sizeof(struct mlx5_wqe_srq_next_seg);
1142
1143        return 0;
1144}
1145
1146static int mlx5_ib_mr_responder_pfault_handler_rq(struct mlx5_ib_dev *dev,
1147                                                  struct mlx5_ib_qp *qp,
1148                                                  void *wqe, void **wqe_end,
1149                                                  int wqe_length)
1150{
1151        struct mlx5_ib_wq *wq = &qp->rq;
1152        int wqe_size = 1 << wq->wqe_shift;
1153
1154        if (qp->flags_en & MLX5_QP_FLAG_SIGNATURE) {
1155                mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n");
1156                return -EFAULT;
1157        }
1158
1159        if (wqe_size > wqe_length) {
1160                mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
1161                return -EFAULT;
1162        }
1163
1164        *wqe_end = wqe + wqe_size;
1165
1166        return 0;
1167}
1168
1169static inline struct mlx5_core_rsc_common *odp_get_rsc(struct mlx5_ib_dev *dev,
1170                                                       u32 wq_num, int pf_type)
1171{
1172        struct mlx5_core_rsc_common *common = NULL;
1173        struct mlx5_core_srq *srq;
1174
1175        switch (pf_type) {
1176        case MLX5_WQE_PF_TYPE_RMP:
1177                srq = mlx5_cmd_get_srq(dev, wq_num);
1178                if (srq)
1179                        common = &srq->common;
1180                break;
1181        case MLX5_WQE_PF_TYPE_REQ_SEND_OR_WRITE:
1182        case MLX5_WQE_PF_TYPE_RESP:
1183        case MLX5_WQE_PF_TYPE_REQ_READ_OR_ATOMIC:
1184                common = mlx5_core_res_hold(dev, wq_num, MLX5_RES_QP);
1185                break;
1186        default:
1187                break;
1188        }
1189
1190        return common;
1191}
1192
1193static inline struct mlx5_ib_qp *res_to_qp(struct mlx5_core_rsc_common *res)
1194{
1195        struct mlx5_core_qp *mqp = (struct mlx5_core_qp *)res;
1196
1197        return to_mibqp(mqp);
1198}
1199
1200static inline struct mlx5_ib_srq *res_to_srq(struct mlx5_core_rsc_common *res)
1201{
1202        struct mlx5_core_srq *msrq =
1203                container_of(res, struct mlx5_core_srq, common);
1204
1205        return to_mibsrq(msrq);
1206}
1207
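/*
 * Handle a WQE page fault: look up the QP or SRQ, copy the faulting WQE
 * from user space, parse its data segments and fault in the memory they
 * reference, then resume the hardware (with an error if resolution failed).
 */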
1208static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
1209                                          struct mlx5_pagefault *pfault)
1210{
1211        bool sq = pfault->type & MLX5_PFAULT_REQUESTOR;
1212        u16 wqe_index = pfault->wqe.wqe_index;
1213        void *wqe, *wqe_start = NULL, *wqe_end = NULL;
1214        u32 bytes_mapped, total_wqe_bytes;
1215        struct mlx5_core_rsc_common *res;
1216        int resume_with_error = 1;
1217        struct mlx5_ib_qp *qp;
1218        size_t bytes_copied;
1219        int ret = 0;
1220
1221        res = odp_get_rsc(dev, pfault->wqe.wq_num, pfault->type);
1222        if (!res) {
1223                mlx5_ib_dbg(dev, "wqe page fault for missing resource %d\n", pfault->wqe.wq_num);
1224                return;
1225        }
1226
1227        if (res->res != MLX5_RES_QP && res->res != MLX5_RES_SRQ &&
1228            res->res != MLX5_RES_XSRQ) {
1229                mlx5_ib_err(dev, "wqe page fault for unsupported type %d\n",
1230                            pfault->type);
1231                goto resolve_page_fault;
1232        }
1233
1234        wqe_start = (void *)__get_free_page(GFP_KERNEL);
1235        if (!wqe_start) {
1236                mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
1237                goto resolve_page_fault;
1238        }
1239
1240        wqe = wqe_start;
1241        qp = (res->res == MLX5_RES_QP) ? res_to_qp(res) : NULL;
1242        if (qp && sq) {
1243                ret = mlx5_ib_read_wqe_sq(qp, wqe_index, wqe, PAGE_SIZE,
1244                                          &bytes_copied);
1245                if (ret)
1246                        goto read_user;
1247                ret = mlx5_ib_mr_initiator_pfault_handler(
1248                        dev, pfault, qp, &wqe, &wqe_end, bytes_copied);
1249        } else if (qp && !sq) {
1250                ret = mlx5_ib_read_wqe_rq(qp, wqe_index, wqe, PAGE_SIZE,
1251                                          &bytes_copied);
1252                if (ret)
1253                        goto read_user;
1254                ret = mlx5_ib_mr_responder_pfault_handler_rq(
1255                        dev, qp, wqe, &wqe_end, bytes_copied);
1256        } else if (!qp) {
1257                struct mlx5_ib_srq *srq = res_to_srq(res);
1258
1259                ret = mlx5_ib_read_wqe_srq(srq, wqe_index, wqe, PAGE_SIZE,
1260                                           &bytes_copied);
1261                if (ret)
1262                        goto read_user;
1263                ret = mlx5_ib_mr_responder_pfault_handler_srq(
1264                        dev, srq, &wqe, &wqe_end, bytes_copied);
1265        }
1266
1267        if (ret < 0 || wqe >= wqe_end)
1268                goto resolve_page_fault;
1269
1270        ret = pagefault_data_segments(dev, pfault, wqe, wqe_end, &bytes_mapped,
1271                                      &total_wqe_bytes, !sq);
1272        if (ret == -EAGAIN)
1273                goto out;
1274
1275        if (ret < 0 || total_wqe_bytes > bytes_mapped)
1276                goto resolve_page_fault;
1277
1278out:
1279        ret = 0;
1280        resume_with_error = 0;
1281
1282read_user:
1283        if (ret)
1284                mlx5_ib_err(
1285                        dev,
1286                        "Failed reading a WQE following page fault, error %d, wqe_index %x, qpn %x\n",
1287                        ret, wqe_index, pfault->token);
1288
1289resolve_page_fault:
1290        mlx5_ib_page_fault_resume(dev, pfault, resume_with_error);
1291        mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n",
1292                    pfault->wqe.wq_num, resume_with_error,
1293                    pfault->type);
1294        mlx5_core_res_put(res);
1295        free_page((unsigned long)wqe_start);
1296}
1297
1298static int pages_in_range(u64 address, u32 length)
1299{
1300        return (ALIGN(address + length, PAGE_SIZE) -
1301                (address & PAGE_MASK)) >> PAGE_SHIFT;
1302}
1303
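/*
 * Handle an RDMA responder page fault: fault in the pages needed for the
 * current packet, resume the QP, and optionally prefetch further pages of
 * the RDMA operation.
 */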
1304static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
1305                                           struct mlx5_pagefault *pfault)
1306{
1307        u64 address;
1308        u32 length;
1309        u32 prefetch_len = pfault->bytes_committed;
1310        int prefetch_activated = 0;
1311        u32 rkey = pfault->rdma.r_key;
1312        int ret;
1313
1314        /* The RDMA responder handler handles the page fault in two parts.
1315         * First it brings the necessary pages for the current packet
1316         * (and uses the pfault context), and then (after resuming the QP)
1317         * prefetches more pages. The second operation cannot use the pfault
 1318         * context and therefore uses a separate bytes_committed counter
 1319         * allocated on the stack. */
1320        pfault->rdma.rdma_va += pfault->bytes_committed;
1321        pfault->rdma.rdma_op_len -= min(pfault->bytes_committed,
1322                                         pfault->rdma.rdma_op_len);
1323        pfault->bytes_committed = 0;
1324
1325        address = pfault->rdma.rdma_va;
1326        length  = pfault->rdma.rdma_op_len;
1327
1328        /* For some operations, the hardware cannot tell the exact message
1329         * length, and in those cases it reports zero. Use prefetch
1330         * logic. */
1331        if (length == 0) {
1332                prefetch_activated = 1;
1333                length = pfault->rdma.packet_size;
1334                prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len);
1335        }
1336
1337        ret = pagefault_single_data_segment(dev, NULL, rkey, address, length,
1338                                            &pfault->bytes_committed, NULL);
1339        if (ret == -EAGAIN) {
1340                /* We're racing with an invalidation, don't prefetch */
1341                prefetch_activated = 0;
1342        } else if (ret < 0 || pages_in_range(address, length) > ret) {
1343                mlx5_ib_page_fault_resume(dev, pfault, 1);
1344                if (ret != -ENOENT)
1345                        mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%x, type: 0x%x\n",
1346                                    ret, pfault->token, pfault->type);
1347                return;
1348        }
1349
1350        mlx5_ib_page_fault_resume(dev, pfault, 0);
1351        mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x, type: 0x%x, prefetch_activated: %d\n",
1352                    pfault->token, pfault->type,
1353                    prefetch_activated);
1354
1355        /* At this point, there might be a new pagefault already arriving in
 1356         * the eq; the rest of the processing uses only a local
 1357         * bytes_committed counter. We're still OK with the objects being
 1358         * alive as the work-queue is being fenced. */
1359
1360        if (prefetch_activated) {
1361                u32 bytes_committed = 0;
1362
1363                ret = pagefault_single_data_segment(dev, NULL, rkey, address,
1364                                                    prefetch_len,
1365                                                    &bytes_committed, NULL);
1366                if (ret < 0 && ret != -EAGAIN) {
1367                        mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
1368                                    ret, pfault->token, address, prefetch_len);
1369                }
1370        }
1371}
1372
1373static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault)
1374{
1375        u8 event_subtype = pfault->event_subtype;
1376
1377        switch (event_subtype) {
1378        case MLX5_PFAULT_SUBTYPE_WQE:
1379                mlx5_ib_mr_wqe_pfault_handler(dev, pfault);
1380                break;
1381        case MLX5_PFAULT_SUBTYPE_RDMA:
1382                mlx5_ib_mr_rdma_pfault_handler(dev, pfault);
1383                break;
1384        default:
1385                mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n",
1386                            event_subtype);
1387                mlx5_ib_page_fault_resume(dev, pfault, 1);
1388        }
1389}
1390
1391static void mlx5_ib_eqe_pf_action(struct work_struct *work)
1392{
1393        struct mlx5_pagefault *pfault = container_of(work,
1394                                                     struct mlx5_pagefault,
1395                                                     work);
1396        struct mlx5_ib_pf_eq *eq = pfault->eq;
1397
1398        mlx5_ib_pfault(eq->dev, pfault);
1399        mempool_free(pfault, eq->pool);
1400}
1401
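/*
 * Drain the page fault EQ: decode each EQE into an mlx5_pagefault taken
 * from the mempool and hand it off for deferred processing. If the mempool
 * is exhausted, the EQ work item is rescheduled to retry later.
 */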
1402static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq)
1403{
1404        struct mlx5_eqe_page_fault *pf_eqe;
1405        struct mlx5_pagefault *pfault;
1406        struct mlx5_eqe *eqe;
1407        int cc = 0;
1408
1409        while ((eqe = mlx5_eq_get_eqe(eq->core, cc))) {
1410                pfault = mempool_alloc(eq->pool, GFP_ATOMIC);
1411                if (!pfault) {
1412                        schedule_work(&eq->work);
1413                        break;
1414                }
1415
1416                pf_eqe = &eqe->data.page_fault;
1417                pfault->event_subtype = eqe->sub_type;
1418                pfault->bytes_committed = be32_to_cpu(pf_eqe->bytes_committed);
1419
1420                mlx5_ib_dbg(eq->dev,
1421                            "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x\n",
1422                            eqe->sub_type, pfault->bytes_committed);
1423
1424                switch (eqe->sub_type) {
1425                case MLX5_PFAULT_SUBTYPE_RDMA:
1426                        /* RDMA based event */
1427                        pfault->type =
1428                                be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24;
1429                        pfault->token =
1430                                be32_to_cpu(pf_eqe->rdma.pftype_token) &
1431                                MLX5_24BIT_MASK;
1432                        pfault->rdma.r_key =
1433                                be32_to_cpu(pf_eqe->rdma.r_key);
1434                        pfault->rdma.packet_size =
1435                                be16_to_cpu(pf_eqe->rdma.packet_length);
1436                        pfault->rdma.rdma_op_len =
1437                                be32_to_cpu(pf_eqe->rdma.rdma_op_len);
1438                        pfault->rdma.rdma_va =
1439                                be64_to_cpu(pf_eqe->rdma.rdma_va);
1440                        mlx5_ib_dbg(eq->dev,
1441                                    "PAGE_FAULT: type:0x%x, token: 0x%06x, r_key: 0x%08x\n",
1442                                    pfault->type, pfault->token,
1443                                    pfault->rdma.r_key);
1444                        mlx5_ib_dbg(eq->dev,
1445                                    "PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n",
1446                                    pfault->rdma.rdma_op_len,
1447                                    pfault->rdma.rdma_va);
1448                        break;
1449
1450                case MLX5_PFAULT_SUBTYPE_WQE:
1451                        /* WQE based event */
1452                        pfault->type =
1453                                (be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24) & 0x7;
1454                        pfault->token =
1455                                be32_to_cpu(pf_eqe->wqe.token);
1456                        pfault->wqe.wq_num =
1457                                be32_to_cpu(pf_eqe->wqe.pftype_wq) &
1458                                MLX5_24BIT_MASK;
1459                        pfault->wqe.wqe_index =
1460                                be16_to_cpu(pf_eqe->wqe.wqe_index);
1461                        pfault->wqe.packet_size =
1462                                be16_to_cpu(pf_eqe->wqe.packet_length);
1463                        mlx5_ib_dbg(eq->dev,
1464                                    "PAGE_FAULT: type:0x%x, token: 0x%06x, wq_num: 0x%06x, wqe_index: 0x%04x\n",
1465                                    pfault->type, pfault->token,
1466                                    pfault->wqe.wq_num,
1467                                    pfault->wqe.wqe_index);
1468                        break;
1469
1470                default:
1471                        mlx5_ib_warn(eq->dev,
1472                                     "Unsupported page fault event sub-type: 0x%02hhx\n",
1473                                     eqe->sub_type);
1474                        /* Unsupported page faults should still be
1475                         * resolved by the page fault handler
1476                         */
1477                }
1478
1479                pfault->eq = eq;
1480                INIT_WORK(&pfault->work, mlx5_ib_eqe_pf_action);
1481                queue_work(eq->wq, &pfault->work);
1482
1483                cc = mlx5_eq_update_cc(eq->core, ++cc);
1484        }
1485
1486        mlx5_eq_update_ci(eq->core, cc, 1);
1487}
1488
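/*
 * EQ interrupt notifier: process the EQ directly if the lock is free,
 * otherwise defer to eq->work since another context is already draining it.
 */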
1489static int mlx5_ib_eq_pf_int(struct notifier_block *nb, unsigned long type,
1490                             void *data)
1491{
1492        struct mlx5_ib_pf_eq *eq =
1493                container_of(nb, struct mlx5_ib_pf_eq, irq_nb);
1494        unsigned long flags;
1495
1496        if (spin_trylock_irqsave(&eq->lock, flags)) {
1497                mlx5_ib_eq_pf_process(eq);
1498                spin_unlock_irqrestore(&eq->lock, flags);
1499        } else {
1500                schedule_work(&eq->work);
1501        }
1502
1503        return IRQ_HANDLED;
1504}
1505
1506/* A generic mempool_refill() was proposed upstream but unfortunately wasn't
1507 * accepted: http://lkml.iu.edu/hypermail/linux/kernel/1512.1/05073.html
1508 * This is a cheap local workaround.
1509 */
1510static void mempool_refill(mempool_t *pool)
1511{
1512        while (pool->curr_nr < pool->min_nr)
1513                mempool_free(mempool_alloc(pool, GFP_KERNEL), pool);
1514}
1515
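/*
 * Deferred EQ processing: refill the pagefault mempool (which may sleep and
 * therefore cannot be done from the notifier) and drain the EQ under the lock.
 */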
1516static void mlx5_ib_eq_pf_action(struct work_struct *work)
1517{
1518        struct mlx5_ib_pf_eq *eq =
1519                container_of(work, struct mlx5_ib_pf_eq, work);
1520
1521        mempool_refill(eq->pool);
1522
1523        spin_lock_irq(&eq->lock);
1524        mlx5_ib_eq_pf_process(eq);
1525        spin_unlock_irq(&eq->lock);
1526}
1527
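/* EQ depth and the number of pagefault descriptors kept in reserve in the mempool. */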
1528enum {
1529        MLX5_IB_NUM_PF_EQE      = 0x1000,
1530        MLX5_IB_NUM_PF_DRAIN    = 64,
1531};
1532
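/*
 * Create and enable the page-fault EQ on first use, together with its
 * workqueue and pagefault mempool. Serialized by dev->odp_eq_mutex; returns
 * immediately if the EQ already exists.
 */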
1533int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
1534{
1535        struct mlx5_eq_param param = {};
1536        int err = 0;
1537
1538        mutex_lock(&dev->odp_eq_mutex);
1539        if (eq->core)
1540                goto unlock;
1541        INIT_WORK(&eq->work, mlx5_ib_eq_pf_action);
1542        spin_lock_init(&eq->lock);
1543        eq->dev = dev;
1544
1545        eq->pool = mempool_create_kmalloc_pool(MLX5_IB_NUM_PF_DRAIN,
1546                                               sizeof(struct mlx5_pagefault));
1547        if (!eq->pool) {
1548                err = -ENOMEM;
1549                goto unlock;
1550        }
1551
1552        eq->wq = alloc_workqueue("mlx5_ib_page_fault",
1553                                 WQ_HIGHPRI | WQ_UNBOUND | WQ_MEM_RECLAIM,
1554                                 MLX5_NUM_CMD_EQE);
1555        if (!eq->wq) {
1556                err = -ENOMEM;
1557                goto err_mempool;
1558        }
1559
1560        eq->irq_nb.notifier_call = mlx5_ib_eq_pf_int;
1561        param = (struct mlx5_eq_param) {
1562                .nent = MLX5_IB_NUM_PF_EQE,
1563        };
1564        param.mask[0] = 1ull << MLX5_EVENT_TYPE_PAGE_FAULT;
1565        if (!zalloc_cpumask_var(&param.affinity, GFP_KERNEL)) {
1566                err = -ENOMEM;
1567                goto err_wq;
1568        }
1569        eq->core = mlx5_eq_create_generic(dev->mdev, &param);
1570        free_cpumask_var(param.affinity);
1571        if (IS_ERR(eq->core)) {
1572                err = PTR_ERR(eq->core);
1573                goto err_wq;
1574        }
1575        err = mlx5_eq_enable(dev->mdev, eq->core, &eq->irq_nb);
1576        if (err) {
1577                mlx5_ib_err(dev, "failed to enable odp EQ %d\n", err);
1578                goto err_eq;
1579        }
1580
1581        mutex_unlock(&dev->odp_eq_mutex);
1582        return 0;
1583err_eq:
1584        mlx5_eq_destroy_generic(dev->mdev, eq->core);
1585err_wq:
1586        eq->core = NULL;
1587        destroy_workqueue(eq->wq);
1588err_mempool:
1589        mempool_destroy(eq->pool);
1590unlock:
1591        mutex_unlock(&dev->odp_eq_mutex);
1592        return err;
1593}
1594
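/* Tear down the page-fault EQ and its workqueue/mempool, if they were created. */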
1595static int
1596mlx5_ib_odp_destroy_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
1597{
1598        int err;
1599
1600        if (!eq->core)
1601                return 0;
1602        mlx5_eq_disable(dev->mdev, eq->core, &eq->irq_nb);
1603        err = mlx5_eq_destroy_generic(dev->mdev, eq->core);
1604        cancel_work_sync(&eq->work);
1605        destroy_workqueue(eq->wq);
1606        mempool_destroy(eq->pool);
1607
1608        return err;
1609}
1610
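/*
 * Configure the two MR-cache entries used by implicit ODP: MTT-based mkeys
 * for the child MRs and KSM-based mkeys for the top-level implicit MR.
 * Both entries have limit 0, so nothing is pre-allocated for them.
 */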
1611void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
1612{
1613        if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
1614                return;
1615
1616        switch (ent->order - 2) {
1617        case MLX5_IMR_MTT_CACHE_ENTRY:
1618                ent->page = PAGE_SHIFT;
1619                ent->xlt = MLX5_IMR_MTT_ENTRIES *
1620                           sizeof(struct mlx5_mtt) /
1621                           MLX5_IB_UMR_OCTOWORD;
1622                ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
1623                ent->limit = 0;
1624                break;
1625
1626        case MLX5_IMR_KSM_CACHE_ENTRY:
1627                ent->page = MLX5_KSM_PAGE_SHIFT;
1628                ent->xlt = mlx5_imr_ksm_entries *
1629                           sizeof(struct mlx5_klm) /
1630                           MLX5_IB_UMR_OCTOWORD;
1631                ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM;
1632                ent->limit = 0;
1633                break;
1634        }
1635}
1636
1637static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
1638        .advise_mr = mlx5_ib_advise_mr,
1639};
1640
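/*
 * Per-device ODP setup: fill the ODP caps, register the advise_mr op and,
 * when implicit ODP is supported, query the null mkey that backs
 * not-yet-faulted entries of implicit MRs.
 */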
1641int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
1642{
1643        int ret = 0;
1644
1645        internal_fill_odp_caps(dev);
1646
1647        if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
1648                return ret;
1649
1650        ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops);
1651
1652        if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
1653                ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
1654                if (ret) {
1655                        mlx5_ib_err(dev, "Error getting null_mkey %d\n", ret);
1656                        return ret;
1657                }
1658        }
1659
1660        mutex_init(&dev->odp_eq_mutex);
1661        return ret;
1662}
1663
1664void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev)
1665{
1666        if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
1667                return;
1668
1669        mlx5_ib_odp_destroy_eq(dev, &dev->odp_pf_eq);
1670}
1671
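/*
 * Module-wide ODP init: size the top-level implicit-MR KSM table so that it
 * covers the whole process address space, one KSM entry per child MR.
 */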
1672int mlx5_ib_odp_init(void)
1673{
1674        mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) -
1675                                       MLX5_IMR_MTT_BITS);
1676
1677        return 0;
1678}
1679
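/*
 * Deferred prefetch request: one referenced MR plus io_virt/length per SGE,
 * executed later from system_unbound_wq by mlx5_ib_prefetch_mr_work().
 */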
1680struct prefetch_mr_work {
1681        struct work_struct work;
1682        u32 pf_flags;
1683        u32 num_sge;
1684        struct {
1685                u64 io_virt;
1686                struct mlx5_ib_mr *mr;
1687                size_t length;
1688        } frags[];
1689};
1690
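/* Drop the mkey references taken in init_prefetch_work() and free the request. */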
1691static void destroy_prefetch_work(struct prefetch_mr_work *work)
1692{
1693        u32 i;
1694
1695        for (i = 0; i < work->num_sge; ++i)
1696                mlx5r_deref_odp_mkey(&work->frags[i].mr->mmkey);
1697
1698        kvfree(work);
1699}
1700
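/*
 * Translate @lkey into an MR that may be prefetched and take a reference on
 * its mkey. Returns NULL if the mkey is not an ODP-tracked MR, belongs to a
 * different PD, or is not writable when write prefetch was requested.
 */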
1701static struct mlx5_ib_mr *
1702get_prefetchable_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
1703                    u32 lkey)
1704{
1705        struct mlx5_ib_dev *dev = to_mdev(pd->device);
1706        struct mlx5_core_mkey *mmkey;
1707        struct mlx5_ib_mr *mr = NULL;
1708
1709        xa_lock(&dev->odp_mkeys);
1710        mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(lkey));
1711        if (!mmkey || mmkey->key != lkey || mmkey->type != MLX5_MKEY_MR)
1712                goto end;
1713
1714        mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
1715
1716        if (mr->ibmr.pd != pd) {
1717                mr = NULL;
1718                goto end;
1719        }
1720
1721        /* prefetch with write-access must be supported by the MR */
1722        if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
1723            !mr->umem->writable) {
1724                mr = NULL;
1725                goto end;
1726        }
1727
1728        refcount_inc(&mmkey->usecount);
1729end:
1730        xa_unlock(&dev->odp_mkeys);
1731        return mr;
1732}
1733
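/*
 * Workqueue handler for deferred prefetch: fault in every recorded fragment,
 * update the per-MR prefetch statistics and release the request.
 */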
1734static void mlx5_ib_prefetch_mr_work(struct work_struct *w)
1735{
1736        struct prefetch_mr_work *work =
1737                container_of(w, struct prefetch_mr_work, work);
1738        u32 bytes_mapped = 0;
1739        int ret;
1740        u32 i;
1741
1742        /* We rely on IB/core to execute this work only when num_sge != 0. */
1743        WARN_ON(!work->num_sge);
1744        for (i = 0; i < work->num_sge; ++i) {
1745                ret = pagefault_mr(work->frags[i].mr, work->frags[i].io_virt,
1746                                   work->frags[i].length, &bytes_mapped,
1747                                   work->pf_flags);
1748                if (ret <= 0)
1749                        continue;
1750                mlx5_update_odp_stats(work->frags[i].mr, prefetch, ret);
1751        }
1752
1753        destroy_prefetch_work(work);
1754}
1755
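/*
 * Resolve each SGE to a referenced MR and record it in @work. On failure,
 * num_sge is set to the number of MRs already referenced so that
 * destroy_prefetch_work() releases exactly those.
 */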
1756static bool init_prefetch_work(struct ib_pd *pd,
1757                               enum ib_uverbs_advise_mr_advice advice,
1758                               u32 pf_flags, struct prefetch_mr_work *work,
1759                               struct ib_sge *sg_list, u32 num_sge)
1760{
1761        u32 i;
1762
1763        INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work);
1764        work->pf_flags = pf_flags;
1765
1766        for (i = 0; i < num_sge; ++i) {
1767                work->frags[i].io_virt = sg_list[i].addr;
1768                work->frags[i].length = sg_list[i].length;
1769                work->frags[i].mr =
1770                        get_prefetchable_mr(pd, advice, sg_list[i].lkey);
1771                if (!work->frags[i].mr) {
1772                        work->num_sge = i;
1773                        return false;
1774                }
1775        }
1776        work->num_sge = num_sge;
1777        return true;
1778}
1779
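/*
 * Synchronous prefetch: fault in each SGE in turn, dropping the MR reference
 * once its range has been handled.
 */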
1780static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd,
1781                                    enum ib_uverbs_advise_mr_advice advice,
1782                                    u32 pf_flags, struct ib_sge *sg_list,
1783                                    u32 num_sge)
1784{
1785        u32 bytes_mapped = 0;
1786        int ret = 0;
1787        u32 i;
1788
1789        for (i = 0; i < num_sge; ++i) {
1790                struct mlx5_ib_mr *mr;
1791
1792                mr = get_prefetchable_mr(pd, advice, sg_list[i].lkey);
1793                if (!mr)
1794                        return -ENOENT;
1795                ret = pagefault_mr(mr, sg_list[i].addr, sg_list[i].length,
1796                                   &bytes_mapped, pf_flags);
1797                if (ret < 0) {
1798                        mlx5r_deref_odp_mkey(&mr->mmkey);
1799                        return ret;
1800                }
1801                mlx5_update_odp_stats(mr, prefetch, ret);
1802                mlx5r_deref_odp_mkey(&mr->mmkey);
1803        }
1804
1805        return 0;
1806}
1807
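/*
 * Entry point for ib_advise_mr() prefetch hints. FLUSH requests are handled
 * synchronously; all other requests are packaged into a prefetch_mr_work and
 * queued on system_unbound_wq.
 */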
1808int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
1809                               enum ib_uverbs_advise_mr_advice advice,
1810                               u32 flags, struct ib_sge *sg_list, u32 num_sge)
1811{
1812        u32 pf_flags = 0;
1813        struct prefetch_mr_work *work;
1814
1815        if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
1816                pf_flags |= MLX5_PF_FLAGS_DOWNGRADE;
1817
1818        if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
1819                pf_flags |= MLX5_PF_FLAGS_SNAPSHOT;
1820
1821        if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
1822                return mlx5_ib_prefetch_sg_list(pd, advice, pf_flags, sg_list,
1823                                                num_sge);
1824
1825        work = kvzalloc(struct_size(work, frags, num_sge), GFP_KERNEL);
1826        if (!work)
1827                return -ENOMEM;
1828
1829        if (!init_prefetch_work(pd, advice, pf_flags, work, sg_list, num_sge)) {
1830                destroy_prefetch_work(work);
1831                return -EINVAL;
1832        }
1833        queue_work(system_unbound_wq, &work->work);
1834        return 0;
1835}
1836