linux/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c
   1// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
   2/* Copyright (c) 2019 Mellanox Technologies. */
   3
   4#include <linux/smp.h>
   5#include "dr_types.h"
   6
   7#define QUEUE_SIZE 128
   8#define SIGNAL_PER_DIV_QUEUE 16
   9#define TH_NUMS_TO_DRAIN 2
  10
  11enum { CQ_OK = 0, CQ_EMPTY = -1, CQ_POLL_ERR = -2 };
  12
  13struct dr_data_seg {
  14        u64 addr;
  15        u32 length;
  16        u32 lkey;
  17        unsigned int send_flags;
  18};
  19
  20struct postsend_info {
  21        struct dr_data_seg write;
  22        struct dr_data_seg read;
  23        u64 remote_addr;
  24        u32 rkey;
  25};
  26
  27struct dr_qp_rtr_attr {
  28        struct mlx5dr_cmd_gid_attr dgid_attr;
  29        enum ib_mtu mtu;
  30        u32 qp_num;
  31        u16 port_num;
  32        u8 min_rnr_timer;
  33        u8 sgid_index;
  34        u16 udp_src_port;
  35        u8 fl:1;
  36};
  37
  38struct dr_qp_rts_attr {
  39        u8 timeout;
  40        u8 retry_cnt;
  41        u8 rnr_retry;
  42};
  43
  44struct dr_qp_init_attr {
  45        u32 cqn;
  46        u32 pdn;
  47        u32 max_send_wr;
  48        struct mlx5_uars_page *uar;
  49        u8 isolate_vl_tc:1;
  50};
  51
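     /* Parse one CQE and advance the SQ consumer counter (sq.cc):
      * MLX5_CQE_REQ_ERR recovers the WQE index from wqe_counter, moves cc
      * past that WQE's head and reports a poll error; MLX5_CQE_RESP_ERR only
      * bumps cc and reports a poll error; any other opcode is a successful
      * completion, cc is advanced past the completed WQE and CQ_OK returned.
      */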
  52static int dr_parse_cqe(struct mlx5dr_cq *dr_cq, struct mlx5_cqe64 *cqe64)
  53{
  54        unsigned int idx;
  55        u8 opcode;
  56
  57        opcode = get_cqe_opcode(cqe64);
  58        if (opcode == MLX5_CQE_REQ_ERR) {
  59                idx = be16_to_cpu(cqe64->wqe_counter) &
  60                        (dr_cq->qp->sq.wqe_cnt - 1);
  61                dr_cq->qp->sq.cc = dr_cq->qp->sq.wqe_head[idx] + 1;
  62        } else if (opcode == MLX5_CQE_RESP_ERR) {
  63                ++dr_cq->qp->sq.cc;
  64        } else {
  65                idx = be16_to_cpu(cqe64->wqe_counter) &
  66                        (dr_cq->qp->sq.wqe_cnt - 1);
  67                dr_cq->qp->sq.cc = dr_cq->qp->sq.wqe_head[idx] + 1;
  68
  69                return CQ_OK;
  70        }
  71
  72        return CQ_POLL_ERR;
  73}
  74
  75static int dr_cq_poll_one(struct mlx5dr_cq *dr_cq)
  76{
  77        struct mlx5_cqe64 *cqe64;
  78        int err;
  79
  80        cqe64 = mlx5_cqwq_get_cqe(&dr_cq->wq);
  81        if (!cqe64)
  82                return CQ_EMPTY;
  83
  84        mlx5_cqwq_pop(&dr_cq->wq);
  85        err = dr_parse_cqe(dr_cq, cqe64);
  86        mlx5_cqwq_update_db_record(&dr_cq->wq);
  87
  88        return err;
  89}
  90
  91static int dr_poll_cq(struct mlx5dr_cq *dr_cq, int ne)
  92{
  93        int npolled;
  94        int err = 0;
  95
  96        for (npolled = 0; npolled < ne; ++npolled) {
  97                err = dr_cq_poll_one(dr_cq);
  98                if (err != CQ_OK)
  99                        break;
 100        }
 101
 102        return err == CQ_POLL_ERR ? err : npolled;
 103}
 104
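     /* Create the RC QP used for writing STEs into ICM: build the SQ/RQ work
      * queues (the RQ is a minimal 4-entry queue), keep a wqe_head shadow
      * array mapping each SQ slot to the producer counter that filled it
      * (used by the CQ poller to advance sq.cc), then issue CREATE_QP with
      * both the send and receive CQs pointing at the single DR CQ.
      */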
 105static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev,
 106                                         struct dr_qp_init_attr *attr)
 107{
 108        u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
 109        u32 temp_qpc[MLX5_ST_SZ_DW(qpc)] = {};
 110        struct mlx5_wq_param wqp;
 111        struct mlx5dr_qp *dr_qp;
 112        int inlen;
 113        void *qpc;
 114        void *in;
 115        int err;
 116
 117        dr_qp = kzalloc(sizeof(*dr_qp), GFP_KERNEL);
 118        if (!dr_qp)
 119                return NULL;
 120
 121        wqp.buf_numa_node = mdev->priv.numa_node;
 122        wqp.db_numa_node = mdev->priv.numa_node;
 123
 124        dr_qp->rq.pc = 0;
 125        dr_qp->rq.cc = 0;
 126        dr_qp->rq.wqe_cnt = 4;
 127        dr_qp->sq.pc = 0;
 128        dr_qp->sq.cc = 0;
 129        dr_qp->sq.wqe_cnt = roundup_pow_of_two(attr->max_send_wr);
 130
 131        MLX5_SET(qpc, temp_qpc, log_rq_stride, ilog2(MLX5_SEND_WQE_DS) - 4);
 132        MLX5_SET(qpc, temp_qpc, log_rq_size, ilog2(dr_qp->rq.wqe_cnt));
 133        MLX5_SET(qpc, temp_qpc, log_sq_size, ilog2(dr_qp->sq.wqe_cnt));
 134        err = mlx5_wq_qp_create(mdev, &wqp, temp_qpc, &dr_qp->wq,
 135                                &dr_qp->wq_ctrl);
 136        if (err) {
 137                mlx5_core_warn(mdev, "Can't create QP WQ\n");
 138                goto err_wq;
 139        }
 140
 141        dr_qp->sq.wqe_head = kcalloc(dr_qp->sq.wqe_cnt,
 142                                     sizeof(dr_qp->sq.wqe_head[0]),
 143                                     GFP_KERNEL);
 144
 145        if (!dr_qp->sq.wqe_head) {
 146                mlx5_core_warn(mdev, "Can't allocate wqe head\n");
 147                goto err_wqe_head;
 148        }
 149
 150        inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
 151                MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
 152                dr_qp->wq_ctrl.buf.npages;
 153        in = kvzalloc(inlen, GFP_KERNEL);
 154        if (!in) {
 155                err = -ENOMEM;
 156                goto err_in;
 157        }
 158
 159        qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
 160        MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
 161        MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
 162        MLX5_SET(qpc, qpc, isolate_vl_tc, attr->isolate_vl_tc);
 163        MLX5_SET(qpc, qpc, pd, attr->pdn);
 164        MLX5_SET(qpc, qpc, uar_page, attr->uar->index);
 165        MLX5_SET(qpc, qpc, log_page_size,
 166                 dr_qp->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
 167        MLX5_SET(qpc, qpc, fre, 1);
 168        MLX5_SET(qpc, qpc, rlky, 1);
 169        MLX5_SET(qpc, qpc, cqn_snd, attr->cqn);
 170        MLX5_SET(qpc, qpc, cqn_rcv, attr->cqn);
 171        MLX5_SET(qpc, qpc, log_rq_stride, ilog2(MLX5_SEND_WQE_DS) - 4);
 172        MLX5_SET(qpc, qpc, log_rq_size, ilog2(dr_qp->rq.wqe_cnt));
 173        MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
 174        MLX5_SET(qpc, qpc, log_sq_size, ilog2(dr_qp->sq.wqe_cnt));
 175        MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
 176        MLX5_SET64(qpc, qpc, dbr_addr, dr_qp->wq_ctrl.db.dma);
 177        if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
 178                MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
 179        mlx5_fill_page_frag_array(&dr_qp->wq_ctrl.buf,
 180                                  (__be64 *)MLX5_ADDR_OF(create_qp_in,
 181                                                         in, pas));
 182
 183        MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
 184        err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
 185        dr_qp->qpn = MLX5_GET(create_qp_out, out, qpn);
 186        kvfree(in);
 187        if (err)
 188                goto err_in;
 189        dr_qp->uar = attr->uar;
 190
 191        return dr_qp;
 192
 193err_in:
 194        kfree(dr_qp->sq.wqe_head);
 195err_wqe_head:
 196        mlx5_wq_destroy(&dr_qp->wq_ctrl);
 197err_wq:
 198        kfree(dr_qp);
 199        return NULL;
 200}
 201
 202static void dr_destroy_qp(struct mlx5_core_dev *mdev,
 203                          struct mlx5dr_qp *dr_qp)
 204{
 205        u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
 206
 207        MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
 208        MLX5_SET(destroy_qp_in, in, qpn, dr_qp->qpn);
 209        mlx5_cmd_exec_in(mdev, destroy_qp, in);
 210
 211        kfree(dr_qp->sq.wqe_head);
 212        mlx5_wq_destroy(&dr_qp->wq_ctrl);
 213        kfree(dr_qp);
 214}
 215
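     /* Ring the doorbell: publish the new SQ producer counter in the QP's
      * doorbell record, then write the control segment of the last WQE to
      * the UAR doorbell area (MLX5_BF_OFFSET) so the HW starts processing
      * the new work. The barriers order the WQE writes, the doorbell record
      * update and the MMIO write.
      */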
 216static void dr_cmd_notify_hw(struct mlx5dr_qp *dr_qp, void *ctrl)
 217{
 218        dma_wmb();
 219        *dr_qp->wq.sq.db = cpu_to_be32(dr_qp->sq.pc & 0xffff);
 220
  221        /* After the wmb() the HW is aware of the new work */
 222        wmb();
 223
 224        mlx5_write64(ctrl, dr_qp->uar->map + MLX5_BF_OFFSET);
 225}
 226
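     /* Build a single RDMA WQE in the next SQ slot: a control segment
      * followed by a remote-address segment and one data segment. 'size' is
      * the WQE length in 16-byte units (the ds count in qpn_ds), and a CQE
      * is requested only when the caller passed a non-zero send_flags.
      */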
 227static void dr_rdma_segments(struct mlx5dr_qp *dr_qp, u64 remote_addr,
 228                             u32 rkey, struct dr_data_seg *data_seg,
 229                             u32 opcode, bool notify_hw)
 230{
 231        struct mlx5_wqe_raddr_seg *wq_raddr;
 232        struct mlx5_wqe_ctrl_seg *wq_ctrl;
 233        struct mlx5_wqe_data_seg *wq_dseg;
 234        unsigned int size;
 235        unsigned int idx;
 236
 237        size = sizeof(*wq_ctrl) / 16 + sizeof(*wq_dseg) / 16 +
 238                sizeof(*wq_raddr) / 16;
 239
 240        idx = dr_qp->sq.pc & (dr_qp->sq.wqe_cnt - 1);
 241
 242        wq_ctrl = mlx5_wq_cyc_get_wqe(&dr_qp->wq.sq, idx);
 243        wq_ctrl->imm = 0;
 244        wq_ctrl->fm_ce_se = (data_seg->send_flags) ?
 245                MLX5_WQE_CTRL_CQ_UPDATE : 0;
 246        wq_ctrl->opmod_idx_opcode = cpu_to_be32(((dr_qp->sq.pc & 0xffff) << 8) |
 247                                                opcode);
 248        wq_ctrl->qpn_ds = cpu_to_be32(size | dr_qp->qpn << 8);
 249        wq_raddr = (void *)(wq_ctrl + 1);
 250        wq_raddr->raddr = cpu_to_be64(remote_addr);
 251        wq_raddr->rkey = cpu_to_be32(rkey);
 252        wq_raddr->reserved = 0;
 253
 254        wq_dseg = (void *)(wq_raddr + 1);
 255        wq_dseg->byte_count = cpu_to_be32(data_seg->length);
 256        wq_dseg->lkey = cpu_to_be32(data_seg->lkey);
 257        wq_dseg->addr = cpu_to_be64(data_seg->addr);
 258
 259        dr_qp->sq.wqe_head[idx] = dr_qp->sq.pc++;
 260
 261        if (notify_hw)
 262                dr_cmd_notify_hw(dr_qp, wq_ctrl);
 263}
 264
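     /* Each post is a pair of WQEs: an RDMA write of the payload followed by
      * an RDMA read from the same remote address; only the second call rings
      * the doorbell, so both WQEs are submitted to the HW together.
      */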
 265static void dr_post_send(struct mlx5dr_qp *dr_qp, struct postsend_info *send_info)
 266{
 267        dr_rdma_segments(dr_qp, send_info->remote_addr, send_info->rkey,
 268                         &send_info->write, MLX5_OPCODE_RDMA_WRITE, false);
 269        dr_rdma_segments(dr_qp, send_info->remote_addr, send_info->rkey,
 270                         &send_info->read, MLX5_OPCODE_RDMA_READ, true);
 271}
 272
  273/**
  274 * mlx5dr_send_fill_and_append_ste_send_info: Add data to be sent
  275 * with the send_list parameters:
  276 *
  277 *     @ste:       The STE that this data is attached to
  278 *     @size:      Size of the data to write
  279 *     @offset:    Offset of the data from the start of the hw_ste entry
  280 *     @data:      Data
  281 *     @ste_info:  STE info to be sent with the send_list
  282 *     @send_list: List to append the STE info to
  283 *     @copy_data: If true, the data should be copied aside because
  284 *                 it is not backed up anywhere (e.g. during re-hash).
  285 *                 If false, the data may still be updated after
  286 *                 it was added to the list.
  287 */
 288void mlx5dr_send_fill_and_append_ste_send_info(struct mlx5dr_ste *ste, u16 size,
 289                                               u16 offset, u8 *data,
 290                                               struct mlx5dr_ste_send_info *ste_info,
 291                                               struct list_head *send_list,
 292                                               bool copy_data)
 293{
 294        ste_info->size = size;
 295        ste_info->ste = ste;
 296        ste_info->offset = offset;
 297
 298        if (copy_data) {
 299                memcpy(ste_info->data_cont, data, size);
 300                ste_info->data = ste_info->data_cont;
 301        } else {
 302                ste_info->data = data;
 303        }
 304
 305        list_add_tail(&ste_info->send_list, send_list);
 306}
 307
  308/* The function tries to consume one wc each time, unless the queue is full.
  309 * In that case the HW is a full queue length behind the SW, and the
  310 * function drains the CQ until it is empty.
  311 */
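     /* With the defaults above (QUEUE_SIZE = 128, SIGNAL_PER_DIV_QUEUE = 16)
      * signal_th is 8: polling starts once 8 WQEs are pending, and a full
      * drain is forced at TH_NUMS_TO_DRAIN * signal_th = 16 pending WQEs.
      */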
 312static int dr_handle_pending_wc(struct mlx5dr_domain *dmn,
 313                                struct mlx5dr_send_ring *send_ring)
 314{
 315        bool is_drain = false;
 316        int ne;
 317
 318        if (send_ring->pending_wqe < send_ring->signal_th)
 319                return 0;
 320
  321        /* Queue is full, start draining it */
 322        if (send_ring->pending_wqe >=
 323            dmn->send_ring->signal_th * TH_NUMS_TO_DRAIN)
 324                is_drain = true;
 325
 326        do {
 327                ne = dr_poll_cq(send_ring->cq, 1);
 328                if (unlikely(ne < 0)) {
 329                        mlx5_core_warn_once(dmn->mdev, "SMFS QPN 0x%x is disabled/limited",
 330                                            send_ring->qp->qpn);
 331                        send_ring->err_state = true;
 332                        return ne;
 333                } else if (ne == 1) {
 334                        send_ring->pending_wqe -= send_ring->signal_th;
 335                }
 336        } while (is_drain && send_ring->pending_wqe);
 337
 338        return 0;
 339}
 340
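     /* Each post adds two WQEs (the write and the read) to pending_wqe.
      * A completion is requested (IB_SEND_SIGNALED, mapped to CQ_UPDATE in
      * dr_rdma_segments()) only once every signal_th WQEs, which is why
      * dr_handle_pending_wc() subtracts signal_th for every polled CQE.
      */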
 341static void dr_fill_data_segs(struct mlx5dr_send_ring *send_ring,
 342                              struct postsend_info *send_info)
 343{
 344        send_ring->pending_wqe++;
 345
 346        if (send_ring->pending_wqe % send_ring->signal_th == 0)
 347                send_info->write.send_flags |= IB_SEND_SIGNALED;
 348
 349        send_ring->pending_wqe++;
 350        send_info->read.length = send_info->write.length;
 351        /* Read into the same write area */
 352        send_info->read.addr = (uintptr_t)send_info->write.addr;
 353        send_info->read.lkey = send_ring->mr->mkey.key;
 354
 355        if (send_ring->pending_wqe % send_ring->signal_th == 0)
 356                send_info->read.send_flags = IB_SEND_SIGNALED;
 357        else
 358                send_info->read.send_flags = 0;
 359}
 360
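     /* Post one ICM write: payloads larger than max_inline_size are first
      * staged in the pre-registered ring buffer (the slot is derived from
      * tx_head masked by signal_th - 1), and the write segment is re-pointed
      * at the ring MR's DMA address and lkey. Posting is skipped entirely
      * when the device or the QP is in an error state.
      */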
 361static int dr_postsend_icm_data(struct mlx5dr_domain *dmn,
 362                                struct postsend_info *send_info)
 363{
 364        struct mlx5dr_send_ring *send_ring = dmn->send_ring;
 365        u32 buff_offset;
 366        int ret;
 367
 368        if (unlikely(dmn->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR ||
 369                     send_ring->err_state)) {
 370                mlx5_core_dbg_once(dmn->mdev,
 371                                   "Skipping post send: QP err state: %d, device state: %d\n",
 372                                   send_ring->err_state, dmn->mdev->state);
 373                return 0;
 374        }
 375
 376        spin_lock(&send_ring->lock);
 377
 378        ret = dr_handle_pending_wc(dmn, send_ring);
 379        if (ret)
 380                goto out_unlock;
 381
 382        if (send_info->write.length > dmn->info.max_inline_size) {
 383                buff_offset = (send_ring->tx_head &
 384                               (dmn->send_ring->signal_th - 1)) *
 385                        send_ring->max_post_send_size;
 386                /* Copy to ring mr */
 387                memcpy(send_ring->buf + buff_offset,
 388                       (void *)(uintptr_t)send_info->write.addr,
 389                       send_info->write.length);
 390                send_info->write.addr = (uintptr_t)send_ring->mr->dma_addr + buff_offset;
 391                send_info->write.lkey = send_ring->mr->mkey.key;
 392        }
 393
 394        send_ring->tx_head++;
 395        dr_fill_data_segs(send_ring, send_info);
 396        dr_post_send(send_ring->qp, send_info);
 397
 398out_unlock:
 399        spin_unlock(&send_ring->lock);
 400        return ret;
 401}
 402
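     /* Decide how a hash table is copied: chunks larger than one post can
      * carry are sent in max_post_send_size slices over several iterations,
      * otherwise a single buffer covering all the entries is used.
      */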
 403static int dr_get_tbl_copy_details(struct mlx5dr_domain *dmn,
 404                                   struct mlx5dr_ste_htbl *htbl,
 405                                   u8 **data,
 406                                   u32 *byte_size,
 407                                   int *iterations,
 408                                   int *num_stes)
 409{
 410        int alloc_size;
 411
 412        if (htbl->chunk->byte_size > dmn->send_ring->max_post_send_size) {
 413                *iterations = htbl->chunk->byte_size /
 414                        dmn->send_ring->max_post_send_size;
 415                *byte_size = dmn->send_ring->max_post_send_size;
 416                alloc_size = *byte_size;
 417                *num_stes = *byte_size / DR_STE_SIZE;
 418        } else {
 419                *iterations = 1;
 420                *num_stes = htbl->chunk->num_of_entries;
 421                alloc_size = *num_stes * DR_STE_SIZE;
 422        }
 423
 424        *data = kvzalloc(alloc_size, GFP_KERNEL);
 425        if (!*data)
 426                return -ENOMEM;
 427
 428        return 0;
 429}
 430
  431/**
  432 * mlx5dr_send_postsend_ste: write size bytes at offset into the hw icm.
  433 *
  434 *     @dmn:    Domain
  435 *     @ste:    The STE struct that contains the data (at
  436 *              least part of it)
  437 *     @data:   The real data to send
  438 *     @size:   Number of bytes to write
  439 *     @offset: The offset from the start of the ICM-mapped data;
  440 *              allows writing only part of the
  441 *              buffer.
  442 *
  443 * Return: 0 on success.
  444 */
 445int mlx5dr_send_postsend_ste(struct mlx5dr_domain *dmn, struct mlx5dr_ste *ste,
 446                             u8 *data, u16 size, u16 offset)
 447{
 448        struct postsend_info send_info = {};
 449
 450        mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx, data, size);
 451
 452        send_info.write.addr = (uintptr_t)data;
 453        send_info.write.length = size;
 454        send_info.write.lkey = 0;
 455        send_info.remote_addr = mlx5dr_ste_get_mr_addr(ste) + offset;
 456        send_info.rkey = ste->htbl->chunk->rkey;
 457
 458        return dr_postsend_icm_data(dmn, &send_info);
 459}
 460
 461int mlx5dr_send_postsend_htbl(struct mlx5dr_domain *dmn,
 462                              struct mlx5dr_ste_htbl *htbl,
 463                              u8 *formatted_ste, u8 *mask)
 464{
 465        u32 byte_size = htbl->chunk->byte_size;
 466        int num_stes_per_iter;
 467        int iterations;
 468        u8 *data;
 469        int ret;
 470        int i;
 471        int j;
 472
 473        ret = dr_get_tbl_copy_details(dmn, htbl, &data, &byte_size,
 474                                      &iterations, &num_stes_per_iter);
 475        if (ret)
 476                return ret;
 477
 478        mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx, formatted_ste, DR_STE_SIZE);
 479
  480        /* Send the data 'iterations' times */
 481        for (i = 0; i < iterations; i++) {
 482                u32 ste_index = i * (byte_size / DR_STE_SIZE);
 483                struct postsend_info send_info = {};
 484
  485                /* Copy all STEs to the data buffer;
  486                 * used STEs also need the bit_mask appended.
  487                 */
 488                for (j = 0; j < num_stes_per_iter; j++) {
 489                        struct mlx5dr_ste *ste = &htbl->ste_arr[ste_index + j];
 490                        u32 ste_off = j * DR_STE_SIZE;
 491
 492                        if (mlx5dr_ste_is_not_used(ste)) {
 493                                memcpy(data + ste_off,
 494                                       formatted_ste, DR_STE_SIZE);
 495                        } else {
 496                                /* Copy data */
 497                                memcpy(data + ste_off,
 498                                       htbl->ste_arr[ste_index + j].hw_ste,
 499                                       DR_STE_SIZE_REDUCED);
 500                                /* Copy bit_mask */
 501                                memcpy(data + ste_off + DR_STE_SIZE_REDUCED,
 502                                       mask, DR_STE_SIZE_MASK);
  503                                /* The STE needs to be re-arranged only when it carries a mask */
 504                                mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx,
 505                                                                data + (j * DR_STE_SIZE),
 506                                                                DR_STE_SIZE);
 507                        }
 508                }
 509
 510                send_info.write.addr = (uintptr_t)data;
 511                send_info.write.length = byte_size;
 512                send_info.write.lkey = 0;
 513                send_info.remote_addr =
 514                        mlx5dr_ste_get_mr_addr(htbl->ste_arr + ste_index);
 515                send_info.rkey = htbl->chunk->rkey;
 516
 517                ret = dr_postsend_icm_data(dmn, &send_info);
 518                if (ret)
 519                        goto out_free;
 520        }
 521
 522out_free:
 523        kvfree(data);
 524        return ret;
 525}
 526
  527/* Initialize htbl with default STEs */
 528int mlx5dr_send_postsend_formatted_htbl(struct mlx5dr_domain *dmn,
 529                                        struct mlx5dr_ste_htbl *htbl,
 530                                        u8 *ste_init_data,
 531                                        bool update_hw_ste)
 532{
 533        u32 byte_size = htbl->chunk->byte_size;
 534        int iterations;
 535        int num_stes;
 536        u8 *copy_dst;
 537        u8 *data;
 538        int ret;
 539        int i;
 540
 541        ret = dr_get_tbl_copy_details(dmn, htbl, &data, &byte_size,
 542                                      &iterations, &num_stes);
 543        if (ret)
 544                return ret;
 545
 546        if (update_hw_ste) {
  547                /* Copy the reduced STE to the hash table's hw_ste_arr */
 548                for (i = 0; i < num_stes; i++) {
 549                        copy_dst = htbl->hw_ste_arr + i * DR_STE_SIZE_REDUCED;
 550                        memcpy(copy_dst, ste_init_data, DR_STE_SIZE_REDUCED);
 551                }
 552        }
 553
 554        mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx, ste_init_data, DR_STE_SIZE);
 555
  556        /* Copy the same STE to the data buffer */
 557        for (i = 0; i < num_stes; i++) {
 558                copy_dst = data + i * DR_STE_SIZE;
 559                memcpy(copy_dst, ste_init_data, DR_STE_SIZE);
 560        }
 561
  562        /* Send the data 'iterations' times */
 563        for (i = 0; i < iterations; i++) {
  564                u32 ste_index = i * (byte_size / DR_STE_SIZE);
 565                struct postsend_info send_info = {};
 566
 567                send_info.write.addr = (uintptr_t)data;
 568                send_info.write.length = byte_size;
 569                send_info.write.lkey = 0;
 570                send_info.remote_addr =
 571                        mlx5dr_ste_get_mr_addr(htbl->ste_arr + ste_index);
 572                send_info.rkey = htbl->chunk->rkey;
 573
 574                ret = dr_postsend_icm_data(dmn, &send_info);
 575                if (ret)
 576                        goto out_free;
 577        }
 578
 579out_free:
 580        kvfree(data);
 581        return ret;
 582}
 583
 584int mlx5dr_send_postsend_action(struct mlx5dr_domain *dmn,
 585                                struct mlx5dr_action *action)
 586{
 587        struct postsend_info send_info = {};
 588        int ret;
 589
 590        send_info.write.addr = (uintptr_t)action->rewrite->data;
 591        send_info.write.length = action->rewrite->num_of_actions *
 592                                 DR_MODIFY_ACTION_SIZE;
 593        send_info.write.lkey = 0;
 594        send_info.remote_addr = action->rewrite->chunk->mr_addr;
 595        send_info.rkey = action->rewrite->chunk->rkey;
 596
 597        ret = dr_postsend_icm_data(dmn, &send_info);
 598
 599        return ret;
 600}
 601
 602static int dr_modify_qp_rst2init(struct mlx5_core_dev *mdev,
 603                                 struct mlx5dr_qp *dr_qp,
 604                                 int port)
 605{
 606        u32 in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
 607        void *qpc;
 608
 609        qpc = MLX5_ADDR_OF(rst2init_qp_in, in, qpc);
 610
 611        MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, port);
 612        MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
 613        MLX5_SET(qpc, qpc, rre, 1);
 614        MLX5_SET(qpc, qpc, rwe, 1);
 615
 616        MLX5_SET(rst2init_qp_in, in, opcode, MLX5_CMD_OP_RST2INIT_QP);
 617        MLX5_SET(rst2init_qp_in, in, qpn, dr_qp->qpn);
 618
 619        return mlx5_cmd_exec_in(mdev, rst2init_qp, in);
 620}
 621
 622static int dr_cmd_modify_qp_rtr2rts(struct mlx5_core_dev *mdev,
 623                                    struct mlx5dr_qp *dr_qp,
 624                                    struct dr_qp_rts_attr *attr)
 625{
 626        u32 in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {};
 627        void *qpc;
 628
 629        qpc  = MLX5_ADDR_OF(rtr2rts_qp_in, in, qpc);
 630
 631        MLX5_SET(rtr2rts_qp_in, in, qpn, dr_qp->qpn);
 632
 633        MLX5_SET(qpc, qpc, retry_count, attr->retry_cnt);
 634        MLX5_SET(qpc, qpc, rnr_retry, attr->rnr_retry);
 635        MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */
 636
 637        MLX5_SET(rtr2rts_qp_in, in, opcode, MLX5_CMD_OP_RTR2RTS_QP);
 638        MLX5_SET(rtr2rts_qp_in, in, qpn, dr_qp->qpn);
 639
 640        return mlx5_cmd_exec_in(mdev, rtr2rts_qp, in);
 641}
 642
 643static int dr_cmd_modify_qp_init2rtr(struct mlx5_core_dev *mdev,
 644                                     struct mlx5dr_qp *dr_qp,
 645                                     struct dr_qp_rtr_attr *attr)
 646{
 647        u32 in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {};
 648        void *qpc;
 649
 650        qpc = MLX5_ADDR_OF(init2rtr_qp_in, in, qpc);
 651
 652        MLX5_SET(init2rtr_qp_in, in, qpn, dr_qp->qpn);
 653
 654        MLX5_SET(qpc, qpc, mtu, attr->mtu);
 655        MLX5_SET(qpc, qpc, log_msg_max, DR_CHUNK_SIZE_MAX - 1);
 656        MLX5_SET(qpc, qpc, remote_qpn, attr->qp_num);
 657        memcpy(MLX5_ADDR_OF(qpc, qpc, primary_address_path.rmac_47_32),
 658               attr->dgid_attr.mac, sizeof(attr->dgid_attr.mac));
 659        memcpy(MLX5_ADDR_OF(qpc, qpc, primary_address_path.rgid_rip),
 660               attr->dgid_attr.gid, sizeof(attr->dgid_attr.gid));
 661        MLX5_SET(qpc, qpc, primary_address_path.src_addr_index,
 662                 attr->sgid_index);
 663
 664        if (attr->dgid_attr.roce_ver == MLX5_ROCE_VERSION_2)
 665                MLX5_SET(qpc, qpc, primary_address_path.udp_sport,
 666                         attr->udp_src_port);
 667
 668        MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, attr->port_num);
 669        MLX5_SET(qpc, qpc, primary_address_path.fl, attr->fl);
 670        MLX5_SET(qpc, qpc, min_rnr_nak, 1);
 671
 672        MLX5_SET(init2rtr_qp_in, in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
 673        MLX5_SET(init2rtr_qp_in, in, qpn, dr_qp->qpn);
 674
 675        return mlx5_cmd_exec_in(mdev, init2rtr_qp, in);
 676}
 677
 678static bool dr_send_allow_fl(struct mlx5dr_cmd_caps *caps)
 679{
 680        /* Check whether RC RoCE QP creation with force loopback is allowed.
 681         * There are two separate capability bits for this:
 682         *  - force loopback when RoCE is enabled
 683         *  - force loopback when RoCE is disabled
 684         */
 685        return ((caps->roce_caps.roce_en &&
 686                 caps->roce_caps.fl_rc_qp_when_roce_enabled) ||
 687                (!caps->roce_caps.roce_en &&
 688                 caps->roce_caps.fl_rc_qp_when_roce_disabled));
 689}
 690
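     /* Drive the RC QP through RESET -> INIT -> RTR -> RTS. The QP is
      * connected to itself (remote_qpn is its own qpn), so the RDMA
      * writes/reads loop back through local port 1; force loopback is used
      * when the caps allow it, otherwise GID index 0 is queried and used.
      */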
 691static int dr_prepare_qp_to_rts(struct mlx5dr_domain *dmn)
 692{
 693        struct mlx5dr_qp *dr_qp = dmn->send_ring->qp;
 694        struct dr_qp_rts_attr rts_attr = {};
 695        struct dr_qp_rtr_attr rtr_attr = {};
 696        enum ib_mtu mtu = IB_MTU_1024;
 697        u16 gid_index = 0;
 698        int port = 1;
 699        int ret;
 700
 701        /* Init */
 702        ret = dr_modify_qp_rst2init(dmn->mdev, dr_qp, port);
 703        if (ret) {
 704                mlx5dr_err(dmn, "Failed modify QP rst2init\n");
 705                return ret;
 706        }
 707
 708        /* RTR */
 709        rtr_attr.mtu            = mtu;
 710        rtr_attr.qp_num         = dr_qp->qpn;
 711        rtr_attr.min_rnr_timer  = 12;
 712        rtr_attr.port_num       = port;
 713        rtr_attr.udp_src_port   = dmn->info.caps.roce_min_src_udp;
 714
  715        /* If QP creation with force loopback is allowed, then there
  716         * is no need for a GID index when creating the QP.
  717         * Otherwise we query the GID attributes and use a GID index.
  718         */
 719        rtr_attr.fl = dr_send_allow_fl(&dmn->info.caps);
 720        if (!rtr_attr.fl) {
 721                ret = mlx5dr_cmd_query_gid(dmn->mdev, port, gid_index,
 722                                           &rtr_attr.dgid_attr);
 723                if (ret)
 724                        return ret;
 725
 726                rtr_attr.sgid_index = gid_index;
 727        }
 728
 729        ret = dr_cmd_modify_qp_init2rtr(dmn->mdev, dr_qp, &rtr_attr);
 730        if (ret) {
 731                mlx5dr_err(dmn, "Failed modify QP init2rtr\n");
 732                return ret;
 733        }
 734
 735        /* RTS */
 736        rts_attr.timeout        = 14;
 737        rts_attr.retry_cnt      = 7;
 738        rts_attr.rnr_retry      = 7;
 739
 740        ret = dr_cmd_modify_qp_rtr2rts(dmn->mdev, dr_qp, &rts_attr);
 741        if (ret) {
 742                mlx5dr_err(dmn, "Failed modify QP rtr2rts\n");
 743                return ret;
 744        }
 745
 746        return 0;
 747}
 748
 749static void dr_cq_complete(struct mlx5_core_cq *mcq,
 750                           struct mlx5_eqe *eqe)
 751{
 752        pr_err("CQ completion CQ: #%u\n", mcq->cqn);
 753}
 754
 755static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev,
 756                                      struct mlx5_uars_page *uar,
 757                                      size_t ncqe)
 758{
 759        u32 temp_cqc[MLX5_ST_SZ_DW(cqc)] = {};
 760        u32 out[MLX5_ST_SZ_DW(create_cq_out)];
 761        struct mlx5_wq_param wqp;
 762        struct mlx5_cqe64 *cqe;
 763        struct mlx5dr_cq *cq;
 764        int inlen, err, eqn;
 765        void *cqc, *in;
 766        __be64 *pas;
 767        int vector;
 768        u32 i;
 769
 770        cq = kzalloc(sizeof(*cq), GFP_KERNEL);
 771        if (!cq)
 772                return NULL;
 773
 774        ncqe = roundup_pow_of_two(ncqe);
 775        MLX5_SET(cqc, temp_cqc, log_cq_size, ilog2(ncqe));
 776
 777        wqp.buf_numa_node = mdev->priv.numa_node;
 778        wqp.db_numa_node = mdev->priv.numa_node;
 779
 780        err = mlx5_cqwq_create(mdev, &wqp, temp_cqc, &cq->wq,
 781                               &cq->wq_ctrl);
 782        if (err)
 783                goto out;
 784
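             /* Initialize all CQEs as invalid with the owner bit set, so the
              * polling code treats the queue as empty until the HW writes
              * real completions.
              */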
 785        for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) {
 786                cqe = mlx5_cqwq_get_wqe(&cq->wq, i);
 787                cqe->op_own = MLX5_CQE_INVALID << 4 | MLX5_CQE_OWNER_MASK;
 788        }
 789
 790        inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
 791                sizeof(u64) * cq->wq_ctrl.buf.npages;
 792        in = kvzalloc(inlen, GFP_KERNEL);
 793        if (!in)
 794                goto err_cqwq;
 795
 796        vector = raw_smp_processor_id() % mlx5_comp_vectors_count(mdev);
 797        err = mlx5_vector2eqn(mdev, vector, &eqn);
 798        if (err) {
 799                kvfree(in);
 800                goto err_cqwq;
 801        }
 802
 803        cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
 804        MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
 805        MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
 806        MLX5_SET(cqc, cqc, uar_page, uar->index);
 807        MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift -
 808                 MLX5_ADAPTER_PAGE_SHIFT);
 809        MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma);
 810
 811        pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
 812        mlx5_fill_page_frag_array(&cq->wq_ctrl.buf, pas);
 813
 814        cq->mcq.comp  = dr_cq_complete;
 815
 816        err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out));
 817        kvfree(in);
 818
 819        if (err)
 820                goto err_cqwq;
 821
 822        cq->mcq.cqe_sz = 64;
 823        cq->mcq.set_ci_db = cq->wq_ctrl.db.db;
 824        cq->mcq.arm_db = cq->wq_ctrl.db.db + 1;
 825        *cq->mcq.set_ci_db = 0;
 826
  827        /* Set a non-zero value in order to prevent the HW from running
  828         * doorbell recovery on a CQ that is used in polling mode.
  829         */
 830        *cq->mcq.arm_db = cpu_to_be32(2 << 28);
 831
 832        cq->mcq.vector = 0;
 833        cq->mcq.uar = uar;
 834
 835        return cq;
 836
 837err_cqwq:
 838        mlx5_wq_destroy(&cq->wq_ctrl);
 839out:
 840        kfree(cq);
 841        return NULL;
 842}
 843
 844static void dr_destroy_cq(struct mlx5_core_dev *mdev, struct mlx5dr_cq *cq)
 845{
 846        mlx5_core_destroy_cq(mdev, &cq->mcq);
 847        mlx5_wq_destroy(&cq->wq_ctrl);
 848        kfree(cq);
 849}
 850
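     /* Create a physical-address mkey (MLX5_MKC_ACCESS_MODE_PA) with
      * length64 set, i.e. one key covering the whole address space of the
      * PD, so the ring buffers only need to be DMA-mapped and their dma_addr
      * used directly as the local address.
      */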
 851static int
 852dr_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, struct mlx5_core_mkey *mkey)
 853{
 854        u32 in[MLX5_ST_SZ_DW(create_mkey_in)] = {};
 855        void *mkc;
 856
 857        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 858        MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
 859        MLX5_SET(mkc, mkc, a, 1);
 860        MLX5_SET(mkc, mkc, rw, 1);
 861        MLX5_SET(mkc, mkc, rr, 1);
 862        MLX5_SET(mkc, mkc, lw, 1);
 863        MLX5_SET(mkc, mkc, lr, 1);
 864
 865        MLX5_SET(mkc, mkc, pd, pdn);
 866        MLX5_SET(mkc, mkc, length64, 1);
 867        MLX5_SET(mkc, mkc, qpn, 0xffffff);
 868
 869        return mlx5_core_create_mkey(mdev, mkey, in, sizeof(in));
 870}
 871
 872static struct mlx5dr_mr *dr_reg_mr(struct mlx5_core_dev *mdev,
 873                                   u32 pdn, void *buf, size_t size)
 874{
 875        struct mlx5dr_mr *mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 876        struct device *dma_device;
 877        dma_addr_t dma_addr;
 878        int err;
 879
 880        if (!mr)
 881                return NULL;
 882
 883        dma_device = mlx5_core_dma_dev(mdev);
 884        dma_addr = dma_map_single(dma_device, buf, size,
 885                                  DMA_BIDIRECTIONAL);
 886        err = dma_mapping_error(dma_device, dma_addr);
 887        if (err) {
 888                mlx5_core_warn(mdev, "Can't dma buf\n");
 889                kfree(mr);
 890                return NULL;
 891        }
 892
 893        err = dr_create_mkey(mdev, pdn, &mr->mkey);
 894        if (err) {
 895                mlx5_core_warn(mdev, "Can't create mkey\n");
 896                dma_unmap_single(dma_device, dma_addr, size,
 897                                 DMA_BIDIRECTIONAL);
 898                kfree(mr);
 899                return NULL;
 900        }
 901
 902        mr->dma_addr = dma_addr;
 903        mr->size = size;
 904        mr->addr = buf;
 905
 906        return mr;
 907}
 908
 909static void dr_dereg_mr(struct mlx5_core_dev *mdev, struct mlx5dr_mr *mr)
 910{
 911        mlx5_core_destroy_mkey(mdev, &mr->mkey);
 912        dma_unmap_single(mlx5_core_dma_dev(mdev), mr->dma_addr, mr->size,
 913                         DMA_BIDIRECTIONAL);
 914        kfree(mr);
 915}
 916
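     /* Bring the send channel up: create the CQ (cq_size = QUEUE_SIZE + 1)
      * and the RC loopback QP, move the QP to RTS, then allocate and
      * register a staging buffer of signal_th * max_post_send_size bytes
      * (mr) plus a small sync_mr that is only used by the force-drain flow.
      */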
 917int mlx5dr_send_ring_alloc(struct mlx5dr_domain *dmn)
 918{
 919        struct dr_qp_init_attr init_attr = {};
 920        int cq_size;
 921        int size;
 922        int ret;
 923
 924        dmn->send_ring = kzalloc(sizeof(*dmn->send_ring), GFP_KERNEL);
 925        if (!dmn->send_ring)
 926                return -ENOMEM;
 927
 928        cq_size = QUEUE_SIZE + 1;
 929        dmn->send_ring->cq = dr_create_cq(dmn->mdev, dmn->uar, cq_size);
 930        if (!dmn->send_ring->cq) {
 931                mlx5dr_err(dmn, "Failed creating CQ\n");
 932                ret = -ENOMEM;
 933                goto free_send_ring;
 934        }
 935
 936        init_attr.cqn = dmn->send_ring->cq->mcq.cqn;
 937        init_attr.pdn = dmn->pdn;
 938        init_attr.uar = dmn->uar;
 939        init_attr.max_send_wr = QUEUE_SIZE;
 940
 941        /* Isolated VL is applicable only if force loopback is supported */
 942        if (dr_send_allow_fl(&dmn->info.caps))
 943                init_attr.isolate_vl_tc = dmn->info.caps.isolate_vl_tc;
 944
 945        spin_lock_init(&dmn->send_ring->lock);
 946
 947        dmn->send_ring->qp = dr_create_rc_qp(dmn->mdev, &init_attr);
 948        if (!dmn->send_ring->qp)  {
 949                mlx5dr_err(dmn, "Failed creating QP\n");
 950                ret = -ENOMEM;
 951                goto clean_cq;
 952        }
 953
 954        dmn->send_ring->cq->qp = dmn->send_ring->qp;
 955
 956        dmn->info.max_send_wr = QUEUE_SIZE;
 957        dmn->info.max_inline_size = min(dmn->send_ring->qp->max_inline_data,
 958                                        DR_STE_SIZE);
 959
 960        dmn->send_ring->signal_th = dmn->info.max_send_wr /
 961                SIGNAL_PER_DIV_QUEUE;
 962
 963        /* Prepare qp to be used */
 964        ret = dr_prepare_qp_to_rts(dmn);
 965        if (ret)
 966                goto clean_qp;
 967
 968        dmn->send_ring->max_post_send_size =
 969                mlx5dr_icm_pool_chunk_size_to_byte(DR_CHUNK_SIZE_1K,
 970                                                   DR_ICM_TYPE_STE);
 971
  972        /* Allocate the maximum size as a buffer for writing */
 973        size = dmn->send_ring->signal_th * dmn->send_ring->max_post_send_size;
 974        dmn->send_ring->buf = kzalloc(size, GFP_KERNEL);
 975        if (!dmn->send_ring->buf) {
 976                ret = -ENOMEM;
 977                goto clean_qp;
 978        }
 979
 980        dmn->send_ring->buf_size = size;
 981
 982        dmn->send_ring->mr = dr_reg_mr(dmn->mdev,
 983                                       dmn->pdn, dmn->send_ring->buf, size);
 984        if (!dmn->send_ring->mr) {
 985                ret = -ENOMEM;
 986                goto free_mem;
 987        }
 988
 989        dmn->send_ring->sync_mr = dr_reg_mr(dmn->mdev,
 990                                            dmn->pdn, dmn->send_ring->sync_buff,
 991                                            MIN_READ_SYNC);
 992        if (!dmn->send_ring->sync_mr) {
 993                ret = -ENOMEM;
 994                goto clean_mr;
 995        }
 996
 997        return 0;
 998
 999clean_mr:
1000        dr_dereg_mr(dmn->mdev, dmn->send_ring->mr);
1001free_mem:
1002        kfree(dmn->send_ring->buf);
1003clean_qp:
1004        dr_destroy_qp(dmn->mdev, dmn->send_ring->qp);
1005clean_cq:
1006        dr_destroy_cq(dmn->mdev, dmn->send_ring->cq);
1007free_send_ring:
1008        kfree(dmn->send_ring);
1009
1010        return ret;
1011}
1012
1013void mlx5dr_send_ring_free(struct mlx5dr_domain *dmn,
1014                           struct mlx5dr_send_ring *send_ring)
1015{
1016        dr_destroy_qp(dmn->mdev, send_ring->qp);
1017        dr_destroy_cq(dmn->mdev, send_ring->cq);
1018        dr_dereg_mr(dmn->mdev, send_ring->sync_mr);
1019        dr_dereg_mr(dmn->mdev, send_ring->mr);
1020        kfree(send_ring->buf);
1021        kfree(send_ring);
1022}
1023
1024int mlx5dr_send_ring_force_drain(struct mlx5dr_domain *dmn)
1025{
1026        struct mlx5dr_send_ring *send_ring = dmn->send_ring;
1027        struct postsend_info send_info = {};
1028        u8 data[DR_STE_SIZE];
1029        int num_of_sends_req;
1030        int ret;
1031        int i;
1032
 1033        /* Sending this number of requests guarantees that the queue will drain */
1034        num_of_sends_req = send_ring->signal_th * TH_NUMS_TO_DRAIN / 2;
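             /* Each post produces two WQEs, so these posts add
              * signal_th * TH_NUMS_TO_DRAIN WQEs to pending_wqe, enough to
              * reach the full-drain threshold checked in
              * dr_handle_pending_wc().
              */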
1035
1036        /* Send fake requests forcing the last to be signaled */
1037        send_info.write.addr = (uintptr_t)data;
1038        send_info.write.length = DR_STE_SIZE;
1039        send_info.write.lkey = 0;
1040        /* Using the sync_mr in order to write/read */
1041        send_info.remote_addr = (uintptr_t)send_ring->sync_mr->addr;
1042        send_info.rkey = send_ring->sync_mr->mkey.key;
1043
1044        for (i = 0; i < num_of_sends_req; i++) {
1045                ret = dr_postsend_icm_data(dmn, &send_info);
1046                if (ret)
1047                        return ret;
1048        }
1049
1050        spin_lock(&send_ring->lock);
1051        ret = dr_handle_pending_wc(dmn, send_ring);
1052        spin_unlock(&send_ring->lock);
1053
1054        return ret;
1055}
1056