linux/drivers/infiniband/hw/mlx5/main.c
   1/*
   2 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
   3 *
   4 * This software is available to you under a choice of one of two
   5 * licenses.  You may choose to be licensed under the terms of the GNU
   6 * General Public License (GPL) Version 2, available from the file
   7 * COPYING in the main directory of this source tree, or the
   8 * OpenIB.org BSD license below:
   9 *
  10 *     Redistribution and use in source and binary forms, with or
  11 *     without modification, are permitted provided that the following
  12 *     conditions are met:
  13 *
  14 *      - Redistributions of source code must retain the above
  15 *        copyright notice, this list of conditions and the following
  16 *        disclaimer.
  17 *
  18 *      - Redistributions in binary form must reproduce the above
  19 *        copyright notice, this list of conditions and the following
  20 *        disclaimer in the documentation and/or other materials
  21 *        provided with the distribution.
  22 *
  23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  30 * SOFTWARE.
  31 */
  32
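     /*
      * Main entry points of the mlx5 IB driver: device, port, GID and P_Key
      * queries, user context allocation, mmap of UAR and core clock pages,
      * PD allocation and flow steering helpers.
      */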
  33#include <linux/highmem.h>
  34#include <linux/module.h>
  35#include <linux/init.h>
  36#include <linux/errno.h>
  37#include <linux/pci.h>
  38#include <linux/dma-mapping.h>
  39#include <linux/slab.h>
  40#if defined(CONFIG_X86)
  41#include <asm/pat.h>
  42#endif
  43#include <linux/sched.h>
  44#include <linux/delay.h>
  45#include <rdma/ib_user_verbs.h>
  46#include <rdma/ib_addr.h>
  47#include <rdma/ib_cache.h>
  48#include <linux/mlx5/port.h>
  49#include <linux/mlx5/vport.h>
  50#include <linux/list.h>
  51#include <rdma/ib_smi.h>
  52#include <rdma/ib_umem.h>
  53#include <linux/in.h>
  54#include <linux/etherdevice.h>
  55#include <linux/mlx5/fs.h>
  56#include "mlx5_ib.h"
  57
  58#define DRIVER_NAME "mlx5_ib"
  59#define DRIVER_VERSION "2.2-1"
  60#define DRIVER_RELDATE  "Feb 2014"
  61
  62MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
  63MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver");
  64MODULE_LICENSE("Dual BSD/GPL");
  65MODULE_VERSION(DRIVER_VERSION);
  66
  67static int deprecated_prof_sel = 2;
  68module_param_named(prof_sel, deprecated_prof_sel, int, 0444);
  69MODULE_PARM_DESC(prof_sel, "profile selector. Deprecated here. Moved to module mlx5_core");
  70
  71static char mlx5_version[] =
  72        DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v"
  73        DRIVER_VERSION " (" DRIVER_RELDATE ")\n";
  74
  75enum {
  76        MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3,
  77};
  78
  79static enum rdma_link_layer
  80mlx5_port_type_cap_to_rdma_ll(int port_type_cap)
  81{
  82        switch (port_type_cap) {
  83        case MLX5_CAP_PORT_TYPE_IB:
  84                return IB_LINK_LAYER_INFINIBAND;
  85        case MLX5_CAP_PORT_TYPE_ETH:
  86                return IB_LINK_LAYER_ETHERNET;
  87        default:
  88                return IB_LINK_LAYER_UNSPECIFIED;
  89        }
  90}
  91
  92static enum rdma_link_layer
  93mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num)
  94{
  95        struct mlx5_ib_dev *dev = to_mdev(device);
  96        int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type);
  97
  98        return mlx5_port_type_cap_to_rdma_ll(port_type_cap);
  99}
 100
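     /*
      * Netdevice notifier for RoCE: tracks which net_device backs the IB port
      * and translates UP/DOWN events on that device (or on its bond master
      * when LAG is active) into IB_EVENT_PORT_ACTIVE / IB_EVENT_PORT_ERR
      * events on port 1.
      */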
 101static int mlx5_netdev_event(struct notifier_block *this,
 102                             unsigned long event, void *ptr)
 103{
 104        struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
 105        struct mlx5_ib_dev *ibdev = container_of(this, struct mlx5_ib_dev,
 106                                                 roce.nb);
 107
 108        switch (event) {
 109        case NETDEV_REGISTER:
 110        case NETDEV_UNREGISTER:
 111                write_lock(&ibdev->roce.netdev_lock);
 112                if (ndev->dev.parent == &ibdev->mdev->pdev->dev)
 113                        ibdev->roce.netdev = (event == NETDEV_UNREGISTER) ?
 114                                             NULL : ndev;
 115                write_unlock(&ibdev->roce.netdev_lock);
 116                break;
 117
 118        case NETDEV_UP:
 119        case NETDEV_DOWN: {
 120                struct net_device *lag_ndev = mlx5_lag_get_roce_netdev(ibdev->mdev);
 121                struct net_device *upper = NULL;
 122
 123                if (lag_ndev) {
 124                        upper = netdev_master_upper_dev_get(lag_ndev);
 125                        dev_put(lag_ndev);
 126                }
 127
 128                if ((upper == ndev || (!upper && ndev == ibdev->roce.netdev))
 129                    && ibdev->ib_active) {
 130                        struct ib_event ibev = {0};
 131
 132                        ibev.device = &ibdev->ib_dev;
 133                        ibev.event = (event == NETDEV_UP) ?
 134                                     IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
 135                        ibev.element.port_num = 1;
 136                        ib_dispatch_event(&ibev);
 137                }
 138                break;
 139        }
 140
 141        default:
 142                break;
 143        }
 144
 145        return NOTIFY_DONE;
 146}
 147
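     /*
      * Return the net_device backing the RoCE port, preferring the LAG netdev
      * when bonding is active.  A reference is held on the returned device;
      * the caller must dev_put() it.
      */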
 148static struct net_device *mlx5_ib_get_netdev(struct ib_device *device,
 149                                             u8 port_num)
 150{
 151        struct mlx5_ib_dev *ibdev = to_mdev(device);
 152        struct net_device *ndev;
 153
 154        ndev = mlx5_lag_get_roce_netdev(ibdev->mdev);
 155        if (ndev)
 156                return ndev;
 157
 158        /* Ensure ndev does not disappear before we invoke dev_hold()
 159         */
 160        read_lock(&ibdev->roce.netdev_lock);
 161        ndev = ibdev->roce.netdev;
 162        if (ndev)
 163                dev_hold(ndev);
 164        read_unlock(&ibdev->roce.netdev_lock);
 165
 166        return ndev;
 167}
 168
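     /*
      * Fill ib_port_attr for an Ethernet (RoCE) port: link state and MTU come
      * from the backing net_device (or its bond master under LAG), while
      * active width and speed are still hard-coded, see the TODOs below.
      */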
 169static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
 170                                struct ib_port_attr *props)
 171{
 172        struct mlx5_ib_dev *dev = to_mdev(device);
 173        struct net_device *ndev, *upper;
 174        enum ib_mtu ndev_ib_mtu;
 175        u16 qkey_viol_cntr;
 176
 177        memset(props, 0, sizeof(*props));
 178
 179        props->port_cap_flags  |= IB_PORT_CM_SUP;
 180        props->port_cap_flags  |= IB_PORT_IP_BASED_GIDS;
 181
 182        props->gid_tbl_len      = MLX5_CAP_ROCE(dev->mdev,
 183                                                roce_address_table_size);
 184        props->max_mtu          = IB_MTU_4096;
 185        props->max_msg_sz       = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg);
 186        props->pkey_tbl_len     = 1;
 187        props->state            = IB_PORT_DOWN;
  188        props->phys_state       = 3; /* Disabled */
 189
 190        mlx5_query_nic_vport_qkey_viol_cntr(dev->mdev, &qkey_viol_cntr);
 191        props->qkey_viol_cntr = qkey_viol_cntr;
 192
 193        ndev = mlx5_ib_get_netdev(device, port_num);
 194        if (!ndev)
 195                return 0;
 196
 197        if (mlx5_lag_is_active(dev->mdev)) {
 198                rcu_read_lock();
 199                upper = netdev_master_upper_dev_get_rcu(ndev);
 200                if (upper) {
 201                        dev_put(ndev);
 202                        ndev = upper;
 203                        dev_hold(ndev);
 204                }
 205                rcu_read_unlock();
 206        }
 207
 208        if (netif_running(ndev) && netif_carrier_ok(ndev)) {
 209                props->state      = IB_PORT_ACTIVE;
  210                props->phys_state = 5; /* LinkUp */
 211        }
 212
 213        ndev_ib_mtu = iboe_get_mtu(ndev->mtu);
 214
 215        dev_put(ndev);
 216
 217        props->active_mtu       = min(props->max_mtu, ndev_ib_mtu);
 218
 219        props->active_width     = IB_WIDTH_4X;  /* TODO */
 220        props->active_speed     = IB_SPEED_QDR; /* TODO */
 221
 222        return 0;
 223}
 224
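     /*
      * Pack a GID and its attributes (source MAC, VLAN, GID type) into the
      * roce_addr_layout consumed by SET_ROCE_ADDRESS.  For RoCE v2 GIDs that
      * are IPv4-mapped, only the IPv4 address (the last four bytes of the
      * GID) is copied; all other GIDs are copied in full.
      */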
 225static void ib_gid_to_mlx5_roce_addr(const union ib_gid *gid,
 226                                     const struct ib_gid_attr *attr,
 227                                     void *mlx5_addr)
 228{
 229#define MLX5_SET_RA(p, f, v) MLX5_SET(roce_addr_layout, p, f, v)
 230        char *mlx5_addr_l3_addr = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr,
 231                                               source_l3_address);
 232        void *mlx5_addr_mac     = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr,
 233                                               source_mac_47_32);
 234
 235        if (!gid)
 236                return;
 237
 238        ether_addr_copy(mlx5_addr_mac, attr->ndev->dev_addr);
 239
 240        if (is_vlan_dev(attr->ndev)) {
 241                MLX5_SET_RA(mlx5_addr, vlan_valid, 1);
 242                MLX5_SET_RA(mlx5_addr, vlan_id, vlan_dev_vlan_id(attr->ndev));
 243        }
 244
 245        switch (attr->gid_type) {
 246        case IB_GID_TYPE_IB:
 247                MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_1);
 248                break;
 249        case IB_GID_TYPE_ROCE_UDP_ENCAP:
 250                MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_2);
 251                break;
 252
 253        default:
 254                WARN_ON(true);
 255        }
 256
 257        if (attr->gid_type != IB_GID_TYPE_IB) {
 258                if (ipv6_addr_v4mapped((void *)gid))
 259                        MLX5_SET_RA(mlx5_addr, roce_l3_type,
 260                                    MLX5_ROCE_L3_TYPE_IPV4);
 261                else
 262                        MLX5_SET_RA(mlx5_addr, roce_l3_type,
 263                                    MLX5_ROCE_L3_TYPE_IPV6);
 264        }
 265
 266        if ((attr->gid_type == IB_GID_TYPE_IB) ||
 267            !ipv6_addr_v4mapped((void *)gid))
 268                memcpy(mlx5_addr_l3_addr, gid, sizeof(*gid));
 269        else
 270                memcpy(&mlx5_addr_l3_addr[12], &gid->raw[12], 4);
 271}
 272
 273static int set_roce_addr(struct ib_device *device, u8 port_num,
 274                         unsigned int index,
 275                         const union ib_gid *gid,
 276                         const struct ib_gid_attr *attr)
 277{
 278        struct mlx5_ib_dev *dev = to_mdev(device);
 279        u32  in[MLX5_ST_SZ_DW(set_roce_address_in)]  = {0};
 280        u32 out[MLX5_ST_SZ_DW(set_roce_address_out)] = {0};
 281        void *in_addr = MLX5_ADDR_OF(set_roce_address_in, in, roce_address);
 282        enum rdma_link_layer ll = mlx5_ib_port_link_layer(device, port_num);
 283
 284        if (ll != IB_LINK_LAYER_ETHERNET)
 285                return -EINVAL;
 286
 287        ib_gid_to_mlx5_roce_addr(gid, attr, in_addr);
 288
 289        MLX5_SET(set_roce_address_in, in, roce_address_index, index);
 290        MLX5_SET(set_roce_address_in, in, opcode, MLX5_CMD_OP_SET_ROCE_ADDRESS);
 291        return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
 292}
 293
 294static int mlx5_ib_add_gid(struct ib_device *device, u8 port_num,
 295                           unsigned int index, const union ib_gid *gid,
 296                           const struct ib_gid_attr *attr,
 297                           __always_unused void **context)
 298{
 299        return set_roce_addr(device, port_num, index, gid, attr);
 300}
 301
 302static int mlx5_ib_del_gid(struct ib_device *device, u8 port_num,
 303                           unsigned int index, __always_unused void **context)
 304{
 305        return set_roce_addr(device, port_num, index, NULL, NULL);
 306}
 307
 308__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num,
 309                               int index)
 310{
 311        struct ib_gid_attr attr;
 312        union ib_gid gid;
 313
 314        if (ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &attr))
 315                return 0;
 316
 317        if (!attr.ndev)
 318                return 0;
 319
 320        dev_put(attr.ndev);
 321
 322        if (attr.gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP)
 323                return 0;
 324
 325        return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port));
 326}
 327
 328static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev)
 329{
 330        if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB)
 331                return !MLX5_CAP_GEN(dev->mdev, ib_virt);
 332        return 0;
 333}
 334
 335enum {
 336        MLX5_VPORT_ACCESS_METHOD_MAD,
 337        MLX5_VPORT_ACCESS_METHOD_HCA,
 338        MLX5_VPORT_ACCESS_METHOD_NIC,
 339};
 340
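     /*
      * Select how vport attributes are queried: MAD ifc for IB ports without
      * ib_virt support, NIC vport commands for Ethernet ports and HCA vport
      * commands otherwise.
      */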
 341static int mlx5_get_vport_access_method(struct ib_device *ibdev)
 342{
 343        if (mlx5_use_mad_ifc(to_mdev(ibdev)))
 344                return MLX5_VPORT_ACCESS_METHOD_MAD;
 345
 346        if (mlx5_ib_port_link_layer(ibdev, 1) ==
 347            IB_LINK_LAYER_ETHERNET)
 348                return MLX5_VPORT_ACCESS_METHOD_NIC;
 349
 350        return MLX5_VPORT_ACCESS_METHOD_HCA;
 351}
 352
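     /*
      * Advertise IB_ATOMIC_HCA only if the device supports 8 byte
      * compare-and-swap and fetch-and-add and can respond in host endianness;
      * otherwise advertise IB_ATOMIC_NONE.
      */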
 353static void get_atomic_caps(struct mlx5_ib_dev *dev,
 354                            struct ib_device_attr *props)
 355{
 356        u8 tmp;
 357        u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations);
 358        u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp);
 359        u8 atomic_req_8B_endianness_mode =
 360                MLX5_CAP_ATOMIC(dev->mdev, atomic_req_8B_endianess_mode);
 361
  362        /* Check if HW supports 8 byte standard atomic operations and is
  363         * capable of responding in host endianness
  364         */
 365        tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD;
 366        if (((atomic_operations & tmp) == tmp) &&
 367            (atomic_size_qp & MLX5_ATOMIC_SIZE_QP_8BYTES) &&
 368            (atomic_req_8B_endianness_mode)) {
 369                props->atomic_cap = IB_ATOMIC_HCA;
 370        } else {
 371                props->atomic_cap = IB_ATOMIC_NONE;
 372        }
 373}
 374
 375static int mlx5_query_system_image_guid(struct ib_device *ibdev,
 376                                        __be64 *sys_image_guid)
 377{
 378        struct mlx5_ib_dev *dev = to_mdev(ibdev);
 379        struct mlx5_core_dev *mdev = dev->mdev;
 380        u64 tmp;
 381        int err;
 382
 383        switch (mlx5_get_vport_access_method(ibdev)) {
 384        case MLX5_VPORT_ACCESS_METHOD_MAD:
 385                return mlx5_query_mad_ifc_system_image_guid(ibdev,
 386                                                            sys_image_guid);
 387
 388        case MLX5_VPORT_ACCESS_METHOD_HCA:
 389                err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp);
 390                break;
 391
 392        case MLX5_VPORT_ACCESS_METHOD_NIC:
 393                err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp);
 394                break;
 395
 396        default:
 397                return -EINVAL;
 398        }
 399
 400        if (!err)
 401                *sys_image_guid = cpu_to_be64(tmp);
 402
 403        return err;
 404
 405}
 406
 407static int mlx5_query_max_pkeys(struct ib_device *ibdev,
 408                                u16 *max_pkeys)
 409{
 410        struct mlx5_ib_dev *dev = to_mdev(ibdev);
 411        struct mlx5_core_dev *mdev = dev->mdev;
 412
 413        switch (mlx5_get_vport_access_method(ibdev)) {
 414        case MLX5_VPORT_ACCESS_METHOD_MAD:
 415                return mlx5_query_mad_ifc_max_pkeys(ibdev, max_pkeys);
 416
 417        case MLX5_VPORT_ACCESS_METHOD_HCA:
 418        case MLX5_VPORT_ACCESS_METHOD_NIC:
 419                *max_pkeys = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev,
 420                                                pkey_table_size));
 421                return 0;
 422
 423        default:
 424                return -EINVAL;
 425        }
 426}
 427
 428static int mlx5_query_vendor_id(struct ib_device *ibdev,
 429                                u32 *vendor_id)
 430{
 431        struct mlx5_ib_dev *dev = to_mdev(ibdev);
 432
 433        switch (mlx5_get_vport_access_method(ibdev)) {
 434        case MLX5_VPORT_ACCESS_METHOD_MAD:
 435                return mlx5_query_mad_ifc_vendor_id(ibdev, vendor_id);
 436
 437        case MLX5_VPORT_ACCESS_METHOD_HCA:
 438        case MLX5_VPORT_ACCESS_METHOD_NIC:
 439                return mlx5_core_query_vendor_id(dev->mdev, vendor_id);
 440
 441        default:
 442                return -EINVAL;
 443        }
 444}
 445
 446static int mlx5_query_node_guid(struct mlx5_ib_dev *dev,
 447                                __be64 *node_guid)
 448{
 449        u64 tmp;
 450        int err;
 451
 452        switch (mlx5_get_vport_access_method(&dev->ib_dev)) {
 453        case MLX5_VPORT_ACCESS_METHOD_MAD:
 454                return mlx5_query_mad_ifc_node_guid(dev, node_guid);
 455
 456        case MLX5_VPORT_ACCESS_METHOD_HCA:
 457                err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp);
 458                break;
 459
 460        case MLX5_VPORT_ACCESS_METHOD_NIC:
 461                err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp);
 462                break;
 463
 464        default:
 465                return -EINVAL;
 466        }
 467
 468        if (!err)
 469                *node_guid = cpu_to_be64(tmp);
 470
 471        return err;
 472}
 473
 474struct mlx5_reg_node_desc {
 475        u8      desc[IB_DEVICE_NODE_DESC_MAX];
 476};
 477
 478static int mlx5_query_node_desc(struct mlx5_ib_dev *dev, char *node_desc)
 479{
 480        struct mlx5_reg_node_desc in;
 481
 482        if (mlx5_use_mad_ifc(dev))
 483                return mlx5_query_mad_ifc_node_desc(dev, node_desc);
 484
 485        memset(&in, 0, sizeof(in));
 486
 487        return mlx5_core_access_reg(dev->mdev, &in, sizeof(in), node_desc,
 488                                    sizeof(struct mlx5_reg_node_desc),
 489                                    MLX5_REG_NODE_DESC, 0, 0);
 490}
 491
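     /*
      * Query device attributes.  Capability flags and limits are derived from
      * the HCA capability pages; when user space passes a response buffer
      * (uhw), TSO and RSS capabilities are reported through the
      * vendor-specific part of the response as well.
      */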
 492static int mlx5_ib_query_device(struct ib_device *ibdev,
 493                                struct ib_device_attr *props,
 494                                struct ib_udata *uhw)
 495{
 496        struct mlx5_ib_dev *dev = to_mdev(ibdev);
 497        struct mlx5_core_dev *mdev = dev->mdev;
 498        int err = -ENOMEM;
 499        int max_rq_sg;
 500        int max_sq_sg;
 501        u64 min_page_size = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz);
 502        struct mlx5_ib_query_device_resp resp = {};
 503        size_t resp_len;
 504        u64 max_tso;
 505
 506        resp_len = sizeof(resp.comp_mask) + sizeof(resp.response_length);
 507        if (uhw->outlen && uhw->outlen < resp_len)
 508                return -EINVAL;
 509        else
 510                resp.response_length = resp_len;
 511
 512        if (uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen))
 513                return -EINVAL;
 514
 515        memset(props, 0, sizeof(*props));
 516        err = mlx5_query_system_image_guid(ibdev,
 517                                           &props->sys_image_guid);
 518        if (err)
 519                return err;
 520
 521        err = mlx5_query_max_pkeys(ibdev, &props->max_pkeys);
 522        if (err)
 523                return err;
 524
 525        err = mlx5_query_vendor_id(ibdev, &props->vendor_id);
 526        if (err)
 527                return err;
 528
 529        props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) |
 530                (fw_rev_min(dev->mdev) << 16) |
 531                fw_rev_sub(dev->mdev);
 532        props->device_cap_flags    = IB_DEVICE_CHANGE_PHY_PORT |
 533                IB_DEVICE_PORT_ACTIVE_EVENT             |
 534                IB_DEVICE_SYS_IMAGE_GUID                |
 535                IB_DEVICE_RC_RNR_NAK_GEN;
 536
 537        if (MLX5_CAP_GEN(mdev, pkv))
 538                props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
 539        if (MLX5_CAP_GEN(mdev, qkv))
 540                props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
 541        if (MLX5_CAP_GEN(mdev, apm))
 542                props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
 543        if (MLX5_CAP_GEN(mdev, xrc))
 544                props->device_cap_flags |= IB_DEVICE_XRC;
 545        if (MLX5_CAP_GEN(mdev, imaicl)) {
 546                props->device_cap_flags |= IB_DEVICE_MEM_WINDOW |
 547                                           IB_DEVICE_MEM_WINDOW_TYPE_2B;
 548                props->max_mw = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
 549                /* We support 'Gappy' memory registration too */
 550                props->device_cap_flags |= IB_DEVICE_SG_GAPS_REG;
 551        }
 552        props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
 553        if (MLX5_CAP_GEN(mdev, sho)) {
 554                props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER;
 555                /* At this stage no support for signature handover */
 556                props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 |
 557                                      IB_PROT_T10DIF_TYPE_2 |
 558                                      IB_PROT_T10DIF_TYPE_3;
 559                props->sig_guard_cap = IB_GUARD_T10DIF_CRC |
 560                                       IB_GUARD_T10DIF_CSUM;
 561        }
 562        if (MLX5_CAP_GEN(mdev, block_lb_mc))
 563                props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
 564
 565        if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads)) {
 566                if (MLX5_CAP_ETH(mdev, csum_cap))
 567                        props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM;
 568
 569                if (field_avail(typeof(resp), tso_caps, uhw->outlen)) {
 570                        max_tso = MLX5_CAP_ETH(mdev, max_lso_cap);
 571                        if (max_tso) {
 572                                resp.tso_caps.max_tso = 1 << max_tso;
 573                                resp.tso_caps.supported_qpts |=
 574                                        1 << IB_QPT_RAW_PACKET;
 575                                resp.response_length += sizeof(resp.tso_caps);
 576                        }
 577                }
 578
 579                if (field_avail(typeof(resp), rss_caps, uhw->outlen)) {
 580                        resp.rss_caps.rx_hash_function =
 581                                                MLX5_RX_HASH_FUNC_TOEPLITZ;
 582                        resp.rss_caps.rx_hash_fields_mask =
 583                                                MLX5_RX_HASH_SRC_IPV4 |
 584                                                MLX5_RX_HASH_DST_IPV4 |
 585                                                MLX5_RX_HASH_SRC_IPV6 |
 586                                                MLX5_RX_HASH_DST_IPV6 |
 587                                                MLX5_RX_HASH_SRC_PORT_TCP |
 588                                                MLX5_RX_HASH_DST_PORT_TCP |
 589                                                MLX5_RX_HASH_SRC_PORT_UDP |
 590                                                MLX5_RX_HASH_DST_PORT_UDP;
 591                        resp.response_length += sizeof(resp.rss_caps);
 592                }
 593        } else {
 594                if (field_avail(typeof(resp), tso_caps, uhw->outlen))
 595                        resp.response_length += sizeof(resp.tso_caps);
 596                if (field_avail(typeof(resp), rss_caps, uhw->outlen))
 597                        resp.response_length += sizeof(resp.rss_caps);
 598        }
 599
 600        if (MLX5_CAP_GEN(mdev, ipoib_basic_offloads)) {
 601                props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
 602                props->device_cap_flags |= IB_DEVICE_UD_TSO;
 603        }
 604
 605        if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) &&
 606            MLX5_CAP_ETH(dev->mdev, scatter_fcs))
 607                props->device_cap_flags |= IB_DEVICE_RAW_SCATTER_FCS;
 608
 609        if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS))
 610                props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING;
 611
 612        props->vendor_part_id      = mdev->pdev->device;
 613        props->hw_ver              = mdev->pdev->revision;
 614
 615        props->max_mr_size         = ~0ull;
 616        props->page_size_cap       = ~(min_page_size - 1);
 617        props->max_qp              = 1 << MLX5_CAP_GEN(mdev, log_max_qp);
 618        props->max_qp_wr           = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
 619        max_rq_sg =  MLX5_CAP_GEN(mdev, max_wqe_sz_rq) /
 620                     sizeof(struct mlx5_wqe_data_seg);
 621        max_sq_sg = (MLX5_CAP_GEN(mdev, max_wqe_sz_sq) -
 622                     sizeof(struct mlx5_wqe_ctrl_seg)) /
 623                     sizeof(struct mlx5_wqe_data_seg);
 624        props->max_sge = min(max_rq_sg, max_sq_sg);
 625        props->max_sge_rd          = MLX5_MAX_SGE_RD;
 626        props->max_cq              = 1 << MLX5_CAP_GEN(mdev, log_max_cq);
 627        props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1;
 628        props->max_mr              = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
 629        props->max_pd              = 1 << MLX5_CAP_GEN(mdev, log_max_pd);
 630        props->max_qp_rd_atom      = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp);
 631        props->max_qp_init_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_res_qp);
 632        props->max_srq             = 1 << MLX5_CAP_GEN(mdev, log_max_srq);
 633        props->max_srq_wr = (1 << MLX5_CAP_GEN(mdev, log_max_srq_sz)) - 1;
 634        props->local_ca_ack_delay  = MLX5_CAP_GEN(mdev, local_ca_ack_delay);
 635        props->max_res_rd_atom     = props->max_qp_rd_atom * props->max_qp;
 636        props->max_srq_sge         = max_rq_sg - 1;
 637        props->max_fast_reg_page_list_len =
 638                1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size);
 639        get_atomic_caps(dev, props);
 640        props->masked_atomic_cap   = IB_ATOMIC_NONE;
 641        props->max_mcast_grp       = 1 << MLX5_CAP_GEN(mdev, log_max_mcg);
 642        props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg);
 643        props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
 644                                           props->max_mcast_grp;
 645        props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */
 646        props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz);
 647        props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL;
 648
 649#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 650        if (MLX5_CAP_GEN(mdev, pg))
 651                props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
 652        props->odp_caps = dev->odp_caps;
 653#endif
 654
 655        if (MLX5_CAP_GEN(mdev, cd))
 656                props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL;
 657
 658        if (!mlx5_core_is_pf(mdev))
 659                props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION;
 660
 661        if (mlx5_ib_port_link_layer(ibdev, 1) ==
 662            IB_LINK_LAYER_ETHERNET) {
 663                props->rss_caps.max_rwq_indirection_tables =
 664                        1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt);
 665                props->rss_caps.max_rwq_indirection_table_size =
 666                        1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt_size);
 667                props->rss_caps.supported_qpts = 1 << IB_QPT_RAW_PACKET;
 668                props->max_wq_type_rq =
 669                        1 << MLX5_CAP_GEN(dev->mdev, log_max_rq);
 670        }
 671
 672        if (uhw->outlen) {
 673                err = ib_copy_to_udata(uhw, &resp, resp.response_length);
 674
 675                if (err)
 676                        return err;
 677        }
 678
 679        return 0;
 680}
 681
 682enum mlx5_ib_width {
 683        MLX5_IB_WIDTH_1X        = 1 << 0,
 684        MLX5_IB_WIDTH_2X        = 1 << 1,
 685        MLX5_IB_WIDTH_4X        = 1 << 2,
 686        MLX5_IB_WIDTH_8X        = 1 << 3,
 687        MLX5_IB_WIDTH_12X       = 1 << 4
 688};
 689
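     /*
      * Translate the MLX5_IB_WIDTH_* bitmask reported by the device into the
      * IB_WIDTH_* enum.  2X has no IB spec counterpart and is rejected.
      */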
 690static int translate_active_width(struct ib_device *ibdev, u8 active_width,
 691                                  u8 *ib_width)
 692{
 693        struct mlx5_ib_dev *dev = to_mdev(ibdev);
 694        int err = 0;
 695
 696        if (active_width & MLX5_IB_WIDTH_1X) {
 697                *ib_width = IB_WIDTH_1X;
 698        } else if (active_width & MLX5_IB_WIDTH_2X) {
 699                mlx5_ib_dbg(dev, "active_width %d is not supported by IB spec\n",
 700                            (int)active_width);
 701                err = -EINVAL;
 702        } else if (active_width & MLX5_IB_WIDTH_4X) {
 703                *ib_width = IB_WIDTH_4X;
 704        } else if (active_width & MLX5_IB_WIDTH_8X) {
 705                *ib_width = IB_WIDTH_8X;
 706        } else if (active_width & MLX5_IB_WIDTH_12X) {
 707                *ib_width = IB_WIDTH_12X;
 708        } else {
 709                mlx5_ib_dbg(dev, "Invalid active_width %d\n",
 710                            (int)active_width);
 711                err = -EINVAL;
 712        }
 713
 714        return err;
 715}
 716
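     /* Translate an MTU in bytes into the corresponding IB MTU enum value. */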
 717static int mlx5_mtu_to_ib_mtu(int mtu)
 718{
 719        switch (mtu) {
 720        case 256: return 1;
 721        case 512: return 2;
 722        case 1024: return 3;
 723        case 2048: return 4;
 724        case 4096: return 5;
 725        default:
 726                pr_warn("invalid mtu\n");
 727                return -1;
 728        }
 729}
 730
 731enum ib_max_vl_num {
 732        __IB_MAX_VL_0           = 1,
 733        __IB_MAX_VL_0_1         = 2,
 734        __IB_MAX_VL_0_3         = 3,
 735        __IB_MAX_VL_0_7         = 4,
 736        __IB_MAX_VL_0_14        = 5,
 737};
 738
 739enum mlx5_vl_hw_cap {
 740        MLX5_VL_HW_0    = 1,
 741        MLX5_VL_HW_0_1  = 2,
 742        MLX5_VL_HW_0_2  = 3,
 743        MLX5_VL_HW_0_3  = 4,
 744        MLX5_VL_HW_0_4  = 5,
 745        MLX5_VL_HW_0_5  = 6,
 746        MLX5_VL_HW_0_6  = 7,
 747        MLX5_VL_HW_0_7  = 8,
 748        MLX5_VL_HW_0_14 = 15
 749};
 750
 751static int translate_max_vl_num(struct ib_device *ibdev, u8 vl_hw_cap,
 752                                u8 *max_vl_num)
 753{
 754        switch (vl_hw_cap) {
 755        case MLX5_VL_HW_0:
 756                *max_vl_num = __IB_MAX_VL_0;
 757                break;
 758        case MLX5_VL_HW_0_1:
 759                *max_vl_num = __IB_MAX_VL_0_1;
 760                break;
 761        case MLX5_VL_HW_0_3:
 762                *max_vl_num = __IB_MAX_VL_0_3;
 763                break;
 764        case MLX5_VL_HW_0_7:
 765                *max_vl_num = __IB_MAX_VL_0_7;
 766                break;
 767        case MLX5_VL_HW_0_14:
 768                *max_vl_num = __IB_MAX_VL_0_14;
 769                break;
 770
 771        default:
 772                return -EINVAL;
 773        }
 774
 775        return 0;
 776}
 777
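     /*
      * Query port attributes of an IB port: most fields come from the HCA
      * vport context, while operational link width, speed, MTU and VL
      * capabilities are read through the dedicated port query helpers.
      */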
 778static int mlx5_query_hca_port(struct ib_device *ibdev, u8 port,
 779                               struct ib_port_attr *props)
 780{
 781        struct mlx5_ib_dev *dev = to_mdev(ibdev);
 782        struct mlx5_core_dev *mdev = dev->mdev;
 783        struct mlx5_hca_vport_context *rep;
 784        u16 max_mtu;
 785        u16 oper_mtu;
 786        int err;
 787        u8 ib_link_width_oper;
 788        u8 vl_hw_cap;
 789
 790        rep = kzalloc(sizeof(*rep), GFP_KERNEL);
 791        if (!rep) {
 792                err = -ENOMEM;
 793                goto out;
 794        }
 795
 796        memset(props, 0, sizeof(*props));
 797
 798        err = mlx5_query_hca_vport_context(mdev, 0, port, 0, rep);
 799        if (err)
 800                goto out;
 801
 802        props->lid              = rep->lid;
 803        props->lmc              = rep->lmc;
 804        props->sm_lid           = rep->sm_lid;
 805        props->sm_sl            = rep->sm_sl;
 806        props->state            = rep->vport_state;
 807        props->phys_state       = rep->port_physical_state;
 808        props->port_cap_flags   = rep->cap_mask1;
 809        props->gid_tbl_len      = mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size));
 810        props->max_msg_sz       = 1 << MLX5_CAP_GEN(mdev, log_max_msg);
 811        props->pkey_tbl_len     = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size));
 812        props->bad_pkey_cntr    = rep->pkey_violation_counter;
 813        props->qkey_viol_cntr   = rep->qkey_violation_counter;
 814        props->subnet_timeout   = rep->subnet_timeout;
 815        props->init_type_reply  = rep->init_type_reply;
 816        props->grh_required     = rep->grh_required;
 817
 818        err = mlx5_query_port_link_width_oper(mdev, &ib_link_width_oper, port);
 819        if (err)
 820                goto out;
 821
 822        err = translate_active_width(ibdev, ib_link_width_oper,
 823                                     &props->active_width);
 824        if (err)
 825                goto out;
 826        err = mlx5_query_port_ib_proto_oper(mdev, &props->active_speed, port);
 827        if (err)
 828                goto out;
 829
 830        mlx5_query_port_max_mtu(mdev, &max_mtu, port);
 831
 832        props->max_mtu = mlx5_mtu_to_ib_mtu(max_mtu);
 833
 834        mlx5_query_port_oper_mtu(mdev, &oper_mtu, port);
 835
 836        props->active_mtu = mlx5_mtu_to_ib_mtu(oper_mtu);
 837
 838        err = mlx5_query_port_vl_hw_cap(mdev, &vl_hw_cap, port);
 839        if (err)
 840                goto out;
 841
 842        err = translate_max_vl_num(ibdev, vl_hw_cap,
 843                                   &props->max_vl_num);
 844out:
 845        kfree(rep);
 846        return err;
 847}
 848
 849int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
 850                       struct ib_port_attr *props)
 851{
 852        switch (mlx5_get_vport_access_method(ibdev)) {
 853        case MLX5_VPORT_ACCESS_METHOD_MAD:
 854                return mlx5_query_mad_ifc_port(ibdev, port, props);
 855
 856        case MLX5_VPORT_ACCESS_METHOD_HCA:
 857                return mlx5_query_hca_port(ibdev, port, props);
 858
 859        case MLX5_VPORT_ACCESS_METHOD_NIC:
 860                return mlx5_query_port_roce(ibdev, port, props);
 861
 862        default:
 863                return -EINVAL;
 864        }
 865}
 866
 867static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
 868                             union ib_gid *gid)
 869{
 870        struct mlx5_ib_dev *dev = to_mdev(ibdev);
 871        struct mlx5_core_dev *mdev = dev->mdev;
 872
 873        switch (mlx5_get_vport_access_method(ibdev)) {
 874        case MLX5_VPORT_ACCESS_METHOD_MAD:
 875                return mlx5_query_mad_ifc_gids(ibdev, port, index, gid);
 876
 877        case MLX5_VPORT_ACCESS_METHOD_HCA:
 878                return mlx5_query_hca_vport_gid(mdev, 0, port, 0, index, gid);
 879
 880        default:
 881                return -EINVAL;
 882        }
 883
 884}
 885
 886static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
 887                              u16 *pkey)
 888{
 889        struct mlx5_ib_dev *dev = to_mdev(ibdev);
 890        struct mlx5_core_dev *mdev = dev->mdev;
 891
 892        switch (mlx5_get_vport_access_method(ibdev)) {
 893        case MLX5_VPORT_ACCESS_METHOD_MAD:
 894                return mlx5_query_mad_ifc_pkey(ibdev, port, index, pkey);
 895
 896        case MLX5_VPORT_ACCESS_METHOD_HCA:
 897        case MLX5_VPORT_ACCESS_METHOD_NIC:
 898                return mlx5_query_hca_vport_pkey(mdev, 0, port,  0, index,
 899                                                 pkey);
 900        default:
 901                return -EINVAL;
 902        }
 903}
 904
 905static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask,
 906                                 struct ib_device_modify *props)
 907{
 908        struct mlx5_ib_dev *dev = to_mdev(ibdev);
 909        struct mlx5_reg_node_desc in;
 910        struct mlx5_reg_node_desc out;
 911        int err;
 912
 913        if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
 914                return -EOPNOTSUPP;
 915
 916        if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
 917                return 0;
 918
 919        /*
  920         * If possible, pass the node desc to FW so it can generate
  921         * a trap 144 (local changes) notice.  If the cmd fails, just ignore.
 922         */
 923        memcpy(&in, props->node_desc, IB_DEVICE_NODE_DESC_MAX);
 924        err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out,
 925                                   sizeof(out), MLX5_REG_NODE_DESC, 0, 1);
 926        if (err)
 927                return err;
 928
 929        memcpy(ibdev->node_desc, props->node_desc, IB_DEVICE_NODE_DESC_MAX);
 930
 931        return err;
 932}
 933
 934static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,
 935                               struct ib_port_modify *props)
 936{
 937        struct mlx5_ib_dev *dev = to_mdev(ibdev);
 938        struct ib_port_attr attr;
 939        u32 tmp;
 940        int err;
 941
 942        mutex_lock(&dev->cap_mask_mutex);
 943
 944        err = mlx5_ib_query_port(ibdev, port, &attr);
 945        if (err)
 946                goto out;
 947
 948        tmp = (attr.port_cap_flags | props->set_port_cap_mask) &
 949                ~props->clr_port_cap_mask;
 950
 951        err = mlx5_set_port_caps(dev->mdev, port, tmp);
 952
 953out:
 954        mutex_unlock(&dev->cap_mask_mutex);
 955        return err;
 956}
 957
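     /*
      * Allocate a user context: parse the v0/v2 request, carve the requested
      * UUARs out of freshly allocated UAR pages, optionally allocate a
      * transport domain and report device limits, the CQE version and the
      * core clock offset back to user space.
      */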
 958static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 959                                                  struct ib_udata *udata)
 960{
 961        struct mlx5_ib_dev *dev = to_mdev(ibdev);
 962        struct mlx5_ib_alloc_ucontext_req_v2 req = {};
 963        struct mlx5_ib_alloc_ucontext_resp resp = {};
 964        struct mlx5_ib_ucontext *context;
 965        struct mlx5_uuar_info *uuari;
 966        struct mlx5_uar *uars;
 967        int gross_uuars;
 968        int num_uars;
 969        int ver;
 970        int uuarn;
 971        int err;
 972        int i;
 973        size_t reqlen;
 974        size_t min_req_v2 = offsetof(struct mlx5_ib_alloc_ucontext_req_v2,
 975                                     max_cqe_version);
 976
 977        if (!dev->ib_active)
 978                return ERR_PTR(-EAGAIN);
 979
 980        if (udata->inlen < sizeof(struct ib_uverbs_cmd_hdr))
 981                return ERR_PTR(-EINVAL);
 982
 983        reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr);
 984        if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req))
 985                ver = 0;
 986        else if (reqlen >= min_req_v2)
 987                ver = 2;
 988        else
 989                return ERR_PTR(-EINVAL);
 990
 991        err = ib_copy_from_udata(&req, udata, min(reqlen, sizeof(req)));
 992        if (err)
 993                return ERR_PTR(err);
 994
 995        if (req.flags)
 996                return ERR_PTR(-EINVAL);
 997
 998        if (req.total_num_uuars > MLX5_MAX_UUARS)
 999                return ERR_PTR(-ENOMEM);
1000
1001        if (req.total_num_uuars == 0)
1002                return ERR_PTR(-EINVAL);
1003
1004        if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2)
1005                return ERR_PTR(-EOPNOTSUPP);
1006
1007        if (reqlen > sizeof(req) &&
1008            !ib_is_udata_cleared(udata, sizeof(req),
1009                                 reqlen - sizeof(req)))
1010                return ERR_PTR(-EOPNOTSUPP);
1011
1012        req.total_num_uuars = ALIGN(req.total_num_uuars,
1013                                    MLX5_NON_FP_BF_REGS_PER_PAGE);
1014        if (req.num_low_latency_uuars > req.total_num_uuars - 1)
1015                return ERR_PTR(-EINVAL);
1016
1017        num_uars = req.total_num_uuars / MLX5_NON_FP_BF_REGS_PER_PAGE;
1018        gross_uuars = num_uars * MLX5_BF_REGS_PER_PAGE;
1019        resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
1020        if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf))
1021                resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size);
1022        resp.cache_line_size = cache_line_size();
1023        resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq);
1024        resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq);
1025        resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
1026        resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
1027        resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
1028        resp.cqe_version = min_t(__u8,
1029                                 (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version),
1030                                 req.max_cqe_version);
1031        resp.response_length = min(offsetof(typeof(resp), response_length) +
1032                                   sizeof(resp.response_length), udata->outlen);
1033
1034        context = kzalloc(sizeof(*context), GFP_KERNEL);
1035        if (!context)
1036                return ERR_PTR(-ENOMEM);
1037
1038        uuari = &context->uuari;
1039        mutex_init(&uuari->lock);
1040        uars = kcalloc(num_uars, sizeof(*uars), GFP_KERNEL);
1041        if (!uars) {
1042                err = -ENOMEM;
1043                goto out_ctx;
1044        }
1045
1046        uuari->bitmap = kcalloc(BITS_TO_LONGS(gross_uuars),
1047                                sizeof(*uuari->bitmap),
1048                                GFP_KERNEL);
1049        if (!uuari->bitmap) {
1050                err = -ENOMEM;
1051                goto out_uar_ctx;
1052        }
1053        /*
1054         * clear all fast path uuars
1055         */
1056        for (i = 0; i < gross_uuars; i++) {
1057                uuarn = i & 3;
1058                if (uuarn == 2 || uuarn == 3)
1059                        set_bit(i, uuari->bitmap);
1060        }
1061
1062        uuari->count = kcalloc(gross_uuars, sizeof(*uuari->count), GFP_KERNEL);
1063        if (!uuari->count) {
1064                err = -ENOMEM;
1065                goto out_bitmap;
1066        }
1067
1068        for (i = 0; i < num_uars; i++) {
1069                err = mlx5_cmd_alloc_uar(dev->mdev, &uars[i].index);
1070                if (err)
1071                        goto out_count;
1072        }
1073
1074#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
1075        context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range;
1076#endif
1077
1078        if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) {
1079                err = mlx5_core_alloc_transport_domain(dev->mdev,
1080                                                       &context->tdn);
1081                if (err)
1082                        goto out_uars;
1083        }
1084
1085        INIT_LIST_HEAD(&context->vma_private_list);
1086        INIT_LIST_HEAD(&context->db_page_list);
1087        mutex_init(&context->db_page_mutex);
1088
1089        resp.tot_uuars = req.total_num_uuars;
1090        resp.num_ports = MLX5_CAP_GEN(dev->mdev, num_ports);
1091
1092        if (field_avail(typeof(resp), cqe_version, udata->outlen))
1093                resp.response_length += sizeof(resp.cqe_version);
1094
1095        if (field_avail(typeof(resp), cmds_supp_uhw, udata->outlen)) {
1096                resp.cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE;
1097                resp.response_length += sizeof(resp.cmds_supp_uhw);
1098        }
1099
1100        /*
1101         * We don't want to expose information from the PCI bar that is located
1102         * after 4096 bytes, so if the arch only supports larger pages, let's
1103         * pretend we don't support reading the HCA's core clock. This is also
 1104         * enforced by the mmap handler.
1105         */
1106        if (PAGE_SIZE <= 4096 &&
1107            field_avail(typeof(resp), hca_core_clock_offset, udata->outlen)) {
1108                resp.comp_mask |=
1109                        MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET;
1110                resp.hca_core_clock_offset =
1111                        offsetof(struct mlx5_init_seg, internal_timer_h) %
1112                        PAGE_SIZE;
1113                resp.response_length += sizeof(resp.hca_core_clock_offset) +
1114                                        sizeof(resp.reserved2);
1115        }
1116
1117        err = ib_copy_to_udata(udata, &resp, resp.response_length);
1118        if (err)
1119                goto out_td;
1120
1121        uuari->ver = ver;
1122        uuari->num_low_latency_uuars = req.num_low_latency_uuars;
1123        uuari->uars = uars;
1124        uuari->num_uars = num_uars;
1125        context->cqe_version = resp.cqe_version;
1126
1127        return &context->ibucontext;
1128
1129out_td:
1130        if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
1131                mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn);
1132
1133out_uars:
1134        for (i--; i >= 0; i--)
1135                mlx5_cmd_free_uar(dev->mdev, uars[i].index);
1136out_count:
1137        kfree(uuari->count);
1138
1139out_bitmap:
1140        kfree(uuari->bitmap);
1141
1142out_uar_ctx:
1143        kfree(uars);
1144
1145out_ctx:
1146        kfree(context);
1147        return ERR_PTR(err);
1148}
1149
1150static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
1151{
1152        struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
1153        struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
1154        struct mlx5_uuar_info *uuari = &context->uuari;
1155        int i;
1156
1157        if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
1158                mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn);
1159
1160        for (i = 0; i < uuari->num_uars; i++) {
1161                if (mlx5_cmd_free_uar(dev->mdev, uuari->uars[i].index))
1162                        mlx5_ib_warn(dev, "failed to free UAR 0x%x\n", uuari->uars[i].index);
1163        }
1164
1165        kfree(uuari->count);
1166        kfree(uuari->bitmap);
1167        kfree(uuari->uars);
1168        kfree(context);
1169
1170        return 0;
1171}
1172
1173static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, int index)
1174{
1175        return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + index;
1176}
1177
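     /*
      * The mmap offset encodes a command in the bits above
      * MLX5_IB_MMAP_CMD_SHIFT and a command specific argument (e.g. the UAR
      * page index) in the bits below it.
      */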
1178static int get_command(unsigned long offset)
1179{
1180        return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK;
1181}
1182
1183static int get_arg(unsigned long offset)
1184{
1185        return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1);
1186}
1187
1188static int get_index(unsigned long offset)
1189{
1190        return get_arg(offset);
1191}
1192
1193static void  mlx5_ib_vma_open(struct vm_area_struct *area)
1194{
1195        /* vma_open is called when a new VMA is created on top of our VMA.  This
1196         * is done through either mremap flow or split_vma (usually due to
1197         * mlock, madvise, munmap, etc.) We do not support a clone of the VMA,
1198         * as this VMA is strongly hardware related.  Therefore we set the
1199         * vm_ops of the newly created/cloned VMA to NULL, to prevent it from
1200         * calling us again and trying to do incorrect actions.  We assume that
 1201         * the original VMA size is exactly a single page, and therefore no
 1202         * "splitting" operation will happen to it.
1203         */
1204        area->vm_ops = NULL;
1205}
1206
1207static void  mlx5_ib_vma_close(struct vm_area_struct *area)
1208{
1209        struct mlx5_ib_vma_private_data *mlx5_ib_vma_priv_data;
1210
1211        /* It's guaranteed that all VMAs opened on a FD are closed before the
1212         * file itself is closed, therefore no sync is needed with the regular
1213         * closing flow. (e.g. mlx5 ib_dealloc_ucontext)
 1214         * However, we do need to sync against
 1215         * mlx5_ib_disassociate_ucontext(), which also accesses the vma.
1216         * The close operation is usually called under mm->mmap_sem except when
1217         * process is exiting.
1218         * The exiting case is handled explicitly as part of
1219         * mlx5_ib_disassociate_ucontext.
1220         */
1221        mlx5_ib_vma_priv_data = (struct mlx5_ib_vma_private_data *)area->vm_private_data;
1222
 1223        /* Set the vma context pointer to NULL in the mlx5_ib driver's
 1224         * private data to protect against a race condition in
1225         * mlx5_ib_disassociate_ucontext().
1226         */
1227        mlx5_ib_vma_priv_data->vma = NULL;
1228        list_del(&mlx5_ib_vma_priv_data->list);
1229        kfree(mlx5_ib_vma_priv_data);
1230}
1231
1232static const struct vm_operations_struct mlx5_ib_vm_ops = {
1233        .open = mlx5_ib_vma_open,
1234        .close = mlx5_ib_vma_close
1235};
1236
1237static int mlx5_ib_set_vma_data(struct vm_area_struct *vma,
1238                                struct mlx5_ib_ucontext *ctx)
1239{
1240        struct mlx5_ib_vma_private_data *vma_prv;
1241        struct list_head *vma_head = &ctx->vma_private_list;
1242
1243        vma_prv = kzalloc(sizeof(*vma_prv), GFP_KERNEL);
1244        if (!vma_prv)
1245                return -ENOMEM;
1246
1247        vma_prv->vma = vma;
1248        vma->vm_private_data = vma_prv;
1249        vma->vm_ops =  &mlx5_ib_vm_ops;
1250
1251        list_add(&vma_prv->list, vma_head);
1252
1253        return 0;
1254}
1255
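     /*
      * Called when the ucontext is disassociated from the underlying device
      * (e.g. on device removal with the uverbs file still open): zap the PTEs
      * of every VMA still mapped on this context so user space can no longer
      * touch the device, waiting for the owner if it is already exiting.
      */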
1256static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
1257{
1258        int ret;
1259        struct vm_area_struct *vma;
1260        struct mlx5_ib_vma_private_data *vma_private, *n;
1261        struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
1262        struct task_struct *owning_process  = NULL;
1263        struct mm_struct   *owning_mm       = NULL;
1264
1265        owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID);
1266        if (!owning_process)
1267                return;
1268
1269        owning_mm = get_task_mm(owning_process);
1270        if (!owning_mm) {
1271                pr_info("no mm, disassociate ucontext is pending task termination\n");
1272                while (1) {
1273                        put_task_struct(owning_process);
1274                        usleep_range(1000, 2000);
1275                        owning_process = get_pid_task(ibcontext->tgid,
1276                                                      PIDTYPE_PID);
1277                        if (!owning_process ||
1278                            owning_process->state == TASK_DEAD) {
1279                                pr_info("disassociate ucontext done, task was terminated\n");
 1280                                /* if we got a reference to a dead task,
 1281                                 * release its task struct.
 1282                                 */
1283                                if (owning_process)
1284                                        put_task_struct(owning_process);
1285                                return;
1286                        }
1287                }
1288        }
1289
 1290        /* need to protect against a race with the vma being closed by
1291         * mlx5_ib_vma_close.
1292         */
1293        down_read(&owning_mm->mmap_sem);
1294        list_for_each_entry_safe(vma_private, n, &context->vma_private_list,
1295                                 list) {
1296                vma = vma_private->vma;
1297                ret = zap_vma_ptes(vma, vma->vm_start,
1298                                   PAGE_SIZE);
1299                WARN_ONCE(ret, "%s: zap_vma_ptes failed", __func__);
 1300                /* the context is going to be destroyed, so the
 1301                 * vm_ops must not be accessed any more.
1302                 */
1303                vma->vm_ops = NULL;
1304                list_del(&vma_private->list);
1305                kfree(vma_private);
1306        }
1307        up_read(&owning_mm->mmap_sem);
1308        mmput(owning_mm);
1309        put_task_struct(owning_process);
1310}
1311
1312static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd)
1313{
1314        switch (cmd) {
1315        case MLX5_IB_MMAP_WC_PAGE:
1316                return "WC";
1317        case MLX5_IB_MMAP_REGULAR_PAGE:
1318                return "best effort WC";
1319        case MLX5_IB_MMAP_NC_PAGE:
1320                return "NC";
1321        default:
1322                return NULL;
1323        }
1324}
1325
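     /*
      * Map a single UAR page into user space.  Explicit WC mappings are only
      * allowed on architectures known to support write-combining;
      * REGULAR_PAGE falls back to best-effort WC and NC_PAGE is mapped
      * non-cached.
      */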
1326static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
1327                    struct vm_area_struct *vma,
1328                    struct mlx5_ib_ucontext *context)
1329{
1330        struct mlx5_uuar_info *uuari = &context->uuari;
1331        int err;
1332        unsigned long idx;
1333        phys_addr_t pfn, pa;
1334        pgprot_t prot;
1335
1336        switch (cmd) {
1337        case MLX5_IB_MMAP_WC_PAGE:
1338/* Some architectures don't support WC memory */
1339#if defined(CONFIG_X86)
1340                if (!pat_enabled())
1341                        return -EPERM;
1342#elif !(defined(CONFIG_PPC) || (defined(CONFIG_ARM) && defined(CONFIG_MMU)))
1343                        return -EPERM;
1344#endif
1345        /* fall through */
1346        case MLX5_IB_MMAP_REGULAR_PAGE:
 1347                /* For MLX5_IB_MMAP_REGULAR_PAGE make a best effort to get WC */
1348                prot = pgprot_writecombine(vma->vm_page_prot);
1349                break;
1350        case MLX5_IB_MMAP_NC_PAGE:
1351                prot = pgprot_noncached(vma->vm_page_prot);
1352                break;
1353        default:
1354                return -EINVAL;
1355        }
1356
1357        if (vma->vm_end - vma->vm_start != PAGE_SIZE)
1358                return -EINVAL;
1359
1360        idx = get_index(vma->vm_pgoff);
1361        if (idx >= uuari->num_uars)
1362                return -EINVAL;
1363
1364        pfn = uar_index2pfn(dev, uuari->uars[idx].index);
1365        mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn);
1366
1367        vma->vm_page_prot = prot;
1368        err = io_remap_pfn_range(vma, vma->vm_start, pfn,
1369                                 PAGE_SIZE, vma->vm_page_prot);
1370        if (err) {
1371                mlx5_ib_err(dev, "io_remap_pfn_range failed with error=%d, vm_start=0x%lx, pfn=%pa, mmap_cmd=%s\n",
1372                            err, vma->vm_start, &pfn, mmap_cmd2str(cmd));
1373                return -EAGAIN;
1374        }
1375
1376        pa = pfn << PAGE_SHIFT;
1377        mlx5_ib_dbg(dev, "mapped %s at 0x%lx, PA %pa\n", mmap_cmd2str(cmd),
1378                    vma->vm_start, &pa);
1379
1380        return mlx5_ib_set_vma_data(vma, context);
1381}
1382
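     /*
      * Dispatch user space mmap requests by the command encoded in the page
      * offset: UAR pages go through uar_mmap(), MLX5_IB_MMAP_CORE_CLOCK maps
      * the internal timer page read-only and non-cached.
      */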
1383static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
1384{
1385        struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
1386        struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
1387        unsigned long command;
1388        phys_addr_t pfn;
1389
1390        command = get_command(vma->vm_pgoff);
1391        switch (command) {
1392        case MLX5_IB_MMAP_WC_PAGE:
1393        case MLX5_IB_MMAP_NC_PAGE:
1394        case MLX5_IB_MMAP_REGULAR_PAGE:
1395                return uar_mmap(dev, command, vma, context);
1396
1397        case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES:
1398                return -ENOSYS;
1399
1400        case MLX5_IB_MMAP_CORE_CLOCK:
1401                if (vma->vm_end - vma->vm_start != PAGE_SIZE)
1402                        return -EINVAL;
1403
1404                if (vma->vm_flags & VM_WRITE)
1405                        return -EPERM;
1406
1407                /* Don't expose to user-space information it shouldn't have */
1408                if (PAGE_SIZE > 4096)
1409                        return -EOPNOTSUPP;
1410
1411                vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
1412                pfn = (dev->mdev->iseg_base +
1413                       offsetof(struct mlx5_init_seg, internal_timer_h)) >>
1414                        PAGE_SHIFT;
1415                if (io_remap_pfn_range(vma, vma->vm_start, pfn,
1416                                       PAGE_SIZE, vma->vm_page_prot))
1417                        return -EAGAIN;
1418
1419                mlx5_ib_dbg(dev, "mapped internal timer at 0x%lx, PA 0x%llx\n",
1420                            vma->vm_start,
1421                            (unsigned long long)pfn << PAGE_SHIFT);
1422                break;
1423
1424        default:
1425                return -EINVAL;
1426        }
1427
1428        return 0;
1429}
1430
1431static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev,
1432                                      struct ib_ucontext *context,
1433                                      struct ib_udata *udata)
1434{
1435        struct mlx5_ib_alloc_pd_resp resp;
1436        struct mlx5_ib_pd *pd;
1437        int err;
1438
1439        pd = kmalloc(sizeof(*pd), GFP_KERNEL);
1440        if (!pd)
1441                return ERR_PTR(-ENOMEM);
1442
1443        err = mlx5_core_alloc_pd(to_mdev(ibdev)->mdev, &pd->pdn);
1444        if (err) {
1445                kfree(pd);
1446                return ERR_PTR(err);
1447        }
1448
1449        if (context) {
1450                resp.pdn = pd->pdn;
1451                if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
1452                        mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn);
1453                        kfree(pd);
1454                        return ERR_PTR(-EFAULT);
1455                }
1456        }
1457
1458        return &pd->ibpd;
1459}
1460
1461static int mlx5_ib_dealloc_pd(struct ib_pd *pd)
1462{
1463        struct mlx5_ib_dev *mdev = to_mdev(pd->device);
1464        struct mlx5_ib_pd *mpd = to_mpd(pd);
1465
1466        mlx5_core_dealloc_pd(mdev->mdev, mpd->pdn);
1467        kfree(mpd);
1468
1469        return 0;
1470}
1471
1472enum {
1473        MATCH_CRITERIA_ENABLE_OUTER_BIT,
1474        MATCH_CRITERIA_ENABLE_MISC_BIT,
1475        MATCH_CRITERIA_ENABLE_INNER_BIT
1476};
1477
1478#define HEADER_IS_ZERO(match_criteria, headers)                            \
1479        !(memchr_inv(MLX5_ADDR_OF(fte_match_param, match_criteria, headers), \
1480                    0, MLX5_FLD_SZ_BYTES(fte_match_param, headers)))
1481
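/*
 * Build the match_criteria_enable bitmask for a flow table entry: each bit
 * tells the device which section of the match parameters (outer headers,
 * misc parameters, inner headers) actually carries non-zero criteria, as
 * detected by HEADER_IS_ZERO() above.
 */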
1482static u8 get_match_criteria_enable(u32 *match_criteria)
1483{
1484        u8 match_criteria_enable;
1485
1486        match_criteria_enable =
1487                (!HEADER_IS_ZERO(match_criteria, outer_headers)) <<
1488                MATCH_CRITERIA_ENABLE_OUTER_BIT;
1489        match_criteria_enable |=
1490                (!HEADER_IS_ZERO(match_criteria, misc_parameters)) <<
1491                MATCH_CRITERIA_ENABLE_MISC_BIT;
1492        match_criteria_enable |=
1493                (!HEADER_IS_ZERO(match_criteria, inner_headers)) <<
1494                MATCH_CRITERIA_ENABLE_INNER_BIT;
1495
1496        return match_criteria_enable;
1497}
1498
1499static void set_proto(void *outer_c, void *outer_v, u8 mask, u8 val)
1500{
1501        MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_protocol, mask);
1502        MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_protocol, val);
1503}
1504
1505static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val)
1506{
1507        MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_ecn, mask);
1508        MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_ecn, val);
1509        MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_dscp, mask >> 2);
1510        MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_dscp, val >> 2);
1511}
1512
1513#define LAST_ETH_FIELD vlan_tag
1514#define LAST_IB_FIELD sl
1515#define LAST_IPV4_FIELD tos
1516#define LAST_IPV6_FIELD traffic_class
1517#define LAST_TCP_UDP_FIELD src_port
1518
1519/* Field is the last supported field */
1520#define FIELDS_NOT_SUPPORTED(filter, field)\
1521        memchr_inv((void *)&filter.field  +\
1522                   sizeof(filter.field), 0,\
1523                   sizeof(filter) -\
1524                   offsetof(typeof(filter), field) -\
1525                   sizeof(filter.field))
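/*
 * For example, FIELDS_NOT_SUPPORTED(ib_spec->eth.mask, LAST_ETH_FIELD)
 * scans every byte of ib_spec->eth.mask located after the vlan_tag member
 * and evaluates to non-NULL (i.e. "unsupported fields present") if any of
 * those bytes is set.
 */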
1526
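/*
 * Translate a single ib_flow_spec into the device's fte_match_param layout:
 * the user-provided masks go into match_c (criteria) and the values into
 * match_v.  Specs with mask bits set beyond the fields supported here are
 * rejected with -ENOTSUPP.
 */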
1527static int parse_flow_attr(u32 *match_c, u32 *match_v,
1528                           const union ib_flow_spec *ib_spec)
1529{
1530        void *outer_headers_c = MLX5_ADDR_OF(fte_match_param, match_c,
1531                                             outer_headers);
1532        void *outer_headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
1533                                             outer_headers);
1534        void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c,
1535                                           misc_parameters);
1536        void *misc_params_v = MLX5_ADDR_OF(fte_match_param, match_v,
1537                                           misc_parameters);
1538
1539        switch (ib_spec->type) {
1540        case IB_FLOW_SPEC_ETH:
1541                if (FIELDS_NOT_SUPPORTED(ib_spec->eth.mask, LAST_ETH_FIELD))
1542                        return -ENOTSUPP;
1543
1544                ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
1545                                             dmac_47_16),
1546                                ib_spec->eth.mask.dst_mac);
1547                ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v,
1548                                             dmac_47_16),
1549                                ib_spec->eth.val.dst_mac);
1550
1551                ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
1552                                             smac_47_16),
1553                                ib_spec->eth.mask.src_mac);
1554                ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v,
1555                                             smac_47_16),
1556                                ib_spec->eth.val.src_mac);
1557
1558                if (ib_spec->eth.mask.vlan_tag) {
1559                        MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
1560                                 vlan_tag, 1);
1561                        MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
1562                                 vlan_tag, 1);
1563
1564                        MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
1565                                 first_vid, ntohs(ib_spec->eth.mask.vlan_tag));
1566                        MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
1567                                 first_vid, ntohs(ib_spec->eth.val.vlan_tag));
1568
1569                        MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
1570                                 first_cfi,
1571                                 ntohs(ib_spec->eth.mask.vlan_tag) >> 12);
1572                        MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
1573                                 first_cfi,
1574                                 ntohs(ib_spec->eth.val.vlan_tag) >> 12);
1575
1576                        MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
1577                                 first_prio,
1578                                 ntohs(ib_spec->eth.mask.vlan_tag) >> 13);
1579                        MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
1580                                 first_prio,
1581                                 ntohs(ib_spec->eth.val.vlan_tag) >> 13);
1582                }
1583                MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
1584                         ethertype, ntohs(ib_spec->eth.mask.ether_type));
1585                MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
1586                         ethertype, ntohs(ib_spec->eth.val.ether_type));
1587                break;
1588        case IB_FLOW_SPEC_IPV4:
1589                if (FIELDS_NOT_SUPPORTED(ib_spec->ipv4.mask, LAST_IPV4_FIELD))
1590                        return -ENOTSUPP;
1591
1592                MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
1593                         ethertype, 0xffff);
1594                MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
1595                         ethertype, ETH_P_IP);
1596
1597                memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
1598                                    src_ipv4_src_ipv6.ipv4_layout.ipv4),
1599                       &ib_spec->ipv4.mask.src_ip,
1600                       sizeof(ib_spec->ipv4.mask.src_ip));
1601                memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v,
1602                                    src_ipv4_src_ipv6.ipv4_layout.ipv4),
1603                       &ib_spec->ipv4.val.src_ip,
1604                       sizeof(ib_spec->ipv4.val.src_ip));
1605                memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
1606                                    dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
1607                       &ib_spec->ipv4.mask.dst_ip,
1608                       sizeof(ib_spec->ipv4.mask.dst_ip));
1609                memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v,
1610                                    dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
1611                       &ib_spec->ipv4.val.dst_ip,
1612                       sizeof(ib_spec->ipv4.val.dst_ip));
1613
1614                set_tos(outer_headers_c, outer_headers_v,
1615                        ib_spec->ipv4.mask.tos, ib_spec->ipv4.val.tos);
1616
1617                set_proto(outer_headers_c, outer_headers_v,
1618                          ib_spec->ipv4.mask.proto, ib_spec->ipv4.val.proto);
1619                break;
1620        case IB_FLOW_SPEC_IPV6:
1621                if (FIELDS_NOT_SUPPORTED(ib_spec->ipv6.mask, LAST_IPV6_FIELD))
1622                        return -ENOTSUPP;
1623
1624                MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
1625                         ethertype, 0xffff);
1626                MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
1627                         ethertype, ETH_P_IPV6);
1628
1629                memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
1630                                    src_ipv4_src_ipv6.ipv6_layout.ipv6),
1631                       &ib_spec->ipv6.mask.src_ip,
1632                       sizeof(ib_spec->ipv6.mask.src_ip));
1633                memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v,
1634                                    src_ipv4_src_ipv6.ipv6_layout.ipv6),
1635                       &ib_spec->ipv6.val.src_ip,
1636                       sizeof(ib_spec->ipv6.val.src_ip));
1637                memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
1638                                    dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
1639                       &ib_spec->ipv6.mask.dst_ip,
1640                       sizeof(ib_spec->ipv6.mask.dst_ip));
1641                memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v,
1642                                    dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
1643                       &ib_spec->ipv6.val.dst_ip,
1644                       sizeof(ib_spec->ipv6.val.dst_ip));
1645
1646                set_tos(outer_headers_c, outer_headers_v,
1647                        ib_spec->ipv6.mask.traffic_class,
1648                        ib_spec->ipv6.val.traffic_class);
1649
1650                set_proto(outer_headers_c, outer_headers_v,
1651                          ib_spec->ipv6.mask.next_hdr,
1652                          ib_spec->ipv6.val.next_hdr);
1653
1654                MLX5_SET(fte_match_set_misc, misc_params_c,
1655                         outer_ipv6_flow_label,
1656                         ntohl(ib_spec->ipv6.mask.flow_label));
1657                MLX5_SET(fte_match_set_misc, misc_params_v,
1658                         outer_ipv6_flow_label,
1659                         ntohl(ib_spec->ipv6.val.flow_label));
1660                break;
1661        case IB_FLOW_SPEC_TCP:
1662                if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask,
1663                                         LAST_TCP_UDP_FIELD))
1664                        return -ENOTSUPP;
1665
1666                MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ip_protocol,
1667                         0xff);
1668                MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ip_protocol,
1669                         IPPROTO_TCP);
1670
1671                MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, tcp_sport,
1672                         ntohs(ib_spec->tcp_udp.mask.src_port));
1673                MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, tcp_sport,
1674                         ntohs(ib_spec->tcp_udp.val.src_port));
1675
1676                MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, tcp_dport,
1677                         ntohs(ib_spec->tcp_udp.mask.dst_port));
1678                MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, tcp_dport,
1679                         ntohs(ib_spec->tcp_udp.val.dst_port));
1680                break;
1681        case IB_FLOW_SPEC_UDP:
1682                if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask,
1683                                         LAST_TCP_UDP_FIELD))
1684                        return -ENOTSUPP;
1685
1686                MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ip_protocol,
1687                         0xff);
1688                MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ip_protocol,
1689                         IPPROTO_UDP);
1690
1691                MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, udp_sport,
1692                         ntohs(ib_spec->tcp_udp.mask.src_port));
1693                MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, udp_sport,
1694                         ntohs(ib_spec->tcp_udp.val.src_port));
1695
1696                MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, udp_dport,
1697                         ntohs(ib_spec->tcp_udp.mask.dst_port));
1698                MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, udp_dport,
1699                         ntohs(ib_spec->tcp_udp.val.dst_port));
1700                break;
1701        default:
1702                return -EINVAL;
1703        }
1704
1705        return 0;
1706}
1707
1708/* If a flow could catch both multicast and unicast packets,
1709 * it won't fall into the multicast flow steering table and this rule
1710 * could steal other multicast packets.
1711 */
1712static bool flow_is_multicast_only(struct ib_flow_attr *ib_attr)
1713{
1714        struct ib_flow_spec_eth *eth_spec;
1715
1716        if (ib_attr->type != IB_FLOW_ATTR_NORMAL ||
1717            ib_attr->size < sizeof(struct ib_flow_attr) +
1718            sizeof(struct ib_flow_spec_eth) ||
1719            ib_attr->num_of_specs < 1)
1720                return false;
1721
1722        eth_spec = (struct ib_flow_spec_eth *)(ib_attr + 1);
1723        if (eth_spec->type != IB_FLOW_SPEC_ETH ||
1724            eth_spec->size != sizeof(*eth_spec))
1725                return false;
1726
1727        return is_multicast_ether_addr(eth_spec->mask.dst_mac) &&
1728               is_multicast_ether_addr(eth_spec->val.dst_mac);
1729}
1730
1731static bool is_valid_attr(const struct ib_flow_attr *flow_attr)
1732{
1733        union ib_flow_spec *ib_spec = (union ib_flow_spec *)(flow_attr + 1);
1734        bool has_ipv4_spec = false;
1735        bool eth_type_ipv4 = true;
1736        unsigned int spec_index;
1737
1738        /* Validate that ethertype is correct */
1739        for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
1740                if (ib_spec->type == IB_FLOW_SPEC_ETH &&
1741                    ib_spec->eth.mask.ether_type) {
1742                        if (!((ib_spec->eth.mask.ether_type == htons(0xffff)) &&
1743                              ib_spec->eth.val.ether_type == htons(ETH_P_IP)))
1744                                eth_type_ipv4 = false;
1745                } else if (ib_spec->type == IB_FLOW_SPEC_IPV4) {
1746                        has_ipv4_spec = true;
1747                }
1748                ib_spec = (void *)ib_spec + ib_spec->size;
1749        }
1750        return !has_ipv4_spec || eth_type_ipv4;
1751}
1752
1753static void put_flow_table(struct mlx5_ib_dev *dev,
1754                           struct mlx5_ib_flow_prio *prio, bool ft_added)
1755{
1756        prio->refcount -= !!ft_added;
1757        if (!prio->refcount) {
1758                mlx5_destroy_flow_table(prio->flow_table);
1759                prio->flow_table = NULL;
1760        }
1761}
1762
1763static int mlx5_ib_destroy_flow(struct ib_flow *flow_id)
1764{
1765        struct mlx5_ib_dev *dev = to_mdev(flow_id->qp->device);
1766        struct mlx5_ib_flow_handler *handler = container_of(flow_id,
1767                                                          struct mlx5_ib_flow_handler,
1768                                                          ibflow);
1769        struct mlx5_ib_flow_handler *iter, *tmp;
1770
1771        mutex_lock(&dev->flow_db.lock);
1772
1773        list_for_each_entry_safe(iter, tmp, &handler->list, list) {
1774                mlx5_del_flow_rule(iter->rule);
1775                put_flow_table(dev, iter->prio, true);
1776                list_del(&iter->list);
1777                kfree(iter);
1778        }
1779
1780        mlx5_del_flow_rule(handler->rule);
1781        put_flow_table(dev, handler->prio, true);
1782        mutex_unlock(&dev->flow_db.lock);
1783
1784        kfree(handler);
1785
1786        return 0;
1787}
1788
1789static int ib_prio_to_core_prio(unsigned int priority, bool dont_trap)
1790{
1791        priority *= 2;
1792        if (!dont_trap)
1793                priority++;
1794        return priority;
1795}
1796
1797enum flow_table_type {
1798        MLX5_IB_FT_RX,
1799        MLX5_IB_FT_TX
1800};
1801
1802#define MLX5_FS_MAX_TYPES        10
1803#define MLX5_FS_MAX_ENTRIES      32000UL
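/*
 * Pick (and lazily create) the flow table a rule of this type belongs to:
 * NORMAL rules go to the bypass namespace at a priority derived from the
 * attribute, default/multicast-default rules go to the leftovers namespace,
 * and sniffer rules go to the RX/TX sniffer namespaces.  Tables are created
 * on first use and reference counted through prio->refcount.
 */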
1804static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
1805                                                struct ib_flow_attr *flow_attr,
1806                                                enum flow_table_type ft_type)
1807{
1808        bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP;
1809        struct mlx5_flow_namespace *ns = NULL;
1810        struct mlx5_ib_flow_prio *prio;
1811        struct mlx5_flow_table *ft;
1812        int num_entries;
1813        int num_groups;
1814        int priority;
1815        int err = 0;
1816
1817        if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
1818                if (flow_is_multicast_only(flow_attr) &&
1819                    !dont_trap)
1820                        priority = MLX5_IB_FLOW_MCAST_PRIO;
1821                else
1822                        priority = ib_prio_to_core_prio(flow_attr->priority,
1823                                                        dont_trap);
1824                ns = mlx5_get_flow_namespace(dev->mdev,
1825                                             MLX5_FLOW_NAMESPACE_BYPASS);
1826                num_entries = MLX5_FS_MAX_ENTRIES;
1827                num_groups = MLX5_FS_MAX_TYPES;
1828                prio = &dev->flow_db.prios[priority];
1829        } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
1830                   flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) {
1831                ns = mlx5_get_flow_namespace(dev->mdev,
1832                                             MLX5_FLOW_NAMESPACE_LEFTOVERS);
1833                build_leftovers_ft_param(&priority,
1834                                         &num_entries,
1835                                         &num_groups);
1836                prio = &dev->flow_db.prios[MLX5_IB_FLOW_LEFTOVERS_PRIO];
1837        } else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
1838                if (!MLX5_CAP_FLOWTABLE(dev->mdev,
1839                                        allow_sniffer_and_nic_rx_shared_tir))
1840                        return ERR_PTR(-ENOTSUPP);
1841
1842                ns = mlx5_get_flow_namespace(dev->mdev, ft_type == MLX5_IB_FT_RX ?
1843                                             MLX5_FLOW_NAMESPACE_SNIFFER_RX :
1844                                             MLX5_FLOW_NAMESPACE_SNIFFER_TX);
1845
1846                prio = &dev->flow_db.sniffer[ft_type];
1847                priority = 0;
1848                num_entries = 1;
1849                num_groups = 1;
1850        }
1851
1852        if (!ns)
1853                return ERR_PTR(-ENOTSUPP);
1854
1855        ft = prio->flow_table;
1856        if (!ft) {
1857                ft = mlx5_create_auto_grouped_flow_table(ns, priority,
1858                                                         num_entries,
1859                                                         num_groups,
1860                                                         0);
1861
1862                if (!IS_ERR(ft)) {
1863                        prio->refcount = 0;
1864                        prio->flow_table = ft;
1865                } else {
1866                        err = PTR_ERR(ft);
1867                }
1868        }
1869
1870        return err ? ERR_PTR(err) : prio;
1871}
1872
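/*
 * Build the match criteria/value from all specs of the flow attribute and
 * install a single rule in ft_prio's table.  With a NULL destination the
 * rule only forwards to the next priority (used for don't-trap rules);
 * otherwise it forwards to *dst.
 */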
1873static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev,
1874                                                     struct mlx5_ib_flow_prio *ft_prio,
1875                                                     const struct ib_flow_attr *flow_attr,
1876                                                     struct mlx5_flow_destination *dst)
1877{
1878        struct mlx5_flow_table  *ft = ft_prio->flow_table;
1879        struct mlx5_ib_flow_handler *handler;
1880        struct mlx5_flow_spec *spec;
1881        const void *ib_flow = (const void *)flow_attr + sizeof(*flow_attr);
1882        unsigned int spec_index;
1883        u32 action;
1884        int err = 0;
1885
1886        if (!is_valid_attr(flow_attr))
1887                return ERR_PTR(-EINVAL);
1888
1889        spec = mlx5_vzalloc(sizeof(*spec));
1890        handler = kzalloc(sizeof(*handler), GFP_KERNEL);
1891        if (!handler || !spec) {
1892                err = -ENOMEM;
1893                goto free;
1894        }
1895
1896        INIT_LIST_HEAD(&handler->list);
1897
1898        for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
1899                err = parse_flow_attr(spec->match_criteria,
1900                                      spec->match_value, ib_flow);
1901                if (err < 0)
1902                        goto free;
1903
1904                ib_flow += ((union ib_flow_spec *)ib_flow)->size;
1905        }
1906
1907        spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria);
1908        action = dst ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST :
1909                MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
1910        handler->rule = mlx5_add_flow_rule(ft, spec,
1911                                           action,
1912                                           MLX5_FS_DEFAULT_FLOW_TAG,
1913                                           dst);
1914
1915        if (IS_ERR(handler->rule)) {
1916                err = PTR_ERR(handler->rule);
1917                goto free;
1918        }
1919
1920        ft_prio->refcount++;
1921        handler->prio = ft_prio;
1922
1923        ft_prio->flow_table = ft;
1924free:
1925        if (err)
1926                kfree(handler);
1927        kvfree(spec);
1928        return err ? ERR_PTR(err) : handler;
1929}
1930
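/*
 * A don't-trap flow is implemented with two rules linked on handler->list:
 * one created with a NULL destination (FWD_NEXT_PRIO, so matching packets
 * continue on to the next priority rather than being trapped) and one
 * forwarding to the destination TIR.
 */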
1931static struct mlx5_ib_flow_handler *create_dont_trap_rule(struct mlx5_ib_dev *dev,
1932                                                          struct mlx5_ib_flow_prio *ft_prio,
1933                                                          struct ib_flow_attr *flow_attr,
1934                                                          struct mlx5_flow_destination *dst)
1935{
1936        struct mlx5_ib_flow_handler *handler_dst = NULL;
1937        struct mlx5_ib_flow_handler *handler = NULL;
1938
1939        handler = create_flow_rule(dev, ft_prio, flow_attr, NULL);
1940        if (!IS_ERR(handler)) {
1941                handler_dst = create_flow_rule(dev, ft_prio,
1942                                               flow_attr, dst);
1943                if (IS_ERR(handler_dst)) {
1944                        mlx5_del_flow_rule(handler->rule);
1945                        ft_prio->refcount--;
1946                        kfree(handler);
1947                        handler = handler_dst;
1948                } else {
1949                        list_add(&handler_dst->list, &handler->list);
1950                }
1951        }
1952
1953        return handler;
1954}
1955enum {
1956        LEFTOVERS_MC,
1957        LEFTOVERS_UC,
1958};
1959
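/*
 * Leftovers rules catch packets not claimed by any other steering rule.
 * The static leftovers_specs[] below describes two ETH specs keyed on the
 * destination MAC multicast bit; the unicast rule is only added for
 * IB_FLOW_ATTR_ALL_DEFAULT flows.
 */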
1960static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *dev,
1961                                                          struct mlx5_ib_flow_prio *ft_prio,
1962                                                          struct ib_flow_attr *flow_attr,
1963                                                          struct mlx5_flow_destination *dst)
1964{
1965        struct mlx5_ib_flow_handler *handler_ucast = NULL;
1966        struct mlx5_ib_flow_handler *handler = NULL;
1967
1968        static struct {
1969                struct ib_flow_attr     flow_attr;
1970                struct ib_flow_spec_eth eth_flow;
1971        } leftovers_specs[] = {
1972                [LEFTOVERS_MC] = {
1973                        .flow_attr = {
1974                                .num_of_specs = 1,
1975                                .size = sizeof(leftovers_specs[0])
1976                        },
1977                        .eth_flow = {
1978                                .type = IB_FLOW_SPEC_ETH,
1979                                .size = sizeof(struct ib_flow_spec_eth),
1980                                .mask = {.dst_mac = {0x1} },
1981                                .val =  {.dst_mac = {0x1} }
1982                        }
1983                },
1984                [LEFTOVERS_UC] = {
1985                        .flow_attr = {
1986                                .num_of_specs = 1,
1987                                .size = sizeof(leftovers_specs[0])
1988                        },
1989                        .eth_flow = {
1990                                .type = IB_FLOW_SPEC_ETH,
1991                                .size = sizeof(struct ib_flow_spec_eth),
1992                                .mask = {.dst_mac = {0x1} },
1993                                .val = {.dst_mac = {} }
1994                        }
1995                }
1996        };
1997
1998        handler = create_flow_rule(dev, ft_prio,
1999                                   &leftovers_specs[LEFTOVERS_MC].flow_attr,
2000                                   dst);
2001        if (!IS_ERR(handler) &&
2002            flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) {
2003                handler_ucast = create_flow_rule(dev, ft_prio,
2004                                                 &leftovers_specs[LEFTOVERS_UC].flow_attr,
2005                                                 dst);
2006                if (IS_ERR(handler_ucast)) {
2007                        mlx5_del_flow_rule(handler->rule);
2008                        ft_prio->refcount--;
2009                        kfree(handler);
2010                        handler = handler_ucast;
2011                } else {
2012                        list_add(&handler_ucast->list, &handler->list);
2013                }
2014        }
2015
2016        return handler;
2017}
2018
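/*
 * Sniffer flows install an empty (match-all) rule on both the RX and the TX
 * sniffer flow tables, each forwarding to the same destination TIR.  The TX
 * handler is linked onto the RX handler's list so both are torn down by
 * mlx5_ib_destroy_flow().
 */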
2019static struct mlx5_ib_flow_handler *create_sniffer_rule(struct mlx5_ib_dev *dev,
2020                                                        struct mlx5_ib_flow_prio *ft_rx,
2021                                                        struct mlx5_ib_flow_prio *ft_tx,
2022                                                        struct mlx5_flow_destination *dst)
2023{
2024        struct mlx5_ib_flow_handler *handler_rx;
2025        struct mlx5_ib_flow_handler *handler_tx;
2026        int err;
2027        static const struct ib_flow_attr flow_attr  = {
2028                .num_of_specs = 0,
2029                .size = sizeof(flow_attr)
2030        };
2031
2032        handler_rx = create_flow_rule(dev, ft_rx, &flow_attr, dst);
2033        if (IS_ERR(handler_rx)) {
2034                err = PTR_ERR(handler_rx);
2035                goto err;
2036        }
2037
2038        handler_tx = create_flow_rule(dev, ft_tx, &flow_attr, dst);
2039        if (IS_ERR(handler_tx)) {
2040                err = PTR_ERR(handler_tx);
2041                goto err_tx;
2042        }
2043
2044        list_add(&handler_tx->list, &handler_rx->list);
2045
2046        return handler_rx;
2047
2048err_tx:
2049        mlx5_del_flow_rule(handler_rx->rule);
2050        ft_rx->refcount--;
2051        kfree(handler_rx);
2052err:
2053        return ERR_PTR(err);
2054}
2055
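/*
 * ib_create_flow() handler.  Validates the attribute, resolves the flow
 * table(s) and the destination TIR (the RSS TIR for RSS QPs, otherwise the
 * raw packet QP's RQ TIR), and dispatches to the rule-creation helper that
 * matches the attribute type.
 */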
2056static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp,
2057                                           struct ib_flow_attr *flow_attr,
2058                                           int domain)
2059{
2060        struct mlx5_ib_dev *dev = to_mdev(qp->device);
2061        struct mlx5_ib_qp *mqp = to_mqp(qp);
2062        struct mlx5_ib_flow_handler *handler = NULL;
2063        struct mlx5_flow_destination *dst = NULL;
2064        struct mlx5_ib_flow_prio *ft_prio_tx = NULL;
2065        struct mlx5_ib_flow_prio *ft_prio;
2066        int err;
2067
2068        if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO)
2069                return ERR_PTR(-ENOSPC);
2070
2071        if (domain != IB_FLOW_DOMAIN_USER ||
2072            flow_attr->port > MLX5_CAP_GEN(dev->mdev, num_ports) ||
2073            (flow_attr->flags & ~IB_FLOW_ATTR_FLAGS_DONT_TRAP))
2074                return ERR_PTR(-EINVAL);
2075
2076        dst = kzalloc(sizeof(*dst), GFP_KERNEL);
2077        if (!dst)
2078                return ERR_PTR(-ENOMEM);
2079
2080        mutex_lock(&dev->flow_db.lock);
2081
2082        ft_prio = get_flow_table(dev, flow_attr, MLX5_IB_FT_RX);
2083        if (IS_ERR(ft_prio)) {
2084                err = PTR_ERR(ft_prio);
2085                goto unlock;
2086        }
2087        if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
2088                ft_prio_tx = get_flow_table(dev, flow_attr, MLX5_IB_FT_TX);
2089                if (IS_ERR(ft_prio_tx)) {
2090                        err = PTR_ERR(ft_prio_tx);
2091                        ft_prio_tx = NULL;
2092                        goto destroy_ft;
2093                }
2094        }
2095
2096        dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR;
2097        if (mqp->flags & MLX5_IB_QP_RSS)
2098                dst->tir_num = mqp->rss_qp.tirn;
2099        else
2100                dst->tir_num = mqp->raw_packet_qp.rq.tirn;
2101
2102        if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
2103                if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP)  {
2104                        handler = create_dont_trap_rule(dev, ft_prio,
2105                                                        flow_attr, dst);
2106                } else {
2107                        handler = create_flow_rule(dev, ft_prio, flow_attr,
2108                                                   dst);
2109                }
2110        } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
2111                   flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) {
2112                handler = create_leftovers_rule(dev, ft_prio, flow_attr,
2113                                                dst);
2114        } else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
2115                handler = create_sniffer_rule(dev, ft_prio, ft_prio_tx, dst);
2116        } else {
2117                err = -EINVAL;
2118                goto destroy_ft;
2119        }
2120
2121        if (IS_ERR(handler)) {
2122                err = PTR_ERR(handler);
2123                handler = NULL;
2124                goto destroy_ft;
2125        }
2126
2127        mutex_unlock(&dev->flow_db.lock);
2128        kfree(dst);
2129
2130        return &handler->ibflow;
2131
2132destroy_ft:
2133        put_flow_table(dev, ft_prio, false);
2134        if (ft_prio_tx)
2135                put_flow_table(dev, ft_prio_tx, false);
2136unlock:
2137        mutex_unlock(&dev->flow_db.lock);
2138        kfree(dst);
2139        kfree(handler);
2140        return ERR_PTR(err);
2141}
2142
2143static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
2144{
2145        struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
2146        int err;
2147
2148        err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num);
2149        if (err)
2150                mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n",
2151                             ibqp->qp_num, gid->raw);
2152
2153        return err;
2154}
2155
2156static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
2157{
2158        struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
2159        int err;
2160
2161        err = mlx5_core_detach_mcg(dev->mdev, gid, ibqp->qp_num);
2162        if (err)
2163                mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n",
2164                             ibqp->qp_num, gid->raw);
2165
2166        return err;
2167}
2168
2169static int init_node_data(struct mlx5_ib_dev *dev)
2170{
2171        int err;
2172
2173        err = mlx5_query_node_desc(dev, dev->ib_dev.node_desc);
2174        if (err)
2175                return err;
2176
2177        dev->mdev->rev_id = dev->mdev->pdev->revision;
2178
2179        return mlx5_query_node_guid(dev, &dev->ib_dev.node_guid);
2180}
2181
2182static ssize_t show_fw_pages(struct device *device, struct device_attribute *attr,
2183                             char *buf)
2184{
2185        struct mlx5_ib_dev *dev =
2186                container_of(device, struct mlx5_ib_dev, ib_dev.dev);
2187
2188        return sprintf(buf, "%d\n", dev->mdev->priv.fw_pages);
2189}
2190
2191static ssize_t show_reg_pages(struct device *device,
2192                              struct device_attribute *attr, char *buf)
2193{
2194        struct mlx5_ib_dev *dev =
2195                container_of(device, struct mlx5_ib_dev, ib_dev.dev);
2196
2197        return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages));
2198}
2199
2200static ssize_t show_hca(struct device *device, struct device_attribute *attr,
2201                        char *buf)
2202{
2203        struct mlx5_ib_dev *dev =
2204                container_of(device, struct mlx5_ib_dev, ib_dev.dev);
2205        return sprintf(buf, "MT%d\n", dev->mdev->pdev->device);
2206}
2207
2208static ssize_t show_rev(struct device *device, struct device_attribute *attr,
2209                        char *buf)
2210{
2211        struct mlx5_ib_dev *dev =
2212                container_of(device, struct mlx5_ib_dev, ib_dev.dev);
2213        return sprintf(buf, "%x\n", dev->mdev->rev_id);
2214}
2215
2216static ssize_t show_board(struct device *device, struct device_attribute *attr,
2217                          char *buf)
2218{
2219        struct mlx5_ib_dev *dev =
2220                container_of(device, struct mlx5_ib_dev, ib_dev.dev);
2221        return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN,
2222                       dev->mdev->board_id);
2223}
2224
2225static DEVICE_ATTR(hw_rev,   S_IRUGO, show_rev,    NULL);
2226static DEVICE_ATTR(hca_type, S_IRUGO, show_hca,    NULL);
2227static DEVICE_ATTR(board_id, S_IRUGO, show_board,  NULL);
2228static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL);
2229static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL);
2230
2231static struct device_attribute *mlx5_class_attributes[] = {
2232        &dev_attr_hw_rev,
2233        &dev_attr_hca_type,
2234        &dev_attr_board_id,
2235        &dev_attr_fw_pages,
2236        &dev_attr_reg_pages,
2237};
2238
2239static void pkey_change_handler(struct work_struct *work)
2240{
2241        struct mlx5_ib_port_resources *ports =
2242                container_of(work, struct mlx5_ib_port_resources,
2243                             pkey_change_work);
2244
2245        mutex_lock(&ports->devr->mutex);
2246        mlx5_ib_gsi_pkey_change(ports->gsi);
2247        mutex_unlock(&ports->devr->mutex);
2248}
2249
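/*
 * On a fatal device error, walk every QP on the device and, for each send
 * or receive queue with outstanding work, add its CQ (once) to
 * cq_armed_list; then call each collected CQ's completion handler as if a
 * completion event had fired, so consumers wake up and poll.
 */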
2250static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev)
2251{
2252        struct mlx5_ib_qp *mqp;
2253        struct mlx5_ib_cq *send_mcq, *recv_mcq;
2254        struct mlx5_core_cq *mcq;
2255        struct list_head cq_armed_list;
2256        unsigned long flags_qp;
2257        unsigned long flags_cq;
2258        unsigned long flags;
2259
2260        INIT_LIST_HEAD(&cq_armed_list);
2261
2262        /* Go over the QP list residing on this ibdev; sync with QP create/destroy. */
2263        spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags);
2264        list_for_each_entry(mqp, &ibdev->qp_list, qps_list) {
2265                spin_lock_irqsave(&mqp->sq.lock, flags_qp);
2266                if (mqp->sq.tail != mqp->sq.head) {
2267                        send_mcq = to_mcq(mqp->ibqp.send_cq);
2268                        spin_lock_irqsave(&send_mcq->lock, flags_cq);
2269                        if (send_mcq->mcq.comp &&
2270                            mqp->ibqp.send_cq->comp_handler) {
2271                                if (!send_mcq->mcq.reset_notify_added) {
2272                                        send_mcq->mcq.reset_notify_added = 1;
2273                                        list_add_tail(&send_mcq->mcq.reset_notify,
2274                                                      &cq_armed_list);
2275                                }
2276                        }
2277                        spin_unlock_irqrestore(&send_mcq->lock, flags_cq);
2278                }
2279                spin_unlock_irqrestore(&mqp->sq.lock, flags_qp);
2280                spin_lock_irqsave(&mqp->rq.lock, flags_qp);
2281                /* no handling is needed for SRQ */
2282                if (!mqp->ibqp.srq) {
2283                        if (mqp->rq.tail != mqp->rq.head) {
2284                                recv_mcq = to_mcq(mqp->ibqp.recv_cq);
2285                                spin_lock_irqsave(&recv_mcq->lock, flags_cq);
2286                                if (recv_mcq->mcq.comp &&
2287                                    mqp->ibqp.recv_cq->comp_handler) {
2288                                        if (!recv_mcq->mcq.reset_notify_added) {
2289                                                recv_mcq->mcq.reset_notify_added = 1;
2290                                                list_add_tail(&recv_mcq->mcq.reset_notify,
2291                                                              &cq_armed_list);
2292                                        }
2293                                }
2294                                spin_unlock_irqrestore(&recv_mcq->lock,
2295                                                       flags_cq);
2296                        }
2297                }
2298                spin_unlock_irqrestore(&mqp->rq.lock, flags_qp);
2299        }
2300        /* At this point all in-flight post sends have been observed (we took
2301         * and released the locks above); now call each armed CQ's comp handler.
2302         */
2303        list_for_each_entry(mcq, &cq_armed_list, reset_notify) {
2304                mcq->comp(mcq);
2305        }
2306        spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags);
2307}
2308
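/*
 * mlx5_core event callback: translate core device events into IB events and
 * dispatch them.  Port state events on Ethernet (RoCE) ports are ignored
 * here since they are reported through the netdev notifier instead.
 */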
2309static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
2310                          enum mlx5_dev_event event, unsigned long param)
2311{
2312        struct mlx5_ib_dev *ibdev = (struct mlx5_ib_dev *)context;
2313        struct ib_event ibev;
2314        bool fatal = false;
2315        u8 port = 0;
2316
2317        switch (event) {
2318        case MLX5_DEV_EVENT_SYS_ERROR:
2319                ibev.event = IB_EVENT_DEVICE_FATAL;
2320                mlx5_ib_handle_internal_error(ibdev);
2321                fatal = true;
2322                break;
2323
2324        case MLX5_DEV_EVENT_PORT_UP:
2325        case MLX5_DEV_EVENT_PORT_DOWN:
2326        case MLX5_DEV_EVENT_PORT_INITIALIZED:
2327                port = (u8)param;
2328
2329                /* In RoCE, port up/down events are handled in
2330                 * mlx5_netdev_event().
2331                 */
2332                if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) ==
2333                        IB_LINK_LAYER_ETHERNET)
2334                        return;
2335
2336                ibev.event = (event == MLX5_DEV_EVENT_PORT_UP) ?
2337                             IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
2338                break;
2339
2340        case MLX5_DEV_EVENT_LID_CHANGE:
2341                ibev.event = IB_EVENT_LID_CHANGE;
2342                port = (u8)param;
2343                break;
2344
2345        case MLX5_DEV_EVENT_PKEY_CHANGE:
2346                ibev.event = IB_EVENT_PKEY_CHANGE;
2347                port = (u8)param;
2348
2349                schedule_work(&ibdev->devr.ports[port - 1].pkey_change_work);
2350                break;
2351
2352        case MLX5_DEV_EVENT_GUID_CHANGE:
2353                ibev.event = IB_EVENT_GID_CHANGE;
2354                port = (u8)param;
2355                break;
2356
2357        case MLX5_DEV_EVENT_CLIENT_REREG:
2358                ibev.event = IB_EVENT_CLIENT_REREGISTER;
2359                port = (u8)param;
2360                break;
2361        }
2362
2363        ibev.device           = &ibdev->ib_dev;
2364        ibev.element.port_num = port;
2365
2366        if (port < 1 || port > ibdev->num_ports) {
2367                mlx5_ib_warn(ibdev, "warning: event on port %d\n", port);
2368                return;
2369        }
2370
2371        if (ibdev->ib_active)
2372                ib_dispatch_event(&ibev);
2373
2374        if (fatal)
2375                ibdev->ib_active = false;
2376}
2377
2378static void get_ext_port_caps(struct mlx5_ib_dev *dev)
2379{
2380        int port;
2381
2382        for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++)
2383                mlx5_query_ext_port_caps(dev, port);
2384}
2385
2386static int get_port_caps(struct mlx5_ib_dev *dev)
2387{
2388        struct ib_device_attr *dprops = NULL;
2389        struct ib_port_attr *pprops = NULL;
2390        int err = -ENOMEM;
2391        int port;
2392        struct ib_udata uhw = {.inlen = 0, .outlen = 0};
2393
2394        pprops = kmalloc(sizeof(*pprops), GFP_KERNEL);
2395        if (!pprops)
2396                goto out;
2397
2398        dprops = kmalloc(sizeof(*dprops), GFP_KERNEL);
2399        if (!dprops)
2400                goto out;
2401
2402        err = mlx5_ib_query_device(&dev->ib_dev, dprops, &uhw);
2403        if (err) {
2404                mlx5_ib_warn(dev, "query_device failed %d\n", err);
2405                goto out;
2406        }
2407
2408        for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) {
2409                err = mlx5_ib_query_port(&dev->ib_dev, port, pprops);
2410                if (err) {
2411                        mlx5_ib_warn(dev, "query_port %d failed %d\n",
2412                                     port, err);
2413                        break;
2414                }
2415                dev->mdev->port_caps[port - 1].pkey_table_len =
2416                                                dprops->max_pkeys;
2417                dev->mdev->port_caps[port - 1].gid_table_len =
2418                                                pprops->gid_tbl_len;
2419                mlx5_ib_dbg(dev, "pkey_table_len %d, gid_table_len %d\n",
2420                            dprops->max_pkeys, pprops->gid_tbl_len);
2421        }
2422
2423out:
2424        kfree(pprops);
2425        kfree(dprops);
2426
2427        return err;
2428}
2429
2430static void destroy_umrc_res(struct mlx5_ib_dev *dev)
2431{
2432        int err;
2433
2434        err = mlx5_mr_cache_cleanup(dev);
2435        if (err)
2436                mlx5_ib_warn(dev, "mr cache cleanup failed\n");
2437
2438        mlx5_ib_destroy_qp(dev->umrc.qp);
2439        ib_free_cq(dev->umrc.cq);
2440        ib_dealloc_pd(dev->umrc.pd);
2441}
2442
2443enum {
2444        MAX_UMR_WR = 128,
2445};
2446
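/*
 * Create the resources used for UMR (user memory registration) work
 * requests: a dedicated PD, CQ and a MLX5_IB_QPT_REG_UMR QP moved through
 * INIT/RTR/RTS.  The semaphore is initialised to the same MAX_UMR_WR as the
 * QP's max_send_wr so posters of UMR work requests can be throttled.
 */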
2447static int create_umr_res(struct mlx5_ib_dev *dev)
2448{
2449        struct ib_qp_init_attr *init_attr = NULL;
2450        struct ib_qp_attr *attr = NULL;
2451        struct ib_pd *pd;
2452        struct ib_cq *cq;
2453        struct ib_qp *qp;
2454        int ret;
2455
2456        attr = kzalloc(sizeof(*attr), GFP_KERNEL);
2457        init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL);
2458        if (!attr || !init_attr) {
2459                ret = -ENOMEM;
2460                goto error_0;
2461        }
2462
2463        pd = ib_alloc_pd(&dev->ib_dev, 0);
2464        if (IS_ERR(pd)) {
2465                mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
2466                ret = PTR_ERR(pd);
2467                goto error_0;
2468        }
2469
2470        cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ);
2471        if (IS_ERR(cq)) {
2472                mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n");
2473                ret = PTR_ERR(cq);
2474                goto error_2;
2475        }
2476
2477        init_attr->send_cq = cq;
2478        init_attr->recv_cq = cq;
2479        init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
2480        init_attr->cap.max_send_wr = MAX_UMR_WR;
2481        init_attr->cap.max_send_sge = 1;
2482        init_attr->qp_type = MLX5_IB_QPT_REG_UMR;
2483        init_attr->port_num = 1;
2484        qp = mlx5_ib_create_qp(pd, init_attr, NULL);
2485        if (IS_ERR(qp)) {
2486                mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n");
2487                ret = PTR_ERR(qp);
2488                goto error_3;
2489        }
2490        qp->device     = &dev->ib_dev;
2491        qp->real_qp    = qp;
2492        qp->uobject    = NULL;
2493        qp->qp_type    = MLX5_IB_QPT_REG_UMR;
2494
2495        attr->qp_state = IB_QPS_INIT;
2496        attr->port_num = 1;
2497        ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX |
2498                                IB_QP_PORT, NULL);
2499        if (ret) {
2500                mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
2501                goto error_4;
2502        }
2503
2504        memset(attr, 0, sizeof(*attr));
2505        attr->qp_state = IB_QPS_RTR;
2506        attr->path_mtu = IB_MTU_256;
2507
2508        ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
2509        if (ret) {
2510                mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n");
2511                goto error_4;
2512        }
2513
2514        memset(attr, 0, sizeof(*attr));
2515        attr->qp_state = IB_QPS_RTS;
2516        ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
2517        if (ret) {
2518                mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n");
2519                goto error_4;
2520        }
2521
2522        dev->umrc.qp = qp;
2523        dev->umrc.cq = cq;
2524        dev->umrc.pd = pd;
2525
2526        sema_init(&dev->umrc.sem, MAX_UMR_WR);
2527        ret = mlx5_mr_cache_init(dev);
2528        if (ret) {
2529                mlx5_ib_warn(dev, "mr cache init failed %d\n", ret);
2530                goto error_4;
2531        }
2532
2533        kfree(attr);
2534        kfree(init_attr);
2535
2536        return 0;
2537
2538error_4:
2539        mlx5_ib_destroy_qp(qp);
2540
2541error_3:
2542        ib_free_cq(cq);
2543
2544error_2:
2545        ib_dealloc_pd(pd);
2546
2547error_0:
2548        kfree(attr);
2549        kfree(init_attr);
2550        return ret;
2551}
2552
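/*
 * Allocate the driver's internal verbs resources: PD p0, CQ c0, XRC
 * domains x0/x1, an XRC SRQ s0 and a basic SRQ s1.  These objects are
 * created without user contexts, so their generic ib_* fields and use
 * counts are initialised by hand here.
 */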
2553static int create_dev_resources(struct mlx5_ib_resources *devr)
2554{
2555        struct ib_srq_init_attr attr;
2556        struct mlx5_ib_dev *dev;
2557        struct ib_cq_init_attr cq_attr = {.cqe = 1};
2558        int port;
2559        int ret = 0;
2560
2561        dev = container_of(devr, struct mlx5_ib_dev, devr);
2562
2563        mutex_init(&devr->mutex);
2564
2565        devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL);
2566        if (IS_ERR(devr->p0)) {
2567                ret = PTR_ERR(devr->p0);
2568                goto error0;
2569        }
2570        devr->p0->device  = &dev->ib_dev;
2571        devr->p0->uobject = NULL;
2572        atomic_set(&devr->p0->usecnt, 0);
2573
2574        devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, &cq_attr, NULL, NULL);
2575        if (IS_ERR(devr->c0)) {
2576                ret = PTR_ERR(devr->c0);
2577                goto error1;
2578        }
2579        devr->c0->device        = &dev->ib_dev;
2580        devr->c0->uobject       = NULL;
2581        devr->c0->comp_handler  = NULL;
2582        devr->c0->event_handler = NULL;
2583        devr->c0->cq_context    = NULL;
2584        atomic_set(&devr->c0->usecnt, 0);
2585
2586        devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL);
2587        if (IS_ERR(devr->x0)) {
2588                ret = PTR_ERR(devr->x0);
2589                goto error2;
2590        }
2591        devr->x0->device = &dev->ib_dev;
2592        devr->x0->inode = NULL;
2593        atomic_set(&devr->x0->usecnt, 0);
2594        mutex_init(&devr->x0->tgt_qp_mutex);
2595        INIT_LIST_HEAD(&devr->x0->tgt_qp_list);
2596
2597        devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL);
2598        if (IS_ERR(devr->x1)) {
2599                ret = PTR_ERR(devr->x1);
2600                goto error3;
2601        }
2602        devr->x1->device = &dev->ib_dev;
2603        devr->x1->inode = NULL;
2604        atomic_set(&devr->x1->usecnt, 0);
2605        mutex_init(&devr->x1->tgt_qp_mutex);
2606        INIT_LIST_HEAD(&devr->x1->tgt_qp_list);
2607
2608        memset(&attr, 0, sizeof(attr));
2609        attr.attr.max_sge = 1;
2610        attr.attr.max_wr = 1;
2611        attr.srq_type = IB_SRQT_XRC;
2612        attr.ext.xrc.cq = devr->c0;
2613        attr.ext.xrc.xrcd = devr->x0;
2614
2615        devr->s0 = mlx5_ib_create_srq(devr->p0, &attr, NULL);
2616        if (IS_ERR(devr->s0)) {
2617                ret = PTR_ERR(devr->s0);
2618                goto error4;
2619        }
2620        devr->s0->device        = &dev->ib_dev;
2621        devr->s0->pd            = devr->p0;
2622        devr->s0->uobject       = NULL;
2623        devr->s0->event_handler = NULL;
2624        devr->s0->srq_context   = NULL;
2625        devr->s0->srq_type      = IB_SRQT_XRC;
2626        devr->s0->ext.xrc.xrcd  = devr->x0;
2627        devr->s0->ext.xrc.cq    = devr->c0;
2628        atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt);
2629        atomic_inc(&devr->s0->ext.xrc.cq->usecnt);
2630        atomic_inc(&devr->p0->usecnt);
2631        atomic_set(&devr->s0->usecnt, 0);
2632
2633        memset(&attr, 0, sizeof(attr));
2634        attr.attr.max_sge = 1;
2635        attr.attr.max_wr = 1;
2636        attr.srq_type = IB_SRQT_BASIC;
2637        devr->s1 = mlx5_ib_create_srq(devr->p0, &attr, NULL);
2638        if (IS_ERR(devr->s1)) {
2639                ret = PTR_ERR(devr->s1);
2640                goto error5;
2641        }
2642        devr->s1->device        = &dev->ib_dev;
2643        devr->s1->pd            = devr->p0;
2644        devr->s1->uobject       = NULL;
2645        devr->s1->event_handler = NULL;
2646        devr->s1->srq_context   = NULL;
2647        devr->s1->srq_type      = IB_SRQT_BASIC;
2648        devr->s1->ext.xrc.cq    = devr->c0;
2649        atomic_inc(&devr->p0->usecnt);
2650        atomic_set(&devr->s1->usecnt, 0);
2651
2652        for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) {
2653                INIT_WORK(&devr->ports[port].pkey_change_work,
2654                          pkey_change_handler);
2655                devr->ports[port].devr = devr;
2656        }
2657
2658        return 0;
2659
2660error5:
2661        mlx5_ib_destroy_srq(devr->s0);
2662error4:
2663        mlx5_ib_dealloc_xrcd(devr->x1);
2664error3:
2665        mlx5_ib_dealloc_xrcd(devr->x0);
2666error2:
2667        mlx5_ib_destroy_cq(devr->c0);
2668error1:
2669        mlx5_ib_dealloc_pd(devr->p0);
2670error0:
2671        return ret;
2672}
2673
2674static void destroy_dev_resources(struct mlx5_ib_resources *devr)
2675{
2676        struct mlx5_ib_dev *dev =
2677                container_of(devr, struct mlx5_ib_dev, devr);
2678        int port;
2679
2680        mlx5_ib_destroy_srq(devr->s1);
2681        mlx5_ib_destroy_srq(devr->s0);
2682        mlx5_ib_dealloc_xrcd(devr->x0);
2683        mlx5_ib_dealloc_xrcd(devr->x1);
2684        mlx5_ib_destroy_cq(devr->c0);
2685        mlx5_ib_dealloc_pd(devr->p0);
2686
2687        /* Make sure no P_Key change work items are still executing */
2688        for (port = 0; port < dev->num_ports; ++port)
2689                cancel_work_sync(&devr->ports[port].pkey_change_work);
2690}
2691
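/*
 * Compute the RDMA core port capability flags (based on port 1's link
 * layer): plain IB for an InfiniBand link layer; for Ethernet, the RoCE
 * v1/v2 flags are reported only when the device supports both IPv4 and
 * IPv6 L3 types.
 */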
2692static u32 get_core_cap_flags(struct ib_device *ibdev)
2693{
2694        struct mlx5_ib_dev *dev = to_mdev(ibdev);
2695        enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1);
2696        u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type);
2697        u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version);
2698        u32 ret = 0;
2699
2700        if (ll == IB_LINK_LAYER_INFINIBAND)
2701                return RDMA_CORE_PORT_IBA_IB;
2702
2703        if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP))
2704                return 0;
2705
2706        if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP))
2707                return 0;
2708
2709        if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP)
2710                ret |= RDMA_CORE_PORT_IBA_ROCE;
2711
2712        if (roce_version_cap & MLX5_ROCE_VERSION_2_CAP)
2713                ret |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
2714
2715        return ret;
2716}
2717
2718static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num,
2719                               struct ib_port_immutable *immutable)
2720{
2721        struct ib_port_attr attr;
2722        int err;
2723
2724        err = mlx5_ib_query_port(ibdev, port_num, &attr);
2725        if (err)
2726                return err;
2727
2728        immutable->pkey_tbl_len = attr.pkey_tbl_len;
2729        immutable->gid_tbl_len = attr.gid_tbl_len;
2730        immutable->core_cap_flags = get_core_cap_flags(ibdev);
2731        immutable->max_mad_size = IB_MGMT_MAD_SIZE;
2732
2733        return 0;
2734}
2735
2736static void get_dev_fw_str(struct ib_device *ibdev, char *str,
2737                           size_t str_len)
2738{
2739        struct mlx5_ib_dev *dev =
2740                container_of(ibdev, struct mlx5_ib_dev, ib_dev);
2741        snprintf(str, str_len, "%d.%d.%04d", fw_rev_maj(dev->mdev),
2742                       fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev));
2743}
2744
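/*
 * When the core device is part of an active LAG bond, create the vport LAG
 * and a demux flow table in the LAG namespace; the table is kept in
 * flow_db.lag_demux_ft so mlx5_roce_lag_cleanup() can destroy it.
 */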
2745static int mlx5_roce_lag_init(struct mlx5_ib_dev *dev)
2746{
2747        struct mlx5_core_dev *mdev = dev->mdev;
2748        struct mlx5_flow_namespace *ns = mlx5_get_flow_namespace(mdev,
2749                                                                 MLX5_FLOW_NAMESPACE_LAG);
2750        struct mlx5_flow_table *ft;
2751        int err;
2752
2753        if (!ns || !mlx5_lag_is_active(mdev))
2754                return 0;
2755
2756        err = mlx5_cmd_create_vport_lag(mdev);
2757        if (err)
2758                return err;
2759
2760        ft = mlx5_create_lag_demux_flow_table(ns, 0, 0);
2761        if (IS_ERR(ft)) {
2762                err = PTR_ERR(ft);
2763                goto err_destroy_vport_lag;
2764        }
2765
2766        dev->flow_db.lag_demux_ft = ft;
2767        return 0;
2768
2769err_destroy_vport_lag:
2770        mlx5_cmd_destroy_vport_lag(mdev);
2771        return err;
2772}
2773
2774static void mlx5_roce_lag_cleanup(struct mlx5_ib_dev *dev)
2775{
2776        struct mlx5_core_dev *mdev = dev->mdev;
2777
2778        if (dev->flow_db.lag_demux_ft) {
2779                mlx5_destroy_flow_table(dev->flow_db.lag_demux_ft);
2780                dev->flow_db.lag_demux_ft = NULL;
2781
2782                mlx5_cmd_destroy_vport_lag(mdev);
2783        }
2784}
2785
2786static void mlx5_remove_roce_notifier(struct mlx5_ib_dev *dev)
2787{
2788        if (dev->roce.nb.notifier_call) {
2789                unregister_netdevice_notifier(&dev->roce.nb);
2790                dev->roce.nb.notifier_call = NULL;
2791        }
2792}
2793
2794static int mlx5_enable_roce(struct mlx5_ib_dev *dev)
2795{
2796        int err;
2797
2798        dev->roce.nb.notifier_call = mlx5_netdev_event;
2799        err = register_netdevice_notifier(&dev->roce.nb);
2800        if (err) {
2801                dev->roce.nb.notifier_call = NULL;
2802                return err;
2803        }
2804
2805        err = mlx5_nic_vport_enable_roce(dev->mdev);
2806        if (err)
2807                goto err_unregister_netdevice_notifier;
2808
2809        err = mlx5_roce_lag_init(dev);
2810        if (err)
2811                goto err_disable_roce;
2812
2813        return 0;
2814
2815err_disable_roce:
2816        mlx5_nic_vport_disable_roce(dev->mdev);
2817
2818err_unregister_netdevice_notifier:
2819        mlx5_remove_roce_notifier(dev);
2820        return err;
2821}
2822
2823static void mlx5_disable_roce(struct mlx5_ib_dev *dev)
2824{
2825        mlx5_roce_lag_cleanup(dev);
2826        mlx5_nic_vport_disable_roce(dev->mdev);
2827}
2828
2829static void mlx5_ib_dealloc_q_counters(struct mlx5_ib_dev *dev)
2830{
2831        unsigned int i;
2832
2833        for (i = 0; i < dev->num_ports; i++)
2834                mlx5_core_dealloc_q_counter(dev->mdev,
2835                                            dev->port[i].q_cnt_id);
2836}
2837
2838static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev)
2839{
2840        int i;
2841        int ret;
2842
2843        for (i = 0; i < dev->num_ports; i++) {
2844                ret = mlx5_core_alloc_q_counter(dev->mdev,
2845                                                &dev->port[i].q_cnt_id);
2846                if (ret) {
2847                        mlx5_ib_warn(dev,
2848                                     "couldn't allocate queue counter for port %d, err %d\n",
2849                                     i + 1, ret);
2850                        goto dealloc_counters;
2851                }
2852        }
2853
2854        return 0;
2855
2856dealloc_counters:
2857        while (--i >= 0)
2858                mlx5_core_dealloc_q_counter(dev->mdev,
2859                                            dev->port[i].q_cnt_id);
2860
2861        return ret;
2862}
2863
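    /*
     * Counter names exported through the rdma_hw_stats interface.  The
     * entries must stay in the same order as stats_offsets[] below; the
     * BUILD_BUG_ON() in mlx5_ib_alloc_hw_stats() keeps both arrays the
     * same length.
     */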
2864static const char * const names[] = {
2865        "rx_write_requests",
2866        "rx_read_requests",
2867        "rx_atomic_requests",
2868        "out_of_buffer",
2869        "out_of_sequence",
2870        "duplicate_request",
2871        "rnr_nak_retry_err",
2872        "packet_seq_err",
2873        "implied_nak_seq_err",
2874        "local_ack_timeout_err",
2875};
2876
2877static const size_t stats_offsets[] = {
2878        MLX5_BYTE_OFF(query_q_counter_out, rx_write_requests),
2879        MLX5_BYTE_OFF(query_q_counter_out, rx_read_requests),
2880        MLX5_BYTE_OFF(query_q_counter_out, rx_atomic_requests),
2881        MLX5_BYTE_OFF(query_q_counter_out, out_of_buffer),
2882        MLX5_BYTE_OFF(query_q_counter_out, out_of_sequence),
2883        MLX5_BYTE_OFF(query_q_counter_out, duplicate_request),
2884        MLX5_BYTE_OFF(query_q_counter_out, rnr_nak_retry_err),
2885        MLX5_BYTE_OFF(query_q_counter_out, packet_seq_err),
2886        MLX5_BYTE_OFF(query_q_counter_out, implied_nak_seq_err),
2887        MLX5_BYTE_OFF(query_q_counter_out, local_ack_timeout_err),
2888};
2889
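    /*
     * Allocate the rdma_hw_stats structure describing one port's Q
     * counters.  Device-wide statistics (port_num == 0) are not
     * supported.
     */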
2890static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev,
2891                                                    u8 port_num)
2892{
2893        BUILD_BUG_ON(ARRAY_SIZE(names) != ARRAY_SIZE(stats_offsets));
2894
2895        /* We support only per-port stats */
2896        if (port_num == 0)
2897                return NULL;
2898
2899        return rdma_alloc_hw_stats_struct(names, ARRAY_SIZE(names),
2900                                          RDMA_HW_STATS_DEFAULT_LIFESPAN);
2901}
2902
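    /*
     * Query the port's Q counter in firmware and copy each 32-bit
     * big-endian field into the 64-bit stats array, in the order defined
     * by names[]/stats_offsets[].  Returns the number of counters filled
     * in, or a negative errno.
     */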
2903static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
2904                                struct rdma_hw_stats *stats,
2905                                u8 port, int index)
2906{
2907        struct mlx5_ib_dev *dev = to_mdev(ibdev);
2908        int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out);
2909        void *out;
2910        __be32 val;
2911        int ret;
2912        int i;
2913
2914        if (!port || !stats)
2915                return -EINVAL;
2916
2917        out = mlx5_vzalloc(outlen);
2918        if (!out)
2919                return -ENOMEM;
2920
2921        ret = mlx5_core_query_q_counter(dev->mdev,
2922                                        dev->port[port - 1].q_cnt_id, 0,
2923                                        out, outlen);
2924        if (ret)
2925                goto free;
2926        for (i = 0; i < ARRAY_SIZE(names); i++) {
2927                val = *(__be32 *)(out + stats_offsets[i]);
2928                stats->value[i] = (u64)be32_to_cpu(val);
2929        }
2930        ret = ARRAY_SIZE(names);
2931free:
2932        kvfree(out);
2933        return ret;
2934}
2935
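    /*
     * .add() callback of the mlx5_core interface: allocate and populate
     * the ib_device, wire up the verbs ops (RoCE, memory windows, XRC,
     * hw counters and flow steering are enabled conditionally on device
     * caps), create driver resources and register with the IB core.
     * Returns the new mlx5_ib_dev, or NULL if the device cannot be
     * supported or setup fails.
     */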
2936static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
2937{
2938        struct mlx5_ib_dev *dev;
2939        enum rdma_link_layer ll;
2940        int port_type_cap;
2941        const char *name;
2942        int err;
2943        int i;
2944
2945        port_type_cap = MLX5_CAP_GEN(mdev, port_type);
2946        ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
2947
2948        if ((ll == IB_LINK_LAYER_ETHERNET) && !MLX5_CAP_GEN(mdev, roce))
2949                return NULL;
2950
2951        printk_once(KERN_INFO "%s", mlx5_version);
2952
2953        dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev));
2954        if (!dev)
2955                return NULL;
2956
2957        dev->mdev = mdev;
2958
2959        dev->port = kcalloc(MLX5_CAP_GEN(mdev, num_ports), sizeof(*dev->port),
2960                            GFP_KERNEL);
2961        if (!dev->port)
2962                goto err_dealloc;
2963
2964        rwlock_init(&dev->roce.netdev_lock);
2965        err = get_port_caps(dev);
2966        if (err)
2967                goto err_free_port;
2968
2969        if (mlx5_use_mad_ifc(dev))
2970                get_ext_port_caps(dev);
2971
2972        MLX5_INIT_DOORBELL_LOCK(&dev->uar_lock);
2973
2974        if (!mlx5_lag_is_active(mdev))
2975                name = "mlx5_%d";
2976        else
2977                name = "mlx5_bond_%d";
2978
2979        strlcpy(dev->ib_dev.name, name, IB_DEVICE_NAME_MAX);
2980        dev->ib_dev.owner               = THIS_MODULE;
2981        dev->ib_dev.node_type           = RDMA_NODE_IB_CA;
2982        dev->ib_dev.local_dma_lkey      = 0 /* not supported for now */;
2983        dev->num_ports          = MLX5_CAP_GEN(mdev, num_ports);
2984        dev->ib_dev.phys_port_cnt     = dev->num_ports;
2985        dev->ib_dev.num_comp_vectors    =
2986                dev->mdev->priv.eq_table.num_comp_vectors;
2987        dev->ib_dev.dma_device  = &mdev->pdev->dev;
2988
2989        dev->ib_dev.uverbs_abi_ver      = MLX5_IB_UVERBS_ABI_VERSION;
2990        dev->ib_dev.uverbs_cmd_mask     =
2991                (1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
2992                (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
2993                (1ull << IB_USER_VERBS_CMD_QUERY_PORT)          |
2994                (1ull << IB_USER_VERBS_CMD_ALLOC_PD)            |
2995                (1ull << IB_USER_VERBS_CMD_DEALLOC_PD)          |
2996                (1ull << IB_USER_VERBS_CMD_REG_MR)              |
2997                (1ull << IB_USER_VERBS_CMD_REREG_MR)            |
2998                (1ull << IB_USER_VERBS_CMD_DEREG_MR)            |
2999                (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
3000                (1ull << IB_USER_VERBS_CMD_CREATE_CQ)           |
3001                (1ull << IB_USER_VERBS_CMD_RESIZE_CQ)           |
3002                (1ull << IB_USER_VERBS_CMD_DESTROY_CQ)          |
3003                (1ull << IB_USER_VERBS_CMD_CREATE_QP)           |
3004                (1ull << IB_USER_VERBS_CMD_MODIFY_QP)           |
3005                (1ull << IB_USER_VERBS_CMD_QUERY_QP)            |
3006                (1ull << IB_USER_VERBS_CMD_DESTROY_QP)          |
3007                (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)        |
3008                (1ull << IB_USER_VERBS_CMD_DETACH_MCAST)        |
3009                (1ull << IB_USER_VERBS_CMD_CREATE_SRQ)          |
3010                (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)          |
3011                (1ull << IB_USER_VERBS_CMD_QUERY_SRQ)           |
3012                (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)         |
3013                (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ)         |
3014                (1ull << IB_USER_VERBS_CMD_OPEN_QP);
3015        dev->ib_dev.uverbs_ex_cmd_mask =
3016                (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE)     |
3017                (1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ)        |
3018                (1ull << IB_USER_VERBS_EX_CMD_CREATE_QP);
3019
3020        dev->ib_dev.query_device        = mlx5_ib_query_device;
3021        dev->ib_dev.query_port          = mlx5_ib_query_port;
3022        dev->ib_dev.get_link_layer      = mlx5_ib_port_link_layer;
3023        if (ll == IB_LINK_LAYER_ETHERNET)
3024                dev->ib_dev.get_netdev  = mlx5_ib_get_netdev;
3025        dev->ib_dev.query_gid           = mlx5_ib_query_gid;
3026        dev->ib_dev.add_gid             = mlx5_ib_add_gid;
3027        dev->ib_dev.del_gid             = mlx5_ib_del_gid;
3028        dev->ib_dev.query_pkey          = mlx5_ib_query_pkey;
3029        dev->ib_dev.modify_device       = mlx5_ib_modify_device;
3030        dev->ib_dev.modify_port         = mlx5_ib_modify_port;
3031        dev->ib_dev.alloc_ucontext      = mlx5_ib_alloc_ucontext;
3032        dev->ib_dev.dealloc_ucontext    = mlx5_ib_dealloc_ucontext;
3033        dev->ib_dev.mmap                = mlx5_ib_mmap;
3034        dev->ib_dev.alloc_pd            = mlx5_ib_alloc_pd;
3035        dev->ib_dev.dealloc_pd          = mlx5_ib_dealloc_pd;
3036        dev->ib_dev.create_ah           = mlx5_ib_create_ah;
3037        dev->ib_dev.query_ah            = mlx5_ib_query_ah;
3038        dev->ib_dev.destroy_ah          = mlx5_ib_destroy_ah;
3039        dev->ib_dev.create_srq          = mlx5_ib_create_srq;
3040        dev->ib_dev.modify_srq          = mlx5_ib_modify_srq;
3041        dev->ib_dev.query_srq           = mlx5_ib_query_srq;
3042        dev->ib_dev.destroy_srq         = mlx5_ib_destroy_srq;
3043        dev->ib_dev.post_srq_recv       = mlx5_ib_post_srq_recv;
3044        dev->ib_dev.create_qp           = mlx5_ib_create_qp;
3045        dev->ib_dev.modify_qp           = mlx5_ib_modify_qp;
3046        dev->ib_dev.query_qp            = mlx5_ib_query_qp;
3047        dev->ib_dev.destroy_qp          = mlx5_ib_destroy_qp;
3048        dev->ib_dev.post_send           = mlx5_ib_post_send;
3049        dev->ib_dev.post_recv           = mlx5_ib_post_recv;
3050        dev->ib_dev.create_cq           = mlx5_ib_create_cq;
3051        dev->ib_dev.modify_cq           = mlx5_ib_modify_cq;
3052        dev->ib_dev.resize_cq           = mlx5_ib_resize_cq;
3053        dev->ib_dev.destroy_cq          = mlx5_ib_destroy_cq;
3054        dev->ib_dev.poll_cq             = mlx5_ib_poll_cq;
3055        dev->ib_dev.req_notify_cq       = mlx5_ib_arm_cq;
3056        dev->ib_dev.get_dma_mr          = mlx5_ib_get_dma_mr;
3057        dev->ib_dev.reg_user_mr         = mlx5_ib_reg_user_mr;
3058        dev->ib_dev.rereg_user_mr       = mlx5_ib_rereg_user_mr;
3059        dev->ib_dev.dereg_mr            = mlx5_ib_dereg_mr;
3060        dev->ib_dev.attach_mcast        = mlx5_ib_mcg_attach;
3061        dev->ib_dev.detach_mcast        = mlx5_ib_mcg_detach;
3062        dev->ib_dev.process_mad         = mlx5_ib_process_mad;
3063        dev->ib_dev.alloc_mr            = mlx5_ib_alloc_mr;
3064        dev->ib_dev.map_mr_sg           = mlx5_ib_map_mr_sg;
3065        dev->ib_dev.check_mr_status     = mlx5_ib_check_mr_status;
3066        dev->ib_dev.get_port_immutable  = mlx5_port_immutable;
3067        dev->ib_dev.get_dev_fw_str      = get_dev_fw_str;
3068        if (mlx5_core_is_pf(mdev)) {
3069                dev->ib_dev.get_vf_config       = mlx5_ib_get_vf_config;
3070                dev->ib_dev.set_vf_link_state   = mlx5_ib_set_vf_link_state;
3071                dev->ib_dev.get_vf_stats        = mlx5_ib_get_vf_stats;
3072                dev->ib_dev.set_vf_guid         = mlx5_ib_set_vf_guid;
3073        }
3074
3075        dev->ib_dev.disassociate_ucontext = mlx5_ib_disassociate_ucontext;
3076
3077        mlx5_ib_internal_fill_odp_caps(dev);
3078
3079        if (MLX5_CAP_GEN(mdev, imaicl)) {
3080                dev->ib_dev.alloc_mw            = mlx5_ib_alloc_mw;
3081                dev->ib_dev.dealloc_mw          = mlx5_ib_dealloc_mw;
3082                dev->ib_dev.uverbs_cmd_mask |=
3083                        (1ull << IB_USER_VERBS_CMD_ALLOC_MW)    |
3084                        (1ull << IB_USER_VERBS_CMD_DEALLOC_MW);
3085        }
3086
3087        if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt) &&
3088            MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) {
3089                dev->ib_dev.get_hw_stats        = mlx5_ib_get_hw_stats;
3090                dev->ib_dev.alloc_hw_stats      = mlx5_ib_alloc_hw_stats;
3091        }
3092
3093        if (MLX5_CAP_GEN(mdev, xrc)) {
3094                dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd;
3095                dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd;
3096                dev->ib_dev.uverbs_cmd_mask |=
3097                        (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) |
3098                        (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
3099        }
3100
3101        if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) ==
3102            IB_LINK_LAYER_ETHERNET) {
3103                dev->ib_dev.create_flow = mlx5_ib_create_flow;
3104                dev->ib_dev.destroy_flow = mlx5_ib_destroy_flow;
3105                dev->ib_dev.create_wq    = mlx5_ib_create_wq;
3106                dev->ib_dev.modify_wq    = mlx5_ib_modify_wq;
3107                dev->ib_dev.destroy_wq   = mlx5_ib_destroy_wq;
3108                dev->ib_dev.create_rwq_ind_table = mlx5_ib_create_rwq_ind_table;
3109                dev->ib_dev.destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table;
3110                dev->ib_dev.uverbs_ex_cmd_mask |=
3111                        (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) |
3112                        (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW) |
3113                        (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) |
3114                        (1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) |
3115                        (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) |
3116                        (1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) |
3117                        (1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL);
3118        }
3119        err = init_node_data(dev);
3120        if (err)
3121                goto err_free_port;
3122
3123        mutex_init(&dev->flow_db.lock);
3124        mutex_init(&dev->cap_mask_mutex);
3125        INIT_LIST_HEAD(&dev->qp_list);
3126        spin_lock_init(&dev->reset_flow_resource_lock);
3127
3128        if (ll == IB_LINK_LAYER_ETHERNET) {
3129                err = mlx5_enable_roce(dev);
3130                if (err)
3131                        goto err_free_port;
3132        }
3133
3134        err = create_dev_resources(&dev->devr);
3135        if (err)
3136                goto err_disable_roce;
3137
3138        err = mlx5_ib_odp_init_one(dev);
3139        if (err)
3140                goto err_rsrc;
3141
3142        err = mlx5_ib_alloc_q_counters(dev);
3143        if (err)
3144                goto err_odp;
3145
3146        err = ib_register_device(&dev->ib_dev, NULL);
3147        if (err)
3148                goto err_q_cnt;
3149
3150        err = create_umr_res(dev);
3151        if (err)
3152                goto err_dev;
3153
3154        for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) {
3155                err = device_create_file(&dev->ib_dev.dev,
3156                                         mlx5_class_attributes[i]);
3157                if (err)
3158                        goto err_umrc;
3159        }
3160
3161        dev->ib_active = true;
3162
3163        return dev;
3164
3165err_umrc:
3166        destroy_umrc_res(dev);
3167
3168err_dev:
3169        ib_unregister_device(&dev->ib_dev);
3170
3171err_q_cnt:
3172        mlx5_ib_dealloc_q_counters(dev);
3173
3174err_odp:
3175        mlx5_ib_odp_remove_one(dev);
3176
3177err_rsrc:
3178        destroy_dev_resources(&dev->devr);
3179
3180err_disable_roce:
3181        if (ll == IB_LINK_LAYER_ETHERNET) {
3182                mlx5_disable_roce(dev);
3183                mlx5_remove_roce_notifier(dev);
3184        }
3185
3186err_free_port:
3187        kfree(dev->port);
3188
3189err_dealloc:
3190        ib_dealloc_device((struct ib_device *)dev);
3191
3192        return NULL;
3193}
3194
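    /*
     * .remove() callback: tear down everything mlx5_ib_add() set up,
     * largely in reverse order.
     */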
3195static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
3196{
3197        struct mlx5_ib_dev *dev = context;
3198        enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, 1);
3199
3200        mlx5_remove_roce_notifier(dev);
3201        ib_unregister_device(&dev->ib_dev);
3202        mlx5_ib_dealloc_q_counters(dev);
3203        destroy_umrc_res(dev);
3204        mlx5_ib_odp_remove_one(dev);
3205        destroy_dev_resources(&dev->devr);
3206        if (ll == IB_LINK_LAYER_ETHERNET)
3207                mlx5_disable_roce(dev);
3208        kfree(dev->port);
3209        ib_dealloc_device(&dev->ib_dev);
3210}
3211
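    /*
     * Registration hooks for mlx5_core: .add()/.remove() are called for
     * each mlx5 device that exposes the IB protocol, and .event()
     * forwards asynchronous device events (port state changes, fatal
     * errors, ...) to mlx5_ib_event().
     */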
3212static struct mlx5_interface mlx5_ib_interface = {
3213        .add            = mlx5_ib_add,
3214        .remove         = mlx5_ib_remove,
3215        .event          = mlx5_ib_event,
3216        .protocol       = MLX5_INTERFACE_PROTOCOL_IB,
3217};
3218
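    /*
     * Module init: set up on-demand paging (ODP) support first, then
     * register with mlx5_core so existing and future mlx5 devices get an
     * IB interface.
     */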
3219static int __init mlx5_ib_init(void)
3220{
3221        int err;
3222
3223        if (deprecated_prof_sel != 2)
3224                pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n");
3225
3226        err = mlx5_ib_odp_init();
3227        if (err)
3228                return err;
3229
3230        err = mlx5_register_interface(&mlx5_ib_interface);
3231        if (err)
3232                goto clean_odp;
3233
3234        return err;
3235
3236clean_odp:
3237        mlx5_ib_odp_cleanup();
3238        return err;
3239}
3240
3241static void __exit mlx5_ib_cleanup(void)
3242{
3243        mlx5_unregister_interface(&mlx5_ib_interface);
3244        mlx5_ib_odp_cleanup();
3245}
3246
3247module_init(mlx5_ib_init);
3248module_exit(mlx5_ib_cleanup);
3249