linux/drivers/infiniband/hw/mlx5/main.c
   1// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
   2/*
   3 * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved.
   4 */
   5
   6#include <linux/debugfs.h>
   7#include <linux/highmem.h>
   8#include <linux/module.h>
   9#include <linux/init.h>
  10#include <linux/errno.h>
  11#include <linux/pci.h>
  12#include <linux/dma-mapping.h>
  13#include <linux/slab.h>
  14#include <linux/bitmap.h>
  15#include <linux/sched.h>
  16#include <linux/sched/mm.h>
  17#include <linux/sched/task.h>
  18#include <linux/delay.h>
  19#include <rdma/ib_user_verbs.h>
  20#include <rdma/ib_addr.h>
  21#include <rdma/ib_cache.h>
  22#include <linux/mlx5/port.h>
  23#include <linux/mlx5/vport.h>
  24#include <linux/mlx5/fs.h>
  25#include <linux/mlx5/eswitch.h>
  26#include <linux/list.h>
  27#include <rdma/ib_smi.h>
  28#include <rdma/ib_umem.h>
  29#include <rdma/lag.h>
  30#include <linux/in.h>
  31#include <linux/etherdevice.h>
  32#include "mlx5_ib.h"
  33#include "ib_rep.h"
  34#include "cmd.h"
  35#include "devx.h"
  36#include "fs.h"
  37#include "srq.h"
  38#include "qp.h"
  39#include "wr.h"
  40#include "restrack.h"
  41#include "counters.h"
  42#include <linux/mlx5/accel.h>
  43#include <rdma/uverbs_std_types.h>
  44#include <rdma/mlx5_user_ioctl_verbs.h>
  45#include <rdma/mlx5_user_ioctl_cmds.h>
  46#include <rdma/ib_umem_odp.h>
  47
  48#define UVERBS_MODULE_NAME mlx5_ib
  49#include <rdma/uverbs_named_ioctl.h>
  50
  51MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
  52MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) IB driver");
  53MODULE_LICENSE("Dual BSD/GPL");
  54
  55struct mlx5_ib_event_work {
  56        struct work_struct      work;
  57        union {
  58                struct mlx5_ib_dev            *dev;
  59                struct mlx5_ib_multiport_info *mpi;
  60        };
  61        bool                    is_slave;
  62        unsigned int            event;
  63        void                    *param;
  64};
  65
  66enum {
  67        MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3,
  68};
  69
  70static struct workqueue_struct *mlx5_ib_event_wq;
  71static LIST_HEAD(mlx5_ib_unaffiliated_port_list);
  72static LIST_HEAD(mlx5_ib_dev_list);
  73/*
  74 * This mutex should be held when accessing either of the above lists
  75 */
  76static DEFINE_MUTEX(mlx5_ib_multiport_mutex);
  77
  78/* We can't use an array for xlt_emergency_page because dma_map_single
   79 * doesn't work on kernel module memory
  80 */
  81static unsigned long xlt_emergency_page;
  82static struct mutex xlt_emergency_page_mutex;
  83
  84struct mlx5_ib_dev *mlx5_ib_get_ibdev_from_mpi(struct mlx5_ib_multiport_info *mpi)
  85{
  86        struct mlx5_ib_dev *dev;
  87
  88        mutex_lock(&mlx5_ib_multiport_mutex);
  89        dev = mpi->ibdev;
  90        mutex_unlock(&mlx5_ib_multiport_mutex);
  91        return dev;
  92}
  93
  94static enum rdma_link_layer
  95mlx5_port_type_cap_to_rdma_ll(int port_type_cap)
  96{
  97        switch (port_type_cap) {
  98        case MLX5_CAP_PORT_TYPE_IB:
  99                return IB_LINK_LAYER_INFINIBAND;
 100        case MLX5_CAP_PORT_TYPE_ETH:
 101                return IB_LINK_LAYER_ETHERNET;
 102        default:
 103                return IB_LINK_LAYER_UNSPECIFIED;
 104        }
 105}
 106
 107static enum rdma_link_layer
 108mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num)
 109{
 110        struct mlx5_ib_dev *dev = to_mdev(device);
 111        int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type);
 112
 113        return mlx5_port_type_cap_to_rdma_ll(port_type_cap);
 114}
 115
 116static int get_port_state(struct ib_device *ibdev,
 117                          u8 port_num,
 118                          enum ib_port_state *state)
 119{
 120        struct ib_port_attr attr;
 121        int ret;
 122
 123        memset(&attr, 0, sizeof(attr));
 124        ret = ibdev->ops.query_port(ibdev, port_num, &attr);
 125        if (!ret)
 126                *state = attr.state;
 127        return ret;
 128}
 129
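     /* Walk the device's ports and return the mlx5_roce instance whose
      * switchdev representor netdev matches @ndev. On a match, *port_num is
      * set to the 1-based IB port number; NULL is returned when no
      * representor port matches.
      */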
 130static struct mlx5_roce *mlx5_get_rep_roce(struct mlx5_ib_dev *dev,
 131                                           struct net_device *ndev,
 132                                           u8 *port_num)
 133{
 134        struct mlx5_eswitch *esw = dev->mdev->priv.eswitch;
 135        struct net_device *rep_ndev;
 136        struct mlx5_ib_port *port;
 137        int i;
 138
 139        for (i = 0; i < dev->num_ports; i++) {
 140                port  = &dev->port[i];
 141                if (!port->rep)
 142                        continue;
 143
 144                read_lock(&port->roce.netdev_lock);
 145                rep_ndev = mlx5_ib_get_rep_netdev(esw,
 146                                                  port->rep->vport);
 147                if (rep_ndev == ndev) {
 148                        read_unlock(&port->roce.netdev_lock);
 149                        *port_num = i + 1;
 150                        return &port->roce;
 151                }
 152                read_unlock(&port->roce.netdev_lock);
 153        }
 154
 155        return NULL;
 156}
 157
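     /* Netdev notifier for RoCE ports: tracks the netdev backing the port on
      * NETDEV_REGISTER/UNREGISTER and, on NETDEV_CHANGE/UP/DOWN, dispatches
      * IB_EVENT_PORT_ACTIVE or IB_EVENT_PORT_ERR once the derived port state
      * actually changes (last_port_state filters duplicate events). LAG
      * upper devices are taken into account when deciding whether an event
      * is relevant to this port.
      */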
 158static int mlx5_netdev_event(struct notifier_block *this,
 159                             unsigned long event, void *ptr)
 160{
 161        struct mlx5_roce *roce = container_of(this, struct mlx5_roce, nb);
 162        struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
 163        u8 port_num = roce->native_port_num;
 164        struct mlx5_core_dev *mdev;
 165        struct mlx5_ib_dev *ibdev;
 166
 167        ibdev = roce->dev;
 168        mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL);
 169        if (!mdev)
 170                return NOTIFY_DONE;
 171
 172        switch (event) {
 173        case NETDEV_REGISTER:
 174                /* Should already be registered during the load */
 175                if (ibdev->is_rep)
 176                        break;
 177                write_lock(&roce->netdev_lock);
 178                if (ndev->dev.parent == mdev->device)
 179                        roce->netdev = ndev;
 180                write_unlock(&roce->netdev_lock);
 181                break;
 182
 183        case NETDEV_UNREGISTER:
 184                /* In case of reps, ib device goes away before the netdevs */
 185                write_lock(&roce->netdev_lock);
 186                if (roce->netdev == ndev)
 187                        roce->netdev = NULL;
 188                write_unlock(&roce->netdev_lock);
 189                break;
 190
 191        case NETDEV_CHANGE:
 192        case NETDEV_UP:
 193        case NETDEV_DOWN: {
 194                struct net_device *lag_ndev = mlx5_lag_get_roce_netdev(mdev);
 195                struct net_device *upper = NULL;
 196
 197                if (lag_ndev) {
 198                        upper = netdev_master_upper_dev_get(lag_ndev);
 199                        dev_put(lag_ndev);
 200                }
 201
 202                if (ibdev->is_rep)
 203                        roce = mlx5_get_rep_roce(ibdev, ndev, &port_num);
 204                if (!roce)
 205                        return NOTIFY_DONE;
 206                if ((upper == ndev || (!upper && ndev == roce->netdev))
 207                    && ibdev->ib_active) {
 208                        struct ib_event ibev = { };
 209                        enum ib_port_state port_state;
 210
 211                        if (get_port_state(&ibdev->ib_dev, port_num,
 212                                           &port_state))
 213                                goto done;
 214
 215                        if (roce->last_port_state == port_state)
 216                                goto done;
 217
 218                        roce->last_port_state = port_state;
 219                        ibev.device = &ibdev->ib_dev;
 220                        if (port_state == IB_PORT_DOWN)
 221                                ibev.event = IB_EVENT_PORT_ERR;
 222                        else if (port_state == IB_PORT_ACTIVE)
 223                                ibev.event = IB_EVENT_PORT_ACTIVE;
 224                        else
 225                                goto done;
 226
 227                        ibev.element.port_num = port_num;
 228                        ib_dispatch_event(&ibev);
 229                }
 230                break;
 231        }
 232
 233        default:
 234                break;
 235        }
 236done:
 237        mlx5_ib_put_native_port_mdev(ibdev, port_num);
 238        return NOTIFY_DONE;
 239}
 240
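     /* Return the netdev associated with an IB port: the bonded LAG netdev
      * when LAG RoCE is active, otherwise the cached roce.netdev for that
      * port. A reference is held on the returned netdev, which the caller
      * must release with dev_put(); NULL is returned if no netdev is
      * available.
      */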
 241static struct net_device *mlx5_ib_get_netdev(struct ib_device *device,
 242                                             u8 port_num)
 243{
 244        struct mlx5_ib_dev *ibdev = to_mdev(device);
 245        struct net_device *ndev;
 246        struct mlx5_core_dev *mdev;
 247
 248        mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL);
 249        if (!mdev)
 250                return NULL;
 251
 252        ndev = mlx5_lag_get_roce_netdev(mdev);
 253        if (ndev)
 254                goto out;
 255
 256        /* Ensure ndev does not disappear before we invoke dev_hold()
 257         */
 258        read_lock(&ibdev->port[port_num - 1].roce.netdev_lock);
 259        ndev = ibdev->port[port_num - 1].roce.netdev;
 260        if (ndev)
 261                dev_hold(ndev);
 262        read_unlock(&ibdev->port[port_num - 1].roce.netdev_lock);
 263
 264out:
 265        mlx5_ib_put_native_port_mdev(ibdev, port_num);
 266        return ndev;
 267}
 268
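     /* Resolve the mlx5_core_dev that owns @ib_port_num. Without multiport
      * RoCE (or for IB link layer) this is simply ibdev->mdev. With multiport
      * enabled, the affiliated slave's mdev is returned with a reference
      * taken via mpi->mdev_refcnt (the master is not refcounted), and NULL
      * is returned if the port is not affiliated or is being unaffiliated.
      * A non-NULL return must be balanced with
      * mlx5_ib_put_native_port_mdev().
      */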
 269struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev,
 270                                                   u8 ib_port_num,
 271                                                   u8 *native_port_num)
 272{
 273        enum rdma_link_layer ll = mlx5_ib_port_link_layer(&ibdev->ib_dev,
 274                                                          ib_port_num);
 275        struct mlx5_core_dev *mdev = NULL;
 276        struct mlx5_ib_multiport_info *mpi;
 277        struct mlx5_ib_port *port;
 278
 279        if (!mlx5_core_mp_enabled(ibdev->mdev) ||
 280            ll != IB_LINK_LAYER_ETHERNET) {
 281                if (native_port_num)
 282                        *native_port_num = ib_port_num;
 283                return ibdev->mdev;
 284        }
 285
 286        if (native_port_num)
 287                *native_port_num = 1;
 288
 289        port = &ibdev->port[ib_port_num - 1];
 290        spin_lock(&port->mp.mpi_lock);
 291        mpi = ibdev->port[ib_port_num - 1].mp.mpi;
 292        if (mpi && !mpi->unaffiliate) {
 293                mdev = mpi->mdev;
  294                /* If it's the master there's no need to refcount; it will
  295                 * exist as long as the ib_dev exists.
  296                 */
 297                if (!mpi->is_master)
 298                        mpi->mdev_refcnt++;
 299        }
 300        spin_unlock(&port->mp.mpi_lock);
 301
 302        return mdev;
 303}
 304
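     /* Drop the reference taken by mlx5_ib_get_native_port_mdev(). This is a
      * no-op for IB ports, for devices without multiport RoCE and for the
      * master port; for a slave it decrements mdev_refcnt and completes
      * unref_comp when an unaffiliation is in progress.
      */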
 305void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *ibdev, u8 port_num)
 306{
 307        enum rdma_link_layer ll = mlx5_ib_port_link_layer(&ibdev->ib_dev,
 308                                                          port_num);
 309        struct mlx5_ib_multiport_info *mpi;
 310        struct mlx5_ib_port *port;
 311
 312        if (!mlx5_core_mp_enabled(ibdev->mdev) || ll != IB_LINK_LAYER_ETHERNET)
 313                return;
 314
 315        port = &ibdev->port[port_num - 1];
 316
 317        spin_lock(&port->mp.mpi_lock);
 318        mpi = ibdev->port[port_num - 1].mp.mpi;
 319        if (mpi->is_master)
 320                goto out;
 321
 322        mpi->mdev_refcnt--;
 323        if (mpi->unaffiliate)
 324                complete(&mpi->unref_comp);
 325out:
 326        spin_unlock(&port->mp.mpi_lock);
 327}
 328
 329static int translate_eth_legacy_proto_oper(u32 eth_proto_oper,
 330                                           u16 *active_speed, u8 *active_width)
 331{
 332        switch (eth_proto_oper) {
 333        case MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII):
 334        case MLX5E_PROT_MASK(MLX5E_1000BASE_KX):
 335        case MLX5E_PROT_MASK(MLX5E_100BASE_TX):
 336        case MLX5E_PROT_MASK(MLX5E_1000BASE_T):
 337                *active_width = IB_WIDTH_1X;
 338                *active_speed = IB_SPEED_SDR;
 339                break;
 340        case MLX5E_PROT_MASK(MLX5E_10GBASE_T):
 341        case MLX5E_PROT_MASK(MLX5E_10GBASE_CX4):
 342        case MLX5E_PROT_MASK(MLX5E_10GBASE_KX4):
 343        case MLX5E_PROT_MASK(MLX5E_10GBASE_KR):
 344        case MLX5E_PROT_MASK(MLX5E_10GBASE_CR):
 345        case MLX5E_PROT_MASK(MLX5E_10GBASE_SR):
 346        case MLX5E_PROT_MASK(MLX5E_10GBASE_ER):
 347                *active_width = IB_WIDTH_1X;
 348                *active_speed = IB_SPEED_QDR;
 349                break;
 350        case MLX5E_PROT_MASK(MLX5E_25GBASE_CR):
 351        case MLX5E_PROT_MASK(MLX5E_25GBASE_KR):
 352        case MLX5E_PROT_MASK(MLX5E_25GBASE_SR):
 353                *active_width = IB_WIDTH_1X;
 354                *active_speed = IB_SPEED_EDR;
 355                break;
 356        case MLX5E_PROT_MASK(MLX5E_40GBASE_CR4):
 357        case MLX5E_PROT_MASK(MLX5E_40GBASE_KR4):
 358        case MLX5E_PROT_MASK(MLX5E_40GBASE_SR4):
 359        case MLX5E_PROT_MASK(MLX5E_40GBASE_LR4):
 360                *active_width = IB_WIDTH_4X;
 361                *active_speed = IB_SPEED_QDR;
 362                break;
 363        case MLX5E_PROT_MASK(MLX5E_50GBASE_CR2):
 364        case MLX5E_PROT_MASK(MLX5E_50GBASE_KR2):
 365        case MLX5E_PROT_MASK(MLX5E_50GBASE_SR2):
 366                *active_width = IB_WIDTH_1X;
 367                *active_speed = IB_SPEED_HDR;
 368                break;
 369        case MLX5E_PROT_MASK(MLX5E_56GBASE_R4):
 370                *active_width = IB_WIDTH_4X;
 371                *active_speed = IB_SPEED_FDR;
 372                break;
 373        case MLX5E_PROT_MASK(MLX5E_100GBASE_CR4):
 374        case MLX5E_PROT_MASK(MLX5E_100GBASE_SR4):
 375        case MLX5E_PROT_MASK(MLX5E_100GBASE_KR4):
 376        case MLX5E_PROT_MASK(MLX5E_100GBASE_LR4):
 377                *active_width = IB_WIDTH_4X;
 378                *active_speed = IB_SPEED_EDR;
 379                break;
 380        default:
 381                return -EINVAL;
 382        }
 383
 384        return 0;
 385}
 386
 387static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u16 *active_speed,
 388                                        u8 *active_width)
 389{
 390        switch (eth_proto_oper) {
 391        case MLX5E_PROT_MASK(MLX5E_SGMII_100M):
 392        case MLX5E_PROT_MASK(MLX5E_1000BASE_X_SGMII):
 393                *active_width = IB_WIDTH_1X;
 394                *active_speed = IB_SPEED_SDR;
 395                break;
 396        case MLX5E_PROT_MASK(MLX5E_5GBASE_R):
 397                *active_width = IB_WIDTH_1X;
 398                *active_speed = IB_SPEED_DDR;
 399                break;
 400        case MLX5E_PROT_MASK(MLX5E_10GBASE_XFI_XAUI_1):
 401                *active_width = IB_WIDTH_1X;
 402                *active_speed = IB_SPEED_QDR;
 403                break;
 404        case MLX5E_PROT_MASK(MLX5E_40GBASE_XLAUI_4_XLPPI_4):
 405                *active_width = IB_WIDTH_4X;
 406                *active_speed = IB_SPEED_QDR;
 407                break;
 408        case MLX5E_PROT_MASK(MLX5E_25GAUI_1_25GBASE_CR_KR):
 409                *active_width = IB_WIDTH_1X;
 410                *active_speed = IB_SPEED_EDR;
 411                break;
 412        case MLX5E_PROT_MASK(MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2):
 413                *active_width = IB_WIDTH_2X;
 414                *active_speed = IB_SPEED_EDR;
 415                break;
 416        case MLX5E_PROT_MASK(MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR):
 417                *active_width = IB_WIDTH_1X;
 418                *active_speed = IB_SPEED_HDR;
 419                break;
 420        case MLX5E_PROT_MASK(MLX5E_CAUI_4_100GBASE_CR4_KR4):
 421                *active_width = IB_WIDTH_4X;
 422                *active_speed = IB_SPEED_EDR;
 423                break;
 424        case MLX5E_PROT_MASK(MLX5E_100GAUI_2_100GBASE_CR2_KR2):
 425                *active_width = IB_WIDTH_2X;
 426                *active_speed = IB_SPEED_HDR;
 427                break;
 428        case MLX5E_PROT_MASK(MLX5E_200GAUI_4_200GBASE_CR4_KR4):
 429                *active_width = IB_WIDTH_4X;
 430                *active_speed = IB_SPEED_HDR;
 431                break;
 432        default:
 433                return -EINVAL;
 434        }
 435
 436        return 0;
 437}
 438
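     /* Translate a PTYS eth_proto_oper bit into IB speed/width, using the
      * extended protocol namespace when @ext is set and the legacy one
      * otherwise (e.g. legacy 100GBASE_CR4 maps to 4X/EDR, extended
      * 50GAUI_1 maps to 1X/HDR).
      */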
 439static int translate_eth_proto_oper(u32 eth_proto_oper, u16 *active_speed,
 440                                    u8 *active_width, bool ext)
 441{
 442        return ext ?
 443                translate_eth_ext_proto_oper(eth_proto_oper, active_speed,
 444                                             active_width) :
 445                translate_eth_legacy_proto_oper(eth_proto_oper, active_speed,
 446                                                active_width);
 447}
 448
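     /* Fill ib_port_attr for an Ethernet (RoCE) port: speed/width come from
      * the PTYS register, link state and active MTU from the associated
      * netdev (or its LAG upper when bonding is active), and the qkey
      * violation counter from the NIC vport context. If the port is not yet
      * affiliated, the master mdev is queried instead and the netdev-derived
      * fields are skipped.
      */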
 449static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
 450                                struct ib_port_attr *props)
 451{
 452        struct mlx5_ib_dev *dev = to_mdev(device);
 453        u32 out[MLX5_ST_SZ_DW(ptys_reg)] = {0};
 454        struct mlx5_core_dev *mdev;
 455        struct net_device *ndev, *upper;
 456        enum ib_mtu ndev_ib_mtu;
 457        bool put_mdev = true;
 458        u16 qkey_viol_cntr;
 459        u32 eth_prot_oper;
 460        u8 mdev_port_num;
 461        bool ext;
 462        int err;
 463
 464        mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num);
 465        if (!mdev) {
 466                /* This means the port isn't affiliated yet. Get the
 467                 * info for the master port instead.
 468                 */
 469                put_mdev = false;
 470                mdev = dev->mdev;
 471                mdev_port_num = 1;
 472                port_num = 1;
 473        }
 474
  475        /* Possible bad flows are checked before filling out props, so in
  476         * case of an error props will still be zeroed out.
  477         * Use the native port in case of reps.
  478         */
 479        if (dev->is_rep)
 480                err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN,
 481                                           1);
 482        else
 483                err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN,
 484                                           mdev_port_num);
 485        if (err)
 486                goto out;
 487        ext = !!MLX5_GET_ETH_PROTO(ptys_reg, out, true, eth_proto_capability);
 488        eth_prot_oper = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_oper);
 489
 490        props->active_width     = IB_WIDTH_4X;
 491        props->active_speed     = IB_SPEED_QDR;
 492
 493        translate_eth_proto_oper(eth_prot_oper, &props->active_speed,
 494                                 &props->active_width, ext);
 495
 496        props->port_cap_flags |= IB_PORT_CM_SUP;
 497        props->ip_gids = true;
 498
 499        props->gid_tbl_len      = MLX5_CAP_ROCE(dev->mdev,
 500                                                roce_address_table_size);
 501        props->max_mtu          = IB_MTU_4096;
 502        props->max_msg_sz       = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg);
 503        props->pkey_tbl_len     = 1;
 504        props->state            = IB_PORT_DOWN;
 505        props->phys_state       = IB_PORT_PHYS_STATE_DISABLED;
 506
 507        mlx5_query_nic_vport_qkey_viol_cntr(mdev, &qkey_viol_cntr);
 508        props->qkey_viol_cntr = qkey_viol_cntr;
 509
 510        /* If this is a stub query for an unaffiliated port stop here */
 511        if (!put_mdev)
 512                goto out;
 513
 514        ndev = mlx5_ib_get_netdev(device, port_num);
 515        if (!ndev)
 516                goto out;
 517
 518        if (dev->lag_active) {
 519                rcu_read_lock();
 520                upper = netdev_master_upper_dev_get_rcu(ndev);
 521                if (upper) {
 522                        dev_put(ndev);
 523                        ndev = upper;
 524                        dev_hold(ndev);
 525                }
 526                rcu_read_unlock();
 527        }
 528
 529        if (netif_running(ndev) && netif_carrier_ok(ndev)) {
 530                props->state      = IB_PORT_ACTIVE;
 531                props->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
 532        }
 533
 534        ndev_ib_mtu = iboe_get_mtu(ndev->mtu);
 535
 536        dev_put(ndev);
 537
 538        props->active_mtu       = min(props->max_mtu, ndev_ib_mtu);
 539out:
 540        if (put_mdev)
 541                mlx5_ib_put_native_port_mdev(dev, port_num);
 542        return err;
 543}
 544
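     /* Program the hardware RoCE GID table entry at @index through
      * mlx5_core_roce_gid_set(). The RoCE version and L3 type are derived
      * from the GID attribute; a NULL @gid (the del_gid path) is used to
      * clear the entry.
      */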
 545static int set_roce_addr(struct mlx5_ib_dev *dev, u8 port_num,
 546                         unsigned int index, const union ib_gid *gid,
 547                         const struct ib_gid_attr *attr)
 548{
 549        enum ib_gid_type gid_type = IB_GID_TYPE_ROCE;
 550        u16 vlan_id = 0xffff;
 551        u8 roce_version = 0;
 552        u8 roce_l3_type = 0;
 553        u8 mac[ETH_ALEN];
 554        int ret;
 555
 556        if (gid) {
 557                gid_type = attr->gid_type;
 558                ret = rdma_read_gid_l2_fields(attr, &vlan_id, &mac[0]);
 559                if (ret)
 560                        return ret;
 561        }
 562
 563        switch (gid_type) {
 564        case IB_GID_TYPE_ROCE:
 565                roce_version = MLX5_ROCE_VERSION_1;
 566                break;
 567        case IB_GID_TYPE_ROCE_UDP_ENCAP:
 568                roce_version = MLX5_ROCE_VERSION_2;
 569                if (ipv6_addr_v4mapped((void *)gid))
 570                        roce_l3_type = MLX5_ROCE_L3_TYPE_IPV4;
 571                else
 572                        roce_l3_type = MLX5_ROCE_L3_TYPE_IPV6;
 573                break;
 574
 575        default:
 576                mlx5_ib_warn(dev, "Unexpected GID type %u\n", gid_type);
 577        }
 578
 579        return mlx5_core_roce_gid_set(dev->mdev, index, roce_version,
 580                                      roce_l3_type, gid->raw, mac,
 581                                      vlan_id < VLAN_CFI_MASK, vlan_id,
 582                                      port_num);
 583}
 584
 585static int mlx5_ib_add_gid(const struct ib_gid_attr *attr,
 586                           __always_unused void **context)
 587{
 588        return set_roce_addr(to_mdev(attr->device), attr->port_num,
 589                             attr->index, &attr->gid, attr);
 590}
 591
 592static int mlx5_ib_del_gid(const struct ib_gid_attr *attr,
 593                           __always_unused void **context)
 594{
 595        return set_roce_addr(to_mdev(attr->device), attr->port_num,
 596                             attr->index, NULL, NULL);
 597}
 598
 599__be16 mlx5_get_roce_udp_sport_min(const struct mlx5_ib_dev *dev,
 600                                   const struct ib_gid_attr *attr)
 601{
 602        if (attr->gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP)
 603                return 0;
 604
 605        return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port));
 606}
 607
 608static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev)
 609{
 610        if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB)
 611                return !MLX5_CAP_GEN(dev->mdev, ib_virt);
 612        return 0;
 613}
 614
 615enum {
 616        MLX5_VPORT_ACCESS_METHOD_MAD,
 617        MLX5_VPORT_ACCESS_METHOD_HCA,
 618        MLX5_VPORT_ACCESS_METHOD_NIC,
 619};
 620
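     /* Select how vport attributes are queried: the MAD interface when the
      * device is an IB port without ib_virt support, the NIC vport commands
      * when port 1 uses the Ethernet link layer, and the HCA vport commands
      * otherwise.
      */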
 621static int mlx5_get_vport_access_method(struct ib_device *ibdev)
 622{
 623        if (mlx5_use_mad_ifc(to_mdev(ibdev)))
 624                return MLX5_VPORT_ACCESS_METHOD_MAD;
 625
 626        if (mlx5_ib_port_link_layer(ibdev, 1) ==
 627            IB_LINK_LAYER_ETHERNET)
 628                return MLX5_VPORT_ACCESS_METHOD_NIC;
 629
 630        return MLX5_VPORT_ACCESS_METHOD_HCA;
 631}
 632
 633static void get_atomic_caps(struct mlx5_ib_dev *dev,
 634                            u8 atomic_size_qp,
 635                            struct ib_device_attr *props)
 636{
 637        u8 tmp;
 638        u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations);
 639        u8 atomic_req_8B_endianness_mode =
 640                MLX5_CAP_ATOMIC(dev->mdev, atomic_req_8B_endianness_mode);
 641
  642        /* Check if HW supports standard 8-byte atomic operations and is
  643         * capable of responding in host endianness.
  644         */
 645        tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD;
 646        if (((atomic_operations & tmp) == tmp) &&
 647            (atomic_size_qp & MLX5_ATOMIC_SIZE_QP_8BYTES) &&
 648            (atomic_req_8B_endianness_mode)) {
 649                props->atomic_cap = IB_ATOMIC_HCA;
 650        } else {
 651                props->atomic_cap = IB_ATOMIC_NONE;
 652        }
 653}
 654
 655static void get_atomic_caps_qp(struct mlx5_ib_dev *dev,
 656                               struct ib_device_attr *props)
 657{
 658        u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp);
 659
 660        get_atomic_caps(dev, atomic_size_qp, props);
 661}
 662
 663static int mlx5_query_system_image_guid(struct ib_device *ibdev,
 664                                        __be64 *sys_image_guid)
 665{
 666        struct mlx5_ib_dev *dev = to_mdev(ibdev);
 667        struct mlx5_core_dev *mdev = dev->mdev;
 668        u64 tmp;
 669        int err;
 670
 671        switch (mlx5_get_vport_access_method(ibdev)) {
 672        case MLX5_VPORT_ACCESS_METHOD_MAD:
 673                return mlx5_query_mad_ifc_system_image_guid(ibdev,
 674                                                            sys_image_guid);
 675
 676        case MLX5_VPORT_ACCESS_METHOD_HCA:
 677                err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp);
 678                break;
 679
 680        case MLX5_VPORT_ACCESS_METHOD_NIC:
 681                err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp);
 682                break;
 683
 684        default:
 685                return -EINVAL;
 686        }
 687
 688        if (!err)
 689                *sys_image_guid = cpu_to_be64(tmp);
 690
 691        return err;
 692
 693}
 694
 695static int mlx5_query_max_pkeys(struct ib_device *ibdev,
 696                                u16 *max_pkeys)
 697{
 698        struct mlx5_ib_dev *dev = to_mdev(ibdev);
 699        struct mlx5_core_dev *mdev = dev->mdev;
 700
 701        switch (mlx5_get_vport_access_method(ibdev)) {
 702        case MLX5_VPORT_ACCESS_METHOD_MAD:
 703                return mlx5_query_mad_ifc_max_pkeys(ibdev, max_pkeys);
 704
 705        case MLX5_VPORT_ACCESS_METHOD_HCA:
 706        case MLX5_VPORT_ACCESS_METHOD_NIC:
 707                *max_pkeys = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev,
 708                                                pkey_table_size));
 709                return 0;
 710
 711        default:
 712                return -EINVAL;
 713        }
 714}
 715
 716static int mlx5_query_vendor_id(struct ib_device *ibdev,
 717                                u32 *vendor_id)
 718{
 719        struct mlx5_ib_dev *dev = to_mdev(ibdev);
 720
 721        switch (mlx5_get_vport_access_method(ibdev)) {
 722        case MLX5_VPORT_ACCESS_METHOD_MAD:
 723                return mlx5_query_mad_ifc_vendor_id(ibdev, vendor_id);
 724
 725        case MLX5_VPORT_ACCESS_METHOD_HCA:
 726        case MLX5_VPORT_ACCESS_METHOD_NIC:
 727                return mlx5_core_query_vendor_id(dev->mdev, vendor_id);
 728
 729        default:
 730                return -EINVAL;
 731        }
 732}
 733
 734static int mlx5_query_node_guid(struct mlx5_ib_dev *dev,
 735                                __be64 *node_guid)
 736{
 737        u64 tmp;
 738        int err;
 739
 740        switch (mlx5_get_vport_access_method(&dev->ib_dev)) {
 741        case MLX5_VPORT_ACCESS_METHOD_MAD:
 742                return mlx5_query_mad_ifc_node_guid(dev, node_guid);
 743
 744        case MLX5_VPORT_ACCESS_METHOD_HCA:
 745                err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp);
 746                break;
 747
 748        case MLX5_VPORT_ACCESS_METHOD_NIC:
 749                err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp);
 750                break;
 751
 752        default:
 753                return -EINVAL;
 754        }
 755
 756        if (!err)
 757                *node_guid = cpu_to_be64(tmp);
 758
 759        return err;
 760}
 761
 762struct mlx5_reg_node_desc {
 763        u8      desc[IB_DEVICE_NODE_DESC_MAX];
 764};
 765
 766static int mlx5_query_node_desc(struct mlx5_ib_dev *dev, char *node_desc)
 767{
 768        struct mlx5_reg_node_desc in;
 769
 770        if (mlx5_use_mad_ifc(dev))
 771                return mlx5_query_mad_ifc_node_desc(dev, node_desc);
 772
 773        memset(&in, 0, sizeof(in));
 774
 775        return mlx5_core_access_reg(dev->mdev, &in, sizeof(in), node_desc,
 776                                    sizeof(struct mlx5_reg_node_desc),
 777                                    MLX5_REG_NODE_DESC, 0, 0);
 778}
 779
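     /* Fill struct ib_device_attr from the HCA capabilities. When invoked
      * through the extended uverbs path (@uhw provided), optional capability
      * blocks (TSO, RSS, CQE compression, packet pacing, SW parsing,
      * striding RQ, tunnel offloads, ...) are appended to the response; each
      * block bumps resp.response_length only if it fits within uhw->outlen.
      */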
 780static int mlx5_ib_query_device(struct ib_device *ibdev,
 781                                struct ib_device_attr *props,
 782                                struct ib_udata *uhw)
 783{
 784        size_t uhw_outlen = (uhw) ? uhw->outlen : 0;
 785        struct mlx5_ib_dev *dev = to_mdev(ibdev);
 786        struct mlx5_core_dev *mdev = dev->mdev;
 787        int err = -ENOMEM;
 788        int max_sq_desc;
 789        int max_rq_sg;
 790        int max_sq_sg;
 791        u64 min_page_size = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz);
 792        bool raw_support = !mlx5_core_mp_enabled(mdev);
 793        struct mlx5_ib_query_device_resp resp = {};
 794        size_t resp_len;
 795        u64 max_tso;
 796
 797        resp_len = sizeof(resp.comp_mask) + sizeof(resp.response_length);
 798        if (uhw_outlen && uhw_outlen < resp_len)
 799                return -EINVAL;
 800
 801        resp.response_length = resp_len;
 802
 803        if (uhw && uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen))
 804                return -EINVAL;
 805
 806        memset(props, 0, sizeof(*props));
 807        err = mlx5_query_system_image_guid(ibdev,
 808                                           &props->sys_image_guid);
 809        if (err)
 810                return err;
 811
 812        err = mlx5_query_max_pkeys(ibdev, &props->max_pkeys);
 813        if (err)
 814                return err;
 815
 816        err = mlx5_query_vendor_id(ibdev, &props->vendor_id);
 817        if (err)
 818                return err;
 819
 820        props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) |
 821                (fw_rev_min(dev->mdev) << 16) |
 822                fw_rev_sub(dev->mdev);
 823        props->device_cap_flags    = IB_DEVICE_CHANGE_PHY_PORT |
 824                IB_DEVICE_PORT_ACTIVE_EVENT             |
 825                IB_DEVICE_SYS_IMAGE_GUID                |
 826                IB_DEVICE_RC_RNR_NAK_GEN;
 827
 828        if (MLX5_CAP_GEN(mdev, pkv))
 829                props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
 830        if (MLX5_CAP_GEN(mdev, qkv))
 831                props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
 832        if (MLX5_CAP_GEN(mdev, apm))
 833                props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
 834        if (MLX5_CAP_GEN(mdev, xrc))
 835                props->device_cap_flags |= IB_DEVICE_XRC;
 836        if (MLX5_CAP_GEN(mdev, imaicl)) {
 837                props->device_cap_flags |= IB_DEVICE_MEM_WINDOW |
 838                                           IB_DEVICE_MEM_WINDOW_TYPE_2B;
 839                props->max_mw = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
 840                /* We support 'Gappy' memory registration too */
 841                props->device_cap_flags |= IB_DEVICE_SG_GAPS_REG;
 842        }
 843        /* IB_WR_REG_MR always requires changing the entity size with UMR */
 844        if (!MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled))
 845                props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
 846        if (MLX5_CAP_GEN(mdev, sho)) {
 847                props->device_cap_flags |= IB_DEVICE_INTEGRITY_HANDOVER;
 848                /* At this stage no support for signature handover */
 849                props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 |
 850                                      IB_PROT_T10DIF_TYPE_2 |
 851                                      IB_PROT_T10DIF_TYPE_3;
 852                props->sig_guard_cap = IB_GUARD_T10DIF_CRC |
 853                                       IB_GUARD_T10DIF_CSUM;
 854        }
 855        if (MLX5_CAP_GEN(mdev, block_lb_mc))
 856                props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
 857
 858        if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && raw_support) {
 859                if (MLX5_CAP_ETH(mdev, csum_cap)) {
 860                        /* Legacy bit to support old userspace libraries */
 861                        props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM;
 862                        props->raw_packet_caps |= IB_RAW_PACKET_CAP_IP_CSUM;
 863                }
 864
 865                if (MLX5_CAP_ETH(dev->mdev, vlan_cap))
 866                        props->raw_packet_caps |=
 867                                IB_RAW_PACKET_CAP_CVLAN_STRIPPING;
 868
 869                if (offsetofend(typeof(resp), tso_caps) <= uhw_outlen) {
 870                        max_tso = MLX5_CAP_ETH(mdev, max_lso_cap);
 871                        if (max_tso) {
 872                                resp.tso_caps.max_tso = 1 << max_tso;
 873                                resp.tso_caps.supported_qpts |=
 874                                        1 << IB_QPT_RAW_PACKET;
 875                                resp.response_length += sizeof(resp.tso_caps);
 876                        }
 877                }
 878
 879                if (offsetofend(typeof(resp), rss_caps) <= uhw_outlen) {
 880                        resp.rss_caps.rx_hash_function =
 881                                                MLX5_RX_HASH_FUNC_TOEPLITZ;
 882                        resp.rss_caps.rx_hash_fields_mask =
 883                                                MLX5_RX_HASH_SRC_IPV4 |
 884                                                MLX5_RX_HASH_DST_IPV4 |
 885                                                MLX5_RX_HASH_SRC_IPV6 |
 886                                                MLX5_RX_HASH_DST_IPV6 |
 887                                                MLX5_RX_HASH_SRC_PORT_TCP |
 888                                                MLX5_RX_HASH_DST_PORT_TCP |
 889                                                MLX5_RX_HASH_SRC_PORT_UDP |
 890                                                MLX5_RX_HASH_DST_PORT_UDP |
 891                                                MLX5_RX_HASH_INNER;
 892                        if (mlx5_accel_ipsec_device_caps(dev->mdev) &
 893                            MLX5_ACCEL_IPSEC_CAP_DEVICE)
 894                                resp.rss_caps.rx_hash_fields_mask |=
 895                                        MLX5_RX_HASH_IPSEC_SPI;
 896                        resp.response_length += sizeof(resp.rss_caps);
 897                }
 898        } else {
 899                if (offsetofend(typeof(resp), tso_caps) <= uhw_outlen)
 900                        resp.response_length += sizeof(resp.tso_caps);
 901                if (offsetofend(typeof(resp), rss_caps) <= uhw_outlen)
 902                        resp.response_length += sizeof(resp.rss_caps);
 903        }
 904
 905        if (MLX5_CAP_GEN(mdev, ipoib_basic_offloads)) {
 906                props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
 907                props->device_cap_flags |= IB_DEVICE_UD_TSO;
 908        }
 909
 910        if (MLX5_CAP_GEN(dev->mdev, rq_delay_drop) &&
 911            MLX5_CAP_GEN(dev->mdev, general_notification_event) &&
 912            raw_support)
 913                props->raw_packet_caps |= IB_RAW_PACKET_CAP_DELAY_DROP;
 914
 915        if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads) &&
 916            MLX5_CAP_IPOIB_ENHANCED(mdev, csum_cap))
 917                props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
 918
 919        if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) &&
 920            MLX5_CAP_ETH(dev->mdev, scatter_fcs) &&
 921            raw_support) {
 922                /* Legacy bit to support old userspace libraries */
 923                props->device_cap_flags |= IB_DEVICE_RAW_SCATTER_FCS;
 924                props->raw_packet_caps |= IB_RAW_PACKET_CAP_SCATTER_FCS;
 925        }
 926
 927        if (MLX5_CAP_DEV_MEM(mdev, memic)) {
 928                props->max_dm_size =
 929                        MLX5_CAP_DEV_MEM(mdev, max_memic_size);
 930        }
 931
 932        if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS))
 933                props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING;
 934
 935        if (MLX5_CAP_GEN(mdev, end_pad))
 936                props->device_cap_flags |= IB_DEVICE_PCI_WRITE_END_PADDING;
 937
 938        props->vendor_part_id      = mdev->pdev->device;
 939        props->hw_ver              = mdev->pdev->revision;
 940
 941        props->max_mr_size         = ~0ull;
 942        props->page_size_cap       = ~(min_page_size - 1);
 943        props->max_qp              = 1 << MLX5_CAP_GEN(mdev, log_max_qp);
 944        props->max_qp_wr           = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
 945        max_rq_sg =  MLX5_CAP_GEN(mdev, max_wqe_sz_rq) /
 946                     sizeof(struct mlx5_wqe_data_seg);
 947        max_sq_desc = min_t(int, MLX5_CAP_GEN(mdev, max_wqe_sz_sq), 512);
 948        max_sq_sg = (max_sq_desc - sizeof(struct mlx5_wqe_ctrl_seg) -
 949                     sizeof(struct mlx5_wqe_raddr_seg)) /
 950                sizeof(struct mlx5_wqe_data_seg);
 951        props->max_send_sge = max_sq_sg;
 952        props->max_recv_sge = max_rq_sg;
 953        props->max_sge_rd          = MLX5_MAX_SGE_RD;
 954        props->max_cq              = 1 << MLX5_CAP_GEN(mdev, log_max_cq);
 955        props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1;
 956        props->max_mr              = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
 957        props->max_pd              = 1 << MLX5_CAP_GEN(mdev, log_max_pd);
 958        props->max_qp_rd_atom      = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp);
 959        props->max_qp_init_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_res_qp);
 960        props->max_srq             = 1 << MLX5_CAP_GEN(mdev, log_max_srq);
 961        props->max_srq_wr = (1 << MLX5_CAP_GEN(mdev, log_max_srq_sz)) - 1;
 962        props->local_ca_ack_delay  = MLX5_CAP_GEN(mdev, local_ca_ack_delay);
 963        props->max_res_rd_atom     = props->max_qp_rd_atom * props->max_qp;
 964        props->max_srq_sge         = max_rq_sg - 1;
 965        props->max_fast_reg_page_list_len =
 966                1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size);
 967        props->max_pi_fast_reg_page_list_len =
 968                props->max_fast_reg_page_list_len / 2;
 969        props->max_sgl_rd =
 970                MLX5_CAP_GEN(mdev, max_sgl_for_optimized_performance);
 971        get_atomic_caps_qp(dev, props);
 972        props->masked_atomic_cap   = IB_ATOMIC_NONE;
 973        props->max_mcast_grp       = 1 << MLX5_CAP_GEN(mdev, log_max_mcg);
 974        props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg);
 975        props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
 976                                           props->max_mcast_grp;
 977        props->max_ah = INT_MAX;
 978        props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz);
 979        props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL;
 980
 981        if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
 982                if (dev->odp_caps.general_caps & IB_ODP_SUPPORT)
 983                        props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
 984                props->odp_caps = dev->odp_caps;
 985                if (!uhw) {
 986                        /* ODP for kernel QPs is not implemented for receive
 987                         * WQEs and SRQ WQEs
 988                         */
 989                        props->odp_caps.per_transport_caps.rc_odp_caps &=
 990                                ~(IB_ODP_SUPPORT_READ |
 991                                  IB_ODP_SUPPORT_SRQ_RECV);
 992                        props->odp_caps.per_transport_caps.uc_odp_caps &=
 993                                ~(IB_ODP_SUPPORT_READ |
 994                                  IB_ODP_SUPPORT_SRQ_RECV);
 995                        props->odp_caps.per_transport_caps.ud_odp_caps &=
 996                                ~(IB_ODP_SUPPORT_READ |
 997                                  IB_ODP_SUPPORT_SRQ_RECV);
 998                        props->odp_caps.per_transport_caps.xrc_odp_caps &=
 999                                ~(IB_ODP_SUPPORT_READ |
1000                                  IB_ODP_SUPPORT_SRQ_RECV);
1001                }
1002        }
1003
1004        if (MLX5_CAP_GEN(mdev, cd))
1005                props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL;
1006
1007        if (mlx5_core_is_vf(mdev))
1008                props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION;
1009
1010        if (mlx5_ib_port_link_layer(ibdev, 1) ==
1011            IB_LINK_LAYER_ETHERNET && raw_support) {
1012                props->rss_caps.max_rwq_indirection_tables =
1013                        1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt);
1014                props->rss_caps.max_rwq_indirection_table_size =
1015                        1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt_size);
1016                props->rss_caps.supported_qpts = 1 << IB_QPT_RAW_PACKET;
1017                props->max_wq_type_rq =
1018                        1 << MLX5_CAP_GEN(dev->mdev, log_max_rq);
1019        }
1020
1021        if (MLX5_CAP_GEN(mdev, tag_matching)) {
1022                props->tm_caps.max_num_tags =
1023                        (1 << MLX5_CAP_GEN(mdev, log_tag_matching_list_sz)) - 1;
1024                props->tm_caps.max_ops =
1025                        1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
1026                props->tm_caps.max_sge = MLX5_TM_MAX_SGE;
1027        }
1028
1029        if (MLX5_CAP_GEN(mdev, tag_matching) &&
1030            MLX5_CAP_GEN(mdev, rndv_offload_rc)) {
1031                props->tm_caps.flags = IB_TM_CAP_RNDV_RC;
1032                props->tm_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE;
1033        }
1034
1035        if (MLX5_CAP_GEN(dev->mdev, cq_moderation)) {
1036                props->cq_caps.max_cq_moderation_count =
1037                                                MLX5_MAX_CQ_COUNT;
1038                props->cq_caps.max_cq_moderation_period =
1039                                                MLX5_MAX_CQ_PERIOD;
1040        }
1041
1042        if (offsetofend(typeof(resp), cqe_comp_caps) <= uhw_outlen) {
1043                resp.response_length += sizeof(resp.cqe_comp_caps);
1044
1045                if (MLX5_CAP_GEN(dev->mdev, cqe_compression)) {
1046                        resp.cqe_comp_caps.max_num =
1047                                MLX5_CAP_GEN(dev->mdev,
1048                                             cqe_compression_max_num);
1049
1050                        resp.cqe_comp_caps.supported_format =
1051                                MLX5_IB_CQE_RES_FORMAT_HASH |
1052                                MLX5_IB_CQE_RES_FORMAT_CSUM;
1053
1054                        if (MLX5_CAP_GEN(dev->mdev, mini_cqe_resp_stride_index))
1055                                resp.cqe_comp_caps.supported_format |=
1056                                        MLX5_IB_CQE_RES_FORMAT_CSUM_STRIDX;
1057                }
1058        }
1059
1060        if (offsetofend(typeof(resp), packet_pacing_caps) <= uhw_outlen &&
1061            raw_support) {
1062                if (MLX5_CAP_QOS(mdev, packet_pacing) &&
1063                    MLX5_CAP_GEN(mdev, qos)) {
1064                        resp.packet_pacing_caps.qp_rate_limit_max =
1065                                MLX5_CAP_QOS(mdev, packet_pacing_max_rate);
1066                        resp.packet_pacing_caps.qp_rate_limit_min =
1067                                MLX5_CAP_QOS(mdev, packet_pacing_min_rate);
1068                        resp.packet_pacing_caps.supported_qpts |=
1069                                1 << IB_QPT_RAW_PACKET;
1070                        if (MLX5_CAP_QOS(mdev, packet_pacing_burst_bound) &&
1071                            MLX5_CAP_QOS(mdev, packet_pacing_typical_size))
1072                                resp.packet_pacing_caps.cap_flags |=
1073                                        MLX5_IB_PP_SUPPORT_BURST;
1074                }
1075                resp.response_length += sizeof(resp.packet_pacing_caps);
1076        }
1077
1078        if (offsetofend(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes) <=
1079            uhw_outlen) {
1080                if (MLX5_CAP_ETH(mdev, multi_pkt_send_wqe))
1081                        resp.mlx5_ib_support_multi_pkt_send_wqes =
1082                                MLX5_IB_ALLOW_MPW;
1083
1084                if (MLX5_CAP_ETH(mdev, enhanced_multi_pkt_send_wqe))
1085                        resp.mlx5_ib_support_multi_pkt_send_wqes |=
1086                                MLX5_IB_SUPPORT_EMPW;
1087
1088                resp.response_length +=
1089                        sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes);
1090        }
1091
1092        if (offsetofend(typeof(resp), flags) <= uhw_outlen) {
1093                resp.response_length += sizeof(resp.flags);
1094
1095                if (MLX5_CAP_GEN(mdev, cqe_compression_128))
1096                        resp.flags |=
1097                                MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_COMP;
1098
1099                if (MLX5_CAP_GEN(mdev, cqe_128_always))
1100                        resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_PAD;
1101                if (MLX5_CAP_GEN(mdev, qp_packet_based))
1102                        resp.flags |=
1103                                MLX5_IB_QUERY_DEV_RESP_PACKET_BASED_CREDIT_MODE;
1104
1105                resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT;
1106        }
1107
1108        if (offsetofend(typeof(resp), sw_parsing_caps) <= uhw_outlen) {
1109                resp.response_length += sizeof(resp.sw_parsing_caps);
1110                if (MLX5_CAP_ETH(mdev, swp)) {
1111                        resp.sw_parsing_caps.sw_parsing_offloads |=
1112                                MLX5_IB_SW_PARSING;
1113
1114                        if (MLX5_CAP_ETH(mdev, swp_csum))
1115                                resp.sw_parsing_caps.sw_parsing_offloads |=
1116                                        MLX5_IB_SW_PARSING_CSUM;
1117
1118                        if (MLX5_CAP_ETH(mdev, swp_lso))
1119                                resp.sw_parsing_caps.sw_parsing_offloads |=
1120                                        MLX5_IB_SW_PARSING_LSO;
1121
1122                        if (resp.sw_parsing_caps.sw_parsing_offloads)
1123                                resp.sw_parsing_caps.supported_qpts =
1124                                        BIT(IB_QPT_RAW_PACKET);
1125                }
1126        }
1127
1128        if (offsetofend(typeof(resp), striding_rq_caps) <= uhw_outlen &&
1129            raw_support) {
1130                resp.response_length += sizeof(resp.striding_rq_caps);
1131                if (MLX5_CAP_GEN(mdev, striding_rq)) {
1132                        resp.striding_rq_caps.min_single_stride_log_num_of_bytes =
1133                                MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES;
1134                        resp.striding_rq_caps.max_single_stride_log_num_of_bytes =
1135                                MLX5_MAX_SINGLE_STRIDE_LOG_NUM_BYTES;
1136                        if (MLX5_CAP_GEN(dev->mdev, ext_stride_num_range))
1137                                resp.striding_rq_caps
1138                                        .min_single_wqe_log_num_of_strides =
1139                                        MLX5_EXT_MIN_SINGLE_WQE_LOG_NUM_STRIDES;
1140                        else
1141                                resp.striding_rq_caps
1142                                        .min_single_wqe_log_num_of_strides =
1143                                        MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES;
1144                        resp.striding_rq_caps.max_single_wqe_log_num_of_strides =
1145                                MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES;
1146                        resp.striding_rq_caps.supported_qpts =
1147                                BIT(IB_QPT_RAW_PACKET);
1148                }
1149        }
1150
1151        if (offsetofend(typeof(resp), tunnel_offloads_caps) <= uhw_outlen) {
1152                resp.response_length += sizeof(resp.tunnel_offloads_caps);
1153                if (MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan))
1154                        resp.tunnel_offloads_caps |=
1155                                MLX5_IB_TUNNELED_OFFLOADS_VXLAN;
1156                if (MLX5_CAP_ETH(mdev, tunnel_stateless_geneve_rx))
1157                        resp.tunnel_offloads_caps |=
1158                                MLX5_IB_TUNNELED_OFFLOADS_GENEVE;
1159                if (MLX5_CAP_ETH(mdev, tunnel_stateless_gre))
1160                        resp.tunnel_offloads_caps |=
1161                                MLX5_IB_TUNNELED_OFFLOADS_GRE;
1162                if (MLX5_CAP_ETH(mdev, tunnel_stateless_mpls_over_gre))
1163                        resp.tunnel_offloads_caps |=
1164                                MLX5_IB_TUNNELED_OFFLOADS_MPLS_GRE;
1165                if (MLX5_CAP_ETH(mdev, tunnel_stateless_mpls_over_udp))
1166                        resp.tunnel_offloads_caps |=
1167                                MLX5_IB_TUNNELED_OFFLOADS_MPLS_UDP;
1168        }
1169
1170        if (uhw_outlen) {
1171                err = ib_copy_to_udata(uhw, &resp, resp.response_length);
1172
1173                if (err)
1174                        return err;
1175        }
1176
1177        return 0;
1178}
1179
1180static void translate_active_width(struct ib_device *ibdev, u16 active_width,
1181                                   u8 *ib_width)
1182{
1183        struct mlx5_ib_dev *dev = to_mdev(ibdev);
1184
1185        if (active_width & MLX5_PTYS_WIDTH_1X)
1186                *ib_width = IB_WIDTH_1X;
1187        else if (active_width & MLX5_PTYS_WIDTH_2X)
1188                *ib_width = IB_WIDTH_2X;
1189        else if (active_width & MLX5_PTYS_WIDTH_4X)
1190                *ib_width = IB_WIDTH_4X;
1191        else if (active_width & MLX5_PTYS_WIDTH_8X)
1192                *ib_width = IB_WIDTH_8X;
1193        else if (active_width & MLX5_PTYS_WIDTH_12X)
1194                *ib_width = IB_WIDTH_12X;
1195        else {
1196                mlx5_ib_dbg(dev, "Invalid active_width %d, setting width to default value: 4x\n",
1197                            active_width);
1198                *ib_width = IB_WIDTH_4X;
1199        }
1200
1201        return;
1202}
1203
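     /* Map an MTU in bytes to its IB encoding: 256..4096 translate to the
      * enum ib_mtu values IB_MTU_256..IB_MTU_4096 (1..5); any other value is
      * rejected with -1.
      */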
1204static int mlx5_mtu_to_ib_mtu(int mtu)
1205{
1206        switch (mtu) {
1207        case 256: return 1;
1208        case 512: return 2;
1209        case 1024: return 3;
1210        case 2048: return 4;
1211        case 4096: return 5;
1212        default:
1213                pr_warn("invalid mtu\n");
1214                return -1;
1215        }
1216}
1217
1218enum ib_max_vl_num {
1219        __IB_MAX_VL_0           = 1,
1220        __IB_MAX_VL_0_1         = 2,
1221        __IB_MAX_VL_0_3         = 3,
1222        __IB_MAX_VL_0_7         = 4,
1223        __IB_MAX_VL_0_14        = 5,
1224};
1225
1226enum mlx5_vl_hw_cap {
1227        MLX5_VL_HW_0    = 1,
1228        MLX5_VL_HW_0_1  = 2,
1229        MLX5_VL_HW_0_2  = 3,
1230        MLX5_VL_HW_0_3  = 4,
1231        MLX5_VL_HW_0_4  = 5,
1232        MLX5_VL_HW_0_5  = 6,
1233        MLX5_VL_HW_0_6  = 7,
1234        MLX5_VL_HW_0_7  = 8,
1235        MLX5_VL_HW_0_14 = 15
1236};
1237
1238static int translate_max_vl_num(struct ib_device *ibdev, u8 vl_hw_cap,
1239                                u8 *max_vl_num)
1240{
1241        switch (vl_hw_cap) {
1242        case MLX5_VL_HW_0:
1243                *max_vl_num = __IB_MAX_VL_0;
1244                break;
1245        case MLX5_VL_HW_0_1:
1246                *max_vl_num = __IB_MAX_VL_0_1;
1247                break;
1248        case MLX5_VL_HW_0_3:
1249                *max_vl_num = __IB_MAX_VL_0_3;
1250                break;
1251        case MLX5_VL_HW_0_7:
1252                *max_vl_num = __IB_MAX_VL_0_7;
1253                break;
1254        case MLX5_VL_HW_0_14:
1255                *max_vl_num = __IB_MAX_VL_0_14;
1256                break;
1257
1258        default:
1259                return -EINVAL;
1260        }
1261
1262        return 0;
1263}
1264
1265static int mlx5_query_hca_port(struct ib_device *ibdev, u8 port,
1266                               struct ib_port_attr *props)
1267{
1268        struct mlx5_ib_dev *dev = to_mdev(ibdev);
1269        struct mlx5_core_dev *mdev = dev->mdev;
1270        struct mlx5_hca_vport_context *rep;
1271        u16 max_mtu;
1272        u16 oper_mtu;
1273        int err;
1274        u16 ib_link_width_oper;
1275        u8 vl_hw_cap;
1276
1277        rep = kzalloc(sizeof(*rep), GFP_KERNEL);
1278        if (!rep) {
1279                err = -ENOMEM;
1280                goto out;
1281        }
1282
1283        /* props being zeroed by the caller, avoid zeroing it here */
1284
1285        err = mlx5_query_hca_vport_context(mdev, 0, port, 0, rep);
1286        if (err)
1287                goto out;
1288
1289        props->lid              = rep->lid;
1290        props->lmc              = rep->lmc;
1291        props->sm_lid           = rep->sm_lid;
1292        props->sm_sl            = rep->sm_sl;
1293        props->state            = rep->vport_state;
1294        props->phys_state       = rep->port_physical_state;
1295        props->port_cap_flags   = rep->cap_mask1;
1296        props->gid_tbl_len      = mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size));
1297        props->max_msg_sz       = 1 << MLX5_CAP_GEN(mdev, log_max_msg);
1298        props->pkey_tbl_len     = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size));
1299        props->bad_pkey_cntr    = rep->pkey_violation_counter;
1300        props->qkey_viol_cntr   = rep->qkey_violation_counter;
1301        props->subnet_timeout   = rep->subnet_timeout;
1302        props->init_type_reply  = rep->init_type_reply;
1303
1304        if (props->port_cap_flags & IB_PORT_CAP_MASK2_SUP)
1305                props->port_cap_flags2 = rep->cap_mask2;
1306
1307        err = mlx5_query_ib_port_oper(mdev, &ib_link_width_oper,
1308                                      &props->active_speed, port);
1309        if (err)
1310                goto out;
1311
1312        translate_active_width(ibdev, ib_link_width_oper, &props->active_width);
1313
1314        mlx5_query_port_max_mtu(mdev, &max_mtu, port);
1315
1316        props->max_mtu = mlx5_mtu_to_ib_mtu(max_mtu);
1317
1318        mlx5_query_port_oper_mtu(mdev, &oper_mtu, port);
1319
1320        props->active_mtu = mlx5_mtu_to_ib_mtu(oper_mtu);
1321
1322        err = mlx5_query_port_vl_hw_cap(mdev, &vl_hw_cap, port);
1323        if (err)
1324                goto out;
1325
1326        err = translate_max_vl_num(ibdev, vl_hw_cap,
1327                                   &props->max_vl_num);
1328out:
1329        kfree(rep);
1330        return err;
1331}
1332
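     /* Query port attributes via the access method appropriate for the
      * device (MAD, HCA vport or NIC/RoCE), then shrink gid_tbl_len by the
      * number of GID entries reserved by the core on the native mdev.
      */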
1333int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
1334                       struct ib_port_attr *props)
1335{
1336        unsigned int count;
1337        int ret;
1338
1339        switch (mlx5_get_vport_access_method(ibdev)) {
1340        case MLX5_VPORT_ACCESS_METHOD_MAD:
1341                ret = mlx5_query_mad_ifc_port(ibdev, port, props);
1342                break;
1343
1344        case MLX5_VPORT_ACCESS_METHOD_HCA:
1345                ret = mlx5_query_hca_port(ibdev, port, props);
1346                break;
1347
1348        case MLX5_VPORT_ACCESS_METHOD_NIC:
1349                ret = mlx5_query_port_roce(ibdev, port, props);
1350                break;
1351
1352        default:
1353                ret = -EINVAL;
1354        }
1355
1356        if (!ret && props) {
1357                struct mlx5_ib_dev *dev = to_mdev(ibdev);
1358                struct mlx5_core_dev *mdev;
1359                bool put_mdev = true;
1360
1361                mdev = mlx5_ib_get_native_port_mdev(dev, port, NULL);
1362                if (!mdev) {
1363                        /* If the port isn't affiliated yet, query the master.
1364                         * The master and slave will have the same values.
1365                         */
1366                        mdev = dev->mdev;
1367                        port = 1;
1368                        put_mdev = false;
1369                }
1370                count = mlx5_core_reserved_gids_count(mdev);
1371                if (put_mdev)
1372                        mlx5_ib_put_native_port_mdev(dev, port);
1373                props->gid_tbl_len -= count;
1374        }
1375        return ret;
1376}
1377
1378static int mlx5_ib_rep_query_port(struct ib_device *ibdev, u8 port,
1379                                  struct ib_port_attr *props)
1380{
1381        int ret;
1382
1383        /* Only link layer == Ethernet is valid for representors,
1384         * and we always use port 1.
1385         */
1386        ret = mlx5_query_port_roce(ibdev, port, props);
1387        if (ret || !props)
1388                return ret;
1389
1390        /* We don't support GIDs */
1391        props->gid_tbl_len = 0;
1392
1393        return ret;
1394}
1395
1396static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
1397                             union ib_gid *gid)
1398{
1399        struct mlx5_ib_dev *dev = to_mdev(ibdev);
1400        struct mlx5_core_dev *mdev = dev->mdev;
1401
1402        switch (mlx5_get_vport_access_method(ibdev)) {
1403        case MLX5_VPORT_ACCESS_METHOD_MAD:
1404                return mlx5_query_mad_ifc_gids(ibdev, port, index, gid);
1405
1406        case MLX5_VPORT_ACCESS_METHOD_HCA:
1407                return mlx5_query_hca_vport_gid(mdev, 0, port, 0, index, gid);
1408
1409        default:
1410                return -EINVAL;
1411        }
1412
1413}
1414
1415static int mlx5_query_hca_nic_pkey(struct ib_device *ibdev, u8 port,
1416                                   u16 index, u16 *pkey)
1417{
1418        struct mlx5_ib_dev *dev = to_mdev(ibdev);
1419        struct mlx5_core_dev *mdev;
1420        bool put_mdev = true;
1421        u8 mdev_port_num;
1422        int err;
1423
1424        mdev = mlx5_ib_get_native_port_mdev(dev, port, &mdev_port_num);
1425        if (!mdev) {
1426                /* The port isn't affiliated yet; get the PKey from the master
1427                 * port. For RoCE the PKey tables will be the same.
1428                 */
1429                put_mdev = false;
1430                mdev = dev->mdev;
1431                mdev_port_num = 1;
1432        }
1433
1434        err = mlx5_query_hca_vport_pkey(mdev, 0, mdev_port_num, 0,
1435                                        index, pkey);
1436        if (put_mdev)
1437                mlx5_ib_put_native_port_mdev(dev, port);
1438
1439        return err;
1440}
1441
1442static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
1443                              u16 *pkey)
1444{
1445        switch (mlx5_get_vport_access_method(ibdev)) {
1446        case MLX5_VPORT_ACCESS_METHOD_MAD:
1447                return mlx5_query_mad_ifc_pkey(ibdev, port, index, pkey);
1448
1449        case MLX5_VPORT_ACCESS_METHOD_HCA:
1450        case MLX5_VPORT_ACCESS_METHOD_NIC:
1451                return mlx5_query_hca_nic_pkey(ibdev, port, index, pkey);
1452        default:
1453                return -EINVAL;
1454        }
1455}
1456
1457static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask,
1458                                 struct ib_device_modify *props)
1459{
1460        struct mlx5_ib_dev *dev = to_mdev(ibdev);
1461        struct mlx5_reg_node_desc in;
1462        struct mlx5_reg_node_desc out;
1463        int err;
1464
1465        if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
1466                return -EOPNOTSUPP;
1467
1468        if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
1469                return 0;
1470
1471        /*
1472         * If possible, pass the node desc to FW so it can generate
1473         * a trap 144 notice.  If the command fails, just ignore it.
1474         */
1475        memcpy(&in, props->node_desc, IB_DEVICE_NODE_DESC_MAX);
1476        err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out,
1477                                   sizeof(out), MLX5_REG_NODE_DESC, 0, 1);
1478        if (err)
1479                return err;
1480
1481        memcpy(ibdev->node_desc, props->node_desc, IB_DEVICE_NODE_DESC_MAX);
1482
1483        return err;
1484}
1485
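    /*
     * Atomically update the port capability mask through the HCA vport
     * context: read the current context, verify that only modifiable bits
     * are being changed, then write cap_mask1/cap_mask1_perm back.
     */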
1486static int set_port_caps_atomic(struct mlx5_ib_dev *dev, u8 port_num, u32 mask,
1487                                u32 value)
1488{
1489        struct mlx5_hca_vport_context ctx = {};
1490        struct mlx5_core_dev *mdev;
1491        u8 mdev_port_num;
1492        int err;
1493
1494        mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num);
1495        if (!mdev)
1496                return -ENODEV;
1497
1498        err = mlx5_query_hca_vport_context(mdev, 0, mdev_port_num, 0, &ctx);
1499        if (err)
1500                goto out;
1501
1502        if (~ctx.cap_mask1_perm & mask) {
1503                mlx5_ib_warn(dev, "trying to change bitmask 0x%X but change supported 0x%X\n",
1504                             mask, ctx.cap_mask1_perm);
1505                err = -EINVAL;
1506                goto out;
1507        }
1508
1509        ctx.cap_mask1 = value;
1510        ctx.cap_mask1_perm = mask;
1511        err = mlx5_core_modify_hca_vport_context(mdev, 0, mdev_port_num,
1512                                                 0, &ctx);
1513
1514out:
1515        mlx5_ib_put_native_port_mdev(dev, port_num);
1516
1517        return err;
1518}
1519
1520static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,
1521                               struct ib_port_modify *props)
1522{
1523        struct mlx5_ib_dev *dev = to_mdev(ibdev);
1524        struct ib_port_attr attr;
1525        u32 tmp;
1526        int err;
1527        u32 change_mask;
1528        u32 value;
1529        bool is_ib = (mlx5_ib_port_link_layer(ibdev, port) ==
1530                      IB_LINK_LAYER_INFINIBAND);
1531
1532        /* The CM layer calls ib_modify_port() regardless of the link layer. For
1533         * Ethernet ports, QKey violations and port capabilities are meaningless.
1534         */
1535        if (!is_ib)
1536                return 0;
1537
1538        if (MLX5_CAP_GEN(dev->mdev, ib_virt) && is_ib) {
1539                change_mask = props->clr_port_cap_mask | props->set_port_cap_mask;
1540                value = ~props->clr_port_cap_mask | props->set_port_cap_mask;
1541                return set_port_caps_atomic(dev, port, change_mask, value);
1542        }
1543
1544        mutex_lock(&dev->cap_mask_mutex);
1545
1546        err = ib_query_port(ibdev, port, &attr);
1547        if (err)
1548                goto out;
1549
1550        tmp = (attr.port_cap_flags | props->set_port_cap_mask) &
1551                ~props->clr_port_cap_mask;
1552
1553        err = mlx5_set_port_caps(dev->mdev, port, tmp);
1554
1555out:
1556        mutex_unlock(&dev->cap_mask_mutex);
1557        return err;
1558}
1559
1560static void print_lib_caps(struct mlx5_ib_dev *dev, u64 caps)
1561{
1562        mlx5_ib_dbg(dev, "MLX5_LIB_CAP_4K_UAR = %s\n",
1563                    caps & MLX5_LIB_CAP_4K_UAR ? "y" : "n");
1564}
1565
1566static u16 calc_dynamic_bfregs(int uars_per_sys_page)
1567{
1568        /* A large system page without 4K UAR support might limit the dynamic size */
1569        if (uars_per_sys_page == 1  && PAGE_SIZE > 4096)
1570                return MLX5_MIN_DYN_BFREGS;
1571
1572        return MLX5_MAX_DYN_BFREGS;
1573}
1574
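    /*
     * Work out how many blue-flame registers and UAR system pages a new
     * user context needs: round the requested count up to a whole system
     * page, reserve room for dynamically allocated bfregs and record the
     * totals in bfregi.  Updates req->total_num_bfregs as a side effect.
     */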
1575static int calc_total_bfregs(struct mlx5_ib_dev *dev, bool lib_uar_4k,
1576                             struct mlx5_ib_alloc_ucontext_req_v2 *req,
1577                             struct mlx5_bfreg_info *bfregi)
1578{
1579        int uars_per_sys_page;
1580        int bfregs_per_sys_page;
1581        int ref_bfregs = req->total_num_bfregs;
1582
1583        if (req->total_num_bfregs == 0)
1584                return -EINVAL;
1585
1586        BUILD_BUG_ON(MLX5_MAX_BFREGS % MLX5_NON_FP_BFREGS_IN_PAGE);
1587        BUILD_BUG_ON(MLX5_MAX_BFREGS < MLX5_NON_FP_BFREGS_IN_PAGE);
1588
1589        if (req->total_num_bfregs > MLX5_MAX_BFREGS)
1590                return -ENOMEM;
1591
1592        uars_per_sys_page = get_uars_per_sys_page(dev, lib_uar_4k);
1593        bfregs_per_sys_page = uars_per_sys_page * MLX5_NON_FP_BFREGS_PER_UAR;
1594        /* This holds the required static allocation requested by the user */
1595        req->total_num_bfregs = ALIGN(req->total_num_bfregs, bfregs_per_sys_page);
1596        if (req->num_low_latency_bfregs > req->total_num_bfregs - 1)
1597                return -EINVAL;
1598
1599        bfregi->num_static_sys_pages = req->total_num_bfregs / bfregs_per_sys_page;
1600        bfregi->num_dyn_bfregs = ALIGN(calc_dynamic_bfregs(uars_per_sys_page), bfregs_per_sys_page);
1601        bfregi->total_num_bfregs = req->total_num_bfregs + bfregi->num_dyn_bfregs;
1602        bfregi->num_sys_pages = bfregi->total_num_bfregs / bfregs_per_sys_page;
1603
1604        mlx5_ib_dbg(dev, "uar_4k: fw support %s, lib support %s, user requested %d bfregs, allocated %d, total bfregs %d, using %d sys pages\n",
1605                    MLX5_CAP_GEN(dev->mdev, uar_4k) ? "yes" : "no",
1606                    lib_uar_4k ? "yes" : "no", ref_bfregs,
1607                    req->total_num_bfregs, bfregi->total_num_bfregs,
1608                    bfregi->num_sys_pages);
1609
1610        return 0;
1611}
1612
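    /*
     * Allocate a firmware UAR for every static system page of the context;
     * dynamic pages start out as MLX5_IB_INVALID_UAR_INDEX and are only
     * allocated on demand at mmap time.
     */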
1613static int allocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context)
1614{
1615        struct mlx5_bfreg_info *bfregi;
1616        int err;
1617        int i;
1618
1619        bfregi = &context->bfregi;
1620        for (i = 0; i < bfregi->num_static_sys_pages; i++) {
1621                err = mlx5_cmd_alloc_uar(dev->mdev, &bfregi->sys_pages[i]);
1622                if (err)
1623                        goto error;
1624
1625                mlx5_ib_dbg(dev, "allocated uar %d\n", bfregi->sys_pages[i]);
1626        }
1627
1628        for (i = bfregi->num_static_sys_pages; i < bfregi->num_sys_pages; i++)
1629                bfregi->sys_pages[i] = MLX5_IB_INVALID_UAR_INDEX;
1630
1631        return 0;
1632
1633error:
1634        for (--i; i >= 0; i--)
1635                if (mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]))
1636                        mlx5_ib_warn(dev, "failed to free uar %d\n", i);
1637
1638        return err;
1639}
1640
1641static void deallocate_uars(struct mlx5_ib_dev *dev,
1642                            struct mlx5_ib_ucontext *context)
1643{
1644        struct mlx5_bfreg_info *bfregi;
1645        int i;
1646
1647        bfregi = &context->bfregi;
1648        for (i = 0; i < bfregi->num_sys_pages; i++)
1649                if (i < bfregi->num_static_sys_pages ||
1650                    bfregi->sys_pages[i] != MLX5_IB_INVALID_UAR_INDEX)
1651                        mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]);
1652}
1653
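    /*
     * Reference-count transport domains and loopback-capable QPs, and turn
     * NIC vport local loopback on once a second transport domain or the
     * first such QP shows up.
     */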
1654int mlx5_ib_enable_lb(struct mlx5_ib_dev *dev, bool td, bool qp)
1655{
1656        int err = 0;
1657
1658        mutex_lock(&dev->lb.mutex);
1659        if (td)
1660                dev->lb.user_td++;
1661        if (qp)
1662                dev->lb.qps++;
1663
1664        if (dev->lb.user_td == 2 ||
1665            dev->lb.qps == 1) {
1666                if (!dev->lb.enabled) {
1667                        err = mlx5_nic_vport_update_local_lb(dev->mdev, true);
1668                        dev->lb.enabled = true;
1669                }
1670        }
1671
1672        mutex_unlock(&dev->lb.mutex);
1673
1674        return err;
1675}
1676
1677void mlx5_ib_disable_lb(struct mlx5_ib_dev *dev, bool td, bool qp)
1678{
1679        mutex_lock(&dev->lb.mutex);
1680        if (td)
1681                dev->lb.user_td--;
1682        if (qp)
1683                dev->lb.qps--;
1684
1685        if (dev->lb.user_td == 1 &&
1686            dev->lb.qps == 0) {
1687                if (dev->lb.enabled) {
1688                        mlx5_nic_vport_update_local_lb(dev->mdev, false);
1689                        dev->lb.enabled = false;
1690                }
1691        }
1692
1693        mutex_unlock(&dev->lb.mutex);
1694}
1695
1696static int mlx5_ib_alloc_transport_domain(struct mlx5_ib_dev *dev, u32 *tdn,
1697                                          u16 uid)
1698{
1699        int err;
1700
1701        if (!MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
1702                return 0;
1703
1704        err = mlx5_cmd_alloc_transport_domain(dev->mdev, tdn, uid);
1705        if (err)
1706                return err;
1707
1708        if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) ||
1709            (!MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) &&
1710             !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
1711                return err;
1712
1713        return mlx5_ib_enable_lb(dev, true, false);
1714}
1715
1716static void mlx5_ib_dealloc_transport_domain(struct mlx5_ib_dev *dev, u32 tdn,
1717                                             u16 uid)
1718{
1719        if (!MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
1720                return;
1721
1722        mlx5_cmd_dealloc_transport_domain(dev->mdev, tdn, uid);
1723
1724        if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) ||
1725            (!MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) &&
1726             !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
1727                return;
1728
1729        mlx5_ib_disable_lb(dev, true, false);
1730}
1731
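    /*
     * Fill the alloc_ucontext response from the device capabilities: queue
     * and WQE limits, UAR/bfreg layout, clock info, ECE support and, when
     * available, the dump-fill mkey and IPsec flow-action flags.
     */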
1732static int set_ucontext_resp(struct ib_ucontext *uctx,
1733                             struct mlx5_ib_alloc_ucontext_resp *resp)
1734{
1735        struct ib_device *ibdev = uctx->device;
1736        struct mlx5_ib_dev *dev = to_mdev(ibdev);
1737        struct mlx5_ib_ucontext *context = to_mucontext(uctx);
1738        struct mlx5_bfreg_info *bfregi = &context->bfregi;
1739        int err;
1740
1741        if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) {
1742                err = mlx5_cmd_dump_fill_mkey(dev->mdev,
1743                                              &resp->dump_fill_mkey);
1744                if (err)
1745                        return err;
1746                resp->comp_mask |=
1747                        MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_DUMP_FILL_MKEY;
1748        }
1749
1750        resp->qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
1751        if (dev->wc_support)
1752                resp->bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev,
1753                                                      log_bf_reg_size);
1754        resp->cache_line_size = cache_line_size();
1755        resp->max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq);
1756        resp->max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq);
1757        resp->max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
1758        resp->max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
1759        resp->max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
1760        resp->cqe_version = context->cqe_version;
1761        resp->log_uar_size = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
1762                                MLX5_ADAPTER_PAGE_SHIFT : PAGE_SHIFT;
1763        resp->num_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
1764                                        MLX5_CAP_GEN(dev->mdev,
1765                                                     num_of_uars_per_page) : 1;
1766
1767        if (mlx5_accel_ipsec_device_caps(dev->mdev) &
1768                                MLX5_ACCEL_IPSEC_CAP_DEVICE) {
1769                if (mlx5_get_flow_namespace(dev->mdev,
1770                                MLX5_FLOW_NAMESPACE_EGRESS))
1771                        resp->flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM;
1772                if (mlx5_accel_ipsec_device_caps(dev->mdev) &
1773                                MLX5_ACCEL_IPSEC_CAP_REQUIRED_METADATA)
1774                        resp->flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_REQ_METADATA;
1775                if (MLX5_CAP_FLOWTABLE(dev->mdev, flow_table_properties_nic_receive.ft_field_support.outer_esp_spi))
1776                        resp->flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_SPI_STEERING;
1777                if (mlx5_accel_ipsec_device_caps(dev->mdev) &
1778                                MLX5_ACCEL_IPSEC_CAP_TX_IV_IS_ESN)
1779                        resp->flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_TX_IV_IS_ESN;
1780                /* MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_FULL_OFFLOAD is currently always 0 */
1781        }
1782
1783        resp->tot_bfregs = bfregi->lib_uar_dyn ? 0 :
1784                        bfregi->total_num_bfregs - bfregi->num_dyn_bfregs;
1785        resp->num_ports = dev->num_ports;
1786        resp->cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE |
1787                                      MLX5_USER_CMDS_SUPP_UHW_CREATE_AH;
1788
1789        if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) {
1790                mlx5_query_min_inline(dev->mdev, &resp->eth_min_inline);
1791                resp->eth_min_inline++;
1792        }
1793
1794        if (dev->mdev->clock_info)
1795                resp->clock_info_versions = BIT(MLX5_IB_CLOCK_INFO_V1);
1796
1797        /*
1798         * We don't want to expose information from the PCI bar that is located
1799         * after 4096 bytes, so if the arch only supports larger pages, let's
1800         * pretend we don't support reading the HCA's core clock. This is also
1801         * enforced by the mmap handler.
1802         */
1803        if (PAGE_SIZE <= 4096) {
1804                resp->comp_mask |=
1805                        MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET;
1806                resp->hca_core_clock_offset =
1807                        offsetof(struct mlx5_init_seg,
1808                                 internal_timer_h) % PAGE_SIZE;
1809        }
1810
1811        if (MLX5_CAP_GEN(dev->mdev, ece_support))
1812                resp->comp_mask |= MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_ECE;
1813
1814        resp->num_dyn_bfregs = bfregi->num_dyn_bfregs;
1815        return 0;
1816}
1817
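    /*
     * Create a user context: parse the v0/v2 request, size and allocate the
     * bfreg/UAR tables (skipped when the library manages UARs dynamically),
     * optionally create a DEVX uid, allocate a transport domain and copy the
     * capability response back to user space.
     */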
1818static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx,
1819                                  struct ib_udata *udata)
1820{
1821        struct ib_device *ibdev = uctx->device;
1822        struct mlx5_ib_dev *dev = to_mdev(ibdev);
1823        struct mlx5_ib_alloc_ucontext_req_v2 req = {};
1824        struct mlx5_ib_alloc_ucontext_resp resp = {};
1825        struct mlx5_ib_ucontext *context = to_mucontext(uctx);
1826        struct mlx5_bfreg_info *bfregi;
1827        int ver;
1828        int err;
1829        size_t min_req_v2 = offsetof(struct mlx5_ib_alloc_ucontext_req_v2,
1830                                     max_cqe_version);
1831        bool lib_uar_4k;
1832        bool lib_uar_dyn;
1833
1834        if (!dev->ib_active)
1835                return -EAGAIN;
1836
1837        if (udata->inlen == sizeof(struct mlx5_ib_alloc_ucontext_req))
1838                ver = 0;
1839        else if (udata->inlen >= min_req_v2)
1840                ver = 2;
1841        else
1842                return -EINVAL;
1843
1844        err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
1845        if (err)
1846                return err;
1847
1848        if (req.flags & ~MLX5_IB_ALLOC_UCTX_DEVX)
1849                return -EOPNOTSUPP;
1850
1851        if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2)
1852                return -EOPNOTSUPP;
1853
1854        req.total_num_bfregs = ALIGN(req.total_num_bfregs,
1855                                    MLX5_NON_FP_BFREGS_PER_UAR);
1856        if (req.num_low_latency_bfregs > req.total_num_bfregs - 1)
1857                return -EINVAL;
1858
1859        lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR;
1860        lib_uar_dyn = req.lib_caps & MLX5_LIB_CAP_DYN_UAR;
1861        bfregi = &context->bfregi;
1862
1863        if (lib_uar_dyn) {
1864                bfregi->lib_uar_dyn = lib_uar_dyn;
1865                goto uar_done;
1866        }
1867
1868        /* updates req->total_num_bfregs */
1869        err = calc_total_bfregs(dev, lib_uar_4k, &req, bfregi);
1870        if (err)
1871                goto out_ctx;
1872
1873        mutex_init(&bfregi->lock);
1874        bfregi->lib_uar_4k = lib_uar_4k;
1875        bfregi->count = kcalloc(bfregi->total_num_bfregs, sizeof(*bfregi->count),
1876                                GFP_KERNEL);
1877        if (!bfregi->count) {
1878                err = -ENOMEM;
1879                goto out_ctx;
1880        }
1881
1882        bfregi->sys_pages = kcalloc(bfregi->num_sys_pages,
1883                                    sizeof(*bfregi->sys_pages),
1884                                    GFP_KERNEL);
1885        if (!bfregi->sys_pages) {
1886                err = -ENOMEM;
1887                goto out_count;
1888        }
1889
1890        err = allocate_uars(dev, context);
1891        if (err)
1892                goto out_sys_pages;
1893
1894uar_done:
1895        if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) {
1896                err = mlx5_ib_devx_create(dev, true);
1897                if (err < 0)
1898                        goto out_uars;
1899                context->devx_uid = err;
1900        }
1901
1902        err = mlx5_ib_alloc_transport_domain(dev, &context->tdn,
1903                                             context->devx_uid);
1904        if (err)
1905                goto out_devx;
1906
1907        INIT_LIST_HEAD(&context->db_page_list);
1908        mutex_init(&context->db_page_mutex);
1909
1910        context->cqe_version = min_t(__u8,
1911                                 (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version),
1912                                 req.max_cqe_version);
1913
1914        err = set_ucontext_resp(uctx, &resp);
1915        if (err)
1916                goto out_mdev;
1917
1918        resp.response_length = min(udata->outlen, sizeof(resp));
1919        err = ib_copy_to_udata(udata, &resp, resp.response_length);
1920        if (err)
1921                goto out_mdev;
1922
1923        bfregi->ver = ver;
1924        bfregi->num_low_latency_bfregs = req.num_low_latency_bfregs;
1925        context->lib_caps = req.lib_caps;
1926        print_lib_caps(dev, context->lib_caps);
1927
1928        if (mlx5_ib_lag_should_assign_affinity(dev)) {
1929                u8 port = mlx5_core_native_port_num(dev->mdev) - 1;
1930
1931                atomic_set(&context->tx_port_affinity,
1932                           atomic_add_return(
1933                                   1, &dev->port[port].roce.tx_port_affinity));
1934        }
1935
1936        return 0;
1937
1938out_mdev:
1939        mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid);
1940out_devx:
1941        if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX)
1942                mlx5_ib_devx_destroy(dev, context->devx_uid);
1943
1944out_uars:
1945        deallocate_uars(dev, context);
1946
1947out_sys_pages:
1948        kfree(bfregi->sys_pages);
1949
1950out_count:
1951        kfree(bfregi->count);
1952
1953out_ctx:
1954        return err;
1955}
1956
1957static int mlx5_ib_query_ucontext(struct ib_ucontext *ibcontext,
1958                                  struct uverbs_attr_bundle *attrs)
1959{
1960        struct mlx5_ib_alloc_ucontext_resp uctx_resp = {};
1961        int ret;
1962
1963        ret = set_ucontext_resp(ibcontext, &uctx_resp);
1964        if (ret)
1965                return ret;
1966
1967        uctx_resp.response_length =
1968                min_t(size_t,
1969                      uverbs_attr_get_len(attrs,
1970                                MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX),
1971                      sizeof(uctx_resp));
1972
1973        ret = uverbs_copy_to_struct_or_zero(attrs,
1974                                        MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX,
1975                                        &uctx_resp,
1976                                        sizeof(uctx_resp));
1977        return ret;
1978}
1979
1980static void mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
1981{
1982        struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
1983        struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
1984        struct mlx5_bfreg_info *bfregi;
1985
1986        bfregi = &context->bfregi;
1987        mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid);
1988
1989        if (context->devx_uid)
1990                mlx5_ib_devx_destroy(dev, context->devx_uid);
1991
1992        deallocate_uars(dev, context);
1993        kfree(bfregi->sys_pages);
1994        kfree(bfregi->count);
1995}
1996
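    /*
     * Translate a firmware UAR index into a PFN (or physical address) inside
     * the device BAR, accounting for multiple 4K UARs being packed into one
     * larger system page.
     */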
1997static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev,
1998                                 int uar_idx)
1999{
2000        int fw_uars_per_page;
2001
2002        fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? MLX5_UARS_IN_PAGE : 1;
2003
2004        return (dev->mdev->bar_addr >> PAGE_SHIFT) + uar_idx / fw_uars_per_page;
2005}
2006
2007static u64 uar_index2paddress(struct mlx5_ib_dev *dev,
2008                                 int uar_idx)
2009{
2010        unsigned int fw_uars_per_page;
2011
2012        fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
2013                                MLX5_UARS_IN_PAGE : 1;
2014
2015        return (dev->mdev->bar_addr + (uar_idx / fw_uars_per_page) * PAGE_SIZE);
2016}
2017
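    /*
     * The mmap page offset encodes a command in the bits above
     * MLX5_IB_MMAP_CMD_SHIFT and an argument (such as a UAR index) in the
     * bits below it; get_extended_index() additionally folds in bits 16..23
     * of the offset so that indices above 255 can be expressed.
     */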
2018static int get_command(unsigned long offset)
2019{
2020        return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK;
2021}
2022
2023static int get_arg(unsigned long offset)
2024{
2025        return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1);
2026}
2027
2028static int get_index(unsigned long offset)
2029{
2030        return get_arg(offset);
2031}
2032
2033/* The index resides in an extra byte to enable values larger than 255 */
2034static int get_extended_index(unsigned long offset)
2035{
2036        return get_arg(offset) | ((offset >> 16) & 0xff) << 8;
2037}
2038
2039
2040static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
2041{
2042}
2043
2044static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd)
2045{
2046        switch (cmd) {
2047        case MLX5_IB_MMAP_WC_PAGE:
2048                return "WC";
2049        case MLX5_IB_MMAP_REGULAR_PAGE:
2050                return "best effort WC";
2051        case MLX5_IB_MMAP_NC_PAGE:
2052                return "NC";
2053        case MLX5_IB_MMAP_DEVICE_MEM:
2054                return "Device Memory";
2055        default:
2056                return NULL;
2057        }
2058}
2059
2060static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev,
2061                                        struct vm_area_struct *vma,
2062                                        struct mlx5_ib_ucontext *context)
2063{
2064        if ((vma->vm_end - vma->vm_start != PAGE_SIZE) ||
2065            !(vma->vm_flags & VM_SHARED))
2066                return -EINVAL;
2067
2068        if (get_index(vma->vm_pgoff) != MLX5_IB_CLOCK_INFO_V1)
2069                return -EOPNOTSUPP;
2070
2071        if (vma->vm_flags & (VM_WRITE | VM_EXEC))
2072                return -EPERM;
2073        vma->vm_flags &= ~VM_MAYWRITE;
2074
2075        if (!dev->mdev->clock_info)
2076                return -EOPNOTSUPP;
2077
2078        return vm_insert_page(vma, vma->vm_start,
2079                              virt_to_page(dev->mdev->clock_info));
2080}
2081
2082static void mlx5_ib_mmap_free(struct rdma_user_mmap_entry *entry)
2083{
2084        struct mlx5_user_mmap_entry *mentry = to_mmmap(entry);
2085        struct mlx5_ib_dev *dev = to_mdev(entry->ucontext->device);
2086        struct mlx5_var_table *var_table = &dev->var_table;
2087        struct mlx5_ib_dm *mdm;
2088
2089        switch (mentry->mmap_flag) {
2090        case MLX5_IB_MMAP_TYPE_MEMIC:
2091                mdm = container_of(mentry, struct mlx5_ib_dm, mentry);
2092                mlx5_cmd_dealloc_memic(&dev->dm, mdm->dev_addr,
2093                                       mdm->size);
2094                kfree(mdm);
2095                break;
2096        case MLX5_IB_MMAP_TYPE_VAR:
2097                mutex_lock(&var_table->bitmap_lock);
2098                clear_bit(mentry->page_idx, var_table->bitmap);
2099                mutex_unlock(&var_table->bitmap_lock);
2100                kfree(mentry);
2101                break;
2102        case MLX5_IB_MMAP_TYPE_UAR_WC:
2103        case MLX5_IB_MMAP_TYPE_UAR_NC:
2104                mlx5_cmd_free_uar(dev->mdev, mentry->page_idx);
2105                kfree(mentry);
2106                break;
2107        default:
2108                WARN_ON(true);
2109        }
2110}
2111
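    /*
     * Map one UAR page into user space: translate the mmap argument into a
     * UAR index (allocating a fresh UAR and dynamic bfreg for
     * MLX5_IB_MMAP_ALLOC_WC), pick write-combining or non-cached protection
     * and hand the resulting PFN to rdma_user_mmap_io().
     */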
2112static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
2113                    struct vm_area_struct *vma,
2114                    struct mlx5_ib_ucontext *context)
2115{
2116        struct mlx5_bfreg_info *bfregi = &context->bfregi;
2117        int err;
2118        unsigned long idx;
2119        phys_addr_t pfn;
2120        pgprot_t prot;
2121        u32 bfreg_dyn_idx = 0;
2122        u32 uar_index;
2123        int dyn_uar = (cmd == MLX5_IB_MMAP_ALLOC_WC);
2124        int max_valid_idx = dyn_uar ? bfregi->num_sys_pages :
2125                                bfregi->num_static_sys_pages;
2126
2127        if (bfregi->lib_uar_dyn)
2128                return -EINVAL;
2129
2130        if (vma->vm_end - vma->vm_start != PAGE_SIZE)
2131                return -EINVAL;
2132
2133        if (dyn_uar)
2134                idx = get_extended_index(vma->vm_pgoff) + bfregi->num_static_sys_pages;
2135        else
2136                idx = get_index(vma->vm_pgoff);
2137
2138        if (idx >= max_valid_idx) {
2139                mlx5_ib_warn(dev, "invalid uar index %lu, max=%d\n",
2140                             idx, max_valid_idx);
2141                return -EINVAL;
2142        }
2143
2144        switch (cmd) {
2145        case MLX5_IB_MMAP_WC_PAGE:
2146        case MLX5_IB_MMAP_ALLOC_WC:
2147        case MLX5_IB_MMAP_REGULAR_PAGE:
2148                /* For MLX5_IB_MMAP_REGULAR_PAGE, make a best effort to get WC */
2149                prot = pgprot_writecombine(vma->vm_page_prot);
2150                break;
2151        case MLX5_IB_MMAP_NC_PAGE:
2152                prot = pgprot_noncached(vma->vm_page_prot);
2153                break;
2154        default:
2155                return -EINVAL;
2156        }
2157
2158        if (dyn_uar) {
2159                int uars_per_page;
2160
2161                uars_per_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k);
2162                bfreg_dyn_idx = idx * (uars_per_page * MLX5_NON_FP_BFREGS_PER_UAR);
2163                if (bfreg_dyn_idx >= bfregi->total_num_bfregs) {
2164                        mlx5_ib_warn(dev, "invalid bfreg_dyn_idx %u, max=%u\n",
2165                                     bfreg_dyn_idx, bfregi->total_num_bfregs);
2166                        return -EINVAL;
2167                }
2168
2169                mutex_lock(&bfregi->lock);
2170                /* Fail if the UAR is already allocated; the first bfreg index
2171                 * of each page holds its count.
2172                 */
2173                if (bfregi->count[bfreg_dyn_idx]) {
2174                        mlx5_ib_warn(dev, "wrong offset, idx %lu is busy, bfregn=%u\n", idx, bfreg_dyn_idx);
2175                        mutex_unlock(&bfregi->lock);
2176                        return -EINVAL;
2177                }
2178
2179                bfregi->count[bfreg_dyn_idx]++;
2180                mutex_unlock(&bfregi->lock);
2181
2182                err = mlx5_cmd_alloc_uar(dev->mdev, &uar_index);
2183                if (err) {
2184                        mlx5_ib_warn(dev, "UAR alloc failed\n");
2185                        goto free_bfreg;
2186                }
2187        } else {
2188                uar_index = bfregi->sys_pages[idx];
2189        }
2190
2191        pfn = uar_index2pfn(dev, uar_index);
2192        mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn);
2193
2194        err = rdma_user_mmap_io(&context->ibucontext, vma, pfn, PAGE_SIZE,
2195                                prot, NULL);
2196        if (err) {
2197                mlx5_ib_err(dev,
2198                            "rdma_user_mmap_io failed with error=%d, mmap_cmd=%s\n",
2199                            err, mmap_cmd2str(cmd));
2200                goto err;
2201        }
2202
2203        if (dyn_uar)
2204                bfregi->sys_pages[idx] = uar_index;
2205        return 0;
2206
2207err:
2208        if (!dyn_uar)
2209                return err;
2210
2211        mlx5_cmd_free_uar(dev->mdev, idx);
2212
2213free_bfreg:
2214        mlx5_ib_free_bfreg(dev, bfregi, bfreg_dyn_idx);
2215
2216        return err;
2217}
2218
2219static int add_dm_mmap_entry(struct ib_ucontext *context,
2220                             struct mlx5_ib_dm *mdm,
2221                             u64 address)
2222{
2223        mdm->mentry.mmap_flag = MLX5_IB_MMAP_TYPE_MEMIC;
2224        mdm->mentry.address = address;
2225        return rdma_user_mmap_entry_insert_range(
2226                        context, &mdm->mentry.rdma_entry,
2227                        mdm->size,
2228                        MLX5_IB_MMAP_DEVICE_MEM << 16,
2229                        (MLX5_IB_MMAP_DEVICE_MEM << 16) + (1UL << 16) - 1);
2230}
2231
2232static unsigned long mlx5_vma_to_pgoff(struct vm_area_struct *vma)
2233{
2234        unsigned long idx;
2235        u8 command;
2236
2237        command = get_command(vma->vm_pgoff);
2238        idx = get_extended_index(vma->vm_pgoff);
2239
2240        return (command << 16 | idx);
2241}
2242
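    /*
     * Generic path for mmap entries registered through the rdma_user_mmap
     * machinery (VAR, dynamic UAR and device-memory mappings): look the
     * entry up by page offset and map it with the protection its type
     * requires.
     */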
2243static int mlx5_ib_mmap_offset(struct mlx5_ib_dev *dev,
2244                               struct vm_area_struct *vma,
2245                               struct ib_ucontext *ucontext)
2246{
2247        struct mlx5_user_mmap_entry *mentry;
2248        struct rdma_user_mmap_entry *entry;
2249        unsigned long pgoff;
2250        pgprot_t prot;
2251        phys_addr_t pfn;
2252        int ret;
2253
2254        pgoff = mlx5_vma_to_pgoff(vma);
2255        entry = rdma_user_mmap_entry_get_pgoff(ucontext, pgoff);
2256        if (!entry)
2257                return -EINVAL;
2258
2259        mentry = to_mmmap(entry);
2260        pfn = (mentry->address >> PAGE_SHIFT);
2261        if (mentry->mmap_flag == MLX5_IB_MMAP_TYPE_VAR ||
2262            mentry->mmap_flag == MLX5_IB_MMAP_TYPE_UAR_NC)
2263                prot = pgprot_noncached(vma->vm_page_prot);
2264        else
2265                prot = pgprot_writecombine(vma->vm_page_prot);
2266        ret = rdma_user_mmap_io(ucontext, vma, pfn,
2267                                entry->npages * PAGE_SIZE,
2268                                prot,
2269                                entry);
2270        rdma_user_mmap_entry_put(&mentry->rdma_entry);
2271        return ret;
2272}
2273
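    /*
     * Convert an rdma mmap entry's page offset back into the byte offset
     * layout user space passes to mmap(): the command sits at
     * MLX5_IB_MMAP_CMD_SHIFT and the 16-bit index is split into a low byte
     * plus an extended high byte, mirroring get_command() and
     * get_extended_index().
     */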
2274static u64 mlx5_entry_to_mmap_offset(struct mlx5_user_mmap_entry *entry)
2275{
2276        u64 cmd = (entry->rdma_entry.start_pgoff >> 16) & 0xFFFF;
2277        u64 index = entry->rdma_entry.start_pgoff & 0xFFFF;
2278
2279        return (((index >> 8) << 16) | (cmd << MLX5_IB_MMAP_CMD_SHIFT) |
2280                (index & 0xFF)) << PAGE_SHIFT;
2281}
2282
2283static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
2284{
2285        struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
2286        struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
2287        unsigned long command;
2288        phys_addr_t pfn;
2289
2290        command = get_command(vma->vm_pgoff);
2291        switch (command) {
2292        case MLX5_IB_MMAP_WC_PAGE:
2293        case MLX5_IB_MMAP_ALLOC_WC:
2294                if (!dev->wc_support)
2295                        return -EPERM;
2296                fallthrough;
2297        case MLX5_IB_MMAP_NC_PAGE:
2298        case MLX5_IB_MMAP_REGULAR_PAGE:
2299                return uar_mmap(dev, command, vma, context);
2300
2301        case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES:
2302                return -ENOSYS;
2303
2304        case MLX5_IB_MMAP_CORE_CLOCK:
2305                if (vma->vm_end - vma->vm_start != PAGE_SIZE)
2306                        return -EINVAL;
2307
2308                if (vma->vm_flags & VM_WRITE)
2309                        return -EPERM;
2310                vma->vm_flags &= ~VM_MAYWRITE;
2311
2312                /* Don't expose information to user space that it shouldn't have */
2313                if (PAGE_SIZE > 4096)
2314                        return -EOPNOTSUPP;
2315
2316                pfn = (dev->mdev->iseg_base +
2317                       offsetof(struct mlx5_init_seg, internal_timer_h)) >>
2318                        PAGE_SHIFT;
2319                return rdma_user_mmap_io(&context->ibucontext, vma, pfn,
2320                                         PAGE_SIZE,
2321                                         pgprot_noncached(vma->vm_page_prot),
2322                                         NULL);
2323        case MLX5_IB_MMAP_CLOCK_INFO:
2324                return mlx5_ib_mmap_clock_info_page(dev, vma, context);
2325
2326        default:
2327                return mlx5_ib_mmap_offset(dev, vma, ibcontext);
2328        }
2329
2330        return 0;
2331}
2332
2333static inline int check_dm_type_support(struct mlx5_ib_dev *dev,
2334                                        u32 type)
2335{
2336        switch (type) {
2337        case MLX5_IB_UAPI_DM_TYPE_MEMIC:
2338                if (!MLX5_CAP_DEV_MEM(dev->mdev, memic))
2339                        return -EOPNOTSUPP;
2340                break;
2341        case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
2342        case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
2343                if (!capable(CAP_SYS_RAWIO) ||
2344                    !capable(CAP_NET_RAW))
2345                        return -EPERM;
2346
2347                if (!(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, sw_owner) ||
2348                      MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, sw_owner) ||
2349                      MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, sw_owner_v2) ||
2350                      MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, sw_owner_v2)))
2351                        return -EOPNOTSUPP;
2352                break;
2353        }
2354
2355        return 0;
2356}
2357
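    /*
     * Allocate device memory (MEMIC), register an mmap entry for it and
     * return the resulting page index and start offset to user space.
     */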
2358static int handle_alloc_dm_memic(struct ib_ucontext *ctx,
2359                                 struct mlx5_ib_dm *dm,
2360                                 struct ib_dm_alloc_attr *attr,
2361                                 struct uverbs_attr_bundle *attrs)
2362{
2363        struct mlx5_dm *dm_db = &to_mdev(ctx->device)->dm;
2364        u64 start_offset;
2365        u16 page_idx;
2366        int err;
2367        u64 address;
2368
2369        dm->size = roundup(attr->length, MLX5_MEMIC_BASE_SIZE);
2370
2371        err = mlx5_cmd_alloc_memic(dm_db, &dm->dev_addr,
2372                                   dm->size, attr->alignment);
2373        if (err)
2374                return err;
2375
2376        address = dm->dev_addr & PAGE_MASK;
2377        err = add_dm_mmap_entry(ctx, dm, address);
2378        if (err)
2379                goto err_dealloc;
2380
2381        page_idx = dm->mentry.rdma_entry.start_pgoff & 0xFFFF;
2382        err = uverbs_copy_to(attrs,
2383                             MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX,
2384                             &page_idx,
2385                             sizeof(page_idx));
2386        if (err)
2387                goto err_copy;
2388
2389        start_offset = dm->dev_addr & ~PAGE_MASK;
2390        err = uverbs_copy_to(attrs,
2391                             MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET,
2392                             &start_offset, sizeof(start_offset));
2393        if (err)
2394                goto err_copy;
2395
2396        return 0;
2397
2398err_copy:
2399        rdma_user_mmap_entry_remove(&dm->mentry.rdma_entry);
2400err_dealloc:
2401        mlx5_cmd_dealloc_memic(dm_db, dm->dev_addr, dm->size);
2402
2403        return err;
2404}
2405
2406static int handle_alloc_dm_sw_icm(struct ib_ucontext *ctx,
2407                                  struct mlx5_ib_dm *dm,
2408                                  struct ib_dm_alloc_attr *attr,
2409                                  struct uverbs_attr_bundle *attrs,
2410                                  int type)
2411{
2412        struct mlx5_core_dev *dev = to_mdev(ctx->device)->mdev;
2413        u64 act_size;
2414        int err;
2415
2416        /* Allocation size must be a multiple of the basic block size
2417         * and a power of 2.
2418         */
2419        act_size = round_up(attr->length, MLX5_SW_ICM_BLOCK_SIZE(dev));
2420        act_size = roundup_pow_of_two(act_size);
2421
2422        dm->size = act_size;
2423        err = mlx5_dm_sw_icm_alloc(dev, type, act_size, attr->alignment,
2424                                   to_mucontext(ctx)->devx_uid, &dm->dev_addr,
2425                                   &dm->icm_dm.obj_id);
2426        if (err)
2427                return err;
2428
2429        err = uverbs_copy_to(attrs,
2430                             MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET,
2431                             &dm->dev_addr, sizeof(dm->dev_addr));
2432        if (err)
2433                mlx5_dm_sw_icm_dealloc(dev, type, dm->size,
2434                                       to_mucontext(ctx)->devx_uid, dm->dev_addr,
2435                                       dm->icm_dm.obj_id);
2436
2437        return err;
2438}
2439
2440struct ib_dm *mlx5_ib_alloc_dm(struct ib_device *ibdev,
2441                               struct ib_ucontext *context,
2442                               struct ib_dm_alloc_attr *attr,
2443                               struct uverbs_attr_bundle *attrs)
2444{
2445        struct mlx5_ib_dm *dm;
2446        enum mlx5_ib_uapi_dm_type type;
2447        int err;
2448
2449        err = uverbs_get_const_default(&type, attrs,
2450                                       MLX5_IB_ATTR_ALLOC_DM_REQ_TYPE,
2451                                       MLX5_IB_UAPI_DM_TYPE_MEMIC);
2452        if (err)
2453                return ERR_PTR(err);
2454
2455        mlx5_ib_dbg(to_mdev(ibdev), "alloc_dm req: dm_type=%d user_length=0x%llx log_alignment=%d\n",
2456                    type, attr->length, attr->alignment);
2457
2458        err = check_dm_type_support(to_mdev(ibdev), type);
2459        if (err)
2460                return ERR_PTR(err);
2461
2462        dm = kzalloc(sizeof(*dm), GFP_KERNEL);
2463        if (!dm)
2464                return ERR_PTR(-ENOMEM);
2465
2466        dm->type = type;
2467
2468        switch (type) {
2469        case MLX5_IB_UAPI_DM_TYPE_MEMIC:
2470                err = handle_alloc_dm_memic(context, dm,
2471                                            attr,
2472                                            attrs);
2473                break;
2474        case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
2475                err = handle_alloc_dm_sw_icm(context, dm,
2476                                             attr, attrs,
2477                                             MLX5_SW_ICM_TYPE_STEERING);
2478                break;
2479        case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
2480                err = handle_alloc_dm_sw_icm(context, dm,
2481                                             attr, attrs,
2482                                             MLX5_SW_ICM_TYPE_HEADER_MODIFY);
2483                break;
2484        default:
2485                err = -EOPNOTSUPP;
2486        }
2487
2488        if (err)
2489                goto err_free;
2490
2491        return &dm->ibdm;
2492
2493err_free:
2494        kfree(dm);
2495        return ERR_PTR(err);
2496}
2497
2498int mlx5_ib_dealloc_dm(struct ib_dm *ibdm, struct uverbs_attr_bundle *attrs)
2499{
2500        struct mlx5_ib_ucontext *ctx = rdma_udata_to_drv_context(
2501                &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
2502        struct mlx5_core_dev *dev = to_mdev(ibdm->device)->mdev;
2503        struct mlx5_ib_dm *dm = to_mdm(ibdm);
2504        int ret;
2505
2506        switch (dm->type) {
2507        case MLX5_IB_UAPI_DM_TYPE_MEMIC:
2508                rdma_user_mmap_entry_remove(&dm->mentry.rdma_entry);
2509                return 0;
2510        case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
2511                ret = mlx5_dm_sw_icm_dealloc(dev, MLX5_SW_ICM_TYPE_STEERING,
2512                                             dm->size, ctx->devx_uid, dm->dev_addr,
2513                                             dm->icm_dm.obj_id);
2514                if (ret)
2515                        return ret;
2516                break;
2517        case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
2518                ret = mlx5_dm_sw_icm_dealloc(dev, MLX5_SW_ICM_TYPE_HEADER_MODIFY,
2519                                             dm->size, ctx->devx_uid, dm->dev_addr,
2520                                             dm->icm_dm.obj_id);
2521                if (ret)
2522                        return ret;
2523                break;
2524        default:
2525                return -EOPNOTSUPP;
2526        }
2527
2528        kfree(dm);
2529
2530        return 0;
2531}
2532
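    /*
     * Allocate a protection domain with the ALLOC_PD firmware command,
     * tagging it with the DEVX uid of the issuing context, and return the
     * PD number to user space.
     */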
2533static int mlx5_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
2534{
2535        struct mlx5_ib_pd *pd = to_mpd(ibpd);
2536        struct ib_device *ibdev = ibpd->device;
2537        struct mlx5_ib_alloc_pd_resp resp;
2538        int err;
2539        u32 out[MLX5_ST_SZ_DW(alloc_pd_out)] = {};
2540        u32 in[MLX5_ST_SZ_DW(alloc_pd_in)] = {};
2541        u16 uid = 0;
2542        struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context(
2543                udata, struct mlx5_ib_ucontext, ibucontext);
2544
2545        uid = context ? context->devx_uid : 0;
2546        MLX5_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD);
2547        MLX5_SET(alloc_pd_in, in, uid, uid);
2548        err = mlx5_cmd_exec_inout(to_mdev(ibdev)->mdev, alloc_pd, in, out);
2549        if (err)
2550                return err;
2551
2552        pd->pdn = MLX5_GET(alloc_pd_out, out, pd);
2553        pd->uid = uid;
2554        if (udata) {
2555                resp.pdn = pd->pdn;
2556                if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
2557                        mlx5_cmd_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn, uid);
2558                        return -EFAULT;
2559                }
2560        }
2561
2562        return 0;
2563}
2564
2565static int mlx5_ib_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
2566{
2567        struct mlx5_ib_dev *mdev = to_mdev(pd->device);
2568        struct mlx5_ib_pd *mpd = to_mpd(pd);
2569
2570        return mlx5_cmd_dealloc_pd(mdev->mdev, mpd->pdn, mpd->uid);
2571}
2572
2573static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
2574{
2575        struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
2576        struct mlx5_ib_qp *mqp = to_mqp(ibqp);
2577        int err;
2578        u16 uid;
2579
2580        uid = ibqp->pd ?
2581                to_mpd(ibqp->pd)->uid : 0;
2582
2583        if (mqp->flags & IB_QP_CREATE_SOURCE_QPN) {
2584                mlx5_ib_dbg(dev, "Attaching a multi cast group to underlay QP is not supported\n");
2585                return -EOPNOTSUPP;
2586        }
2587
2588        err = mlx5_cmd_attach_mcg(dev->mdev, gid, ibqp->qp_num, uid);
2589        if (err)
2590                mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n",
2591                             ibqp->qp_num, gid->raw);
2592
2593        return err;
2594}
2595
2596static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
2597{
2598        struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
2599        int err;
2600        u16 uid;
2601
2602        uid = ibqp->pd ?
2603                to_mpd(ibqp->pd)->uid : 0;
2604        err = mlx5_cmd_detach_mcg(dev->mdev, gid, ibqp->qp_num, uid);
2605        if (err)
2606                mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n",
2607                             ibqp->qp_num, gid->raw);
2608
2609        return err;
2610}
2611
2612static int init_node_data(struct mlx5_ib_dev *dev)
2613{
2614        int err;
2615
2616        err = mlx5_query_node_desc(dev, dev->ib_dev.node_desc);
2617        if (err)
2618                return err;
2619
2620        dev->mdev->rev_id = dev->mdev->pdev->revision;
2621
2622        return mlx5_query_node_guid(dev, &dev->ib_dev.node_guid);
2623}
2624
2625static ssize_t fw_pages_show(struct device *device,
2626                             struct device_attribute *attr, char *buf)
2627{
2628        struct mlx5_ib_dev *dev =
2629                rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
2630
2631        return sprintf(buf, "%d\n", dev->mdev->priv.fw_pages);
2632}
2633static DEVICE_ATTR_RO(fw_pages);
2634
2635static ssize_t reg_pages_show(struct device *device,
2636                              struct device_attribute *attr, char *buf)
2637{
2638        struct mlx5_ib_dev *dev =
2639                rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
2640
2641        return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages));
2642}
2643static DEVICE_ATTR_RO(reg_pages);
2644
2645static ssize_t hca_type_show(struct device *device,
2646                             struct device_attribute *attr, char *buf)
2647{
2648        struct mlx5_ib_dev *dev =
2649                rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
2650
2651        return sprintf(buf, "MT%d\n", dev->mdev->pdev->device);
2652}
2653static DEVICE_ATTR_RO(hca_type);
2654
2655static ssize_t hw_rev_show(struct device *device,
2656                           struct device_attribute *attr, char *buf)
2657{
2658        struct mlx5_ib_dev *dev =
2659                rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
2660
2661        return sprintf(buf, "%x\n", dev->mdev->rev_id);
2662}
2663static DEVICE_ATTR_RO(hw_rev);
2664
2665static ssize_t board_id_show(struct device *device,
2666                             struct device_attribute *attr, char *buf)
2667{
2668        struct mlx5_ib_dev *dev =
2669                rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
2670
2671        return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN,
2672                       dev->mdev->board_id);
2673}
2674static DEVICE_ATTR_RO(board_id);
2675
2676static struct attribute *mlx5_class_attributes[] = {
2677        &dev_attr_hw_rev.attr,
2678        &dev_attr_hca_type.attr,
2679        &dev_attr_board_id.attr,
2680        &dev_attr_fw_pages.attr,
2681        &dev_attr_reg_pages.attr,
2682        NULL,
2683};
2684
2685static const struct attribute_group mlx5_attr_group = {
2686        .attrs = mlx5_class_attributes,
2687};
2688
2689static void pkey_change_handler(struct work_struct *work)
2690{
2691        struct mlx5_ib_port_resources *ports =
2692                container_of(work, struct mlx5_ib_port_resources,
2693                             pkey_change_work);
2694
2695        mlx5_ib_gsi_pkey_change(ports->gsi);
2696}
2697
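    /*
     * On a fatal device error, walk every QP on the device, collect the CQs
     * of queues that still have outstanding work and invoke their completion
     * handlers so that consumers see the flushed-in-error completions.
     */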
2698static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev)
2699{
2700        struct mlx5_ib_qp *mqp;
2701        struct mlx5_ib_cq *send_mcq, *recv_mcq;
2702        struct mlx5_core_cq *mcq;
2703        struct list_head cq_armed_list;
2704        unsigned long flags_qp;
2705        unsigned long flags_cq;
2706        unsigned long flags;
2707
2708        INIT_LIST_HEAD(&cq_armed_list);
2709
2710        /* Go over the QP list residing on this ibdev, synchronized with QP create/destroy. */
2711        spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags);
2712        list_for_each_entry(mqp, &ibdev->qp_list, qps_list) {
2713                spin_lock_irqsave(&mqp->sq.lock, flags_qp);
2714                if (mqp->sq.tail != mqp->sq.head) {
2715                        send_mcq = to_mcq(mqp->ibqp.send_cq);
2716                        spin_lock_irqsave(&send_mcq->lock, flags_cq);
2717                        if (send_mcq->mcq.comp &&
2718                            mqp->ibqp.send_cq->comp_handler) {
2719                                if (!send_mcq->mcq.reset_notify_added) {
2720                                        send_mcq->mcq.reset_notify_added = 1;
2721                                        list_add_tail(&send_mcq->mcq.reset_notify,
2722                                                      &cq_armed_list);
2723                                }
2724                        }
2725                        spin_unlock_irqrestore(&send_mcq->lock, flags_cq);
2726                }
2727                spin_unlock_irqrestore(&mqp->sq.lock, flags_qp);
2728                spin_lock_irqsave(&mqp->rq.lock, flags_qp);
2729                /* no handling is needed for SRQ */
2730                if (!mqp->ibqp.srq) {
2731                        if (mqp->rq.tail != mqp->rq.head) {
2732                                recv_mcq = to_mcq(mqp->ibqp.recv_cq);
2733                                spin_lock_irqsave(&recv_mcq->lock, flags_cq);
2734                                if (recv_mcq->mcq.comp &&
2735                                    mqp->ibqp.recv_cq->comp_handler) {
2736                                        if (!recv_mcq->mcq.reset_notify_added) {
2737                                                recv_mcq->mcq.reset_notify_added = 1;
2738                                                list_add_tail(&recv_mcq->mcq.reset_notify,
2739                                                              &cq_armed_list);
2740                                        }
2741                                }
2742                                spin_unlock_irqrestore(&recv_mcq->lock,
2743                                                       flags_cq);
2744                        }
2745                }
2746                spin_unlock_irqrestore(&mqp->rq.lock, flags_qp);
2747        }
2748        /* At this point all in-flight post-send work has been flushed out by the
2749         * lock/unlock of the above locks.  Now arm all involved CQs.
2750         */
2751        list_for_each_entry(mcq, &cq_armed_list, reset_notify) {
2752                mcq->comp(mcq, NULL);
2753        }
2754        spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags);
2755}
2756
2757static void delay_drop_handler(struct work_struct *work)
2758{
2759        int err;
2760        struct mlx5_ib_delay_drop *delay_drop =
2761                container_of(work, struct mlx5_ib_delay_drop,
2762                             delay_drop_work);
2763
2764        atomic_inc(&delay_drop->events_cnt);
2765
2766        mutex_lock(&delay_drop->lock);
2767        err = mlx5_core_set_delay_drop(delay_drop->dev, delay_drop->timeout);
2768        if (err) {
2769                mlx5_ib_warn(delay_drop->dev, "Failed to set delay drop, timeout=%u\n",
2770                             delay_drop->timeout);
2771                delay_drop->activate = false;
2772        }
2773        mutex_unlock(&delay_drop->lock);
2774}
2775
2776static void handle_general_event(struct mlx5_ib_dev *ibdev, struct mlx5_eqe *eqe,
2777                                 struct ib_event *ibev)
2778{
2779        u8 port = (eqe->data.port.port >> 4) & 0xf;
2780
2781        switch (eqe->sub_type) {
2782        case MLX5_GENERAL_SUBTYPE_DELAY_DROP_TIMEOUT:
2783                if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) ==
2784                                            IB_LINK_LAYER_ETHERNET)
2785                        schedule_work(&ibdev->delay_drop.delay_drop_work);
2786                break;
2787        default: /* do nothing */
2788                return;
2789        }
2790}
2791
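    /*
     * Translate a firmware port-change EQE into an IB event.  RoCE up/down
     * is reported through the netdev notifier instead, and a PKEY change
     * additionally schedules the GSI pkey_change_work.
     */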
2792static int handle_port_change(struct mlx5_ib_dev *ibdev, struct mlx5_eqe *eqe,
2793                              struct ib_event *ibev)
2794{
2795        u8 port = (eqe->data.port.port >> 4) & 0xf;
2796
2797        ibev->element.port_num = port;
2798
2799        switch (eqe->sub_type) {
2800        case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
2801        case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
2802        case MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED:
2803                /* In RoCE, port up/down events are handled in
2804                 * mlx5_netdev_event().
2805                 */
2806                if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) ==
2807                                            IB_LINK_LAYER_ETHERNET)
2808                        return -EINVAL;
2809
2810                ibev->event = (eqe->sub_type == MLX5_PORT_CHANGE_SUBTYPE_ACTIVE) ?
2811                                IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
2812                break;
2813
2814        case MLX5_PORT_CHANGE_SUBTYPE_LID:
2815                ibev->event = IB_EVENT_LID_CHANGE;
2816                break;
2817
2818        case MLX5_PORT_CHANGE_SUBTYPE_PKEY:
2819                ibev->event = IB_EVENT_PKEY_CHANGE;
2820                schedule_work(&ibdev->devr.ports[port - 1].pkey_change_work);
2821                break;
2822
2823        case MLX5_PORT_CHANGE_SUBTYPE_GUID:
2824                ibev->event = IB_EVENT_GID_CHANGE;
2825                break;
2826
2827        case MLX5_PORT_CHANGE_SUBTYPE_CLIENT_REREG:
2828                ibev->event = IB_EVENT_CLIENT_REREGISTER;
2829                break;
2830        default:
2831                return -EINVAL;
2832        }
2833
2834        return 0;
2835}
2836
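    /*
     * Deferred event handler, running on mlx5_ib_event_wq: translate an mlx5
     * core event into an ib_event and dispatch it to registered clients. For
     * slave ports the owning IB device is looked up through the mpi.
     */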
2837static void mlx5_ib_handle_event(struct work_struct *_work)
2838{
2839        struct mlx5_ib_event_work *work =
2840                container_of(_work, struct mlx5_ib_event_work, work);
2841        struct mlx5_ib_dev *ibdev;
2842        struct ib_event ibev;
2843        bool fatal = false;
2844
2845        if (work->is_slave) {
2846                ibdev = mlx5_ib_get_ibdev_from_mpi(work->mpi);
2847                if (!ibdev)
2848                        goto out;
2849        } else {
2850                ibdev = work->dev;
2851        }
2852
2853        switch (work->event) {
2854        case MLX5_DEV_EVENT_SYS_ERROR:
2855                ibev.event = IB_EVENT_DEVICE_FATAL;
2856                mlx5_ib_handle_internal_error(ibdev);
2857                ibev.element.port_num  = (u8)(unsigned long)work->param;
2858                fatal = true;
2859                break;
2860        case MLX5_EVENT_TYPE_PORT_CHANGE:
2861                if (handle_port_change(ibdev, work->param, &ibev))
2862                        goto out;
2863                break;
2864        case MLX5_EVENT_TYPE_GENERAL_EVENT:
2865                handle_general_event(ibdev, work->param, &ibev);
2866                fallthrough;
2867        default:
2868                goto out;
2869        }
2870
2871        ibev.device = &ibdev->ib_dev;
2872
2873        if (!rdma_is_port_valid(&ibdev->ib_dev, ibev.element.port_num)) {
2874                mlx5_ib_warn(ibdev, "warning: event on port %d\n", ibev.element.port_num);
2875                goto out;
2876        }
2877
2878        if (ibdev->ib_active)
2879                ib_dispatch_event(&ibev);
2880
2881        if (fatal)
2882                ibdev->ib_active = false;
2883out:
2884        kfree(work);
2885}
2886
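    /*
     * Notifier callback for mdev events on the native port. It may run in
     * atomic context, so the event is queued to a workqueue and handled in
     * mlx5_ib_handle_event().
     */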
2887static int mlx5_ib_event(struct notifier_block *nb,
2888                         unsigned long event, void *param)
2889{
2890        struct mlx5_ib_event_work *work;
2891
2892        work = kmalloc(sizeof(*work), GFP_ATOMIC);
2893        if (!work)
2894                return NOTIFY_DONE;
2895
2896        INIT_WORK(&work->work, mlx5_ib_handle_event);
2897        work->dev = container_of(nb, struct mlx5_ib_dev, mdev_events);
2898        work->is_slave = false;
2899        work->param = param;
2900        work->event = event;
2901
2902        queue_work(mlx5_ib_event_wq, &work->work);
2903
2904        return NOTIFY_OK;
2905}
2906
2907static int mlx5_ib_event_slave_port(struct notifier_block *nb,
2908                                    unsigned long event, void *param)
2909{
2910        struct mlx5_ib_event_work *work;
2911
2912        work = kmalloc(sizeof(*work), GFP_ATOMIC);
2913        if (!work)
2914                return NOTIFY_DONE;
2915
2916        INIT_WORK(&work->work, mlx5_ib_handle_event);
2917        work->mpi = container_of(nb, struct mlx5_ib_multiport_info, mdev_events);
2918        work->is_slave = true;
2919        work->param = param;
2920        work->event = event;
2921        queue_work(mlx5_ib_event_wq, &work->work);
2922
2923        return NOTIFY_OK;
2924}
2925
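    /*
     * Determine, per port, whether the Subnet Management Interface (QP0) is
     * supported. With IB virtualization the answer comes from the HCA vport
     * context; otherwise IB ports always have SMI.
     */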
2926static int set_has_smi_cap(struct mlx5_ib_dev *dev)
2927{
2928        struct mlx5_hca_vport_context vport_ctx;
2929        int err;
2930        int port;
2931
2932        for (port = 1; port <= ARRAY_SIZE(dev->mdev->port_caps); port++) {
2933                dev->mdev->port_caps[port - 1].has_smi = false;
2934                if (MLX5_CAP_GEN(dev->mdev, port_type) ==
2935                    MLX5_CAP_PORT_TYPE_IB) {
2936                        if (MLX5_CAP_GEN(dev->mdev, ib_virt)) {
2937                                err = mlx5_query_hca_vport_context(dev->mdev, 0,
2938                                                                   port, 0,
2939                                                                   &vport_ctx);
2940                                if (err) {
2941                                        mlx5_ib_err(dev, "query_hca_vport_context for port=%d failed %d\n",
2942                                                    port, err);
2943                                        return err;
2944                                }
2945                                dev->mdev->port_caps[port - 1].has_smi =
2946                                        vport_ctx.has_smi;
2947                        } else {
2948                                dev->mdev->port_caps[port - 1].has_smi = true;
2949                        }
2950                }
2951        }
2952        return 0;
2953}
2954
2955static void get_ext_port_caps(struct mlx5_ib_dev *dev)
2956{
2957        int port;
2958
2959        for (port = 1; port <= dev->num_ports; port++)
2960                mlx5_query_ext_port_caps(dev, port);
2961}
2962
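    /*
     * Query the device and port attributes and cache the P_Key and GID table
     * sizes in the core device's port_caps array.
     */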
2963static int __get_port_caps(struct mlx5_ib_dev *dev, u8 port)
2964{
2965        struct ib_device_attr *dprops = NULL;
2966        struct ib_port_attr *pprops = NULL;
2967        int err = -ENOMEM;
2968
2969        pprops = kzalloc(sizeof(*pprops), GFP_KERNEL);
2970        if (!pprops)
2971                goto out;
2972
2973        dprops = kmalloc(sizeof(*dprops), GFP_KERNEL);
2974        if (!dprops)
2975                goto out;
2976
2977        err = mlx5_ib_query_device(&dev->ib_dev, dprops, NULL);
2978        if (err) {
2979                mlx5_ib_warn(dev, "query_device failed %d\n", err);
2980                goto out;
2981        }
2982
2983        err = mlx5_ib_query_port(&dev->ib_dev, port, pprops);
2984        if (err) {
2985                mlx5_ib_warn(dev, "query_port %d failed %d\n",
2986                             port, err);
2987                goto out;
2988        }
2989
2990        dev->mdev->port_caps[port - 1].pkey_table_len =
2991                                        dprops->max_pkeys;
2992        dev->mdev->port_caps[port - 1].gid_table_len =
2993                                        pprops->gid_tbl_len;
2994        mlx5_ib_dbg(dev, "port %d: pkey_table_len %d, gid_table_len %d\n",
2995                    port, dprops->max_pkeys, pprops->gid_tbl_len);
2996
2997out:
2998        kfree(pprops);
2999        kfree(dprops);
3000
3001        return err;
3002}
3003
3004static int get_port_caps(struct mlx5_ib_dev *dev, u8 port)
3005{
3006        /* For representors use port 1, since this is the only native
3007         * port.
3008         */
3009        if (dev->is_rep)
3010                return __get_port_caps(dev, 1);
3011        return __get_port_caps(dev, port);
3012}
3013
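    /*
     * Map the UMR fence capability reported by firmware to the fence mode the
     * driver will use for UMR operations; unknown values fall back to strong
     * ordering.
     */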
3014static u8 mlx5_get_umr_fence(u8 umr_fence_cap)
3015{
3016        switch (umr_fence_cap) {
3017        case MLX5_CAP_UMR_FENCE_NONE:
3018                return MLX5_FENCE_MODE_NONE;
3019        case MLX5_CAP_UMR_FENCE_SMALL:
3020                return MLX5_FENCE_MODE_INITIATOR_SMALL;
3021        default:
3022                return MLX5_FENCE_MODE_STRONG_ORDERING;
3023        }
3024}
3025
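    /*
     * Create the internal device resources used by the driver: a PD (p0), a
     * CQ (c0), two XRC domains (xrcdn0/1) and two SRQs (s0 is XRC-type, s1 is
     * basic). Also initialize the per-port P_Key change work items.
     */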
3026static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev)
3027{
3028        struct mlx5_ib_resources *devr = &dev->devr;
3029        struct ib_srq_init_attr attr;
3030        struct ib_device *ibdev;
3031        struct ib_cq_init_attr cq_attr = {.cqe = 1};
3032        int port;
3033        int ret = 0;
3034
3035        ibdev = &dev->ib_dev;
3036
3037        if (!MLX5_CAP_GEN(dev->mdev, xrc))
3038                return -EOPNOTSUPP;
3039
3040        mutex_init(&devr->mutex);
3041
3042        devr->p0 = rdma_zalloc_drv_obj(ibdev, ib_pd);
3043        if (!devr->p0)
3044                return -ENOMEM;
3045
3046        devr->p0->device  = ibdev;
3047        devr->p0->uobject = NULL;
3048        atomic_set(&devr->p0->usecnt, 0);
3049
3050        ret = mlx5_ib_alloc_pd(devr->p0, NULL);
3051        if (ret)
3052                goto error0;
3053
3054        devr->c0 = rdma_zalloc_drv_obj(ibdev, ib_cq);
3055        if (!devr->c0) {
3056                ret = -ENOMEM;
3057                goto error1;
3058        }
3059
3060        devr->c0->device = &dev->ib_dev;
3061        atomic_set(&devr->c0->usecnt, 0);
3062
3063        ret = mlx5_ib_create_cq(devr->c0, &cq_attr, NULL);
3064        if (ret)
3065                goto err_create_cq;
3066
3067        ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn0, 0);
3068        if (ret)
3069                goto error2;
3070
3071        ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn1, 0);
3072        if (ret)
3073                goto error3;
3074
3075        memset(&attr, 0, sizeof(attr));
3076        attr.attr.max_sge = 1;
3077        attr.attr.max_wr = 1;
3078        attr.srq_type = IB_SRQT_XRC;
3079        attr.ext.cq = devr->c0;
3080
3081        devr->s0 = rdma_zalloc_drv_obj(ibdev, ib_srq);
3082        if (!devr->s0) {
3083                ret = -ENOMEM;
3084                goto error4;
3085        }
3086
3087        devr->s0->device        = &dev->ib_dev;
3088        devr->s0->pd            = devr->p0;
3089        devr->s0->srq_type      = IB_SRQT_XRC;
3090        devr->s0->ext.cq        = devr->c0;
3091        ret = mlx5_ib_create_srq(devr->s0, &attr, NULL);
3092        if (ret)
3093                goto err_create;
3094
3095        atomic_inc(&devr->s0->ext.cq->usecnt);
3096        atomic_inc(&devr->p0->usecnt);
3097        atomic_set(&devr->s0->usecnt, 0);
3098
3099        memset(&attr, 0, sizeof(attr));
3100        attr.attr.max_sge = 1;
3101        attr.attr.max_wr = 1;
3102        attr.srq_type = IB_SRQT_BASIC;
3103        devr->s1 = rdma_zalloc_drv_obj(ibdev, ib_srq);
3104        if (!devr->s1) {
3105                ret = -ENOMEM;
3106                goto error5;
3107        }
3108
3109        devr->s1->device        = &dev->ib_dev;
3110        devr->s1->pd            = devr->p0;
3111        devr->s1->srq_type      = IB_SRQT_BASIC;
3112        devr->s1->ext.cq        = devr->c0;
3113
3114        ret = mlx5_ib_create_srq(devr->s1, &attr, NULL);
3115        if (ret)
3116                goto error6;
3117
3118        atomic_inc(&devr->p0->usecnt);
3119        atomic_set(&devr->s1->usecnt, 0);
3120
3121        for (port = 0; port < ARRAY_SIZE(devr->ports); ++port)
3122                INIT_WORK(&devr->ports[port].pkey_change_work,
3123                          pkey_change_handler);
3124
3125        return 0;
3126
3127error6:
3128        kfree(devr->s1);
3129error5:
3130        mlx5_ib_destroy_srq(devr->s0, NULL);
3131err_create:
3132        kfree(devr->s0);
3133error4:
3134        mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn1, 0);
3135error3:
3136        mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn0, 0);
3137error2:
3138        mlx5_ib_destroy_cq(devr->c0, NULL);
3139err_create_cq:
3140        kfree(devr->c0);
3141error1:
3142        mlx5_ib_dealloc_pd(devr->p0, NULL);
3143error0:
3144        kfree(devr->p0);
3145        return ret;
3146}
3147
3148static void mlx5_ib_dev_res_cleanup(struct mlx5_ib_dev *dev)
3149{
3150        struct mlx5_ib_resources *devr = &dev->devr;
3151        int port;
3152
3153        mlx5_ib_destroy_srq(devr->s1, NULL);
3154        kfree(devr->s1);
3155        mlx5_ib_destroy_srq(devr->s0, NULL);
3156        kfree(devr->s0);
3157        mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn1, 0);
3158        mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn0, 0);
3159        mlx5_ib_destroy_cq(devr->c0, NULL);
3160        kfree(devr->c0);
3161        mlx5_ib_dealloc_pd(devr->p0, NULL);
3162        kfree(devr->p0);
3163
3164        /* Make sure no P_Key change work items are still executing */
3165        for (port = 0; port < ARRAY_SIZE(devr->ports); ++port)
3166                cancel_work_sync(&devr->ports[port].pkey_change_work);
3167}
3168
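    /*
     * Compute the core port capability flags from the link layer, the vport
     * context (GRH required) and the RoCE caps: Ethernet ports get raw packet
     * support unless multi-port is enabled, and RoCE v1/v2 only if both IPv4
     * and IPv6 L3 types are supported.
     */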
3169static u32 get_core_cap_flags(struct ib_device *ibdev,
3170                              struct mlx5_hca_vport_context *rep)
3171{
3172        struct mlx5_ib_dev *dev = to_mdev(ibdev);
3173        enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1);
3174        u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type);
3175        u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version);
3176        bool raw_support = !mlx5_core_mp_enabled(dev->mdev);
3177        u32 ret = 0;
3178
3179        if (rep->grh_required)
3180                ret |= RDMA_CORE_CAP_IB_GRH_REQUIRED;
3181
3182        if (ll == IB_LINK_LAYER_INFINIBAND)
3183                return ret | RDMA_CORE_PORT_IBA_IB;
3184
3185        if (raw_support)
3186                ret |= RDMA_CORE_PORT_RAW_PACKET;
3187
3188        if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP))
3189                return ret;
3190
3191        if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP))
3192                return ret;
3193
3194        if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP)
3195                ret |= RDMA_CORE_PORT_IBA_ROCE;
3196
3197        if (roce_version_cap & MLX5_ROCE_VERSION_2_CAP)
3198                ret |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
3199
3200        return ret;
3201}
3202
3203static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num,
3204                               struct ib_port_immutable *immutable)
3205{
3206        struct ib_port_attr attr;
3207        struct mlx5_ib_dev *dev = to_mdev(ibdev);
3208        enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, port_num);
3209        struct mlx5_hca_vport_context rep = {0};
3210        int err;
3211
3212        err = ib_query_port(ibdev, port_num, &attr);
3213        if (err)
3214                return err;
3215
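            /* On IB ports the GRH-required flag comes from the HCA vport context. */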
3216        if (ll == IB_LINK_LAYER_INFINIBAND) {
3217                err = mlx5_query_hca_vport_context(dev->mdev, 0, port_num, 0,
3218                                                   &rep);
3219                if (err)
3220                        return err;
3221        }
3222
3223        immutable->pkey_tbl_len = attr.pkey_tbl_len;
3224        immutable->gid_tbl_len = attr.gid_tbl_len;
3225        immutable->core_cap_flags = get_core_cap_flags(ibdev, &rep);
3226        immutable->max_mad_size = IB_MGMT_MAD_SIZE;
3227
3228        return 0;
3229}
3230
3231static int mlx5_port_rep_immutable(struct ib_device *ibdev, u8 port_num,
3232                                   struct ib_port_immutable *immutable)
3233{
3234        struct ib_port_attr attr;
3235        int err;
3236
3237        immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET;
3238
3239        err = ib_query_port(ibdev, port_num, &attr);
3240        if (err)
3241                return err;
3242
3243        immutable->pkey_tbl_len = attr.pkey_tbl_len;
3244        immutable->gid_tbl_len = attr.gid_tbl_len;
3245        immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET;
3246
3247        return 0;
3248}
3249
3250static void get_dev_fw_str(struct ib_device *ibdev, char *str)
3251{
3252        struct mlx5_ib_dev *dev =
3253                container_of(ibdev, struct mlx5_ib_dev, ib_dev);
3254        snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%04d",
3255                 fw_rev_maj(dev->mdev), fw_rev_min(dev->mdev),
3256                 fw_rev_sub(dev->mdev));
3257}
3258
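    /*
     * When RoCE LAG is active, create the vport LAG and a LAG demux flow
     * table so received traffic can be demultiplexed to the correct port.
     */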
3259static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev)
3260{
3261        struct mlx5_core_dev *mdev = dev->mdev;
3262        struct mlx5_flow_namespace *ns = mlx5_get_flow_namespace(mdev,
3263                                                                 MLX5_FLOW_NAMESPACE_LAG);
3264        struct mlx5_flow_table *ft;
3265        int err;
3266
3267        if (!ns || !mlx5_lag_is_roce(mdev))
3268                return 0;
3269
3270        err = mlx5_cmd_create_vport_lag(mdev);
3271        if (err)
3272                return err;
3273
3274        ft = mlx5_create_lag_demux_flow_table(ns, 0, 0);
3275        if (IS_ERR(ft)) {
3276                err = PTR_ERR(ft);
3277                goto err_destroy_vport_lag;
3278        }
3279
3280        dev->flow_db->lag_demux_ft = ft;
3281        dev->lag_active = true;
3282        return 0;
3283
3284err_destroy_vport_lag:
3285        mlx5_cmd_destroy_vport_lag(mdev);
3286        return err;
3287}
3288
3289static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev)
3290{
3291        struct mlx5_core_dev *mdev = dev->mdev;
3292
3293        if (dev->lag_active) {
3294                dev->lag_active = false;
3295
3296                mlx5_destroy_flow_table(dev->flow_db->lag_demux_ft);
3297                dev->flow_db->lag_demux_ft = NULL;
3298
3299                mlx5_cmd_destroy_vport_lag(mdev);
3300        }
3301}
3302
3303static int mlx5_add_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num)
3304{
3305        int err;
3306
3307        dev->port[port_num].roce.nb.notifier_call = mlx5_netdev_event;
3308        err = register_netdevice_notifier_net(mlx5_core_net(dev->mdev),
3309                                              &dev->port[port_num].roce.nb);
3310        if (err) {
3311                dev->port[port_num].roce.nb.notifier_call = NULL;
3312                return err;
3313        }
3314
3315        return 0;
3316}
3317
3318static void mlx5_remove_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num)
3319{
3320        if (dev->port[port_num].roce.nb.notifier_call) {
3321                unregister_netdevice_notifier_net(mlx5_core_net(dev->mdev),
3322                                                  &dev->port[port_num].roce.nb);
3323                dev->port[port_num].roce.nb.notifier_call = NULL;
3324        }
3325}
3326
3327static int mlx5_enable_eth(struct mlx5_ib_dev *dev)
3328{
3329        int err;
3330
3331        err = mlx5_nic_vport_enable_roce(dev->mdev);
3332        if (err)
3333                return err;
3334
3335        err = mlx5_eth_lag_init(dev);
3336        if (err)
3337                goto err_disable_roce;
3338
3339        return 0;
3340
3341err_disable_roce:
3342        mlx5_nic_vport_disable_roce(dev->mdev);
3343
3344        return err;
3345}
3346
3347static void mlx5_disable_eth(struct mlx5_ib_dev *dev)
3348{
3349        mlx5_eth_lag_cleanup(dev);
3350        mlx5_nic_vport_disable_roce(dev->mdev);
3351}
3352
3353static int mlx5_ib_rn_get_params(struct ib_device *device, u8 port_num,
3354                                 enum rdma_netdev_t type,
3355                                 struct rdma_netdev_alloc_params *params)
3356{
3357        if (type != RDMA_NETDEV_IPOIB)
3358                return -EOPNOTSUPP;
3359
3360        return mlx5_rdma_rn_get_params(to_mdev(device)->mdev, device, params);
3361}
3362
3363static ssize_t delay_drop_timeout_read(struct file *filp, char __user *buf,
3364                                       size_t count, loff_t *pos)
3365{
3366        struct mlx5_ib_delay_drop *delay_drop = filp->private_data;
3367        char lbuf[20];
3368        int len;
3369
3370        len = snprintf(lbuf, sizeof(lbuf), "%u\n", delay_drop->timeout);
3371        return simple_read_from_buffer(buf, count, pos, lbuf, len);
3372}
3373
3374static ssize_t delay_drop_timeout_write(struct file *filp, const char __user *buf,
3375                                        size_t count, loff_t *pos)
3376{
3377        struct mlx5_ib_delay_drop *delay_drop = filp->private_data;
3378        u32 timeout;
3379        u32 var;
3380
3381        if (kstrtouint_from_user(buf, count, 0, &var))
3382                return -EFAULT;
3383
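            /* Round up to a multiple of 100 usec and clamp to the maximum timeout. */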
3384        timeout = min_t(u32, roundup(var, 100), MLX5_MAX_DELAY_DROP_TIMEOUT_MS *
3385                        1000);
3386        if (timeout != var)
3387                mlx5_ib_dbg(delay_drop->dev, "Round delay drop timeout to %u usec\n",
3388                            timeout);
3389
3390        delay_drop->timeout = timeout;
3391
3392        return count;
3393}
3394
3395static const struct file_operations fops_delay_drop_timeout = {
3396        .owner  = THIS_MODULE,
3397        .open   = simple_open,
3398        .write  = delay_drop_timeout_write,
3399        .read   = delay_drop_timeout_read,
3400};
3401
3402static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev,
3403                                      struct mlx5_ib_multiport_info *mpi)
3404{
3405        u8 port_num = mlx5_core_native_port_num(mpi->mdev) - 1;
3406        struct mlx5_ib_port *port = &ibdev->port[port_num];
3407        int comps;
3408        int err;
3409        int i;
3410
3411        lockdep_assert_held(&mlx5_ib_multiport_mutex);
3412
3413        mlx5_ib_cleanup_cong_debugfs(ibdev, port_num);
3414
3415        spin_lock(&port->mp.mpi_lock);
3416        if (!mpi->ibdev) {
3417                spin_unlock(&port->mp.mpi_lock);
3418                return;
3419        }
3420
3421        mpi->ibdev = NULL;
3422
3423        spin_unlock(&port->mp.mpi_lock);
3424        if (mpi->mdev_events.notifier_call)
3425                mlx5_notifier_unregister(mpi->mdev, &mpi->mdev_events);
3426        mpi->mdev_events.notifier_call = NULL;
3427        mlx5_remove_netdev_notifier(ibdev, port_num);
3428        spin_lock(&port->mp.mpi_lock);
3429
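            /*
             * Wait for all outstanding users of the slave mdev to drop their
             * references before completing the unaffiliation.
             */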
3430        comps = mpi->mdev_refcnt;
3431        if (comps) {
3432                mpi->unaffiliate = true;
3433                init_completion(&mpi->unref_comp);
3434                spin_unlock(&port->mp.mpi_lock);
3435
3436                for (i = 0; i < comps; i++)
3437                        wait_for_completion(&mpi->unref_comp);
3438
3439                spin_lock(&port->mp.mpi_lock);
3440                mpi->unaffiliate = false;
3441        }
3442
3443        port->mp.mpi = NULL;
3444
3445        list_add_tail(&mpi->list, &mlx5_ib_unaffiliated_port_list);
3446
3447        spin_unlock(&port->mp.mpi_lock);
3448
3449        err = mlx5_nic_vport_unaffiliate_multiport(mpi->mdev);
3450
3451        mlx5_ib_dbg(ibdev, "unaffiliated port %d\n", port_num + 1);
3452        /* Even if unaffiliation fails, the pointers were already cleaned up and
3453         * the mpi put back on the unaffiliated list above, so just log the error.
3454         */
3455        if (err)
3456                mlx5_ib_err(ibdev, "Failed to unaffiliate port %u\n",
3457                            port_num + 1);
3458
3459        ibdev->port[port_num].roce.last_port_state = IB_PORT_DOWN;
3460}
3461
3462static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev,
3463                                    struct mlx5_ib_multiport_info *mpi)
3464{
3465        u8 port_num = mlx5_core_native_port_num(mpi->mdev) - 1;
3466        int err;
3467
3468        lockdep_assert_held(&mlx5_ib_multiport_mutex);
3469
3470        spin_lock(&ibdev->port[port_num].mp.mpi_lock);
3471        if (ibdev->port[port_num].mp.mpi) {
3472                mlx5_ib_dbg(ibdev, "port %d already affiliated.\n",
3473                            port_num + 1);
3474                spin_unlock(&ibdev->port[port_num].mp.mpi_lock);
3475                return false;
3476        }
3477
3478        ibdev->port[port_num].mp.mpi = mpi;
3479        mpi->ibdev = ibdev;
3480        mpi->mdev_events.notifier_call = NULL;
3481        spin_unlock(&ibdev->port[port_num].mp.mpi_lock);
3482
3483        err = mlx5_nic_vport_affiliate_multiport(ibdev->mdev, mpi->mdev);
3484        if (err)
3485                goto unbind;
3486
3487        err = get_port_caps(ibdev, mlx5_core_native_port_num(mpi->mdev));
3488        if (err)
3489                goto unbind;
3490
3491        err = mlx5_add_netdev_notifier(ibdev, port_num);
3492        if (err) {
3493                mlx5_ib_err(ibdev, "failed adding netdev notifier for port %u\n",
3494                            port_num + 1);
3495                goto unbind;
3496        }
3497
3498        mpi->mdev_events.notifier_call = mlx5_ib_event_slave_port;
3499        mlx5_notifier_register(mpi->mdev, &mpi->mdev_events);
3500
3501        mlx5_ib_init_cong_debugfs(ibdev, port_num);
3502
3503        return true;
3504
3505unbind:
3506        mlx5_ib_unbind_slave_port(ibdev, mpi);
3507        return false;
3508}
3509
3510static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev)
3511{
3512        int port_num = mlx5_core_native_port_num(dev->mdev) - 1;
3513        enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev,
3514                                                          port_num + 1);
3515        struct mlx5_ib_multiport_info *mpi;
3516        int err;
3517        int i;
3518
3519        if (!mlx5_core_is_mp_master(dev->mdev) || ll != IB_LINK_LAYER_ETHERNET)
3520                return 0;
3521
3522        err = mlx5_query_nic_vport_system_image_guid(dev->mdev,
3523                                                     &dev->sys_image_guid);
3524        if (err)
3525                return err;
3526
3527        err = mlx5_nic_vport_enable_roce(dev->mdev);
3528        if (err)
3529                return err;
3530
3531        mutex_lock(&mlx5_ib_multiport_mutex);
3532        for (i = 0; i < dev->num_ports; i++) {
3533                bool bound = false;
3534
3535                /* build a stub multiport info struct for the native port. */
3536                if (i == port_num) {
3537                        mpi = kzalloc(sizeof(*mpi), GFP_KERNEL);
3538                        if (!mpi) {
3539                                mutex_unlock(&mlx5_ib_multiport_mutex);
3540                                mlx5_nic_vport_disable_roce(dev->mdev);
3541                                return -ENOMEM;
3542                        }
3543
3544                        mpi->is_master = true;
3545                        mpi->mdev = dev->mdev;
3546                        mpi->sys_image_guid = dev->sys_image_guid;
3547                        dev->port[i].mp.mpi = mpi;
3548                        mpi->ibdev = dev;
3549                        mpi = NULL;
3550                        continue;
3551                }
3552
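                    /*
                     * Try to bind an already-probed, unaffiliated slave port
                     * that belongs to the same system image GUID.
                     */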
3553                list_for_each_entry(mpi, &mlx5_ib_unaffiliated_port_list,
3554                                    list) {
3555                        if (dev->sys_image_guid == mpi->sys_image_guid &&
3556                            (mlx5_core_native_port_num(mpi->mdev) - 1) == i) {
3557                                bound = mlx5_ib_bind_slave_port(dev, mpi);
3558                        }
3559
3560                        if (bound) {
3561                                dev_dbg(mpi->mdev->device,
3562                                        "removing port from unaffiliated list.\n");
3563                                mlx5_ib_dbg(dev, "port %d bound\n", i + 1);
3564                                list_del(&mpi->list);
3565                                break;
3566                        }
3567                }
3568                if (!bound) {
3569                        get_port_caps(dev, i + 1);
3570                        mlx5_ib_dbg(dev, "no free port found for port %d\n",
3571                                    i + 1);
3572                }
3573        }
3574
3575        list_add_tail(&dev->ib_dev_list, &mlx5_ib_dev_list);
3576        mutex_unlock(&mlx5_ib_multiport_mutex);
3577        return err;
3578}
3579
3580static void mlx5_ib_cleanup_multiport_master(struct mlx5_ib_dev *dev)
3581{
3582        int port_num = mlx5_core_native_port_num(dev->mdev) - 1;
3583        enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev,
3584                                                          port_num + 1);
3585        int i;
3586
3587        if (!mlx5_core_is_mp_master(dev->mdev) || ll != IB_LINK_LAYER_ETHERNET)
3588                return;
3589
3590        mutex_lock(&mlx5_ib_multiport_mutex);
3591        for (i = 0; i < dev->num_ports; i++) {
3592                if (dev->port[i].mp.mpi) {
3593                        /* Destroy the native port stub */
3594                        if (i == port_num) {
3595                                kfree(dev->port[i].mp.mpi);
3596                                dev->port[i].mp.mpi = NULL;
3597                        } else {
3598                                mlx5_ib_dbg(dev, "unbinding port_num: %d\n", i + 1);
3599                                mlx5_ib_unbind_slave_port(dev, dev->port[i].mp.mpi);
3600                        }
3601                }
3602        }
3603
3604        mlx5_ib_dbg(dev, "removing from devlist\n");
3605        list_del(&dev->ib_dev_list);
3606        mutex_unlock(&mlx5_ib_multiport_mutex);
3607
3608        mlx5_nic_vport_disable_roce(dev->mdev);
3609}
3610
3611static int mmap_obj_cleanup(struct ib_uobject *uobject,
3612                            enum rdma_remove_reason why,
3613                            struct uverbs_attr_bundle *attrs)
3614{
3615        struct mlx5_user_mmap_entry *obj = uobject->object;
3616
3617        rdma_user_mmap_entry_remove(&obj->rdma_entry);
3618        return 0;
3619}
3620
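    /*
     * Insert a user mmap entry into the driver-reserved page-offset range
     * (MLX5_IB_MMAP_OFFSET_START..MLX5_IB_MMAP_OFFSET_END, shifted by 16 bits).
     */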
3621static int mlx5_rdma_user_mmap_entry_insert(struct mlx5_ib_ucontext *c,
3622                                            struct mlx5_user_mmap_entry *entry,
3623                                            size_t length)
3624{
3625        return rdma_user_mmap_entry_insert_range(
3626                &c->ibucontext, &entry->rdma_entry, length,
3627                (MLX5_IB_MMAP_OFFSET_START << 16),
3628                ((MLX5_IB_MMAP_OFFSET_END << 16) + (1UL << 16) - 1));
3629}
3630
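    /*
     * Allocate a free entry from the device's VAR table bitmap and expose it
     * to user space through an mmap entry.
     */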
3631static struct mlx5_user_mmap_entry *
3632alloc_var_entry(struct mlx5_ib_ucontext *c)
3633{
3634        struct mlx5_user_mmap_entry *entry;
3635        struct mlx5_var_table *var_table;
3636        u32 page_idx;
3637        int err;
3638
3639        var_table = &to_mdev(c->ibucontext.device)->var_table;
3640        entry = kzalloc(sizeof(*entry), GFP_KERNEL);
3641        if (!entry)
3642                return ERR_PTR(-ENOMEM);
3643
3644        mutex_lock(&var_table->bitmap_lock);
3645        page_idx = find_first_zero_bit(var_table->bitmap,
3646                                       var_table->num_var_hw_entries);
3647        if (page_idx >= var_table->num_var_hw_entries) {
3648                err = -ENOSPC;
3649                mutex_unlock(&var_table->bitmap_lock);
3650                goto end;
3651        }
3652
3653        set_bit(page_idx, var_table->bitmap);
3654        mutex_unlock(&var_table->bitmap_lock);
3655
3656        entry->address = var_table->hw_start_addr +
3657                                (page_idx * var_table->stride_size);
3658        entry->page_idx = page_idx;
3659        entry->mmap_flag = MLX5_IB_MMAP_TYPE_VAR;
3660
3661        err = mlx5_rdma_user_mmap_entry_insert(c, entry,
3662                                               var_table->stride_size);
3663        if (err)
3664                goto err_insert;
3665
3666        return entry;
3667
3668err_insert:
3669        mutex_lock(&var_table->bitmap_lock);
3670        clear_bit(page_idx, var_table->bitmap);
3671        mutex_unlock(&var_table->bitmap_lock);
3672end:
3673        kfree(entry);
3674        return ERR_PTR(err);
3675}
3676
3677static int UVERBS_HANDLER(MLX5_IB_METHOD_VAR_OBJ_ALLOC)(
3678        struct uverbs_attr_bundle *attrs)
3679{
3680        struct ib_uobject *uobj = uverbs_attr_get_uobject(
3681                attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_HANDLE);
3682        struct mlx5_ib_ucontext *c;
3683        struct mlx5_user_mmap_entry *entry;
3684        u64 mmap_offset;
3685        u32 length;
3686        int err;
3687
3688        c = to_mucontext(ib_uverbs_get_ucontext(attrs));
3689        if (IS_ERR(c))
3690                return PTR_ERR(c);
3691
3692        entry = alloc_var_entry(c);
3693        if (IS_ERR(entry))
3694                return PTR_ERR(entry);
3695
3696        mmap_offset = mlx5_entry_to_mmap_offset(entry);
3697        length = entry->rdma_entry.npages * PAGE_SIZE;
3698        uobj->object = entry;
3699        uverbs_finalize_uobj_create(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_HANDLE);
3700
3701        err = uverbs_copy_to(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_OFFSET,
3702                             &mmap_offset, sizeof(mmap_offset));
3703        if (err)
3704                return err;
3705
3706        err = uverbs_copy_to(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_PAGE_ID,
3707                             &entry->page_idx, sizeof(entry->page_idx));
3708        if (err)
3709                return err;
3710
3711        err = uverbs_copy_to(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_LENGTH,
3712                             &length, sizeof(length));
3713        return err;
3714}
3715
3716DECLARE_UVERBS_NAMED_METHOD(
3717        MLX5_IB_METHOD_VAR_OBJ_ALLOC,
3718        UVERBS_ATTR_IDR(MLX5_IB_ATTR_VAR_OBJ_ALLOC_HANDLE,
3719                        MLX5_IB_OBJECT_VAR,
3720                        UVERBS_ACCESS_NEW,
3721                        UA_MANDATORY),
3722        UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_VAR_OBJ_ALLOC_PAGE_ID,
3723                           UVERBS_ATTR_TYPE(u32),
3724                           UA_MANDATORY),
3725        UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_LENGTH,
3726                           UVERBS_ATTR_TYPE(u32),
3727                           UA_MANDATORY),
3728        UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_OFFSET,
3729                            UVERBS_ATTR_TYPE(u64),
3730                            UA_MANDATORY));
3731
3732DECLARE_UVERBS_NAMED_METHOD_DESTROY(
3733        MLX5_IB_METHOD_VAR_OBJ_DESTROY,
3734        UVERBS_ATTR_IDR(MLX5_IB_ATTR_VAR_OBJ_DESTROY_HANDLE,
3735                        MLX5_IB_OBJECT_VAR,
3736                        UVERBS_ACCESS_DESTROY,
3737                        UA_MANDATORY));
3738
3739DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_VAR,
3740                            UVERBS_TYPE_ALLOC_IDR(mmap_obj_cleanup),
3741                            &UVERBS_METHOD(MLX5_IB_METHOD_VAR_OBJ_ALLOC),
3742                            &UVERBS_METHOD(MLX5_IB_METHOD_VAR_OBJ_DESTROY));
3743
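    /*
     * The VAR uobject is exposed only when the device reports the virtio-net
     * Q general object capability.
     */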
3744static bool var_is_supported(struct ib_device *device)
3745{
3746        struct mlx5_ib_dev *dev = to_mdev(device);
3747
3748        return (MLX5_CAP_GEN_64(dev->mdev, general_obj_types) &
3749                        MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q);
3750}
3751
3752static struct mlx5_user_mmap_entry *
3753alloc_uar_entry(struct mlx5_ib_ucontext *c,
3754                enum mlx5_ib_uapi_uar_alloc_type alloc_type)
3755{
3756        struct mlx5_user_mmap_entry *entry;
3757        struct mlx5_ib_dev *dev;
3758        u32 uar_index;
3759        int err;
3760
3761        entry = kzalloc(sizeof(*entry), GFP_KERNEL);
3762        if (!entry)
3763                return ERR_PTR(-ENOMEM);
3764
3765        dev = to_mdev(c->ibucontext.device);
3766        err = mlx5_cmd_alloc_uar(dev->mdev, &uar_index);
3767        if (err)
3768                goto end;
3769
3770        entry->page_idx = uar_index;
3771        entry->address = uar_index2paddress(dev, uar_index);
3772        if (alloc_type == MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF)
3773                entry->mmap_flag = MLX5_IB_MMAP_TYPE_UAR_WC;
3774        else
3775                entry->mmap_flag = MLX5_IB_MMAP_TYPE_UAR_NC;
3776
3777        err = mlx5_rdma_user_mmap_entry_insert(c, entry, PAGE_SIZE);
3778        if (err)
3779                goto err_insert;
3780
3781        return entry;
3782
3783err_insert:
3784        mlx5_cmd_free_uar(dev->mdev, uar_index);
3785end:
3786        kfree(entry);
3787        return ERR_PTR(err);
3788}
3789
3790static int UVERBS_HANDLER(MLX5_IB_METHOD_UAR_OBJ_ALLOC)(
3791        struct uverbs_attr_bundle *attrs)
3792{
3793        struct ib_uobject *uobj = uverbs_attr_get_uobject(
3794                attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_HANDLE);
3795        enum mlx5_ib_uapi_uar_alloc_type alloc_type;
3796        struct mlx5_ib_ucontext *c;
3797        struct mlx5_user_mmap_entry *entry;
3798        u64 mmap_offset;
3799        u32 length;
3800        int err;
3801
3802        c = to_mucontext(ib_uverbs_get_ucontext(attrs));
3803        if (IS_ERR(c))
3804                return PTR_ERR(c);
3805
3806        err = uverbs_get_const(&alloc_type, attrs,
3807                               MLX5_IB_ATTR_UAR_OBJ_ALLOC_TYPE);
3808        if (err)
3809                return err;
3810
3811        if (alloc_type != MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF &&
3812            alloc_type != MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC)
3813                return -EOPNOTSUPP;
3814
3815        if (!to_mdev(c->ibucontext.device)->wc_support &&
3816            alloc_type == MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF)
3817                return -EOPNOTSUPP;
3818
3819        entry = alloc_uar_entry(c, alloc_type);
3820        if (IS_ERR(entry))
3821                return PTR_ERR(entry);
3822
3823        mmap_offset = mlx5_entry_to_mmap_offset(entry);
3824        length = entry->rdma_entry.npages * PAGE_SIZE;
3825        uobj->object = entry;
3826        uverbs_finalize_uobj_create(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_HANDLE);
3827
3828        err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_OFFSET,
3829                             &mmap_offset, sizeof(mmap_offset));
3830        if (err)
3831                return err;
3832
3833        err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_PAGE_ID,
3834                             &entry->page_idx, sizeof(entry->page_idx));
3835        if (err)
3836                return err;
3837
3838        err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_LENGTH,
3839                             &length, sizeof(length));
3840        return err;
3841}
3842
3843DECLARE_UVERBS_NAMED_METHOD(
3844        MLX5_IB_METHOD_UAR_OBJ_ALLOC,
3845        UVERBS_ATTR_IDR(MLX5_IB_ATTR_UAR_OBJ_ALLOC_HANDLE,
3846                        MLX5_IB_OBJECT_UAR,
3847                        UVERBS_ACCESS_NEW,
3848                        UA_MANDATORY),
3849        UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_UAR_OBJ_ALLOC_TYPE,
3850                             enum mlx5_ib_uapi_uar_alloc_type,
3851                             UA_MANDATORY),
3852        UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_UAR_OBJ_ALLOC_PAGE_ID,
3853                           UVERBS_ATTR_TYPE(u32),
3854                           UA_MANDATORY),
3855        UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_LENGTH,
3856                           UVERBS_ATTR_TYPE(u32),
3857                           UA_MANDATORY),
3858        UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_OFFSET,
3859                            UVERBS_ATTR_TYPE(u64),
3860                            UA_MANDATORY));
3861
3862DECLARE_UVERBS_NAMED_METHOD_DESTROY(
3863        MLX5_IB_METHOD_UAR_OBJ_DESTROY,
3864        UVERBS_ATTR_IDR(MLX5_IB_ATTR_UAR_OBJ_DESTROY_HANDLE,
3865                        MLX5_IB_OBJECT_UAR,
3866                        UVERBS_ACCESS_DESTROY,
3867                        UA_MANDATORY));
3868
3869DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_UAR,
3870                            UVERBS_TYPE_ALLOC_IDR(mmap_obj_cleanup),
3871                            &UVERBS_METHOD(MLX5_IB_METHOD_UAR_OBJ_ALLOC),
3872                            &UVERBS_METHOD(MLX5_IB_METHOD_UAR_OBJ_DESTROY));
3873
3874ADD_UVERBS_ATTRIBUTES_SIMPLE(
3875        mlx5_ib_dm,
3876        UVERBS_OBJECT_DM,
3877        UVERBS_METHOD_DM_ALLOC,
3878        UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET,
3879                            UVERBS_ATTR_TYPE(u64),
3880                            UA_MANDATORY),
3881        UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX,
3882                            UVERBS_ATTR_TYPE(u16),
3883                            UA_OPTIONAL),
3884        UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_ALLOC_DM_REQ_TYPE,
3885                             enum mlx5_ib_uapi_dm_type,
3886                             UA_OPTIONAL));
3887
3888ADD_UVERBS_ATTRIBUTES_SIMPLE(
3889        mlx5_ib_flow_action,
3890        UVERBS_OBJECT_FLOW_ACTION,
3891        UVERBS_METHOD_FLOW_ACTION_ESP_CREATE,
3892        UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS,
3893                             enum mlx5_ib_uapi_flow_action_flags));
3894
3895ADD_UVERBS_ATTRIBUTES_SIMPLE(
3896        mlx5_ib_query_context,
3897        UVERBS_OBJECT_DEVICE,
3898        UVERBS_METHOD_QUERY_CONTEXT,
3899        UVERBS_ATTR_PTR_OUT(
3900                MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX,
3901                UVERBS_ATTR_STRUCT(struct mlx5_ib_alloc_ucontext_resp,
3902                                   dump_fill_mkey),
3903                UA_MANDATORY));
3904
3905static const struct uapi_definition mlx5_ib_defs[] = {
3906        UAPI_DEF_CHAIN(mlx5_ib_devx_defs),
3907        UAPI_DEF_CHAIN(mlx5_ib_flow_defs),
3908        UAPI_DEF_CHAIN(mlx5_ib_qos_defs),
3909        UAPI_DEF_CHAIN(mlx5_ib_std_types_defs),
3910
3911        UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION,
3912                                &mlx5_ib_flow_action),
3913        UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DM, &mlx5_ib_dm),
3914        UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DEVICE, &mlx5_ib_query_context),
3915        UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_VAR,
3916                                UAPI_DEF_IS_OBJ_SUPPORTED(var_is_supported)),
3917        UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_UAR),
3918        {}
3919};
3920
3921static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
3922{
3923        mlx5_ib_cleanup_multiport_master(dev);
3924        WARN_ON(!xa_empty(&dev->odp_mkeys));
3925        cleanup_srcu_struct(&dev->odp_srcu);
3926
3927        WARN_ON(!xa_empty(&dev->sig_mrs));
3928        WARN_ON(!bitmap_empty(dev->dm.memic_alloc_pages, MLX5_MAX_MEMIC_PAGES));
3929}
3930
3931static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
3932{
3933        struct mlx5_core_dev *mdev = dev->mdev;
3934        int err;
3935        int i;
3936
3937        for (i = 0; i < dev->num_ports; i++) {
3938                spin_lock_init(&dev->port[i].mp.mpi_lock);
3939                rwlock_init(&dev->port[i].roce.netdev_lock);
3940                dev->port[i].roce.dev = dev;
3941                dev->port[i].roce.native_port_num = i + 1;
3942                dev->port[i].roce.last_port_state = IB_PORT_DOWN;
3943        }
3944
3945        mlx5_ib_internal_fill_odp_caps(dev);
3946
3947        err = mlx5_ib_init_multiport_master(dev);
3948        if (err)
3949                return err;
3950
3951        err = set_has_smi_cap(dev);
3952        if (err)
3953                return err;
3954
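            /*
             * Without multi-port, query the caps of every port; with
             * multi-port only the native port is queried here (slave ports
             * are handled when they are bound).
             */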
3955        if (!mlx5_core_mp_enabled(mdev)) {
3956                for (i = 1; i <= dev->num_ports; i++) {
3957                        err = get_port_caps(dev, i);
3958                        if (err)
3959                                break;
3960                }
3961        } else {
3962                err = get_port_caps(dev, mlx5_core_native_port_num(mdev));
3963        }
3964        if (err)
3965                goto err_mp;
3966
3967        if (mlx5_use_mad_ifc(dev))
3968                get_ext_port_caps(dev);
3969
3970        dev->ib_dev.node_type           = RDMA_NODE_IB_CA;
3971        dev->ib_dev.local_dma_lkey      = 0 /* not supported for now */;
3972        dev->ib_dev.phys_port_cnt       = dev->num_ports;
3973        dev->ib_dev.num_comp_vectors    = mlx5_comp_vectors_count(mdev);
3974        dev->ib_dev.dev.parent          = mdev->device;
3975        dev->ib_dev.lag_flags           = RDMA_LAG_FLAGS_HASH_ALL_SLAVES;
3976
3977        mutex_init(&dev->cap_mask_mutex);
3978        INIT_LIST_HEAD(&dev->qp_list);
3979        spin_lock_init(&dev->reset_flow_resource_lock);
3980        xa_init(&dev->odp_mkeys);
3981        xa_init(&dev->sig_mrs);
3982        atomic_set(&dev->mkey_var, 0);
3983
3984        spin_lock_init(&dev->dm.lock);
3985        dev->dm.dev = mdev;
3986
3987        err = init_srcu_struct(&dev->odp_srcu);
3988        if (err)
3989                goto err_mp;
3990
3991        return 0;
3992
3993err_mp:
3994        mlx5_ib_cleanup_multiport_master(dev);
3995
3996        return err;
3997}
3998
3999static int mlx5_ib_enable_driver(struct ib_device *dev)
4000{
4001        struct mlx5_ib_dev *mdev = to_mdev(dev);
4002        int ret;
4003
4004        ret = mlx5_ib_test_wc(mdev);
4005        mlx5_ib_dbg(mdev, "Write-Combining %s",
4006                    mdev->wc_support ? "supported" : "not supported");
4007
4008        return ret;
4009}
4010
4011static const struct ib_device_ops mlx5_ib_dev_ops = {
4012        .owner = THIS_MODULE,
4013        .driver_id = RDMA_DRIVER_MLX5,
4014        .uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION,
4015
4016        .add_gid = mlx5_ib_add_gid,
4017        .alloc_mr = mlx5_ib_alloc_mr,
4018        .alloc_mr_integrity = mlx5_ib_alloc_mr_integrity,
4019        .alloc_pd = mlx5_ib_alloc_pd,
4020        .alloc_ucontext = mlx5_ib_alloc_ucontext,
4021        .attach_mcast = mlx5_ib_mcg_attach,
4022        .check_mr_status = mlx5_ib_check_mr_status,
4023        .create_ah = mlx5_ib_create_ah,
4024        .create_cq = mlx5_ib_create_cq,
4025        .create_qp = mlx5_ib_create_qp,
4026        .create_srq = mlx5_ib_create_srq,
4027        .dealloc_pd = mlx5_ib_dealloc_pd,
4028        .dealloc_ucontext = mlx5_ib_dealloc_ucontext,
4029        .del_gid = mlx5_ib_del_gid,
4030        .dereg_mr = mlx5_ib_dereg_mr,
4031        .destroy_ah = mlx5_ib_destroy_ah,
4032        .destroy_cq = mlx5_ib_destroy_cq,
4033        .destroy_qp = mlx5_ib_destroy_qp,
4034        .destroy_srq = mlx5_ib_destroy_srq,
4035        .detach_mcast = mlx5_ib_mcg_detach,
4036        .disassociate_ucontext = mlx5_ib_disassociate_ucontext,
4037        .drain_rq = mlx5_ib_drain_rq,
4038        .drain_sq = mlx5_ib_drain_sq,
4039        .enable_driver = mlx5_ib_enable_driver,
4040        .get_dev_fw_str = get_dev_fw_str,
4041        .get_dma_mr = mlx5_ib_get_dma_mr,
4042        .get_link_layer = mlx5_ib_port_link_layer,
4043        .map_mr_sg = mlx5_ib_map_mr_sg,
4044        .map_mr_sg_pi = mlx5_ib_map_mr_sg_pi,
4045        .mmap = mlx5_ib_mmap,
4046        .mmap_free = mlx5_ib_mmap_free,
4047        .modify_cq = mlx5_ib_modify_cq,
4048        .modify_device = mlx5_ib_modify_device,
4049        .modify_port = mlx5_ib_modify_port,
4050        .modify_qp = mlx5_ib_modify_qp,
4051        .modify_srq = mlx5_ib_modify_srq,
4052        .poll_cq = mlx5_ib_poll_cq,
4053        .post_recv = mlx5_ib_post_recv_nodrain,
4054        .post_send = mlx5_ib_post_send_nodrain,
4055        .post_srq_recv = mlx5_ib_post_srq_recv,
4056        .process_mad = mlx5_ib_process_mad,
4057        .query_ah = mlx5_ib_query_ah,
4058        .query_device = mlx5_ib_query_device,
4059        .query_gid = mlx5_ib_query_gid,
4060        .query_pkey = mlx5_ib_query_pkey,
4061        .query_qp = mlx5_ib_query_qp,
4062        .query_srq = mlx5_ib_query_srq,
4063        .query_ucontext = mlx5_ib_query_ucontext,
4064        .reg_user_mr = mlx5_ib_reg_user_mr,
4065        .req_notify_cq = mlx5_ib_arm_cq,
4066        .rereg_user_mr = mlx5_ib_rereg_user_mr,
4067        .resize_cq = mlx5_ib_resize_cq,
4068
4069        INIT_RDMA_OBJ_SIZE(ib_ah, mlx5_ib_ah, ibah),
4070        INIT_RDMA_OBJ_SIZE(ib_counters, mlx5_ib_mcounters, ibcntrs),
4071        INIT_RDMA_OBJ_SIZE(ib_cq, mlx5_ib_cq, ibcq),
4072        INIT_RDMA_OBJ_SIZE(ib_pd, mlx5_ib_pd, ibpd),
4073        INIT_RDMA_OBJ_SIZE(ib_srq, mlx5_ib_srq, ibsrq),
4074        INIT_RDMA_OBJ_SIZE(ib_ucontext, mlx5_ib_ucontext, ibucontext),
4075};
4076
4077static const struct ib_device_ops mlx5_ib_dev_ipoib_enhanced_ops = {
4078        .rdma_netdev_get_params = mlx5_ib_rn_get_params,
4079};
4080
4081static const struct ib_device_ops mlx5_ib_dev_sriov_ops = {
4082        .get_vf_config = mlx5_ib_get_vf_config,
4083        .get_vf_guid = mlx5_ib_get_vf_guid,
4084        .get_vf_stats = mlx5_ib_get_vf_stats,
4085        .set_vf_guid = mlx5_ib_set_vf_guid,
4086        .set_vf_link_state = mlx5_ib_set_vf_link_state,
4087};
4088
4089static const struct ib_device_ops mlx5_ib_dev_mw_ops = {
4090        .alloc_mw = mlx5_ib_alloc_mw,
4091        .dealloc_mw = mlx5_ib_dealloc_mw,
4092
4093        INIT_RDMA_OBJ_SIZE(ib_mw, mlx5_ib_mw, ibmw),
4094};
4095
4096static const struct ib_device_ops mlx5_ib_dev_xrc_ops = {
4097        .alloc_xrcd = mlx5_ib_alloc_xrcd,
4098        .dealloc_xrcd = mlx5_ib_dealloc_xrcd,
4099
4100        INIT_RDMA_OBJ_SIZE(ib_xrcd, mlx5_ib_xrcd, ibxrcd),
4101};
4102
4103static const struct ib_device_ops mlx5_ib_dev_dm_ops = {
4104        .alloc_dm = mlx5_ib_alloc_dm,
4105        .dealloc_dm = mlx5_ib_dealloc_dm,
4106        .reg_dm_mr = mlx5_ib_reg_dm_mr,
4107};
4108
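    /*
     * Compute the VAR table geometry (start address, stride and number of
     * entries) from the VDPA emulation doorbell BAR caps and allocate the
     * allocation bitmap.
     */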
4109static int mlx5_ib_init_var_table(struct mlx5_ib_dev *dev)
4110{
4111        struct mlx5_core_dev *mdev = dev->mdev;
4112        struct mlx5_var_table *var_table = &dev->var_table;
4113        u8 log_doorbell_bar_size;
4114        u8 log_doorbell_stride;
4115        u64 bar_size;
4116
4117        log_doorbell_bar_size = MLX5_CAP_DEV_VDPA_EMULATION(mdev,
4118                                        log_doorbell_bar_size);
4119        log_doorbell_stride = MLX5_CAP_DEV_VDPA_EMULATION(mdev,
4120                                        log_doorbell_stride);
4121        var_table->hw_start_addr = dev->mdev->bar_addr +
4122                                MLX5_CAP64_DEV_VDPA_EMULATION(mdev,
4123                                        doorbell_bar_offset);
4124        bar_size = (1ULL << log_doorbell_bar_size) * 4096;
4125        var_table->stride_size = 1ULL << log_doorbell_stride;
4126        var_table->num_var_hw_entries = div_u64(bar_size,
4127                                                var_table->stride_size);
4128        mutex_init(&var_table->bitmap_lock);
4129        var_table->bitmap = bitmap_zalloc(var_table->num_var_hw_entries,
4130                                          GFP_KERNEL);
4131        return (var_table->bitmap) ? 0 : -ENOMEM;
4132}
4133
4134static void mlx5_ib_stage_caps_cleanup(struct mlx5_ib_dev *dev)
4135{
4136        bitmap_free(dev->var_table.bitmap);
4137}
4138
4139static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
4140{
4141        struct mlx5_core_dev *mdev = dev->mdev;
4142        int err;
4143
4144        dev->ib_dev.uverbs_cmd_mask     =
4145                (1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
4146                (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
4147                (1ull << IB_USER_VERBS_CMD_QUERY_PORT)          |
4148                (1ull << IB_USER_VERBS_CMD_ALLOC_PD)            |
4149                (1ull << IB_USER_VERBS_CMD_DEALLOC_PD)          |
4150                (1ull << IB_USER_VERBS_CMD_CREATE_AH)           |
4151                (1ull << IB_USER_VERBS_CMD_DESTROY_AH)          |
4152                (1ull << IB_USER_VERBS_CMD_REG_MR)              |
4153                (1ull << IB_USER_VERBS_CMD_REREG_MR)            |
4154                (1ull << IB_USER_VERBS_CMD_DEREG_MR)            |
4155                (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
4156                (1ull << IB_USER_VERBS_CMD_CREATE_CQ)           |
4157                (1ull << IB_USER_VERBS_CMD_RESIZE_CQ)           |
4158                (1ull << IB_USER_VERBS_CMD_DESTROY_CQ)          |
4159                (1ull << IB_USER_VERBS_CMD_CREATE_QP)           |
4160                (1ull << IB_USER_VERBS_CMD_MODIFY_QP)           |
4161                (1ull << IB_USER_VERBS_CMD_QUERY_QP)            |
4162                (1ull << IB_USER_VERBS_CMD_DESTROY_QP)          |
4163                (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)        |
4164                (1ull << IB_USER_VERBS_CMD_DETACH_MCAST)        |
4165                (1ull << IB_USER_VERBS_CMD_CREATE_SRQ)          |
4166                (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)          |
4167                (1ull << IB_USER_VERBS_CMD_QUERY_SRQ)           |
4168                (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)         |
4169                (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ)         |
4170                (1ull << IB_USER_VERBS_CMD_OPEN_QP);
4171        dev->ib_dev.uverbs_ex_cmd_mask =
4172                (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE)     |
4173                (1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ)        |
4174                (1ull << IB_USER_VERBS_EX_CMD_CREATE_QP)        |
4175                (1ull << IB_USER_VERBS_EX_CMD_MODIFY_QP)        |
4176                (1ull << IB_USER_VERBS_EX_CMD_MODIFY_CQ)        |
4177                (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW)      |
4178                (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW);
4179
4180        if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads) &&
4181            IS_ENABLED(CONFIG_MLX5_CORE_IPOIB))
4182                ib_set_device_ops(&dev->ib_dev,
4183                                  &mlx5_ib_dev_ipoib_enhanced_ops);
4184
4185        if (mlx5_core_is_pf(mdev))
4186                ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_sriov_ops);
4187
4188        dev->umr_fence = mlx5_get_umr_fence(MLX5_CAP_GEN(mdev, umr_fence));
4189
4190        if (MLX5_CAP_GEN(mdev, imaicl)) {
4191                dev->ib_dev.uverbs_cmd_mask |=
4192                        (1ull << IB_USER_VERBS_CMD_ALLOC_MW)    |
4193                        (1ull << IB_USER_VERBS_CMD_DEALLOC_MW);
4194                ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_mw_ops);
4195        }
4196
4197        if (MLX5_CAP_GEN(mdev, xrc)) {
4198                dev->ib_dev.uverbs_cmd_mask |=
4199                        (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) |
4200                        (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
4201                ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_xrc_ops);
4202        }
4203
4204        if (MLX5_CAP_DEV_MEM(mdev, memic) ||
4205            MLX5_CAP_GEN_64(dev->mdev, general_obj_types) &
4206            MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM)
4207                ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_dm_ops);
4208
4209        ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_ops);
4210
4211        if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS))
4212                dev->ib_dev.driver_def = mlx5_ib_defs;
4213
4214        err = init_node_data(dev);
4215        if (err)
4216                return err;
4217
4218        if ((MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) &&
4219            (MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) ||
4220             MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
4221                mutex_init(&dev->lb.mutex);
4222
4223        if (MLX5_CAP_GEN_64(dev->mdev, general_obj_types) &
4224                        MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q) {
4225                err = mlx5_ib_init_var_table(dev);
4226                if (err)
4227                        return err;
4228        }
4229
4230        dev->ib_dev.use_cq_dim = true;
4231
4232        return 0;
4233}
4234
4235static const struct ib_device_ops mlx5_ib_dev_port_ops = {
4236        .get_port_immutable = mlx5_port_immutable,
4237        .query_port = mlx5_ib_query_port,
4238};
4239
4240static int mlx5_ib_stage_non_default_cb(struct mlx5_ib_dev *dev)
4241{
4242        ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_port_ops);
4243        return 0;
4244}
4245
4246static const struct ib_device_ops mlx5_ib_dev_port_rep_ops = {
4247        .get_port_immutable = mlx5_port_rep_immutable,
4248        .query_port = mlx5_ib_rep_query_port,
4249};
4250
4251static int mlx5_ib_stage_raw_eth_non_default_cb(struct mlx5_ib_dev *dev)
4252{
4253        ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_port_rep_ops);
4254        return 0;
4255}
4256
4257static const struct ib_device_ops mlx5_ib_dev_common_roce_ops = {
4258        .create_rwq_ind_table = mlx5_ib_create_rwq_ind_table,
4259        .create_wq = mlx5_ib_create_wq,
4260        .destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table,
4261        .destroy_wq = mlx5_ib_destroy_wq,
4262        .get_netdev = mlx5_ib_get_netdev,
4263        .modify_wq = mlx5_ib_modify_wq,
4264
4265        INIT_RDMA_OBJ_SIZE(ib_rwq_ind_table, mlx5_ib_rwq_ind_table,
4266                           ib_rwq_ind_tbl),
4267};
4268
4269static int mlx5_ib_roce_init(struct mlx5_ib_dev *dev)
4270{
4271        struct mlx5_core_dev *mdev = dev->mdev;
4272        enum rdma_link_layer ll;
4273        int port_type_cap;
4274        u8 port_num = 0;
4275        int err;
4276
4277        port_type_cap = MLX5_CAP_GEN(mdev, port_type);
4278        ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
4279
4280        if (ll == IB_LINK_LAYER_ETHERNET) {
4281                dev->ib_dev.uverbs_ex_cmd_mask |=
4282                        (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) |
4283                        (1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) |
4284                        (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) |
4285                        (1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) |
4286                        (1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL);
4287                ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_common_roce_ops);
4288
4289                port_num = mlx5_core_native_port_num(dev->mdev) - 1;
4290
4291                /* Register only for native ports */
4292                err = mlx5_add_netdev_notifier(dev, port_num);
4293                if (err || dev->is_rep || !mlx5_is_roce_enabled(mdev))
4294                        /*
4295                         * We don't enable the ETH interface when:
4296                         * 1. The device is an IB representor
4297                         * 2. The user disabled RoCE through the devlink interface
4298                         */
4299                        return err;
4300
4301                err = mlx5_enable_eth(dev);
4302                if (err)
4303                        goto cleanup;
4304        }
4305
4306        return 0;
4307cleanup:
4308        mlx5_remove_netdev_notifier(dev, port_num);
4309        return err;
4310}
4311
4312static void mlx5_ib_roce_cleanup(struct mlx5_ib_dev *dev)
4313{
4314        struct mlx5_core_dev *mdev = dev->mdev;
4315        enum rdma_link_layer ll;
4316        int port_type_cap;
4317        u8 port_num;
4318
4319        port_type_cap = MLX5_CAP_GEN(mdev, port_type);
4320        ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
4321
4322        if (ll == IB_LINK_LAYER_ETHERNET) {
4323                if (!dev->is_rep)
4324                        mlx5_disable_eth(dev);
4325
4326                port_num = mlx5_core_native_port_num(dev->mdev) - 1;
4327                mlx5_remove_netdev_notifier(dev, port_num);
4328        }
4329}
4330
4331static int mlx5_ib_stage_cong_debugfs_init(struct mlx5_ib_dev *dev)
4332{
4333        mlx5_ib_init_cong_debugfs(dev,
4334                                  mlx5_core_native_port_num(dev->mdev) - 1);
4335        return 0;
4336}
4337
4338static void mlx5_ib_stage_cong_debugfs_cleanup(struct mlx5_ib_dev *dev)
4339{
4340        mlx5_ib_cleanup_cong_debugfs(dev,
4341                                     mlx5_core_native_port_num(dev->mdev) - 1);
4342}
4343
4344static int mlx5_ib_stage_uar_init(struct mlx5_ib_dev *dev)
4345{
4346        dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev);
4347        return PTR_ERR_OR_ZERO(dev->mdev->priv.uar);
4348}
4349
4350static void mlx5_ib_stage_uar_cleanup(struct mlx5_ib_dev *dev)
4351{
4352        mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar);
4353}
4354
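/*
 * Allocate the regular and fast-path blue-flame registers (bfregs) used
 * for doorbells. If the fast-path allocation fails, the already-allocated
 * regular bfreg must be released before returning the error.
 */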
4355static int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev)
4356{
4357        int err;
4358
4359        err = mlx5_alloc_bfreg(dev->mdev, &dev->bfreg, false, false);
4360        if (err)
4361                return err;
4362
4363        err = mlx5_alloc_bfreg(dev->mdev, &dev->fp_bfreg, false, true);
4364        if (err)
4365                mlx5_free_bfreg(dev->mdev, &dev->bfreg);
4366
4367        return err;
4368}
4369
4370static void mlx5_ib_stage_bfrag_cleanup(struct mlx5_ib_dev *dev)
4371{
4372        mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg);
4373        mlx5_free_bfreg(dev->mdev, &dev->bfreg);
4374}
4375
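/*
 * Register the device with the RDMA core. The IB device name reflects
 * whether RoCE LAG bonding is active: "mlx5_bond_%d" when bonded,
 * "mlx5_%d" otherwise.
 */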
4376static int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev)
4377{
4378        const char *name;
4379
4380        rdma_set_device_sysfs_group(&dev->ib_dev, &mlx5_attr_group);
4381        if (!mlx5_lag_is_roce(dev->mdev))
4382                name = "mlx5_%d";
4383        else
4384                name = "mlx5_bond_%d";
4385        return ib_register_device(&dev->ib_dev, name, &dev->mdev->pdev->dev);
4386}
4387
4388static void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev)
4389{
4390        int err;
4391
4392        err = mlx5_mr_cache_cleanup(dev);
4393        if (err)
4394                mlx5_ib_warn(dev, "mr cache cleanup failed\n");
4395
4396        if (dev->umrc.qp)
4397                mlx5_ib_destroy_qp(dev->umrc.qp, NULL);
4398        if (dev->umrc.cq)
4399                ib_free_cq(dev->umrc.cq);
4400        if (dev->umrc.pd)
4401                ib_dealloc_pd(dev->umrc.pd);
4402}
4403
4404static void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev)
4405{
4406        ib_unregister_device(&dev->ib_dev);
4407}
4408
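/*
 * Resources for the UMR (memory-registration) path: a kernel-owned PD,
 * CQ and a special MLX5_IB_QPT_REG_UMR QP on which the driver posts MR
 * registration work requests. The QP is driven INIT -> RTR -> RTS below,
 * the MR cache is initialized, and the semaphore bounds the number of
 * outstanding UMR work requests to MAX_UMR_WR.
 */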
4409enum {
4410        MAX_UMR_WR = 128,
4411};
4412
4413static int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev)
4414{
4415        struct ib_qp_init_attr *init_attr = NULL;
4416        struct ib_qp_attr *attr = NULL;
4417        struct ib_pd *pd;
4418        struct ib_cq *cq;
4419        struct ib_qp *qp;
4420        int ret;
4421
4422        attr = kzalloc(sizeof(*attr), GFP_KERNEL);
4423        init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL);
4424        if (!attr || !init_attr) {
4425                ret = -ENOMEM;
4426                goto error_0;
4427        }
4428
4429        pd = ib_alloc_pd(&dev->ib_dev, 0);
4430        if (IS_ERR(pd)) {
4431                mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
4432                ret = PTR_ERR(pd);
4433                goto error_0;
4434        }
4435
4436        cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ);
4437        if (IS_ERR(cq)) {
4438                mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n");
4439                ret = PTR_ERR(cq);
4440                goto error_2;
4441        }
4442
4443        init_attr->send_cq = cq;
4444        init_attr->recv_cq = cq;
4445        init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
4446        init_attr->cap.max_send_wr = MAX_UMR_WR;
4447        init_attr->cap.max_send_sge = 1;
4448        init_attr->qp_type = MLX5_IB_QPT_REG_UMR;
4449        init_attr->port_num = 1;
4450        qp = mlx5_ib_create_qp(pd, init_attr, NULL);
4451        if (IS_ERR(qp)) {
4452                mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n");
4453                ret = PTR_ERR(qp);
4454                goto error_3;
4455        }
4456        qp->device     = &dev->ib_dev;
4457        qp->real_qp    = qp;
4458        qp->uobject    = NULL;
4459        qp->qp_type    = MLX5_IB_QPT_REG_UMR;
4460        qp->send_cq    = init_attr->send_cq;
4461        qp->recv_cq    = init_attr->recv_cq;
4462
4463        attr->qp_state = IB_QPS_INIT;
4464        attr->port_num = 1;
4465        ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX |
4466                                IB_QP_PORT, NULL);
4467        if (ret) {
4468                mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
4469                goto error_4;
4470        }
4471
4472        memset(attr, 0, sizeof(*attr));
4473        attr->qp_state = IB_QPS_RTR;
4474        attr->path_mtu = IB_MTU_256;
4475
4476        ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
4477        if (ret) {
4478                mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n");
4479                goto error_4;
4480        }
4481
4482        memset(attr, 0, sizeof(*attr));
4483        attr->qp_state = IB_QPS_RTS;
4484        ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
4485        if (ret) {
4486                mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n");
4487                goto error_4;
4488        }
4489
4490        dev->umrc.qp = qp;
4491        dev->umrc.cq = cq;
4492        dev->umrc.pd = pd;
4493
4494        sema_init(&dev->umrc.sem, MAX_UMR_WR);
4495        ret = mlx5_mr_cache_init(dev);
4496        if (ret) {
4497                mlx5_ib_warn(dev, "mr cache init failed %d\n", ret);
4498                goto error_4;
4499        }
4500
4501        kfree(attr);
4502        kfree(init_attr);
4503
4504        return 0;
4505
4506error_4:
4507        mlx5_ib_destroy_qp(qp, NULL);
4508        dev->umrc.qp = NULL;
4509
4510error_3:
4511        ib_free_cq(cq);
4512        dev->umrc.cq = NULL;
4513
4514error_2:
4515        ib_dealloc_pd(pd);
4516        dev->umrc.pd = NULL;
4517
4518error_0:
4519        kfree(attr);
4520        kfree(init_attr);
4521        return ret;
4522}
4523
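/*
 * Delay-drop stage: only set up when the device reports
 * IB_RAW_PACKET_CAP_DELAY_DROP. Initializes the timeout state and, when
 * debugfs is available, exposes the event/RQ counters and the timeout knob.
 */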
4524static int mlx5_ib_stage_delay_drop_init(struct mlx5_ib_dev *dev)
4525{
4526        struct dentry *root;
4527
4528        if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP))
4529                return 0;
4530
4531        mutex_init(&dev->delay_drop.lock);
4532        dev->delay_drop.dev = dev;
4533        dev->delay_drop.activate = false;
4534        dev->delay_drop.timeout = MLX5_MAX_DELAY_DROP_TIMEOUT_MS * 1000;
4535        INIT_WORK(&dev->delay_drop.delay_drop_work, delay_drop_handler);
4536        atomic_set(&dev->delay_drop.rqs_cnt, 0);
4537        atomic_set(&dev->delay_drop.events_cnt, 0);
4538
4539        if (!mlx5_debugfs_root)
4540                return 0;
4541
4542        root = debugfs_create_dir("delay_drop", dev->mdev->priv.dbg_root);
4543        dev->delay_drop.dir_debugfs = root;
4544
4545        debugfs_create_atomic_t("num_timeout_events", 0400, root,
4546                                &dev->delay_drop.events_cnt);
4547        debugfs_create_atomic_t("num_rqs", 0400, root,
4548                                &dev->delay_drop.rqs_cnt);
4549        debugfs_create_file("timeout", 0600, root, &dev->delay_drop,
4550                            &fops_delay_drop_timeout);
4551        return 0;
4552}
4553
4554static void mlx5_ib_stage_delay_drop_cleanup(struct mlx5_ib_dev *dev)
4555{
4556        if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP))
4557                return;
4558
4559        cancel_work_sync(&dev->delay_drop.delay_drop_work);
4560        if (!dev->delay_drop.dir_debugfs)
4561                return;
4562
4563        debugfs_remove_recursive(dev->delay_drop.dir_debugfs);
4564        dev->delay_drop.dir_debugfs = NULL;
4565}
4566
4567static int mlx5_ib_stage_dev_notifier_init(struct mlx5_ib_dev *dev)
4568{
4569        dev->mdev_events.notifier_call = mlx5_ib_event;
4570        mlx5_notifier_register(dev->mdev, &dev->mdev_events);
4571        return 0;
4572}
4573
4574static void mlx5_ib_stage_dev_notifier_cleanup(struct mlx5_ib_dev *dev)
4575{
4576        mlx5_notifier_unregister(dev->mdev, &dev->mdev_events);
4577}
4578
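/*
 * Profile/stage machinery: a profile is an ordered array of init/cleanup
 * pairs. __mlx5_ib_add() runs the init callbacks in order and, on failure,
 * __mlx5_ib_remove() unwinds only the stages that completed, in reverse
 * order.
 */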
4579void __mlx5_ib_remove(struct mlx5_ib_dev *dev,
4580                      const struct mlx5_ib_profile *profile,
4581                      int stage)
4582{
4583        dev->ib_active = false;
4584
4585        /* 'stage' is the number of stages to clean up; unwind them in reverse order */
4586        while (stage) {
4587                stage--;
4588                if (profile->stage[stage].cleanup)
4589                        profile->stage[stage].cleanup(dev);
4590        }
4591
4592        kfree(dev->port);
4593        ib_dealloc_device(&dev->ib_dev);
4594}
4595
4596void *__mlx5_ib_add(struct mlx5_ib_dev *dev,
4597                    const struct mlx5_ib_profile *profile)
4598{
4599        int err;
4600        int i;
4601
4602        dev->profile = profile;
4603
4604        for (i = 0; i < MLX5_IB_STAGE_MAX; i++) {
4605                if (profile->stage[i].init) {
4606                        err = profile->stage[i].init(dev);
4607                        if (err)
4608                                goto err_out;
4609                }
4610        }
4611
4612        dev->ib_active = true;
4613
4614        return dev;
4615
4616err_out:
4617        __mlx5_ib_remove(dev, profile, i);
4618
4619        return NULL;
4620}
4621
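/*
 * pf_profile is used for regular IB/RoCE function devices. The
 * raw_eth_profile below is the same sequence minus the ODP and delay-drop
 * stages and is selected for Ethernet devices with RoCE disabled.
 */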
4622static const struct mlx5_ib_profile pf_profile = {
4623        STAGE_CREATE(MLX5_IB_STAGE_INIT,
4624                     mlx5_ib_stage_init_init,
4625                     mlx5_ib_stage_init_cleanup),
4626        STAGE_CREATE(MLX5_IB_STAGE_FS,
4627                     mlx5_ib_fs_init,
4628                     mlx5_ib_fs_cleanup),
4629        STAGE_CREATE(MLX5_IB_STAGE_CAPS,
4630                     mlx5_ib_stage_caps_init,
4631                     mlx5_ib_stage_caps_cleanup),
4632        STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB,
4633                     mlx5_ib_stage_non_default_cb,
4634                     NULL),
4635        STAGE_CREATE(MLX5_IB_STAGE_ROCE,
4636                     mlx5_ib_roce_init,
4637                     mlx5_ib_roce_cleanup),
4638        STAGE_CREATE(MLX5_IB_STAGE_QP,
4639                     mlx5_init_qp_table,
4640                     mlx5_cleanup_qp_table),
4641        STAGE_CREATE(MLX5_IB_STAGE_SRQ,
4642                     mlx5_init_srq_table,
4643                     mlx5_cleanup_srq_table),
4644        STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
4645                     mlx5_ib_dev_res_init,
4646                     mlx5_ib_dev_res_cleanup),
4647        STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
4648                     mlx5_ib_stage_dev_notifier_init,
4649                     mlx5_ib_stage_dev_notifier_cleanup),
4650        STAGE_CREATE(MLX5_IB_STAGE_ODP,
4651                     mlx5_ib_odp_init_one,
4652                     mlx5_ib_odp_cleanup_one),
4653        STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
4654                     mlx5_ib_counters_init,
4655                     mlx5_ib_counters_cleanup),
4656        STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS,
4657                     mlx5_ib_stage_cong_debugfs_init,
4658                     mlx5_ib_stage_cong_debugfs_cleanup),
4659        STAGE_CREATE(MLX5_IB_STAGE_UAR,
4660                     mlx5_ib_stage_uar_init,
4661                     mlx5_ib_stage_uar_cleanup),
4662        STAGE_CREATE(MLX5_IB_STAGE_BFREG,
4663                     mlx5_ib_stage_bfrag_init,
4664                     mlx5_ib_stage_bfrag_cleanup),
4665        STAGE_CREATE(MLX5_IB_STAGE_PRE_IB_REG_UMR,
4666                     NULL,
4667                     mlx5_ib_stage_pre_ib_reg_umr_cleanup),
4668        STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID,
4669                     mlx5_ib_devx_init,
4670                     mlx5_ib_devx_cleanup),
4671        STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
4672                     mlx5_ib_stage_ib_reg_init,
4673                     mlx5_ib_stage_ib_reg_cleanup),
4674        STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR,
4675                     mlx5_ib_stage_post_ib_reg_umr_init,
4676                     NULL),
4677        STAGE_CREATE(MLX5_IB_STAGE_DELAY_DROP,
4678                     mlx5_ib_stage_delay_drop_init,
4679                     mlx5_ib_stage_delay_drop_cleanup),
4680        STAGE_CREATE(MLX5_IB_STAGE_RESTRACK,
4681                     mlx5_ib_restrack_init,
4682                     NULL),
4683};
4684
4685const struct mlx5_ib_profile raw_eth_profile = {
4686        STAGE_CREATE(MLX5_IB_STAGE_INIT,
4687                     mlx5_ib_stage_init_init,
4688                     mlx5_ib_stage_init_cleanup),
4689        STAGE_CREATE(MLX5_IB_STAGE_FS,
4690                     mlx5_ib_fs_init,
4691                     mlx5_ib_fs_cleanup),
4692        STAGE_CREATE(MLX5_IB_STAGE_CAPS,
4693                     mlx5_ib_stage_caps_init,
4694                     mlx5_ib_stage_caps_cleanup),
4695        STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB,
4696                     mlx5_ib_stage_raw_eth_non_default_cb,
4697                     NULL),
4698        STAGE_CREATE(MLX5_IB_STAGE_ROCE,
4699                     mlx5_ib_roce_init,
4700                     mlx5_ib_roce_cleanup),
4701        STAGE_CREATE(MLX5_IB_STAGE_QP,
4702                     mlx5_init_qp_table,
4703                     mlx5_cleanup_qp_table),
4704        STAGE_CREATE(MLX5_IB_STAGE_SRQ,
4705                     mlx5_init_srq_table,
4706                     mlx5_cleanup_srq_table),
4707        STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
4708                     mlx5_ib_dev_res_init,
4709                     mlx5_ib_dev_res_cleanup),
4710        STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
4711                     mlx5_ib_stage_dev_notifier_init,
4712                     mlx5_ib_stage_dev_notifier_cleanup),
4713        STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
4714                     mlx5_ib_counters_init,
4715                     mlx5_ib_counters_cleanup),
4716        STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS,
4717                     mlx5_ib_stage_cong_debugfs_init,
4718                     mlx5_ib_stage_cong_debugfs_cleanup),
4719        STAGE_CREATE(MLX5_IB_STAGE_UAR,
4720                     mlx5_ib_stage_uar_init,
4721                     mlx5_ib_stage_uar_cleanup),
4722        STAGE_CREATE(MLX5_IB_STAGE_BFREG,
4723                     mlx5_ib_stage_bfrag_init,
4724                     mlx5_ib_stage_bfrag_cleanup),
4725        STAGE_CREATE(MLX5_IB_STAGE_PRE_IB_REG_UMR,
4726                     NULL,
4727                     mlx5_ib_stage_pre_ib_reg_umr_cleanup),
4728        STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID,
4729                     mlx5_ib_devx_init,
4730                     mlx5_ib_devx_cleanup),
4731        STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
4732                     mlx5_ib_stage_ib_reg_init,
4733                     mlx5_ib_stage_ib_reg_cleanup),
4734        STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR,
4735                     mlx5_ib_stage_post_ib_reg_umr_init,
4736                     NULL),
4737        STAGE_CREATE(MLX5_IB_STAGE_RESTRACK,
4738                     mlx5_ib_restrack_init,
4739                     NULL),
4740};
4741
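/*
 * Multiport slave handling: allocate an mlx5_ib_multiport_info entry,
 * record the system image GUID and try to bind the slave to an existing
 * IB device with a matching GUID. If none is found yet, the entry is
 * parked on the unaffiliated list so a later master device can pick it up.
 */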
4742static void *mlx5_ib_add_slave_port(struct mlx5_core_dev *mdev)
4743{
4744        struct mlx5_ib_multiport_info *mpi;
4745        struct mlx5_ib_dev *dev;
4746        bool bound = false;
4747        int err;
4748
4749        mpi = kzalloc(sizeof(*mpi), GFP_KERNEL);
4750        if (!mpi)
4751                return NULL;
4752
4753        mpi->mdev = mdev;
4754
4755        err = mlx5_query_nic_vport_system_image_guid(mdev,
4756                                                     &mpi->sys_image_guid);
4757        if (err) {
4758                kfree(mpi);
4759                return NULL;
4760        }
4761
4762        mutex_lock(&mlx5_ib_multiport_mutex);
4763        list_for_each_entry(dev, &mlx5_ib_dev_list, ib_dev_list) {
4764                if (dev->sys_image_guid == mpi->sys_image_guid)
4765                        bound = mlx5_ib_bind_slave_port(dev, mpi);
4766
4767                if (bound) {
4768                        rdma_roce_rescan_device(&dev->ib_dev);
4769                        break;
4770                }
4771        }
4772
4773        if (!bound) {
4774                list_add_tail(&mpi->list, &mlx5_ib_unaffiliated_port_list);
4775                dev_dbg(mdev->device,
4776                        "no suitable IB device found to bind to, added to unaffiliated list.\n");
4777        }
4778        mutex_unlock(&mlx5_ib_multiport_mutex);
4779
4780        return mpi;
4781}
4782
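/*
 * Entry point from mlx5_core. Eswitch-offloads managers are handled
 * through the vport representor path, multiport Ethernet slaves are bound
 * as extra ports of an existing device, and everything else gets a full
 * mlx5_ib_dev built with either pf_profile or raw_eth_profile.
 */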
4783static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
4784{
4785        const struct mlx5_ib_profile *profile;
4786        enum rdma_link_layer ll;
4787        struct mlx5_ib_dev *dev;
4788        int port_type_cap;
4789        int num_ports;
4790
4791        if (MLX5_ESWITCH_MANAGER(mdev) &&
4792            mlx5_ib_eswitch_mode(mdev->priv.eswitch) == MLX5_ESWITCH_OFFLOADS) {
4793                if (!mlx5_core_mp_enabled(mdev))
4794                        mlx5_ib_register_vport_reps(mdev);
4795                return mdev;
4796        }
4797
4798        port_type_cap = MLX5_CAP_GEN(mdev, port_type);
4799        ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
4800
4801        if (mlx5_core_is_mp_slave(mdev) && ll == IB_LINK_LAYER_ETHERNET)
4802                return mlx5_ib_add_slave_port(mdev);
4803
4804        num_ports = max(MLX5_CAP_GEN(mdev, num_ports),
4805                        MLX5_CAP_GEN(mdev, num_vhca_ports));
4806        dev = ib_alloc_device(mlx5_ib_dev, ib_dev);
4807        if (!dev)
4808                return NULL;
4809        dev->port = kcalloc(num_ports, sizeof(*dev->port),
4810                             GFP_KERNEL);
4811        if (!dev->port) {
4812                ib_dealloc_device(&dev->ib_dev);
4813                return NULL;
4814        }
4815
4816        dev->mdev = mdev;
4817        dev->num_ports = num_ports;
4818
4819        if (ll == IB_LINK_LAYER_ETHERNET && !mlx5_is_roce_enabled(mdev))
4820                profile = &raw_eth_profile;
4821        else
4822                profile = &pf_profile;
4823
4824        return __mlx5_ib_add(dev, profile);
4825}
4826
4827static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
4828{
4829        struct mlx5_ib_multiport_info *mpi;
4830        struct mlx5_ib_dev *dev;
4831
4832        if (MLX5_ESWITCH_MANAGER(mdev) && context == mdev) {
4833                mlx5_ib_unregister_vport_reps(mdev);
4834                return;
4835        }
4836
4837        if (mlx5_core_is_mp_slave(mdev)) {
4838                mpi = context;
4839                mutex_lock(&mlx5_ib_multiport_mutex);
4840                if (mpi->ibdev)
4841                        mlx5_ib_unbind_slave_port(mpi->ibdev, mpi);
4842                list_del(&mpi->list);
4843                mutex_unlock(&mlx5_ib_multiport_mutex);
4844                kfree(mpi);
4845                return;
4846        }
4847
4848        dev = context;
4849        __mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX);
4850}
4851
4852static struct mlx5_interface mlx5_ib_interface = {
4853        .add            = mlx5_ib_add,
4854        .remove         = mlx5_ib_remove,
4855        .protocol       = MLX5_INTERFACE_PROTOCOL_IB,
4856};
4857
4858unsigned long mlx5_ib_get_xlt_emergency_page(void)
4859{
4860        mutex_lock(&xlt_emergency_page_mutex);
4861        return xlt_emergency_page;
4862}
4863
4864void mlx5_ib_put_xlt_emergency_page(void)
4865{
4866        mutex_unlock(&xlt_emergency_page_mutex);
4867}
4868
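/*
 * Module init: allocate the XLT emergency page and the ordered event
 * workqueue before registering with mlx5_core, so both exist by the time
 * the first .add() callback can run.
 */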
4869static int __init mlx5_ib_init(void)
4870{
4871        int err;
4872
4873        xlt_emergency_page = __get_free_page(GFP_KERNEL);
4874        if (!xlt_emergency_page)
4875                return -ENOMEM;
4876
4877        mutex_init(&xlt_emergency_page_mutex);
4878
4879        mlx5_ib_event_wq = alloc_ordered_workqueue("mlx5_ib_event_wq", 0);
4880        if (!mlx5_ib_event_wq) {
4881                free_page(xlt_emergency_page);
4882                return -ENOMEM;
4883        }
4884
4885        mlx5_ib_odp_init();
4886
4887        err = mlx5_register_interface(&mlx5_ib_interface);
4888
4889        return err;
4890}
4891
4892static void __exit mlx5_ib_cleanup(void)
4893{
4894        mlx5_unregister_interface(&mlx5_ib_interface);
4895        destroy_workqueue(mlx5_ib_event_wq);
4896        mutex_destroy(&xlt_emergency_page_mutex);
4897        free_page(xlt_emergency_page);
4898}
4899
4900module_init(mlx5_ib_init);
4901module_exit(mlx5_ib_cleanup);
4902