dpdk/drivers/common/mlx5/linux/mlx5_nl.c
<<
>>
Prefs
   1/* SPDX-License-Identifier: BSD-3-Clause
   2 * Copyright 2018 6WIND S.A.
   3 * Copyright 2018 Mellanox Technologies, Ltd
   4 */
   5
   6#include <errno.h>
   7#include <linux/if_link.h>
   8#include <linux/rtnetlink.h>
   9#include <linux/genetlink.h>
  10#include <net/if.h>
  11#include <rdma/rdma_netlink.h>
  12#include <stdbool.h>
  13#include <stdint.h>
  14#include <stdlib.h>
  15#include <stdalign.h>
  16#include <string.h>
  17#include <sys/socket.h>
  18#include <unistd.h>
  19
  20#include <rte_errno.h>
  21
  22#include "mlx5_nl.h"
  23#include "../mlx5_common_log.h"
  24#include "mlx5_malloc.h"
  25#ifdef HAVE_DEVLINK
  26#include <linux/devlink.h>
  27#endif
  28
  29
  30/* Size of the buffer to receive kernel messages */
  31#define MLX5_NL_BUF_SIZE (32 * 1024)
  32/* Send buffer size for the Netlink socket */
  33#define MLX5_SEND_BUF_SIZE 32768
  34/* Receive buffer size for the Netlink socket */
  35#define MLX5_RECV_BUF_SIZE 32768
  36/* Maximal physical port name length. */
  37#define MLX5_PHYS_PORT_NAME_MAX 128
  38
  39/** Parameters of VLAN devices created by driver. */
  40#define MLX5_VMWA_VLAN_DEVICE_PFX "evmlx"
  41/*
  42 * Define NDA_RTA as defined in iproute2 sources.
  43 *
  44 * see in iproute2 sources file include/libnetlink.h
  45 */
  46#ifndef MLX5_NDA_RTA
  47#define MLX5_NDA_RTA(r) \
  48        ((struct rtattr *)(((char *)(r)) + NLMSG_ALIGN(sizeof(struct ndmsg))))
  49#endif
  50/*
  51 * Define NLMSG_TAIL as defined in iproute2 sources.
  52 *
  53 * see in iproute2 sources file include/libnetlink.h
  54 */
  55#ifndef NLMSG_TAIL
  56#define NLMSG_TAIL(nmsg) \
  57        ((struct rtattr *)(((char *)(nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len)))
  58#endif
  59/*
  60 * The following definitions are normally found in rdma/rdma_netlink.h,
  61 * however they are so recent that most systems do not expose them yet.
  62 */
  63#ifndef HAVE_RDMA_NL_NLDEV
  64#define RDMA_NL_NLDEV 5
  65#endif
  66#ifndef HAVE_RDMA_NLDEV_CMD_GET
  67#define RDMA_NLDEV_CMD_GET 1
  68#endif
  69#ifndef HAVE_RDMA_NLDEV_CMD_PORT_GET
  70#define RDMA_NLDEV_CMD_PORT_GET 5
  71#endif
  72#ifndef HAVE_RDMA_NLDEV_ATTR_DEV_INDEX
  73#define RDMA_NLDEV_ATTR_DEV_INDEX 1
  74#endif
  75#ifndef HAVE_RDMA_NLDEV_ATTR_DEV_NAME
  76#define RDMA_NLDEV_ATTR_DEV_NAME 2
  77#endif
  78#ifndef HAVE_RDMA_NLDEV_ATTR_PORT_INDEX
  79#define RDMA_NLDEV_ATTR_PORT_INDEX 3
  80#endif
  81#ifndef HAVE_RDMA_NLDEV_ATTR_PORT_STATE
  82#define RDMA_NLDEV_ATTR_PORT_STATE 12
  83#endif
  84#ifndef HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX
  85#define RDMA_NLDEV_ATTR_NDEV_INDEX 50
  86#endif
  87
  88/* These are normally found in linux/if_link.h. */
  89#ifndef HAVE_IFLA_NUM_VF
  90#define IFLA_NUM_VF 21
  91#endif
  92#ifndef HAVE_IFLA_EXT_MASK
  93#define IFLA_EXT_MASK 29
  94#endif
  95#ifndef HAVE_IFLA_PHYS_SWITCH_ID
  96#define IFLA_PHYS_SWITCH_ID 36
  97#endif
  98#ifndef HAVE_IFLA_PHYS_PORT_NAME
  99#define IFLA_PHYS_PORT_NAME 38
 100#endif
 101
/*
 * Some Devlink definitions may be missing from the headers of old kernel
 * versions; provide fallback values for the ones used in this file.
 */
 106#ifndef DEVLINK_GENL_NAME
 107#define DEVLINK_GENL_NAME "devlink"
 108#endif
 109#ifndef DEVLINK_GENL_VERSION
 110#define DEVLINK_GENL_VERSION 1
 111#endif
 112#ifndef DEVLINK_ATTR_BUS_NAME
 113#define DEVLINK_ATTR_BUS_NAME 1
 114#endif
 115#ifndef DEVLINK_ATTR_DEV_NAME
 116#define DEVLINK_ATTR_DEV_NAME 2
 117#endif
 118#ifndef DEVLINK_ATTR_PARAM
 119#define DEVLINK_ATTR_PARAM 80
 120#endif
 121#ifndef DEVLINK_ATTR_PARAM_NAME
 122#define DEVLINK_ATTR_PARAM_NAME 81
 123#endif
 124#ifndef DEVLINK_ATTR_PARAM_TYPE
 125#define DEVLINK_ATTR_PARAM_TYPE 83
 126#endif
 127#ifndef DEVLINK_ATTR_PARAM_VALUES_LIST
 128#define DEVLINK_ATTR_PARAM_VALUES_LIST 84
 129#endif
 130#ifndef DEVLINK_ATTR_PARAM_VALUE
 131#define DEVLINK_ATTR_PARAM_VALUE 85
 132#endif
 133#ifndef DEVLINK_ATTR_PARAM_VALUE_DATA
 134#define DEVLINK_ATTR_PARAM_VALUE_DATA 86
 135#endif
 136#ifndef DEVLINK_ATTR_PARAM_VALUE_CMODE
 137#define DEVLINK_ATTR_PARAM_VALUE_CMODE 87
 138#endif
 139#ifndef DEVLINK_PARAM_CMODE_DRIVERINIT
 140#define DEVLINK_PARAM_CMODE_DRIVERINIT 1
 141#endif
 142#ifndef DEVLINK_CMD_RELOAD
 143#define DEVLINK_CMD_RELOAD 37
 144#endif
 145#ifndef DEVLINK_CMD_PARAM_GET
 146#define DEVLINK_CMD_PARAM_GET 38
 147#endif
 148#ifndef DEVLINK_CMD_PARAM_SET
 149#define DEVLINK_CMD_PARAM_SET 39
 150#endif
 151#ifndef NLA_FLAG
 152#define NLA_FLAG 6
 153#endif
 154
 155/* Add/remove MAC address through Netlink */
 156struct mlx5_nl_mac_addr {
 157        struct rte_ether_addr (*mac)[];
 158        /**< MAC address handled by the device. */
 159        int mac_n; /**< Number of addresses in the array. */
 160};
 161
 162#define MLX5_NL_CMD_GET_IB_NAME (1 << 0)
 163#define MLX5_NL_CMD_GET_IB_INDEX (1 << 1)
 164#define MLX5_NL_CMD_GET_NET_INDEX (1 << 2)
 165#define MLX5_NL_CMD_GET_PORT_INDEX (1 << 3)
 166#define MLX5_NL_CMD_GET_PORT_STATE (1 << 4)
 167
 168/** Data structure used by mlx5_nl_cmdget_cb(). */
 169struct mlx5_nl_port_info {
 170        const char *name; /**< IB device name (in). */
 171        uint32_t flags; /**< found attribute flags (out). */
 172        uint32_t ibindex; /**< IB device index (out). */
 173        uint32_t ifindex; /**< Network interface index (out). */
 174        uint32_t portnum; /**< IB device max port number (out). */
 175        uint16_t state; /**< IB device port state (out). */
 176};
 177
/* Counter incremented by MLX5_NL_SN_GENERATE on every use. */
uint32_t atomic_sn;

/*
 * Generate Netlink sequence number: atomically increments atomic_sn (relaxed
 * ordering) and evaluates to the incremented value.
 */
#define MLX5_NL_SN_GENERATE __atomic_add_fetch(&atomic_sn, 1, __ATOMIC_RELAXED)
 182
 183/**
 184 * Opens a Netlink socket.
 185 *
 186 * @param protocol
 187 *   Netlink protocol (e.g. NETLINK_ROUTE, NETLINK_RDMA).
 188 *
 189 * @return
 190 *   A file descriptor on success, a negative errno value otherwise and
 191 *   rte_errno is set.
 192 */
 193int
 194mlx5_nl_init(int protocol)
 195{
 196        int fd;
 197        int buf_size;
 198        socklen_t opt_size;
 199        struct sockaddr_nl local = {
 200                .nl_family = AF_NETLINK,
 201        };
 202        int ret;
 203
 204        fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, protocol);
 205        if (fd == -1) {
 206                rte_errno = errno;
 207                return -rte_errno;
 208        }
 209        opt_size = sizeof(buf_size);
 210        ret = getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &buf_size, &opt_size);
 211        if (ret == -1) {
 212                rte_errno = errno;
 213                goto error;
 214        }
 215        DRV_LOG(DEBUG, "Netlink socket send buffer: %d", buf_size);
 216        if (buf_size < MLX5_SEND_BUF_SIZE) {
 217                ret = setsockopt(fd, SOL_SOCKET, SO_SNDBUF,
 218                                 &buf_size, sizeof(buf_size));
 219                if (ret == -1) {
 220                        rte_errno = errno;
 221                        goto error;
 222                }
 223        }
 224        opt_size = sizeof(buf_size);
 225        ret = getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &buf_size, &opt_size);
 226        if (ret == -1) {
 227                rte_errno = errno;
 228                goto error;
 229        }
 230        DRV_LOG(DEBUG, "Netlink socket recv buffer: %d", buf_size);
 231        if (buf_size < MLX5_RECV_BUF_SIZE) {
 232                ret = setsockopt(fd, SOL_SOCKET, SO_RCVBUF,
 233                                 &buf_size, sizeof(buf_size));
 234                if (ret == -1) {
 235                        rte_errno = errno;
 236                        goto error;
 237                }
 238        }
 239        ret = bind(fd, (struct sockaddr *)&local, sizeof(local));
 240        if (ret == -1) {
 241                rte_errno = errno;
 242                goto error;
 243        }
 244        return fd;
 245error:
 246        close(fd);
 247        return -rte_errno;
 248}
 249
 250/**
 251 * Send a request message to the kernel on the Netlink socket.
 252 *
 253 * @param[in] nlsk_fd
 254 *   Netlink socket file descriptor.
 255 * @param[in] nh
 256 *   The Netlink message send to the kernel.
 257 * @param[in] ssn
 258 *   Sequence number.
 259 * @param[in] req
 260 *   Pointer to the request structure.
 261 * @param[in] len
 262 *   Length of the request in bytes.
 263 *
 264 * @return
 265 *   The number of sent bytes on success, a negative errno value otherwise and
 266 *   rte_errno is set.
 267 */
 268static int
 269mlx5_nl_request(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn, void *req,
 270                int len)
 271{
 272        struct sockaddr_nl sa = {
 273                .nl_family = AF_NETLINK,
 274        };
 275        struct iovec iov[2] = {
 276                { .iov_base = nh, .iov_len = sizeof(*nh), },
 277                { .iov_base = req, .iov_len = len, },
 278        };
 279        struct msghdr msg = {
 280                .msg_name = &sa,
 281                .msg_namelen = sizeof(sa),
 282                .msg_iov = iov,
 283                .msg_iovlen = 2,
 284        };
 285        int send_bytes;
 286
 287        nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
 288        nh->nlmsg_seq = sn;
 289        send_bytes = sendmsg(nlsk_fd, &msg, 0);
 290        if (send_bytes < 0) {
 291                rte_errno = errno;
 292                return -rte_errno;
 293        }
 294        return send_bytes;
 295}
 296
 297/**
 298 * Send a message to the kernel on the Netlink socket.
 299 *
 300 * @param[in] nlsk_fd
 301 *   The Netlink socket file descriptor used for communication.
 302 * @param[in] nh
 303 *   The Netlink message send to the kernel.
 304 * @param[in] sn
 305 *   Sequence number.
 306 *
 307 * @return
 308 *   The number of sent bytes on success, a negative errno value otherwise and
 309 *   rte_errno is set.
 310 */
 311static int
 312mlx5_nl_send(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn)
 313{
 314        struct sockaddr_nl sa = {
 315                .nl_family = AF_NETLINK,
 316        };
 317        struct iovec iov = {
 318                .iov_base = nh,
 319                .iov_len = nh->nlmsg_len,
 320        };
 321        struct msghdr msg = {
 322                .msg_name = &sa,
 323                .msg_namelen = sizeof(sa),
 324                .msg_iov = &iov,
 325                .msg_iovlen = 1,
 326        };
 327        int send_bytes;
 328
 329        nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
 330        nh->nlmsg_seq = sn;
 331        send_bytes = sendmsg(nlsk_fd, &msg, 0);
 332        if (send_bytes < 0) {
 333                rte_errno = errno;
 334                return -rte_errno;
 335        }
 336        return send_bytes;
 337}
 338
 339/**
 340 * Receive a message from the kernel on the Netlink socket, following
 341 * mlx5_nl_send().
 342 *
 343 * @param[in] nlsk_fd
 344 *   The Netlink socket file descriptor used for communication.
 345 * @param[in] sn
 346 *   Sequence number.
 347 * @param[in] cb
 348 *   The callback function to call for each Netlink message received.
 349 * @param[in, out] arg
 350 *   Custom arguments for the callback.
 351 *
 352 * @return
 353 *   0 on success, a negative errno value otherwise and rte_errno is set.
 354 */
 355static int
 356mlx5_nl_recv(int nlsk_fd, uint32_t sn, int (*cb)(struct nlmsghdr *, void *arg),
 357             void *arg)
 358{
 359        struct sockaddr_nl sa;
 360        struct iovec iov;
 361        struct msghdr msg = {
 362                .msg_name = &sa,
 363                .msg_namelen = sizeof(sa),
 364                .msg_iov = &iov,
 365                /* One message at a time */
 366                .msg_iovlen = 1,
 367        };
 368        void *buf = NULL;
 369        int multipart = 0;
 370        int ret = 0;
 371
 372        do {
 373                struct nlmsghdr *nh;
 374                int recv_bytes;
 375
 376                do {
 377                        /* Query length of incoming message. */
 378                        iov.iov_base = NULL;
 379                        iov.iov_len = 0;
 380                        recv_bytes = recvmsg(nlsk_fd, &msg,
 381                                             MSG_PEEK | MSG_TRUNC);
 382                        if (recv_bytes < 0) {
 383                                rte_errno = errno;
 384                                ret = -rte_errno;
 385                                goto exit;
 386                        }
 387                        if (recv_bytes == 0) {
 388                                rte_errno = ENODATA;
 389                                ret = -rte_errno;
 390                                goto exit;
 391                        }
 392                        /* Allocate buffer to fetch the message. */
 393                        if (recv_bytes < MLX5_RECV_BUF_SIZE)
 394                                recv_bytes = MLX5_RECV_BUF_SIZE;
 395                        mlx5_free(buf);
 396                        buf = mlx5_malloc(0, recv_bytes, 0, SOCKET_ID_ANY);
 397                        if (!buf) {
 398                                rte_errno = ENOMEM;
 399                                ret = -rte_errno;
 400                                goto exit;
 401                        }
 402                        /* Fetch the message. */
 403                        iov.iov_base = buf;
 404                        iov.iov_len = recv_bytes;
 405                        recv_bytes = recvmsg(nlsk_fd, &msg, 0);
 406                        if (recv_bytes == -1) {
 407                                rte_errno = errno;
 408                                ret = -rte_errno;
 409                                goto exit;
 410                        }
 411                        nh = (struct nlmsghdr *)buf;
 412                } while (nh->nlmsg_seq != sn);
 413                for (;
 414                     NLMSG_OK(nh, (unsigned int)recv_bytes);
 415                     nh = NLMSG_NEXT(nh, recv_bytes)) {
 416                        if (nh->nlmsg_type == NLMSG_ERROR) {
 417                                struct nlmsgerr *err_data = NLMSG_DATA(nh);
 418
 419                                if (err_data->error < 0) {
 420                                        rte_errno = -err_data->error;
 421                                        ret = -rte_errno;
 422                                        goto exit;
 423                                }
 424                                /* Ack message. */
 425                                ret = 0;
 426                                goto exit;
 427                        }
 428                        /* Multi-part msgs and their trailing DONE message. */
 429                        if (nh->nlmsg_flags & NLM_F_MULTI) {
 430                                if (nh->nlmsg_type == NLMSG_DONE) {
 431                                        ret =  0;
 432                                        goto exit;
 433                                }
 434                                multipart = 1;
 435                        }
 436                        if (cb) {
 437                                ret = cb(nh, arg);
 438                                if (ret < 0)
 439                                        goto exit;
 440                        }
 441                }
 442        } while (multipart);
 443exit:
 444        mlx5_free(buf);
 445        return ret;
 446}
 447
 448/**
 449 * Parse Netlink message to retrieve the bridge MAC address.
 450 *
 451 * @param nh
 452 *   Pointer to Netlink Message Header.
 453 * @param arg
 454 *   PMD data register with this callback.
 455 *
 456 * @return
 457 *   0 on success, a negative errno value otherwise and rte_errno is set.
 458 */
 459static int
 460mlx5_nl_mac_addr_cb(struct nlmsghdr *nh, void *arg)
 461{
 462        struct mlx5_nl_mac_addr *data = arg;
 463        struct ndmsg *r = NLMSG_DATA(nh);
 464        struct rtattr *attribute;
 465        int len;
 466
 467        len = nh->nlmsg_len - NLMSG_LENGTH(sizeof(*r));
 468        for (attribute = MLX5_NDA_RTA(r);
 469             RTA_OK(attribute, len);
 470             attribute = RTA_NEXT(attribute, len)) {
 471                if (attribute->rta_type == NDA_LLADDR) {
 472                        if (data->mac_n == MLX5_MAX_MAC_ADDRESSES) {
 473                                DRV_LOG(WARNING,
 474                                        "not enough room to finalize the"
 475                                        " request");
 476                                rte_errno = ENOMEM;
 477                                return -rte_errno;
 478                        }
 479#ifdef RTE_LIBRTE_MLX5_DEBUG
 480                        char m[RTE_ETHER_ADDR_FMT_SIZE];
 481
 482                        rte_ether_format_addr(m, RTE_ETHER_ADDR_FMT_SIZE,
 483                                              RTA_DATA(attribute));
 484                        DRV_LOG(DEBUG, "bridge MAC address %s", m);
 485#endif
 486                        memcpy(&(*data->mac)[data->mac_n++],
 487                               RTA_DATA(attribute), RTE_ETHER_ADDR_LEN);
 488                }
 489        }
 490        return 0;
 491}
 492
 493/**
 494 * Get bridge MAC addresses.
 495 *
 496 * @param[in] nlsk_fd
 497 *   Netlink socket file descriptor.
 498 * @param[in] iface_idx
 499 *   Net device interface index.
 500 * @param mac[out]
 501 *   Pointer to the array table of MAC addresses to fill.
 502 *   Its size should be of MLX5_MAX_MAC_ADDRESSES.
 503 * @param mac_n[out]
 504 *   Number of entries filled in MAC array.
 505 *
 506 * @return
 507 *   0 on success, a negative errno value otherwise and rte_errno is set.
 508 */
 509static int
 510mlx5_nl_mac_addr_list(int nlsk_fd, unsigned int iface_idx,
 511                      struct rte_ether_addr (*mac)[], int *mac_n)
 512{
 513        struct {
 514                struct nlmsghdr hdr;
 515                struct ifinfomsg ifm;
 516        } req = {
 517                .hdr = {
 518                        .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
 519                        .nlmsg_type = RTM_GETNEIGH,
 520                        .nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
 521                },
 522                .ifm = {
 523                        .ifi_family = PF_BRIDGE,
 524                        .ifi_index = iface_idx,
 525                },
 526        };
 527        struct mlx5_nl_mac_addr data = {
 528                .mac = mac,
 529                .mac_n = 0,
 530        };
 531        uint32_t sn = MLX5_NL_SN_GENERATE;
 532        int ret;
 533
 534        if (nlsk_fd == -1)
 535                return 0;
 536        ret = mlx5_nl_request(nlsk_fd, &req.hdr, sn, &req.ifm,
 537                              sizeof(struct ifinfomsg));
 538        if (ret < 0)
 539                goto error;
 540        ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_mac_addr_cb, &data);
 541        if (ret < 0)
 542                goto error;
 543        *mac_n = data.mac_n;
 544        return 0;
 545error:
 546        DRV_LOG(DEBUG, "Interface %u cannot retrieve MAC address list %s",
 547                iface_idx, strerror(rte_errno));
 548        return -rte_errno;
 549}
 550
 551/**
 552 * Modify the MAC address neighbour table with Netlink.
 553 *
 554 * @param[in] nlsk_fd
 555 *   Netlink socket file descriptor.
 556 * @param[in] iface_idx
 557 *   Net device interface index.
 558 * @param mac
 559 *   MAC address to consider.
 560 * @param add
 561 *   1 to add the MAC address, 0 to remove the MAC address.
 562 *
 563 * @return
 564 *   0 on success, a negative errno value otherwise and rte_errno is set.
 565 */
 566static int
 567mlx5_nl_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
 568                        struct rte_ether_addr *mac, int add)
 569{
 570        struct {
 571                struct nlmsghdr hdr;
 572                struct ndmsg ndm;
 573                struct rtattr rta;
 574                uint8_t buffer[RTE_ETHER_ADDR_LEN];
 575        } req = {
 576                .hdr = {
 577                        .nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)),
 578                        .nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
 579                                NLM_F_EXCL | NLM_F_ACK,
 580                        .nlmsg_type = add ? RTM_NEWNEIGH : RTM_DELNEIGH,
 581                },
 582                .ndm = {
 583                        .ndm_family = PF_BRIDGE,
 584                        .ndm_state = NUD_NOARP | NUD_PERMANENT,
 585                        .ndm_ifindex = iface_idx,
 586                        .ndm_flags = NTF_SELF,
 587                },
 588                .rta = {
 589                        .rta_type = NDA_LLADDR,
 590                        .rta_len = RTA_LENGTH(RTE_ETHER_ADDR_LEN),
 591                },
 592        };
 593        uint32_t sn = MLX5_NL_SN_GENERATE;
 594        int ret;
 595
 596        if (nlsk_fd == -1)
 597                return 0;
 598        memcpy(RTA_DATA(&req.rta), mac, RTE_ETHER_ADDR_LEN);
 599        req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
 600                RTA_ALIGN(req.rta.rta_len);
 601        ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
 602        if (ret < 0)
 603                goto error;
 604        ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
 605        if (ret < 0)
 606                goto error;
 607        return 0;
 608error:
 609#ifdef RTE_LIBRTE_MLX5_DEBUG
 610        {
 611                char m[RTE_ETHER_ADDR_FMT_SIZE];
 612
 613                rte_ether_format_addr(m, RTE_ETHER_ADDR_FMT_SIZE, mac);
 614                DRV_LOG(DEBUG,
 615                        "Interface %u cannot %s MAC address %s %s",
 616                        iface_idx,
 617                        add ? "add" : "remove", m, strerror(rte_errno));
 618        }
 619#endif
 620        return -rte_errno;
 621}
 622
 623/**
 624 * Modify the VF MAC address neighbour table with Netlink.
 625 *
 626 * @param[in] nlsk_fd
 627 *   Netlink socket file descriptor.
 628 * @param[in] iface_idx
 629 *   Net device interface index.
 630 * @param mac
 631 *    MAC address to consider.
 632 * @param vf_index
 633 *    VF index.
 634 *
 635 * @return
 636 *    0 on success, a negative errno value otherwise and rte_errno is set.
 637 */
 638int
 639mlx5_nl_vf_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
 640                           struct rte_ether_addr *mac, int vf_index)
 641{
 642        int ret;
 643        struct {
 644                struct nlmsghdr hdr;
 645                struct ifinfomsg ifm;
 646                struct rtattr vf_list_rta;
 647                struct rtattr vf_info_rta;
 648                struct rtattr vf_mac_rta;
 649                struct ifla_vf_mac ivm;
 650        } req = {
 651                .hdr = {
 652                        .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
 653                        .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
 654                        .nlmsg_type = RTM_BASE,
 655                },
 656                .ifm = {
 657                        .ifi_index = iface_idx,
 658                },
 659                .vf_list_rta = {
 660                        .rta_type = IFLA_VFINFO_LIST,
 661                        .rta_len = RTA_ALIGN(RTA_LENGTH(0)),
 662                },
 663                .vf_info_rta = {
 664                        .rta_type = IFLA_VF_INFO,
 665                        .rta_len = RTA_ALIGN(RTA_LENGTH(0)),
 666                },
 667                .vf_mac_rta = {
 668                        .rta_type = IFLA_VF_MAC,
 669                },
 670        };
 671        struct ifla_vf_mac ivm = {
 672                .vf = vf_index,
 673        };
 674        uint32_t sn = MLX5_NL_SN_GENERATE;
 675
 676        memcpy(&ivm.mac, mac, RTE_ETHER_ADDR_LEN);
 677        memcpy(RTA_DATA(&req.vf_mac_rta), &ivm, sizeof(ivm));
 678
 679        req.vf_mac_rta.rta_len = RTA_LENGTH(sizeof(ivm));
 680        req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
 681                RTA_ALIGN(req.vf_list_rta.rta_len) +
 682                RTA_ALIGN(req.vf_info_rta.rta_len) +
 683                RTA_ALIGN(req.vf_mac_rta.rta_len);
 684        req.vf_list_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
 685                                               &req.vf_list_rta);
 686        req.vf_info_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
 687                                               &req.vf_info_rta);
 688
 689        if (nlsk_fd < 0)
 690                return -1;
 691        ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
 692        if (ret < 0)
 693                goto error;
 694        ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
 695        if (ret < 0)
 696                goto error;
 697        return 0;
 698error:
 699        DRV_LOG(ERR,
 700                "representor %u cannot set VF MAC address "
 701                RTE_ETHER_ADDR_PRT_FMT " : %s",
 702                vf_index,
 703                RTE_ETHER_ADDR_BYTES(mac),
 704                strerror(rte_errno));
 705        return -rte_errno;
 706}
 707
 708/**
 709 * Add a MAC address.
 710 *
 711 * @param[in] nlsk_fd
 712 *   Netlink socket file descriptor.
 713 * @param[in] iface_idx
 714 *   Net device interface index.
 715 * @param mac_own
 716 *   BITFIELD_DECLARE array to store the mac.
 717 * @param mac
 718 *   MAC address to register.
 719 * @param index
 720 *   MAC address index.
 721 *
 722 * @return
 723 *   0 on success, a negative errno value otherwise and rte_errno is set.
 724 */
 725int
 726mlx5_nl_mac_addr_add(int nlsk_fd, unsigned int iface_idx,
 727                     uint64_t *mac_own, struct rte_ether_addr *mac,
 728                     uint32_t index)
 729{
 730        int ret;
 731
 732        ret = mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 1);
 733        if (!ret) {
 734                MLX5_ASSERT(index < MLX5_MAX_MAC_ADDRESSES);
 735                if (index >= MLX5_MAX_MAC_ADDRESSES)
 736                        return -EINVAL;
 737
 738                BITFIELD_SET(mac_own, index);
 739        }
 740        if (ret == -EEXIST)
 741                return 0;
 742        return ret;
 743}
 744
 745/**
 746 * Remove a MAC address.
 747 *
 748 * @param[in] nlsk_fd
 749 *   Netlink socket file descriptor.
 750 * @param[in] iface_idx
 751 *   Net device interface index.
 752 * @param mac_own
 753 *   BITFIELD_DECLARE array to store the mac.
 754 * @param mac
 755 *   MAC address to remove.
 756 * @param index
 757 *   MAC address index.
 758 *
 759 * @return
 760 *   0 on success, a negative errno value otherwise and rte_errno is set.
 761 */
 762int
 763mlx5_nl_mac_addr_remove(int nlsk_fd, unsigned int iface_idx, uint64_t *mac_own,
 764                        struct rte_ether_addr *mac, uint32_t index)
 765{
 766        MLX5_ASSERT(index < MLX5_MAX_MAC_ADDRESSES);
 767        if (index >= MLX5_MAX_MAC_ADDRESSES)
 768                return -EINVAL;
 769
 770        BITFIELD_RESET(mac_own, index);
 771        return mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 0);
 772}
 773
 774/**
 775 * Synchronize Netlink bridge table to the internal table.
 776 *
 777 * @param[in] nlsk_fd
 778 *   Netlink socket file descriptor.
 779 * @param[in] iface_idx
 780 *   Net device interface index.
 781 * @param mac_addrs
 782 *   Mac addresses array to sync.
 783 * @param n
 784 *   @p mac_addrs array size.
 785 */
 786void
 787mlx5_nl_mac_addr_sync(int nlsk_fd, unsigned int iface_idx,
 788                      struct rte_ether_addr *mac_addrs, int n)
 789{
 790        struct rte_ether_addr macs[n];
 791        int macs_n = 0;
 792        int i;
 793        int ret;
 794
 795        memset(macs, 0, n * sizeof(macs[0]));
 796        ret = mlx5_nl_mac_addr_list(nlsk_fd, iface_idx, &macs, &macs_n);
 797        if (ret)
 798                return;
 799        for (i = 0; i != macs_n; ++i) {
 800                int j;
 801
 802                /* Verify the address is not in the array yet. */
 803                for (j = 0; j != n; ++j)
 804                        if (rte_is_same_ether_addr(&macs[i], &mac_addrs[j]))
 805                                break;
 806                if (j != n)
 807                        continue;
 808                if (rte_is_multicast_ether_addr(&macs[i])) {
 809                        /* Find the first entry available. */
 810                        for (j = MLX5_MAX_UC_MAC_ADDRESSES; j != n; ++j) {
 811                                if (rte_is_zero_ether_addr(&mac_addrs[j])) {
 812                                        mac_addrs[j] = macs[i];
 813                                        break;
 814                                }
 815                        }
 816                } else {
 817                        /* Find the first entry available. */
 818                        for (j = 0; j != MLX5_MAX_UC_MAC_ADDRESSES; ++j) {
 819                                if (rte_is_zero_ether_addr(&mac_addrs[j])) {
 820                                        mac_addrs[j] = macs[i];
 821                                        break;
 822                                }
 823                        }
 824                }
 825        }
 826}
 827
 828/**
 829 * Flush all added MAC addresses.
 830 *
 831 * @param[in] nlsk_fd
 832 *   Netlink socket file descriptor.
 833 * @param[in] iface_idx
 834 *   Net device interface index.
 835 * @param[in] mac_addrs
 836 *   Mac addresses array to flush.
 837 * @param n
 838 *   @p mac_addrs array size.
 839 * @param mac_own
 840 *   BITFIELD_DECLARE array to store the mac.
 841 */
 842void
 843mlx5_nl_mac_addr_flush(int nlsk_fd, unsigned int iface_idx,
 844                       struct rte_ether_addr *mac_addrs, int n,
 845                       uint64_t *mac_own)
 846{
 847        int i;
 848
 849        if (n <= 0 || n > MLX5_MAX_MAC_ADDRESSES)
 850                return;
 851
 852        for (i = n - 1; i >= 0; --i) {
 853                struct rte_ether_addr *m = &mac_addrs[i];
 854
 855                if (BITFIELD_ISSET(mac_own, i))
 856                        mlx5_nl_mac_addr_remove(nlsk_fd, iface_idx, mac_own, m,
 857                                                i);
 858        }
 859}
 860
 861/**
 862 * Enable promiscuous / all multicast mode through Netlink.
 863 *
 864 * @param[in] nlsk_fd
 865 *   Netlink socket file descriptor.
 866 * @param[in] iface_idx
 867 *   Net device interface index.
 868 * @param flags
 869 *   IFF_PROMISC for promiscuous, IFF_ALLMULTI for allmulti.
 870 * @param enable
 871 *   Nonzero to enable, disable otherwise.
 872 *
 873 * @return
 874 *   0 on success, a negative errno value otherwise and rte_errno is set.
 875 */
 876static int
 877mlx5_nl_device_flags(int nlsk_fd, unsigned int iface_idx, uint32_t flags,
 878                     int enable)
 879{
 880        struct {
 881                struct nlmsghdr hdr;
 882                struct ifinfomsg ifi;
 883        } req = {
 884                .hdr = {
 885                        .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
 886                        .nlmsg_type = RTM_NEWLINK,
 887                        .nlmsg_flags = NLM_F_REQUEST,
 888                },
 889                .ifi = {
 890                        .ifi_flags = enable ? flags : 0,
 891                        .ifi_change = flags,
 892                        .ifi_index = iface_idx,
 893                },
 894        };
 895        uint32_t sn = MLX5_NL_SN_GENERATE;
 896        int ret;
 897
 898        MLX5_ASSERT(!(flags & ~(IFF_PROMISC | IFF_ALLMULTI)));
 899        if (nlsk_fd < 0)
 900                return 0;
 901        ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
 902        if (ret < 0)
 903                return ret;
 904        return 0;
 905}
 906
 907/**
 908 * Enable promiscuous mode through Netlink.
 909 *
 910 * @param[in] nlsk_fd
 911 *   Netlink socket file descriptor.
 912 * @param[in] iface_idx
 913 *   Net device interface index.
 914 * @param enable
 915 *   Nonzero to enable, disable otherwise.
 916 *
 917 * @return
 918 *   0 on success, a negative errno value otherwise and rte_errno is set.
 919 */
 920int
 921mlx5_nl_promisc(int nlsk_fd, unsigned int iface_idx, int enable)
 922{
 923        int ret = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_PROMISC, enable);
 924
 925        if (ret)
 926                DRV_LOG(DEBUG,
 927                        "Interface %u cannot %s promisc mode: Netlink error %s",
 928                        iface_idx, enable ? "enable" : "disable",
 929                        strerror(rte_errno));
 930        return ret;
 931}
 932
 933/**
 934 * Enable all multicast mode through Netlink.
 935 *
 936 * @param[in] nlsk_fd
 937 *   Netlink socket file descriptor.
 938 * @param[in] iface_idx
 939 *   Net device interface index.
 940 * @param enable
 941 *   Nonzero to enable, disable otherwise.
 942 *
 943 * @return
 944 *   0 on success, a negative errno value otherwise and rte_errno is set.
 945 */
 946int
 947mlx5_nl_allmulti(int nlsk_fd, unsigned int iface_idx, int enable)
 948{
 949        int ret = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_ALLMULTI,
 950                                       enable);
 951
 952        if (ret)
 953                DRV_LOG(DEBUG,
 954                        "Interface %u cannot %s allmulti : Netlink error %s",
 955                        iface_idx, enable ? "enable" : "disable",
 956                        strerror(rte_errno));
 957        return ret;
 958}
 959
 960/**
 961 * Process network interface information from Netlink message.
 962 *
 963 * @param nh
 964 *   Pointer to Netlink message header.
 965 * @param arg
 966 *   Opaque data pointer for this callback.
 967 *
 968 * @return
 969 *   0 on success, a negative errno value otherwise and rte_errno is set.
 970 */
 971static int
 972mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
 973{
 974        struct mlx5_nl_port_info *data = arg;
 975        struct mlx5_nl_port_info local = {
 976                .flags = 0,
 977        };
 978        size_t off = NLMSG_HDRLEN;
 979
 980        if (nh->nlmsg_type !=
 981            RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET) &&
 982            nh->nlmsg_type !=
 983            RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_PORT_GET))
 984                goto error;
 985        while (off < nh->nlmsg_len) {
 986                struct nlattr *na = (void *)((uintptr_t)nh + off);
 987                void *payload = (void *)((uintptr_t)na + NLA_HDRLEN);
 988
 989                if (na->nla_len > nh->nlmsg_len - off)
 990                        goto error;
 991                switch (na->nla_type) {
 992                case RDMA_NLDEV_ATTR_DEV_INDEX:
 993                        local.ibindex = *(uint32_t *)payload;
 994                        local.flags |= MLX5_NL_CMD_GET_IB_INDEX;
 995                        break;
 996                case RDMA_NLDEV_ATTR_DEV_NAME:
 997                        if (!strcmp(payload, data->name))
 998                                local.flags |= MLX5_NL_CMD_GET_IB_NAME;
 999                        break;
1000                case RDMA_NLDEV_ATTR_NDEV_INDEX:
1001                        local.ifindex = *(uint32_t *)payload;
1002                        local.flags |= MLX5_NL_CMD_GET_NET_INDEX;
1003                        break;
1004                case RDMA_NLDEV_ATTR_PORT_INDEX:
1005                        local.portnum = *(uint32_t *)payload;
1006                        local.flags |= MLX5_NL_CMD_GET_PORT_INDEX;
1007                        break;
1008                case RDMA_NLDEV_ATTR_PORT_STATE:
1009                        local.state = *(uint8_t *)payload;
1010                        local.flags |= MLX5_NL_CMD_GET_PORT_STATE;
1011                        break;
1012                default:
1013                        break;
1014                }
1015                off += NLA_ALIGN(na->nla_len);
1016        }
1017        /*
1018         * It is possible to have multiple messages for all
1019         * Infiniband devices in the system with appropriate name.
1020         * So we should gather parameters locally and copy to
1021         * query context only in case of coinciding device name.
1022         */
1023        if (local.flags & MLX5_NL_CMD_GET_IB_NAME) {
1024                data->flags = local.flags;
1025                data->ibindex = local.ibindex;
1026                data->ifindex = local.ifindex;
1027                data->portnum = local.portnum;
1028                data->state = local.state;
1029        }
1030        return 0;
1031error:
1032        rte_errno = EINVAL;
1033        return -rte_errno;
1034}
1035
1036/**
1037 * Get port info of network interface associated with some IB device.
1038 *
1039 * This is the only somewhat safe method to avoid resorting to heuristics
1040 * when faced with port representors. Unfortunately it requires at least
1041 * Linux 4.17.
1042 *
1043 * @param nl
1044 *   Netlink socket of the RDMA kind (NETLINK_RDMA).
1045 * @param[in] pindex
1046 *   IB device port index, starting from 1
1047 * @param[out] data
1048 *   Pointer to port info.
1049 * @return
1050 *   0 on success, negative on error and rte_errno is set.
1051 */
1052static int
1053mlx5_nl_port_info(int nl, uint32_t pindex, struct mlx5_nl_port_info *data)
1054{
1055        union {
1056                struct nlmsghdr nh;
1057                uint8_t buf[NLMSG_HDRLEN +
1058                            NLA_HDRLEN + NLA_ALIGN(sizeof(data->ibindex)) +
1059                            NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))];
1060        } req = {
1061                .nh = {
1062                        .nlmsg_len = NLMSG_LENGTH(0),
1063                        .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1064                                                       RDMA_NLDEV_CMD_GET),
1065                        .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
1066                },
1067        };
1068        struct nlattr *na;
1069        uint32_t sn = MLX5_NL_SN_GENERATE;
1070        int ret;
1071
1072        ret = mlx5_nl_send(nl, &req.nh, sn);
1073        if (ret < 0)
1074                return ret;
1075        ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, data);
1076        if (ret < 0)
1077                return ret;
1078        if (!(data->flags & MLX5_NL_CMD_GET_IB_NAME) ||
1079            !(data->flags & MLX5_NL_CMD_GET_IB_INDEX))
1080                goto error;
1081        data->flags = 0;
1082        sn = MLX5_NL_SN_GENERATE;
1083        req.nh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1084                                             RDMA_NLDEV_CMD_PORT_GET);
1085        req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1086        req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.buf) - NLMSG_HDRLEN);
1087        na = (void *)((uintptr_t)req.buf + NLMSG_HDRLEN);
1088        na->nla_len = NLA_HDRLEN + sizeof(data->ibindex);
1089        na->nla_type = RDMA_NLDEV_ATTR_DEV_INDEX;
1090        memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
1091               &data->ibindex, sizeof(data->ibindex));
1092        na = (void *)((uintptr_t)na + NLA_ALIGN(na->nla_len));
1093        na->nla_len = NLA_HDRLEN + sizeof(pindex);
1094        na->nla_type = RDMA_NLDEV_ATTR_PORT_INDEX;
1095        memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
1096               &pindex, sizeof(pindex));
1097        ret = mlx5_nl_send(nl, &req.nh, sn);
1098        if (ret < 0)
1099                return ret;
1100        ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, data);
1101        if (ret < 0)
1102                return ret;
1103        if (!(data->flags & MLX5_NL_CMD_GET_IB_NAME) ||
1104            !(data->flags & MLX5_NL_CMD_GET_IB_INDEX) ||
1105            !(data->flags & MLX5_NL_CMD_GET_NET_INDEX) ||
1106            !data->ifindex)
1107                goto error;
1108        return 1;
1109error:
1110        rte_errno = ENODEV;
1111        return -rte_errno;
1112}
1113
1114/**
1115 * Get index of network interface associated with some IB device.
1116 *
1117 * This is the only somewhat safe method to avoid resorting to heuristics
1118 * when faced with port representors. Unfortunately it requires at least
1119 * Linux 4.17.
1120 *
1121 * @param nl
1122 *   Netlink socket of the RDMA kind (NETLINK_RDMA).
1123 * @param[in] name
1124 *   IB device name.
1125 * @param[in] pindex
1126 *   IB device port index, starting from 1
1127 * @return
1128 *   A valid (nonzero) interface index on success, 0 otherwise and rte_errno
1129 *   is set.
1130 */
1131unsigned int
1132mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
1133{
1134        struct mlx5_nl_port_info data = {
1135                        .ifindex = 0,
1136                        .name = name,
1137        };
1138
1139        if (mlx5_nl_port_info(nl, pindex, &data) < 0)
1140                return 0;
1141        return data.ifindex;
1142}
1143
1144/**
1145 * Get IB device port state.
1146 *
1147 * This is the only somewhat safe method to get info for port number >= 255.
1148 * Unfortunately it requires at least Linux 4.17.
1149 *
1150 * @param nl
1151 *   Netlink socket of the RDMA kind (NETLINK_RDMA).
1152 * @param[in] name
1153 *   IB device name.
1154 * @param[in] pindex
1155 *   IB device port index, starting from 1
1156 * @return
1157 *   Port state (ibv_port_state) on success, negative on error
1158 *   and rte_errno is set.
1159 */
1160int
1161mlx5_nl_port_state(int nl, const char *name, uint32_t pindex)
1162{
1163        struct mlx5_nl_port_info data = {
1164                        .state = 0,
1165                        .name = name,
1166        };
1167
1168        if (mlx5_nl_port_info(nl, pindex, &data) < 0)
1169                return -rte_errno;
1170        if ((data.flags & MLX5_NL_CMD_GET_PORT_STATE) == 0) {
1171                rte_errno = ENOTSUP;
1172                return -rte_errno;
1173        }
1174        return (int)data.state;
1175}
1176
1177/**
1178 * Get the number of physical ports of given IB device.
1179 *
1180 * @param nl
1181 *   Netlink socket of the RDMA kind (NETLINK_RDMA).
1182 * @param[in] name
1183 *   IB device name.
1184 *
1185 * @return
1186 *   A valid (nonzero) number of ports on success, 0 otherwise
1187 *   and rte_errno is set.
1188 */
1189unsigned int
1190mlx5_nl_portnum(int nl, const char *name)
1191{
1192        struct mlx5_nl_port_info data = {
1193                .flags = 0,
1194                .name = name,
1195                .ifindex = 0,
1196                .portnum = 0,
1197        };
1198        struct nlmsghdr req = {
1199                .nlmsg_len = NLMSG_LENGTH(0),
1200                .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1201                                               RDMA_NLDEV_CMD_GET),
1202                .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
1203        };
1204        uint32_t sn = MLX5_NL_SN_GENERATE;
1205        int ret;
1206
1207        ret = mlx5_nl_send(nl, &req, sn);
1208        if (ret < 0)
1209                return 0;
1210        ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
1211        if (ret < 0)
1212                return 0;
1213        if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
1214            !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
1215            !(data.flags & MLX5_NL_CMD_GET_PORT_INDEX)) {
1216                rte_errno = ENODEV;
1217                return 0;
1218        }
1219        if (!data.portnum)
1220                rte_errno = EINVAL;
1221        return data.portnum;
1222}
1223
1224/**
1225 * Analyze gathered port parameters via Netlink to recognize master
1226 * and representor devices for E-Switch configuration.
1227 *
1228 * @param[in] num_vf_set
1229 *   flag of presence of number of VFs port attribute.
1230 * @param[inout] switch_info
1231 *   Port information, including port name as a number and port name
1232 *   type if recognized
1233 *
1234 * @return
1235 *   master and representor flags are set in switch_info according to
1236 *   recognized parameters (if any).
1237 */
1238static void
1239mlx5_nl_check_switch_info(bool num_vf_set,
1240                          struct mlx5_switch_info *switch_info)
1241{
1242        switch (switch_info->name_type) {
1243        case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
1244                /*
1245                 * Name is not recognized, assume the master,
1246                 * check the number of VFs key presence.
1247                 */
1248                switch_info->master = num_vf_set;
1249                break;
1250        case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
1251                /*
1252                 * Name is not set, this assumes the legacy naming
1253                 * schema for master, just check if there is a
1254                 * number of VFs key.
1255                 */
1256                switch_info->master = num_vf_set;
1257                break;
1258        case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
1259                /* New uplink naming schema recognized. */
1260                switch_info->master = 1;
1261                break;
1262        case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
1263                /* Legacy representors naming schema. */
1264                switch_info->representor = !num_vf_set;
1265                break;
1266        case MLX5_PHYS_PORT_NAME_TYPE_PFHPF:
1267                /* Fallthrough */
1268        case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
1269                /* Fallthrough */
1270        case MLX5_PHYS_PORT_NAME_TYPE_PFSF:
1271                /* New representors naming schema. */
1272                switch_info->representor = 1;
1273                break;
1274        }
1275}
1276
1277/**
1278 * Process switch information from Netlink message.
1279 *
1280 * @param nh
1281 *   Pointer to Netlink message header.
1282 * @param arg
1283 *   Opaque data pointer for this callback.
1284 *
1285 * @return
1286 *   0 on success, a negative errno value otherwise and rte_errno is set.
1287 */
1288static int
1289mlx5_nl_switch_info_cb(struct nlmsghdr *nh, void *arg)
1290{
1291        struct mlx5_switch_info info = {
1292                .master = 0,
1293                .representor = 0,
1294                .name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
1295                .port_name = 0,
1296                .switch_id = 0,
1297        };
1298        size_t off = NLMSG_LENGTH(sizeof(struct ifinfomsg));
1299        bool switch_id_set = false;
1300        bool num_vf_set = false;
1301        int len;
1302
1303        if (nh->nlmsg_type != RTM_NEWLINK)
1304                goto error;
1305        while (off < nh->nlmsg_len) {
1306                struct rtattr *ra = (void *)((uintptr_t)nh + off);
1307                void *payload = RTA_DATA(ra);
1308                unsigned int i;
1309
1310                if (ra->rta_len > nh->nlmsg_len - off)
1311                        goto error;
1312                switch (ra->rta_type) {
1313                case IFLA_NUM_VF:
1314                        num_vf_set = true;
1315                        break;
1316                case IFLA_PHYS_PORT_NAME:
1317                        len = RTA_PAYLOAD(ra);
1318                        /* Some kernels do not pad attributes with zero. */
1319                        if (len > 0 && len < MLX5_PHYS_PORT_NAME_MAX) {
1320                                char name[MLX5_PHYS_PORT_NAME_MAX];
1321
1322                                /*
1323                                 * We can't just patch the message with padding
1324                                 * zero - it might corrupt the following items
1325                                 * in the message, we have to copy the string
1326                                 * by attribute length and pad the copied one.
1327                                 */
1328                                memcpy(name, payload, len);
1329                                name[len] = 0;
1330                                mlx5_translate_port_name(name, &info);
1331                        } else {
1332                                info.name_type =
1333                                        MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN;
1334                        }
1335                        break;
1336                case IFLA_PHYS_SWITCH_ID:
1337                        info.switch_id = 0;
1338                        for (i = 0; i < RTA_PAYLOAD(ra); ++i) {
1339                                info.switch_id <<= 8;
1340                                info.switch_id |= ((uint8_t *)payload)[i];
1341                        }
1342                        switch_id_set = true;
1343                        break;
1344                }
1345                off += RTA_ALIGN(ra->rta_len);
1346        }
1347        if (switch_id_set) {
1348                /* We have some E-Switch configuration. */
1349                mlx5_nl_check_switch_info(num_vf_set, &info);
1350        }
1351        MLX5_ASSERT(!(info.master && info.representor));
1352        memcpy(arg, &info, sizeof(info));
1353        return 0;
1354error:
1355        rte_errno = EINVAL;
1356        return -rte_errno;
1357}
1358
1359/**
1360 * Get switch information associated with network interface.
1361 *
1362 * @param nl
1363 *   Netlink socket of the ROUTE kind (NETLINK_ROUTE).
1364 * @param ifindex
1365 *   Network interface index.
1366 * @param[out] info
1367 *   Switch information object, populated in case of success.
1368 *
1369 * @return
1370 *   0 on success, a negative errno value otherwise and rte_errno is set.
1371 */
1372int
1373mlx5_nl_switch_info(int nl, unsigned int ifindex,
1374                    struct mlx5_switch_info *info)
1375{
1376        struct {
1377                struct nlmsghdr nh;
1378                struct ifinfomsg info;
1379                struct rtattr rta;
1380                uint32_t extmask;
1381        } req = {
1382                .nh = {
1383                        .nlmsg_len = NLMSG_LENGTH
1384                                        (sizeof(req.info) +
1385                                         RTA_LENGTH(sizeof(uint32_t))),
1386                        .nlmsg_type = RTM_GETLINK,
1387                        .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
1388                },
1389                .info = {
1390                        .ifi_family = AF_UNSPEC,
1391                        .ifi_index = ifindex,
1392                },
1393                .rta = {
1394                        .rta_type = IFLA_EXT_MASK,
1395                        .rta_len = RTA_LENGTH(sizeof(int32_t)),
1396                },
1397                .extmask = RTE_LE32(1),
1398        };
1399        uint32_t sn = MLX5_NL_SN_GENERATE;
1400        int ret;
1401
1402        ret = mlx5_nl_send(nl, &req.nh, sn);
1403        if (ret >= 0)
1404                ret = mlx5_nl_recv(nl, sn, mlx5_nl_switch_info_cb, info);
1405        if (info->master && info->representor) {
1406                DRV_LOG(ERR, "ifindex %u device is recognized as master"
1407                             " and as representor", ifindex);
1408                rte_errno = ENODEV;
1409                ret = -rte_errno;
1410        }
1411        return ret;
1412}
1413
1414/*
1415 * Delete VLAN network device by ifindex.
1416 *
1417 * @param[in] tcf
1418 *   Context object initialized by mlx5_nl_vlan_vmwa_init().
1419 * @param[in] ifindex
1420 *   Interface index of network device to delete.
1421 */
1422void
1423mlx5_nl_vlan_vmwa_delete(struct mlx5_nl_vlan_vmwa_context *vmwa,
1424                      uint32_t ifindex)
1425{
1426        uint32_t sn = MLX5_NL_SN_GENERATE;
1427        int ret;
1428        struct {
1429                struct nlmsghdr nh;
1430                struct ifinfomsg info;
1431        } req = {
1432                .nh = {
1433                        .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
1434                        .nlmsg_type = RTM_DELLINK,
1435                        .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
1436                },
1437                .info = {
1438                        .ifi_family = AF_UNSPEC,
1439                        .ifi_index = ifindex,
1440                },
1441        };
1442
1443        if (ifindex) {
1444                ret = mlx5_nl_send(vmwa->nl_socket, &req.nh, sn);
1445                if (ret >= 0)
1446                        ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL);
1447                if (ret < 0)
1448                        DRV_LOG(WARNING, "netlink: error deleting VLAN WA"
1449                                " ifindex %u, %d", ifindex, ret);
1450        }
1451}
1452
/* Set of subroutines to build Netlink message. */

/*
 * Return the address just past the current (aligned) end of the message,
 * i.e. the place where the next attribute is to be written.
 */
static struct nlattr *
nl_msg_tail(struct nlmsghdr *nlh)
{
	uint8_t *base = (uint8_t *)nlh;

	return (struct nlattr *)(base + NLMSG_ALIGN(nlh->nlmsg_len));
}
1460
/*
 * Append an attribute of the given type to the message and grow nlmsg_len
 * by the aligned attribute size. The caller provides the buffer room;
 * no bounds are checked here. With alen == 0 only the header is written.
 */
static void
nl_attr_put(struct nlmsghdr *nlh, int type, const void *data, int alen)
{
	struct nlattr *attr = nl_msg_tail(nlh);
	void *payload = (uint8_t *)attr + sizeof(struct nlattr);

	attr->nla_type = type;
	attr->nla_len = NLMSG_ALIGN(sizeof(struct nlattr)) + alen;
	nlh->nlmsg_len += NLMSG_ALIGN(attr->nla_len);
	if (alen != 0)
		memcpy(payload, data, alen);
}
1473
/*
 * Open a nested attribute: an empty attribute header of the given type is
 * appended and its address returned, to be closed by nl_attr_nest_end().
 */
static struct nlattr *
nl_attr_nest_start(struct nlmsghdr *nlh, int type)
{
	struct nlattr *container = nl_msg_tail(nlh);

	nl_attr_put(nlh, type, NULL, 0);
	return container;
}
1482
/*
 * Close a nested attribute: its length becomes the distance from its
 * header to the current tail of the message.
 */
static void
nl_attr_nest_end(struct nlmsghdr *nlh, struct nlattr *nest)
{
	const uint8_t *end = (const uint8_t *)nl_msg_tail(nlh);

	nest->nla_len = end - (const uint8_t *)nest;
}
1488
1489/*
1490 * Create network VLAN device with specified VLAN tag.
1491 *
1492 * @param[in] tcf
1493 *   Context object initialized by mlx5_nl_vlan_vmwa_init().
1494 * @param[in] ifindex
1495 *   Base network interface index.
1496 * @param[in] tag
1497 *   VLAN tag for VLAN network device to create.
1498 */
1499uint32_t
1500mlx5_nl_vlan_vmwa_create(struct mlx5_nl_vlan_vmwa_context *vmwa,
1501                         uint32_t ifindex, uint16_t tag)
1502{
1503        struct nlmsghdr *nlh;
1504        struct ifinfomsg *ifm;
1505        char name[sizeof(MLX5_VMWA_VLAN_DEVICE_PFX) + 32];
1506
1507        __rte_cache_aligned
1508        uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1509                    NLMSG_ALIGN(sizeof(struct ifinfomsg)) +
1510                    NLMSG_ALIGN(sizeof(struct nlattr)) * 8 +
1511                    NLMSG_ALIGN(sizeof(uint32_t)) +
1512                    NLMSG_ALIGN(sizeof(name)) +
1513                    NLMSG_ALIGN(sizeof("vlan")) +
1514                    NLMSG_ALIGN(sizeof(uint32_t)) +
1515                    NLMSG_ALIGN(sizeof(uint16_t)) + 16];
1516        struct nlattr *na_info;
1517        struct nlattr *na_vlan;
1518        uint32_t sn = MLX5_NL_SN_GENERATE;
1519        int ret;
1520
1521        memset(buf, 0, sizeof(buf));
1522        nlh = (struct nlmsghdr *)buf;
1523        nlh->nlmsg_len = sizeof(struct nlmsghdr);
1524        nlh->nlmsg_type = RTM_NEWLINK;
1525        nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
1526                           NLM_F_EXCL | NLM_F_ACK;
1527        ifm = (struct ifinfomsg *)nl_msg_tail(nlh);
1528        nlh->nlmsg_len += sizeof(struct ifinfomsg);
1529        ifm->ifi_family = AF_UNSPEC;
1530        ifm->ifi_type = 0;
1531        ifm->ifi_index = 0;
1532        ifm->ifi_flags = IFF_UP;
1533        ifm->ifi_change = 0xffffffff;
1534        nl_attr_put(nlh, IFLA_LINK, &ifindex, sizeof(ifindex));
1535        ret = snprintf(name, sizeof(name), "%s.%u.%u",
1536                       MLX5_VMWA_VLAN_DEVICE_PFX, ifindex, tag);
1537        nl_attr_put(nlh, IFLA_IFNAME, name, ret + 1);
1538        na_info = nl_attr_nest_start(nlh, IFLA_LINKINFO);
1539        nl_attr_put(nlh, IFLA_INFO_KIND, "vlan", sizeof("vlan"));
1540        na_vlan = nl_attr_nest_start(nlh, IFLA_INFO_DATA);
1541        nl_attr_put(nlh, IFLA_VLAN_ID, &tag, sizeof(tag));
1542        nl_attr_nest_end(nlh, na_vlan);
1543        nl_attr_nest_end(nlh, na_info);
1544        MLX5_ASSERT(sizeof(buf) >= nlh->nlmsg_len);
1545        ret = mlx5_nl_send(vmwa->nl_socket, nlh, sn);
1546        if (ret >= 0)
1547                ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL);
1548        if (ret < 0) {
1549                DRV_LOG(WARNING, "netlink: VLAN %s create failure (%d)", name,
1550                        ret);
1551        }
1552        /* Try to get ifindex of created or pre-existing device. */
1553        ret = if_nametoindex(name);
1554        if (!ret) {
1555                DRV_LOG(WARNING, "VLAN %s failed to get index (%d)", name,
1556                        errno);
1557                return 0;
1558        }
1559        return ret;
1560}
1561
1562/**
1563 * Parse Netlink message to retrieve the general family ID.
1564 *
1565 * @param nh
1566 *   Pointer to Netlink Message Header.
1567 * @param arg
1568 *   PMD data register with this callback.
1569 *
1570 * @return
1571 *   0 on success, a negative errno value otherwise and rte_errno is set.
1572 */
1573static int
1574mlx5_nl_family_id_cb(struct nlmsghdr *nh, void *arg)
1575{
1576
1577        struct nlattr *tail = RTE_PTR_ADD(nh, nh->nlmsg_len);
1578        struct nlattr *nla = RTE_PTR_ADD(nh, NLMSG_ALIGN(sizeof(*nh)) +
1579                                        NLMSG_ALIGN(sizeof(struct genlmsghdr)));
1580
1581        for (; nla->nla_len && nla < tail;
1582             nla = RTE_PTR_ADD(nla, NLMSG_ALIGN(nla->nla_len))) {
1583                if (nla->nla_type == CTRL_ATTR_FAMILY_ID) {
1584                        *(uint16_t *)arg = *(uint16_t *)(nla + 1);
1585                        return 0;
1586                }
1587        }
1588        return -EINVAL;
1589}
1590
1591#define MLX5_NL_MAX_ATTR_SIZE 100
1592/**
1593 * Get generic netlink family ID.
1594 *
1595 * @param[in] nlsk_fd
1596 *   Netlink socket file descriptor.
1597 * @param[in] name
1598 *   The family name.
1599 *
1600 * @return
1601 *   ID >= 0 on success and @p enable is updated, a negative errno value
1602 *   otherwise and rte_errno is set.
1603 */
1604static int
1605mlx5_nl_generic_family_id_get(int nlsk_fd, const char *name)
1606{
1607        struct nlmsghdr *nlh;
1608        struct genlmsghdr *genl;
1609        uint32_t sn = MLX5_NL_SN_GENERATE;
1610        int name_size = strlen(name) + 1;
1611        int ret;
1612        uint16_t id = -1;
1613        uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1614                    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1615                    NLMSG_ALIGN(sizeof(struct nlattr)) +
1616                    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE)];
1617
1618        memset(buf, 0, sizeof(buf));
1619        nlh = (struct nlmsghdr *)buf;
1620        nlh->nlmsg_len = sizeof(struct nlmsghdr);
1621        nlh->nlmsg_type = GENL_ID_CTRL;
1622        nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1623        genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1624        nlh->nlmsg_len += sizeof(struct genlmsghdr);
1625        genl->cmd = CTRL_CMD_GETFAMILY;
1626        genl->version = 1;
1627        nl_attr_put(nlh, CTRL_ATTR_FAMILY_NAME, name, name_size);
1628        ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1629        if (ret >= 0)
1630                ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_family_id_cb, &id);
1631        if (ret < 0) {
1632                DRV_LOG(DEBUG, "Failed to get Netlink %s family ID: %d.", name,
1633                        ret);
1634                return ret;
1635        }
1636        DRV_LOG(DEBUG, "Netlink \"%s\" family ID is %u.", name, id);
1637        return (int)id;
1638}
1639
/**
 * Get Devlink family ID.
 *
 * Thin wrapper resolving the generic Netlink family named
 * DEVLINK_GENL_NAME.
 *
 * @param[in] nlsk_fd
 *   Netlink socket file descriptor.
 *
 * @return
 *   ID >= 0 on success, a negative errno value otherwise, as reported by
 *   mlx5_nl_generic_family_id_get().
 */
int
mlx5_nl_devlink_family_id_get(int nlsk_fd)
{
	const char *family = DEVLINK_GENL_NAME;

	return mlx5_nl_generic_family_id_get(nlsk_fd, family);
}
1656
1657/**
1658 * Parse Netlink message to retrieve the ROCE enable status.
1659 *
1660 * @param nh
1661 *   Pointer to Netlink Message Header.
1662 * @param arg
1663 *   PMD data register with this callback.
1664 *
1665 * @return
1666 *   0 on success, a negative errno value otherwise and rte_errno is set.
1667 */
1668static int
1669mlx5_nl_roce_cb(struct nlmsghdr *nh, void *arg)
1670{
1671
1672        int ret = -EINVAL;
1673        int *enable = arg;
1674        struct nlattr *tail = RTE_PTR_ADD(nh, nh->nlmsg_len);
1675        struct nlattr *nla = RTE_PTR_ADD(nh, NLMSG_ALIGN(sizeof(*nh)) +
1676                                        NLMSG_ALIGN(sizeof(struct genlmsghdr)));
1677
1678        while (nla->nla_len && nla < tail) {
1679                switch (nla->nla_type) {
1680                /* Expected nested attributes case. */
1681                case DEVLINK_ATTR_PARAM:
1682                case DEVLINK_ATTR_PARAM_VALUES_LIST:
1683                case DEVLINK_ATTR_PARAM_VALUE:
1684                        ret = 0;
1685                        nla += 1;
1686                        break;
1687                case DEVLINK_ATTR_PARAM_VALUE_DATA:
1688                        *enable = 1;
1689                        return 0;
1690                default:
1691                        nla = RTE_PTR_ADD(nla, NLMSG_ALIGN(nla->nla_len));
1692                }
1693        }
1694        *enable = 0;
1695        return ret;
1696}
1697
1698/**
1699 * Get ROCE enable status through Netlink.
1700 *
1701 * @param[in] nlsk_fd
1702 *   Netlink socket file descriptor.
1703 * @param[in] family_id
1704 *   the Devlink family ID.
1705 * @param pci_addr
1706 *   The device PCI address.
1707 * @param[out] enable
1708 *   Where to store the enable status.
1709 *
1710 * @return
1711 *   0 on success and @p enable is updated, a negative errno value otherwise
1712 *   and rte_errno is set.
1713 */
1714int
1715mlx5_nl_enable_roce_get(int nlsk_fd, int family_id, const char *pci_addr,
1716                        int *enable)
1717{
1718        struct nlmsghdr *nlh;
1719        struct genlmsghdr *genl;
1720        uint32_t sn = MLX5_NL_SN_GENERATE;
1721        int ret;
1722        int cur_en = 0;
1723        uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1724                    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1725                    NLMSG_ALIGN(sizeof(struct nlattr)) * 4 +
1726                    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 4];
1727
1728        memset(buf, 0, sizeof(buf));
1729        nlh = (struct nlmsghdr *)buf;
1730        nlh->nlmsg_len = sizeof(struct nlmsghdr);
1731        nlh->nlmsg_type = family_id;
1732        nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1733        genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1734        nlh->nlmsg_len += sizeof(struct genlmsghdr);
1735        genl->cmd = DEVLINK_CMD_PARAM_GET;
1736        genl->version = DEVLINK_GENL_VERSION;
1737        nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1738        nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1739        nl_attr_put(nlh, DEVLINK_ATTR_PARAM_NAME, "enable_roce", 12);
1740        ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1741        if (ret >= 0)
1742                ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_roce_cb, &cur_en);
1743        if (ret < 0) {
1744                DRV_LOG(DEBUG, "Failed to get ROCE enable on device %s: %d.",
1745                        pci_addr, ret);
1746                return ret;
1747        }
1748        *enable = cur_en;
1749        DRV_LOG(DEBUG, "ROCE is %sabled for device \"%s\".",
1750                cur_en ? "en" : "dis", pci_addr);
1751        return ret;
1752}
1753
1754/**
1755 * Reload mlx5 device kernel driver through Netlink.
1756 *
1757 * @param[in] nlsk_fd
1758 *   Netlink socket file descriptor.
1759 * @param[in] family_id
1760 *   the Devlink family ID.
1761 * @param pci_addr
1762 *   The device PCI address.
1763 * @param[out] enable
1764 *   The enable status to set.
1765 *
1766 * @return
1767 *   0 on success, a negative errno value otherwise and rte_errno is set.
1768 */
1769static int
1770mlx5_nl_driver_reload(int nlsk_fd, int family_id, const char *pci_addr)
1771{
1772        struct nlmsghdr *nlh;
1773        struct genlmsghdr *genl;
1774        uint32_t sn = MLX5_NL_SN_GENERATE;
1775        int ret;
1776        uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1777                    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1778                    NLMSG_ALIGN(sizeof(struct nlattr)) * 2 +
1779                    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 2];
1780
1781        memset(buf, 0, sizeof(buf));
1782        nlh = (struct nlmsghdr *)buf;
1783        nlh->nlmsg_len = sizeof(struct nlmsghdr);
1784        nlh->nlmsg_type = family_id;
1785        nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1786        genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1787        nlh->nlmsg_len += sizeof(struct genlmsghdr);
1788        genl->cmd = DEVLINK_CMD_RELOAD;
1789        genl->version = DEVLINK_GENL_VERSION;
1790        nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1791        nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1792        ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1793        if (ret >= 0)
1794                ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
1795        if (ret < 0) {
1796                DRV_LOG(DEBUG, "Failed to reload %s device by Netlink - %d",
1797                        pci_addr, ret);
1798                return ret;
1799        }
1800        DRV_LOG(DEBUG, "Device \"%s\" was reloaded by Netlink successfully.",
1801                pci_addr);
1802        return 0;
1803}
1804
1805/**
1806 * Set ROCE enable status through Netlink.
1807 *
1808 * @param[in] nlsk_fd
1809 *   Netlink socket file descriptor.
1810 * @param[in] family_id
1811 *   the Devlink family ID.
1812 * @param pci_addr
1813 *   The device PCI address.
1814 * @param[out] enable
1815 *   The enable status to set.
1816 *
1817 * @return
1818 *   0 on success, a negative errno value otherwise and rte_errno is set.
1819 */
1820int
1821mlx5_nl_enable_roce_set(int nlsk_fd, int family_id, const char *pci_addr,
1822                        int enable)
1823{
1824        struct nlmsghdr *nlh;
1825        struct genlmsghdr *genl;
1826        uint32_t sn = MLX5_NL_SN_GENERATE;
1827        int ret;
1828        uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1829                    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1830                    NLMSG_ALIGN(sizeof(struct nlattr)) * 6 +
1831                    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 6];
1832        uint8_t cmode = DEVLINK_PARAM_CMODE_DRIVERINIT;
1833        uint8_t ptype = NLA_FLAG;
1834;
1835
1836        memset(buf, 0, sizeof(buf));
1837        nlh = (struct nlmsghdr *)buf;
1838        nlh->nlmsg_len = sizeof(struct nlmsghdr);
1839        nlh->nlmsg_type = family_id;
1840        nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1841        genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1842        nlh->nlmsg_len += sizeof(struct genlmsghdr);
1843        genl->cmd = DEVLINK_CMD_PARAM_SET;
1844        genl->version = DEVLINK_GENL_VERSION;
1845        nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1846        nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1847        nl_attr_put(nlh, DEVLINK_ATTR_PARAM_NAME, "enable_roce", 12);
1848        nl_attr_put(nlh, DEVLINK_ATTR_PARAM_VALUE_CMODE, &cmode, sizeof(cmode));
1849        nl_attr_put(nlh, DEVLINK_ATTR_PARAM_TYPE, &ptype, sizeof(ptype));
1850        if (enable)
1851                nl_attr_put(nlh, DEVLINK_ATTR_PARAM_VALUE_DATA, NULL, 0);
1852        ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1853        if (ret >= 0)
1854                ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
1855        if (ret < 0) {
1856                DRV_LOG(DEBUG, "Failed to %sable ROCE for device %s by Netlink:"
1857                        " %d.", enable ? "en" : "dis", pci_addr, ret);
1858                return ret;
1859        }
1860        DRV_LOG(DEBUG, "Device %s ROCE was %sabled by Netlink successfully.",
1861                pci_addr, enable ? "en" : "dis");
1862        /* Now, need to reload the driver. */
1863        return mlx5_nl_driver_reload(nlsk_fd, family_id, pci_addr);
1864}
1865