dpdk/drivers/common/mlx5/linux/mlx5_nl.c
<<
>>
Prefs
   1/* SPDX-License-Identifier: BSD-3-Clause
   2 * Copyright 2018 6WIND S.A.
   3 * Copyright 2018 Mellanox Technologies, Ltd
   4 */
   5
   6#include <errno.h>
   7#include <linux/if_link.h>
   8#include <linux/rtnetlink.h>
   9#include <linux/genetlink.h>
  10#include <net/if.h>
  11#include <rdma/rdma_netlink.h>
  12#include <stdbool.h>
  13#include <stdint.h>
  14#include <stdlib.h>
  15#include <stdalign.h>
  16#include <string.h>
  17#include <sys/socket.h>
  18#include <unistd.h>
  19
  20#include <rte_errno.h>
  21
  22#include "mlx5_nl.h"
  23#include "../mlx5_common_log.h"
  24#include "mlx5_malloc.h"
  25#ifdef HAVE_DEVLINK
  26#include <linux/devlink.h>
  27#endif
  28
  29
  30/* Size of the buffer to receive kernel messages */
  31#define MLX5_NL_BUF_SIZE (32 * 1024)
  32/* Send buffer size for the Netlink socket */
  33#define MLX5_SEND_BUF_SIZE 32768
  34/* Receive buffer size for the Netlink socket */
  35#define MLX5_RECV_BUF_SIZE 32768
  36/* Maximal physical port name length. */
  37#define MLX5_PHYS_PORT_NAME_MAX 128
  38
  39/** Parameters of VLAN devices created by driver. */
  40#define MLX5_VMWA_VLAN_DEVICE_PFX "evmlx"
  41/*
  42 * Define NDA_RTA as defined in iproute2 sources.
  43 *
  44 * see in iproute2 sources file include/libnetlink.h
  45 */
  46#ifndef MLX5_NDA_RTA
  47#define MLX5_NDA_RTA(r) \
  48        ((struct rtattr *)(((char *)(r)) + NLMSG_ALIGN(sizeof(struct ndmsg))))
  49#endif
  50/*
  51 * Define NLMSG_TAIL as defined in iproute2 sources.
  52 *
  53 * see in iproute2 sources file include/libnetlink.h
  54 */
  55#ifndef NLMSG_TAIL
  56#define NLMSG_TAIL(nmsg) \
  57        ((struct rtattr *)(((char *)(nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len)))
  58#endif
  59/*
  60 * The following definitions are normally found in rdma/rdma_netlink.h,
  61 * however they are so recent that most systems do not expose them yet.
  62 */
  63#ifndef HAVE_RDMA_NL_NLDEV
  64#define RDMA_NL_NLDEV 5
  65#endif
  66#ifndef HAVE_RDMA_NLDEV_CMD_GET
  67#define RDMA_NLDEV_CMD_GET 1
  68#endif
  69#ifndef HAVE_RDMA_NLDEV_CMD_PORT_GET
  70#define RDMA_NLDEV_CMD_PORT_GET 5
  71#endif
  72#ifndef HAVE_RDMA_NLDEV_ATTR_DEV_INDEX
  73#define RDMA_NLDEV_ATTR_DEV_INDEX 1
  74#endif
  75#ifndef HAVE_RDMA_NLDEV_ATTR_DEV_NAME
  76#define RDMA_NLDEV_ATTR_DEV_NAME 2
  77#endif
  78#ifndef HAVE_RDMA_NLDEV_ATTR_PORT_INDEX
  79#define RDMA_NLDEV_ATTR_PORT_INDEX 3
  80#endif
  81#ifndef HAVE_RDMA_NLDEV_ATTR_PORT_STATE
  82#define RDMA_NLDEV_ATTR_PORT_STATE 12
  83#endif
  84#ifndef HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX
  85#define RDMA_NLDEV_ATTR_NDEV_INDEX 50
  86#endif
  87
  88/* These are normally found in linux/if_link.h. */
  89#ifndef HAVE_IFLA_NUM_VF
  90#define IFLA_NUM_VF 21
  91#endif
  92#ifndef HAVE_IFLA_EXT_MASK
  93#define IFLA_EXT_MASK 29
  94#endif
  95#ifndef HAVE_IFLA_PHYS_SWITCH_ID
  96#define IFLA_PHYS_SWITCH_ID 36
  97#endif
  98#ifndef HAVE_IFLA_PHYS_PORT_NAME
  99#define IFLA_PHYS_PORT_NAME 38
 100#endif
 101
/*
 * Some devlink definitions may be missing from the headers of older kernels;
 * provide fallback values for the ones used in this file.
 */
 106#ifndef DEVLINK_GENL_NAME
 107#define DEVLINK_GENL_NAME "devlink"
 108#endif
 109#ifndef DEVLINK_GENL_VERSION
 110#define DEVLINK_GENL_VERSION 1
 111#endif
 112#ifndef DEVLINK_ATTR_BUS_NAME
 113#define DEVLINK_ATTR_BUS_NAME 1
 114#endif
 115#ifndef DEVLINK_ATTR_DEV_NAME
 116#define DEVLINK_ATTR_DEV_NAME 2
 117#endif
 118#ifndef DEVLINK_ATTR_PARAM
 119#define DEVLINK_ATTR_PARAM 80
 120#endif
 121#ifndef DEVLINK_ATTR_PARAM_NAME
 122#define DEVLINK_ATTR_PARAM_NAME 81
 123#endif
 124#ifndef DEVLINK_ATTR_PARAM_TYPE
 125#define DEVLINK_ATTR_PARAM_TYPE 83
 126#endif
 127#ifndef DEVLINK_ATTR_PARAM_VALUES_LIST
 128#define DEVLINK_ATTR_PARAM_VALUES_LIST 84
 129#endif
 130#ifndef DEVLINK_ATTR_PARAM_VALUE
 131#define DEVLINK_ATTR_PARAM_VALUE 85
 132#endif
 133#ifndef DEVLINK_ATTR_PARAM_VALUE_DATA
 134#define DEVLINK_ATTR_PARAM_VALUE_DATA 86
 135#endif
 136#ifndef DEVLINK_ATTR_PARAM_VALUE_CMODE
 137#define DEVLINK_ATTR_PARAM_VALUE_CMODE 87
 138#endif
 139#ifndef DEVLINK_PARAM_CMODE_DRIVERINIT
 140#define DEVLINK_PARAM_CMODE_DRIVERINIT 1
 141#endif
 142#ifndef DEVLINK_CMD_RELOAD
 143#define DEVLINK_CMD_RELOAD 37
 144#endif
 145#ifndef DEVLINK_CMD_PARAM_GET
 146#define DEVLINK_CMD_PARAM_GET 38
 147#endif
 148#ifndef DEVLINK_CMD_PARAM_SET
 149#define DEVLINK_CMD_PARAM_SET 39
 150#endif
 151#ifndef NLA_FLAG
 152#define NLA_FLAG 6
 153#endif
 154
 155/* Add/remove MAC address through Netlink */
 156struct mlx5_nl_mac_addr {
 157        struct rte_ether_addr (*mac)[];
 158        /**< MAC address handled by the device. */
 159        int mac_n; /**< Number of addresses in the array. */
 160};
 161
 162#define MLX5_NL_CMD_GET_IB_NAME (1 << 0)
 163#define MLX5_NL_CMD_GET_IB_INDEX (1 << 1)
 164#define MLX5_NL_CMD_GET_NET_INDEX (1 << 2)
 165#define MLX5_NL_CMD_GET_PORT_INDEX (1 << 3)
 166#define MLX5_NL_CMD_GET_PORT_STATE (1 << 4)
 167
 168/** Data structure used by mlx5_nl_cmdget_cb(). */
 169struct mlx5_nl_port_info {
 170        const char *name; /**< IB device name (in). */
 171        uint32_t flags; /**< found attribute flags (out). */
 172        uint32_t ibindex; /**< IB device index (out). */
 173        uint32_t ifindex; /**< Network interface index (out). */
 174        uint32_t portnum; /**< IB device max port number (out). */
 175        uint16_t state; /**< IB device port state (out). */
 176};
 177
 178uint32_t atomic_sn;
 179
 180/* Generate Netlink sequence number. */
 181#define MLX5_NL_SN_GENERATE __atomic_add_fetch(&atomic_sn, 1, __ATOMIC_RELAXED)
 182
 183/**
 184 * Opens a Netlink socket.
 185 *
 186 * @param protocol
 187 *   Netlink protocol (e.g. NETLINK_ROUTE, NETLINK_RDMA).
 188 * @param groups
 189 *   Groups to listen (e.g. RTMGRP_LINK), can be 0.
 190 *
 191 * @return
 192 *   A file descriptor on success, a negative errno value otherwise and
 193 *   rte_errno is set.
 194 */
 195int
 196mlx5_nl_init(int protocol, int groups)
 197{
 198        int fd;
 199        int buf_size;
 200        socklen_t opt_size;
 201        struct sockaddr_nl local = {
 202                .nl_family = AF_NETLINK,
 203                .nl_groups = groups,
 204        };
 205        int ret;
 206
 207        fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, protocol);
 208        if (fd == -1) {
 209                rte_errno = errno;
 210                return -rte_errno;
 211        }
 212        opt_size = sizeof(buf_size);
 213        ret = getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &buf_size, &opt_size);
 214        if (ret == -1) {
 215                rte_errno = errno;
 216                goto error;
 217        }
 218        DRV_LOG(DEBUG, "Netlink socket send buffer: %d", buf_size);
 219        if (buf_size < MLX5_SEND_BUF_SIZE) {
 220                ret = setsockopt(fd, SOL_SOCKET, SO_SNDBUF,
 221                                 &buf_size, sizeof(buf_size));
 222                if (ret == -1) {
 223                        rte_errno = errno;
 224                        goto error;
 225                }
 226        }
 227        opt_size = sizeof(buf_size);
 228        ret = getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &buf_size, &opt_size);
 229        if (ret == -1) {
 230                rte_errno = errno;
 231                goto error;
 232        }
 233        DRV_LOG(DEBUG, "Netlink socket recv buffer: %d", buf_size);
 234        if (buf_size < MLX5_RECV_BUF_SIZE) {
 235                ret = setsockopt(fd, SOL_SOCKET, SO_RCVBUF,
 236                                 &buf_size, sizeof(buf_size));
 237                if (ret == -1) {
 238                        rte_errno = errno;
 239                        goto error;
 240                }
 241        }
 242        ret = bind(fd, (struct sockaddr *)&local, sizeof(local));
 243        if (ret == -1) {
 244                rte_errno = errno;
 245                goto error;
 246        }
 247        return fd;
 248error:
 249        close(fd);
 250        return -rte_errno;
 251}
 252
 253/**
 254 * Send a request message to the kernel on the Netlink socket.
 255 *
 256 * @param[in] nlsk_fd
 257 *   Netlink socket file descriptor.
 258 * @param[in] nh
 259 *   The Netlink message send to the kernel.
 260 * @param[in] ssn
 261 *   Sequence number.
 262 * @param[in] req
 263 *   Pointer to the request structure.
 264 * @param[in] len
 265 *   Length of the request in bytes.
 266 *
 267 * @return
 268 *   The number of sent bytes on success, a negative errno value otherwise and
 269 *   rte_errno is set.
 270 */
 271static int
 272mlx5_nl_request(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn, void *req,
 273                int len)
 274{
 275        struct sockaddr_nl sa = {
 276                .nl_family = AF_NETLINK,
 277        };
 278        struct iovec iov[2] = {
 279                { .iov_base = nh, .iov_len = sizeof(*nh), },
 280                { .iov_base = req, .iov_len = len, },
 281        };
 282        struct msghdr msg = {
 283                .msg_name = &sa,
 284                .msg_namelen = sizeof(sa),
 285                .msg_iov = iov,
 286                .msg_iovlen = 2,
 287        };
 288        int send_bytes;
 289
 290        nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
 291        nh->nlmsg_seq = sn;
 292        send_bytes = sendmsg(nlsk_fd, &msg, 0);
 293        if (send_bytes < 0) {
 294                rte_errno = errno;
 295                return -rte_errno;
 296        }
 297        return send_bytes;
 298}
 299
 300/**
 301 * Send a message to the kernel on the Netlink socket.
 302 *
 303 * @param[in] nlsk_fd
 304 *   The Netlink socket file descriptor used for communication.
 305 * @param[in] nh
 306 *   The Netlink message send to the kernel.
 307 * @param[in] sn
 308 *   Sequence number.
 309 *
 310 * @return
 311 *   The number of sent bytes on success, a negative errno value otherwise and
 312 *   rte_errno is set.
 313 */
 314static int
 315mlx5_nl_send(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn)
 316{
 317        struct sockaddr_nl sa = {
 318                .nl_family = AF_NETLINK,
 319        };
 320        struct iovec iov = {
 321                .iov_base = nh,
 322                .iov_len = nh->nlmsg_len,
 323        };
 324        struct msghdr msg = {
 325                .msg_name = &sa,
 326                .msg_namelen = sizeof(sa),
 327                .msg_iov = &iov,
 328                .msg_iovlen = 1,
 329        };
 330        int send_bytes;
 331
 332        nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
 333        nh->nlmsg_seq = sn;
 334        send_bytes = sendmsg(nlsk_fd, &msg, 0);
 335        if (send_bytes < 0) {
 336                rte_errno = errno;
 337                return -rte_errno;
 338        }
 339        return send_bytes;
 340}
 341
 342/**
 343 * Receive a message from the kernel on the Netlink socket, following
 344 * mlx5_nl_send().
 345 *
 346 * @param[in] nlsk_fd
 347 *   The Netlink socket file descriptor used for communication.
 348 * @param[in] sn
 349 *   Sequence number.
 350 * @param[in] cb
 351 *   The callback function to call for each Netlink message received.
 352 * @param[in, out] arg
 353 *   Custom arguments for the callback.
 354 *
 355 * @return
 356 *   0 on success, a negative errno value otherwise and rte_errno is set.
 357 */
 358static int
 359mlx5_nl_recv(int nlsk_fd, uint32_t sn, int (*cb)(struct nlmsghdr *, void *arg),
 360             void *arg)
 361{
 362        struct sockaddr_nl sa;
 363        struct iovec iov;
 364        struct msghdr msg = {
 365                .msg_name = &sa,
 366                .msg_namelen = sizeof(sa),
 367                .msg_iov = &iov,
 368                /* One message at a time */
 369                .msg_iovlen = 1,
 370        };
 371        void *buf = NULL;
 372        int multipart = 0;
 373        int ret = 0;
 374
 375        do {
 376                struct nlmsghdr *nh;
 377                int recv_bytes;
 378
 379                do {
 380                        /* Query length of incoming message. */
 381                        iov.iov_base = NULL;
 382                        iov.iov_len = 0;
 383                        recv_bytes = recvmsg(nlsk_fd, &msg,
 384                                             MSG_PEEK | MSG_TRUNC);
 385                        if (recv_bytes < 0) {
 386                                rte_errno = errno;
 387                                ret = -rte_errno;
 388                                goto exit;
 389                        }
 390                        if (recv_bytes == 0) {
 391                                rte_errno = ENODATA;
 392                                ret = -rte_errno;
 393                                goto exit;
 394                        }
 395                        /* Allocate buffer to fetch the message. */
 396                        if (recv_bytes < MLX5_RECV_BUF_SIZE)
 397                                recv_bytes = MLX5_RECV_BUF_SIZE;
 398                        mlx5_free(buf);
 399                        buf = mlx5_malloc(0, recv_bytes, 0, SOCKET_ID_ANY);
 400                        if (!buf) {
 401                                rte_errno = ENOMEM;
 402                                ret = -rte_errno;
 403                                goto exit;
 404                        }
 405                        /* Fetch the message. */
 406                        iov.iov_base = buf;
 407                        iov.iov_len = recv_bytes;
 408                        recv_bytes = recvmsg(nlsk_fd, &msg, 0);
 409                        if (recv_bytes == -1) {
 410                                rte_errno = errno;
 411                                ret = -rte_errno;
 412                                goto exit;
 413                        }
 414                        nh = (struct nlmsghdr *)buf;
 415                } while (nh->nlmsg_seq != sn);
 416                for (;
 417                     NLMSG_OK(nh, (unsigned int)recv_bytes);
 418                     nh = NLMSG_NEXT(nh, recv_bytes)) {
 419                        if (nh->nlmsg_type == NLMSG_ERROR) {
 420                                struct nlmsgerr *err_data = NLMSG_DATA(nh);
 421
 422                                if (err_data->error < 0) {
 423                                        rte_errno = -err_data->error;
 424                                        ret = -rte_errno;
 425                                        goto exit;
 426                                }
 427                                /* Ack message. */
 428                                ret = 0;
 429                                goto exit;
 430                        }
 431                        /* Multi-part msgs and their trailing DONE message. */
 432                        if (nh->nlmsg_flags & NLM_F_MULTI) {
 433                                if (nh->nlmsg_type == NLMSG_DONE) {
 434                                        ret =  0;
 435                                        goto exit;
 436                                }
 437                                multipart = 1;
 438                        }
 439                        if (cb) {
 440                                ret = cb(nh, arg);
 441                                if (ret < 0)
 442                                        goto exit;
 443                        }
 444                }
 445        } while (multipart);
 446exit:
 447        mlx5_free(buf);
 448        return ret;
 449}
 450
 451/**
 452 * Parse Netlink message to retrieve the bridge MAC address.
 453 *
 454 * @param nh
 455 *   Pointer to Netlink Message Header.
 456 * @param arg
 457 *   PMD data register with this callback.
 458 *
 459 * @return
 460 *   0 on success, a negative errno value otherwise and rte_errno is set.
 461 */
 462static int
 463mlx5_nl_mac_addr_cb(struct nlmsghdr *nh, void *arg)
 464{
 465        struct mlx5_nl_mac_addr *data = arg;
 466        struct ndmsg *r = NLMSG_DATA(nh);
 467        struct rtattr *attribute;
 468        int len;
 469
 470        len = nh->nlmsg_len - NLMSG_LENGTH(sizeof(*r));
 471        for (attribute = MLX5_NDA_RTA(r);
 472             RTA_OK(attribute, len);
 473             attribute = RTA_NEXT(attribute, len)) {
 474                if (attribute->rta_type == NDA_LLADDR) {
 475                        if (data->mac_n == MLX5_MAX_MAC_ADDRESSES) {
 476                                DRV_LOG(WARNING,
 477                                        "not enough room to finalize the"
 478                                        " request");
 479                                rte_errno = ENOMEM;
 480                                return -rte_errno;
 481                        }
 482#ifdef RTE_LIBRTE_MLX5_DEBUG
 483                        char m[RTE_ETHER_ADDR_FMT_SIZE];
 484
 485                        rte_ether_format_addr(m, RTE_ETHER_ADDR_FMT_SIZE,
 486                                              RTA_DATA(attribute));
 487                        DRV_LOG(DEBUG, "bridge MAC address %s", m);
 488#endif
 489                        memcpy(&(*data->mac)[data->mac_n++],
 490                               RTA_DATA(attribute), RTE_ETHER_ADDR_LEN);
 491                }
 492        }
 493        return 0;
 494}
 495
 496/**
 497 * Get bridge MAC addresses.
 498 *
 499 * @param[in] nlsk_fd
 500 *   Netlink socket file descriptor.
 501 * @param[in] iface_idx
 502 *   Net device interface index.
 503 * @param mac[out]
 504 *   Pointer to the array table of MAC addresses to fill.
 505 *   Its size should be of MLX5_MAX_MAC_ADDRESSES.
 506 * @param mac_n[out]
 507 *   Number of entries filled in MAC array.
 508 *
 509 * @return
 510 *   0 on success, a negative errno value otherwise and rte_errno is set.
 511 */
 512static int
 513mlx5_nl_mac_addr_list(int nlsk_fd, unsigned int iface_idx,
 514                      struct rte_ether_addr (*mac)[], int *mac_n)
 515{
 516        struct {
 517                struct nlmsghdr hdr;
 518                struct ifinfomsg ifm;
 519        } req = {
 520                .hdr = {
 521                        .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
 522                        .nlmsg_type = RTM_GETNEIGH,
 523                        .nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
 524                },
 525                .ifm = {
 526                        .ifi_family = PF_BRIDGE,
 527                        .ifi_index = iface_idx,
 528                },
 529        };
 530        struct mlx5_nl_mac_addr data = {
 531                .mac = mac,
 532                .mac_n = 0,
 533        };
 534        uint32_t sn = MLX5_NL_SN_GENERATE;
 535        int ret;
 536
 537        if (nlsk_fd == -1)
 538                return 0;
 539        ret = mlx5_nl_request(nlsk_fd, &req.hdr, sn, &req.ifm,
 540                              sizeof(struct ifinfomsg));
 541        if (ret < 0)
 542                goto error;
 543        ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_mac_addr_cb, &data);
 544        if (ret < 0)
 545                goto error;
 546        *mac_n = data.mac_n;
 547        return 0;
 548error:
 549        DRV_LOG(DEBUG, "Interface %u cannot retrieve MAC address list %s",
 550                iface_idx, strerror(rte_errno));
 551        return -rte_errno;
 552}
 553
 554/**
 555 * Modify the MAC address neighbour table with Netlink.
 556 *
 557 * @param[in] nlsk_fd
 558 *   Netlink socket file descriptor.
 559 * @param[in] iface_idx
 560 *   Net device interface index.
 561 * @param mac
 562 *   MAC address to consider.
 563 * @param add
 564 *   1 to add the MAC address, 0 to remove the MAC address.
 565 *
 566 * @return
 567 *   0 on success, a negative errno value otherwise and rte_errno is set.
 568 */
 569static int
 570mlx5_nl_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
 571                        struct rte_ether_addr *mac, int add)
 572{
 573        struct {
 574                struct nlmsghdr hdr;
 575                struct ndmsg ndm;
 576                struct rtattr rta;
 577                uint8_t buffer[RTE_ETHER_ADDR_LEN];
 578        } req = {
 579                .hdr = {
 580                        .nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)),
 581                        .nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
 582                                NLM_F_EXCL | NLM_F_ACK,
 583                        .nlmsg_type = add ? RTM_NEWNEIGH : RTM_DELNEIGH,
 584                },
 585                .ndm = {
 586                        .ndm_family = PF_BRIDGE,
 587                        .ndm_state = NUD_NOARP | NUD_PERMANENT,
 588                        .ndm_ifindex = iface_idx,
 589                        .ndm_flags = NTF_SELF,
 590                },
 591                .rta = {
 592                        .rta_type = NDA_LLADDR,
 593                        .rta_len = RTA_LENGTH(RTE_ETHER_ADDR_LEN),
 594                },
 595        };
 596        uint32_t sn = MLX5_NL_SN_GENERATE;
 597        int ret;
 598
 599        if (nlsk_fd == -1)
 600                return 0;
 601        memcpy(RTA_DATA(&req.rta), mac, RTE_ETHER_ADDR_LEN);
 602        req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
 603                RTA_ALIGN(req.rta.rta_len);
 604        ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
 605        if (ret < 0)
 606                goto error;
 607        ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
 608        if (ret < 0)
 609                goto error;
 610        return 0;
 611error:
 612#ifdef RTE_LIBRTE_MLX5_DEBUG
 613        {
 614                char m[RTE_ETHER_ADDR_FMT_SIZE];
 615
 616                rte_ether_format_addr(m, RTE_ETHER_ADDR_FMT_SIZE, mac);
 617                DRV_LOG(DEBUG,
 618                        "Interface %u cannot %s MAC address %s %s",
 619                        iface_idx,
 620                        add ? "add" : "remove", m, strerror(rte_errno));
 621        }
 622#endif
 623        return -rte_errno;
 624}
 625
 626/**
 627 * Modify the VF MAC address neighbour table with Netlink.
 628 *
 629 * @param[in] nlsk_fd
 630 *   Netlink socket file descriptor.
 631 * @param[in] iface_idx
 632 *   Net device interface index.
 633 * @param mac
 634 *    MAC address to consider.
 635 * @param vf_index
 636 *    VF index.
 637 *
 638 * @return
 639 *    0 on success, a negative errno value otherwise and rte_errno is set.
 640 */
 641int
 642mlx5_nl_vf_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
 643                           struct rte_ether_addr *mac, int vf_index)
 644{
 645        int ret;
 646        struct {
 647                struct nlmsghdr hdr;
 648                struct ifinfomsg ifm;
 649                struct rtattr vf_list_rta;
 650                struct rtattr vf_info_rta;
 651                struct rtattr vf_mac_rta;
 652                struct ifla_vf_mac ivm;
 653        } req = {
 654                .hdr = {
 655                        .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
 656                        .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
 657                        .nlmsg_type = RTM_BASE,
 658                },
 659                .ifm = {
 660                        .ifi_index = iface_idx,
 661                },
 662                .vf_list_rta = {
 663                        .rta_type = IFLA_VFINFO_LIST,
 664                        .rta_len = RTA_ALIGN(RTA_LENGTH(0)),
 665                },
 666                .vf_info_rta = {
 667                        .rta_type = IFLA_VF_INFO,
 668                        .rta_len = RTA_ALIGN(RTA_LENGTH(0)),
 669                },
 670                .vf_mac_rta = {
 671                        .rta_type = IFLA_VF_MAC,
 672                },
 673        };
 674        struct ifla_vf_mac ivm = {
 675                .vf = vf_index,
 676        };
 677        uint32_t sn = MLX5_NL_SN_GENERATE;
 678
 679        memcpy(&ivm.mac, mac, RTE_ETHER_ADDR_LEN);
 680        memcpy(RTA_DATA(&req.vf_mac_rta), &ivm, sizeof(ivm));
 681
 682        req.vf_mac_rta.rta_len = RTA_LENGTH(sizeof(ivm));
 683        req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
 684                RTA_ALIGN(req.vf_list_rta.rta_len) +
 685                RTA_ALIGN(req.vf_info_rta.rta_len) +
 686                RTA_ALIGN(req.vf_mac_rta.rta_len);
 687        req.vf_list_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
 688                                               &req.vf_list_rta);
 689        req.vf_info_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
 690                                               &req.vf_info_rta);
 691
 692        if (nlsk_fd < 0)
 693                return -1;
 694        ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
 695        if (ret < 0)
 696                goto error;
 697        ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
 698        if (ret < 0)
 699                goto error;
 700        return 0;
 701error:
 702        DRV_LOG(ERR,
 703                "representor %u cannot set VF MAC address "
 704                RTE_ETHER_ADDR_PRT_FMT " : %s",
 705                vf_index,
 706                RTE_ETHER_ADDR_BYTES(mac),
 707                strerror(rte_errno));
 708        return -rte_errno;
 709}
 710
 711/**
 712 * Add a MAC address.
 713 *
 714 * @param[in] nlsk_fd
 715 *   Netlink socket file descriptor.
 716 * @param[in] iface_idx
 717 *   Net device interface index.
 718 * @param mac_own
 719 *   BITFIELD_DECLARE array to store the mac.
 720 * @param mac
 721 *   MAC address to register.
 722 * @param index
 723 *   MAC address index.
 724 *
 725 * @return
 726 *   0 on success, a negative errno value otherwise and rte_errno is set.
 727 */
 728int
 729mlx5_nl_mac_addr_add(int nlsk_fd, unsigned int iface_idx,
 730                     uint64_t *mac_own, struct rte_ether_addr *mac,
 731                     uint32_t index)
 732{
 733        int ret;
 734
 735        ret = mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 1);
 736        if (!ret) {
 737                MLX5_ASSERT(index < MLX5_MAX_MAC_ADDRESSES);
 738                if (index >= MLX5_MAX_MAC_ADDRESSES)
 739                        return -EINVAL;
 740
 741                BITFIELD_SET(mac_own, index);
 742        }
 743        if (ret == -EEXIST)
 744                return 0;
 745        return ret;
 746}
 747
 748/**
 749 * Remove a MAC address.
 750 *
 751 * @param[in] nlsk_fd
 752 *   Netlink socket file descriptor.
 753 * @param[in] iface_idx
 754 *   Net device interface index.
 755 * @param mac_own
 756 *   BITFIELD_DECLARE array to store the mac.
 757 * @param mac
 758 *   MAC address to remove.
 759 * @param index
 760 *   MAC address index.
 761 *
 762 * @return
 763 *   0 on success, a negative errno value otherwise and rte_errno is set.
 764 */
 765int
 766mlx5_nl_mac_addr_remove(int nlsk_fd, unsigned int iface_idx, uint64_t *mac_own,
 767                        struct rte_ether_addr *mac, uint32_t index)
 768{
 769        MLX5_ASSERT(index < MLX5_MAX_MAC_ADDRESSES);
 770        if (index >= MLX5_MAX_MAC_ADDRESSES)
 771                return -EINVAL;
 772
 773        BITFIELD_RESET(mac_own, index);
 774        return mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 0);
 775}
 776
 777/**
 778 * Synchronize Netlink bridge table to the internal table.
 779 *
 780 * @param[in] nlsk_fd
 781 *   Netlink socket file descriptor.
 782 * @param[in] iface_idx
 783 *   Net device interface index.
 784 * @param mac_addrs
 785 *   Mac addresses array to sync.
 786 * @param n
 787 *   @p mac_addrs array size.
 788 */
 789void
 790mlx5_nl_mac_addr_sync(int nlsk_fd, unsigned int iface_idx,
 791                      struct rte_ether_addr *mac_addrs, int n)
 792{
 793        struct rte_ether_addr macs[n];
 794        int macs_n = 0;
 795        int i;
 796        int ret;
 797
 798        memset(macs, 0, n * sizeof(macs[0]));
 799        ret = mlx5_nl_mac_addr_list(nlsk_fd, iface_idx, &macs, &macs_n);
 800        if (ret)
 801                return;
 802        for (i = 0; i != macs_n; ++i) {
 803                int j;
 804
 805                /* Verify the address is not in the array yet. */
 806                for (j = 0; j != n; ++j)
 807                        if (rte_is_same_ether_addr(&macs[i], &mac_addrs[j]))
 808                                break;
 809                if (j != n)
 810                        continue;
 811                if (rte_is_multicast_ether_addr(&macs[i])) {
 812                        /* Find the first entry available. */
 813                        for (j = MLX5_MAX_UC_MAC_ADDRESSES; j != n; ++j) {
 814                                if (rte_is_zero_ether_addr(&mac_addrs[j])) {
 815                                        mac_addrs[j] = macs[i];
 816                                        break;
 817                                }
 818                        }
 819                } else {
 820                        /* Find the first entry available. */
 821                        for (j = 0; j != MLX5_MAX_UC_MAC_ADDRESSES; ++j) {
 822                                if (rte_is_zero_ether_addr(&mac_addrs[j])) {
 823                                        mac_addrs[j] = macs[i];
 824                                        break;
 825                                }
 826                        }
 827                }
 828        }
 829}
 830
 831/**
 832 * Flush all added MAC addresses.
 833 *
 834 * @param[in] nlsk_fd
 835 *   Netlink socket file descriptor.
 836 * @param[in] iface_idx
 837 *   Net device interface index.
 838 * @param[in] mac_addrs
 839 *   Mac addresses array to flush.
 840 * @param n
 841 *   @p mac_addrs array size.
 842 * @param mac_own
 843 *   BITFIELD_DECLARE array to store the mac.
 844 */
 845void
 846mlx5_nl_mac_addr_flush(int nlsk_fd, unsigned int iface_idx,
 847                       struct rte_ether_addr *mac_addrs, int n,
 848                       uint64_t *mac_own)
 849{
 850        int i;
 851
 852        if (n <= 0 || n > MLX5_MAX_MAC_ADDRESSES)
 853                return;
 854
 855        for (i = n - 1; i >= 0; --i) {
 856                struct rte_ether_addr *m = &mac_addrs[i];
 857
 858                if (BITFIELD_ISSET(mac_own, i))
 859                        mlx5_nl_mac_addr_remove(nlsk_fd, iface_idx, mac_own, m,
 860                                                i);
 861        }
 862}
 863
 864/**
 865 * Enable promiscuous / all multicast mode through Netlink.
 866 *
 867 * @param[in] nlsk_fd
 868 *   Netlink socket file descriptor.
 869 * @param[in] iface_idx
 870 *   Net device interface index.
 871 * @param flags
 872 *   IFF_PROMISC for promiscuous, IFF_ALLMULTI for allmulti.
 873 * @param enable
 874 *   Nonzero to enable, disable otherwise.
 875 *
 876 * @return
 877 *   0 on success, a negative errno value otherwise and rte_errno is set.
 878 */
 879static int
 880mlx5_nl_device_flags(int nlsk_fd, unsigned int iface_idx, uint32_t flags,
 881                     int enable)
 882{
 883        struct {
 884                struct nlmsghdr hdr;
 885                struct ifinfomsg ifi;
 886        } req = {
 887                .hdr = {
 888                        .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
 889                        .nlmsg_type = RTM_NEWLINK,
 890                        .nlmsg_flags = NLM_F_REQUEST,
 891                },
 892                .ifi = {
 893                        .ifi_flags = enable ? flags : 0,
 894                        .ifi_change = flags,
 895                        .ifi_index = iface_idx,
 896                },
 897        };
 898        uint32_t sn = MLX5_NL_SN_GENERATE;
 899        int ret;
 900
 901        MLX5_ASSERT(!(flags & ~(IFF_PROMISC | IFF_ALLMULTI)));
 902        if (nlsk_fd < 0)
 903                return 0;
 904        ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
 905        if (ret < 0)
 906                return ret;
 907        return 0;
 908}
 909
 910/**
 911 * Enable promiscuous mode through Netlink.
 912 *
 913 * @param[in] nlsk_fd
 914 *   Netlink socket file descriptor.
 915 * @param[in] iface_idx
 916 *   Net device interface index.
 917 * @param enable
 918 *   Nonzero to enable, disable otherwise.
 919 *
 920 * @return
 921 *   0 on success, a negative errno value otherwise and rte_errno is set.
 922 */
 923int
 924mlx5_nl_promisc(int nlsk_fd, unsigned int iface_idx, int enable)
 925{
 926        int ret = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_PROMISC, enable);
 927
 928        if (ret)
 929                DRV_LOG(DEBUG,
 930                        "Interface %u cannot %s promisc mode: Netlink error %s",
 931                        iface_idx, enable ? "enable" : "disable",
 932                        strerror(rte_errno));
 933        return ret;
 934}
 935
 936/**
 937 * Enable all multicast mode through Netlink.
 938 *
 939 * @param[in] nlsk_fd
 940 *   Netlink socket file descriptor.
 941 * @param[in] iface_idx
 942 *   Net device interface index.
 943 * @param enable
 944 *   Nonzero to enable, disable otherwise.
 945 *
 946 * @return
 947 *   0 on success, a negative errno value otherwise and rte_errno is set.
 948 */
 949int
 950mlx5_nl_allmulti(int nlsk_fd, unsigned int iface_idx, int enable)
 951{
 952        int ret = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_ALLMULTI,
 953                                       enable);
 954
 955        if (ret)
 956                DRV_LOG(DEBUG,
 957                        "Interface %u cannot %s allmulti : Netlink error %s",
 958                        iface_idx, enable ? "enable" : "disable",
 959                        strerror(rte_errno));
 960        return ret;
 961}
 962
 963/**
 964 * Process network interface information from Netlink message.
 965 *
 966 * @param nh
 967 *   Pointer to Netlink message header.
 968 * @param arg
 969 *   Opaque data pointer for this callback.
 970 *
 971 * @return
 972 *   0 on success, a negative errno value otherwise and rte_errno is set.
 973 */
 974static int
 975mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
 976{
 977        struct mlx5_nl_port_info *data = arg;
 978        struct mlx5_nl_port_info local = {
 979                .flags = 0,
 980        };
 981        size_t off = NLMSG_HDRLEN;
 982
 983        if (nh->nlmsg_type !=
 984            RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET) &&
 985            nh->nlmsg_type !=
 986            RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_PORT_GET))
 987                goto error;
 988        while (off < nh->nlmsg_len) {
 989                struct nlattr *na = (void *)((uintptr_t)nh + off);
 990                void *payload = (void *)((uintptr_t)na + NLA_HDRLEN);
 991
 992                if (na->nla_len > nh->nlmsg_len - off)
 993                        goto error;
 994                switch (na->nla_type) {
 995                case RDMA_NLDEV_ATTR_DEV_INDEX:
 996                        local.ibindex = *(uint32_t *)payload;
 997                        local.flags |= MLX5_NL_CMD_GET_IB_INDEX;
 998                        break;
 999                case RDMA_NLDEV_ATTR_DEV_NAME:
1000                        if (!strcmp(payload, data->name))
1001                                local.flags |= MLX5_NL_CMD_GET_IB_NAME;
1002                        break;
1003                case RDMA_NLDEV_ATTR_NDEV_INDEX:
1004                        local.ifindex = *(uint32_t *)payload;
1005                        local.flags |= MLX5_NL_CMD_GET_NET_INDEX;
1006                        break;
1007                case RDMA_NLDEV_ATTR_PORT_INDEX:
1008                        local.portnum = *(uint32_t *)payload;
1009                        local.flags |= MLX5_NL_CMD_GET_PORT_INDEX;
1010                        break;
1011                case RDMA_NLDEV_ATTR_PORT_STATE:
1012                        local.state = *(uint8_t *)payload;
1013                        local.flags |= MLX5_NL_CMD_GET_PORT_STATE;
1014                        break;
1015                default:
1016                        break;
1017                }
1018                off += NLA_ALIGN(na->nla_len);
1019        }
1020        /*
1021         * It is possible to have multiple messages for all
1022         * Infiniband devices in the system with appropriate name.
1023         * So we should gather parameters locally and copy to
1024         * query context only in case of coinciding device name.
1025         */
1026        if (local.flags & MLX5_NL_CMD_GET_IB_NAME) {
1027                data->flags = local.flags;
1028                data->ibindex = local.ibindex;
1029                data->ifindex = local.ifindex;
1030                data->portnum = local.portnum;
1031                data->state = local.state;
1032        }
1033        return 0;
1034error:
1035        rte_errno = EINVAL;
1036        return -rte_errno;
1037}
1038
1039/**
1040 * Get port info of network interface associated with some IB device.
1041 *
1042 * This is the only somewhat safe method to avoid resorting to heuristics
1043 * when faced with port representors. Unfortunately it requires at least
1044 * Linux 4.17.
1045 *
1046 * @param nl
1047 *   Netlink socket of the RDMA kind (NETLINK_RDMA).
1048 * @param[in] pindex
1049 *   IB device port index, starting from 1
1050 * @param[out] data
1051 *   Pointer to port info.
1052 * @return
1053 *   0 on success, negative on error and rte_errno is set.
1054 */
1055static int
1056mlx5_nl_port_info(int nl, uint32_t pindex, struct mlx5_nl_port_info *data)
1057{
1058        union {
1059                struct nlmsghdr nh;
1060                uint8_t buf[NLMSG_HDRLEN +
1061                            NLA_HDRLEN + NLA_ALIGN(sizeof(data->ibindex)) +
1062                            NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))];
1063        } req = {
1064                .nh = {
1065                        .nlmsg_len = NLMSG_LENGTH(0),
1066                        .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1067                                                       RDMA_NLDEV_CMD_GET),
1068                        .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
1069                },
1070        };
1071        struct nlattr *na;
1072        uint32_t sn = MLX5_NL_SN_GENERATE;
1073        int ret;
1074
1075        ret = mlx5_nl_send(nl, &req.nh, sn);
1076        if (ret < 0)
1077                return ret;
1078        ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, data);
1079        if (ret < 0)
1080                return ret;
1081        if (!(data->flags & MLX5_NL_CMD_GET_IB_NAME) ||
1082            !(data->flags & MLX5_NL_CMD_GET_IB_INDEX))
1083                goto error;
1084        data->flags = 0;
1085        sn = MLX5_NL_SN_GENERATE;
1086        req.nh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1087                                             RDMA_NLDEV_CMD_PORT_GET);
1088        req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1089        req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.buf) - NLMSG_HDRLEN);
1090        na = (void *)((uintptr_t)req.buf + NLMSG_HDRLEN);
1091        na->nla_len = NLA_HDRLEN + sizeof(data->ibindex);
1092        na->nla_type = RDMA_NLDEV_ATTR_DEV_INDEX;
1093        memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
1094               &data->ibindex, sizeof(data->ibindex));
1095        na = (void *)((uintptr_t)na + NLA_ALIGN(na->nla_len));
1096        na->nla_len = NLA_HDRLEN + sizeof(pindex);
1097        na->nla_type = RDMA_NLDEV_ATTR_PORT_INDEX;
1098        memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
1099               &pindex, sizeof(pindex));
1100        ret = mlx5_nl_send(nl, &req.nh, sn);
1101        if (ret < 0)
1102                return ret;
1103        ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, data);
1104        if (ret < 0)
1105                return ret;
1106        if (!(data->flags & MLX5_NL_CMD_GET_IB_NAME) ||
1107            !(data->flags & MLX5_NL_CMD_GET_IB_INDEX) ||
1108            !(data->flags & MLX5_NL_CMD_GET_NET_INDEX) ||
1109            !data->ifindex)
1110                goto error;
1111        return 1;
1112error:
1113        rte_errno = ENODEV;
1114        return -rte_errno;
1115}
1116
1117/**
1118 * Get index of network interface associated with some IB device.
1119 *
1120 * This is the only somewhat safe method to avoid resorting to heuristics
1121 * when faced with port representors. Unfortunately it requires at least
1122 * Linux 4.17.
1123 *
1124 * @param nl
1125 *   Netlink socket of the RDMA kind (NETLINK_RDMA).
1126 * @param[in] name
1127 *   IB device name.
1128 * @param[in] pindex
1129 *   IB device port index, starting from 1
1130 * @return
1131 *   A valid (nonzero) interface index on success, 0 otherwise and rte_errno
1132 *   is set.
1133 */
1134unsigned int
1135mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
1136{
1137        struct mlx5_nl_port_info data = {
1138                        .ifindex = 0,
1139                        .name = name,
1140        };
1141
1142        if (mlx5_nl_port_info(nl, pindex, &data) < 0)
1143                return 0;
1144        return data.ifindex;
1145}
1146
1147/**
1148 * Get IB device port state.
1149 *
1150 * This is the only somewhat safe method to get info for port number >= 255.
1151 * Unfortunately it requires at least Linux 4.17.
1152 *
1153 * @param nl
1154 *   Netlink socket of the RDMA kind (NETLINK_RDMA).
1155 * @param[in] name
1156 *   IB device name.
1157 * @param[in] pindex
1158 *   IB device port index, starting from 1
1159 * @return
1160 *   Port state (ibv_port_state) on success, negative on error
1161 *   and rte_errno is set.
1162 */
1163int
1164mlx5_nl_port_state(int nl, const char *name, uint32_t pindex)
1165{
1166        struct mlx5_nl_port_info data = {
1167                        .state = 0,
1168                        .name = name,
1169        };
1170
1171        if (mlx5_nl_port_info(nl, pindex, &data) < 0)
1172                return -rte_errno;
1173        if ((data.flags & MLX5_NL_CMD_GET_PORT_STATE) == 0) {
1174                rte_errno = ENOTSUP;
1175                return -rte_errno;
1176        }
1177        return (int)data.state;
1178}
1179
1180/**
1181 * Get the number of physical ports of given IB device.
1182 *
1183 * @param nl
1184 *   Netlink socket of the RDMA kind (NETLINK_RDMA).
1185 * @param[in] name
1186 *   IB device name.
1187 *
1188 * @return
1189 *   A valid (nonzero) number of ports on success, 0 otherwise
1190 *   and rte_errno is set.
1191 */
1192unsigned int
1193mlx5_nl_portnum(int nl, const char *name)
1194{
1195        struct mlx5_nl_port_info data = {
1196                .flags = 0,
1197                .name = name,
1198                .ifindex = 0,
1199                .portnum = 0,
1200        };
1201        struct nlmsghdr req = {
1202                .nlmsg_len = NLMSG_LENGTH(0),
1203                .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1204                                               RDMA_NLDEV_CMD_GET),
1205                .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
1206        };
1207        uint32_t sn = MLX5_NL_SN_GENERATE;
1208        int ret;
1209
1210        ret = mlx5_nl_send(nl, &req, sn);
1211        if (ret < 0)
1212                return 0;
1213        ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
1214        if (ret < 0)
1215                return 0;
1216        if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
1217            !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
1218            !(data.flags & MLX5_NL_CMD_GET_PORT_INDEX)) {
1219                rte_errno = ENODEV;
1220                return 0;
1221        }
1222        if (!data.portnum)
1223                rte_errno = EINVAL;
1224        return data.portnum;
1225}
1226
1227/**
1228 * Analyze gathered port parameters via Netlink to recognize master
1229 * and representor devices for E-Switch configuration.
1230 *
1231 * @param[in] num_vf_set
1232 *   flag of presence of number of VFs port attribute.
1233 * @param[inout] switch_info
1234 *   Port information, including port name as a number and port name
1235 *   type if recognized
1236 *
1237 * @return
1238 *   master and representor flags are set in switch_info according to
1239 *   recognized parameters (if any).
1240 */
1241static void
1242mlx5_nl_check_switch_info(bool num_vf_set,
1243                          struct mlx5_switch_info *switch_info)
1244{
1245        switch (switch_info->name_type) {
1246        case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
1247                /*
1248                 * Name is not recognized, assume the master,
1249                 * check the number of VFs key presence.
1250                 */
1251                switch_info->master = num_vf_set;
1252                break;
1253        case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
1254                /*
1255                 * Name is not set, this assumes the legacy naming
1256                 * schema for master, just check if there is a
1257                 * number of VFs key.
1258                 */
1259                switch_info->master = num_vf_set;
1260                break;
1261        case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
1262                /* New uplink naming schema recognized. */
1263                switch_info->master = 1;
1264                break;
1265        case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
1266                /* Legacy representors naming schema. */
1267                switch_info->representor = !num_vf_set;
1268                break;
1269        case MLX5_PHYS_PORT_NAME_TYPE_PFHPF:
1270                /* Fallthrough */
1271        case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
1272                /* Fallthrough */
1273        case MLX5_PHYS_PORT_NAME_TYPE_PFSF:
1274                /* New representors naming schema. */
1275                switch_info->representor = 1;
1276                break;
1277        }
1278}
1279
1280/**
1281 * Process switch information from Netlink message.
1282 *
1283 * @param nh
1284 *   Pointer to Netlink message header.
1285 * @param arg
1286 *   Opaque data pointer for this callback.
1287 *
1288 * @return
1289 *   0 on success, a negative errno value otherwise and rte_errno is set.
1290 */
1291static int
1292mlx5_nl_switch_info_cb(struct nlmsghdr *nh, void *arg)
1293{
1294        struct mlx5_switch_info info = {
1295                .master = 0,
1296                .representor = 0,
1297                .name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
1298                .port_name = 0,
1299                .switch_id = 0,
1300        };
1301        size_t off = NLMSG_LENGTH(sizeof(struct ifinfomsg));
1302        bool switch_id_set = false;
1303        bool num_vf_set = false;
1304        int len;
1305
1306        if (nh->nlmsg_type != RTM_NEWLINK)
1307                goto error;
1308        while (off < nh->nlmsg_len) {
1309                struct rtattr *ra = (void *)((uintptr_t)nh + off);
1310                void *payload = RTA_DATA(ra);
1311                unsigned int i;
1312
1313                if (ra->rta_len > nh->nlmsg_len - off)
1314                        goto error;
1315                switch (ra->rta_type) {
1316                case IFLA_NUM_VF:
1317                        num_vf_set = true;
1318                        break;
1319                case IFLA_PHYS_PORT_NAME:
1320                        len = RTA_PAYLOAD(ra);
1321                        /* Some kernels do not pad attributes with zero. */
1322                        if (len > 0 && len < MLX5_PHYS_PORT_NAME_MAX) {
1323                                char name[MLX5_PHYS_PORT_NAME_MAX];
1324
1325                                /*
1326                                 * We can't just patch the message with padding
1327                                 * zero - it might corrupt the following items
1328                                 * in the message, we have to copy the string
1329                                 * by attribute length and pad the copied one.
1330                                 */
1331                                memcpy(name, payload, len);
1332                                name[len] = 0;
1333                                mlx5_translate_port_name(name, &info);
1334                        } else {
1335                                info.name_type =
1336                                        MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN;
1337                        }
1338                        break;
1339                case IFLA_PHYS_SWITCH_ID:
1340                        info.switch_id = 0;
1341                        for (i = 0; i < RTA_PAYLOAD(ra); ++i) {
1342                                info.switch_id <<= 8;
1343                                info.switch_id |= ((uint8_t *)payload)[i];
1344                        }
1345                        switch_id_set = true;
1346                        break;
1347                }
1348                off += RTA_ALIGN(ra->rta_len);
1349        }
1350        if (switch_id_set) {
1351                /* We have some E-Switch configuration. */
1352                mlx5_nl_check_switch_info(num_vf_set, &info);
1353        }
1354        MLX5_ASSERT(!(info.master && info.representor));
1355        memcpy(arg, &info, sizeof(info));
1356        return 0;
1357error:
1358        rte_errno = EINVAL;
1359        return -rte_errno;
1360}
1361
1362/**
1363 * Get switch information associated with network interface.
1364 *
1365 * @param nl
1366 *   Netlink socket of the ROUTE kind (NETLINK_ROUTE).
1367 * @param ifindex
1368 *   Network interface index.
1369 * @param[out] info
1370 *   Switch information object, populated in case of success.
1371 *
1372 * @return
1373 *   0 on success, a negative errno value otherwise and rte_errno is set.
1374 */
1375int
1376mlx5_nl_switch_info(int nl, unsigned int ifindex,
1377                    struct mlx5_switch_info *info)
1378{
1379        struct {
1380                struct nlmsghdr nh;
1381                struct ifinfomsg info;
1382                struct rtattr rta;
1383                uint32_t extmask;
1384        } req = {
1385                .nh = {
1386                        .nlmsg_len = NLMSG_LENGTH
1387                                        (sizeof(req.info) +
1388                                         RTA_LENGTH(sizeof(uint32_t))),
1389                        .nlmsg_type = RTM_GETLINK,
1390                        .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
1391                },
1392                .info = {
1393                        .ifi_family = AF_UNSPEC,
1394                        .ifi_index = ifindex,
1395                },
1396                .rta = {
1397                        .rta_type = IFLA_EXT_MASK,
1398                        .rta_len = RTA_LENGTH(sizeof(int32_t)),
1399                },
1400                .extmask = RTE_LE32(1),
1401        };
1402        uint32_t sn = MLX5_NL_SN_GENERATE;
1403        int ret;
1404
1405        ret = mlx5_nl_send(nl, &req.nh, sn);
1406        if (ret >= 0)
1407                ret = mlx5_nl_recv(nl, sn, mlx5_nl_switch_info_cb, info);
1408        if (info->master && info->representor) {
1409                DRV_LOG(ERR, "ifindex %u device is recognized as master"
1410                             " and as representor", ifindex);
1411                rte_errno = ENODEV;
1412                ret = -rte_errno;
1413        }
1414        return ret;
1415}
1416
1417/*
1418 * Delete VLAN network device by ifindex.
1419 *
1420 * @param[in] tcf
1421 *   Context object initialized by mlx5_nl_vlan_vmwa_init().
1422 * @param[in] ifindex
1423 *   Interface index of network device to delete.
1424 */
1425void
1426mlx5_nl_vlan_vmwa_delete(struct mlx5_nl_vlan_vmwa_context *vmwa,
1427                      uint32_t ifindex)
1428{
1429        uint32_t sn = MLX5_NL_SN_GENERATE;
1430        int ret;
1431        struct {
1432                struct nlmsghdr nh;
1433                struct ifinfomsg info;
1434        } req = {
1435                .nh = {
1436                        .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
1437                        .nlmsg_type = RTM_DELLINK,
1438                        .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
1439                },
1440                .info = {
1441                        .ifi_family = AF_UNSPEC,
1442                        .ifi_index = ifindex,
1443                },
1444        };
1445
1446        if (ifindex) {
1447                ret = mlx5_nl_send(vmwa->nl_socket, &req.nh, sn);
1448                if (ret >= 0)
1449                        ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL);
1450                if (ret < 0)
1451                        DRV_LOG(WARNING, "netlink: error deleting VLAN WA"
1452                                " ifindex %u, %d", ifindex, ret);
1453        }
1454}
1455
/* Set of subroutines to build Netlink message. */

/**
 * Locate the aligned position right after the current message content.
 *
 * @param nlh
 *   Netlink message header; nlmsg_len is the length used so far.
 *
 * @return
 *   Address at offset NLMSG_ALIGN(nlmsg_len) from the header, typed as
 *   an attribute pointer.
 */
static struct nlattr *
nl_msg_tail(struct nlmsghdr *nlh)
{
	uint8_t *base = (uint8_t *)nlh;
	size_t used = NLMSG_ALIGN(nlh->nlmsg_len);

	return (struct nlattr *)(base + used);
}
1463
/**
 * Append one attribute at the tail of a Netlink message.
 *
 * No bound is checked here: the caller must provide a buffer large enough.
 * The message length grows by the aligned attribute length only, so
 * nlmsg_len is expected to be aligned already on entry.
 *
 * @param nlh
 *   Netlink message header of the message being built.
 * @param type
 *   Attribute type.
 * @param data
 *   Attribute payload, not read when @p alen is 0.
 * @param alen
 *   Payload length in bytes.
 */
static void
nl_attr_put(struct nlmsghdr *nlh, int type, const void *data, int alen)
{
	struct nlattr *attr = nl_msg_tail(nlh);
	uint8_t *value = (uint8_t *)attr + sizeof(struct nlattr);

	attr->nla_len = NLMSG_ALIGN(sizeof(struct nlattr)) + alen;
	attr->nla_type = type;
	nlh->nlmsg_len += NLMSG_ALIGN(attr->nla_len);
	if (alen != 0)
		memcpy(value, data, alen);
}
1476
/**
 * Open a nested attribute at the tail of a Netlink message.
 *
 * An empty attribute of the given type is appended; its length is meant
 * to be finalized later with nl_attr_nest_end().
 *
 * @param nlh
 *   Netlink message header of the message being built.
 * @param type
 *   Type of the nesting attribute.
 *
 * @return
 *   Pointer to the nesting attribute just appended.
 */
static struct nlattr *
nl_attr_nest_start(struct nlmsghdr *nlh, int type)
{
	/* Remember the position before the message grows. */
	struct nlattr *container = nl_msg_tail(nlh);

	nl_attr_put(nlh, type, NULL, 0);
	return container;
}
1485
/**
 * Close a nested attribute: its length becomes the distance from the
 * nesting attribute to the current message tail.
 *
 * @param nlh
 *   Netlink message header of the message being built.
 * @param nest
 *   Nesting attribute returned by nl_attr_nest_start().
 */
static void
nl_attr_nest_end(struct nlmsghdr *nlh, struct nlattr *nest)
{
	uint8_t *first = (uint8_t *)nest;
	uint8_t *last = (uint8_t *)nl_msg_tail(nlh);

	nest->nla_len = last - first;
}
1491
1492/*
1493 * Create network VLAN device with specified VLAN tag.
1494 *
1495 * @param[in] tcf
1496 *   Context object initialized by mlx5_nl_vlan_vmwa_init().
1497 * @param[in] ifindex
1498 *   Base network interface index.
1499 * @param[in] tag
1500 *   VLAN tag for VLAN network device to create.
1501 */
1502uint32_t
1503mlx5_nl_vlan_vmwa_create(struct mlx5_nl_vlan_vmwa_context *vmwa,
1504                         uint32_t ifindex, uint16_t tag)
1505{
1506        struct nlmsghdr *nlh;
1507        struct ifinfomsg *ifm;
1508        char name[sizeof(MLX5_VMWA_VLAN_DEVICE_PFX) + 32];
1509
1510        __rte_cache_aligned
1511        uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1512                    NLMSG_ALIGN(sizeof(struct ifinfomsg)) +
1513                    NLMSG_ALIGN(sizeof(struct nlattr)) * 8 +
1514                    NLMSG_ALIGN(sizeof(uint32_t)) +
1515                    NLMSG_ALIGN(sizeof(name)) +
1516                    NLMSG_ALIGN(sizeof("vlan")) +
1517                    NLMSG_ALIGN(sizeof(uint32_t)) +
1518                    NLMSG_ALIGN(sizeof(uint16_t)) + 16];
1519        struct nlattr *na_info;
1520        struct nlattr *na_vlan;
1521        uint32_t sn = MLX5_NL_SN_GENERATE;
1522        int ret;
1523
1524        memset(buf, 0, sizeof(buf));
1525        nlh = (struct nlmsghdr *)buf;
1526        nlh->nlmsg_len = sizeof(struct nlmsghdr);
1527        nlh->nlmsg_type = RTM_NEWLINK;
1528        nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
1529                           NLM_F_EXCL | NLM_F_ACK;
1530        ifm = (struct ifinfomsg *)nl_msg_tail(nlh);
1531        nlh->nlmsg_len += sizeof(struct ifinfomsg);
1532        ifm->ifi_family = AF_UNSPEC;
1533        ifm->ifi_type = 0;
1534        ifm->ifi_index = 0;
1535        ifm->ifi_flags = IFF_UP;
1536        ifm->ifi_change = 0xffffffff;
1537        nl_attr_put(nlh, IFLA_LINK, &ifindex, sizeof(ifindex));
1538        ret = snprintf(name, sizeof(name), "%s.%u.%u",
1539                       MLX5_VMWA_VLAN_DEVICE_PFX, ifindex, tag);
1540        nl_attr_put(nlh, IFLA_IFNAME, name, ret + 1);
1541        na_info = nl_attr_nest_start(nlh, IFLA_LINKINFO);
1542        nl_attr_put(nlh, IFLA_INFO_KIND, "vlan", sizeof("vlan"));
1543        na_vlan = nl_attr_nest_start(nlh, IFLA_INFO_DATA);
1544        nl_attr_put(nlh, IFLA_VLAN_ID, &tag, sizeof(tag));
1545        nl_attr_nest_end(nlh, na_vlan);
1546        nl_attr_nest_end(nlh, na_info);
1547        MLX5_ASSERT(sizeof(buf) >= nlh->nlmsg_len);
1548        ret = mlx5_nl_send(vmwa->nl_socket, nlh, sn);
1549        if (ret >= 0)
1550                ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL);
1551        if (ret < 0) {
1552                DRV_LOG(WARNING, "netlink: VLAN %s create failure (%d)", name,
1553                        ret);
1554        }
1555        /* Try to get ifindex of created or pre-existing device. */
1556        ret = if_nametoindex(name);
1557        if (!ret) {
1558                DRV_LOG(WARNING, "VLAN %s failed to get index (%d)", name,
1559                        errno);
1560                return 0;
1561        }
1562        return ret;
1563}
1564
1565/**
1566 * Parse Netlink message to retrieve the general family ID.
1567 *
1568 * @param nh
1569 *   Pointer to Netlink Message Header.
1570 * @param arg
1571 *   PMD data register with this callback.
1572 *
1573 * @return
1574 *   0 on success, a negative errno value otherwise and rte_errno is set.
1575 */
1576static int
1577mlx5_nl_family_id_cb(struct nlmsghdr *nh, void *arg)
1578{
1579
1580        struct nlattr *tail = RTE_PTR_ADD(nh, nh->nlmsg_len);
1581        struct nlattr *nla = RTE_PTR_ADD(nh, NLMSG_ALIGN(sizeof(*nh)) +
1582                                        NLMSG_ALIGN(sizeof(struct genlmsghdr)));
1583
1584        for (; nla->nla_len && nla < tail;
1585             nla = RTE_PTR_ADD(nla, NLMSG_ALIGN(nla->nla_len))) {
1586                if (nla->nla_type == CTRL_ATTR_FAMILY_ID) {
1587                        *(uint16_t *)arg = *(uint16_t *)(nla + 1);
1588                        return 0;
1589                }
1590        }
1591        return -EINVAL;
1592}
1593
1594#define MLX5_NL_MAX_ATTR_SIZE 100
1595/**
1596 * Get generic netlink family ID.
1597 *
1598 * @param[in] nlsk_fd
1599 *   Netlink socket file descriptor.
1600 * @param[in] name
1601 *   The family name.
1602 *
1603 * @return
1604 *   ID >= 0 on success and @p enable is updated, a negative errno value
1605 *   otherwise and rte_errno is set.
1606 */
1607static int
1608mlx5_nl_generic_family_id_get(int nlsk_fd, const char *name)
1609{
1610        struct nlmsghdr *nlh;
1611        struct genlmsghdr *genl;
1612        uint32_t sn = MLX5_NL_SN_GENERATE;
1613        int name_size = strlen(name) + 1;
1614        int ret;
1615        uint16_t id = -1;
1616        uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1617                    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1618                    NLMSG_ALIGN(sizeof(struct nlattr)) +
1619                    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE)];
1620
1621        memset(buf, 0, sizeof(buf));
1622        nlh = (struct nlmsghdr *)buf;
1623        nlh->nlmsg_len = sizeof(struct nlmsghdr);
1624        nlh->nlmsg_type = GENL_ID_CTRL;
1625        nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1626        genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1627        nlh->nlmsg_len += sizeof(struct genlmsghdr);
1628        genl->cmd = CTRL_CMD_GETFAMILY;
1629        genl->version = 1;
1630        nl_attr_put(nlh, CTRL_ATTR_FAMILY_NAME, name, name_size);
1631        ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1632        if (ret >= 0)
1633                ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_family_id_cb, &id);
1634        if (ret < 0) {
1635                DRV_LOG(DEBUG, "Failed to get Netlink %s family ID: %d.", name,
1636                        ret);
1637                return ret;
1638        }
1639        DRV_LOG(DEBUG, "Netlink \"%s\" family ID is %u.", name, id);
1640        return (int)id;
1641}
1642
1643/**
1644 * Get Devlink family ID.
1645 *
1646 * @param[in] nlsk_fd
1647 *   Netlink socket file descriptor.
1648 *
1649 * @return
1650 *   ID >= 0 on success and @p enable is updated, a negative errno value
1651 *   otherwise and rte_errno is set.
1652 */
1653
1654int
1655mlx5_nl_devlink_family_id_get(int nlsk_fd)
1656{
1657        return mlx5_nl_generic_family_id_get(nlsk_fd, DEVLINK_GENL_NAME);
1658}
1659
1660/**
1661 * Parse Netlink message to retrieve the ROCE enable status.
1662 *
1663 * @param nh
1664 *   Pointer to Netlink Message Header.
1665 * @param arg
1666 *   PMD data register with this callback.
1667 *
1668 * @return
1669 *   0 on success, a negative errno value otherwise and rte_errno is set.
1670 */
1671static int
1672mlx5_nl_roce_cb(struct nlmsghdr *nh, void *arg)
1673{
1674
1675        int ret = -EINVAL;
1676        int *enable = arg;
1677        struct nlattr *tail = RTE_PTR_ADD(nh, nh->nlmsg_len);
1678        struct nlattr *nla = RTE_PTR_ADD(nh, NLMSG_ALIGN(sizeof(*nh)) +
1679                                        NLMSG_ALIGN(sizeof(struct genlmsghdr)));
1680
1681        while (nla->nla_len && nla < tail) {
1682                switch (nla->nla_type) {
1683                /* Expected nested attributes case. */
1684                case DEVLINK_ATTR_PARAM:
1685                case DEVLINK_ATTR_PARAM_VALUES_LIST:
1686                case DEVLINK_ATTR_PARAM_VALUE:
1687                        ret = 0;
1688                        nla += 1;
1689                        break;
1690                case DEVLINK_ATTR_PARAM_VALUE_DATA:
1691                        *enable = 1;
1692                        return 0;
1693                default:
1694                        nla = RTE_PTR_ADD(nla, NLMSG_ALIGN(nla->nla_len));
1695                }
1696        }
1697        *enable = 0;
1698        return ret;
1699}
1700
1701/**
1702 * Get ROCE enable status through Netlink.
1703 *
1704 * @param[in] nlsk_fd
1705 *   Netlink socket file descriptor.
1706 * @param[in] family_id
1707 *   the Devlink family ID.
1708 * @param pci_addr
1709 *   The device PCI address.
1710 * @param[out] enable
1711 *   Where to store the enable status.
1712 *
1713 * @return
1714 *   0 on success and @p enable is updated, a negative errno value otherwise
1715 *   and rte_errno is set.
1716 */
1717int
1718mlx5_nl_enable_roce_get(int nlsk_fd, int family_id, const char *pci_addr,
1719                        int *enable)
1720{
1721        struct nlmsghdr *nlh;
1722        struct genlmsghdr *genl;
1723        uint32_t sn = MLX5_NL_SN_GENERATE;
1724        int ret;
1725        int cur_en = 0;
1726        uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1727                    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1728                    NLMSG_ALIGN(sizeof(struct nlattr)) * 4 +
1729                    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 4];
1730
1731        memset(buf, 0, sizeof(buf));
1732        nlh = (struct nlmsghdr *)buf;
1733        nlh->nlmsg_len = sizeof(struct nlmsghdr);
1734        nlh->nlmsg_type = family_id;
1735        nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1736        genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1737        nlh->nlmsg_len += sizeof(struct genlmsghdr);
1738        genl->cmd = DEVLINK_CMD_PARAM_GET;
1739        genl->version = DEVLINK_GENL_VERSION;
1740        nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1741        nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1742        nl_attr_put(nlh, DEVLINK_ATTR_PARAM_NAME, "enable_roce", 12);
1743        ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1744        if (ret >= 0)
1745                ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_roce_cb, &cur_en);
1746        if (ret < 0) {
1747                DRV_LOG(DEBUG, "Failed to get ROCE enable on device %s: %d.",
1748                        pci_addr, ret);
1749                return ret;
1750        }
1751        *enable = cur_en;
1752        DRV_LOG(DEBUG, "ROCE is %sabled for device \"%s\".",
1753                cur_en ? "en" : "dis", pci_addr);
1754        return ret;
1755}
1756
1757/**
1758 * Reload mlx5 device kernel driver through Netlink.
1759 *
1760 * @param[in] nlsk_fd
1761 *   Netlink socket file descriptor.
1762 * @param[in] family_id
1763 *   the Devlink family ID.
1764 * @param pci_addr
1765 *   The device PCI address.
1766 * @param[out] enable
1767 *   The enable status to set.
1768 *
1769 * @return
1770 *   0 on success, a negative errno value otherwise and rte_errno is set.
1771 */
1772static int
1773mlx5_nl_driver_reload(int nlsk_fd, int family_id, const char *pci_addr)
1774{
1775        struct nlmsghdr *nlh;
1776        struct genlmsghdr *genl;
1777        uint32_t sn = MLX5_NL_SN_GENERATE;
1778        int ret;
1779        uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1780                    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1781                    NLMSG_ALIGN(sizeof(struct nlattr)) * 2 +
1782                    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 2];
1783
1784        memset(buf, 0, sizeof(buf));
1785        nlh = (struct nlmsghdr *)buf;
1786        nlh->nlmsg_len = sizeof(struct nlmsghdr);
1787        nlh->nlmsg_type = family_id;
1788        nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1789        genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1790        nlh->nlmsg_len += sizeof(struct genlmsghdr);
1791        genl->cmd = DEVLINK_CMD_RELOAD;
1792        genl->version = DEVLINK_GENL_VERSION;
1793        nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1794        nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1795        ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1796        if (ret >= 0)
1797                ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
1798        if (ret < 0) {
1799                DRV_LOG(DEBUG, "Failed to reload %s device by Netlink - %d",
1800                        pci_addr, ret);
1801                return ret;
1802        }
1803        DRV_LOG(DEBUG, "Device \"%s\" was reloaded by Netlink successfully.",
1804                pci_addr);
1805        return 0;
1806}
1807
1808/**
1809 * Set ROCE enable status through Netlink.
1810 *
1811 * @param[in] nlsk_fd
1812 *   Netlink socket file descriptor.
1813 * @param[in] family_id
1814 *   the Devlink family ID.
1815 * @param pci_addr
1816 *   The device PCI address.
1817 * @param[out] enable
1818 *   The enable status to set.
1819 *
1820 * @return
1821 *   0 on success, a negative errno value otherwise and rte_errno is set.
1822 */
1823int
1824mlx5_nl_enable_roce_set(int nlsk_fd, int family_id, const char *pci_addr,
1825                        int enable)
1826{
1827        struct nlmsghdr *nlh;
1828        struct genlmsghdr *genl;
1829        uint32_t sn = MLX5_NL_SN_GENERATE;
1830        int ret;
1831        uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1832                    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1833                    NLMSG_ALIGN(sizeof(struct nlattr)) * 6 +
1834                    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 6];
1835        uint8_t cmode = DEVLINK_PARAM_CMODE_DRIVERINIT;
1836        uint8_t ptype = NLA_FLAG;
1837;
1838
1839        memset(buf, 0, sizeof(buf));
1840        nlh = (struct nlmsghdr *)buf;
1841        nlh->nlmsg_len = sizeof(struct nlmsghdr);
1842        nlh->nlmsg_type = family_id;
1843        nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1844        genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1845        nlh->nlmsg_len += sizeof(struct genlmsghdr);
1846        genl->cmd = DEVLINK_CMD_PARAM_SET;
1847        genl->version = DEVLINK_GENL_VERSION;
1848        nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1849        nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1850        nl_attr_put(nlh, DEVLINK_ATTR_PARAM_NAME, "enable_roce", 12);
1851        nl_attr_put(nlh, DEVLINK_ATTR_PARAM_VALUE_CMODE, &cmode, sizeof(cmode));
1852        nl_attr_put(nlh, DEVLINK_ATTR_PARAM_TYPE, &ptype, sizeof(ptype));
1853        if (enable)
1854                nl_attr_put(nlh, DEVLINK_ATTR_PARAM_VALUE_DATA, NULL, 0);
1855        ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1856        if (ret >= 0)
1857                ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
1858        if (ret < 0) {
1859                DRV_LOG(DEBUG, "Failed to %sable ROCE for device %s by Netlink:"
1860                        " %d.", enable ? "en" : "dis", pci_addr, ret);
1861                return ret;
1862        }
1863        DRV_LOG(DEBUG, "Device %s ROCE was %sabled by Netlink successfully.",
1864                pci_addr, enable ? "en" : "dis");
1865        /* Now, need to reload the driver. */
1866        return mlx5_nl_driver_reload(nlsk_fd, family_id, pci_addr);
1867}
1868
/**
 * Try to parse a Netlink message as a link status update.
 *
 * Only RTM_NEWLINK, RTM_DELLINK, RTM_GETLINK and RTM_SETLINK messages are
 * accepted. Their payload is read as a struct ifinfomsg, from which the
 * interface index is taken.
 *
 * @param hdr
 *  Netlink message header.
 * @param[out] ifindex
 *  Index of the updated interface. Left untouched on failure.
 *
 * @return
 *  0 on success, negative if the message is not one of the link message
 *  types above or is too short to carry a struct ifinfomsg.
 */
int
mlx5_nl_parse_link_status_update(struct nlmsghdr *hdr, uint32_t *ifindex)
{
        const struct ifinfomsg *info;

        switch (hdr->nlmsg_type) {
        case RTM_NEWLINK:
        case RTM_DELLINK:
        case RTM_GETLINK:
        case RTM_SETLINK:
                /* Never read the payload of a truncated message. */
                if (hdr->nlmsg_len < NLMSG_LENGTH(sizeof(*info)))
                        return -1;
                info = NLMSG_DATA(hdr);
                *ifindex = info->ifi_index;
                return 0;
        default:
                break;
        }
        return -1;
}
1896
1897/**
1898 * Read pending events from a Netlink socket.
1899 *
1900 * @param nlsk_fd
1901 *  Netlink socket.
1902 * @param cb
1903 *  Callback invoked for each of the events.
1904 * @param cb_arg
1905 *  User data for the callback.
1906 *
1907 * @return
1908 *  0 on success, including the case when there are no events.
1909 *  Negative on failure and rte_errno is set.
1910 */
1911int
1912mlx5_nl_read_events(int nlsk_fd, mlx5_nl_event_cb *cb, void *cb_arg)
1913{
1914        char buf[8192];
1915        struct sockaddr_nl addr;
1916        struct iovec iov = {
1917                .iov_base = buf,
1918                .iov_len = sizeof(buf),
1919        };
1920        struct msghdr msg = {
1921                .msg_name = &addr,
1922                .msg_namelen = sizeof(addr),
1923                .msg_iov = &iov,
1924                .msg_iovlen = 1,
1925        };
1926        struct nlmsghdr *hdr;
1927        ssize_t size;
1928
1929        while (1) {
1930                size = recvmsg(nlsk_fd, &msg, MSG_DONTWAIT);
1931                if (size < 0) {
1932                        if (errno == EAGAIN)
1933                                return 0;
1934                        if (errno == EINTR)
1935                                continue;
1936                        DRV_LOG(DEBUG, "Failed to receive netlink message: %s",
1937                                strerror(errno));
1938                        rte_errno = errno;
1939                        return -rte_errno;
1940                }
1941                hdr = (struct nlmsghdr *)buf;
1942                while (size >= (ssize_t)sizeof(*hdr)) {
1943                        ssize_t msg_len = hdr->nlmsg_len;
1944                        ssize_t data_len = msg_len - sizeof(*hdr);
1945                        ssize_t aligned_len;
1946
1947                        if (data_len < 0) {
1948                                DRV_LOG(DEBUG, "Netlink message too short");
1949                                rte_errno = EINVAL;
1950                                return -rte_errno;
1951                        }
1952                        aligned_len = NLMSG_ALIGN(msg_len);
1953                        if (aligned_len > size) {
1954                                DRV_LOG(DEBUG, "Netlink message too long");
1955                                rte_errno = EINVAL;
1956                                return -rte_errno;
1957                        }
1958                        cb(hdr, cb_arg);
1959                        hdr = RTE_PTR_ADD(hdr, aligned_len);
1960                        size -= aligned_len;
1961                }
1962        }
1963        return 0;
1964}
1965