dpdk/drivers/net/mlx5/linux/mlx5_ethdev_os.c
<<
>>
Prefs
   1/* SPDX-License-Identifier: BSD-3-Clause
   2 * Copyright 2015 6WIND S.A.
   3 * Copyright 2015 Mellanox Technologies, Ltd
   4 */
   5
   6#include <stddef.h>
   7#include <inttypes.h>
   8#include <unistd.h>
   9#include <stdbool.h>
  10#include <stdint.h>
  11#include <stdio.h>
  12#include <string.h>
  13#include <stdlib.h>
  14#include <errno.h>
  15#include <dirent.h>
  16#include <net/if.h>
  17#include <sys/ioctl.h>
  18#include <sys/socket.h>
  19#include <netinet/in.h>
  20#include <linux/ethtool.h>
  21#include <linux/sockios.h>
  22#include <fcntl.h>
  23#include <stdalign.h>
  24#include <sys/un.h>
  25#include <time.h>
  26
  27#include <ethdev_driver.h>
  28#include <rte_bus_pci.h>
  29#include <rte_mbuf.h>
  30#include <rte_common.h>
  31#include <rte_interrupts.h>
  32#include <rte_malloc.h>
  33#include <rte_string_fns.h>
  34#include <rte_rwlock.h>
  35#include <rte_cycles.h>
  36
  37#include <mlx5_glue.h>
  38#include <mlx5_devx_cmds.h>
  39#include <mlx5_common.h>
  40#include <mlx5_malloc.h>
  41
  42#include "mlx5.h"
  43#include "mlx5_rxtx.h"
  44#include "mlx5_utils.h"
  45
  46/* Supported speed values found in /usr/include/linux/ethtool.h */
  47#ifndef HAVE_SUPPORTED_40000baseKR4_Full
  48#define SUPPORTED_40000baseKR4_Full (1 << 23)
  49#endif
  50#ifndef HAVE_SUPPORTED_40000baseCR4_Full
  51#define SUPPORTED_40000baseCR4_Full (1 << 24)
  52#endif
  53#ifndef HAVE_SUPPORTED_40000baseSR4_Full
  54#define SUPPORTED_40000baseSR4_Full (1 << 25)
  55#endif
  56#ifndef HAVE_SUPPORTED_40000baseLR4_Full
  57#define SUPPORTED_40000baseLR4_Full (1 << 26)
  58#endif
  59#ifndef HAVE_SUPPORTED_56000baseKR4_Full
  60#define SUPPORTED_56000baseKR4_Full (1 << 27)
  61#endif
  62#ifndef HAVE_SUPPORTED_56000baseCR4_Full
  63#define SUPPORTED_56000baseCR4_Full (1 << 28)
  64#endif
  65#ifndef HAVE_SUPPORTED_56000baseSR4_Full
  66#define SUPPORTED_56000baseSR4_Full (1 << 29)
  67#endif
  68#ifndef HAVE_SUPPORTED_56000baseLR4_Full
  69#define SUPPORTED_56000baseLR4_Full (1 << 30)
  70#endif
  71
  72/* Add defines in case the running kernel is not the same as user headers. */
  73#ifndef ETHTOOL_GLINKSETTINGS
  74struct ethtool_link_settings {
  75        uint32_t cmd;
  76        uint32_t speed;
  77        uint8_t duplex;
  78        uint8_t port;
  79        uint8_t phy_address;
  80        uint8_t autoneg;
  81        uint8_t mdio_support;
  82        uint8_t eth_to_mdix;
  83        uint8_t eth_tp_mdix_ctrl;
  84        int8_t link_mode_masks_nwords;
  85        uint32_t reserved[8];
  86        uint32_t link_mode_masks[];
  87};
  88
  89/* The kernel values can be found in /include/uapi/linux/ethtool.h */
  90#define ETHTOOL_GLINKSETTINGS 0x0000004c
  91#define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5
  92#define ETHTOOL_LINK_MODE_Autoneg_BIT 6
  93#define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17
  94#define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18
  95#define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19
  96#define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20
  97#define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21
  98#define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22
  99#define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23
 100#define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24
 101#define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25
 102#define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26
 103#define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27
 104#define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28
 105#define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29
 106#define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30
 107#endif
 108#ifndef HAVE_ETHTOOL_LINK_MODE_25G
 109#define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31
 110#define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32
 111#define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33
 112#endif
 113#ifndef HAVE_ETHTOOL_LINK_MODE_50G
 114#define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34
 115#define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35
 116#endif
 117#ifndef HAVE_ETHTOOL_LINK_MODE_100G
 118#define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36
 119#define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37
 120#define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38
 121#define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39
 122#endif
 123#ifndef HAVE_ETHTOOL_LINK_MODE_200G
 124#define ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT 62
 125#define ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT 63
 126#define ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT 0 /* 64 - 64 */
 127#define ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT 1 /* 65 - 64 */
 128#define ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT 2 /* 66 - 64 */
 129#endif
 130
 131/* Get interface index from SubFunction device name. */
 132int
 133mlx5_auxiliary_get_ifindex(const char *sf_name)
 134{
 135        char if_name[IF_NAMESIZE] = { 0 };
 136
 137        if (mlx5_auxiliary_get_child_name(sf_name, "/net",
 138                                          if_name, sizeof(if_name)) != 0)
 139                return -rte_errno;
 140        return if_nametoindex(if_name);
 141}
 142
 143/**
 144 * Get interface name from private structure.
 145 *
 146 * This is a port representor-aware version of mlx5_get_ifname_sysfs().
 147 *
 148 * @param[in] dev
 149 *   Pointer to Ethernet device.
 150 * @param[out] ifname
 151 *   Interface name output buffer.
 152 *
 153 * @return
 154 *   0 on success, a negative errno value otherwise and rte_errno is set.
 155 */
 156int
 157mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[MLX5_NAMESIZE])
 158{
 159        struct mlx5_priv *priv = dev->data->dev_private;
 160        unsigned int ifindex;
 161
 162        MLX5_ASSERT(priv);
 163        MLX5_ASSERT(priv->sh);
 164        if (priv->master && priv->sh->bond.ifindex > 0) {
 165                memcpy(ifname, priv->sh->bond.ifname, MLX5_NAMESIZE);
 166                return 0;
 167        }
 168        ifindex = mlx5_ifindex(dev);
 169        if (!ifindex) {
 170                if (!priv->representor)
 171                        return mlx5_get_ifname_sysfs(priv->sh->ibdev_path,
 172                                                     *ifname);
 173                rte_errno = ENXIO;
 174                return -rte_errno;
 175        }
 176        if (if_indextoname(ifindex, &(*ifname)[0]))
 177                return 0;
 178        rte_errno = errno;
 179        return -rte_errno;
 180}
 181
 182/**
 183 * Perform ifreq ioctl() on associated netdev ifname.
 184 *
 185 * @param[in] ifname
 186 *   Pointer to netdev name.
 187 * @param req
 188 *   Request number to pass to ioctl().
 189 * @param[out] ifr
 190 *   Interface request structure output buffer.
 191 *
 192 * @return
 193 *   0 on success, a negative errno value otherwise and rte_errno is set.
 194 */
 195static int
 196mlx5_ifreq_by_ifname(const char *ifname, int req, struct ifreq *ifr)
 197{
 198        int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
 199        int ret = 0;
 200
 201        if (sock == -1) {
 202                rte_errno = errno;
 203                return -rte_errno;
 204        }
 205        rte_strscpy(ifr->ifr_name, ifname, sizeof(ifr->ifr_name));
 206        ret = ioctl(sock, req, ifr);
 207        if (ret == -1) {
 208                rte_errno = errno;
 209                goto error;
 210        }
 211        close(sock);
 212        return 0;
 213error:
 214        close(sock);
 215        return -rte_errno;
 216}
 217
 218/**
 219 * Perform ifreq ioctl() on associated Ethernet device.
 220 *
 221 * @param[in] dev
 222 *   Pointer to Ethernet device.
 223 * @param req
 224 *   Request number to pass to ioctl().
 225 * @param[out] ifr
 226 *   Interface request structure output buffer.
 227 *
 228 * @return
 229 *   0 on success, a negative errno value otherwise and rte_errno is set.
 230 */
 231static int
 232mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
 233{
 234        char ifname[sizeof(ifr->ifr_name)];
 235        int ret;
 236
 237        ret = mlx5_get_ifname(dev, &ifname);
 238        if (ret)
 239                return -rte_errno;
 240        return mlx5_ifreq_by_ifname(ifname, req, ifr);
 241}
 242
 243/**
 244 * Get device MTU.
 245 *
 246 * @param dev
 247 *   Pointer to Ethernet device.
 248 * @param[out] mtu
 249 *   MTU value output buffer.
 250 *
 251 * @return
 252 *   0 on success, a negative errno value otherwise and rte_errno is set.
 253 */
 254int
 255mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)
 256{
 257        struct ifreq request;
 258        int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
 259
 260        if (ret)
 261                return ret;
 262        *mtu = request.ifr_mtu;
 263        return 0;
 264}
 265
 266/**
 267 * Set device MTU.
 268 *
 269 * @param dev
 270 *   Pointer to Ethernet device.
 271 * @param mtu
 272 *   MTU value to set.
 273 *
 274 * @return
 275 *   0 on success, a negative errno value otherwise and rte_errno is set.
 276 */
 277int
 278mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
 279{
 280        struct ifreq request = { .ifr_mtu = mtu, };
 281
 282        return mlx5_ifreq(dev, SIOCSIFMTU, &request);
 283}
 284
 285/**
 286 * Set device flags.
 287 *
 288 * @param dev
 289 *   Pointer to Ethernet device.
 290 * @param keep
 291 *   Bitmask for flags that must remain untouched.
 292 * @param flags
 293 *   Bitmask for flags to modify.
 294 *
 295 * @return
 296 *   0 on success, a negative errno value otherwise and rte_errno is set.
 297 */
 298static int
 299mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)
 300{
 301        struct ifreq request;
 302        int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
 303
 304        if (ret)
 305                return ret;
 306        request.ifr_flags &= keep;
 307        request.ifr_flags |= flags & ~keep;
 308        return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
 309}
 310
 311/**
 312 * Get device current raw clock counter
 313 *
 314 * @param dev
 315 *   Pointer to Ethernet device structure.
 316 * @param[out] time
 317 *   Current raw clock counter of the device.
 318 *
 319 * @return
 320 *   0 if the clock has correctly been read
 321 *   The value of errno in case of error
 322 */
 323int
 324mlx5_read_clock(struct rte_eth_dev *dev, uint64_t *clock)
 325{
 326        struct mlx5_priv *priv = dev->data->dev_private;
 327        struct ibv_context *ctx = priv->sh->ctx;
 328        struct ibv_values_ex values;
 329        int err = 0;
 330
 331        values.comp_mask = IBV_VALUES_MASK_RAW_CLOCK;
 332        err = mlx5_glue->query_rt_values_ex(ctx, &values);
 333        if (err != 0) {
 334                DRV_LOG(WARNING, "Could not query the clock !");
 335                return err;
 336        }
 337        *clock = values.raw_clock.tv_nsec;
 338        return 0;
 339}
 340
 341/**
 342 * Retrieve the master device for representor in the same switch domain.
 343 *
 344 * @param dev
 345 *   Pointer to representor Ethernet device structure.
 346 *
 347 * @return
 348 *   Master device structure  on success, NULL otherwise.
 349 */
 350static struct rte_eth_dev *
 351mlx5_find_master_dev(struct rte_eth_dev *dev)
 352{
 353        struct mlx5_priv *priv;
 354        uint16_t port_id;
 355        uint16_t domain_id;
 356
 357        priv = dev->data->dev_private;
 358        domain_id = priv->domain_id;
 359        MLX5_ASSERT(priv->representor);
 360        MLX5_ETH_FOREACH_DEV(port_id, dev->device) {
 361                struct mlx5_priv *opriv =
 362                        rte_eth_devices[port_id].data->dev_private;
 363                if (opriv &&
 364                    opriv->master &&
 365                    opriv->domain_id == domain_id &&
 366                    opriv->sh == priv->sh)
 367                        return &rte_eth_devices[port_id];
 368        }
 369        return NULL;
 370}
 371
 372/**
 373 * DPDK callback to retrieve physical link information.
 374 *
 375 * @param dev
 376 *   Pointer to Ethernet device structure.
 377 * @param[out] link
 378 *   Storage for current link status.
 379 *
 380 * @return
 381 *   0 on success, a negative errno value otherwise and rte_errno is set.
 382 */
 383static int
 384mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
 385                               struct rte_eth_link *link)
 386{
 387        struct mlx5_priv *priv = dev->data->dev_private;
 388        struct ethtool_cmd edata = {
 389                .cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */
 390        };
 391        struct ifreq ifr;
 392        struct rte_eth_link dev_link;
 393        int link_speed = 0;
 394        int ret;
 395
 396        ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
 397        if (ret) {
 398                DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
 399                        dev->data->port_id, strerror(rte_errno));
 400                return ret;
 401        }
 402        dev_link = (struct rte_eth_link) {
 403                .link_status = ((ifr.ifr_flags & IFF_UP) &&
 404                                (ifr.ifr_flags & IFF_RUNNING)),
 405        };
 406        ifr = (struct ifreq) {
 407                .ifr_data = (void *)&edata,
 408        };
 409        ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
 410        if (ret) {
 411                if (ret == -ENOTSUP && priv->representor) {
 412                        struct rte_eth_dev *master;
 413
 414                        /*
 415                         * For representors we can try to inherit link
 416                         * settings from the master device. Actually
 417                         * link settings do not make a lot of sense
 418                         * for representors due to missing physical
 419                         * link. The old kernel drivers supported
 420                         * emulated settings query for representors,
 421                         * the new ones do not, so we have to add
 422                         * this code for compatibility issues.
 423                         */
 424                        master = mlx5_find_master_dev(dev);
 425                        if (master) {
 426                                ifr = (struct ifreq) {
 427                                        .ifr_data = (void *)&edata,
 428                                };
 429                                ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr);
 430                        }
 431                }
 432                if (ret) {
 433                        DRV_LOG(WARNING,
 434                                "port %u ioctl(SIOCETHTOOL,"
 435                                " ETHTOOL_GSET) failed: %s",
 436                                dev->data->port_id, strerror(rte_errno));
 437                        return ret;
 438                }
 439        }
 440        link_speed = ethtool_cmd_speed(&edata);
 441        if (link_speed == -1)
 442                dev_link.link_speed = ETH_SPEED_NUM_UNKNOWN;
 443        else
 444                dev_link.link_speed = link_speed;
 445        priv->link_speed_capa = 0;
 446        if (edata.supported & (SUPPORTED_1000baseT_Full |
 447                               SUPPORTED_1000baseKX_Full))
 448                priv->link_speed_capa |= ETH_LINK_SPEED_1G;
 449        if (edata.supported & SUPPORTED_10000baseKR_Full)
 450                priv->link_speed_capa |= ETH_LINK_SPEED_10G;
 451        if (edata.supported & (SUPPORTED_40000baseKR4_Full |
 452                               SUPPORTED_40000baseCR4_Full |
 453                               SUPPORTED_40000baseSR4_Full |
 454                               SUPPORTED_40000baseLR4_Full))
 455                priv->link_speed_capa |= ETH_LINK_SPEED_40G;
 456        dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
 457                                ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
 458        dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
 459                        ETH_LINK_SPEED_FIXED);
 460        *link = dev_link;
 461        return 0;
 462}
 463
 464/**
 465 * Retrieve physical link information (unlocked version using new ioctl).
 466 *
 467 * @param dev
 468 *   Pointer to Ethernet device structure.
 469 * @param[out] link
 470 *   Storage for current link status.
 471 *
 472 * @return
 473 *   0 on success, a negative errno value otherwise and rte_errno is set.
 474 */
 475static int
 476mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
 477                             struct rte_eth_link *link)
 478
 479{
 480        struct mlx5_priv *priv = dev->data->dev_private;
 481        struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS };
 482        struct ifreq ifr;
 483        struct rte_eth_link dev_link;
 484        struct rte_eth_dev *master = NULL;
 485        uint64_t sc;
 486        int ret;
 487
 488        ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
 489        if (ret) {
 490                DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
 491                        dev->data->port_id, strerror(rte_errno));
 492                return ret;
 493        }
 494        dev_link = (struct rte_eth_link) {
 495                .link_status = ((ifr.ifr_flags & IFF_UP) &&
 496                                (ifr.ifr_flags & IFF_RUNNING)),
 497        };
 498        ifr = (struct ifreq) {
 499                .ifr_data = (void *)&gcmd,
 500        };
 501        ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
 502        if (ret) {
 503                if (ret == -ENOTSUP && priv->representor) {
 504                        /*
 505                         * For representors we can try to inherit link
 506                         * settings from the master device. Actually
 507                         * link settings do not make a lot of sense
 508                         * for representors due to missing physical
 509                         * link. The old kernel drivers supported
 510                         * emulated settings query for representors,
 511                         * the new ones do not, so we have to add
 512                         * this code for compatibility issues.
 513                         */
 514                        master = mlx5_find_master_dev(dev);
 515                        if (master) {
 516                                ifr = (struct ifreq) {
 517                                        .ifr_data = (void *)&gcmd,
 518                                };
 519                                ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr);
 520                        }
 521                }
 522                if (ret) {
 523                        DRV_LOG(DEBUG,
 524                                "port %u ioctl(SIOCETHTOOL,"
 525                                " ETHTOOL_GLINKSETTINGS) failed: %s",
 526                                dev->data->port_id, strerror(rte_errno));
 527                        return ret;
 528                }
 529        }
 530        gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords;
 531
 532        alignas(struct ethtool_link_settings)
 533        uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) +
 534                     sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3];
 535        struct ethtool_link_settings *ecmd = (void *)data;
 536
 537        *ecmd = gcmd;
 538        ifr.ifr_data = (void *)ecmd;
 539        ret = mlx5_ifreq(master ? master : dev, SIOCETHTOOL, &ifr);
 540        if (ret) {
 541                DRV_LOG(DEBUG,
 542                        "port %u ioctl(SIOCETHTOOL,"
 543                        "ETHTOOL_GLINKSETTINGS) failed: %s",
 544                        dev->data->port_id, strerror(rte_errno));
 545                return ret;
 546        }
 547        dev_link.link_speed = (ecmd->speed == UINT32_MAX) ?
 548                                ETH_SPEED_NUM_UNKNOWN : ecmd->speed;
 549        sc = ecmd->link_mode_masks[0] |
 550                ((uint64_t)ecmd->link_mode_masks[1] << 32);
 551        priv->link_speed_capa = 0;
 552        if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) |
 553                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT)))
 554                priv->link_speed_capa |= ETH_LINK_SPEED_1G;
 555        if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) |
 556                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) |
 557                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT)))
 558                priv->link_speed_capa |= ETH_LINK_SPEED_10G;
 559        if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) |
 560                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT)))
 561                priv->link_speed_capa |= ETH_LINK_SPEED_20G;
 562        if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) |
 563                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) |
 564                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) |
 565                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT)))
 566                priv->link_speed_capa |= ETH_LINK_SPEED_40G;
 567        if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) |
 568                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) |
 569                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) |
 570                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT)))
 571                priv->link_speed_capa |= ETH_LINK_SPEED_56G;
 572        if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) |
 573                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) |
 574                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT)))
 575                priv->link_speed_capa |= ETH_LINK_SPEED_25G;
 576        if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) |
 577                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT)))
 578                priv->link_speed_capa |= ETH_LINK_SPEED_50G;
 579        if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) |
 580                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) |
 581                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) |
 582                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT)))
 583                priv->link_speed_capa |= ETH_LINK_SPEED_100G;
 584        if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT) |
 585                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT)))
 586                priv->link_speed_capa |= ETH_LINK_SPEED_200G;
 587
 588        sc = ecmd->link_mode_masks[2] |
 589                ((uint64_t)ecmd->link_mode_masks[3] << 32);
 590        if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT) |
 591                  MLX5_BITSHIFT
 592                       (ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT) |
 593                  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT)))
 594                priv->link_speed_capa |= ETH_LINK_SPEED_200G;
 595        dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ?
 596                                ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
 597        dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
 598                                  ETH_LINK_SPEED_FIXED);
 599        *link = dev_link;
 600        return 0;
 601}
 602
 603/**
 604 * DPDK callback to retrieve physical link information.
 605 *
 606 * @param dev
 607 *   Pointer to Ethernet device structure.
 608 * @param wait_to_complete
 609 *   Wait for request completion.
 610 *
 611 * @return
 612 *   0 if link status was not updated, positive if it was, a negative errno
 613 *   value otherwise and rte_errno is set.
 614 */
 615int
 616mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete)
 617{
 618        int ret;
 619        struct rte_eth_link dev_link;
 620        time_t start_time = time(NULL);
 621        int retry = MLX5_GET_LINK_STATUS_RETRY_COUNT;
 622
 623        do {
 624                ret = mlx5_link_update_unlocked_gs(dev, &dev_link);
 625                if (ret == -ENOTSUP)
 626                        ret = mlx5_link_update_unlocked_gset(dev, &dev_link);
 627                if (ret == 0)
 628                        break;
 629                /* Handle wait to complete situation. */
 630                if ((wait_to_complete || retry) && ret == -EAGAIN) {
 631                        if (abs((int)difftime(time(NULL), start_time)) <
 632                            MLX5_LINK_STATUS_TIMEOUT) {
 633                                usleep(0);
 634                                continue;
 635                        } else {
 636                                rte_errno = EBUSY;
 637                                return -rte_errno;
 638                        }
 639                } else if (ret < 0) {
 640                        return ret;
 641                }
 642        } while (wait_to_complete || retry-- > 0);
 643        ret = !!memcmp(&dev->data->dev_link, &dev_link,
 644                       sizeof(struct rte_eth_link));
 645        dev->data->dev_link = dev_link;
 646        return ret;
 647}
 648
 649/**
 650 * DPDK callback to get flow control status.
 651 *
 652 * @param dev
 653 *   Pointer to Ethernet device structure.
 654 * @param[out] fc_conf
 655 *   Flow control output buffer.
 656 *
 657 * @return
 658 *   0 on success, a negative errno value otherwise and rte_errno is set.
 659 */
 660int
 661mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
 662{
 663        struct ifreq ifr;
 664        struct ethtool_pauseparam ethpause = {
 665                .cmd = ETHTOOL_GPAUSEPARAM
 666        };
 667        int ret;
 668
 669        ifr.ifr_data = (void *)&ethpause;
 670        ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
 671        if (ret) {
 672                DRV_LOG(WARNING,
 673                        "port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:"
 674                        " %s",
 675                        dev->data->port_id, strerror(rte_errno));
 676                return ret;
 677        }
 678        fc_conf->autoneg = ethpause.autoneg;
 679        if (ethpause.rx_pause && ethpause.tx_pause)
 680                fc_conf->mode = RTE_FC_FULL;
 681        else if (ethpause.rx_pause)
 682                fc_conf->mode = RTE_FC_RX_PAUSE;
 683        else if (ethpause.tx_pause)
 684                fc_conf->mode = RTE_FC_TX_PAUSE;
 685        else
 686                fc_conf->mode = RTE_FC_NONE;
 687        return 0;
 688}
 689
 690/**
 691 * DPDK callback to modify flow control parameters.
 692 *
 693 * @param dev
 694 *   Pointer to Ethernet device structure.
 695 * @param[in] fc_conf
 696 *   Flow control parameters.
 697 *
 698 * @return
 699 *   0 on success, a negative errno value otherwise and rte_errno is set.
 700 */
 701int
 702mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
 703{
 704        struct ifreq ifr;
 705        struct ethtool_pauseparam ethpause = {
 706                .cmd = ETHTOOL_SPAUSEPARAM
 707        };
 708        int ret;
 709
 710        ifr.ifr_data = (void *)&ethpause;
 711        ethpause.autoneg = fc_conf->autoneg;
 712        if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
 713            (fc_conf->mode & RTE_FC_RX_PAUSE))
 714                ethpause.rx_pause = 1;
 715        else
 716                ethpause.rx_pause = 0;
 717
 718        if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
 719            (fc_conf->mode & RTE_FC_TX_PAUSE))
 720                ethpause.tx_pause = 1;
 721        else
 722                ethpause.tx_pause = 0;
 723        ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
 724        if (ret) {
 725                DRV_LOG(WARNING,
 726                        "port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
 727                        " failed: %s",
 728                        dev->data->port_id, strerror(rte_errno));
 729                return ret;
 730        }
 731        return 0;
 732}
 733
 734/**
 735 * Handle asynchronous removal event for entire multiport device.
 736 *
 737 * @param sh
 738 *   Infiniband device shared context.
 739 */
 740static void
 741mlx5_dev_interrupt_device_fatal(struct mlx5_dev_ctx_shared *sh)
 742{
 743        uint32_t i;
 744
 745        for (i = 0; i < sh->max_port; ++i) {
 746                struct rte_eth_dev *dev;
 747
 748                if (sh->port[i].ih_port_id >= RTE_MAX_ETHPORTS) {
 749                        /*
 750                         * Or not existing port either no
 751                         * handler installed for this port.
 752                         */
 753                        continue;
 754                }
 755                dev = &rte_eth_devices[sh->port[i].ih_port_id];
 756                MLX5_ASSERT(dev);
 757                if (dev->data->dev_conf.intr_conf.rmv)
 758                        rte_eth_dev_callback_process
 759                                (dev, RTE_ETH_EVENT_INTR_RMV, NULL);
 760        }
 761}
 762
 763/**
 764 * Handle shared asynchronous events the NIC (removal event
 765 * and link status change). Supports multiport IB device.
 766 *
 767 * @param cb_arg
 768 *   Callback argument.
 769 */
 770void
 771mlx5_dev_interrupt_handler(void *cb_arg)
 772{
 773        struct mlx5_dev_ctx_shared *sh = cb_arg;
 774        struct ibv_async_event event;
 775
 776        /* Read all message from the IB device and acknowledge them. */
 777        for (;;) {
 778                struct rte_eth_dev *dev;
 779                uint32_t tmp;
 780
 781                if (mlx5_glue->get_async_event(sh->ctx, &event))
 782                        break;
 783                /* Retrieve and check IB port index. */
 784                tmp = (uint32_t)event.element.port_num;
 785                if (!tmp && event.event_type == IBV_EVENT_DEVICE_FATAL) {
 786                        /*
 787                         * The DEVICE_FATAL event is called once for
 788                         * entire device without port specifying.
 789                         * We should notify all existing ports.
 790                         */
 791                        mlx5_glue->ack_async_event(&event);
 792                        mlx5_dev_interrupt_device_fatal(sh);
 793                        continue;
 794                }
 795                MLX5_ASSERT(tmp && (tmp <= sh->max_port));
 796                if (!tmp) {
 797                        /* Unsupported device level event. */
 798                        mlx5_glue->ack_async_event(&event);
 799                        DRV_LOG(DEBUG,
 800                                "unsupported common event (type %d)",
 801                                event.event_type);
 802                        continue;
 803                }
 804                if (tmp > sh->max_port) {
 805                        /* Invalid IB port index. */
 806                        mlx5_glue->ack_async_event(&event);
 807                        DRV_LOG(DEBUG,
 808                                "cannot handle an event (type %d)"
 809                                "due to invalid IB port index (%u)",
 810                                event.event_type, tmp);
 811                        continue;
 812                }
 813                if (sh->port[tmp - 1].ih_port_id >= RTE_MAX_ETHPORTS) {
 814                        /* No handler installed. */
 815                        mlx5_glue->ack_async_event(&event);
 816                        DRV_LOG(DEBUG,
 817                                "cannot handle an event (type %d)"
 818                                "due to no handler installed for port %u",
 819                                event.event_type, tmp);
 820                        continue;
 821                }
 822                /* Retrieve ethernet device descriptor. */
 823                tmp = sh->port[tmp - 1].ih_port_id;
 824                dev = &rte_eth_devices[tmp];
 825                MLX5_ASSERT(dev);
 826                if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
 827                     event.event_type == IBV_EVENT_PORT_ERR) &&
 828                        dev->data->dev_conf.intr_conf.lsc) {
 829                        mlx5_glue->ack_async_event(&event);
 830                        if (mlx5_link_update(dev, 0) == -EAGAIN) {
 831                                usleep(0);
 832                                continue;
 833                        }
 834                        rte_eth_dev_callback_process
 835                                (dev, RTE_ETH_EVENT_INTR_LSC, NULL);
 836                        continue;
 837                }
 838                DRV_LOG(DEBUG,
 839                        "port %u cannot handle an unknown event (type %d)",
 840                        dev->data->port_id, event.event_type);
 841                mlx5_glue->ack_async_event(&event);
 842        }
 843}
 844
 845/*
 846 * Unregister callback handler safely. The handler may be active
 847 * while we are trying to unregister it, in this case code -EAGAIN
 848 * is returned by rte_intr_callback_unregister(). This routine checks
 849 * the return code and tries to unregister handler again.
 850 *
 851 * @param handle
 852 *   interrupt handle
 853 * @param cb_fn
 854 *   pointer to callback routine
 855 * @cb_arg
 856 *   opaque callback parameter
 857 */
 858void
 859mlx5_intr_callback_unregister(const struct rte_intr_handle *handle,
 860                              rte_intr_callback_fn cb_fn, void *cb_arg)
 861{
 862        /*
 863         * Try to reduce timeout management overhead by not calling
 864         * the timer related routines on the first iteration. If the
 865         * unregistering succeeds on first call there will be no
 866         * timer calls at all.
 867         */
 868        uint64_t twait = 0;
 869        uint64_t start = 0;
 870
 871        do {
 872                int ret;
 873
 874                ret = rte_intr_callback_unregister(handle, cb_fn, cb_arg);
 875                if (ret >= 0)
 876                        return;
 877                if (ret != -EAGAIN) {
 878                        DRV_LOG(INFO, "failed to unregister interrupt"
 879                                      " handler (error: %d)", ret);
 880                        MLX5_ASSERT(false);
 881                        return;
 882                }
 883                if (twait) {
 884                        struct timespec onems;
 885
 886                        /* Wait one millisecond and try again. */
 887                        onems.tv_sec = 0;
 888                        onems.tv_nsec = NS_PER_S / MS_PER_S;
 889                        nanosleep(&onems, 0);
 890                        /* Check whether one second elapsed. */
 891                        if ((rte_get_timer_cycles() - start) <= twait)
 892                                continue;
 893                } else {
 894                        /*
 895                         * We get the amount of timer ticks for one second.
 896                         * If this amount elapsed it means we spent one
 897                         * second in waiting. This branch is executed once
 898                         * on first iteration.
 899                         */
 900                        twait = rte_get_timer_hz();
 901                        MLX5_ASSERT(twait);
 902                }
 903                /*
 904                 * Timeout elapsed, show message (once a second) and retry.
 905                 * We have no other acceptable option here, if we ignore
 906                 * the unregistering return code the handler will not
 907                 * be unregistered, fd will be closed and we may get the
 908                 * crush. Hanging and messaging in the loop seems not to be
 909                 * the worst choice.
 910                 */
 911                DRV_LOG(INFO, "Retrying to unregister interrupt handler");
 912                start = rte_get_timer_cycles();
 913        } while (true);
 914}
 915
 916/**
 917 * Handle DEVX interrupts from the NIC.
 918 * This function is probably called from the DPDK host thread.
 919 *
 920 * @param cb_arg
 921 *   Callback argument.
 922 */
 923void
 924mlx5_dev_interrupt_handler_devx(void *cb_arg)
 925{
 926#ifndef HAVE_IBV_DEVX_ASYNC
 927        (void)cb_arg;
 928        return;
 929#else
 930        struct mlx5_dev_ctx_shared *sh = cb_arg;
 931        union {
 932                struct mlx5dv_devx_async_cmd_hdr cmd_resp;
 933                uint8_t buf[MLX5_ST_SZ_BYTES(query_flow_counter_out) +
 934                            MLX5_ST_SZ_BYTES(traffic_counter) +
 935                            sizeof(struct mlx5dv_devx_async_cmd_hdr)];
 936        } out;
 937        uint8_t *buf = out.buf + sizeof(out.cmd_resp);
 938
 939        while (!mlx5_glue->devx_get_async_cmd_comp(sh->devx_comp,
 940                                                   &out.cmd_resp,
 941                                                   sizeof(out.buf)))
 942                mlx5_flow_async_pool_query_handle
 943                        (sh, (uint64_t)out.cmd_resp.wr_id,
 944                         mlx5_devx_get_out_command_status(buf));
 945#endif /* HAVE_IBV_DEVX_ASYNC */
 946}
 947
 948/**
 949 * DPDK callback to bring the link DOWN.
 950 *
 951 * @param dev
 952 *   Pointer to Ethernet device structure.
 953 *
 954 * @return
 955 *   0 on success, a negative errno value otherwise and rte_errno is set.
 956 */
 957int
 958mlx5_set_link_down(struct rte_eth_dev *dev)
 959{
 960        return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP);
 961}
 962
 963/**
 964 * DPDK callback to bring the link UP.
 965 *
 966 * @param dev
 967 *   Pointer to Ethernet device structure.
 968 *
 969 * @return
 970 *   0 on success, a negative errno value otherwise and rte_errno is set.
 971 */
 972int
 973mlx5_set_link_up(struct rte_eth_dev *dev)
 974{
 975        return mlx5_set_flags(dev, ~IFF_UP, IFF_UP);
 976}
 977
 978/**
 979 * Check if mlx5 device was removed.
 980 *
 981 * @param dev
 982 *   Pointer to Ethernet device structure.
 983 *
 984 * @return
 985 *   1 when device is removed, otherwise 0.
 986 */
 987int
 988mlx5_is_removed(struct rte_eth_dev *dev)
 989{
 990        struct ibv_device_attr device_attr;
 991        struct mlx5_priv *priv = dev->data->dev_private;
 992
 993        if (mlx5_glue->query_device(priv->sh->ctx, &device_attr) == EIO)
 994                return 1;
 995        return 0;
 996}
 997
 998/**
 999 * Analyze gathered port parameters via sysfs to recognize master
1000 * and representor devices for E-Switch configuration.
1001 *
1002 * @param[in] device_dir
1003 *   flag of presence of "device" directory under port device key.
1004 * @param[inout] switch_info
1005 *   Port information, including port name as a number and port name
1006 *   type if recognized
1007 *
1008 * @return
1009 *   master and representor flags are set in switch_info according to
1010 *   recognized parameters (if any).
1011 */
1012static void
1013mlx5_sysfs_check_switch_info(bool device_dir,
1014                             struct mlx5_switch_info *switch_info)
1015{
1016        switch (switch_info->name_type) {
1017        case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
1018                /*
1019                 * Name is not recognized, assume the master,
1020                 * check the device directory presence.
1021                 */
1022                switch_info->master = device_dir;
1023                break;
1024        case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
1025                /*
1026                 * Name is not set, this assumes the legacy naming
1027                 * schema for master, just check if there is
1028                 * a device directory.
1029                 */
1030                switch_info->master = device_dir;
1031                break;
1032        case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
1033                /* New uplink naming schema recognized. */
1034                switch_info->master = 1;
1035                break;
1036        case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
1037                /* Legacy representors naming schema. */
1038                switch_info->representor = !device_dir;
1039                break;
1040        case MLX5_PHYS_PORT_NAME_TYPE_PFHPF:
1041                /* Fallthrough */
1042        case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
1043                /* Fallthrough */
1044        case MLX5_PHYS_PORT_NAME_TYPE_PFSF:
1045                /* New representors naming schema. */
1046                switch_info->representor = 1;
1047                break;
1048        default:
1049                switch_info->master = device_dir;
1050                break;
1051        }
1052}
1053
1054/**
1055 * Get switch information associated with network interface.
1056 *
1057 * @param ifindex
1058 *   Network interface index.
1059 * @param[out] info
1060 *   Switch information object, populated in case of success.
1061 *
1062 * @return
1063 *   0 on success, a negative errno value otherwise and rte_errno is set.
1064 */
1065int
1066mlx5_sysfs_switch_info(unsigned int ifindex, struct mlx5_switch_info *info)
1067{
1068        char ifname[IF_NAMESIZE];
1069        char port_name[IF_NAMESIZE];
1070        FILE *file;
1071        struct mlx5_switch_info data = {
1072                .master = 0,
1073                .representor = 0,
1074                .name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
1075                .port_name = 0,
1076                .switch_id = 0,
1077        };
1078        DIR *dir;
1079        bool port_switch_id_set = false;
1080        bool device_dir = false;
1081        char c;
1082        int ret;
1083
1084        if (!if_indextoname(ifindex, ifname)) {
1085                rte_errno = errno;
1086                return -rte_errno;
1087        }
1088
1089        MKSTR(phys_port_name, "/sys/class/net/%s/phys_port_name",
1090              ifname);
1091        MKSTR(phys_switch_id, "/sys/class/net/%s/phys_switch_id",
1092              ifname);
1093        MKSTR(pci_device, "/sys/class/net/%s/device",
1094              ifname);
1095
1096        file = fopen(phys_port_name, "rb");
1097        if (file != NULL) {
1098                ret = fscanf(file, "%" RTE_STR(IF_NAMESIZE) "s", port_name);
1099                fclose(file);
1100                if (ret == 1)
1101                        mlx5_translate_port_name(port_name, &data);
1102        }
1103        file = fopen(phys_switch_id, "rb");
1104        if (file == NULL) {
1105                rte_errno = errno;
1106                return -rte_errno;
1107        }
1108        port_switch_id_set =
1109                fscanf(file, "%" SCNx64 "%c", &data.switch_id, &c) == 2 &&
1110                c == '\n';
1111        fclose(file);
1112        dir = opendir(pci_device);
1113        if (dir != NULL) {
1114                closedir(dir);
1115                device_dir = true;
1116        }
1117        if (port_switch_id_set) {
1118                /* We have some E-Switch configuration. */
1119                mlx5_sysfs_check_switch_info(device_dir, &data);
1120        }
1121        *info = data;
1122        MLX5_ASSERT(!(data.master && data.representor));
1123        if (data.master && data.representor) {
1124                DRV_LOG(ERR, "ifindex %u device is recognized as master"
1125                             " and as representor", ifindex);
1126                rte_errno = ENODEV;
1127                return -rte_errno;
1128        }
1129        return 0;
1130}
1131
1132/**
1133 * Get bond information associated with network interface.
1134 *
1135 * @param pf_ifindex
1136 *   Network interface index of bond slave interface
1137 * @param[out] ifindex
1138 *   Pointer to bond ifindex.
1139 * @param[out] ifname
1140 *   Pointer to bond ifname.
1141 *
1142 * @return
1143 *   0 on success, a negative errno value otherwise and rte_errno is set.
1144 */
1145int
1146mlx5_sysfs_bond_info(unsigned int pf_ifindex, unsigned int *ifindex,
1147                     char *ifname)
1148{
1149        char name[IF_NAMESIZE];
1150        FILE *file;
1151        unsigned int index;
1152        int ret;
1153
1154        if (!if_indextoname(pf_ifindex, name) || !strlen(name)) {
1155                rte_errno = errno;
1156                return -rte_errno;
1157        }
1158        MKSTR(bond_if, "/sys/class/net/%s/master/ifindex", name);
1159        /* read bond ifindex */
1160        file = fopen(bond_if, "rb");
1161        if (file == NULL) {
1162                rte_errno = errno;
1163                return -rte_errno;
1164        }
1165        ret = fscanf(file, "%u", &index);
1166        fclose(file);
1167        if (ret <= 0) {
1168                rte_errno = errno;
1169                return -rte_errno;
1170        }
1171        if (ifindex)
1172                *ifindex = index;
1173
1174        /* read bond device name from symbol link */
1175        if (ifname) {
1176                if (!if_indextoname(index, ifname)) {
1177                        rte_errno = errno;
1178                        return -rte_errno;
1179                }
1180        }
1181        return 0;
1182}
1183
1184/**
1185 * DPDK callback to retrieve plug-in module EEPROM information (type and size).
1186 *
1187 * @param dev
1188 *   Pointer to Ethernet device structure.
1189 * @param[out] modinfo
1190 *   Storage for plug-in module EEPROM information.
1191 *
1192 * @return
1193 *   0 on success, a negative errno value otherwise and rte_errno is set.
1194 */
1195int
1196mlx5_get_module_info(struct rte_eth_dev *dev,
1197                     struct rte_eth_dev_module_info *modinfo)
1198{
1199        struct ethtool_modinfo info = {
1200                .cmd = ETHTOOL_GMODULEINFO,
1201        };
1202        struct ifreq ifr = (struct ifreq) {
1203                .ifr_data = (void *)&info,
1204        };
1205        int ret = 0;
1206
1207        if (!dev) {
1208                DRV_LOG(WARNING, "missing argument, cannot get module info");
1209                rte_errno = EINVAL;
1210                return -rte_errno;
1211        }
1212        ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1213        if (ret) {
1214                DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL) failed: %s",
1215                        dev->data->port_id, strerror(rte_errno));
1216                return ret;
1217        }
1218        modinfo->type = info.type;
1219        modinfo->eeprom_len = info.eeprom_len;
1220        return ret;
1221}
1222
1223/**
1224 * DPDK callback to retrieve plug-in module EEPROM data.
1225 *
1226 * @param dev
1227 *   Pointer to Ethernet device structure.
1228 * @param[out] info
1229 *   Storage for plug-in module EEPROM data.
1230 *
1231 * @return
1232 *   0 on success, a negative errno value otherwise and rte_errno is set.
1233 */
1234int mlx5_get_module_eeprom(struct rte_eth_dev *dev,
1235                           struct rte_dev_eeprom_info *info)
1236{
1237        struct ethtool_eeprom *eeprom;
1238        struct ifreq ifr;
1239        int ret = 0;
1240
1241        if (!dev) {
1242                DRV_LOG(WARNING, "missing argument, cannot get module eeprom");
1243                rte_errno = EINVAL;
1244                return -rte_errno;
1245        }
1246        eeprom = mlx5_malloc(MLX5_MEM_ZERO,
1247                             (sizeof(struct ethtool_eeprom) + info->length), 0,
1248                             SOCKET_ID_ANY);
1249        if (!eeprom) {
1250                DRV_LOG(WARNING, "port %u cannot allocate memory for "
1251                        "eeprom data", dev->data->port_id);
1252                rte_errno = ENOMEM;
1253                return -rte_errno;
1254        }
1255        eeprom->cmd = ETHTOOL_GMODULEEEPROM;
1256        eeprom->offset = info->offset;
1257        eeprom->len = info->length;
1258        ifr = (struct ifreq) {
1259                .ifr_data = (void *)eeprom,
1260        };
1261        ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1262        if (ret)
1263                DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL) failed: %s",
1264                        dev->data->port_id, strerror(rte_errno));
1265        else
1266                rte_memcpy(info->data, eeprom->data, info->length);
1267        mlx5_free(eeprom);
1268        return ret;
1269}
1270
1271/**
1272 * Read device counters table.
1273 *
1274 * @param dev
1275 *   Pointer to Ethernet device.
1276 * @param[in] pf
1277 *   PF index in case of bonding device, -1 otherwise
1278 * @param[out] stats
1279 *   Counters table output buffer.
1280 *
1281 * @return
1282 *   0 on success and stats is filled, negative errno value otherwise and
1283 *   rte_errno is set.
1284 */
1285static int
1286_mlx5_os_read_dev_counters(struct rte_eth_dev *dev, int pf, uint64_t *stats)
1287{
1288        struct mlx5_priv *priv = dev->data->dev_private;
1289        struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl;
1290        unsigned int i;
1291        struct ifreq ifr;
1292        unsigned int stats_sz = xstats_ctrl->stats_n * sizeof(uint64_t);
1293        unsigned char et_stat_buf[sizeof(struct ethtool_stats) + stats_sz];
1294        struct ethtool_stats *et_stats = (struct ethtool_stats *)et_stat_buf;
1295        int ret;
1296
1297        et_stats->cmd = ETHTOOL_GSTATS;
1298        et_stats->n_stats = xstats_ctrl->stats_n;
1299        ifr.ifr_data = (caddr_t)et_stats;
1300        if (pf >= 0)
1301                ret = mlx5_ifreq_by_ifname(priv->sh->bond.ports[pf].ifname,
1302                                           SIOCETHTOOL, &ifr);
1303        else
1304                ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1305        if (ret) {
1306                DRV_LOG(WARNING,
1307                        "port %u unable to read statistic values from device",
1308                        dev->data->port_id);
1309                return ret;
1310        }
1311        for (i = 0; i != xstats_ctrl->mlx5_stats_n; ++i) {
1312                if (xstats_ctrl->info[i].dev)
1313                        continue;
1314                stats[i] += (uint64_t)
1315                            et_stats->data[xstats_ctrl->dev_table_idx[i]];
1316        }
1317        return 0;
1318}
1319
1320/**
1321 * Read device counters.
1322 *
1323 * @param dev
1324 *   Pointer to Ethernet device.
1325 * @param[out] stats
1326 *   Counters table output buffer.
1327 *
1328 * @return
1329 *   0 on success and stats is filled, negative errno value otherwise and
1330 *   rte_errno is set.
1331 */
1332int
1333mlx5_os_read_dev_counters(struct rte_eth_dev *dev, uint64_t *stats)
1334{
1335        struct mlx5_priv *priv = dev->data->dev_private;
1336        struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl;
1337        int ret = 0, i;
1338
1339        memset(stats, 0, sizeof(*stats) * xstats_ctrl->mlx5_stats_n);
1340        /* Read ifreq counters. */
1341        if (priv->master && priv->pf_bond >= 0) {
1342                /* Sum xstats from bonding device member ports. */
1343                for (i = 0; i < priv->sh->bond.n_port; i++) {
1344                        ret = _mlx5_os_read_dev_counters(dev, i, stats);
1345                        if (ret)
1346                                return ret;
1347                }
1348        } else {
1349                ret = _mlx5_os_read_dev_counters(dev, -1, stats);
1350        }
1351        /* Read IB counters. */
1352        for (i = 0; i != xstats_ctrl->mlx5_stats_n; ++i) {
1353                if (!xstats_ctrl->info[i].dev)
1354                        continue;
1355                ret = mlx5_os_read_dev_stat(priv, xstats_ctrl->info[i].ctr_name,
1356                                            &stats[i]);
1357                /* return last xstats counter if fail to read. */
1358                if (ret != 0)
1359                        xstats_ctrl->xstats[i] = stats[i];
1360                else
1361                        stats[i] = xstats_ctrl->xstats[i];
1362        }
1363        return ret;
1364}
1365
1366/**
1367 * Query the number of statistics provided by ETHTOOL.
1368 *
1369 * @param dev
1370 *   Pointer to Ethernet device.
1371 *
1372 * @return
1373 *   Number of statistics on success, negative errno value otherwise and
1374 *   rte_errno is set.
1375 */
1376int
1377mlx5_os_get_stats_n(struct rte_eth_dev *dev)
1378{
1379        struct mlx5_priv *priv = dev->data->dev_private;
1380        struct ethtool_drvinfo drvinfo;
1381        struct ifreq ifr;
1382        int ret;
1383
1384        drvinfo.cmd = ETHTOOL_GDRVINFO;
1385        ifr.ifr_data = (caddr_t)&drvinfo;
1386        if (priv->master && priv->pf_bond >= 0)
1387                /* Bonding PF. */
1388                ret = mlx5_ifreq_by_ifname(priv->sh->bond.ports[0].ifname,
1389                                           SIOCETHTOOL, &ifr);
1390        else
1391                ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1392        if (ret) {
1393                DRV_LOG(WARNING, "port %u unable to query number of statistics",
1394                        dev->data->port_id);
1395                return ret;
1396        }
1397        return drvinfo.n_stats;
1398}
1399
1400static const struct mlx5_counter_ctrl mlx5_counters_init[] = {
1401        {
1402                .dpdk_name = "rx_unicast_bytes",
1403                .ctr_name = "rx_vport_unicast_bytes",
1404        },
1405        {
1406                .dpdk_name = "rx_multicast_bytes",
1407                .ctr_name = "rx_vport_multicast_bytes",
1408        },
1409        {
1410                .dpdk_name = "rx_broadcast_bytes",
1411                .ctr_name = "rx_vport_broadcast_bytes",
1412        },
1413        {
1414                .dpdk_name = "rx_unicast_packets",
1415                .ctr_name = "rx_vport_unicast_packets",
1416        },
1417        {
1418                .dpdk_name = "rx_multicast_packets",
1419                .ctr_name = "rx_vport_multicast_packets",
1420        },
1421        {
1422                .dpdk_name = "rx_broadcast_packets",
1423                .ctr_name = "rx_vport_broadcast_packets",
1424        },
1425        {
1426                .dpdk_name = "tx_unicast_bytes",
1427                .ctr_name = "tx_vport_unicast_bytes",
1428        },
1429        {
1430                .dpdk_name = "tx_multicast_bytes",
1431                .ctr_name = "tx_vport_multicast_bytes",
1432        },
1433        {
1434                .dpdk_name = "tx_broadcast_bytes",
1435                .ctr_name = "tx_vport_broadcast_bytes",
1436        },
1437        {
1438                .dpdk_name = "tx_unicast_packets",
1439                .ctr_name = "tx_vport_unicast_packets",
1440        },
1441        {
1442                .dpdk_name = "tx_multicast_packets",
1443                .ctr_name = "tx_vport_multicast_packets",
1444        },
1445        {
1446                .dpdk_name = "tx_broadcast_packets",
1447                .ctr_name = "tx_vport_broadcast_packets",
1448        },
1449        {
1450                .dpdk_name = "rx_wqe_errors",
1451                .ctr_name = "rx_wqe_err",
1452        },
1453        {
1454                .dpdk_name = "rx_phy_crc_errors",
1455                .ctr_name = "rx_crc_errors_phy",
1456        },
1457        {
1458                .dpdk_name = "rx_phy_in_range_len_errors",
1459                .ctr_name = "rx_in_range_len_errors_phy",
1460        },
1461        {
1462                .dpdk_name = "rx_phy_symbol_errors",
1463                .ctr_name = "rx_symbol_err_phy",
1464        },
1465        {
1466                .dpdk_name = "tx_phy_errors",
1467                .ctr_name = "tx_errors_phy",
1468        },
1469        {
1470                .dpdk_name = "rx_out_of_buffer",
1471                .ctr_name = "out_of_buffer",
1472                .dev = 1,
1473        },
1474        {
1475                .dpdk_name = "tx_phy_packets",
1476                .ctr_name = "tx_packets_phy",
1477        },
1478        {
1479                .dpdk_name = "rx_phy_packets",
1480                .ctr_name = "rx_packets_phy",
1481        },
1482        {
1483                .dpdk_name = "tx_phy_discard_packets",
1484                .ctr_name = "tx_discards_phy",
1485        },
1486        {
1487                .dpdk_name = "rx_phy_discard_packets",
1488                .ctr_name = "rx_discards_phy",
1489        },
1490        {
1491                .dpdk_name = "tx_phy_bytes",
1492                .ctr_name = "tx_bytes_phy",
1493        },
1494        {
1495                .dpdk_name = "rx_phy_bytes",
1496                .ctr_name = "rx_bytes_phy",
1497        },
1498        /* Representor only */
1499        {
1500                .dpdk_name = "rx_vport_packets",
1501                .ctr_name = "vport_rx_packets",
1502        },
1503        {
1504                .dpdk_name = "rx_vport_bytes",
1505                .ctr_name = "vport_rx_bytes",
1506        },
1507        {
1508                .dpdk_name = "tx_vport_packets",
1509                .ctr_name = "vport_tx_packets",
1510        },
1511        {
1512                .dpdk_name = "tx_vport_bytes",
1513                .ctr_name = "vport_tx_bytes",
1514        },
1515};
1516
1517static const unsigned int xstats_n = RTE_DIM(mlx5_counters_init);
1518
1519/**
1520 * Init the structures to read device counters.
1521 *
1522 * @param dev
1523 *   Pointer to Ethernet device.
1524 */
1525void
1526mlx5_os_stats_init(struct rte_eth_dev *dev)
1527{
1528        struct mlx5_priv *priv = dev->data->dev_private;
1529        struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl;
1530        struct mlx5_stats_ctrl *stats_ctrl = &priv->stats_ctrl;
1531        unsigned int i;
1532        unsigned int j;
1533        struct ifreq ifr;
1534        struct ethtool_gstrings *strings = NULL;
1535        unsigned int dev_stats_n;
1536        unsigned int str_sz;
1537        int ret;
1538
1539        /* So that it won't aggregate for each init. */
1540        xstats_ctrl->mlx5_stats_n = 0;
1541        ret = mlx5_os_get_stats_n(dev);
1542        if (ret < 0) {
1543                DRV_LOG(WARNING, "port %u no extended statistics available",
1544                        dev->data->port_id);
1545                return;
1546        }
1547        dev_stats_n = ret;
1548        /* Allocate memory to grab stat names and values. */
1549        str_sz = dev_stats_n * ETH_GSTRING_LEN;
1550        strings = (struct ethtool_gstrings *)
1551                  mlx5_malloc(0, str_sz + sizeof(struct ethtool_gstrings), 0,
1552                              SOCKET_ID_ANY);
1553        if (!strings) {
1554                DRV_LOG(WARNING, "port %u unable to allocate memory for xstats",
1555                     dev->data->port_id);
1556                return;
1557        }
1558        strings->cmd = ETHTOOL_GSTRINGS;
1559        strings->string_set = ETH_SS_STATS;
1560        strings->len = dev_stats_n;
1561        ifr.ifr_data = (caddr_t)strings;
1562        if (priv->master && priv->pf_bond >= 0)
1563                /* Bonding master. */
1564                ret = mlx5_ifreq_by_ifname(priv->sh->bond.ports[0].ifname,
1565                                           SIOCETHTOOL, &ifr);
1566        else
1567                ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
1568        if (ret) {
1569                DRV_LOG(WARNING, "port %u unable to get statistic names",
1570                        dev->data->port_id);
1571                goto free;
1572        }
1573        for (i = 0; i != dev_stats_n; ++i) {
1574                const char *curr_string = (const char *)
1575                        &strings->data[i * ETH_GSTRING_LEN];
1576
1577                for (j = 0; j != xstats_n; ++j) {
1578                        if (!strcmp(mlx5_counters_init[j].ctr_name,
1579                                    curr_string)) {
1580                                unsigned int idx = xstats_ctrl->mlx5_stats_n++;
1581
1582                                xstats_ctrl->dev_table_idx[idx] = i;
1583                                xstats_ctrl->info[idx] = mlx5_counters_init[j];
1584                                break;
1585                        }
1586                }
1587        }
1588        /* Add dev counters. */
1589        for (i = 0; i != xstats_n; ++i) {
1590                if (mlx5_counters_init[i].dev) {
1591                        unsigned int idx = xstats_ctrl->mlx5_stats_n++;
1592
1593                        xstats_ctrl->info[idx] = mlx5_counters_init[i];
1594                        xstats_ctrl->hw_stats[idx] = 0;
1595                }
1596        }
1597        MLX5_ASSERT(xstats_ctrl->mlx5_stats_n <= MLX5_MAX_XSTATS);
1598        xstats_ctrl->stats_n = dev_stats_n;
1599        /* Copy to base at first time. */
1600        ret = mlx5_os_read_dev_counters(dev, xstats_ctrl->base);
1601        if (ret)
1602                DRV_LOG(ERR, "port %u cannot read device counters: %s",
1603                        dev->data->port_id, strerror(rte_errno));
1604        mlx5_os_read_dev_stat(priv, "out_of_buffer", &stats_ctrl->imissed_base);
1605        stats_ctrl->imissed = 0;
1606free:
1607        mlx5_free(strings);
1608}
1609
1610/**
1611 * Get MAC address by querying netdevice.
1612 *
1613 * @param[in] dev
1614 *   Pointer to Ethernet device.
1615 * @param[out] mac
1616 *   MAC address output buffer.
1617 *
1618 * @return
1619 *   0 on success, a negative errno value otherwise and rte_errno is set.
1620 */
1621int
1622mlx5_get_mac(struct rte_eth_dev *dev, uint8_t (*mac)[RTE_ETHER_ADDR_LEN])
1623{
1624        struct ifreq request;
1625        int ret;
1626
1627        ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request);
1628        if (ret)
1629                return ret;
1630        memcpy(mac, request.ifr_hwaddr.sa_data, RTE_ETHER_ADDR_LEN);
1631        return 0;
1632}
1633