linux/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
<<
>>
Prefs
   1/* SPDX-License-Identifier: GPL-2.0 */
   2/* Copyright (c) 2019 Mellanox Technologies. */
   3
   4#include "health.h"
   5#include "en/ptp.h"
   6
   7static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq)
   8{
   9        unsigned long exp_time = jiffies +
  10                                 msecs_to_jiffies(MLX5E_REPORTER_FLUSH_TIMEOUT_MSEC);
  11
  12        while (time_before(jiffies, exp_time)) {
  13                if (sq->cc == sq->pc)
  14                        return 0;
  15
  16                msleep(20);
  17        }
  18
  19        netdev_err(sq->netdev,
  20                   "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n",
  21                   sq->sqn, sq->cc, sq->pc);
  22
  23        return -ETIMEDOUT;
  24}
  25
  26static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq)
  27{
  28        WARN_ONCE(sq->cc != sq->pc,
  29                  "SQ 0x%x: cc (0x%x) != pc (0x%x)\n",
  30                  sq->sqn, sq->cc, sq->pc);
  31        sq->cc = 0;
  32        sq->dma_fifo_cc = 0;
  33        sq->pc = 0;
  34}
  35
  36static int mlx5e_tx_reporter_err_cqe_recover(void *ctx)
  37{
  38        struct mlx5_core_dev *mdev;
  39        struct net_device *dev;
  40        struct mlx5e_txqsq *sq;
  41        u8 state;
  42        int err;
  43
  44        sq = ctx;
  45        mdev = sq->mdev;
  46        dev = sq->netdev;
  47
  48        if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state))
  49                return 0;
  50
  51        err = mlx5_core_query_sq_state(mdev, sq->sqn, &state);
  52        if (err) {
  53                netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n",
  54                           sq->sqn, err);
  55                goto out;
  56        }
  57
  58        if (state != MLX5_SQC_STATE_ERR)
  59                goto out;
  60
  61        mlx5e_tx_disable_queue(sq->txq);
  62
  63        err = mlx5e_wait_for_sq_flush(sq);
  64        if (err)
  65                goto out;
  66
  67        /* At this point, no new packets will arrive from the stack as TXQ is
  68         * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all
  69         * pending WQEs. SQ can safely reset the SQ.
  70         */
  71
  72        err = mlx5e_health_sq_to_ready(mdev, dev, sq->sqn);
  73        if (err)
  74                goto out;
  75
  76        mlx5e_reset_txqsq_cc_pc(sq);
  77        sq->stats->recover++;
  78        clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
  79        mlx5e_activate_txqsq(sq);
  80
  81        return 0;
  82out:
  83        clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
  84        return err;
  85}
  86
  87struct mlx5e_tx_timeout_ctx {
  88        struct mlx5e_txqsq *sq;
  89        signed int status;
  90};
  91
  92static int mlx5e_tx_reporter_timeout_recover(void *ctx)
  93{
  94        struct mlx5e_tx_timeout_ctx *to_ctx;
  95        struct mlx5e_priv *priv;
  96        struct mlx5_eq_comp *eq;
  97        struct mlx5e_txqsq *sq;
  98        int err;
  99
 100        to_ctx = ctx;
 101        sq = to_ctx->sq;
 102        eq = sq->cq.mcq.eq;
 103        priv = sq->priv;
 104        err = mlx5e_health_channel_eq_recover(sq->netdev, eq, sq->cq.ch_stats);
 105        if (!err) {
 106                to_ctx->status = 0; /* this sq recovered */
 107                return err;
 108        }
 109
 110        err = mlx5e_safe_reopen_channels(priv);
 111        if (!err) {
 112                to_ctx->status = 1; /* all channels recovered */
 113                return err;
 114        }
 115
 116        to_ctx->status = err;
 117        clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
 118        netdev_err(priv->netdev,
 119                   "mlx5e_safe_reopen_channels failed recovering from a tx_timeout, err(%d).\n",
 120                   err);
 121
 122        return err;
 123}
 124
 125/* state lock cannot be grabbed within this function.
 126 * It can cause a dead lock or a read-after-free.
 127 */
 128static int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx)
 129{
 130        return err_ctx->recover(err_ctx->ctx);
 131}
 132
 133static int mlx5e_tx_reporter_recover(struct devlink_health_reporter *reporter,
 134                                     void *context,
 135                                     struct netlink_ext_ack *extack)
 136{
 137        struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
 138        struct mlx5e_err_ctx *err_ctx = context;
 139
 140        return err_ctx ? mlx5e_tx_reporter_recover_from_ctx(err_ctx) :
 141                         mlx5e_health_recover_channels(priv);
 142}
 143
 144static int
 145mlx5e_tx_reporter_build_diagnose_output_sq_common(struct devlink_fmsg *fmsg,
 146                                                  struct mlx5e_txqsq *sq, int tc)
 147{
 148        bool stopped = netif_xmit_stopped(sq->txq);
 149        struct mlx5e_priv *priv = sq->priv;
 150        u8 state;
 151        int err;
 152
 153        err = mlx5_core_query_sq_state(priv->mdev, sq->sqn, &state);
 154        if (err)
 155                return err;
 156
 157        err = devlink_fmsg_u32_pair_put(fmsg, "tc", tc);
 158        if (err)
 159                return err;
 160
 161        err = devlink_fmsg_u32_pair_put(fmsg, "txq ix", sq->txq_ix);
 162        if (err)
 163                return err;
 164
 165        err = devlink_fmsg_u32_pair_put(fmsg, "sqn", sq->sqn);
 166        if (err)
 167                return err;
 168
 169        err = devlink_fmsg_u8_pair_put(fmsg, "HW state", state);
 170        if (err)
 171                return err;
 172
 173        err = devlink_fmsg_bool_pair_put(fmsg, "stopped", stopped);
 174        if (err)
 175                return err;
 176
 177        err = devlink_fmsg_u32_pair_put(fmsg, "cc", sq->cc);
 178        if (err)
 179                return err;
 180
 181        err = devlink_fmsg_u32_pair_put(fmsg, "pc", sq->pc);
 182        if (err)
 183                return err;
 184
 185        err = mlx5e_health_cq_diag_fmsg(&sq->cq, fmsg);
 186        if (err)
 187                return err;
 188
 189        return mlx5e_health_eq_diag_fmsg(sq->cq.mcq.eq, fmsg);
 190}
 191
 192static int
 193mlx5e_tx_reporter_build_diagnose_output(struct devlink_fmsg *fmsg,
 194                                        struct mlx5e_txqsq *sq, int tc)
 195{
 196        int err;
 197
 198        err = devlink_fmsg_obj_nest_start(fmsg);
 199        if (err)
 200                return err;
 201
 202        err = devlink_fmsg_u32_pair_put(fmsg, "channel ix", sq->ch_ix);
 203        if (err)
 204                return err;
 205
 206        err = mlx5e_tx_reporter_build_diagnose_output_sq_common(fmsg, sq, tc);
 207        if (err)
 208                return err;
 209
 210        err = devlink_fmsg_obj_nest_end(fmsg);
 211        if (err)
 212                return err;
 213
 214        return 0;
 215}
 216
 217static int
 218mlx5e_tx_reporter_build_diagnose_output_ptpsq(struct devlink_fmsg *fmsg,
 219                                              struct mlx5e_ptpsq *ptpsq, int tc)
 220{
 221        int err;
 222
 223        err = devlink_fmsg_obj_nest_start(fmsg);
 224        if (err)
 225                return err;
 226
 227        err = devlink_fmsg_string_pair_put(fmsg, "channel", "ptp");
 228        if (err)
 229                return err;
 230
 231        err = mlx5e_tx_reporter_build_diagnose_output_sq_common(fmsg, &ptpsq->txqsq, tc);
 232        if (err)
 233                return err;
 234
 235        err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Port TS");
 236        if (err)
 237                return err;
 238
 239        err = mlx5e_health_cq_diag_fmsg(&ptpsq->ts_cq, fmsg);
 240        if (err)
 241                return err;
 242
 243        err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
 244        if (err)
 245                return err;
 246
 247        err = devlink_fmsg_obj_nest_end(fmsg);
 248        if (err)
 249                return err;
 250
 251        return 0;
 252}
 253
 254static int
 255mlx5e_tx_reporter_diagnose_generic_txqsq(struct devlink_fmsg *fmsg,
 256                                         struct mlx5e_txqsq *txqsq)
 257{
 258        u32 sq_stride, sq_sz;
 259        int err;
 260
 261        err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ");
 262        if (err)
 263                return err;
 264
 265        sq_sz = mlx5_wq_cyc_get_size(&txqsq->wq);
 266        sq_stride = MLX5_SEND_WQE_BB;
 267
 268        err = devlink_fmsg_u64_pair_put(fmsg, "stride size", sq_stride);
 269        if (err)
 270                return err;
 271
 272        err = devlink_fmsg_u32_pair_put(fmsg, "size", sq_sz);
 273        if (err)
 274                return err;
 275
 276        err = mlx5e_health_cq_common_diag_fmsg(&txqsq->cq, fmsg);
 277        if (err)
 278                return err;
 279
 280        return mlx5e_health_fmsg_named_obj_nest_end(fmsg);
 281}
 282
 283static int
 284mlx5e_tx_reporter_diagnose_generic_tx_port_ts(struct devlink_fmsg *fmsg,
 285                                              struct mlx5e_ptpsq *ptpsq)
 286{
 287        int err;
 288
 289        err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Port TS");
 290        if (err)
 291                return err;
 292
 293        err = mlx5e_health_cq_common_diag_fmsg(&ptpsq->ts_cq, fmsg);
 294        if (err)
 295                return err;
 296
 297        return mlx5e_health_fmsg_named_obj_nest_end(fmsg);
 298}
 299
 300static int
 301mlx5e_tx_reporter_diagnose_common_config(struct devlink_health_reporter *reporter,
 302                                         struct devlink_fmsg *fmsg)
 303{
 304        struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
 305        struct mlx5e_txqsq *generic_sq = priv->txq2sq[0];
 306        struct mlx5e_ptpsq *generic_ptpsq;
 307        int err;
 308
 309        err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Common Config");
 310        if (err)
 311                return err;
 312
 313        err = mlx5e_tx_reporter_diagnose_generic_txqsq(fmsg, generic_sq);
 314        if (err)
 315                return err;
 316
 317        generic_ptpsq = priv->channels.port_ptp ?
 318                        &priv->channels.port_ptp->ptpsq[0] :
 319                        NULL;
 320        if (!generic_ptpsq)
 321                goto out;
 322
 323        err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "PTP");
 324        if (err)
 325                return err;
 326
 327        err = mlx5e_tx_reporter_diagnose_generic_txqsq(fmsg, &generic_ptpsq->txqsq);
 328        if (err)
 329                return err;
 330
 331        err = mlx5e_tx_reporter_diagnose_generic_tx_port_ts(fmsg, generic_ptpsq);
 332        if (err)
 333                return err;
 334
 335        err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
 336        if (err)
 337                return err;
 338
 339out:
 340        return mlx5e_health_fmsg_named_obj_nest_end(fmsg);
 341}
 342
 343static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter,
 344                                      struct devlink_fmsg *fmsg,
 345                                      struct netlink_ext_ack *extack)
 346{
 347        struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
 348        struct mlx5e_port_ptp *ptp_ch = priv->channels.port_ptp;
 349
 350        int i, tc, err = 0;
 351
 352        mutex_lock(&priv->state_lock);
 353
 354        if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
 355                goto unlock;
 356
 357        err = mlx5e_tx_reporter_diagnose_common_config(reporter, fmsg);
 358        if (err)
 359                goto unlock;
 360
 361        err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
 362        if (err)
 363                goto unlock;
 364
 365        for (i = 0; i < priv->channels.num; i++) {
 366                struct mlx5e_channel *c = priv->channels.c[i];
 367
 368                for (tc = 0; tc < priv->channels.params.num_tc; tc++) {
 369                        struct mlx5e_txqsq *sq = &c->sq[tc];
 370
 371                        err = mlx5e_tx_reporter_build_diagnose_output(fmsg, sq, tc);
 372                        if (err)
 373                                goto unlock;
 374                }
 375        }
 376
 377        if (!ptp_ch)
 378                goto close_sqs_nest;
 379
 380        for (tc = 0; tc < priv->channels.params.num_tc; tc++) {
 381                err = mlx5e_tx_reporter_build_diagnose_output_ptpsq(fmsg,
 382                                                                    &ptp_ch->ptpsq[tc],
 383                                                                    tc);
 384                if (err)
 385                        goto unlock;
 386        }
 387
 388close_sqs_nest:
 389        err = devlink_fmsg_arr_pair_nest_end(fmsg);
 390        if (err)
 391                goto unlock;
 392
 393unlock:
 394        mutex_unlock(&priv->state_lock);
 395        return err;
 396}
 397
 398static int mlx5e_tx_reporter_dump_sq(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg,
 399                                     void *ctx)
 400{
 401        struct mlx5_rsc_key key = {};
 402        struct mlx5e_txqsq *sq = ctx;
 403        int err;
 404
 405        if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
 406                return 0;
 407
 408        err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice");
 409        if (err)
 410                return err;
 411
 412        key.size = PAGE_SIZE;
 413        key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL;
 414        err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
 415        if (err)
 416                return err;
 417
 418        err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
 419        if (err)
 420                return err;
 421
 422        err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ");
 423        if (err)
 424                return err;
 425
 426        err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "QPC");
 427        if (err)
 428                return err;
 429
 430        key.rsc = MLX5_SGMT_TYPE_FULL_QPC;
 431        key.index1 = sq->sqn;
 432        key.num_of_obj1 = 1;
 433
 434        err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
 435        if (err)
 436                return err;
 437
 438        err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
 439        if (err)
 440                return err;
 441
 442        err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "send_buff");
 443        if (err)
 444                return err;
 445
 446        key.rsc = MLX5_SGMT_TYPE_SND_BUFF;
 447        key.num_of_obj2 = MLX5_RSC_DUMP_ALL;
 448        err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
 449        if (err)
 450                return err;
 451
 452        err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
 453        if (err)
 454                return err;
 455
 456        return mlx5e_health_fmsg_named_obj_nest_end(fmsg);
 457}
 458
 459static int mlx5e_tx_reporter_dump_all_sqs(struct mlx5e_priv *priv,
 460                                          struct devlink_fmsg *fmsg)
 461{
 462        struct mlx5e_port_ptp *ptp_ch = priv->channels.port_ptp;
 463        struct mlx5_rsc_key key = {};
 464        int i, tc, err;
 465
 466        if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
 467                return 0;
 468
 469        err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice");
 470        if (err)
 471                return err;
 472
 473        key.size = PAGE_SIZE;
 474        key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL;
 475        err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
 476        if (err)
 477                return err;
 478
 479        err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
 480        if (err)
 481                return err;
 482
 483        err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
 484        if (err)
 485                return err;
 486
 487        for (i = 0; i < priv->channels.num; i++) {
 488                struct mlx5e_channel *c = priv->channels.c[i];
 489
 490                for (tc = 0; tc < priv->channels.params.num_tc; tc++) {
 491                        struct mlx5e_txqsq *sq = &c->sq[tc];
 492
 493                        err = mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "SQ");
 494                        if (err)
 495                                return err;
 496                }
 497        }
 498
 499        if (ptp_ch) {
 500                for (tc = 0; tc < priv->channels.params.num_tc; tc++) {
 501                        struct mlx5e_txqsq *sq = &ptp_ch->ptpsq[tc].txqsq;
 502
 503                        err = mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "PTP SQ");
 504                        if (err)
 505                                return err;
 506                }
 507        }
 508
 509        return devlink_fmsg_arr_pair_nest_end(fmsg);
 510}
 511
 512static int mlx5e_tx_reporter_dump_from_ctx(struct mlx5e_priv *priv,
 513                                           struct mlx5e_err_ctx *err_ctx,
 514                                           struct devlink_fmsg *fmsg)
 515{
 516        return err_ctx->dump(priv, fmsg, err_ctx->ctx);
 517}
 518
 519static int mlx5e_tx_reporter_dump(struct devlink_health_reporter *reporter,
 520                                  struct devlink_fmsg *fmsg, void *context,
 521                                  struct netlink_ext_ack *extack)
 522{
 523        struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
 524        struct mlx5e_err_ctx *err_ctx = context;
 525
 526        return err_ctx ? mlx5e_tx_reporter_dump_from_ctx(priv, err_ctx, fmsg) :
 527                         mlx5e_tx_reporter_dump_all_sqs(priv, fmsg);
 528}
 529
 530void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq)
 531{
 532        char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
 533        struct mlx5e_priv *priv = sq->priv;
 534        struct mlx5e_err_ctx err_ctx = {};
 535
 536        err_ctx.ctx = sq;
 537        err_ctx.recover = mlx5e_tx_reporter_err_cqe_recover;
 538        err_ctx.dump = mlx5e_tx_reporter_dump_sq;
 539        snprintf(err_str, sizeof(err_str), "ERR CQE on SQ: 0x%x", sq->sqn);
 540
 541        mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx);
 542}
 543
 544int mlx5e_reporter_tx_timeout(struct mlx5e_txqsq *sq)
 545{
 546        char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
 547        struct mlx5e_tx_timeout_ctx to_ctx = {};
 548        struct mlx5e_priv *priv = sq->priv;
 549        struct mlx5e_err_ctx err_ctx = {};
 550
 551        to_ctx.sq = sq;
 552        err_ctx.ctx = &to_ctx;
 553        err_ctx.recover = mlx5e_tx_reporter_timeout_recover;
 554        err_ctx.dump = mlx5e_tx_reporter_dump_sq;
 555        snprintf(err_str, sizeof(err_str),
 556                 "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u",
 557                 sq->ch_ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc,
 558                 jiffies_to_usecs(jiffies - sq->txq->trans_start));
 559
 560        mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx);
 561        return to_ctx.status;
 562}
 563
 564static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
 565                .name = "tx",
 566                .recover = mlx5e_tx_reporter_recover,
 567                .diagnose = mlx5e_tx_reporter_diagnose,
 568                .dump = mlx5e_tx_reporter_dump,
 569};
 570
 571#define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500
 572
 573void mlx5e_reporter_tx_create(struct mlx5e_priv *priv)
 574{
 575        struct devlink_health_reporter *reporter;
 576
 577        reporter = devlink_port_health_reporter_create(&priv->dl_port, &mlx5_tx_reporter_ops,
 578                                                       MLX5_REPORTER_TX_GRACEFUL_PERIOD, priv);
 579        if (IS_ERR(reporter)) {
 580                netdev_warn(priv->netdev,
 581                            "Failed to create tx reporter, err = %ld\n",
 582                            PTR_ERR(reporter));
 583                return;
 584        }
 585        priv->tx_reporter = reporter;
 586}
 587
 588void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv)
 589{
 590        if (!priv->tx_reporter)
 591                return;
 592
 593        devlink_port_health_reporter_destroy(priv->tx_reporter);
 594}
 595