1
2
3
4#include "health.h"
5#include "en/ptp.h"
6
7static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq)
8{
9 unsigned long exp_time = jiffies +
10 msecs_to_jiffies(MLX5E_REPORTER_FLUSH_TIMEOUT_MSEC);
11
12 while (time_before(jiffies, exp_time)) {
13 if (sq->cc == sq->pc)
14 return 0;
15
16 msleep(20);
17 }
18
19 netdev_err(sq->netdev,
20 "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n",
21 sq->sqn, sq->cc, sq->pc);
22
23 return -ETIMEDOUT;
24}
25
26static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq)
27{
28 WARN_ONCE(sq->cc != sq->pc,
29 "SQ 0x%x: cc (0x%x) != pc (0x%x)\n",
30 sq->sqn, sq->cc, sq->pc);
31 sq->cc = 0;
32 sq->dma_fifo_cc = 0;
33 sq->pc = 0;
34}
35
36static int mlx5e_tx_reporter_err_cqe_recover(void *ctx)
37{
38 struct mlx5_core_dev *mdev;
39 struct net_device *dev;
40 struct mlx5e_txqsq *sq;
41 u8 state;
42 int err;
43
44 sq = ctx;
45 mdev = sq->mdev;
46 dev = sq->netdev;
47
48 if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state))
49 return 0;
50
51 err = mlx5_core_query_sq_state(mdev, sq->sqn, &state);
52 if (err) {
53 netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n",
54 sq->sqn, err);
55 goto out;
56 }
57
58 if (state != MLX5_SQC_STATE_ERR)
59 goto out;
60
61 mlx5e_tx_disable_queue(sq->txq);
62
63 err = mlx5e_wait_for_sq_flush(sq);
64 if (err)
65 goto out;
66
67
68
69
70
71
72 err = mlx5e_health_sq_to_ready(mdev, dev, sq->sqn);
73 if (err)
74 goto out;
75
76 mlx5e_reset_txqsq_cc_pc(sq);
77 sq->stats->recover++;
78 clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
79 mlx5e_activate_txqsq(sq);
80
81 return 0;
82out:
83 clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
84 return err;
85}
86
/*
 * Context passed to the TX timeout recover callback.
 *
 * @sq:     the SQ that timed out.
 * @status: written by mlx5e_tx_reporter_timeout_recover(): 0 when the EQ
 *          recovery succeeded, 1 when the channels were reopened
 *          successfully, otherwise the error returned by the reopen.
 */
struct mlx5e_tx_timeout_ctx {
	struct mlx5e_txqsq *sq;
	int status;
};
91
92static int mlx5e_tx_reporter_timeout_recover(void *ctx)
93{
94 struct mlx5e_tx_timeout_ctx *to_ctx;
95 struct mlx5e_priv *priv;
96 struct mlx5_eq_comp *eq;
97 struct mlx5e_txqsq *sq;
98 int err;
99
100 to_ctx = ctx;
101 sq = to_ctx->sq;
102 eq = sq->cq.mcq.eq;
103 priv = sq->priv;
104 err = mlx5e_health_channel_eq_recover(sq->netdev, eq, sq->cq.ch_stats);
105 if (!err) {
106 to_ctx->status = 0;
107 return err;
108 }
109
110 err = mlx5e_safe_reopen_channels(priv);
111 if (!err) {
112 to_ctx->status = 1;
113 return err;
114 }
115
116 to_ctx->status = err;
117 clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
118 netdev_err(priv->netdev,
119 "mlx5e_safe_reopen_channels failed recovering from a tx_timeout, err(%d).\n",
120 err);
121
122 return err;
123}
124
125
126
127
128static int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx)
129{
130 return err_ctx->recover(err_ctx->ctx);
131}
132
/*
 * devlink health .recover callback of the TX reporter.
 *
 * With an error context, the context's own recover callback is run.
 * Without one, mlx5e_health_recover_channels() is called instead.
 */
static int mlx5e_tx_reporter_recover(struct devlink_health_reporter *reporter,
				     void *context,
				     struct netlink_ext_ack *extack)
{
	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
	struct mlx5e_err_ctx *err_ctx = context;

	if (!err_ctx)
		return mlx5e_health_recover_channels(priv);

	return mlx5e_tx_reporter_recover_from_ctx(err_ctx);
}
143
144static int
145mlx5e_tx_reporter_build_diagnose_output_sq_common(struct devlink_fmsg *fmsg,
146 struct mlx5e_txqsq *sq, int tc)
147{
148 bool stopped = netif_xmit_stopped(sq->txq);
149 struct mlx5e_priv *priv = sq->priv;
150 u8 state;
151 int err;
152
153 err = mlx5_core_query_sq_state(priv->mdev, sq->sqn, &state);
154 if (err)
155 return err;
156
157 err = devlink_fmsg_u32_pair_put(fmsg, "tc", tc);
158 if (err)
159 return err;
160
161 err = devlink_fmsg_u32_pair_put(fmsg, "txq ix", sq->txq_ix);
162 if (err)
163 return err;
164
165 err = devlink_fmsg_u32_pair_put(fmsg, "sqn", sq->sqn);
166 if (err)
167 return err;
168
169 err = devlink_fmsg_u8_pair_put(fmsg, "HW state", state);
170 if (err)
171 return err;
172
173 err = devlink_fmsg_bool_pair_put(fmsg, "stopped", stopped);
174 if (err)
175 return err;
176
177 err = devlink_fmsg_u32_pair_put(fmsg, "cc", sq->cc);
178 if (err)
179 return err;
180
181 err = devlink_fmsg_u32_pair_put(fmsg, "pc", sq->pc);
182 if (err)
183 return err;
184
185 err = mlx5e_health_cq_diag_fmsg(&sq->cq, fmsg);
186 if (err)
187 return err;
188
189 return mlx5e_health_eq_diag_fmsg(sq->cq.mcq.eq, fmsg);
190}
191
192static int
193mlx5e_tx_reporter_build_diagnose_output(struct devlink_fmsg *fmsg,
194 struct mlx5e_txqsq *sq, int tc)
195{
196 int err;
197
198 err = devlink_fmsg_obj_nest_start(fmsg);
199 if (err)
200 return err;
201
202 err = devlink_fmsg_u32_pair_put(fmsg, "channel ix", sq->ch_ix);
203 if (err)
204 return err;
205
206 err = mlx5e_tx_reporter_build_diagnose_output_sq_common(fmsg, sq, tc);
207 if (err)
208 return err;
209
210 err = devlink_fmsg_obj_nest_end(fmsg);
211 if (err)
212 return err;
213
214 return 0;
215}
216
217static int
218mlx5e_tx_reporter_build_diagnose_output_ptpsq(struct devlink_fmsg *fmsg,
219 struct mlx5e_ptpsq *ptpsq, int tc)
220{
221 int err;
222
223 err = devlink_fmsg_obj_nest_start(fmsg);
224 if (err)
225 return err;
226
227 err = devlink_fmsg_string_pair_put(fmsg, "channel", "ptp");
228 if (err)
229 return err;
230
231 err = mlx5e_tx_reporter_build_diagnose_output_sq_common(fmsg, &ptpsq->txqsq, tc);
232 if (err)
233 return err;
234
235 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Port TS");
236 if (err)
237 return err;
238
239 err = mlx5e_health_cq_diag_fmsg(&ptpsq->ts_cq, fmsg);
240 if (err)
241 return err;
242
243 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
244 if (err)
245 return err;
246
247 err = devlink_fmsg_obj_nest_end(fmsg);
248 if (err)
249 return err;
250
251 return 0;
252}
253
254static int
255mlx5e_tx_reporter_diagnose_generic_txqsq(struct devlink_fmsg *fmsg,
256 struct mlx5e_txqsq *txqsq)
257{
258 u32 sq_stride, sq_sz;
259 int err;
260
261 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ");
262 if (err)
263 return err;
264
265 sq_sz = mlx5_wq_cyc_get_size(&txqsq->wq);
266 sq_stride = MLX5_SEND_WQE_BB;
267
268 err = devlink_fmsg_u64_pair_put(fmsg, "stride size", sq_stride);
269 if (err)
270 return err;
271
272 err = devlink_fmsg_u32_pair_put(fmsg, "size", sq_sz);
273 if (err)
274 return err;
275
276 err = mlx5e_health_cq_common_diag_fmsg(&txqsq->cq, fmsg);
277 if (err)
278 return err;
279
280 return mlx5e_health_fmsg_named_obj_nest_end(fmsg);
281}
282
283static int
284mlx5e_tx_reporter_diagnose_generic_tx_port_ts(struct devlink_fmsg *fmsg,
285 struct mlx5e_ptpsq *ptpsq)
286{
287 int err;
288
289 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Port TS");
290 if (err)
291 return err;
292
293 err = mlx5e_health_cq_common_diag_fmsg(&ptpsq->ts_cq, fmsg);
294 if (err)
295 return err;
296
297 return mlx5e_health_fmsg_named_obj_nest_end(fmsg);
298}
299
300static int
301mlx5e_tx_reporter_diagnose_common_config(struct devlink_health_reporter *reporter,
302 struct devlink_fmsg *fmsg)
303{
304 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
305 struct mlx5e_txqsq *generic_sq = priv->txq2sq[0];
306 struct mlx5e_ptpsq *generic_ptpsq;
307 int err;
308
309 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Common Config");
310 if (err)
311 return err;
312
313 err = mlx5e_tx_reporter_diagnose_generic_txqsq(fmsg, generic_sq);
314 if (err)
315 return err;
316
317 generic_ptpsq = priv->channels.port_ptp ?
318 &priv->channels.port_ptp->ptpsq[0] :
319 NULL;
320 if (!generic_ptpsq)
321 goto out;
322
323 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "PTP");
324 if (err)
325 return err;
326
327 err = mlx5e_tx_reporter_diagnose_generic_txqsq(fmsg, &generic_ptpsq->txqsq);
328 if (err)
329 return err;
330
331 err = mlx5e_tx_reporter_diagnose_generic_tx_port_ts(fmsg, generic_ptpsq);
332 if (err)
333 return err;
334
335 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
336 if (err)
337 return err;
338
339out:
340 return mlx5e_health_fmsg_named_obj_nest_end(fmsg);
341}
342
343static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter,
344 struct devlink_fmsg *fmsg,
345 struct netlink_ext_ack *extack)
346{
347 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
348 struct mlx5e_port_ptp *ptp_ch = priv->channels.port_ptp;
349
350 int i, tc, err = 0;
351
352 mutex_lock(&priv->state_lock);
353
354 if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
355 goto unlock;
356
357 err = mlx5e_tx_reporter_diagnose_common_config(reporter, fmsg);
358 if (err)
359 goto unlock;
360
361 err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
362 if (err)
363 goto unlock;
364
365 for (i = 0; i < priv->channels.num; i++) {
366 struct mlx5e_channel *c = priv->channels.c[i];
367
368 for (tc = 0; tc < priv->channels.params.num_tc; tc++) {
369 struct mlx5e_txqsq *sq = &c->sq[tc];
370
371 err = mlx5e_tx_reporter_build_diagnose_output(fmsg, sq, tc);
372 if (err)
373 goto unlock;
374 }
375 }
376
377 if (!ptp_ch)
378 goto close_sqs_nest;
379
380 for (tc = 0; tc < priv->channels.params.num_tc; tc++) {
381 err = mlx5e_tx_reporter_build_diagnose_output_ptpsq(fmsg,
382 &ptp_ch->ptpsq[tc],
383 tc);
384 if (err)
385 goto unlock;
386 }
387
388close_sqs_nest:
389 err = devlink_fmsg_arr_pair_nest_end(fmsg);
390 if (err)
391 goto unlock;
392
393unlock:
394 mutex_unlock(&priv->state_lock);
395 return err;
396}
397
398static int mlx5e_tx_reporter_dump_sq(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg,
399 void *ctx)
400{
401 struct mlx5_rsc_key key = {};
402 struct mlx5e_txqsq *sq = ctx;
403 int err;
404
405 if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
406 return 0;
407
408 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice");
409 if (err)
410 return err;
411
412 key.size = PAGE_SIZE;
413 key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL;
414 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
415 if (err)
416 return err;
417
418 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
419 if (err)
420 return err;
421
422 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ");
423 if (err)
424 return err;
425
426 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "QPC");
427 if (err)
428 return err;
429
430 key.rsc = MLX5_SGMT_TYPE_FULL_QPC;
431 key.index1 = sq->sqn;
432 key.num_of_obj1 = 1;
433
434 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
435 if (err)
436 return err;
437
438 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
439 if (err)
440 return err;
441
442 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "send_buff");
443 if (err)
444 return err;
445
446 key.rsc = MLX5_SGMT_TYPE_SND_BUFF;
447 key.num_of_obj2 = MLX5_RSC_DUMP_ALL;
448 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
449 if (err)
450 return err;
451
452 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
453 if (err)
454 return err;
455
456 return mlx5e_health_fmsg_named_obj_nest_end(fmsg);
457}
458
459static int mlx5e_tx_reporter_dump_all_sqs(struct mlx5e_priv *priv,
460 struct devlink_fmsg *fmsg)
461{
462 struct mlx5e_port_ptp *ptp_ch = priv->channels.port_ptp;
463 struct mlx5_rsc_key key = {};
464 int i, tc, err;
465
466 if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
467 return 0;
468
469 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice");
470 if (err)
471 return err;
472
473 key.size = PAGE_SIZE;
474 key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL;
475 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
476 if (err)
477 return err;
478
479 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
480 if (err)
481 return err;
482
483 err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
484 if (err)
485 return err;
486
487 for (i = 0; i < priv->channels.num; i++) {
488 struct mlx5e_channel *c = priv->channels.c[i];
489
490 for (tc = 0; tc < priv->channels.params.num_tc; tc++) {
491 struct mlx5e_txqsq *sq = &c->sq[tc];
492
493 err = mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "SQ");
494 if (err)
495 return err;
496 }
497 }
498
499 if (ptp_ch) {
500 for (tc = 0; tc < priv->channels.params.num_tc; tc++) {
501 struct mlx5e_txqsq *sq = &ptp_ch->ptpsq[tc].txqsq;
502
503 err = mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "PTP SQ");
504 if (err)
505 return err;
506 }
507 }
508
509 return devlink_fmsg_arr_pair_nest_end(fmsg);
510}
511
512static int mlx5e_tx_reporter_dump_from_ctx(struct mlx5e_priv *priv,
513 struct mlx5e_err_ctx *err_ctx,
514 struct devlink_fmsg *fmsg)
515{
516 return err_ctx->dump(priv, fmsg, err_ctx->ctx);
517}
518
/*
 * devlink health .dump callback of the TX reporter.
 *
 * With an error context, the context's own dump callback is run. Without
 * one, all SQs are dumped.
 */
static int mlx5e_tx_reporter_dump(struct devlink_health_reporter *reporter,
				  struct devlink_fmsg *fmsg, void *context,
				  struct netlink_ext_ack *extack)
{
	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
	struct mlx5e_err_ctx *err_ctx = context;

	if (!err_ctx)
		return mlx5e_tx_reporter_dump_all_sqs(priv, fmsg);

	return mlx5e_tx_reporter_dump_from_ctx(priv, err_ctx, fmsg);
}
529
530void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq)
531{
532 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
533 struct mlx5e_priv *priv = sq->priv;
534 struct mlx5e_err_ctx err_ctx = {};
535
536 err_ctx.ctx = sq;
537 err_ctx.recover = mlx5e_tx_reporter_err_cqe_recover;
538 err_ctx.dump = mlx5e_tx_reporter_dump_sq;
539 snprintf(err_str, sizeof(err_str), "ERR CQE on SQ: 0x%x", sq->sqn);
540
541 mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx);
542}
543
544int mlx5e_reporter_tx_timeout(struct mlx5e_txqsq *sq)
545{
546 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
547 struct mlx5e_tx_timeout_ctx to_ctx = {};
548 struct mlx5e_priv *priv = sq->priv;
549 struct mlx5e_err_ctx err_ctx = {};
550
551 to_ctx.sq = sq;
552 err_ctx.ctx = &to_ctx;
553 err_ctx.recover = mlx5e_tx_reporter_timeout_recover;
554 err_ctx.dump = mlx5e_tx_reporter_dump_sq;
555 snprintf(err_str, sizeof(err_str),
556 "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u",
557 sq->ch_ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc,
558 jiffies_to_usecs(jiffies - sq->txq->trans_start));
559
560 mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx);
561 return to_ctx.status;
562}
563
/* devlink health callbacks of the "tx" reporter. */
static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
		.name = "tx",
		.recover = mlx5e_tx_reporter_recover,
		.diagnose = mlx5e_tx_reporter_diagnose,
		.dump = mlx5e_tx_reporter_dump,
};
570
/*
 * Graceful period passed to devlink_port_health_reporter_create() for the
 * TX reporter. NOTE(review): presumably in milliseconds - confirm against
 * the devlink health API.
 */
#define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500
572
573void mlx5e_reporter_tx_create(struct mlx5e_priv *priv)
574{
575 struct devlink_health_reporter *reporter;
576
577 reporter = devlink_port_health_reporter_create(&priv->dl_port, &mlx5_tx_reporter_ops,
578 MLX5_REPORTER_TX_GRACEFUL_PERIOD, priv);
579 if (IS_ERR(reporter)) {
580 netdev_warn(priv->netdev,
581 "Failed to create tx reporter, err = %ld\n",
582 PTR_ERR(reporter));
583 return;
584 }
585 priv->tx_reporter = reporter;
586}
587
588void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv)
589{
590 if (!priv->tx_reporter)
591 return;
592
593 devlink_port_health_reporter_destroy(priv->tx_reporter);
594}
595