linux/include/ras/ras_event.h
<<
>>
Prefs
   1#undef TRACE_SYSTEM
   2#define TRACE_SYSTEM ras
   3#define TRACE_INCLUDE_FILE ras_event
   4
   5#if !defined(_TRACE_HW_EVENT_MC_H) || defined(TRACE_HEADER_MULTI_READ)
   6#define _TRACE_HW_EVENT_MC_H
   7
   8#include <linux/tracepoint.h>
   9#include <linux/edac.h>
  10#include <linux/ktime.h>
  11#include <linux/pci.h>
  12#include <linux/aer.h>
  13#include <linux/cper.h>
  14#include <linux/mm.h>
  15
  16/*
  17 * MCE Extended Error Log trace event
  18 *
  19 * These events are generated when hardware detects a corrected or
  20 * uncorrected event.
     *
     * extlog_mem_event is only declared when CONFIG_ACPI_EXTLOG is enabled,
     * either built in or as a module.
     *
     * Arguments:
     *   mem      - CPER memory error section the record is built from
     *   err_seq  - sequence number, printed inside the leading "{}"
     *   fru_id   - FRU identifier; the pointed-to value is copied into the record
     *   fru_text - FRU description string; at most 20 characters are printed
     *   sev      - severity, rendered through cper_severity_str()
     *
     * Optional values in @mem are only trusted when the matching bit in
     * mem->validation_bits is set; otherwise the field is stored as all-ones
     * so that "not reported" can be told apart from a real value.
  21 */
  22
  23/* memory trace event */
  24
  25#if defined(CONFIG_ACPI_EXTLOG) || defined(CONFIG_ACPI_EXTLOG_MODULE)
  26TRACE_EVENT(extlog_mem_event,
  27        TP_PROTO(struct cper_sec_mem_err *mem,
  28                 u32 err_seq,
  29                 const uuid_le *fru_id,
  30                 const char *fru_text,
  31                 u8 sev),
  32
  33        TP_ARGS(mem, err_seq, fru_id, fru_text, sev),
  34
  35        TP_STRUCT__entry(
  36                __field(u32, err_seq)
  37                __field(u8, etype)
  38                __field(u8, sev)
  39                __field(u64, pa)
  40                __field(u8, pa_mask_lsb)
  41                __field_struct(uuid_le, fru_id)
  42                __string(fru_text, fru_text)
  43                __field_struct(struct cper_mem_err_compact, data)
  44        ),
  45
  46        TP_fast_assign(
  47                __entry->err_seq = err_seq;
                    /* Error type: all-ones (0xff in the u8 field) when not valid. */
  48                if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE)
  49                        __entry->etype = mem->error_type;
  50                else
  51                        __entry->etype = ~0;
  52                __entry->sev = sev;
                    /* Physical address: all-ones when not valid. */
  53                if (mem->validation_bits & CPER_MEM_VALID_PA)
  54                        __entry->pa = mem->physical_addr;
  55                else
  56                        __entry->pa = ~0ull;
  57
                    /*
                     * Only the position of the lowest set bit of the address mask
                     * is kept, truncated to u8.
                     * NOTE(review): __ffs64() is called on the mask as-is; confirm
                     * that a mask flagged valid can never be zero.
                     */
  58                if (mem->validation_bits & CPER_MEM_VALID_PA_MASK)
  59                        __entry->pa_mask_lsb = (u8)__ffs64(mem->physical_addr_mask);
  60                else
  61                        __entry->pa_mask_lsb = ~0;
  62                __entry->fru_id = *fru_id;
  63                __assign_str(fru_text, fru_text);
                    /* The remaining details of @mem are stored in compact form. */
  64                cper_mem_err_pack(mem, &__entry->data);
  65        ),
  66
            /*
             * `p` is not declared in this file; presumably it is the scratch
             * buffer made available by the TP_printk expansion - TODO confirm.
             */
  67        TP_printk("{%d} %s error: %s physical addr: %016llx (mask lsb: %x) %sFRU: %pUl %.20s",
  68                  __entry->err_seq,
  69                  cper_severity_str(__entry->sev),
  70                  cper_mem_err_type_str(__entry->etype),
  71                  __entry->pa,
  72                  __entry->pa_mask_lsb,
  73                  cper_mem_err_unpack(p, &__entry->data),
  74                  &__entry->fru_id,
  75                  __get_str(fru_text))
  76);
  77#endif
  78
  79/*
  80 * Hardware Events Report
  81 *
  82 * These events are generated when hardware detects a corrected or
  83 * uncorrected event, and are meant to replace the current API to report
  84 * errors defined on both EDAC and MCE subsystems.
  85 *
  86 * FIXME: Add events for handling memory errors originated from the
  87 *        MCE subsystem.
  88 */
  89
  90/*
  91 * Hardware-independent Memory Controller specific events
  92 */
  93
  94/*
  95 * Default error mechanisms for Memory Controller errors (CE and UE)
     *
     * mc_event records one memory controller error report.
     *
     * Arguments:
     *   err_type      - stored as error_type and rendered through
     *                   mc_event_error_type(), which is defined outside this file
     *   error_msg     - message text, stored as the "msg" string
     *   label         - label text, printed after "on"
     *   error_count   - int argument stored in a u16 field; the message reads
     *                   "errors" instead of "error" when the stored count is > 1
     *   mc_index      - printed as "mc:"
     *   top_layer, mid_layer, low_layer
     *                 - stored as top_layer, middle_layer and lower_layer and
     *                   printed as "location:top:middle:lower"
     *   address       - unsigned long argument stored in a long field
     *   grain_bits    - printed as the value 1 << grain_bits
     *   syndrome      - unsigned long argument stored in a long field
     *   driver_detail - extra text appended at the end of the message
  96 */
  97TRACE_EVENT(mc_event,
  98
  99        TP_PROTO(const unsigned int err_type,
 100                 const char *error_msg,
 101                 const char *label,
 102                 const int error_count,
 103                 const u8 mc_index,
 104                 const s8 top_layer,
 105                 const s8 mid_layer,
 106                 const s8 low_layer,
 107                 unsigned long address,
 108                 const u8 grain_bits,
 109                 unsigned long syndrome,
 110                 const char *driver_detail),
 111
 112        TP_ARGS(err_type, error_msg, label, error_count, mc_index,
 113                top_layer, mid_layer, low_layer, address, grain_bits,
 114                syndrome, driver_detail),
 115
 116        TP_STRUCT__entry(
 117                __field(        unsigned int,   error_type              )
 118                __string(       msg,            error_msg               )
 119                __string(       label,          label                   )
 120                __field(        u16,            error_count             )
 121                __field(        u8,             mc_index                )
 122                __field(        s8,             top_layer               )
 123                __field(        s8,             middle_layer            )
 124                __field(        s8,             lower_layer             )
 125                __field(        long,           address                 )
 126                __field(        u8,             grain_bits              )
 127                __field(        long,           syndrome                )
 128                __string(       driver_detail,  driver_detail           )
 129        ),
 130
 131        TP_fast_assign(
 132                __entry->error_type             = err_type;
 133                __assign_str(msg, error_msg);
 134                __assign_str(label, label);
 135                __entry->error_count            = error_count;
 136                __entry->mc_index               = mc_index;
 137                __entry->top_layer              = top_layer;
 138                __entry->middle_layer           = mid_layer;
 139                __entry->lower_layer            = low_layer;
 140                __entry->address                = address;
 141                __entry->grain_bits             = grain_bits;
 142                __entry->syndrome               = syndrome;
 143                __assign_str(driver_detail, driver_detail);
 144        ),
 145
            /*
             * A separating space is emitted before msg and before driver_detail
             * only when the respective string is non-empty.
             * NOTE(review): "1 << grain_bits" is evaluated in int and printed
             * with %d, so it cannot represent grain_bits >= 31 - confirm the
             * range callers pass.
             */
 146        TP_printk("%d %s error%s:%s%s on %s (mc:%d location:%d:%d:%d address:0x%08lx grain:%d syndrome:0x%08lx%s%s)",
 147                  __entry->error_count,
 148                  mc_event_error_type(__entry->error_type),
 149                  __entry->error_count > 1 ? "s" : "",
 150                  ((char *)__get_str(msg))[0] ? " " : "",
 151                  __get_str(msg),
 152                  __get_str(label),
 153                  __entry->mc_index,
 154                  __entry->top_layer,
 155                  __entry->middle_layer,
 156                  __entry->lower_layer,
 157                  __entry->address,
 158                  1 << __entry->grain_bits,
 159                  __entry->syndrome,
 160                  ((char *)__get_str(driver_detail))[0] ? " " : "",
 161                  __get_str(driver_detail))
 162);
 163
 164/*
 165 * PCIe AER Trace event
 166 *
 167 * These events are generated when hardware detects a corrected or
 168 * uncorrected event on a PCIe device. The event report has
 169 * the following structure:
 170 *
 171 * char * dev_name -    The name of the slot where the device resides
 172 *                      ([domain:]bus:device.function).
 173 * u32 status -         Either the correctable or uncorrectable register
 174 *                      indicating what error or errors have been seen
 175 * u8 severity -        error severity 0:NONFATAL 1:FATAL 2:CORRECTED
     *
     * The two tables below pair each status bit with the text printed for it:
     * aer_correctable_errors is used when severity is AER_CORRECTABLE and
     * aer_uncorrectable_errors for every other severity. Both are expanded
     * as the flag list of __print_flags(), so neither ends with a comma.
     *
     * NOTE(review): "RELAY_NUM Rollover" looks like a misspelling of the PCIe
     * term "REPLAY_NUM Rollover". The string is emitted at runtime, so it is
     * left untouched here; confirm against the PCIe specification and any
     * userspace parsers before changing it.
 176 */
 177
 178#define aer_correctable_errors                                  \
 179        {PCI_ERR_COR_RCVR,      "Receiver Error"},              \
 180        {PCI_ERR_COR_BAD_TLP,   "Bad TLP"},                     \
 181        {PCI_ERR_COR_BAD_DLLP,  "Bad DLLP"},                    \
 182        {PCI_ERR_COR_REP_ROLL,  "RELAY_NUM Rollover"},          \
 183        {PCI_ERR_COR_REP_TIMER, "Replay Timer Timeout"},        \
 184        {PCI_ERR_COR_ADV_NFAT,  "Advisory Non-Fatal Error"},    \
 185        {PCI_ERR_COR_INTERNAL,  "Corrected Internal Error"},    \
 186        {PCI_ERR_COR_LOG_OVER,  "Header Log Overflow"}
 187
 188#define aer_uncorrectable_errors                                \
 189        {PCI_ERR_UNC_UND,       "Undefined"},                   \
 190        {PCI_ERR_UNC_DLP,       "Data Link Protocol Error"},    \
 191        {PCI_ERR_UNC_SURPDN,    "Surprise Down Error"},         \
 192        {PCI_ERR_UNC_POISON_TLP,"Poisoned TLP"},                \
 193        {PCI_ERR_UNC_FCP,       "Flow Control Protocol Error"}, \
 194        {PCI_ERR_UNC_COMP_TIME, "Completion Timeout"},          \
 195        {PCI_ERR_UNC_COMP_ABORT,"Completer Abort"},             \
 196        {PCI_ERR_UNC_UNX_COMP,  "Unexpected Completion"},       \
 197        {PCI_ERR_UNC_RX_OVER,   "Receiver Overflow"},           \
 198        {PCI_ERR_UNC_MALF_TLP,  "Malformed TLP"},               \
 199        {PCI_ERR_UNC_ECRC,      "ECRC Error"},                  \
 200        {PCI_ERR_UNC_UNSUP,     "Unsupported Request Error"},   \
 201        {PCI_ERR_UNC_ACSV,      "ACS Violation"},               \
 202        {PCI_ERR_UNC_INTN,      "Uncorrectable Internal Error"},\
 203        {PCI_ERR_UNC_MCBTLP,    "MC Blocked TLP"},              \
 204        {PCI_ERR_UNC_ATOMEG,    "AtomicOp Egress Blocked"},     \
 205        {PCI_ERR_UNC_TLPPRE,    "TLP Prefix Blocked Error"}
 206
 207TRACE_EVENT(aer_event,
 208        TP_PROTO(const char *dev_name,
 209                 const u32 status,
 210                 const u8 severity),
 211
 212        TP_ARGS(dev_name, status, severity),
 213
 214        TP_STRUCT__entry(
 215                __string(       dev_name,       dev_name        )
 216                __field(        u32,            status          )
 217                __field(        u8,             severity        )
 218        ),
 219
 220        TP_fast_assign(
 221                __assign_str(dev_name, dev_name);
 222                __entry->status         = status;
 223                __entry->severity       = severity;
 224        ),
 225
            /*
             * Severity text: "Corrected" for AER_CORRECTABLE, "Fatal" for
             * AER_FATAL and "Uncorrected, non-fatal" for any other value.
             * The status bits are decoded with the table matching the severity
             * and joined with "|". The format string itself ends in a newline.
             */
 226        TP_printk("%s PCIe Bus Error: severity=%s, %s\n",
 227                __get_str(dev_name),
 228                __entry->severity == AER_CORRECTABLE ? "Corrected" :
 229                        __entry->severity == AER_FATAL ?
 230                        "Fatal" : "Uncorrected, non-fatal",
 231                __entry->severity == AER_CORRECTABLE ?
 232                __print_flags(__entry->status, "|", aer_correctable_errors) :
 233                __print_flags(__entry->status, "|", aer_uncorrectable_errors))
 234);
 235
 236/*
 237 * memory-failure recovery action result event
 238 *
 239 * unsigned long pfn -  Page Frame Number of the corrupted page
 240 * int type     -       Page types of the corrupted page
 241 * int result   -       Result of recovery action
     *
     * The event and its helper tables are only declared when
     * CONFIG_MEMORY_FAILURE is set.
     *
     * MF_ACTION_RESULT and MF_PAGE_TYPE list (enum value, text) pairs through
     * the EM() and EMe() helper macros, with EMe() marking the last entry.
     * Each list is expanded twice below with different definitions of
     * EM()/EMe(): once to emit TRACE_DEFINE_ENUM() for every value, and once
     * as the symbol table handed to __print_symbolic().
 242 */
 243
 244#ifdef CONFIG_MEMORY_FAILURE
 245#define MF_ACTION_RESULT        \
 246        EM ( MF_IGNORED, "Ignored" )    \
 247        EM ( MF_FAILED,  "Failed" )     \
 248        EM ( MF_DELAYED, "Delayed" )    \
 249        EMe ( MF_RECOVERED, "Recovered" )
 250
 251#define MF_PAGE_TYPE            \
 252        EM ( MF_MSG_KERNEL, "reserved kernel page" )                    \
 253        EM ( MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page" )       \
 254        EM ( MF_MSG_SLAB, "kernel slab page" )                          \
 255        EM ( MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking" ) \
 256        EM ( MF_MSG_POISONED_HUGE, "huge page already hardware poisoned" )      \
 257        EM ( MF_MSG_HUGE, "huge page" )                                 \
 258        EM ( MF_MSG_FREE_HUGE, "free huge page" )                       \
 259        EM ( MF_MSG_UNMAP_FAILED, "unmapping failed page" )             \
 260        EM ( MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page" )           \
 261        EM ( MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page" )           \
 262        EM ( MF_MSG_DIRTY_MLOCKED_LRU, "dirty mlocked LRU page" )       \
 263        EM ( MF_MSG_CLEAN_MLOCKED_LRU, "clean mlocked LRU page" )       \
 264        EM ( MF_MSG_DIRTY_UNEVICTABLE_LRU, "dirty unevictable LRU page" )       \
 265        EM ( MF_MSG_CLEAN_UNEVICTABLE_LRU, "clean unevictable LRU page" )       \
 266        EM ( MF_MSG_DIRTY_LRU, "dirty LRU page" )                       \
 267        EM ( MF_MSG_CLEAN_LRU, "clean LRU page" )                       \
 268        EM ( MF_MSG_TRUNCATED_LRU, "already truncated LRU page" )       \
 269        EM ( MF_MSG_BUDDY, "free buddy page" )                          \
 270        EM ( MF_MSG_BUDDY_2ND, "free buddy page (2nd try)" )            \
 271        EMe ( MF_MSG_UNKNOWN, "unknown page" )
 272
 273/*
 274 * First define the enums in MF_ACTION_RESULT and MF_PAGE_TYPE to be exported
 275 * to userspace via TRACE_DEFINE_ENUM().
 276 */
 277#undef EM
 278#undef EMe
 279#define EM(a, b) TRACE_DEFINE_ENUM(a);
 280#define EMe(a, b)       TRACE_DEFINE_ENUM(a);
 281
 282MF_ACTION_RESULT
 283MF_PAGE_TYPE
 284
 285/*
 286 * Now redefine the EM() and EMe() macros to map the enums to the strings
 287 * that will be printed in the output.
     * EMe() omits the trailing comma because it ends the list.
 288 */
 289#undef EM
 290#undef EMe
 291#define EM(a, b)                { a, b },
 292#define EMe(a, b)       { a, b }
 293
 294TRACE_EVENT(memory_failure_event,
 295        TP_PROTO(unsigned long pfn,
 296                 int type,
 297                 int result),
 298
 299        TP_ARGS(pfn, type, result),
 300
 301        TP_STRUCT__entry(
 302                __field(unsigned long, pfn)
 303                __field(int, type)
 304                __field(int, result)
 305        ),
 306
 307        TP_fast_assign(
 308                __entry->pfn    = pfn;
 309                __entry->type   = type;
 310                __entry->result = result;
 311        ),
 312
            /* type and result are printed by name using the two tables above. */
 313        TP_printk("pfn %#lx: recovery action for %s: %s",
 314                __entry->pfn,
 315                __print_symbolic(__entry->type, MF_PAGE_TYPE),
 316                __print_symbolic(__entry->result, MF_ACTION_RESULT)
 317        )
 318);
 319#endif /* CONFIG_MEMORY_FAILURE */
 320#endif /* _TRACE_HW_EVENT_MC_H */
 321
 322/* This part must be outside protection */
 323#include <trace/define_trace.h>
 324