krishnanadh / rpms / rasdaemon

Forked from rpms/rasdaemon a year ago
Clone
Michel Alexandre Salim 09c31d
commit 2a194a220d4a377f2b25e308a01ffe9675b93c31
Michel Alexandre Salim 09c31d
Author: Krishna Dhulipala <krishnad@meta.com>
Michel Alexandre Salim 09c31d
Date:   Tue Jun 27 12:09:45 2023 -0700
Michel Alexandre Salim 09c31d
Michel Alexandre Salim 09c31d
    Unified SEL logging of AER events
Michel Alexandre Salim 09c31d
Michel Alexandre Salim 09c31d
diff --git a/Makefile.am b/Makefile.am
Michel Alexandre Salim 09c31d
index fabca78..1ea3356 100644
Michel Alexandre Salim 09c31d
--- a/Makefile.am
Michel Alexandre Salim 09c31d
+++ b/Makefile.am
Michel Alexandre Salim 09c31d
@@ -63,13 +63,17 @@ endif
Michel Alexandre Salim 09c31d
 if WITH_AMP_NS_DECODE
Michel Alexandre Salim 09c31d
    rasdaemon_SOURCES += non-standard-ampere.c
Michel Alexandre Salim 09c31d
 endif
Michel Alexandre Salim 09c31d
+if WITH_OPENBMC_UNIFIED_SEL
Michel Alexandre Salim 09c31d
+   rasdaemon_SOURCES += unified-sel.c
Michel Alexandre Salim 09c31d
+endif
Michel Alexandre Salim 09c31d
 rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a
Michel Alexandre Salim 09c31d
 
Michel Alexandre Salim 09c31d
 include_HEADERS = config.h  ras-events.h  ras-logger.h  ras-mc-handler.h \
Michel Alexandre Salim 09c31d
 		  ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \
Michel Alexandre Salim 09c31d
 		  ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \
Michel Alexandre Salim 09c31d
 		  ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \
Michel Alexandre Salim 09c31d
-		  non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h
Michel Alexandre Salim 09c31d
+		  non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \
Michel Alexandre Salim 09c31d
+		  unified-sel.h
Michel Alexandre Salim 09c31d
 
Michel Alexandre Salim 09c31d
 # This rule can't be called with more than one Makefile job (like make -j8)
Michel Alexandre Salim 09c31d
 # I can't figure out a way to fix that
Michel Alexandre Salim 09c31d
diff --git a/configure.ac b/configure.ac
Michel Alexandre Salim 09c31d
index f7d1947..4a534b7 100644
Michel Alexandre Salim 09c31d
--- a/configure.ac
Michel Alexandre Salim 09c31d
+++ b/configure.ac
Michel Alexandre Salim 09c31d
@@ -161,6 +161,16 @@ AS_IF([test "x$enable_amp_ns_decode" = "xyes" || test "x$enable_all" == "xyes"],
Michel Alexandre Salim 09c31d
 AM_CONDITIONAL([WITH_AMP_NS_DECODE], [test x$enable_amp_ns_decode = xyes || test x$enable_all == xyes])
Michel Alexandre Salim 09c31d
 AM_COND_IF([WITH_AMP_NS_DECODE], [USE_AMP_NS_DECODE="yes"], [USE_AMP_NS_DECODE="no"])
Michel Alexandre Salim 09c31d
 
Michel Alexandre Salim 09c31d
+AC_ARG_ENABLE([openbmc_unified_sel],
Michel Alexandre Salim 09c31d
+    AS_HELP_STRING([--enable-openbmc-unified-sel], [enable OPENBMC_UNIFIED_SEL events (currently experimental)]))
Michel Alexandre Salim 09c31d
+
Michel Alexandre Salim 09c31d
+AS_IF([test "x$enable_openbmc_unified_sel" = "xyes" || test "x$enable_all" = "xyes"], [
Michel Alexandre Salim 09c31d
+  AC_DEFINE(HAVE_OPENBMC_UNIFIED_SEL,1,"have OpenBMC unified SEL")
Michel Alexandre Salim 09c31d
+  AC_SUBST([WITH_OPENBMC_UNIFIED_SEL])
Michel Alexandre Salim 09c31d
+])
Michel Alexandre Salim 09c31d
+AM_CONDITIONAL([WITH_OPENBMC_UNIFIED_SEL], [test x$enable_openbmc_unified_sel = xyes || test x$enable_all = xyes])
Michel Alexandre Salim 09c31d
+AM_COND_IF([WITH_OPENBMC_UNIFIED_SEL], [USE_OPENBMC_UNIFIED_SEL="yes"], [USE_OPENBMC_UNIFIED_SEL="no"])
Michel Alexandre Salim 09c31d
+
Michel Alexandre Salim 09c31d
 test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc
Michel Alexandre Salim 09c31d
 
Michel Alexandre Salim 09c31d
 CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes"
Michel Alexandre Salim 09c31d
@@ -201,4 +211,5 @@ compile time options summary
Michel Alexandre Salim 09c31d
     Memory Failure      : $USE_MEMORY_FAILURE
Michel Alexandre Salim 09c31d
     Memory CE PFA       : $USE_MEMORY_CE_PFA
Michel Alexandre Salim 09c31d
     AMP RAS errors      : $USE_AMP_NS_DECODE
Michel Alexandre Salim 09c31d
+    OpenBMC unified     : $USE_OPENBMC_UNIFIED_SEL
Michel Alexandre Salim 09c31d
 EOF
Michel Alexandre Salim 09c31d
diff --git a/ras-aer-handler.c b/ras-aer-handler.c
Michel Alexandre Salim 09c31d
index 8ddd439..c03f6ec 100644
Michel Alexandre Salim 09c31d
--- a/ras-aer-handler.c
Michel Alexandre Salim 09c31d
+++ b/ras-aer-handler.c
Michel Alexandre Salim 09c31d
@@ -25,6 +25,7 @@
Michel Alexandre Salim 09c31d
 #include "ras-logger.h"
Michel Alexandre Salim 09c31d
 #include "bitfield.h"
Michel Alexandre Salim 09c31d
 #include "ras-report.h"
Michel Alexandre Salim 09c31d
+#include "unified-sel.h"
Michel Alexandre Salim 09c31d
 
Michel Alexandre Salim 09c31d
 /* bit field meaning for correctable error */
Michel Alexandre Salim 09c31d
 static const char *aer_cor_errors[32] = {
Michel Alexandre Salim 09c31d
@@ -35,12 +36,15 @@ static const char *aer_cor_errors[32] = {
Michel Alexandre Salim 09c31d
 	[8]  = "RELAY_NUM Rollover",
Michel Alexandre Salim 09c31d
 	[12] = "Replay Timer Timeout",
Michel Alexandre Salim 09c31d
 	[13] = "Advisory Non-Fatal",
Michel Alexandre Salim 09c31d
+	[14] = "Corrected Internal",
Michel Alexandre Salim 09c31d
+	[15] = "Header Log Overflow",
Michel Alexandre Salim 09c31d
 };
Michel Alexandre Salim 09c31d
 
Michel Alexandre Salim 09c31d
 /* bit field meaning for uncorrectable error */
Michel Alexandre Salim 09c31d
 static const char *aer_uncor_errors[32] = {
Michel Alexandre Salim 09c31d
 	/* Uncorrectable errors */
Michel Alexandre Salim 09c31d
 	[4]  = "Data Link Protocol",
Michel Alexandre Salim 09c31d
+	[5]  = "Surprise Link Down",
Michel Alexandre Salim 09c31d
 	[12] = "Poisoned TLP",
Michel Alexandre Salim 09c31d
 	[13] = "Flow Control Protocol",
Michel Alexandre Salim 09c31d
 	[14] = "Completion Timeout",
Michel Alexandre Salim 09c31d
@@ -50,6 +54,12 @@ static const char *aer_uncor_errors[32] = {
Michel Alexandre Salim 09c31d
 	[18] = "Malformed TLP",
Michel Alexandre Salim 09c31d
 	[19] = "ECRC",
Michel Alexandre Salim 09c31d
 	[20] = "Unsupported Request",
Michel Alexandre Salim 09c31d
+	[21] = "ACS Violation",
Michel Alexandre Salim 09c31d
+	[22] = "Uncorrected Internal",
Michel Alexandre Salim 09c31d
+	[23] = "MC Blocked TLP",
Michel Alexandre Salim 09c31d
+	[24] = "AtomicOp Egress Blocked",
Michel Alexandre Salim 09c31d
+	[25] = "TLP Prefix Blocked",
Michel Alexandre Salim 09c31d
+	[26] = "Poisoned TLP Egress Blocked",
Michel Alexandre Salim 09c31d
 };
Michel Alexandre Salim 09c31d
 
Michel Alexandre Salim 09c31d
 #define BUF_LEN	1024
Michel Alexandre Salim 09c31d
@@ -151,5 +161,10 @@ int ras_aer_event_handler(struct trace_seq *s,
Michel Alexandre Salim 09c31d
 	ras_report_aer_event(ras, &ev);
Michel Alexandre Salim 09c31d
 #endif
Michel Alexandre Salim 09c31d
 
Michel Alexandre Salim 09c31d
+#ifdef HAVE_OPENBMC_UNIFIED_SEL
Michel Alexandre Salim 09c31d
+  if (openbmc_unified_sel_log(severity_val, ev.dev_name, status_val) < 0)
Michel Alexandre Salim 09c31d
+    return -1;
Michel Alexandre Salim 09c31d
+#endif
Michel Alexandre Salim 09c31d
+
Michel Alexandre Salim 09c31d
 	return 0;
Michel Alexandre Salim 09c31d
 }
Michel Alexandre Salim 09c31d
diff --git a/unified-sel.c b/unified-sel.c
Michel Alexandre Salim 09c31d
new file mode 100644
Michel Alexandre Salim 09c31d
index 0000000..287bb4f
Michel Alexandre Salim 09c31d
--- /dev/null
Michel Alexandre Salim 09c31d
+++ b/unified-sel.c
Michel Alexandre Salim 09c31d
@@ -0,0 +1,114 @@
Michel Alexandre Salim 09c31d
+/*
Michel Alexandre Salim 09c31d
+ * Copyright (c) 2023, Meta Platforms Inc.
Michel Alexandre Salim 09c31d
+ *
Michel Alexandre Salim 09c31d
+ * This program is free software; you can redistribute it and/or modify
Michel Alexandre Salim 09c31d
+ * it under the terms of the GNU General Public License as published by
Michel Alexandre Salim 09c31d
+ * the Free Software Foundation; either version 2 of the License, or
Michel Alexandre Salim 09c31d
+ * (at your option) any later version.
Michel Alexandre Salim 09c31d
+ *
Michel Alexandre Salim 09c31d
+ */
Michel Alexandre Salim 09c31d
+
Michel Alexandre Salim 09c31d
+#include <stdio.h>
Michel Alexandre Salim 09c31d
+#include <stdlib.h>
Michel Alexandre Salim 09c31d
+#include <string.h>
Michel Alexandre Salim 09c31d
+#include <stdbool.h>
Michel Alexandre Salim 09c31d
+#include "ras-record.h"
Michel Alexandre Salim 09c31d
+#include "ras-logger.h"
Michel Alexandre Salim 09c31d
+#include "ras-report.h"
Michel Alexandre Salim 09c31d
+#include "unified-sel.h"
Michel Alexandre Salim 09c31d
+
Michel Alexandre Salim 09c31d
+/* CPU Root Port Error ID corresponding to each status bit set */
Michel Alexandre Salim 09c31d
+static const char *cor_error_ids[32] = {
Michel Alexandre Salim 09c31d
+  /* Correctable errors */
Michel Alexandre Salim 09c31d
+  [0]  = "0x00", /* Receiver Error */
Michel Alexandre Salim 09c31d
+  [6]  = "0x01", /* Bad TLP */
Michel Alexandre Salim 09c31d
+  [7]  = "0x02", /* Bad DLLP */
Michel Alexandre Salim 09c31d
+  [8]  = "0x04", /* RELAY_NUM Rollover */
Michel Alexandre Salim 09c31d
+  [12] = "0x03", /* Replay Timer Timeout */
Michel Alexandre Salim 09c31d
+  [13] = "0x05", /* Advisory Non-Fatal */
Michel Alexandre Salim 09c31d
+  [14] = "0x06", /* Corrected Internal */
Michel Alexandre Salim 09c31d
+  [15] = "0x07", /* Header Log Overflow */
Michel Alexandre Salim 09c31d
+};
Michel Alexandre Salim 09c31d
+
Michel Alexandre Salim 09c31d
+static const char *uncor_error_ids[32] = {
Michel Alexandre Salim 09c31d
+  /* Uncorrectable errors */
Michel Alexandre Salim 09c31d
+  [4]  = "0x20", /* Data Link Protocol */
Michel Alexandre Salim 09c31d
+  [5]  = "0x21", /* Surprise Link Down */
Michel Alexandre Salim 09c31d
+  [12] = "0x22", /* Poisoned TLP */
Michel Alexandre Salim 09c31d
+  [13] = "0x23", /* Flow Control Protocol */
Michel Alexandre Salim 09c31d
+  [14] = "0x24", /* Completion Timeout */
Michel Alexandre Salim 09c31d
+  [15] = "0x25", /* Completer Abort */
Michel Alexandre Salim 09c31d
+  [16] = "0x26", /* Unexpected Completion */
Michel Alexandre Salim 09c31d
+  [17] = "0x27", /* Receiver Overflow */
Michel Alexandre Salim 09c31d
+  [18] = "0x29", /* Malformed TLP */
Michel Alexandre Salim 09c31d
+  [19] = "0x29", /* ECRC */
Michel Alexandre Salim 09c31d
+  [20] = "0x2A", /* Unsupported Request */
Michel Alexandre Salim 09c31d
+  [21] = "0x2B", /* ACS Violation */
Michel Alexandre Salim 09c31d
+  [22] = "0x2C", /* Uncorrected Internal */
Michel Alexandre Salim 09c31d
+  [23] = "0x2D", /* MC Blocked TLP */
Michel Alexandre Salim 09c31d
+  [24] = "0x2E", /* AtomicOp Egress Blocked */
Michel Alexandre Salim 09c31d
+  [25] = "0x2F", /* TLP Prefix Blocked */
Michel Alexandre Salim 09c31d
+  [26] = "0x30", /* Poisoned TLP Egress Blocked */
Michel Alexandre Salim 09c31d
+};
Michel Alexandre Salim 09c31d
+
Michel Alexandre Salim 09c31d
+static int verify_id_log_sel(uint64_t status,
Michel Alexandre Salim 09c31d
+    const char **idarray,
Michel Alexandre Salim 09c31d
+    unsigned bus,
Michel Alexandre Salim 09c31d
+    unsigned dev_fn)
Michel Alexandre Salim 09c31d
+{
Michel Alexandre Salim 09c31d
+  int i;
Michel Alexandre Salim 09c31d
+  char openbmc_ipmi_add_sel[105];
Michel Alexandre Salim 09c31d
+
Michel Alexandre Salim 09c31d
+  /*
Michel Alexandre Salim 09c31d
+   * Get PCIe AER error source bus/dev/fn and save it to the BMC SEL
Michel Alexandre Salim 09c31d
+   * as an OpenBMC unified SEL record type.
Michel Alexandre Salim 09c31d
+   * The IPMI command and record fields are defined in IPMI Specification v2.0 (IPMI Spec)
Michel Alexandre Salim 09c31d
+   * ipmitool raw 0x0a 0x44 is "Add SEL Entry Command" defined in IPMI spec chapter 31.6
Michel Alexandre Salim 09c31d
+   * The 16 bytes that follow form the SEL Record
Michel Alexandre Salim 09c31d
+   * defined in IPMI spec chapter 32.1 "SEL Event Records"
Michel Alexandre Salim 09c31d
+   * Byte 1~2 are Record ID = 0x00 0x00, unused
Michel Alexandre Salim 09c31d
+   * Byte 3 is Record Type = 0xFB, OEM non-timestamped record type for OpenBMC unified SEL
Michel Alexandre Salim 09c31d
+   * Byte 4~16 are OEM defined
Michel Alexandre Salim 09c31d
+   * Byte 11:
Michel Alexandre Salim 09c31d
+     * Byte11[7:3] Device#
Michel Alexandre Salim 09c31d
+     * Byte11[2:0] Function#
Michel Alexandre Salim 09c31d
+   * Byte 12: Bus number
Michel Alexandre Salim 09c31d
+   * Byte 13-15: Reserved
Michel Alexandre Salim 09c31d
+   * Byte 16: ID of the error detected on the PCIe device that triggered this SEL record
Michel Alexandre Salim 09c31d
+   */
Michel Alexandre Salim 09c31d
+
Michel Alexandre Salim 09c31d
+  /* Potentially all error status bits could be set for a given PCIe device.
Michel Alexandre Salim 09c31d
+   * Therefore, iterate over all 32 bits each of cor and uncor errors
Michel Alexandre Salim 09c31d
+   */
Michel Alexandre Salim 09c31d
+  for (i = 0; i < 32; i++) {
Michel Alexandre Salim 09c31d
+    if ((status & (1 << i)) && idarray[i]) {
Michel Alexandre Salim 09c31d
+      sprintf(openbmc_ipmi_add_sel,
Michel Alexandre Salim 09c31d
+          "ipmitool raw 0x0a 0x44 0x00 0x00 0xFB 0x20 0x00 0x00 0x00 0x00 0x01 0x00 0x%02x 0x%02x 0x01 0x00 0xff %s",
Michel Alexandre Salim 09c31d
+          dev_fn, bus, idarray[i]);
Michel Alexandre Salim 09c31d
+      if (system(openbmc_ipmi_add_sel) != 0)
Michel Alexandre Salim 09c31d
+        return -1;
Michel Alexandre Salim 09c31d
+    }
Michel Alexandre Salim 09c31d
+  }
Michel Alexandre Salim 09c31d
+  return 0;
Michel Alexandre Salim 09c31d
+}
Michel Alexandre Salim 09c31d
+
Michel Alexandre Salim 09c31d
+int openbmc_unified_sel_log(uint64_t severity, const char *dev_name, uint64_t status)
Michel Alexandre Salim 09c31d
+{
Michel Alexandre Salim 09c31d
+  int bus, dev, dev_fn, fn;
Michel Alexandre Salim 09c31d
+
Michel Alexandre Salim 09c31d
+  sscanf(dev_name, "%*x:%x:%x.%x", &bus, &dev, &fn);
Michel Alexandre Salim 09c31d
+  dev_fn = (((dev & 0x1f) << 3) | (fn & 0x7));
Michel Alexandre Salim 09c31d
+
Michel Alexandre Salim 09c31d
+  /* Use the appropriate correctable or uncorrectable error status ID
Michel Alexandre Salim 09c31d
+   * for a given severity level
Michel Alexandre Salim 09c31d
+   */
Michel Alexandre Salim 09c31d
+  if (severity == HW_EVENT_AER_CORRECTED) {
Michel Alexandre Salim 09c31d
+      if (verify_id_log_sel(status, cor_error_ids, bus, dev_fn) < 0)
Michel Alexandre Salim 09c31d
+        return -1;
Michel Alexandre Salim 09c31d
+  }
Michel Alexandre Salim 09c31d
+  else {
Michel Alexandre Salim 09c31d
+      if (verify_id_log_sel(status, uncor_error_ids, bus, dev_fn) < 0)
Michel Alexandre Salim 09c31d
+        return -1;
Michel Alexandre Salim 09c31d
+  }
Michel Alexandre Salim 09c31d
+  return 0;
Michel Alexandre Salim 09c31d
+}
Michel Alexandre Salim 09c31d
diff --git a/unified-sel.h b/unified-sel.h
Michel Alexandre Salim 09c31d
new file mode 100644
Michel Alexandre Salim 09c31d
index 0000000..17458a5
Michel Alexandre Salim 09c31d
--- /dev/null
Michel Alexandre Salim 09c31d
+++ b/unified-sel.h
Michel Alexandre Salim 09c31d
@@ -0,0 +1,17 @@
Michel Alexandre Salim 09c31d
+/*
Michel Alexandre Salim 09c31d
+ * Copyright (c) 2023, Meta Platforms Inc.
Michel Alexandre Salim 09c31d
+ *
Michel Alexandre Salim 09c31d
+ * This program is free software; you can redistribute it and/or modify
Michel Alexandre Salim 09c31d
+ * it under the terms of the GNU General Public License as published by
Michel Alexandre Salim 09c31d
+ * the Free Software Foundation; either version 2 of the License, or
Michel Alexandre Salim 09c31d
+ * (at your option) any later version.
Michel Alexandre Salim 09c31d
+ *
Michel Alexandre Salim 09c31d
+ */
Michel Alexandre Salim 09c31d
+
Michel Alexandre Salim 09c31d
+
Michel Alexandre Salim 09c31d
+#ifndef _UNIFIED_SEL_H
Michel Alexandre Salim 09c31d
+#define _UNIFIED_SEL_H
Michel Alexandre Salim 09c31d
+
Michel Alexandre Salim 09c31d
+int openbmc_unified_sel_log(uint64_t severity, const char *dev_name, uint64_t status);
Michel Alexandre Salim 09c31d
+
Michel Alexandre Salim 09c31d
+#endif