diff --git a/backport-0001-ras-page-isolation-do_page_offline-always-considers-.patch b/backport-0001-ras-page-isolation-do_page_offline-always-considers-.patch deleted file mode 100644 index 63ebacd..0000000 --- a/backport-0001-ras-page-isolation-do_page_offline-always-considers-.patch +++ /dev/null @@ -1,104 +0,0 @@ -From e4d27840e173491ab29c2d97017da9344e2c2526 Mon Sep 17 00:00:00 2001 -From: lvying -Date: Sat, 31 Oct 2020 17:57:14 +0800 -Subject: [PATCH 1/2] ras-page-isolation: do_page_offline always considers page - offline was successful - -do_page_offline always consider page offline was successful even if -kernel soft/hard offline page failed. - -Calling rasdaemon with: - - /etc/sysconfig/rasdaemon PAGE_CE_THRESHOLD="1" - -i.e when a page's address occurs Corrected Error, rasdaemon should -trigger this page soft offline. - -However, after adding a livepatch into kernel's -store_soft_offline_page to observe this function's return value, -when injecting a CE into address 0x3f7ec30000, the Kernel -lot reports: - - soft_offline: 0x3f7ec30: unknown non LRU page type ffffe0000000000 () - [store_soft_offline_page]return from soft_offline_page: -5 - -While rasdaemon log reports: - - rasdaemon[73711]: cpu 00:rasdaemon: Corrected Errors at 0x3f7ec30000 exceed threshold - rasdaemon[73711]: rasdaemon: Result of offlining page at 0x3f7ec30000: offlined - -using strace to record rasdaemon's system call, it reports: - - strace -p 73711 - openat(AT_FDCWD, "/sys/devices/system/memory/soft_offline_page", - O_WRONLY|O_CREAT|O_TRUNC, 0666) = 28 - fstat(28, {st_mode=S_IFREG|0200, st_size=4096, ...}) = 0 - write(28, "0x3f7ec30000", 12) = -1 EIO (Input/output error) - close(28) = 0 - -So, kernel actually soft offline pfn 0x3f7ec30 failed and -store_soft_offline_page returned -EIO. However, rasdaemon always -considers the page offline to be successful. - -According to strace display, ferror was unable of detecting the -failure of the write syscall. - -This patch changes fopen-fprintf-ferror-fclose process to use -the lower I/O level, by using instead open-write-close, which -can detect such syscall failure. - -Signed-off-by: lvying -Signed-off-by: Mauro Carvalho Chehab ---- - ras-page-isolation.c | 25 ++++++++++++++++--------- - 1 file changed, 16 insertions(+), 9 deletions(-) - -diff --git a/ras-page-isolation.c b/ras-page-isolation.c -index 50e4406..dc07545 100644 ---- a/ras-page-isolation.c -+++ b/ras-page-isolation.c -@@ -17,6 +17,9 @@ - #include - #include - #include -+#include -+#include -+#include - #include "ras-logger.h" - #include "ras-page-isolation.h" - -@@ -210,18 +213,22 @@ void ras_page_account_init(void) - - static int do_page_offline(unsigned long long addr, enum otype type) - { -- FILE *offline_file; -- int err; -+ int fd, rc; -+ char buf[20]; - -- offline_file = fopen(kernel_offline[type], "w"); -- if (!offline_file) -+ fd = open(kernel_offline[type], O_WRONLY); -+ if (fd == -1) { -+ log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, kernel_offline[type]); - return -1; -+ } - -- fprintf(offline_file, "%#llx", addr); -- err = ferror(offline_file) ? -1 : 0; -- fclose(offline_file); -- -- return err; -+ sprintf(buf, "%#llx", addr); -+ rc = write(fd, buf, strlen(buf)); -+ if (rc < 0) { -+ log(TERM, LOG_ERR, "page offline addr(%s) by %s failed, errno:%d\n", buf, kernel_offline[type], errno); -+ } -+ close(fd); -+ return rc; - } - - static void page_offline(struct page_record *pr) --- -2.18.4 - diff --git a/backport-0001-rasdaemon-delete-the-duplicate-code-about-the-defini.patch b/backport-0001-rasdaemon-delete-the-duplicate-code-about-the-defini.patch deleted file mode 100644 index b6aba57..0000000 --- a/backport-0001-rasdaemon-delete-the-duplicate-code-about-the-defini.patch +++ /dev/null @@ -1,63 +0,0 @@ -From b98880e2cf5fd15e4261676760b719963b956a0e Mon Sep 17 00:00:00 2001 -From: Xiaofei Tan -Date: Mon, 27 Jul 2020 15:38:37 +0800 -Subject: [PATCH 1/3] rasdaemon: delete the duplicate code about the definition - of hip08 DB fields - -Delete the duplicate code about the definition of DB fields for hip08 OEM -event format1 and format2. Because the two OEM event format is the same. - -Signed-off-By: Xiaofei Tan -Signed-off-by: Mauro Carvalho Chehab ---- - non-standard-hisi_hip08.c | 23 +++++------------------ - 1 file changed, 5 insertions(+), 18 deletions(-) - -diff --git a/non-standard-hisi_hip08.c b/non-standard-hisi_hip08.c -index 8bf10c1..7fc6939 100644 ---- a/non-standard-hisi_hip08.c -+++ b/non-standard-hisi_hip08.c -@@ -504,7 +504,7 @@ static char *pcie_local_sub_module_name(uint8_t id) - } - - #ifdef HAVE_SQLITE3 --static const struct db_fields hip08_oem_type1_event_fields[] = { -+static const struct db_fields hip08_oem_event_fields[] = { - { .name = "id", .type = "INTEGER PRIMARY KEY" }, - { .name = "timestamp", .type = "TEXT" }, - { .name = "version", .type = "INTEGER" }, -@@ -519,27 +519,14 @@ static const struct db_fields hip08_oem_type1_event_fields[] = { - - static const struct db_table_descriptor hip08_oem_type1_event_tab = { - .name = "hip08_oem_type1_event_v2", -- .fields = hip08_oem_type1_event_fields, -- .num_fields = ARRAY_SIZE(hip08_oem_type1_event_fields), --}; -- --static const struct db_fields hip08_oem_type2_event_fields[] = { -- { .name = "id", .type = "INTEGER PRIMARY KEY" }, -- { .name = "timestamp", .type = "TEXT" }, -- { .name = "version", .type = "INTEGER" }, -- { .name = "soc_id", .type = "INTEGER" }, -- { .name = "socket_id", .type = "INTEGER" }, -- { .name = "nimbus_id", .type = "INTEGER" }, -- { .name = "module_id", .type = "TEXT" }, -- { .name = "sub_module_id", .type = "TEXT" }, -- { .name = "err_severity", .type = "TEXT" }, -- { .name = "regs_dump", .type = "TEXT" }, -+ .fields = hip08_oem_event_fields, -+ .num_fields = ARRAY_SIZE(hip08_oem_event_fields), - }; - - static const struct db_table_descriptor hip08_oem_type2_event_tab = { - .name = "hip08_oem_type2_event_v2", -- .fields = hip08_oem_type2_event_fields, -- .num_fields = ARRAY_SIZE(hip08_oem_type2_event_fields), -+ .fields = hip08_oem_event_fields, -+ .num_fields = ARRAY_SIZE(hip08_oem_event_fields), - }; - - static const struct db_fields hip08_pcie_local_event_fields[] = { --- -2.7.4 - diff --git a/backport-0002-ras-page-isolation-page-which-is-PAGE_OFFLINE_FAILED.patch b/backport-0002-ras-page-isolation-page-which-is-PAGE_OFFLINE_FAILED.patch deleted file mode 100644 index 724dc9f..0000000 --- a/backport-0002-ras-page-isolation-page-which-is-PAGE_OFFLINE_FAILED.patch +++ /dev/null @@ -1,44 +0,0 @@ -From c329012ce4b44af08217f2a8f2b3b9b1b4b1c0d3 Mon Sep 17 00:00:00 2001 -From: lvying6 -Date: Sat, 31 Oct 2020 17:57:15 +0800 -Subject: [PATCH 2/2] ras-page-isolation: page which is PAGE_OFFLINE_FAILED can - be offlined again - -OS may fail to offline page at the previous time. After some time, -this page's state changed, and the page can be offlined by OS. -At this time, Correctable errors on this page reached the threshold. -Rasdaemon should trigger to offline this page again. - -Signed-off-by: lvying6 -Signed-off-by: Mauro Carvalho Chehab ---- - ras-page-isolation.c | 9 +++++++-- - 1 file changed, 7 insertions(+), 2 deletions(-) - -diff --git a/ras-page-isolation.c b/ras-page-isolation.c -index dc07545..fd7bd70 100644 ---- a/ras-page-isolation.c -+++ b/ras-page-isolation.c -@@ -237,12 +237,17 @@ static void page_offline(struct page_record *pr) - int ret; - - /* Offlining page is not required */ -- if (offline <= OFFLINE_ACCOUNT) -+ if (offline <= OFFLINE_ACCOUNT) { -+ log(TERM, LOG_INFO, "PAGE_CE_ACTION=%s, ignore to offline page at %#llx\n", -+ offline_choice[offline].name, addr); - return; -+ } - - /* Ignore offlined pages */ -- if (pr->offlined != PAGE_ONLINE) -+ if (pr->offlined == PAGE_OFFLINE) { -+ log(TERM, LOG_INFO, "page at %#llx is already offlined, ignore\n", addr); - return; -+ } - - /* Time to silence this noisy page */ - if (offline == OFFLINE_SOFT_THEN_HARD) { --- -2.18.4 - diff --git a/backport-0002-rasdaemon-delete-the-code-of-non-standard-error-deco.patch b/backport-0002-rasdaemon-delete-the-code-of-non-standard-error-deco.patch deleted file mode 100644 index 3a22ead..0000000 --- a/backport-0002-rasdaemon-delete-the-code-of-non-standard-error-deco.patch +++ /dev/null @@ -1,190 +0,0 @@ -From 6ee76565274f31052868e970bce8768c314f6bb7 Mon Sep 17 00:00:00 2001 -From: Xiaofei Tan -Date: Mon, 27 Jul 2020 15:38:38 +0800 -Subject: [PATCH 2/3] rasdaemon: delete the code of non-standard error decoder - for hip07 - -Delete the code of non-standard error decoder for hip07 that was never -used. Because the corresponding code in Linux kernel wasn't accepted. - -Signed-off-by: Xiaofei Tan -Signed-off-by: Mauro Carvalho Chehab ---- - Makefile.am | 2 +- - non-standard-hisi_hip07.c | 151 ---------------------------------------------- - 2 files changed, 1 insertion(+), 152 deletions(-) - delete mode 100644 non-standard-hisi_hip07.c - -diff --git a/Makefile.am b/Makefile.am -index 51ef4de..23b4d60 100644 ---- a/Makefile.am -+++ b/Makefile.am -@@ -52,7 +52,7 @@ if WITH_ABRT_REPORT - rasdaemon_SOURCES += ras-report.c - endif - if WITH_HISI_NS_DECODE -- rasdaemon_SOURCES += non-standard-hisi_hip07.c non-standard-hisi_hip08.c -+ rasdaemon_SOURCES += non-standard-hisi_hip08.c - endif - if WITH_MEMORY_CE_PFA - rasdaemon_SOURCES += rbtree.c ras-page-isolation.c -diff --git a/non-standard-hisi_hip07.c b/non-standard-hisi_hip07.c -deleted file mode 100644 -index 09ddcb2..0000000 ---- a/non-standard-hisi_hip07.c -+++ /dev/null -@@ -1,151 +0,0 @@ --/* -- * Copyright (c) 2017 Hisilicon Limited. -- * -- * This program is free software; you can redistribute it and/or modify -- * it under the terms of the GNU General Public License as published by -- * the Free Software Foundation; either version 2 of the License, or -- * (at your option) any later version. -- * -- */ -- --#include --#include --#include --#include "ras-record.h" --#include "ras-logger.h" --#include "ras-report.h" --#include "ras-non-standard-handler.h" -- --/* common definitions */ -- --/* HISI SAS definitions */ --#define HISI_SAS_VALID_PA BIT(0) --#define HISI_SAS_VALID_MB_ERR BIT(1) --#define HISI_SAS_VALID_ERR_TYPE BIT(2) --#define HISI_SAS_VALID_AXI_ERR_INFO BIT(3) -- --struct hisi_sas_err_sec { -- uint64_t val_bits; -- uint64_t physical_addr; -- uint32_t mb; -- uint32_t type; -- uint32_t axi_err_info; --}; -- --/* Common Functions */ --static char *err_bit_type(int etype) --{ -- switch (etype) { -- case 0x0: return "single-bit ecc"; -- case 0x1: return "multi-bit ecc"; -- } -- return "unknown error"; --} -- --/* SAS Functions */ --static char *sas_err_type(int etype) --{ -- switch (etype) { -- case 0x0001: return "hgc_dqe ecc"; -- case 0x0002: return "hgc_iost ecc"; -- case 0x0004: return "hgc_itct ecc"; -- case 0x0008: return "hgc_iostl ecc"; -- case 0x0010: return "hgc_itctl ecc"; -- case 0x0020: return "hgc_cqe ecc"; -- case 0x0040: return "rxm_mem0 ecc"; -- case 0x0080: return "rxm_mem1 ecc"; -- case 0x0100: return "rxm_mem2 ecc"; -- case 0x0200: return "rxm_mem3 ecc"; -- case 0x0400: return "wp_depth"; -- case 0x0800: return "iptt_slot_no_match"; -- case 0x1000: return "rp_depth"; -- case 0x2000: return "axi err"; -- case 0x4000: return "fifo err"; -- case 0x8000: return "lm_add_fetch_list"; -- case 0x10000: return "hgc_abt_fetch_lm"; -- } -- return "unknown error"; --} -- --static char *sas_axi_err_type(int etype) --{ -- switch (etype) { -- case 0x0001: return "IOST_AXI_W_ERR"; -- case 0x0002: return "IOST_AXI_R_ERR"; -- case 0x0004: return "ITCT_AXI_W_ERR"; -- case 0x0008: return "ITCT_AXI_R_ERR"; -- case 0x0010: return "SATA_AXI_W_ERR"; -- case 0x0020: return "SATA_AXI_R_ERR"; -- case 0x0040: return "DQE_AXI_R_ERR"; -- case 0x0080: return "CQE_AXI_W_ERR"; -- case 0x0100: return "CQE_WINFO_FIFO"; -- case 0x0200: return "CQE_MSG_FIFIO"; -- case 0x0400: return "GETDQE_FIFO"; -- case 0x0800: return "CMDP_FIFO"; -- case 0x1000: return "AWTCTRL_FIFO"; -- } -- return "unknown error"; --} -- --static int decode_hip07_sas_error(struct ras_events *ras, -- struct ras_ns_dec_tab *dec_tab, -- struct trace_seq *s, -- struct ras_non_standard_event *event) --{ -- char buf[1024]; -- char *p = buf; -- const struct hisi_sas_err_sec *err = -- (struct hisi_sas_err_sec *)event->error; -- -- if (err->val_bits == 0) { -- trace_seq_printf(s, "%s: no valid error data\n", -- __func__); -- return -1; -- } -- p += sprintf(p, "["); -- if (err->val_bits & HISI_SAS_VALID_PA) -- p += sprintf(p, "phy addr = 0x%p: ", -- (void *)err->physical_addr); -- -- if (err->val_bits & HISI_SAS_VALID_MB_ERR) -- p += sprintf(p, "%s: ", err_bit_type(err->mb)); -- -- if (err->val_bits & HISI_SAS_VALID_ERR_TYPE) -- p += sprintf(p, "error type = %s: ", -- sas_err_type(err->type)); -- -- if (err->val_bits & HISI_SAS_VALID_AXI_ERR_INFO) -- p += sprintf(p, "axi error type = %s", -- sas_axi_err_type(err->axi_err_info)); -- -- p += sprintf(p, "]"); -- -- trace_seq_printf(s, "\nHISI HIP07: SAS error: %s\n", buf); -- return 0; --} -- --static int decode_hip07_hns_error(struct ras_events *ras, -- struct ras_ns_dec_tab *dec_tab, -- struct trace_seq *s, -- struct ras_non_standard_event *event) --{ -- return 0; --} -- --struct ras_ns_dec_tab hisi_ns_dec_tab[] = { -- { -- .sec_type = "daffd8146eba4d8c8a91bc9bbf4aa301", -- .decode = decode_hip07_sas_error, -- }, -- { -- .sec_type = "fbc2d923ea7a453dab132949f5af9e53", -- .decode = decode_hip07_hns_error, -- }, -- { /* sentinel */ } --}; -- --__attribute__((constructor)) --static void hip07_init(void) --{ -- register_ns_dec_tab(hisi_ns_dec_tab); --} --- -2.7.4 - diff --git a/backport-0003-rasdaemon-add-support-for-hisilicon-common-section-d.patch b/backport-0003-rasdaemon-add-support-for-hisilicon-common-section-d.patch deleted file mode 100644 index 7eaa3f3..0000000 --- a/backport-0003-rasdaemon-add-support-for-hisilicon-common-section-d.patch +++ /dev/null @@ -1,527 +0,0 @@ -From 8c30a852493a6204ded59872bb3a0f0e43537713 Mon Sep 17 00:00:00 2001 -From: Xiaofei Tan -Date: Mon, 27 Jul 2020 15:38:39 +0800 -Subject: [PATCH 3/3] rasdaemon: add support for hisilicon common section - decoder - -Add a new non-standard error section, Hisilicon common section. -It is defined for the next generation SoC Kunpeng930. It also supports -Kunpeng920 and some modules of Kunpeng920 could be changed to use -this section. - -We put the code to an new source file, as it supports multiple Hardware -platform. Some code of hip08 could be shared. Move them to this new file. - -Signed-off-by: Xiaofei Tan -Signed-off-by: Mauro Carvalho Chehab ---- - Makefile.am | 2 +- - non-standard-hisi_hip08.c | 79 +----------- - non-standard-hisilicon.c | 307 ++++++++++++++++++++++++++++++++++++++++++++++ - non-standard-hisilicon.h | 49 ++++++++ - 4 files changed, 358 insertions(+), 79 deletions(-) - create mode 100644 non-standard-hisilicon.c - create mode 100644 non-standard-hisilicon.h - -diff --git a/Makefile.am b/Makefile.am -index 23b4d60..18d1a92 100644 ---- a/Makefile.am -+++ b/Makefile.am -@@ -52,7 +52,7 @@ if WITH_ABRT_REPORT - rasdaemon_SOURCES += ras-report.c - endif - if WITH_HISI_NS_DECODE -- rasdaemon_SOURCES += non-standard-hisi_hip08.c -+ rasdaemon_SOURCES += non-standard-hisi_hip08.c non-standard-hisilicon.c - endif - if WITH_MEMORY_CE_PFA - rasdaemon_SOURCES += rbtree.c ras-page-isolation.c -diff --git a/non-standard-hisi_hip08.c b/non-standard-hisi_hip08.c -index 7fc6939..2197f81 100644 ---- a/non-standard-hisi_hip08.c -+++ b/non-standard-hisi_hip08.c -@@ -15,6 +15,7 @@ - #include "ras-logger.h" - #include "ras-report.h" - #include "ras-non-standard-handler.h" -+#include "non-standard-hisilicon.h" - - /* HISI OEM error definitions */ - /* HISI OEM format1 error definitions */ -@@ -83,11 +84,6 @@ - #define HISI_PCIE_LOCAL_ERR_MISC_MAX 33 - #define HISI_BUF_LEN 1024 - --#define HISI_ERR_SEVERITY_NFE 0 --#define HISI_ERR_SEVERITY_FE 1 --#define HISI_ERR_SEVERITY_CE 2 --#define HISI_ERR_SEVERITY_NONE 3 -- - struct hisi_oem_type1_err_sec { - uint32_t val_bits; - uint8_t version; -@@ -145,12 +141,6 @@ struct hisi_pcie_local_err_sec { - uint32_t err_misc[HISI_PCIE_LOCAL_ERR_MISC_MAX]; - }; - --enum hisi_oem_data_type { -- HISI_OEM_DATA_TYPE_INT, -- HISI_OEM_DATA_TYPE_INT64, -- HISI_OEM_DATA_TYPE_TEXT, --}; -- - enum { - HIP08_OEM_TYPE1_FIELD_ID, - HIP08_OEM_TYPE1_FIELD_TIMESTAMP, -@@ -199,20 +189,6 @@ struct hisi_module_info { - int sub_num; - }; - --/* helper functions */ --static char *err_severity(uint8_t err_sev) --{ -- switch (err_sev) { -- case HISI_ERR_SEVERITY_NFE: return "recoverable"; -- case HISI_ERR_SEVERITY_FE: return "fatal"; -- case HISI_ERR_SEVERITY_CE: return "corrected"; -- case HISI_ERR_SEVERITY_NONE: return "none"; -- default: -- break; -- } -- return "unknown"; --} -- - static const char *pll_submodule_name[] = { - "TB_PLL0", - "TB_PLL1", -@@ -549,59 +525,6 @@ static const struct db_table_descriptor hip08_pcie_local_event_tab = { - .fields = hip08_pcie_local_event_fields, - .num_fields = ARRAY_SIZE(hip08_pcie_local_event_fields), - }; -- --static void record_vendor_data(struct ras_ns_dec_tab *dec_tab, -- enum hisi_oem_data_type data_type, -- int id, int64_t data, const char *text) --{ -- switch (data_type) { -- case HISI_OEM_DATA_TYPE_INT: -- sqlite3_bind_int(dec_tab->stmt_dec_record, id, data); -- break; -- case HISI_OEM_DATA_TYPE_INT64: -- sqlite3_bind_int64(dec_tab->stmt_dec_record, id, data); -- break; -- case HISI_OEM_DATA_TYPE_TEXT: -- sqlite3_bind_text(dec_tab->stmt_dec_record, id, text, -1, NULL); -- break; -- default: -- break; -- } --} -- --static int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab, -- const char *name) --{ -- int rc; -- -- rc = sqlite3_step(dec_tab->stmt_dec_record); -- if (rc != SQLITE_OK && rc != SQLITE_DONE) -- log(TERM, LOG_ERR, -- "Failed to do %s step on sqlite: error = %d\n", name, rc); -- -- rc = sqlite3_reset(dec_tab->stmt_dec_record); -- if (rc != SQLITE_OK && rc != SQLITE_DONE) -- log(TERM, LOG_ERR, -- "Failed to reset %s on sqlite: error = %d\n", name, rc); -- -- rc = sqlite3_clear_bindings(dec_tab->stmt_dec_record); -- if (rc != SQLITE_OK && rc != SQLITE_DONE) -- log(TERM, LOG_ERR, -- "Failed to clear bindings %s on sqlite: error = %d\n", -- name, rc); -- -- return rc; --} --#else --static void record_vendor_data(struct ras_ns_dec_tab *dec_tab, -- enum hisi_oem_data_type data_type, -- int id, int64_t data, const char *text) --{ } -- --static int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab, char *name) --{ -- return 0; --} - #endif - - #define IN_RANGE(p, start, end) ((p) >= (start) && (p) < (end)) -diff --git a/non-standard-hisilicon.c b/non-standard-hisilicon.c -new file mode 100644 -index 0000000..c9e1fa9 ---- /dev/null -+++ b/non-standard-hisilicon.c -@@ -0,0 +1,307 @@ -+/* -+ * Copyright (c) 2020 Hisilicon Limited. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ */ -+ -+#include -+#include -+#include -+#include "ras-record.h" -+#include "ras-logger.h" -+#include "ras-report.h" -+#include "non-standard-hisilicon.h" -+ -+#define HISI_BUF_LEN 2048 -+ -+struct hisi_common_error_section { -+ uint32_t val_bits; -+ uint8_t version; -+ uint8_t soc_id; -+ uint8_t socket_id; -+ uint8_t totem_id; -+ uint8_t nimbus_id; -+ uint8_t subsystem_id; -+ uint8_t module_id; -+ uint8_t submodule_id; -+ uint8_t core_id; -+ uint8_t port_id; -+ uint16_t err_type; -+ struct { -+ uint8_t function; -+ uint8_t device; -+ uint16_t segment; -+ uint8_t bus; -+ uint8_t reserved[3]; -+ } pcie_info; -+ uint8_t err_severity; -+ uint8_t reserved[3]; -+ uint32_t reg_array_size; -+ uint32_t reg_array[]; -+}; -+ -+enum { -+ HISI_COMMON_VALID_SOC_ID, -+ HISI_COMMON_VALID_SOCKET_ID, -+ HISI_COMMON_VALID_TOTEM_ID, -+ HISI_COMMON_VALID_NIMBUS_ID, -+ HISI_COMMON_VALID_SUBSYSTEM_ID, -+ HISI_COMMON_VALID_MODULE_ID, -+ HISI_COMMON_VALID_SUBMODULE_ID, -+ HISI_COMMON_VALID_CORE_ID, -+ HISI_COMMON_VALID_PORT_ID, -+ HISI_COMMON_VALID_ERR_TYPE, -+ HISI_COMMON_VALID_PCIE_INFO, -+ HISI_COMMON_VALID_ERR_SEVERITY, -+ HISI_COMMON_VALID_REG_ARRAY_SIZE, -+}; -+ -+enum { -+ HISI_COMMON_FIELD_ID, -+ HISI_COMMON_FIELD_TIMESTAMP, -+ HISI_COMMON_FIELD_ERR_INFO, -+ HISI_COMMON_FIELD_REGS_DUMP, -+}; -+ -+struct hisi_event { -+ char error_msg[HISI_BUF_LEN]; -+ char reg_msg[HISI_BUF_LEN]; -+}; -+ -+#ifdef HAVE_SQLITE3 -+void record_vendor_data(struct ras_ns_dec_tab *dec_tab, -+ enum hisi_oem_data_type data_type, -+ int id, int64_t data, const char *text) -+{ -+ switch (data_type) { -+ case HISI_OEM_DATA_TYPE_INT: -+ sqlite3_bind_int(dec_tab->stmt_dec_record, id, data); -+ break; -+ case HISI_OEM_DATA_TYPE_INT64: -+ sqlite3_bind_int64(dec_tab->stmt_dec_record, id, data); -+ break; -+ case HISI_OEM_DATA_TYPE_TEXT: -+ sqlite3_bind_text(dec_tab->stmt_dec_record, id, text, -1, NULL); -+ break; -+ } -+} -+ -+int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab, const char *name) -+{ -+ int rc; -+ -+ rc = sqlite3_step(dec_tab->stmt_dec_record); -+ if (rc != SQLITE_OK && rc != SQLITE_DONE) -+ log(TERM, LOG_ERR, -+ "Failed to do %s step on sqlite: error = %d\n", name, rc); -+ -+ rc = sqlite3_reset(dec_tab->stmt_dec_record); -+ if (rc != SQLITE_OK && rc != SQLITE_DONE) -+ log(TERM, LOG_ERR, -+ "Failed to reset %s on sqlite: error = %d\n", name, rc); -+ -+ rc = sqlite3_clear_bindings(dec_tab->stmt_dec_record); -+ if (rc != SQLITE_OK && rc != SQLITE_DONE) -+ log(TERM, LOG_ERR, -+ "Failed to clear bindings %s on sqlite: error = %d\n", -+ name, rc); -+ -+ return rc; -+} -+#else -+void record_vendor_data(struct ras_ns_dec_tab *dec_tab, -+ enum hisi_oem_data_type data_type, -+ int id, int64_t data, const char *text) -+{ } -+ -+int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab, const char *name) -+{ -+ return 0; -+} -+#endif -+ -+#ifdef HAVE_SQLITE3 -+static const struct db_fields hisi_common_section_fields[] = { -+ { .name = "id", .type = "INTEGER PRIMARY KEY" }, -+ { .name = "timestamp", .type = "TEXT" }, -+ { .name = "err_info", .type = "TEXT" }, -+ { .name = "regs_dump", .type = "TEXT" }, -+}; -+ -+static const struct db_table_descriptor hisi_common_section_tab = { -+ .name = "hisi_common_section", -+ .fields = hisi_common_section_fields, -+ .num_fields = ARRAY_SIZE(hisi_common_section_fields), -+}; -+#endif -+ -+static const char* soc_desc[] = { -+ "Kunpeng916", -+ "Kunpeng920", -+ "Kunpeng930", -+}; -+ -+static const char* module_name[] = { -+ "MN", -+ "PLL", -+ "SLLC", -+ "AA", -+ "SIOE", -+ "POE", -+ "CPA", -+ "DISP", -+ "GIC", -+ "ITS", -+ "AVSBUS", -+ "CS", -+ "PPU", -+ "SMMU", -+ "PA", -+ "HLLC", -+ "DDRC", -+ "L3TAG", -+ "L3DATA", -+ "PCS", -+ "MATA", -+ "PCIe Local", -+ "SAS", -+ "SATA", -+ "NIC", -+ "RoCE", -+ "USB", -+ "ZIP", -+ "HPRE", -+ "SEC", -+ "RDE", -+ "MEE", -+ "HHA", -+}; -+ -+static const char* get_soc_desc(uint8_t soc_id) -+{ -+ if (soc_id >= sizeof(soc_desc)/sizeof(char *)) -+ return "unknown"; -+ -+ return soc_desc[soc_id]; -+} -+ -+static void decode_module(struct hisi_event *event, uint8_t module_id) -+{ -+ if (module_id >= sizeof(module_name)/sizeof(char *)) -+ HISI_SNPRINTF(event->error_msg, "module=unknown(id=%d) ", module_id); -+ else -+ HISI_SNPRINTF(event->error_msg, "module=%s ", module_name[module_id]); -+} -+ -+static void decode_hisi_common_section_hdr(struct ras_ns_dec_tab *dec_tab, -+ const struct hisi_common_error_section *err, -+ struct hisi_event *event) -+{ -+ HISI_SNPRINTF(event->error_msg, "[ table_version=%d", err->version); -+ if (err->val_bits & BIT(HISI_COMMON_VALID_SOC_ID)) -+ HISI_SNPRINTF(event->error_msg, "soc=%s", get_soc_desc(err->soc_id)); -+ -+ if (err->val_bits & BIT(HISI_COMMON_VALID_SOCKET_ID)) -+ HISI_SNPRINTF(event->error_msg, "socket_id=%d", err->socket_id); -+ -+ if (err->val_bits & BIT(HISI_COMMON_VALID_TOTEM_ID)) -+ HISI_SNPRINTF(event->error_msg, "totem_id=%d", err->totem_id); -+ -+ if (err->val_bits & BIT(HISI_COMMON_VALID_NIMBUS_ID)) -+ HISI_SNPRINTF(event->error_msg, "nimbus_id=%d", err->nimbus_id); -+ -+ if (err->val_bits & BIT(HISI_COMMON_VALID_SUBSYSTEM_ID)) -+ HISI_SNPRINTF(event->error_msg, "subsystem_id=%d", err->subsystem_id); -+ -+ if (err->val_bits & BIT(HISI_COMMON_VALID_MODULE_ID)) -+ decode_module(event, err->module_id); -+ -+ if (err->val_bits & BIT(HISI_COMMON_VALID_SUBMODULE_ID)) -+ HISI_SNPRINTF(event->error_msg, "submodule_id=%d", err->submodule_id); -+ -+ if (err->val_bits & BIT(HISI_COMMON_VALID_CORE_ID)) -+ HISI_SNPRINTF(event->error_msg, "core_id=%d", err->core_id); -+ -+ if (err->val_bits & BIT(HISI_COMMON_VALID_PORT_ID)) -+ HISI_SNPRINTF(event->error_msg, "port_id=%d", err->port_id); -+ -+ if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_TYPE)) -+ HISI_SNPRINTF(event->error_msg, "err_type=%d", err->err_type); -+ -+ if (err->val_bits & BIT(HISI_COMMON_VALID_PCIE_INFO)) -+ HISI_SNPRINTF(event->error_msg, "pcie_device_id=%04x:%02x:%02x.%x", -+ err->pcie_info.segment, err->pcie_info.bus, -+ err->pcie_info.device, err->pcie_info.function); -+ -+ if (err->val_bits & BIT(HISI_COMMON_VALID_ERR_SEVERITY)) -+ HISI_SNPRINTF(event->error_msg, "err_severity=%s", err_severity(err->err_severity)); -+ -+ HISI_SNPRINTF(event->error_msg, "]"); -+} -+ -+static int decode_hisi_common_section(struct ras_events *ras, -+ struct ras_ns_dec_tab *dec_tab, -+ struct trace_seq *s, -+ struct ras_non_standard_event *event) -+{ -+ const struct hisi_common_error_section *err = -+ (struct hisi_common_error_section *)event->error; -+ struct hisi_event hevent; -+ -+#ifdef HAVE_SQLITE3 -+ if (ras->record_events && !dec_tab->stmt_dec_record) { -+ if (ras_mc_add_vendor_table(ras, &dec_tab->stmt_dec_record, -+ &hisi_common_section_tab) != SQLITE_OK) { -+ trace_seq_printf(s, "create sql hisi_common_section_tab fail\n"); -+ return -1; -+ } -+ } -+#endif -+ -+ memset(&hevent, 0, sizeof(struct hisi_event)); -+ trace_seq_printf(s, "\nHisilicon Common Error Section:\n"); -+ decode_hisi_common_section_hdr(dec_tab, err, &hevent); -+ trace_seq_printf(s, "%s\n", hevent.error_msg); -+ -+ if (err->val_bits & BIT(HISI_COMMON_VALID_REG_ARRAY_SIZE) && err->reg_array_size > 0) { -+ int i; -+ -+ trace_seq_printf(s, "Register Dump:\n"); -+ for (i = 0; i < err->reg_array_size / sizeof(uint32_t); i++) { -+ trace_seq_printf(s, "reg%02d=0x%08x\n", i, -+ err->reg_array[i]); -+ HISI_SNPRINTF(hevent.reg_msg, "reg%02d=0x%08x", -+ i, err->reg_array[i]); -+ } -+ } -+ -+ if (ras->record_events) { -+ record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, -+ HISI_COMMON_FIELD_TIMESTAMP, -+ 0, event->timestamp); -+ record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, -+ HISI_COMMON_FIELD_ERR_INFO, 0, hevent.error_msg); -+ record_vendor_data(dec_tab, HISI_OEM_DATA_TYPE_TEXT, -+ HISI_COMMON_FIELD_REGS_DUMP, 0, hevent.reg_msg); -+ step_vendor_data_tab(dec_tab, "hisi_common_section_tab"); -+ } -+ -+ return 0; -+} -+ -+struct ras_ns_dec_tab hisi_section_ns_tab[] = { -+ { -+ .sec_type = "c8b328a899174af69a132e08ab2e7586", -+ .decode = decode_hisi_common_section, -+ }, -+ { /* sentinel */ } -+}; -+ -+static void __attribute__((constructor)) hisi_ns_init(void) -+{ -+ register_ns_dec_tab(hisi_section_ns_tab); -+} -diff --git a/non-standard-hisilicon.h b/non-standard-hisilicon.h -new file mode 100644 -index 0000000..1ce210a ---- /dev/null -+++ b/non-standard-hisilicon.h -@@ -0,0 +1,49 @@ -+/* -+ * Copyright (c) 2020 Hisilicon Limited. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ */ -+ -+#ifndef __NON_STANDARD_HISILICON_H -+#define __NON_STANDARD_HISILICON_H -+ -+#include "ras-non-standard-handler.h" -+#include "ras-mc-handler.h" -+ -+#define HISI_SNPRINTF mce_snprintf -+ -+#define HISI_ERR_SEVERITY_NFE 0 -+#define HISI_ERR_SEVERITY_FE 1 -+#define HISI_ERR_SEVERITY_CE 2 -+#define HISI_ERR_SEVERITY_NONE 3 -+ -+enum hisi_oem_data_type { -+ HISI_OEM_DATA_TYPE_INT, -+ HISI_OEM_DATA_TYPE_INT64, -+ HISI_OEM_DATA_TYPE_TEXT, -+}; -+ -+/* helper functions */ -+static inline char *err_severity(uint8_t err_sev) -+{ -+ switch (err_sev) { -+ case HISI_ERR_SEVERITY_NFE: return "recoverable"; -+ case HISI_ERR_SEVERITY_FE: return "fatal"; -+ case HISI_ERR_SEVERITY_CE: return "corrected"; -+ case HISI_ERR_SEVERITY_NONE: return "none"; -+ default: -+ break; -+ } -+ return "unknown"; -+} -+ -+void record_vendor_data(struct ras_ns_dec_tab *dec_tab, -+ enum hisi_oem_data_type data_type, -+ int id, int64_t data, const char *text); -+int step_vendor_data_tab(struct ras_ns_dec_tab *dec_tab, const char *name); -+ -+#endif --- -2.7.4 - diff --git a/backport-configure.ac-fix-SYSCONFDEFDIR-default-value.patch b/backport-configure.ac-fix-SYSCONFDEFDIR-default-value.patch new file mode 100644 index 0000000..b4ba376 --- /dev/null +++ b/backport-configure.ac-fix-SYSCONFDEFDIR-default-value.patch @@ -0,0 +1,37 @@ +From 1ff5f3d2a0fcd48add9462567c30fe0e14585fb4 Mon Sep 17 00:00:00 2001 +From: Matt Whitlock +Date: Wed, 9 Jun 2021 10:25:18 -0400 +Subject: [PATCH] configure.ac: fix SYSCONFDEFDIR default value + +configure.ac was using AC_ARG_WITH incorrectly, yielding a generated configure script like: + + # Check whether --with-sysconfdefdir was given. + if test "${with_sysconfdefdir+set}" = set; then : + withval=$with_sysconfdefdir; SYSCONFDEFDIR=$withval + else + "/etc/sysconfig" + fi + +This commit fixes the default case so that the SYSCONFDEFDIR variable is assigned the value "/etc/sysconfig" rather than trying to execute "/etc/sysconfig" as a command. + +Signed-off-by: Mauro Carvalho Chehab +--- + configure.ac | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/configure.ac b/configure.ac +index f7d1947..33b81fe 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -172,7 +172,7 @@ AC_SUBST([RASSTATEDIR]) + AC_ARG_WITH(sysconfdefdir, + AC_HELP_STRING([--with-sysconfdefdir=DIR], [rasdaemon environment file dir]), + [SYSCONFDEFDIR=$withval], +- ["/etc/sysconfig"]) ++ [SYSCONFDEFDIR=/etc/sysconfig]) + AC_SUBST([SYSCONFDEFDIR]) + + AC_DEFINE([RAS_DB_FNAME], ["ras-mc_event.db"], [ras events database]) +-- +2.27.0 + diff --git a/backport-rasdaemon-Fix-error-print.patch b/backport-rasdaemon-Fix-error-print.patch deleted file mode 100644 index 6e315ba..0000000 --- a/backport-rasdaemon-Fix-error-print.patch +++ /dev/null @@ -1,29 +0,0 @@ -From 00115dda854f4a50681ccc6c017daa991234411b Mon Sep 17 00:00:00 2001 -From: Liguang Zhang -Date: Mon, 10 Aug 2020 11:07:43 +0800 -Subject: [PATCH] rasdaemon: Fix error print - -Fix error print handle_ras_events. - -Signed-off-by: Liguang Zhang -Signed-off-by: Mauro Carvalho Chehab ---- - ras-events.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/ras-events.c b/ras-events.c -index a99fd29..c797b20 100644 ---- a/ras-events.c -+++ b/ras-events.c -@@ -874,7 +874,7 @@ int handle_ras_events(int record_events) - num_events++; - } else - log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", -- "ras", "aer_event"); -+ "ras", "extlog_mem_event"); - #endif - - #ifdef HAVE_DEVLINK --- -2.18.4 - diff --git a/bugfix-fix-disk-error-log-storm.patch b/bugfix-fix-disk-error-log-storm.patch index 8241cfc..5df0299 100644 --- a/bugfix-fix-disk-error-log-storm.patch +++ b/bugfix-fix-disk-error-log-storm.patch @@ -15,7 +15,7 @@ index e73a08a..04a0489 100644 @@ -4,7 +4,7 @@ After=syslog.target [Service] - EnvironmentFile=/etc/sysconfig/rasdaemon + EnvironmentFile=@SYSCONFDEFDIR@/rasdaemon -ExecStart=@sbindir@/rasdaemon -f -r +ExecStart=@sbindir@/rasdaemon -f ExecStartPost=@sbindir@/rasdaemon --enable diff --git a/bugfix-ras-events-memory-leak.patch b/bugfix-ras-events-memory-leak.patch deleted file mode 100644 index 977459a..0000000 --- a/bugfix-ras-events-memory-leak.patch +++ /dev/null @@ -1,18 +0,0 @@ -From d59e4d224b3271cf7a7fe53cd7c5d539b58eac32 Mon Sep 17 00:00:00 2001 -From: lvying -Date: Sat, 26 Jan 2019 15:54:17 +0800 -Subject: [PATCH] rasdaemon:fix ras events memory leak - -reason:fix ras events memory leak - -diff -uprN a/ras-events.c b/ras-events.c ---- a/ras-events.c 2018-06-22 14:20:42.880878700 +0800 -+++ b/ras-events.c 2018-06-22 14:38:24.420726900 +0800 -@@ -314,6 +314,7 @@ static void parse_ras_data(struct pthrea - trace_seq_init(&s); - pevent_print_event(pdata->ras->pevent, &s, &record); - trace_seq_do_printf(&s); -+ trace_seq_destroy(&s); - printf("\n"); - fflush(stdout); - } diff --git a/rasdaemon-0.6.6.tar.gz b/rasdaemon-0.6.6.tar.gz deleted file mode 100644 index ea4552e..0000000 Binary files a/rasdaemon-0.6.6.tar.gz and /dev/null differ diff --git a/rasdaemon-0.6.7.tar.gz b/rasdaemon-0.6.7.tar.gz new file mode 100644 index 0000000..553577b Binary files /dev/null and b/rasdaemon-0.6.7.tar.gz differ diff --git a/rasdaemon.spec b/rasdaemon.spec index 8e4b3c9..89c567d 100644 --- a/rasdaemon.spec +++ b/rasdaemon.spec @@ -1,6 +1,6 @@ Name: rasdaemon -Version: 0.6.6 -Release: 6 +Version: 0.6.7 +Release: 1 License: GPLv2 Summary: Utility to get Platform Reliability, Availability and Serviceability (RAS) reports via the Kernel tracing events URL: https://github.com/mchehab/rasdaemon.git @@ -19,16 +19,10 @@ Requires(post): systemd Requires(preun): systemd Requires(postun): systemd -Patch1: bugfix-ras-events-memory-leak.patch -Patch2: bugfix-rasdaemon-wait-for-file-access.patch -Patch3: bugfix-fix-fd-check.patch -Patch4: backport-0001-ras-page-isolation-do_page_offline-always-considers-.patch -Patch5: backport-0002-ras-page-isolation-page-which-is-PAGE_OFFLINE_FAILED.patch -Patch6: backport-rasdaemon-Fix-error-print.patch -Patch7: bugfix-fix-disk-error-log-storm.patch -Patch8: backport-0001-rasdaemon-delete-the-duplicate-code-about-the-defini.patch -Patch9: backport-0002-rasdaemon-delete-the-code-of-non-standard-error-deco.patch -Patch10: backport-0003-rasdaemon-add-support-for-hisilicon-common-section-d.patch +Patch1: bugfix-rasdaemon-wait-for-file-access.patch +Patch2: bugfix-fix-fd-check.patch +Patch3: bugfix-fix-disk-error-log-storm.patch +Patch4: backport-configure.ac-fix-SYSCONFDEFDIR-default-value.patch %description The rasdaemon program is a daemon which monitors the platform @@ -67,7 +61,6 @@ rm INSTALL %{buildroot}/usr/include/*.h %{_sbindir}/ras-mc-ctl %{_mandir}/*/* %{_unitdir}/*.service -%{_sharedstatedir}/rasdaemon %{_sysconfdir}/ras/dimm_labels.d %config(noreplace) %{_sysconfdir}/sysconfig/%{name} @@ -75,6 +68,9 @@ rm INSTALL %{buildroot}/usr/include/*.h /usr/bin/systemctl enable rasdaemon.service >/dev/null 2>&1 || : %changelog +* Wed Dec 8 2021 xujing - 0.6.7-1 +- Update software to v0.6.7 + * Sat July 29 2021 tanxiaofei - 0.6.6-6 - Type:feature - ID:NA