add dynamic switch of ras events support and disable block_rq_complete

This commit is contained in:
caixiaomeng 2024-04-08 17:12:34 +08:00
parent 8daed7ec36
commit 544fd1a7d7
3 changed files with 516 additions and 1 deletions

View File

@ -0,0 +1,450 @@
From b26f624fbe12203b12b65e0674fea60c70e48a21 Mon Sep 17 00:00:00 2001
From: caixiaomeng 00662745 <caixiaomeng2@huawei.com>
Date: Wed, 21 Feb 2024 15:25:11 +0800
Subject: [PATCH] BACKPORT-Add-Dynamic-Switch
---
misc/rasdaemon.env | 5 +-
ras-disabled-events.h | 10 ++
ras-events.c | 247 +++++++++++++++++++++++++++---------------
rasdaemon.c | 36 ++++++
4 files changed, 208 insertions(+), 90 deletions(-)
create mode 100644 ras-disabled-events.h
diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env
index dc40af8..6780eb0 100644
--- a/misc/rasdaemon.env
+++ b/misc/rasdaemon.env
@@ -51,4 +51,7 @@ CPU_CE_THRESHOLD="18"
CPU_ISOLATION_CYCLE="24h"
# Prevent excessive isolation from causing an avalanche effect
-CPU_ISOLATION_LIMIT="10"
\ No newline at end of file
+CPU_ISOLATION_LIMIT="10"
+
+# Disable specified events by config
+DISABLE="block:block_rq_complete"
\ No newline at end of file
diff --git a/ras-disabled-events.h b/ras-disabled-events.h
new file mode 100644
index 0000000..298a5f3
--- /dev/null
+++ b/ras-disabled-events.h
@@ -0,0 +1,10 @@
+#ifndef __RAS_DISABLED_EVENTS_H
+#define __RAS_DISABLED_EVENTS_H
+#define DISABLE "DISABLE" /* name of the environment variable read with getenv() */
+#define MAX_DISABLED_TRACEPOINTS_NUM 50 /* number of rows in choices_disable[] */
+#define MAX_DISABLED_TRACEPOINTS_STR_LENGTH 255 /* buffer size for the whole DISABLE value, NUL included */
+#define MAX_TRACEPOINTS_STR_LENGTH 50 /* buffer size for one "group:event" item, NUL included */
+
+extern char choices_disable[MAX_DISABLED_TRACEPOINTS_NUM][MAX_TRACEPOINTS_STR_LENGTH]; /* defined in rasdaemon.c */
+extern int disabled_tracepoints_num; /* number of rows of choices_disable[] in use */
+#endif
\ No newline at end of file
diff --git a/ras-events.c b/ras-events.c
index bc7da34..675d020 100644
--- a/ras-events.c
+++ b/ras-events.c
@@ -43,6 +43,7 @@
#include "ras-logger.h"
#include "ras-page-isolation.h"
#include "ras-cpu-isolation.h"
+#include "ras-disabled-events.h"
/*
* Polling time, if read() doesn't block. Currently, trace_pipe_raw never
@@ -172,6 +173,23 @@ static int get_tracing_dir(struct ras_events *ras)
return 0;
}
+/* Return true when "<group>:<event>" is listed in choices_disable[]. */
+static bool is_disabled_event(char *group, char *event) {
+ char full_name[MAX_PATH + 1];
+ int idx = 0;
+
+ /* Build the "group:event" key that the DISABLE list entries use. */
+ snprintf(full_name, sizeof(full_name), "%s:%s", group, event);
+ /* An empty list never enters the loop, so the event counts as enabled. */
+ while (idx < disabled_tracepoints_num) {
+ if (!strcmp(choices_disable[idx], full_name)) {
+ return true;
+ }
+ idx++;
+ }
+ return false;
+}
+
/*
* Tracing enable/disable code
*/
@@ -228,40 +246,41 @@ int toggle_ras_mc_event(int enable)
goto free_ras;
}
- rc = __toggle_ras_mc_event(ras, "ras", "mc_event", enable);
+ rc = __toggle_ras_mc_event(ras, "ras", "mc_event", enable > 0 ? (is_disabled_event("ras", "mc_event") ? 0 : 1) : enable);
#ifdef HAVE_AER
- rc |= __toggle_ras_mc_event(ras, "ras", "aer_event", enable);
+ rc |= __toggle_ras_mc_event(ras, "ras", "aer_event", enable > 0 ? (is_disabled_event("ras", "aer_event") ? 0 : 1) : enable);
#endif
#ifdef HAVE_MCE
- rc |= __toggle_ras_mc_event(ras, "mce", "mce_record", enable);
+ rc |= __toggle_ras_mc_event(ras, "mce", "mce_record", enable > 0 ? (is_disabled_event("mce", "mce_record") ? 0 : 1) : enable);
#endif
#ifdef HAVE_EXTLOG
- rc |= __toggle_ras_mc_event(ras, "ras", "extlog_mem_event", enable);
+ rc |= __toggle_ras_mc_event(ras, "ras", "extlog_mem_event", enable > 0 ? (is_disabled_event("ras", "extlog_mem_event") ? 0 : 1) : enable);
#endif
#ifdef HAVE_NON_STANDARD
- rc |= __toggle_ras_mc_event(ras, "ras", "non_standard_event", enable);
+ rc |= __toggle_ras_mc_event(ras, "ras", "non_standard_event", enable > 0 ? (is_disabled_event("ras", "non_standard_event") ? 0 : 1) : enable);
#endif
#ifdef HAVE_ARM
- rc |= __toggle_ras_mc_event(ras, "ras", "arm_event", enable);
+ rc |= __toggle_ras_mc_event(ras, "ras", "arm_event", enable > 0 ? (is_disabled_event("ras", "arm_event") ? 0 : 1) : enable);
#endif
#ifdef HAVE_DEVLINK
- rc |= __toggle_ras_mc_event(ras, "devlink", "devlink_health_report", enable);
+ rc |= __toggle_ras_mc_event(ras, "devlink", "devlink_health_report", enable > 0 ? (is_disabled_event("devlink", "devlink_health_report") ? 0 : 1) : enable);
#endif
#ifdef HAVE_DISKERROR
- rc |= __toggle_ras_mc_event(ras, "block", "block_rq_complete", enable);
+ rc |= __toggle_ras_mc_event(ras, "block", "block_rq_complete", enable > 0 ? (is_disabled_event("block", "block_rq_complete") ? 0 : 1) : enable);
#endif
#ifdef HAVE_MEMORY_FAILURE
- rc |= __toggle_ras_mc_event(ras, "ras", "memory_failure_event", enable);
+ rc |= __toggle_ras_mc_event(ras, "ras", "memory_failure_event", enable > 0 ? (is_disabled_event("ras", "memory_failure_event") ? 0 : 1) : enable);
#endif
+
free_ras:
free(ras);
return rc;
@@ -870,42 +889,62 @@ int handle_ras_events(int record_events)
ras_page_account_init();
#endif
- rc = add_event_handler(ras, pevent, page_size, "ras", "mc_event",
- ras_mc_event_handler, NULL, MC_EVENT);
- if (!rc)
- num_events++;
- else
- log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
- "ras", "mc_event");
+ if (is_disabled_event("ras", "mc_event")) {
+ log(ALL, LOG_INFO, "Disabled %s:%s tracing from config\n",
+ "ras", "mc_event");
+ } else {
+ rc = add_event_handler(ras, pevent, page_size, "ras", "mc_event",
+ ras_mc_event_handler, NULL, MC_EVENT);
+ if (!rc)
+ num_events++;
+ else
+ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
+ "ras", "mc_event");
+ }
#ifdef HAVE_AER
- rc = add_event_handler(ras, pevent, page_size, "ras", "aer_event",
- ras_aer_event_handler, NULL, AER_EVENT);
- if (!rc)
- num_events++;
- else
- log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
- "ras", "aer_event");
+ if (is_disabled_event("ras", "aer_event")) {
+ log(ALL, LOG_INFO, "Disabled %s:%s tracing from config\n",
+ "ras", "aer_event");
+ } else {
+ rc = add_event_handler(ras, pevent, page_size, "ras", "aer_event",
+ ras_aer_event_handler, NULL, AER_EVENT);
+ if (!rc)
+ num_events++;
+ else
+ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
+ "ras", "aer_event");
+ }
#endif
#ifdef HAVE_NON_STANDARD
- rc = add_event_handler(ras, pevent, page_size, "ras", "non_standard_event",
- ras_non_standard_event_handler, NULL, NON_STANDARD_EVENT);
- if (!rc)
- num_events++;
- else
- log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
- "ras", "non_standard_event");
+ if (is_disabled_event("ras", "non_standard_event")) {
+ log(ALL, LOG_INFO, "Disabled %s:%s tracing from config\n",
+ "ras", "non_standard_event");
+ } else {
+ rc = add_event_handler(ras, pevent, page_size, "ras", "non_standard_event",
+ ras_non_standard_event_handler, NULL, NON_STANDARD_EVENT);
+ if (!rc)
+ num_events++;
+ else
+ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
+ "ras", "non_standard_event");
+ }
#endif
#ifdef HAVE_ARM
- rc = add_event_handler(ras, pevent, page_size, "ras", "arm_event",
- ras_arm_event_handler, NULL, ARM_EVENT);
- if (!rc)
- num_events++;
- else
- log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
- "ras", "arm_event");
+ if (is_disabled_event("ras", "arm_event")) {
+ log(ALL, LOG_INFO, "Disabled %s:%s tracing from config\n",
+ "ras", "arm_event");
+ } else {
+ rc = add_event_handler(ras, pevent, page_size, "ras", "arm_event",
+ ras_arm_event_handler, NULL, ARM_EVENT);
+ if (!rc)
+ num_events++;
+ else
+ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
+ "ras", "arm_event");
+ }
#endif
cpus = get_num_cpus(ras);
@@ -915,72 +954,102 @@ int handle_ras_events(int record_events)
#endif
#ifdef HAVE_MCE
- rc = register_mce_handler(ras, cpus);
- if (rc)
- log(ALL, LOG_INFO, "Can't register mce handler\n");
- if (ras->mce_priv) {
- rc = add_event_handler(ras, pevent, page_size,
- "mce", "mce_record",
- ras_mce_event_handler, NULL, MCE_EVENT);
- if (!rc)
- num_events++;
- else
- log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
- "mce", "mce_record");
+ if (is_disabled_event("mce", "mce_record")) {
+ log(ALL, LOG_INFO, "Disabled %s:%s tracing from config\n",
+ "mce", "mce_record");
+ } else {
+ rc = register_mce_handler(ras, cpus);
+ if (rc)
+ log(ALL, LOG_INFO, "Can't register mce handler\n");
+ if (ras->mce_priv) {
+ rc = add_event_handler(ras, pevent, page_size,
+ "mce", "mce_record",
+ ras_mce_event_handler, NULL, MCE_EVENT);
+ if (!rc)
+ num_events++;
+ else
+ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
+ "mce", "mce_record");
+ }
}
#endif
#ifdef HAVE_EXTLOG
- rc = add_event_handler(ras, pevent, page_size, "ras", "extlog_mem_event",
- ras_extlog_mem_event_handler, NULL, EXTLOG_EVENT);
- if (!rc) {
- /* tell kernel we are listening, so don't printk to console */
- (void)open("/sys/kernel/debug/ras/daemon_active", 0);
- num_events++;
- } else
- log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
- "ras", "extlog_mem_event");
+ if (is_disabled_event("ras", "extlog_mem_event")) {
+ log(ALL, LOG_INFO, "Disabled %s:%s tracing from config\n",
+ "ras", "extlog_mem_event");
+ } else {
+ rc = add_event_handler(ras, pevent, page_size, "ras", "extlog_mem_event",
+ ras_extlog_mem_event_handler, NULL, EXTLOG_EVENT);
+ if (!rc) {
+ /* tell kernel we are listening, so don't printk to console */
+ (void)open("/sys/kernel/debug/ras/daemon_active", 0);
+ num_events++;
+ } else
+ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
+ "ras", "extlog_mem_event");
+ }
#endif
#ifdef HAVE_DEVLINK
- rc = add_event_handler(ras, pevent, page_size, "net",
- "net_dev_xmit_timeout",
- ras_net_xmit_timeout_handler, NULL, DEVLINK_EVENT);
- if (!rc)
- filter_str = "devlink/devlink_health_report:msg=~\'TX timeout*\'";
-
- rc = add_event_handler(ras, pevent, page_size, "devlink",
- "devlink_health_report",
- ras_devlink_event_handler, filter_str, DEVLINK_EVENT);
- if (!rc)
- num_events++;
- else
- log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
- "devlink", "devlink_health_report");
+ if (is_disabled_event("net", "net_dev_xmit_timeout")) {
+ log(ALL, LOG_INFO, "Disabled %s:%s tracing from config\n",
+ "net", "net_dev_xmit_timeout");
+ } else {
+ rc = add_event_handler(ras, pevent, page_size, "net",
+ "net_dev_xmit_timeout",
+ ras_net_xmit_timeout_handler, NULL, DEVLINK_EVENT);
+ if (!rc)
+ filter_str = "devlink/devlink_health_report:msg=~\'TX timeout*\'";
+ }
+ /* devlink_health_report is gated only by its own DISABLE entry, not by the net event's. */
+ if (is_disabled_event("devlink", "devlink_health_report")) {
+ log(ALL, LOG_INFO, "Disabled %s:%s tracing from config\n",
+ "devlink", "devlink_health_report");
+ } else {
+ rc = add_event_handler(ras, pevent, page_size, "devlink",
+ "devlink_health_report",
+ ras_devlink_event_handler, filter_str, DEVLINK_EVENT);
+ if (!rc)
+ num_events++;
+ else
+ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
+ "devlink", "devlink_health_report");
+ }
#endif
#ifdef HAVE_DISKERROR
- rc = filter_ras_mc_event(ras, "block", "block_rq_complete", "error != 0");
- if (!rc) {
- rc = add_event_handler(ras, pevent, page_size, "block",
- "block_rq_complete", ras_diskerror_event_handler,
- NULL, DISKERROR_EVENT);
- if (!rc)
- num_events++;
- else
- log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
- "block", "block_rq_complete");
+ if (is_disabled_event("block", "block_rq_complete")) {
+ log(ALL, LOG_INFO, "Disabled %s:%s tracing from config\n",
+ "block", "block_rq_complete");
+ } else {
+ rc = filter_ras_mc_event(ras, "block", "block_rq_complete", "error != 0");
+ if (!rc) {
+ rc = add_event_handler(ras, pevent, page_size, "block",
+ "block_rq_complete", ras_diskerror_event_handler,
+ NULL, DISKERROR_EVENT);
+ if (!rc)
+ num_events++;
+ else
+ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
+ "block", "block_rq_complete");
+ }
}
#endif
#ifdef HAVE_MEMORY_FAILURE
- rc = add_event_handler(ras, pevent, page_size, "ras", "memory_failure_event",
- ras_memory_failure_event_handler, NULL, MF_EVENT);
- if (!rc)
- num_events++;
- else
- log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
- "ras", "memory_failure_event");
+ if (is_disabled_event("ras", "memory_failure_event")) {
+ log(ALL, LOG_INFO, "Disabled %s:%s tracing from config\n",
+ "ras", "memory_failure_event");
+ } else {
+ rc = add_event_handler(ras, pevent, page_size, "ras", "memory_failure_event",
+ ras_memory_failure_event_handler, NULL, MF_EVENT);
+ if (!rc)
+ num_events++;
+ else
+ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
+ "ras", "memory_failure_event");
+ }
#endif
if (!num_events) {
diff --git a/rasdaemon.c b/rasdaemon.c
index 66f4dea..0437662 100644
--- a/rasdaemon.c
+++ b/rasdaemon.c
@@ -25,6 +25,7 @@
#include "ras-record.h"
#include "ras-logger.h"
#include "ras-events.h"
+#include "ras-disabled-events.h"
/*
* Arguments(argp) handling logic and main
@@ -34,6 +35,9 @@
#define TOOL_DESCRIPTION "RAS daemon to log the RAS events."
#define ARGS_DOC "<options>"
+char choices_disable[MAX_DISABLED_TRACEPOINTS_NUM][MAX_TRACEPOINTS_STR_LENGTH];
+int disabled_tracepoints_num;
+
const char *argp_program_version = TOOL_NAME " " VERSION;
const char *argp_program_bug_address = "Mauro Carvalho Chehab <mchehab@kernel.org>";
@@ -43,6 +47,36 @@ struct arguments {
int foreground;
};
+static void parse_disabled_choices() {
+ char disabled_tracepoints_str[MAX_DISABLED_TRACEPOINTS_STR_LENGTH];
+ const char* sep = ";";
+ char* tracepoint_str;
+ char* config_disabled_tracepoints = getenv(DISABLE); /* DISABLE env var: ';'-separated "group:event" names */
+ if (config_disabled_tracepoints == NULL) {
+ return; /* variable unset: disabled_tracepoints_num is left untouched */
+ }
+
+ if (strlen(config_disabled_tracepoints) >= MAX_DISABLED_TRACEPOINTS_STR_LENGTH) {
+ log(ALL, LOG_WARNING, "Failed to read disabled events config string, length exceeds %d characters.\n", MAX_DISABLED_TRACEPOINTS_STR_LENGTH);
+ return;
+ }
+ strcpy(disabled_tracepoints_str, config_disabled_tracepoints);
+
+ tracepoint_str = strtok(disabled_tracepoints_str, sep);
+ int index = 0;
+
+ while(tracepoint_str != NULL && index < MAX_DISABLED_TRACEPOINTS_NUM) {
+ if (strlen(tracepoint_str) >= MAX_TRACEPOINTS_STR_LENGTH) {
+ log(ALL, LOG_WARNING, "Failed to read disabled events config item %s string, length exceeds %d characters, skipped.\n", tracepoint_str, MAX_TRACEPOINTS_STR_LENGTH);
+ }
+ else {
+ strcpy(choices_disable[index++], tracepoint_str); /* fits: item length was checked against the row size just above */
+ }
+ tracepoint_str = strtok(NULL, sep); /* NULL continues scanning the same buffer */
+ }
+ disabled_tracepoints_num = index; /* rows of choices_disable[] now in use; read by is_disabled_event() */
+}
+
static error_t parse_opt(int k, char *arg, struct argp_state *state)
{
struct arguments *args = state->input;
@@ -102,6 +136,8 @@ int main(int argc, char *argv[])
return -1;
}
+ parse_disabled_choices();
+
if (args.enable_ras) {
int enable;
--
2.33.0

View File

@ -0,0 +1,57 @@
From 83f7052a8d8c9641809611d9485256d8ed843c31 Mon Sep 17 00:00:00 2001
From: caixiaomeng 00662745 <caixiaomeng2@huawei.com>
Date: Wed, 6 Mar 2024 14:21:41 +0800
Subject: [PATCH] huawei-fix-rasdaemon-print-loading-config-logs-multi
---
rasdaemon.c | 12 +++++++-----
1 file changed, 7 insertions(+), 5 deletions(-)
diff --git a/rasdaemon.c b/rasdaemon.c
index 0437662..7ece6c1 100644
--- a/rasdaemon.c
+++ b/rasdaemon.c
@@ -47,7 +47,7 @@ struct arguments {
int foreground;
};
-static void parse_disabled_choices() {
+static void parse_disabled_choices(int enable_ras) {
char disabled_tracepoints_str[MAX_DISABLED_TRACEPOINTS_STR_LENGTH];
const char* sep = ";";
char* tracepoint_str;
@@ -57,16 +57,18 @@ static void parse_disabled_choices() {
}
if (strlen(config_disabled_tracepoints) >= MAX_DISABLED_TRACEPOINTS_STR_LENGTH) {
- log(ALL, LOG_WARNING, "Failed to read disabled events config string, length exceeds %d characters.\n", MAX_DISABLED_TRACEPOINTS_STR_LENGTH);
+ if (enable_ras) {
+ log(ALL, LOG_WARNING, "Failed to read disabled events config string, length exceeds %d characters.\n", MAX_DISABLED_TRACEPOINTS_STR_LENGTH);
+ }
return;
}
strcpy(disabled_tracepoints_str, config_disabled_tracepoints);
-
+
tracepoint_str = strtok(disabled_tracepoints_str, sep);
int index = 0;
while(tracepoint_str != NULL && index < MAX_DISABLED_TRACEPOINTS_NUM) {
- if (strlen(tracepoint_str) >= MAX_TRACEPOINTS_STR_LENGTH) {
+ if (enable_ras && strlen(tracepoint_str) >= MAX_TRACEPOINTS_STR_LENGTH) {
log(ALL, LOG_WARNING, "Failed to read disabled events config item %s string, length exceeds %d characters, skipped.\n", tracepoint_str, MAX_TRACEPOINTS_STR_LENGTH);
}
else {
@@ -136,7 +138,7 @@ int main(int argc, char *argv[])
return -1;
}
- parse_disabled_choices();
+ parse_disabled_choices(args.enable_ras);
if (args.enable_ras) {
int enable;
--
2.33.0

View File

@ -1,6 +1,6 @@
Name: rasdaemon
Version: 0.6.7
Release: 18
Release: 19
License: GPLv2
Summary: Utility to get Platform Reliability, Availability and Serviceability (RAS) reports via the Kernel tracing events
URL: https://github.com/mchehab/rasdaemon.git
@ -56,6 +56,8 @@ Patch9005: 0003-rasdaemon-Add-support-for-creating-the-vendor-error-.patch
Patch9006: 0004-rasdaemon-Add-four-modules-supported-by-HiSilicon-co.patch
Patch9007: fix-ras-events-quit-loop-in-read_ras_event-when-kbuf-dat.patch
Patch9008: 0001-rasdaemon-ras-mc-ctl-Modify-check-for-HiSilicon-KunP.patch
Patch9009: add-dynamic-switch-of-ras-events-support-and-disable-block-rq-complete.patch
Patch9010: fix-rasdaemon-print-loading-config-logs-multiple-times.patch
%description
The rasdaemon program is a daemon which monitors the platform
@ -109,6 +111,12 @@ if [ $1 -eq 0 ] ; then
fi
%changelog
* Mon Apr 8 2024 caixiaomeng <caixiaomeng2@huawei.com> - 0.6.7-19
- Type:bugfix
- ID:NA
- SUG:NA
- DESC:add-dynamic-switch-of-ras-events-support-and-disable-block-rq-complete
* Mon Mar 25 2024 zhangruifang <zhangruifang@h-partners.com> - 0.6.7-18
- Type:bugfix
- ID:NA