gala-anteater/Add-disk-throughput-detector.patch
Zhen Chen 1f46c219ad Model optimization and bugfix
- fix str2enum bug & data query refactor
- add systemd service for anteater
- remove 'sys-level' config param
- add Chinese descriptions
- Update TCP Establish Model & Add Nic Loss Detector
- Add disk throughput detector

(cherry picked from commit f3c17e8c6a619a7803afd89b945ae3f36d17f9b0)
2023-01-17 22:40:46 +08:00

479 lines
17 KiB
Diff

From ac1383471f72420e3320eb7c7999021f3658fb7d Mon Sep 17 00:00:00 2001
From: lizhenxing11 <lizhenxing11@huawei.com>
Date: Wed, 7 Dec 2022 16:59:15 +0800
Subject: [PATCH] Add disk throughput detector
add keywords
extract cause metric to the attributes
update template
---
anteater/config.py | 3 -
anteater/core/kpi.py | 1 +
anteater/main.py | 2 +
anteater/model/algorithms/three_sigma.py | 2 +-
anteater/module/base.py | 6 +-
anteater/module/sys/disk_throughput.py | 62 +++++++++++++
anteater/module/sys/proc_io_latency.py | 4 +-
anteater/source/anomaly_report.py | 3 +-
anteater/template/app_anomaly_template.py | 4 +-
anteater/template/sys_anomaly_template.py | 4 +-
anteater/template/template.py | 3 +-
anteater/utils/data_load.py | 2 +
config/module/app_sli_rtt.json | 3 +
config/module/disk_throughput.json | 92 +++++++++++++++++++
config/module/proc_io_latency.json | 3 +
config/module/sys_io_latency.json | 3 +
config/module/sys_tcp_establish.json | 3 +
.../module/sys_tcp_transmission_latency.json | 3 +
.../sys_tcp_transmission_throughput.json | 3 +
19 files changed, 193 insertions(+), 13 deletions(-)
create mode 100644 anteater/module/sys/disk_throughput.py
create mode 100644 config/module/disk_throughput.json
diff --git a/anteater/config.py b/anteater/config.py
index ea02702..e9ab557 100644
--- a/anteater/config.py
+++ b/anteater/config.py
@@ -81,9 +81,6 @@ class AnteaterConf:
"""Loads config from yaml file"""
data_path = os.path.realpath(data_path)
- if not os.path.exists(data_path):
- os.makedirs(data_path)
-
try:
with open(os.path.join(data_path, "config", self.filename), "rb") as f:
result = yaml.safe_load(f)
diff --git a/anteater/core/kpi.py b/anteater/core/kpi.py
index 5a9d8ab..3480139 100644
--- a/anteater/core/kpi.py
+++ b/anteater/core/kpi.py
@@ -48,6 +48,7 @@ class ModelConfig:
class JobConfig:
name: str
job_type: str
+ keywords: List[str]
root_cause_number: int
kpis: List[KPI]
features: List[Feature]
diff --git a/anteater/main.py b/anteater/main.py
index 11e0409..ba7be70 100644
--- a/anteater/main.py
+++ b/anteater/main.py
@@ -21,6 +21,7 @@ from apscheduler.schedulers.blocking import BlockingScheduler
from anteater.anomaly_detection import AnomalyDetection
from anteater.config import AnteaterConf
from anteater.module.app.app_sli_detector import APPSliDetector
+from anteater.module.sys.disk_throughput import DiskThroughputDetector
from anteater.module.sys.proc_io_latency import ProcIOLatencyDetector
from anteater.module.sys.sys_io_latency import SysIOLatencyDetector
from anteater.module.sys.tcp_establish import SysTcpEstablishDetector
@@ -57,6 +58,7 @@ def main():
SysTcpTransmissionLatencyDetector(loader, report),
SysIOLatencyDetector(loader, report),
ProcIOLatencyDetector(loader, report),
+ DiskThroughputDetector(loader, report),
]
else:
detectors = [
diff --git a/anteater/model/algorithms/three_sigma.py b/anteater/model/algorithms/three_sigma.py
index 457b606..49b9952 100644
--- a/anteater/model/algorithms/three_sigma.py
+++ b/anteater/model/algorithms/three_sigma.py
@@ -31,7 +31,7 @@ def three_sigma(values, obs_size, n=3, method="abs"):
elif method == 'min':
outlier = [val for val in obs_val if val < mean - n * std]
elif method == 'max':
- outlier = [val for val in obs_val if val > mean + 3 * std]
+ outlier = [val for val in obs_val if val > mean + n * std]
else:
raise ValueError(f'Unknown method {method}')
diff --git a/anteater/module/base.py b/anteater/module/base.py
index 7b5fc84..63436ac 100644
--- a/anteater/module/base.py
+++ b/anteater/module/base.py
@@ -48,14 +48,14 @@ class E2EDetector:
for detector in self.detectors:
anomalies = detector.execute(self.job_config)
for anomaly in anomalies:
- self.report(anomaly)
+ self.report(anomaly, self.job_config.keywords)
@abstractmethod
def parse_cause_metrics(self, anomaly: Anomaly) -> List[Dict]:
"""Parses the cause metrics into the specific formats"""
pass
- def report(self, anomaly: Anomaly):
+ def report(self, anomaly: Anomaly, keywords):
"""Parses the anomaly into a specific formats
based on the template and reports parsed results
"""
@@ -63,4 +63,4 @@ class E2EDetector:
timestamp = dt.utc_now()
template = self.template(timestamp, anomaly.machine_id,
anomaly.metric, anomaly.entity_name)
- self.reporter.sent_anomaly(anomaly, cause_metrics, template)
+ self.reporter.sent_anomaly(anomaly, cause_metrics, keywords, template)
diff --git a/anteater/module/sys/disk_throughput.py b/anteater/module/sys/disk_throughput.py
new file mode 100644
index 0000000..9a192fb
--- /dev/null
+++ b/anteater/module/sys/disk_throughput.py
@@ -0,0 +1,62 @@
+#!/usr/bin/python3
+# ******************************************************************************
+# Copyright (c) 2022 Huawei Technologies Co., Ltd.
+# gala-anteater is licensed under Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
+# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
+# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+# See the Mulan PSL v2 for more details.
+# ******************************************************************************/
+
+from typing import List, Dict
+
+from anteater.core.anomaly import Anomaly
+from anteater.module.base import E2EDetector
+from anteater.model.detector.online_vae_detector import OnlineVAEDetector
+from anteater.model.detector.n_sigma_detector import NSigmaDetector
+from anteater.source.anomaly_report import AnomalyReport
+from anteater.source.metric_loader import MetricLoader
+from anteater.template.sys_anomaly_template import SysAnomalyTemplate
+
+
+class DiskThroughputDetector(E2EDetector):
+ """Disk throughput e2e detector which detects the disk read or write
+ await time performance deterioration
+ """
+
+ config_file = 'disk_throughput.json'
+
+ def __init__(self, data_loader: MetricLoader, reporter: AnomalyReport):
+ """The disk throughput e2e detector initializer"""
+ super().__init__(reporter, SysAnomalyTemplate)
+
+ self.detectors = self.init_detectors(data_loader)
+
+ def init_detectors(self, data_loader):
+ if self.job_config.model_config.enable:
+ detectors = [
+ NSigmaDetector(data_loader, method='max'),
+ OnlineVAEDetector(data_loader, self.job_config.model_config)
+ ]
+ else:
+ detectors = [
+ NSigmaDetector(data_loader, method='max')
+ ]
+
+ return detectors
+
+ def parse_cause_metrics(self, anomaly: Anomaly) -> List[Dict]:
+ """Parses the cause metrics into the specific formats"""
+ cause_metrics = [
+ {
+ 'metric': cause.ts.metric,
+ 'labels': cause.ts.labels,
+ 'score': cause.score,
+ 'description': cause.description.format(
+ cause.ts.labels.get('disk_name', ''))}
+ for cause in anomaly.root_causes]
+
+ return cause_metrics
diff --git a/anteater/module/sys/proc_io_latency.py b/anteater/module/sys/proc_io_latency.py
index 94fd05d..43e069f 100644
--- a/anteater/module/sys/proc_io_latency.py
+++ b/anteater/module/sys/proc_io_latency.py
@@ -38,12 +38,12 @@ class ProcIOLatencyDetector(E2EDetector):
def init_detectors(self, data_loader):
if self.job_config.model_config.enable:
detectors = [
- NSigmaDetector(data_loader, method='min'),
+ NSigmaDetector(data_loader, method='abs'),
OnlineVAEDetector(data_loader, self.job_config.model_config)
]
else:
detectors = [
- NSigmaDetector(data_loader, method='min')
+ NSigmaDetector(data_loader, method='abs')
]
return detectors
diff --git a/anteater/source/anomaly_report.py b/anteater/source/anomaly_report.py
index b226763..3d3bb09 100644
--- a/anteater/source/anomaly_report.py
+++ b/anteater/source/anomaly_report.py
@@ -42,7 +42,7 @@ class AnomalyReport:
return keys
- def sent_anomaly(self, anomaly: Anomaly, cause_metrics: List, template: Template):
+ def sent_anomaly(self, anomaly: Anomaly, cause_metrics: List, keywords: List[str], template: Template):
keys = self.get_keys(template.entity_name)
machine_id = template.machine_id
entity_name = template.entity_name
@@ -54,6 +54,7 @@ class AnomalyReport:
template.keys = keys
template.description = anomaly.description
template.cause_metrics = cause_metrics
+ template.keywords = keywords
msg = template.get_template()
self.provider.send_message(msg)
diff --git a/anteater/template/app_anomaly_template.py b/anteater/template/app_anomaly_template.py
index 5b8caf8..a509c96 100644
--- a/anteater/template/app_anomaly_template.py
+++ b/anteater/template/app_anomaly_template.py
@@ -31,7 +31,9 @@ class AppAnomalyTemplate(Template):
'entity_id': self.entity_id,
'event_id': f'{timestamp}_{self.entity_id}',
'event_type': 'app',
- 'event_source': 'gala-anteater'
+ 'event_source': 'gala-anteater',
+ 'keywords': self.keywords,
+ 'cause_metric': self.cause_metrics[0] if self.cause_metrics else {'description': 'Unknown'}
},
'Resource': {
'metric': self.metric,
diff --git a/anteater/template/sys_anomaly_template.py b/anteater/template/sys_anomaly_template.py
index 1083fb3..4ac6abb 100644
--- a/anteater/template/sys_anomaly_template.py
+++ b/anteater/template/sys_anomaly_template.py
@@ -31,7 +31,9 @@ class SysAnomalyTemplate(Template):
'entity_id': self.entity_id,
'event_id': f'{timestamp}_{self.entity_id}',
'event_type': 'sys',
- 'event_source': 'gala-anteater'
+ 'event_source': 'gala-anteater',
+ 'keywords': self.keywords,
+ 'cause_metric': self.cause_metrics[0] if self.cause_metrics else {'description': 'Unknown'}
},
'Resource': {
'metric': self.metric,
diff --git a/anteater/template/template.py b/anteater/template/template.py
index 9e4461a..794c121 100644
--- a/anteater/template/template.py
+++ b/anteater/template/template.py
@@ -26,7 +26,8 @@ class Template:
self.labels = {}
self.entity_id = ""
self.description = ""
- self.cause_metrics = {}
+ self.cause_metrics = []
+ self.keywords = []
@abstractmethod
def get_template(self):
diff --git a/anteater/utils/data_load.py b/anteater/utils/data_load.py
index 6ac92c7..b6991c6 100644
--- a/anteater/utils/data_load.py
+++ b/anteater/utils/data_load.py
@@ -45,6 +45,7 @@ def load_job_config(file_name) -> JobConfig:
name = config['name']
job_type = config['job_type']
+ keywords = config['keywords']
root_cause_number = config['root_cause_number']
kpis = [KPI(**_conf) for _conf in config['KPI']]
features = [Feature(**_conf) for _conf in config['Features']]
@@ -74,6 +75,7 @@ def load_job_config(file_name) -> JobConfig:
return JobConfig(
name=name,
job_type=job_type,
+ keywords=keywords,
root_cause_number=root_cause_number,
kpis=kpis,
features=features,
diff --git a/config/module/app_sli_rtt.json b/config/module/app_sli_rtt.json
index 7c05094..db29392 100644
--- a/config/module/app_sli_rtt.json
+++ b/config/module/app_sli_rtt.json
@@ -1,6 +1,9 @@
{
"name": "app_sli_rtt",
"job_type": "app",
+ "keywords": [
+ "app"
+ ],
"root_cause_number": 20,
"KPI": [
{
diff --git a/config/module/disk_throughput.json b/config/module/disk_throughput.json
new file mode 100644
index 0000000..00276c0
--- /dev/null
+++ b/config/module/disk_throughput.json
@@ -0,0 +1,92 @@
+{
+ "name": "disk_throughput",
+ "job_type": "sys",
+ "keywords": [
+ "disk"
+ ],
+ "root_cause_number": 1,
+ "KPI": [
+ {
+ "metric": "gala_gopher_disk_r_await",
+ "kpi_type": "",
+ "entity_name": "disk",
+ "enable": true,
+ "description": "Disk read await time is increasing!",
+ "params": {
+ "look_back": 20,
+ "obs_size": 25,
+ "outlier_ratio_th": 0.3,
+ "smooth_params": {
+ "method": "conv_smooth",
+ "box_pts": 3
+ }
+ }
+ },
+ {
+ "metric": "gala_gopher_disk_w_await",
+ "kpi_type": "",
+ "entity_name": "disk",
+ "enable": true,
+ "description": "Disk write await time is increasing!",
+ "params": {
+ "look_back": 20,
+ "obs_size": 25,
+ "outlier_ratio_th": 0.3,
+ "smooth_params": {
+ "method": "conv_smooth",
+ "box_pts": 3
+ }
+ }
+ }
+ ],
+ "OnlineModel": {
+ "name": "online_vae_model",
+ "enable": false,
+ "params": {
+ "th": 0.5,
+ "max_error_rate": 0.7,
+ "min_retrain_hours": 24,
+ "min_predict_minutes": 20,
+ "norm": {},
+ "vae": {
+ "hidden_sizes": [25, 10, 5],
+ "latent_size": 5,
+ "dropout_rate": 0.25,
+ "batch_size": 1024,
+ "num_epochs": 30,
+ "learning_rate": 0.001,
+ "k": 120,
+ "step_size": 60,
+ "num_eval_samples": 10
+ },
+ "calibrate": {},
+ "threshold": {}
+ }
+ },
+ "Features": [
+ {
+ "metric": "gala_gopher_disk_rspeed_kB",
+ "priority": 0,
+ "description": "The disk I/O await time performance deteriorates due to read throughput rise (read kbytes/second).(Disk = {})",
+ "atrend": "rise"
+ },
+ {
+ "metric": "gala_gopher_disk_wspeed_kB",
+ "priority": 0,
+ "description": "The disk I/O await time performance deteriorates due to write throughput rise (write kbytes/second).(Disk = {})",
+ "atrend": "rise"
+ },
+ {
+ "metric": "gala_gopher_disk_rareq",
+ "priority": 0,
+ "description": "The disk I/O await time performance deteriorates due to read saturation rise.(Disk = {})",
+ "atrend": "rise"
+ },
+ {
+ "metric": "gala_gopher_disk_wareq",
+ "priority": 0,
+ "description": "The disk I/O await time performance deteriorates due to write saturation rise.(Disk = {})",
+ "atrend": "rise"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/config/module/proc_io_latency.json b/config/module/proc_io_latency.json
index c45b7df..c6c03c1 100644
--- a/config/module/proc_io_latency.json
+++ b/config/module/proc_io_latency.json
@@ -1,6 +1,9 @@
{
"name": "proc_io_latency",
"job_type": "sys",
+ "keywords": [
+ "process"
+ ],
"root_cause_number": 3,
"KPI": [
{
diff --git a/config/module/sys_io_latency.json b/config/module/sys_io_latency.json
index e92dd4c..e58990d 100644
--- a/config/module/sys_io_latency.json
+++ b/config/module/sys_io_latency.json
@@ -1,6 +1,9 @@
{
"name": "sys_io_latency",
"job_type": "sys",
+ "keywords": [
+ "block"
+ ],
"root_cause_number": 3,
"KPI": [
{
diff --git a/config/module/sys_tcp_establish.json b/config/module/sys_tcp_establish.json
index b6f8eb4..61ae72d 100644
--- a/config/module/sys_tcp_establish.json
+++ b/config/module/sys_tcp_establish.json
@@ -1,6 +1,9 @@
{
"name": "sys_tcp_establish",
"job_type": "sys",
+ "keywords": [
+ "tcp"
+ ],
"root_cause_number": 3,
"KPI": [
{
diff --git a/config/module/sys_tcp_transmission_latency.json b/config/module/sys_tcp_transmission_latency.json
index 4927d8e..d9e7f80 100644
--- a/config/module/sys_tcp_transmission_latency.json
+++ b/config/module/sys_tcp_transmission_latency.json
@@ -1,6 +1,9 @@
{
"name": "sys_tcp_transmission_latency",
"job_type": "sys",
+ "keywords": [
+ "tcp"
+ ],
"root_cause_number": 3,
"KPI": [
{
diff --git a/config/module/sys_tcp_transmission_throughput.json b/config/module/sys_tcp_transmission_throughput.json
index 060f640..28ee784 100644
--- a/config/module/sys_tcp_transmission_throughput.json
+++ b/config/module/sys_tcp_transmission_throughput.json
@@ -1,6 +1,9 @@
{
"name": "sys_tcp_transmission_throughput",
"job_type": "sys",
+ "keywords": [
+ "net"
+ ],
"root_cause_number": 3,
"KPI": [
{
--
2.33.0