- fix str2enum bug & data query refactor - add systemd service for anteater - remove 'sys-level' config param - add Chinese descriptions - Update TCP Establish Model & Add NIC Loss Detector - Add disk throughput detector (cherry picked from commit f3c17e8c6a619a7803afd89b945ae3f36d17f9b0)
479 lines
17 KiB
Diff
479 lines
17 KiB
Diff
From ac1383471f72420e3320eb7c7999021f3658fb7d Mon Sep 17 00:00:00 2001
|
|
From: lizhenxing11 <lizhenxing11@huawei.com>
|
|
Date: Wed, 7 Dec 2022 16:59:15 +0800
|
|
Subject: [PATCH] Add disk throughput detector
|
|
|
|
add keywords
|
|
|
|
extract cause metric to the attributes
|
|
|
|
update template
|
|
---
|
|
anteater/config.py | 3 -
|
|
anteater/core/kpi.py | 1 +
|
|
anteater/main.py | 2 +
|
|
anteater/model/algorithms/three_sigma.py | 2 +-
|
|
anteater/module/base.py | 6 +-
|
|
anteater/module/sys/disk_throughput.py | 62 +++++++++++++
|
|
anteater/module/sys/proc_io_latency.py | 4 +-
|
|
anteater/source/anomaly_report.py | 3 +-
|
|
anteater/template/app_anomaly_template.py | 4 +-
|
|
anteater/template/sys_anomaly_template.py | 4 +-
|
|
anteater/template/template.py | 3 +-
|
|
anteater/utils/data_load.py | 2 +
|
|
config/module/app_sli_rtt.json | 3 +
|
|
config/module/disk_throughput.json | 92 +++++++++++++++++++
|
|
config/module/proc_io_latency.json | 3 +
|
|
config/module/sys_io_latency.json | 3 +
|
|
config/module/sys_tcp_establish.json | 3 +
|
|
.../module/sys_tcp_transmission_latency.json | 3 +
|
|
.../sys_tcp_transmission_throughput.json | 3 +
|
|
19 files changed, 193 insertions(+), 13 deletions(-)
|
|
create mode 100644 anteater/module/sys/disk_throughput.py
|
|
create mode 100644 config/module/disk_throughput.json
|
|
|
|
diff --git a/anteater/config.py b/anteater/config.py
|
|
index ea02702..e9ab557 100644
|
|
--- a/anteater/config.py
|
|
+++ b/anteater/config.py
|
|
@@ -81,9 +81,6 @@ class AnteaterConf:
|
|
"""Loads config from yaml file"""
|
|
data_path = os.path.realpath(data_path)
|
|
|
|
- if not os.path.exists(data_path):
|
|
- os.makedirs(data_path)
|
|
-
|
|
try:
|
|
with open(os.path.join(data_path, "config", self.filename), "rb") as f:
|
|
result = yaml.safe_load(f)
|
|
diff --git a/anteater/core/kpi.py b/anteater/core/kpi.py
|
|
index 5a9d8ab..3480139 100644
|
|
--- a/anteater/core/kpi.py
|
|
+++ b/anteater/core/kpi.py
|
|
@@ -48,6 +48,7 @@ class ModelConfig:
|
|
class JobConfig:
|
|
name: str
|
|
job_type: str
|
|
+ keywords: List[str]
|
|
root_cause_number: int
|
|
kpis: List[KPI]
|
|
features: List[Feature]
|
|
diff --git a/anteater/main.py b/anteater/main.py
|
|
index 11e0409..ba7be70 100644
|
|
--- a/anteater/main.py
|
|
+++ b/anteater/main.py
|
|
@@ -21,6 +21,7 @@ from apscheduler.schedulers.blocking import BlockingScheduler
|
|
from anteater.anomaly_detection import AnomalyDetection
|
|
from anteater.config import AnteaterConf
|
|
from anteater.module.app.app_sli_detector import APPSliDetector
|
|
+from anteater.module.sys.disk_throughput import DiskThroughputDetector
|
|
from anteater.module.sys.proc_io_latency import ProcIOLatencyDetector
|
|
from anteater.module.sys.sys_io_latency import SysIOLatencyDetector
|
|
from anteater.module.sys.tcp_establish import SysTcpEstablishDetector
|
|
@@ -57,6 +58,7 @@ def main():
|
|
SysTcpTransmissionLatencyDetector(loader, report),
|
|
SysIOLatencyDetector(loader, report),
|
|
ProcIOLatencyDetector(loader, report),
|
|
+ DiskThroughputDetector(loader, report),
|
|
]
|
|
else:
|
|
detectors = [
|
|
diff --git a/anteater/model/algorithms/three_sigma.py b/anteater/model/algorithms/three_sigma.py
|
|
index 457b606..49b9952 100644
|
|
--- a/anteater/model/algorithms/three_sigma.py
|
|
+++ b/anteater/model/algorithms/three_sigma.py
|
|
@@ -31,7 +31,7 @@ def three_sigma(values, obs_size, n=3, method="abs"):
|
|
elif method == 'min':
|
|
outlier = [val for val in obs_val if val < mean - n * std]
|
|
elif method == 'max':
|
|
- outlier = [val for val in obs_val if val > mean + 3 * std]
|
|
+ outlier = [val for val in obs_val if val > mean + n * std]
|
|
else:
|
|
raise ValueError(f'Unknown method {method}')
|
|
|
|
diff --git a/anteater/module/base.py b/anteater/module/base.py
|
|
index 7b5fc84..63436ac 100644
|
|
--- a/anteater/module/base.py
|
|
+++ b/anteater/module/base.py
|
|
@@ -48,14 +48,14 @@ class E2EDetector:
|
|
for detector in self.detectors:
|
|
anomalies = detector.execute(self.job_config)
|
|
for anomaly in anomalies:
|
|
- self.report(anomaly)
|
|
+ self.report(anomaly, self.job_config.keywords)
|
|
|
|
@abstractmethod
|
|
def parse_cause_metrics(self, anomaly: Anomaly) -> List[Dict]:
|
|
"""Parses the cause metrics into the specific formats"""
|
|
pass
|
|
|
|
- def report(self, anomaly: Anomaly):
|
|
+ def report(self, anomaly: Anomaly, keywords):
|
|
"""Parses the anomaly into a specific formats
|
|
based on the template and reports parsed results
|
|
"""
|
|
@@ -63,4 +63,4 @@ class E2EDetector:
|
|
timestamp = dt.utc_now()
|
|
template = self.template(timestamp, anomaly.machine_id,
|
|
anomaly.metric, anomaly.entity_name)
|
|
- self.reporter.sent_anomaly(anomaly, cause_metrics, template)
|
|
+ self.reporter.sent_anomaly(anomaly, cause_metrics, keywords, template)
|
|
diff --git a/anteater/module/sys/disk_throughput.py b/anteater/module/sys/disk_throughput.py
|
|
new file mode 100644
|
|
index 0000000..9a192fb
|
|
--- /dev/null
|
|
+++ b/anteater/module/sys/disk_throughput.py
|
|
@@ -0,0 +1,62 @@
|
|
+#!/usr/bin/python3
|
|
+# ******************************************************************************
|
|
+# Copyright (c) 2022 Huawei Technologies Co., Ltd.
|
|
+# gala-anteater is licensed under Mulan PSL v2.
|
|
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
+# You may obtain a copy of Mulan PSL v2 at:
|
|
+# http://license.coscl.org.cn/MulanPSL2
|
|
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
+# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
+# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
+# See the Mulan PSL v2 for more details.
|
|
+# ******************************************************************************/
|
|
+
|
|
+from typing import List, Dict
|
|
+
|
|
+from anteater.core.anomaly import Anomaly
|
|
+from anteater.module.base import E2EDetector
|
|
+from anteater.model.detector.online_vae_detector import OnlineVAEDetector
|
|
+from anteater.model.detector.n_sigma_detector import NSigmaDetector
|
|
+from anteater.source.anomaly_report import AnomalyReport
|
|
+from anteater.source.metric_loader import MetricLoader
|
|
+from anteater.template.sys_anomaly_template import SysAnomalyTemplate
|
|
+
|
|
+
|
|
+class DiskThroughputDetector(E2EDetector):
|
|
+ """Disk throughput e2e detector which detects the disk read or write
|
|
+ await time performance deterioration
|
|
+ """
|
|
+
|
|
+ config_file = 'disk_throughput.json'
|
|
+
|
|
+ def __init__(self, data_loader: MetricLoader, reporter: AnomalyReport):
|
|
+ """The disk throughput e2e detector initializer"""
|
|
+ super().__init__(reporter, SysAnomalyTemplate)
|
|
+
|
|
+ self.detectors = self.init_detectors(data_loader)
|
|
+
|
|
+ def init_detectors(self, data_loader):
|
|
+ if self.job_config.model_config.enable:
|
|
+ detectors = [
|
|
+ NSigmaDetector(data_loader, method='max'),
|
|
+ OnlineVAEDetector(data_loader, self.job_config.model_config)
|
|
+ ]
|
|
+ else:
|
|
+ detectors = [
|
|
+ NSigmaDetector(data_loader, method='max')
|
|
+ ]
|
|
+
|
|
+ return detectors
|
|
+
|
|
+ def parse_cause_metrics(self, anomaly: Anomaly) -> List[Dict]:
|
|
+ """Parses the cause metrics into the specific formats"""
|
|
+ cause_metrics = [
|
|
+ {
|
|
+ 'metric': cause.ts.metric,
|
|
+ 'labels': cause.ts.labels,
|
|
+ 'score': cause.score,
|
|
+ 'description': cause.description.format(
|
|
+ cause.ts.labels.get('disk_name', ''))}
|
|
+ for cause in anomaly.root_causes]
|
|
+
|
|
+ return cause_metrics
|
|
diff --git a/anteater/module/sys/proc_io_latency.py b/anteater/module/sys/proc_io_latency.py
|
|
index 94fd05d..43e069f 100644
|
|
--- a/anteater/module/sys/proc_io_latency.py
|
|
+++ b/anteater/module/sys/proc_io_latency.py
|
|
@@ -38,12 +38,12 @@ class ProcIOLatencyDetector(E2EDetector):
|
|
def init_detectors(self, data_loader):
|
|
if self.job_config.model_config.enable:
|
|
detectors = [
|
|
- NSigmaDetector(data_loader, method='min'),
|
|
+ NSigmaDetector(data_loader, method='abs'),
|
|
OnlineVAEDetector(data_loader, self.job_config.model_config)
|
|
]
|
|
else:
|
|
detectors = [
|
|
- NSigmaDetector(data_loader, method='min')
|
|
+ NSigmaDetector(data_loader, method='abs')
|
|
]
|
|
|
|
return detectors
|
|
diff --git a/anteater/source/anomaly_report.py b/anteater/source/anomaly_report.py
|
|
index b226763..3d3bb09 100644
|
|
--- a/anteater/source/anomaly_report.py
|
|
+++ b/anteater/source/anomaly_report.py
|
|
@@ -42,7 +42,7 @@ class AnomalyReport:
|
|
|
|
return keys
|
|
|
|
- def sent_anomaly(self, anomaly: Anomaly, cause_metrics: List, template: Template):
|
|
+ def sent_anomaly(self, anomaly: Anomaly, cause_metrics: List, keywords: List[str], template: Template):
|
|
keys = self.get_keys(template.entity_name)
|
|
machine_id = template.machine_id
|
|
entity_name = template.entity_name
|
|
@@ -54,6 +54,7 @@ class AnomalyReport:
|
|
template.keys = keys
|
|
template.description = anomaly.description
|
|
template.cause_metrics = cause_metrics
|
|
+ template.keywords = keywords
|
|
|
|
msg = template.get_template()
|
|
self.provider.send_message(msg)
|
|
diff --git a/anteater/template/app_anomaly_template.py b/anteater/template/app_anomaly_template.py
|
|
index 5b8caf8..a509c96 100644
|
|
--- a/anteater/template/app_anomaly_template.py
|
|
+++ b/anteater/template/app_anomaly_template.py
|
|
@@ -31,7 +31,9 @@ class AppAnomalyTemplate(Template):
|
|
'entity_id': self.entity_id,
|
|
'event_id': f'{timestamp}_{self.entity_id}',
|
|
'event_type': 'app',
|
|
- 'event_source': 'gala-anteater'
|
|
+ 'event_source': 'gala-anteater',
|
|
+ 'keywords': self.keywords,
|
|
+ 'cause_metric': self.cause_metrics[0] if self.cause_metrics else {'description': 'Unknown'}
|
|
},
|
|
'Resource': {
|
|
'metric': self.metric,
|
|
diff --git a/anteater/template/sys_anomaly_template.py b/anteater/template/sys_anomaly_template.py
|
|
index 1083fb3..4ac6abb 100644
|
|
--- a/anteater/template/sys_anomaly_template.py
|
|
+++ b/anteater/template/sys_anomaly_template.py
|
|
@@ -31,7 +31,9 @@ class SysAnomalyTemplate(Template):
|
|
'entity_id': self.entity_id,
|
|
'event_id': f'{timestamp}_{self.entity_id}',
|
|
'event_type': 'sys',
|
|
- 'event_source': 'gala-anteater'
|
|
+ 'event_source': 'gala-anteater',
|
|
+ 'keywords': self.keywords,
|
|
+ 'cause_metric': self.cause_metrics[0] if self.cause_metrics else {'description': 'Unknown'}
|
|
},
|
|
'Resource': {
|
|
'metric': self.metric,
|
|
diff --git a/anteater/template/template.py b/anteater/template/template.py
|
|
index 9e4461a..794c121 100644
|
|
--- a/anteater/template/template.py
|
|
+++ b/anteater/template/template.py
|
|
@@ -26,7 +26,8 @@ class Template:
|
|
self.labels = {}
|
|
self.entity_id = ""
|
|
self.description = ""
|
|
- self.cause_metrics = {}
|
|
+ self.cause_metrics = []
|
|
+ self.keywords = []
|
|
|
|
@abstractmethod
|
|
def get_template(self):
|
|
diff --git a/anteater/utils/data_load.py b/anteater/utils/data_load.py
|
|
index 6ac92c7..b6991c6 100644
|
|
--- a/anteater/utils/data_load.py
|
|
+++ b/anteater/utils/data_load.py
|
|
@@ -45,6 +45,7 @@ def load_job_config(file_name) -> JobConfig:
|
|
|
|
name = config['name']
|
|
job_type = config['job_type']
|
|
+ keywords = config['keywords']
|
|
root_cause_number = config['root_cause_number']
|
|
kpis = [KPI(**_conf) for _conf in config['KPI']]
|
|
features = [Feature(**_conf) for _conf in config['Features']]
|
|
@@ -74,6 +75,7 @@ def load_job_config(file_name) -> JobConfig:
|
|
return JobConfig(
|
|
name=name,
|
|
job_type=job_type,
|
|
+ keywords=keywords,
|
|
root_cause_number=root_cause_number,
|
|
kpis=kpis,
|
|
features=features,
|
|
diff --git a/config/module/app_sli_rtt.json b/config/module/app_sli_rtt.json
|
|
index 7c05094..db29392 100644
|
|
--- a/config/module/app_sli_rtt.json
|
|
+++ b/config/module/app_sli_rtt.json
|
|
@@ -1,6 +1,9 @@
|
|
{
|
|
"name": "app_sli_rtt",
|
|
"job_type": "app",
|
|
+ "keywords": [
|
|
+ "app"
|
|
+ ],
|
|
"root_cause_number": 20,
|
|
"KPI": [
|
|
{
|
|
diff --git a/config/module/disk_throughput.json b/config/module/disk_throughput.json
|
|
new file mode 100644
|
|
index 0000000..00276c0
|
|
--- /dev/null
|
|
+++ b/config/module/disk_throughput.json
|
|
@@ -0,0 +1,92 @@
|
|
+{
|
|
+ "name": "disk_throughput",
|
|
+ "job_type": "sys",
|
|
+ "keywords": [
|
|
+ "disk"
|
|
+ ],
|
|
+ "root_cause_number": 1,
|
|
+ "KPI": [
|
|
+ {
|
|
+ "metric": "gala_gopher_disk_r_await",
|
|
+ "kpi_type": "",
|
|
+ "entity_name": "disk",
|
|
+ "enable": true,
|
|
+ "description": "Disk read await time is increasing!",
|
|
+ "params": {
|
|
+ "look_back": 20,
|
|
+ "obs_size": 25,
|
|
+ "outlier_ratio_th": 0.3,
|
|
+ "smooth_params": {
|
|
+ "method": "conv_smooth",
|
|
+ "box_pts": 3
|
|
+ }
|
|
+ }
|
|
+ },
|
|
+ {
|
|
+ "metric": "gala_gopher_disk_w_await",
|
|
+ "kpi_type": "",
|
|
+ "entity_name": "disk",
|
|
+ "enable": true,
|
|
+ "description": "Disk write await time is increasing!",
|
|
+ "params": {
|
|
+ "look_back": 20,
|
|
+ "obs_size": 25,
|
|
+ "outlier_ratio_th": 0.3,
|
|
+ "smooth_params": {
|
|
+ "method": "conv_smooth",
|
|
+ "box_pts": 3
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ ],
|
|
+ "OnlineModel": {
|
|
+ "name": "online_vae_model",
|
|
+ "enable": false,
|
|
+ "params": {
|
|
+ "th": 0.5,
|
|
+ "max_error_rate": 0.7,
|
|
+ "min_retrain_hours": 24,
|
|
+ "min_predict_minutes": 20,
|
|
+ "norm": {},
|
|
+ "vae": {
|
|
+ "hidden_sizes": [25, 10, 5],
|
|
+ "latent_size": 5,
|
|
+ "dropout_rate": 0.25,
|
|
+ "batch_size": 1024,
|
|
+ "num_epochs": 30,
|
|
+ "learning_rate": 0.001,
|
|
+ "k": 120,
|
|
+ "step_size": 60,
|
|
+ "num_eval_samples": 10
|
|
+ },
|
|
+ "calibrate": {},
|
|
+ "threshold": {}
|
|
+ }
|
|
+ },
|
|
+ "Features": [
|
|
+ {
|
|
+ "metric": "gala_gopher_disk_rspeed_kB",
|
|
+ "priority": 0,
|
|
+ "description": "The disk I/O await time performance deteriorates due to read throughput rise (read kbytes/second).(Disk = {})",
|
|
+ "atrend": "rise"
|
|
+ },
|
|
+ {
|
|
+ "metric": "gala_gopher_disk_wspeed_kB",
|
|
+ "priority": 0,
|
|
+ "description": "The disk I/O await time performance deteriorates due to write throughput rise (write kbytes/second).(Disk = {})",
|
|
+ "atrend": "rise"
|
|
+ },
|
|
+ {
|
|
+ "metric": "gala_gopher_disk_rareq",
|
|
+ "priority": 0,
|
|
+ "description": "The disk I/O await time performance deteriorates due to read saturation rise.(Disk = {})",
|
|
+ "atrend": "rise"
|
|
+ },
|
|
+ {
|
|
+ "metric": "gala_gopher_disk_wareq",
|
|
+ "priority": 0,
|
|
+ "description": "The disk I/O await time performance deteriorates due to write saturation rise.(Disk = {})",
|
|
+ "atrend": "rise"
|
|
+ }
|
|
+ ]
|
|
+}
|
|
\ No newline at end of file
|
|
diff --git a/config/module/proc_io_latency.json b/config/module/proc_io_latency.json
|
|
index c45b7df..c6c03c1 100644
|
|
--- a/config/module/proc_io_latency.json
|
|
+++ b/config/module/proc_io_latency.json
|
|
@@ -1,6 +1,9 @@
|
|
{
|
|
"name": "proc_io_latency",
|
|
"job_type": "sys",
|
|
+ "keywords": [
|
|
+ "process"
|
|
+ ],
|
|
"root_cause_number": 3,
|
|
"KPI": [
|
|
{
|
|
diff --git a/config/module/sys_io_latency.json b/config/module/sys_io_latency.json
|
|
index e92dd4c..e58990d 100644
|
|
--- a/config/module/sys_io_latency.json
|
|
+++ b/config/module/sys_io_latency.json
|
|
@@ -1,6 +1,9 @@
|
|
{
|
|
"name": "sys_io_latency",
|
|
"job_type": "sys",
|
|
+ "keywords": [
|
|
+ "block"
|
|
+ ],
|
|
"root_cause_number": 3,
|
|
"KPI": [
|
|
{
|
|
diff --git a/config/module/sys_tcp_establish.json b/config/module/sys_tcp_establish.json
|
|
index b6f8eb4..61ae72d 100644
|
|
--- a/config/module/sys_tcp_establish.json
|
|
+++ b/config/module/sys_tcp_establish.json
|
|
@@ -1,6 +1,9 @@
|
|
{
|
|
"name": "sys_tcp_establish",
|
|
"job_type": "sys",
|
|
+ "keywords": [
|
|
+ "tcp"
|
|
+ ],
|
|
"root_cause_number": 3,
|
|
"KPI": [
|
|
{
|
|
diff --git a/config/module/sys_tcp_transmission_latency.json b/config/module/sys_tcp_transmission_latency.json
|
|
index 4927d8e..d9e7f80 100644
|
|
--- a/config/module/sys_tcp_transmission_latency.json
|
|
+++ b/config/module/sys_tcp_transmission_latency.json
|
|
@@ -1,6 +1,9 @@
|
|
{
|
|
"name": "sys_tcp_transmission_latency",
|
|
"job_type": "sys",
|
|
+ "keywords": [
|
|
+ "tcp"
|
|
+ ],
|
|
"root_cause_number": 3,
|
|
"KPI": [
|
|
{
|
|
diff --git a/config/module/sys_tcp_transmission_throughput.json b/config/module/sys_tcp_transmission_throughput.json
|
|
index 060f640..28ee784 100644
|
|
--- a/config/module/sys_tcp_transmission_throughput.json
|
|
+++ b/config/module/sys_tcp_transmission_throughput.json
|
|
@@ -1,6 +1,9 @@
|
|
{
|
|
"name": "sys_tcp_transmission_throughput",
|
|
"job_type": "sys",
|
|
+ "keywords": [
|
|
+ "net"
|
|
+ ],
|
|
"root_cause_number": 3,
|
|
"KPI": [
|
|
{
|
|
--
|
|
2.33.0
|
|
|