From ac1383471f72420e3320eb7c7999021f3658fb7d Mon Sep 17 00:00:00 2001 From: lizhenxing11 Date: Wed, 7 Dec 2022 16:59:15 +0800 Subject: [PATCH] Add disk throughput detector add keywords extract cause metric to the attributes update template --- anteater/config.py | 3 - anteater/core/kpi.py | 1 + anteater/main.py | 2 + anteater/model/algorithms/three_sigma.py | 2 +- anteater/module/base.py | 6 +- anteater/module/sys/disk_throughput.py | 62 +++++++++++++ anteater/module/sys/proc_io_latency.py | 4 +- anteater/source/anomaly_report.py | 3 +- anteater/template/app_anomaly_template.py | 4 +- anteater/template/sys_anomaly_template.py | 4 +- anteater/template/template.py | 3 +- anteater/utils/data_load.py | 2 + config/module/app_sli_rtt.json | 3 + config/module/disk_throughput.json | 92 +++++++++++++++++++ config/module/proc_io_latency.json | 3 + config/module/sys_io_latency.json | 3 + config/module/sys_tcp_establish.json | 3 + .../module/sys_tcp_transmission_latency.json | 3 + .../sys_tcp_transmission_throughput.json | 3 + 19 files changed, 193 insertions(+), 13 deletions(-) create mode 100644 anteater/module/sys/disk_throughput.py create mode 100644 config/module/disk_throughput.json diff --git a/anteater/config.py b/anteater/config.py index ea02702..e9ab557 100644 --- a/anteater/config.py +++ b/anteater/config.py @@ -81,9 +81,6 @@ class AnteaterConf: """Loads config from yaml file""" data_path = os.path.realpath(data_path) - if not os.path.exists(data_path): - os.makedirs(data_path) - try: with open(os.path.join(data_path, "config", self.filename), "rb") as f: result = yaml.safe_load(f) diff --git a/anteater/core/kpi.py b/anteater/core/kpi.py index 5a9d8ab..3480139 100644 --- a/anteater/core/kpi.py +++ b/anteater/core/kpi.py @@ -48,6 +48,7 @@ class ModelConfig: class JobConfig: name: str job_type: str + keywords: List[str] root_cause_number: int kpis: List[KPI] features: List[Feature] diff --git a/anteater/main.py b/anteater/main.py index 11e0409..ba7be70 100644 --- 
a/anteater/main.py +++ b/anteater/main.py @@ -21,6 +21,7 @@ from apscheduler.schedulers.blocking import BlockingScheduler from anteater.anomaly_detection import AnomalyDetection from anteater.config import AnteaterConf from anteater.module.app.app_sli_detector import APPSliDetector +from anteater.module.sys.disk_throughput import DiskThroughputDetector from anteater.module.sys.proc_io_latency import ProcIOLatencyDetector from anteater.module.sys.sys_io_latency import SysIOLatencyDetector from anteater.module.sys.tcp_establish import SysTcpEstablishDetector @@ -57,6 +58,7 @@ def main(): SysTcpTransmissionLatencyDetector(loader, report), SysIOLatencyDetector(loader, report), ProcIOLatencyDetector(loader, report), + DiskThroughputDetector(loader, report), ] else: detectors = [ diff --git a/anteater/model/algorithms/three_sigma.py b/anteater/model/algorithms/three_sigma.py index 457b606..49b9952 100644 --- a/anteater/model/algorithms/three_sigma.py +++ b/anteater/model/algorithms/three_sigma.py @@ -31,7 +31,7 @@ def three_sigma(values, obs_size, n=3, method="abs"): elif method == 'min': outlier = [val for val in obs_val if val < mean - n * std] elif method == 'max': - outlier = [val for val in obs_val if val > mean + 3 * std] + outlier = [val for val in obs_val if val > mean + n * std] else: raise ValueError(f'Unknown method {method}') diff --git a/anteater/module/base.py b/anteater/module/base.py index 7b5fc84..63436ac 100644 --- a/anteater/module/base.py +++ b/anteater/module/base.py @@ -48,14 +48,14 @@ class E2EDetector: for detector in self.detectors: anomalies = detector.execute(self.job_config) for anomaly in anomalies: - self.report(anomaly) + self.report(anomaly, self.job_config.keywords) @abstractmethod def parse_cause_metrics(self, anomaly: Anomaly) -> List[Dict]: """Parses the cause metrics into the specific formats""" pass - def report(self, anomaly: Anomaly): + def report(self, anomaly: Anomaly, keywords): """Parses the anomaly into a specific formats 
based on the template and reports parsed results """ @@ -63,4 +63,4 @@ class E2EDetector: timestamp = dt.utc_now() template = self.template(timestamp, anomaly.machine_id, anomaly.metric, anomaly.entity_name) - self.reporter.sent_anomaly(anomaly, cause_metrics, template) + self.reporter.sent_anomaly(anomaly, cause_metrics, keywords, template) diff --git a/anteater/module/sys/disk_throughput.py b/anteater/module/sys/disk_throughput.py new file mode 100644 index 0000000..9a192fb --- /dev/null +++ b/anteater/module/sys/disk_throughput.py @@ -0,0 +1,62 @@ +#!/usr/bin/python3 +# ****************************************************************************** +# Copyright (c) 2022 Huawei Technologies Co., Ltd. +# gala-anteater is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. 
+# ******************************************************************************/ + +from typing import List, Dict + +from anteater.core.anomaly import Anomaly +from anteater.module.base import E2EDetector +from anteater.model.detector.online_vae_detector import OnlineVAEDetector +from anteater.model.detector.n_sigma_detector import NSigmaDetector +from anteater.source.anomaly_report import AnomalyReport +from anteater.source.metric_loader import MetricLoader +from anteater.template.sys_anomaly_template import SysAnomalyTemplate + + +class DiskThroughputDetector(E2EDetector): + """Disk throughput e2e detector which detects deterioration of the + disk read or write await time + """ + + config_file = 'disk_throughput.json' + + def __init__(self, data_loader: MetricLoader, reporter: AnomalyReport): + """The disk throughput e2e detector initializer""" + super().__init__(reporter, SysAnomalyTemplate) + + self.detectors = self.init_detectors(data_loader) + + def init_detectors(self, data_loader): + if self.job_config.model_config.enable: + detectors = [ + NSigmaDetector(data_loader, method='max'), + OnlineVAEDetector(data_loader, self.job_config.model_config) + ] + else: + detectors = [ + NSigmaDetector(data_loader, method='max') + ] + + return detectors + + def parse_cause_metrics(self, anomaly: Anomaly) -> List[Dict]: + """Parses the cause metrics into the specific formats""" + cause_metrics = [ + { + 'metric': cause.ts.metric, + 'labels': cause.ts.labels, + 'score': cause.score, + 'description': cause.description.format( + cause.ts.labels.get('disk_name', ''))} + for cause in anomaly.root_causes] + + return cause_metrics diff --git a/anteater/module/sys/proc_io_latency.py b/anteater/module/sys/proc_io_latency.py index 94fd05d..43e069f 100644 --- a/anteater/module/sys/proc_io_latency.py +++ b/anteater/module/sys/proc_io_latency.py @@ -38,12 +38,12 @@ class ProcIOLatencyDetector(E2EDetector): def init_detectors(self, data_loader): if
self.job_config.model_config.enable: detectors = [ - NSigmaDetector(data_loader, method='min'), + NSigmaDetector(data_loader, method='abs'), OnlineVAEDetector(data_loader, self.job_config.model_config) ] else: detectors = [ - NSigmaDetector(data_loader, method='min') + NSigmaDetector(data_loader, method='abs') ] return detectors diff --git a/anteater/source/anomaly_report.py b/anteater/source/anomaly_report.py index b226763..3d3bb09 100644 --- a/anteater/source/anomaly_report.py +++ b/anteater/source/anomaly_report.py @@ -42,7 +42,7 @@ class AnomalyReport: return keys - def sent_anomaly(self, anomaly: Anomaly, cause_metrics: List, template: Template): + def sent_anomaly(self, anomaly: Anomaly, cause_metrics: List, keywords: List[str], template: Template): keys = self.get_keys(template.entity_name) machine_id = template.machine_id entity_name = template.entity_name @@ -54,6 +54,7 @@ class AnomalyReport: template.keys = keys template.description = anomaly.description template.cause_metrics = cause_metrics + template.keywords = keywords msg = template.get_template() self.provider.send_message(msg) diff --git a/anteater/template/app_anomaly_template.py b/anteater/template/app_anomaly_template.py index 5b8caf8..a509c96 100644 --- a/anteater/template/app_anomaly_template.py +++ b/anteater/template/app_anomaly_template.py @@ -31,7 +31,9 @@ class AppAnomalyTemplate(Template): 'entity_id': self.entity_id, 'event_id': f'{timestamp}_{self.entity_id}', 'event_type': 'app', - 'event_source': 'gala-anteater' + 'event_source': 'gala-anteater', + 'keywords': self.keywords, + 'cause_metric': self.cause_metrics[0] if self.cause_metrics else {'description': 'Unknown'} }, 'Resource': { 'metric': self.metric, diff --git a/anteater/template/sys_anomaly_template.py b/anteater/template/sys_anomaly_template.py index 1083fb3..4ac6abb 100644 --- a/anteater/template/sys_anomaly_template.py +++ b/anteater/template/sys_anomaly_template.py @@ -31,7 +31,9 @@ class SysAnomalyTemplate(Template): 
'entity_id': self.entity_id, 'event_id': f'{timestamp}_{self.entity_id}', 'event_type': 'sys', - 'event_source': 'gala-anteater' + 'event_source': 'gala-anteater', + 'keywords': self.keywords, + 'cause_metric': self.cause_metrics[0] if self.cause_metrics else {'description': 'Unknown'} }, 'Resource': { 'metric': self.metric, diff --git a/anteater/template/template.py b/anteater/template/template.py index 9e4461a..794c121 100644 --- a/anteater/template/template.py +++ b/anteater/template/template.py @@ -26,7 +26,8 @@ class Template: self.labels = {} self.entity_id = "" self.description = "" - self.cause_metrics = {} + self.cause_metrics = [] + self.keywords = [] @abstractmethod def get_template(self): diff --git a/anteater/utils/data_load.py b/anteater/utils/data_load.py index 6ac92c7..b6991c6 100644 --- a/anteater/utils/data_load.py +++ b/anteater/utils/data_load.py @@ -45,6 +45,7 @@ def load_job_config(file_name) -> JobConfig: name = config['name'] job_type = config['job_type'] + keywords = config['keywords'] root_cause_number = config['root_cause_number'] kpis = [KPI(**_conf) for _conf in config['KPI']] features = [Feature(**_conf) for _conf in config['Features']] @@ -74,6 +75,7 @@ def load_job_config(file_name) -> JobConfig: return JobConfig( name=name, job_type=job_type, + keywords=keywords, root_cause_number=root_cause_number, kpis=kpis, features=features, diff --git a/config/module/app_sli_rtt.json b/config/module/app_sli_rtt.json index 7c05094..db29392 100644 --- a/config/module/app_sli_rtt.json +++ b/config/module/app_sli_rtt.json @@ -1,6 +1,9 @@ { "name": "app_sli_rtt", "job_type": "app", + "keywords": [ + "app" + ], "root_cause_number": 20, "KPI": [ { diff --git a/config/module/disk_throughput.json b/config/module/disk_throughput.json new file mode 100644 index 0000000..00276c0 --- /dev/null +++ b/config/module/disk_throughput.json @@ -0,0 +1,92 @@ +{ + "name": "disk_throughput", + "job_type": "sys", + "keywords": [ + "disk" + ], + "root_cause_number": 1, 
+ "KPI": [ + { + "metric": "gala_gopher_disk_r_await", + "kpi_type": "", + "entity_name": "disk", + "enable": true, + "description": "Disk read await time is increasing!", + "params": { + "look_back": 20, + "obs_size": 25, + "outlier_ratio_th": 0.3, + "smooth_params": { + "method": "conv_smooth", + "box_pts": 3 + } + } + }, + { + "metric": "gala_gopher_disk_w_await", + "kpi_type": "", + "entity_name": "disk", + "enable": true, + "description": "Disk write await time is increasing!", + "params": { + "look_back": 20, + "obs_size": 25, + "outlier_ratio_th": 0.3, + "smooth_params": { + "method": "conv_smooth", + "box_pts": 3 + } + } + } + ], + "OnlineModel": { + "name": "online_vae_model", + "enable": false, + "params": { + "th": 0.5, + "max_error_rate": 0.7, + "min_retrain_hours": 24, + "min_predict_minutes": 20, + "norm": {}, + "vae": { + "hidden_sizes": [25, 10, 5], + "latent_size": 5, + "dropout_rate": 0.25, + "batch_size": 1024, + "num_epochs": 30, + "learning_rate": 0.001, + "k": 120, + "step_size": 60, + "num_eval_samples": 10 + }, + "calibrate": {}, + "threshold": {} + } + }, + "Features": [ + { + "metric": "gala_gopher_disk_rspeed_kB", + "priority": 0, + "description": "The disk I/O await time performance deteriorates due to read throughput rise (read kbytes/second).(Disk = {})", + "atrend": "rise" + }, + { + "metric": "gala_gopher_disk_wspeed_kB", + "priority": 0, + "description": "The disk I/O await time performance deteriorates due to write throughput rise (write kbytes/second).(Disk = {})", + "atrend": "rise" + }, + { + "metric": "gala_gopher_disk_rareq", + "priority": 0, + "description": "The disk I/O await time performance deteriorates due to read saturation rise.(Disk = {})", + "atrend": "rise" + }, + { + "metric": "gala_gopher_disk_wareq", + "priority": 0, + "description": "The disk I/O await time performance deteriorates due to write saturation rise.(Disk = {})", + "atrend": "rise" + } + ] +} \ No newline at end of file diff --git 
a/config/module/proc_io_latency.json b/config/module/proc_io_latency.json index c45b7df..c6c03c1 100644 --- a/config/module/proc_io_latency.json +++ b/config/module/proc_io_latency.json @@ -1,6 +1,9 @@ { "name": "proc_io_latency", "job_type": "sys", + "keywords": [ + "process" + ], "root_cause_number": 3, "KPI": [ { diff --git a/config/module/sys_io_latency.json b/config/module/sys_io_latency.json index e92dd4c..e58990d 100644 --- a/config/module/sys_io_latency.json +++ b/config/module/sys_io_latency.json @@ -1,6 +1,9 @@ { "name": "sys_io_latency", "job_type": "sys", + "keywords": [ + "block" + ], "root_cause_number": 3, "KPI": [ { diff --git a/config/module/sys_tcp_establish.json b/config/module/sys_tcp_establish.json index b6f8eb4..61ae72d 100644 --- a/config/module/sys_tcp_establish.json +++ b/config/module/sys_tcp_establish.json @@ -1,6 +1,9 @@ { "name": "sys_tcp_establish", "job_type": "sys", + "keywords": [ + "tcp" + ], "root_cause_number": 3, "KPI": [ { diff --git a/config/module/sys_tcp_transmission_latency.json b/config/module/sys_tcp_transmission_latency.json index 4927d8e..d9e7f80 100644 --- a/config/module/sys_tcp_transmission_latency.json +++ b/config/module/sys_tcp_transmission_latency.json @@ -1,6 +1,9 @@ { "name": "sys_tcp_transmission_latency", "job_type": "sys", + "keywords": [ + "tcp" + ], "root_cause_number": 3, "KPI": [ { diff --git a/config/module/sys_tcp_transmission_throughput.json b/config/module/sys_tcp_transmission_throughput.json index 060f640..28ee784 100644 --- a/config/module/sys_tcp_transmission_throughput.json +++ b/config/module/sys_tcp_transmission_throughput.json @@ -1,6 +1,9 @@ { "name": "sys_tcp_transmission_throughput", "job_type": "sys", + "keywords": [ + "net" + ], "root_cause_number": 3, "KPI": [ { -- 2.33.0