diff --git a/Add-disk-throughput-detector.patch b/Add-disk-throughput-detector.patch new file mode 100644 index 0000000..7a0cbed --- /dev/null +++ b/Add-disk-throughput-detector.patch @@ -0,0 +1,478 @@ +From ac1383471f72420e3320eb7c7999021f3658fb7d Mon Sep 17 00:00:00 2001 +From: lizhenxing11 +Date: Wed, 7 Dec 2022 16:59:15 +0800 +Subject: [PATCH] Add disk throughput detector + +add keywords + +extract cause metric to the attributes + +update template +--- + anteater/config.py | 3 - + anteater/core/kpi.py | 1 + + anteater/main.py | 2 + + anteater/model/algorithms/three_sigma.py | 2 +- + anteater/module/base.py | 6 +- + anteater/module/sys/disk_throughput.py | 62 +++++++++++++ + anteater/module/sys/proc_io_latency.py | 4 +- + anteater/source/anomaly_report.py | 3 +- + anteater/template/app_anomaly_template.py | 4 +- + anteater/template/sys_anomaly_template.py | 4 +- + anteater/template/template.py | 3 +- + anteater/utils/data_load.py | 2 + + config/module/app_sli_rtt.json | 3 + + config/module/disk_throughput.json | 92 +++++++++++++++++++ + config/module/proc_io_latency.json | 3 + + config/module/sys_io_latency.json | 3 + + config/module/sys_tcp_establish.json | 3 + + .../module/sys_tcp_transmission_latency.json | 3 + + .../sys_tcp_transmission_throughput.json | 3 + + 19 files changed, 193 insertions(+), 13 deletions(-) + create mode 100644 anteater/module/sys/disk_throughput.py + create mode 100644 config/module/disk_throughput.json + +diff --git a/anteater/config.py b/anteater/config.py +index ea02702..e9ab557 100644 +--- a/anteater/config.py ++++ b/anteater/config.py +@@ -81,9 +81,6 @@ class AnteaterConf: + """Loads config from yaml file""" + data_path = os.path.realpath(data_path) + +- if not os.path.exists(data_path): +- os.makedirs(data_path) +- + try: + with open(os.path.join(data_path, "config", self.filename), "rb") as f: + result = yaml.safe_load(f) +diff --git a/anteater/core/kpi.py b/anteater/core/kpi.py +index 5a9d8ab..3480139 100644 +--- a/anteater/core/kpi.py ++++ b/anteater/core/kpi.py +@@ -48,6 +48,7 @@ class ModelConfig: + class JobConfig: + name: str + job_type: str ++ keywords: List[str] + root_cause_number: int + kpis: List[KPI] + features: List[Feature] +diff --git a/anteater/main.py b/anteater/main.py +index 11e0409..ba7be70 100644 +--- a/anteater/main.py ++++ b/anteater/main.py +@@ -21,6 +21,7 @@ from apscheduler.schedulers.blocking import BlockingScheduler + from anteater.anomaly_detection import AnomalyDetection + from anteater.config import AnteaterConf + from anteater.module.app.app_sli_detector import APPSliDetector ++from anteater.module.sys.disk_throughput import DiskThroughputDetector + from anteater.module.sys.proc_io_latency import ProcIOLatencyDetector + from anteater.module.sys.sys_io_latency import SysIOLatencyDetector + from anteater.module.sys.tcp_establish import SysTcpEstablishDetector +@@ -57,6 +58,7 @@ def main(): + SysTcpTransmissionLatencyDetector(loader, report), + SysIOLatencyDetector(loader, report), + ProcIOLatencyDetector(loader, report), ++ DiskThroughputDetector(loader, report), + ] + else: + detectors = [ +diff --git a/anteater/model/algorithms/three_sigma.py b/anteater/model/algorithms/three_sigma.py +index 457b606..49b9952 100644 +--- a/anteater/model/algorithms/three_sigma.py ++++ b/anteater/model/algorithms/three_sigma.py +@@ -31,7 +31,7 @@ def three_sigma(values, obs_size, n=3, method="abs"): + elif method == 'min': + outlier = [val for val in obs_val if val < mean - n * std] + elif method == 'max': +- outlier = [val for val in obs_val if val > mean + 3 * std] ++ outlier = [val for val in obs_val if val > mean + n * std] + else: + raise ValueError(f'Unknown method {method}') + +diff --git a/anteater/module/base.py b/anteater/module/base.py +index 7b5fc84..63436ac 100644 +--- a/anteater/module/base.py ++++ b/anteater/module/base.py +@@ -48,14 +48,14 @@ class E2EDetector: + for detector in self.detectors: + anomalies = detector.execute(self.job_config) + for anomaly in anomalies: +- self.report(anomaly) ++ self.report(anomaly, self.job_config.keywords) + + @abstractmethod + def parse_cause_metrics(self, anomaly: Anomaly) -> List[Dict]: + """Parses the cause metrics into the specific formats""" + pass + +- def report(self, anomaly: Anomaly): ++ def report(self, anomaly: Anomaly, keywords): + """Parses the anomaly into a specific formats + based on the template and reports parsed results + """ +@@ -63,4 +63,4 @@ class E2EDetector: + timestamp = dt.utc_now() + template = self.template(timestamp, anomaly.machine_id, + anomaly.metric, anomaly.entity_name) +- self.reporter.sent_anomaly(anomaly, cause_metrics, template) ++ self.reporter.sent_anomaly(anomaly, cause_metrics, keywords, template) +diff --git a/anteater/module/sys/disk_throughput.py b/anteater/module/sys/disk_throughput.py +new file mode 100644 +index 0000000..9a192fb +--- /dev/null ++++ b/anteater/module/sys/disk_throughput.py +@@ -0,0 +1,62 @@ ++#!/usr/bin/python3 ++# ****************************************************************************** ++# Copyright (c) 2022 Huawei Technologies Co., Ltd. ++# gala-anteater is licensed under Mulan PSL v2. ++# You can use this software according to the terms and conditions of the Mulan PSL v2. ++# You may obtain a copy of Mulan PSL v2 at: ++# http://license.coscl.org.cn/MulanPSL2 ++# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, ++# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, ++# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. ++# See the Mulan PSL v2 for more details. ++# ******************************************************************************/ ++ ++from typing import List, Dict ++ ++from anteater.core.anomaly import Anomaly ++from anteater.module.base import E2EDetector ++from anteater.model.detector.online_vae_detector import OnlineVAEDetector ++from anteater.model.detector.n_sigma_detector import NSigmaDetector ++from anteater.source.anomaly_report import AnomalyReport ++from anteater.source.metric_loader import MetricLoader ++from anteater.template.sys_anomaly_template import SysAnomalyTemplate ++ ++ ++class DiskThroughputDetector(E2EDetector): ++ """Disk throughput e2e detector which detects the disk read or write ++ await time performance deteriorates ++ """ ++ ++ config_file = 'disk_throughput.json' ++ ++ def __init__(self, data_loader: MetricLoader, reporter: AnomalyReport): ++ """The disk throughput e2e detector initializer""" ++ super().__init__(reporter, SysAnomalyTemplate) ++ ++ self.detectors = self.init_detectors(data_loader) ++ ++ def init_detectors(self, data_loader): ++ if self.job_config.model_config.enable: ++ detectors = [ ++ NSigmaDetector(data_loader, method='max'), ++ OnlineVAEDetector(data_loader, self.job_config.model_config) ++ ] ++ else: ++ detectors = [ ++ NSigmaDetector(data_loader, method='max') ++ ] ++ ++ return detectors ++ ++ def parse_cause_metrics(self, anomaly: Anomaly) -> List[Dict]: ++ """Parses the cause metrics into the specific formats""" ++ cause_metrics = [ ++ { ++ 'metric': cause.ts.metric, ++ 'labels': cause.ts.labels, ++ 'score': cause.score, ++ 'description': cause.description.format( ++ cause.ts.labels.get('disk_name', ''))} ++ for cause in anomaly.root_causes] ++ ++ return cause_metrics +diff --git a/anteater/module/sys/proc_io_latency.py b/anteater/module/sys/proc_io_latency.py +index 94fd05d..43e069f 100644 +--- a/anteater/module/sys/proc_io_latency.py ++++ b/anteater/module/sys/proc_io_latency.py +@@ -38,12 +38,12 @@ class ProcIOLatencyDetector(E2EDetector): + def init_detectors(self, data_loader): + if self.job_config.model_config.enable: + detectors = [ +- NSigmaDetector(data_loader, method='min'), ++ NSigmaDetector(data_loader, method='abs'), + OnlineVAEDetector(data_loader, self.job_config.model_config) + ] + else: + detectors = [ +- NSigmaDetector(data_loader, method='min') ++ NSigmaDetector(data_loader, method='abs') + ] + + return detectors +diff --git a/anteater/source/anomaly_report.py b/anteater/source/anomaly_report.py +index b226763..3d3bb09 100644 +--- a/anteater/source/anomaly_report.py ++++ b/anteater/source/anomaly_report.py +@@ -42,7 +42,7 @@ class AnomalyReport: + + return keys + +- def sent_anomaly(self, anomaly: Anomaly, cause_metrics: List, template: Template): ++ def sent_anomaly(self, anomaly: Anomaly, cause_metrics: List, keywords: List[str], template: Template): + keys = self.get_keys(template.entity_name) + machine_id = template.machine_id + entity_name = template.entity_name +@@ -54,6 +54,7 @@ class AnomalyReport: + template.keys = keys + template.description = anomaly.description + template.cause_metrics = cause_metrics ++ template.keywords = keywords + + msg = template.get_template() + self.provider.send_message(msg) +diff --git a/anteater/template/app_anomaly_template.py b/anteater/template/app_anomaly_template.py +index 5b8caf8..a509c96 100644 +--- a/anteater/template/app_anomaly_template.py ++++ b/anteater/template/app_anomaly_template.py +@@ -31,7 +31,9 @@ class AppAnomalyTemplate(Template): + 'entity_id': self.entity_id, + 'event_id': f'{timestamp}_{self.entity_id}', + 'event_type': 'app', +- 'event_source': 'gala-anteater' ++ 'event_source': 'gala-anteater', ++ 'keywords': self.keywords, ++ 'cause_metric': self.cause_metrics[0] if self.cause_metrics else {'description': 'Unknown'} + }, + 'Resource': { + 'metric': self.metric, +diff --git a/anteater/template/sys_anomaly_template.py b/anteater/template/sys_anomaly_template.py +index 1083fb3..4ac6abb 100644 +--- a/anteater/template/sys_anomaly_template.py ++++ b/anteater/template/sys_anomaly_template.py +@@ -31,7 +31,9 @@ class SysAnomalyTemplate(Template): + 'entity_id': self.entity_id, + 'event_id': f'{timestamp}_{self.entity_id}', + 'event_type': 'sys', +- 'event_source': 'gala-anteater' ++ 'event_source': 'gala-anteater', ++ 'keywords': self.keywords, ++ 'cause_metric': self.cause_metrics[0] if self.cause_metrics else {'description': 'Unknown'} + }, + 'Resource': { + 'metric': self.metric, +diff --git a/anteater/template/template.py b/anteater/template/template.py +index 9e4461a..794c121 100644 +--- a/anteater/template/template.py ++++ b/anteater/template/template.py +@@ -26,7 +26,8 @@ class Template: + self.labels = {} + self.entity_id = "" + self.description = "" +- self.cause_metrics = {} ++ self.cause_metrics = [] ++ self.keywords = [] + + @abstractmethod + def get_template(self): +diff --git a/anteater/utils/data_load.py b/anteater/utils/data_load.py +index 6ac92c7..b6991c6 100644 +--- a/anteater/utils/data_load.py ++++ b/anteater/utils/data_load.py +@@ -45,6 +45,7 @@ def load_job_config(file_name) -> JobConfig: + + name = config['name'] + job_type = config['job_type'] ++ keywords = config['keywords'] + root_cause_number = config['root_cause_number'] + kpis = [KPI(**_conf) for _conf in config['KPI']] + features = [Feature(**_conf) for _conf in config['Features']] +@@ -74,6 +75,7 @@ def load_job_config(file_name) -> JobConfig: + return JobConfig( + name=name, + job_type=job_type, ++ keywords=keywords, + root_cause_number=root_cause_number, + kpis=kpis, + features=features, +diff --git a/config/module/app_sli_rtt.json b/config/module/app_sli_rtt.json +index 7c05094..db29392 100644 +--- a/config/module/app_sli_rtt.json ++++ b/config/module/app_sli_rtt.json +@@ -1,6 +1,9 @@ + { + "name": "app_sli_rtt", + "job_type": "app", ++ "keywords": [ ++ "app" ++ ], + "root_cause_number": 20, + "KPI": [ + { +diff --git a/config/module/disk_throughput.json b/config/module/disk_throughput.json +new file mode 100644 +index 0000000..00276c0 +--- /dev/null ++++ b/config/module/disk_throughput.json +@@ -0,0 +1,92 @@ ++{ ++ "name": "disk_throughput", ++ "job_type": "sys", ++ "keywords": [ ++ "disk" ++ ], ++ "root_cause_number": 1, ++ "KPI": [ ++ { ++ "metric": "gala_gopher_disk_r_await", ++ "kpi_type": "", ++ "entity_name": "disk", ++ "enable": true, ++ "description": "Disk read await time is increasing!", ++ "params": { ++ "look_back": 20, ++ "obs_size": 25, ++ "outlier_ratio_th": 0.3, ++ "smooth_params": { ++ "method": "conv_smooth", ++ "box_pts": 3 ++ } ++ } ++ }, ++ { ++ "metric": "gala_gopher_disk_w_await", ++ "kpi_type": "", ++ "entity_name": "disk", ++ "enable": true, ++ "description": "Disk write await time is increasing!", ++ "params": { ++ "look_back": 20, ++ "obs_size": 25, ++ "outlier_ratio_th": 0.3, ++ "smooth_params": { ++ "method": "conv_smooth", ++ "box_pts": 3 ++ } ++ } ++ } ++ ], ++ "OnlineModel": { ++ "name": "online_vae_model", ++ "enable": false, ++ "params": { ++ "th": 0.5, ++ "max_error_rate": 0.7, ++ "min_retrain_hours": 24, ++ "min_predict_minutes": 20, ++ "norm": {}, ++ "vae": { ++ "hidden_sizes": [25, 10, 5], ++ "latent_size": 5, ++ "dropout_rate": 0.25, ++ "batch_size": 1024, ++ "num_epochs": 30, ++ "learning_rate": 0.001, ++ "k": 120, ++ "step_size": 60, ++ "num_eval_samples": 10 ++ }, ++ "calibrate": {}, ++ "threshold": {} ++ } ++ }, ++ "Features": [ ++ { ++ "metric": "gala_gopher_disk_rspeed_kB", ++ "priority": 0, ++ "description": "The disk I/O await time performance deteriorates due to read throughput rise (read kbytes/second).(Disk = {})", ++ "atrend": "rise" ++ }, ++ { ++ "metric": "gala_gopher_disk_wspeed_kB", ++ "priority": 0, ++ "description": "The disk I/O await time performance deteriorates due to write throughput rise (write kbytes/second).(Disk = {})", ++ "atrend": "rise" ++ }, ++ { ++ "metric": "gala_gopher_disk_rareq", ++ "priority": 0, ++ "description": "The disk I/O await time performance deteriorates due to read saturation rise.(Disk = {})", ++ "atrend": "rise" ++ }, ++ { ++ "metric": "gala_gopher_disk_wareq", ++ "priority": 0, ++ "description": "The disk I/O await time performance deteriorates due to write saturation rise.(Disk = {})", ++ "atrend": "rise" ++ } ++ ] ++} +\ No newline at end of file +diff --git a/config/module/proc_io_latency.json b/config/module/proc_io_latency.json +index c45b7df..c6c03c1 100644 +--- a/config/module/proc_io_latency.json ++++ b/config/module/proc_io_latency.json +@@ -1,6 +1,9 @@ + { + "name": "proc_io_latency", + "job_type": "sys", ++ "keywords": [ ++ "process" ++ ], + "root_cause_number": 3, + "KPI": [ + { +diff --git a/config/module/sys_io_latency.json b/config/module/sys_io_latency.json +index e92dd4c..e58990d 100644 +--- a/config/module/sys_io_latency.json ++++ b/config/module/sys_io_latency.json +@@ -1,6 +1,9 @@ + { + "name": "sys_io_latency", + "job_type": "sys", ++ "keywords": [ ++ "block" ++ ], + "root_cause_number": 3, + "KPI": [ + { +diff --git a/config/module/sys_tcp_establish.json b/config/module/sys_tcp_establish.json +index b6f8eb4..61ae72d 100644 +--- a/config/module/sys_tcp_establish.json ++++ b/config/module/sys_tcp_establish.json +@@ -1,6 +1,9 @@ + { + "name": "sys_tcp_establish", + "job_type": "sys", ++ "keywords": [ ++ "tcp" ++ ], + "root_cause_number": 3, + "KPI": [ + { +diff --git a/config/module/sys_tcp_transmission_latency.json b/config/module/sys_tcp_transmission_latency.json +index 4927d8e..d9e7f80 100644 +--- a/config/module/sys_tcp_transmission_latency.json ++++ b/config/module/sys_tcp_transmission_latency.json +@@ -1,6 +1,9 @@ + { + "name": "sys_tcp_transmission_latency", + "job_type": "sys", ++ "keywords": [ ++ "tcp" ++ ], + "root_cause_number": 3, + "KPI": [ + { +diff --git a/config/module/sys_tcp_transmission_throughput.json b/config/module/sys_tcp_transmission_throughput.json +index 060f640..28ee784 100644 +--- a/config/module/sys_tcp_transmission_throughput.json ++++ b/config/module/sys_tcp_transmission_throughput.json +@@ -1,6 +1,9 @@ + { + "name": "sys_tcp_transmission_throughput", + "job_type": "sys", ++ "keywords": [ ++ "net" ++ ], + "root_cause_number": 3, + "KPI": [ + { +-- +2.33.0 + diff --git a/Update-TCP-Establish-Model-Add-Nic-Loss-Detector.patch b/Update-TCP-Establish-Model-Add-Nic-Loss-Detector.patch new file mode 100644 index 0000000..782a879 --- /dev/null +++ b/Update-TCP-Establish-Model-Add-Nic-Loss-Detector.patch @@ -0,0 +1,377 @@ +From dd870b17120f3c7961c4613d454f1653fbd42214 Mon Sep 17 00:00:00 2001 +From: lizhenxing11 +Date: Tue, 27 Dec 2022 18:39:32 +0800 +Subject: [PATCH] Update TCP Establish Model & Add Nic Loss Detector + +change method 'abs' to 'max' +--- + anteater/main.py | 2 + + anteater/model/algorithms/three_sigma.py | 4 +- + anteater/model/detector/n_sigma_detector.py | 4 +- + .../tcp_establish_n_sigma_detector.py | 12 +++- + anteater/model/detector/th_base_detector.py | 66 +++++++++++++++++++ + anteater/module/sys/nic_loss.py | 59 +++++++++++++++++ + anteater/module/sys/proc_io_latency.py | 4 +- + anteater/template/app_anomaly_template.py | 2 + + anteater/template/sys_anomaly_template.py | 1 + + config/module/sys_nic_loss.json | 53 +++++++++++++++ + config/module/sys_tcp_establish.json | 3 +- + 11 files changed, 200 insertions(+), 10 deletions(-) + create mode 100644 anteater/model/detector/th_base_detector.py + create mode 100644 anteater/module/sys/nic_loss.py + create mode 100644 config/module/sys_nic_loss.json + +diff --git a/anteater/main.py b/anteater/main.py +index ba7be70..4de72f9 100644 +--- a/anteater/main.py ++++ b/anteater/main.py +@@ -22,6 +22,7 @@ from anteater.anomaly_detection import AnomalyDetection + from anteater.config import AnteaterConf + from anteater.module.app.app_sli_detector import APPSliDetector + from anteater.module.sys.disk_throughput import DiskThroughputDetector ++from anteater.module.sys.nic_loss import NICLossDetector + from anteater.module.sys.proc_io_latency import ProcIOLatencyDetector + from anteater.module.sys.sys_io_latency import SysIOLatencyDetector + from anteater.module.sys.tcp_establish import SysTcpEstablishDetector +@@ -59,6 +60,7 @@ def main(): + SysIOLatencyDetector(loader, report), + ProcIOLatencyDetector(loader, report), + DiskThroughputDetector(loader, report), ++ NICLossDetector(loader, report), + ] + else: + detectors = [ +diff --git a/anteater/model/algorithms/three_sigma.py b/anteater/model/algorithms/three_sigma.py +index 49b9952..0865417 100644 +--- a/anteater/model/algorithms/three_sigma.py ++++ b/anteater/model/algorithms/three_sigma.py +@@ -14,8 +14,8 @@ + import numpy as np + + +-def three_sigma(values, obs_size, n=3, method="abs"): +- """The '3-sigma rule' outlier detect function""" ++def n_sigma(values, obs_size, n=3, method="abs"): ++ """The 'N-sigma rule' outlier detect function""" + if obs_size <= 0: + raise ValueError("The obs_size should great than zero!") + if len(values) <= obs_size: +diff --git a/anteater/model/detector/n_sigma_detector.py b/anteater/model/detector/n_sigma_detector.py +index f632326..3a2ab01 100644 +--- a/anteater/model/detector/n_sigma_detector.py ++++ b/anteater/model/detector/n_sigma_detector.py +@@ -19,7 +19,7 @@ from anteater.core.kpi import KPI + from anteater.core.time_series import TimeSeriesScore + from anteater.model.detector.base import Detector + from anteater.model.algorithms.smooth import smoothing +-from anteater.model.algorithms.three_sigma import three_sigma ++from anteater.model.algorithms.three_sigma import n_sigma + from anteater.source.metric_loader import MetricLoader + from anteater.utils.common import divide + from anteater.utils.datetime import DateTimeManager as dt +@@ -91,7 +91,7 @@ class NSigmaDetector(Detector): + ratio = 0 + else: + smoothed_val = smoothing(_ts.values, **smooth_params) +- outlier, mean, std = three_sigma( ++ outlier, mean, std = n_sigma( + smoothed_val, obs_size=obs_size, n=n, method=self.method) + ratio = divide(len(outlier), obs_size) + +diff --git a/anteater/model/detector/tcp_establish_n_sigma_detector.py b/anteater/model/detector/tcp_establish_n_sigma_detector.py +index 8dcf9ae..82d7837 100644 +--- a/anteater/model/detector/tcp_establish_n_sigma_detector.py ++++ b/anteater/model/detector/tcp_establish_n_sigma_detector.py +@@ -42,8 +42,13 @@ class TcpEstablishNSigmaDetector(Detector): + start, _ = dt.last(minutes=look_back) + mid, _ = dt.last(minutes=3) + ++ filtered_ts_list = [] + ts_list = self.data_loader.get_metric(start, mid, kpi.metric) +- establish_time = reduce(lambda x, y: x + y, [list(set(_ts.values)) for _ts in ts_list]) ++ for _ts in ts_list: ++ if sum(_ts.values) > 0: ++ filtered_ts_list.append(_ts) ++ ++ establish_time = reduce(lambda x, y: x + y, [list(set(_ts.values)) for _ts in filtered_ts_list]) + + self.mean = np.mean(establish_time) + self.std = np.std(establish_time) +@@ -65,6 +70,7 @@ class TcpEstablishNSigmaDetector(Detector): + """Detects kpi based on signal time series anomaly detection model""" + outlier_ratio_th = kpi.params.get('outlier_ratio_th') + look_back = kpi.params.get('obs_size') ++ min_rtt = kpi.params.get('min_rtt') + + start, end = dt.last(minutes=look_back) + ts_list = self.data_loader.\ +@@ -72,9 +78,9 @@ class TcpEstablishNSigmaDetector(Detector): + + anomalies = [] + for _ts in ts_list: +- outlier = [val for val in _ts.values if abs(val - self.mean) > 3 * self.std] ++ outlier = [val for val in _ts.values if val > self.mean + 5 * self.std] + ratio = divide(len(outlier), len(_ts.values)) +- if outlier and ratio > outlier_ratio_th: ++ if outlier and ratio > outlier_ratio_th and np.average(outlier) >= min_rtt: + anomalies.append( + Anomaly( + machine_id=machine_id, +diff --git a/anteater/model/detector/th_base_detector.py b/anteater/model/detector/th_base_detector.py +new file mode 100644 +index 0000000..bec9705 +--- /dev/null ++++ b/anteater/model/detector/th_base_detector.py +@@ -0,0 +1,66 @@ ++#!/usr/bin/python3 ++# ****************************************************************************** ++# Copyright (c) 2022 Huawei Technologies Co., Ltd. ++# gala-anteater is licensed under Mulan PSL v2. ++# You can use this software according to the terms and conditions of the Mulan PSL v2. ++# You may obtain a copy of Mulan PSL v2 at: ++# http://license.coscl.org.cn/MulanPSL2 ++# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, ++# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, ++# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. ++# See the Mulan PSL v2 for more details. ++# ******************************************************************************/ ++ ++from typing import List ++ ++from anteater.core.anomaly import Anomaly ++from anteater.core.kpi import KPI ++from anteater.model.detector.base import Detector ++from anteater.source.metric_loader import MetricLoader ++from anteater.utils.datetime import DateTimeManager as dt ++from anteater.utils.log import logger ++ ++ ++class ThBaseDetector(Detector): ++ """The threshold-based anomaly detector""" ++ ++ def __init__(self, data_loader: MetricLoader): ++ """The detector base class initializer""" ++ super().__init__(data_loader) ++ ++ def detect_kpis(self, kpis: List[KPI]): ++ """Executes anomaly detection on kpis""" ++ start, end = dt.last(minutes=1) ++ machine_ids = self.get_unique_machine_id(start, end, kpis) ++ anomalies = [] ++ for _id in machine_ids: ++ for kpi in kpis: ++ anomalies.extend(self.detect_signal_kpi(kpi, _id)) ++ ++ return anomalies ++ ++ def detect_signal_kpi(self, kpi, machine_id: str) -> List[Anomaly]: ++ """Detects kpi based on threshold based anomaly detection model""" ++ look_back = kpi.params.get('look_back') ++ th = kpi.params.get('th') ++ start, end = dt.last(minutes=look_back) ++ ts_list = self.data_loader.\ ++ get_metric(start, end, kpi.metric, label_name='machine_id', label_value=machine_id) ++ ++ if not ts_list: ++ logger.warning(f'Key metric {kpi.metric} is null on the target machine {machine_id}!') ++ return [] ++ ++ anomalies = [ ++ Anomaly( ++ machine_id=machine_id, ++ metric=_ts.metric, ++ labels=_ts.labels, ++ score=1, ++ entity_name=kpi.entity_name, ++ description=kpi.description) ++ for _ts in ts_list ++ if sum(_ts.values) >= th ++ ] ++ ++ return anomalies +diff --git a/anteater/module/sys/nic_loss.py b/anteater/module/sys/nic_loss.py +new file mode 100644 +index 0000000..d24e06f +--- /dev/null ++++ b/anteater/module/sys/nic_loss.py +@@ -0,0 +1,59 @@ ++#!/usr/bin/python3 ++# ****************************************************************************** ++# Copyright (c) 2022 Huawei Technologies Co., Ltd. ++# gala-anteater is licensed under Mulan PSL v2. ++# You can use this software according to the terms and conditions of the Mulan PSL v2. ++# You may obtain a copy of Mulan PSL v2 at: ++# http://license.coscl.org.cn/MulanPSL2 ++# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, ++# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, ++# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. ++# See the Mulan PSL v2 for more details. ++# ******************************************************************************/ ++ ++from typing import List, Dict ++ ++from anteater.core.anomaly import Anomaly ++from anteater.model.detector.th_base_detector import ThBaseDetector ++from anteater.module.base import E2EDetector ++from anteater.source.anomaly_report import AnomalyReport ++from anteater.source.metric_loader import MetricLoader ++from anteater.template.sys_anomaly_template import SysAnomalyTemplate ++ ++ ++class NICLossDetector(E2EDetector): ++ """SYS nic loss e2e detector which detects the network loss. ++ """ ++ ++ config_file = 'sys_nic_loss.json' ++ ++ def __init__(self, data_loader: MetricLoader, reporter: AnomalyReport): ++ """The system tcp transmission latency e2e detector initializer""" ++ super().__init__(reporter, SysAnomalyTemplate) ++ ++ self.detectors = [ ++ ThBaseDetector(data_loader) ++ ] ++ ++ def parse_cause_metrics(self, anomaly: Anomaly) -> List[Dict]: ++ """Parses the cause metrics into the specific formats""" ++ cause_metrics = [] ++ for _cs in anomaly.root_causes: ++ tmp = { ++ 'metric': _cs.ts.metric, ++ 'labels': _cs.ts.labels, ++ 'score': _cs.score, ++ } ++ if 'tcp' in _cs.ts.metric: ++ tmp['description'] = _cs.description.format( ++ _cs.ts.labels.get('tgid', ''), ++ _cs.ts.labels.get('client_port', ''), ++ _cs.ts.labels.get('server_ip', ''), ++ _cs.ts.labels.get('server_port', '')) ++ else: ++ tmp['description'] = _cs.description.format( ++ _cs.ts.labels.get('dev_name', '')) ++ ++ cause_metrics.append(tmp) ++ ++ return cause_metrics +diff --git a/anteater/module/sys/proc_io_latency.py b/anteater/module/sys/proc_io_latency.py +index 43e069f..a34c48d 100644 +--- a/anteater/module/sys/proc_io_latency.py ++++ b/anteater/module/sys/proc_io_latency.py +@@ -38,12 +38,12 @@ class ProcIOLatencyDetector(E2EDetector): + def init_detectors(self, data_loader): + if self.job_config.model_config.enable: + detectors = [ +- NSigmaDetector(data_loader, method='abs'), ++ NSigmaDetector(data_loader, method='max'), + OnlineVAEDetector(data_loader, self.job_config.model_config) + ] + else: + detectors = [ +- NSigmaDetector(data_loader, method='abs') ++ NSigmaDetector(data_loader, method='max') + ] + + return detectors +diff --git a/anteater/template/app_anomaly_template.py b/anteater/template/app_anomaly_template.py +index a509c96..4df4a35 100644 +--- a/anteater/template/app_anomaly_template.py ++++ b/anteater/template/app_anomaly_template.py +@@ -46,6 +46,8 @@ class AppAnomalyTemplate(Template): + 'SeverityNumber': 13, + 'Body': f'{self.timestamp.strftime("%c")} WARN, APP may be impacting sli performance issues.', + 'event_id': f'{timestamp}_{self.entity_id}', ++ "keywords": self.keywords, ++ 'cause_metrics': self.cause_metrics[0] if self.cause_metrics else {'description': 'Unknown'} + } + + return result +diff --git a/anteater/template/sys_anomaly_template.py b/anteater/template/sys_anomaly_template.py +index 4ac6abb..aec6ea0 100644 +--- a/anteater/template/sys_anomaly_template.py ++++ b/anteater/template/sys_anomaly_template.py +@@ -46,6 +46,7 @@ class SysAnomalyTemplate(Template): + 'SeverityNumber': 13, + 'Body': f'{self.timestamp.strftime("%c")} WARN, SYS may be impacting performance issues.', + 'event_id': f'{timestamp}_{self.entity_id}', ++ "keywords": self.keywords + } + + return result +diff --git a/config/module/sys_nic_loss.json b/config/module/sys_nic_loss.json +new file mode 100644 +index 0000000..793f82f +--- /dev/null ++++ b/config/module/sys_nic_loss.json +@@ -0,0 +1,53 @@ ++{ ++ "name": "sys_tcp_transmission_latency", ++ "job_type": "sys", ++ "keywords": [ ++ "net" ++ ], ++ "root_cause_number": 3, ++ "KPI": [ ++ { ++ "metric": "gala_gopher_nic_tc_sent_drop", ++ "kpi_type": "", ++ "entity_name": "nic", ++ "enable": true, ++ "description": "TC发送丢包数异常", ++ "params": { ++ "look_back": 2, ++ "th": 1 ++ } ++ } ++ ], ++ "Features": [ ++ { ++ "metric": "gala_gopher_nic_tx_dropped", ++ "priority": 0, ++ "description": "网卡发送丢弃的数据包数异常。(dev_name = {})" ++ }, ++ { ++ "metric": "gala_gopher_nic_rx_dropped", ++ "priority": 0, ++ "description": "网卡接收丢弃的数据包数异常。(dev_name = {})" ++ }, ++ { ++ "metric": "gala_gopher_tcp_link_sk_drops", ++ "priority": 3, ++ "description": "Packets are lost in the host protocol stack due to unknown causes, and the TCP performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})" ++ }, ++ { ++ "metric": "gala_gopher_tcp_link_retran_packets", ++ "priority": 1, ++ "description": "TCP retransmission is triggered due to network faults, resulting in TCP performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})" ++ }, ++ { ++ "metric": "gala_gopher_tcp_link_lost_out", ++ "priority": 3, ++ "description": "The network may be congested, causing abnormal TCP packet loss and performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})" ++ }, ++ { ++ "metric": "gala_gopher_tcp_link_notsent_bytes", ++ "priority": 4, ++ "description": "Due to network delay or peer application performance, too many packets to be sent are accumulated in the sliding window. As a result, TCP performance deteriorates. (PID = {}, client IP = {}, Server IP = {}, Port = {})" ++ } ++ ] ++} +\ No newline at end of file +diff --git a/config/module/sys_tcp_establish.json b/config/module/sys_tcp_establish.json +index 61ae72d..2c158c0 100644 +--- a/config/module/sys_tcp_establish.json ++++ b/config/module/sys_tcp_establish.json +@@ -15,7 +15,8 @@ + "params": { + "look_back": 30, + "outlier_ratio_th": 0.5, +- "obs_size": 3 ++ "obs_size": 3, ++ "min_rtt": 500000 + } + } + ], +-- +2.33.0 + diff --git a/add-chinese-descriptions.patch b/add-chinese-descriptions.patch new file mode 100644 index 0000000..43bfbd3 --- /dev/null +++ b/add-chinese-descriptions.patch @@ -0,0 +1,533 @@ +From e0e99ac8fc3de9e8781f5d7acd5e9fe1832461b0 Mon Sep 17 00:00:00 2001 +From: lizhenxing11 +Date: Tue, 3 Jan 2023 15:27:45 +0800 +Subject: [PATCH] add chinese descriptions + +update description + +fix typo + +update th +--- + anteater/core/kpi.py | 2 +- + anteater/template/app_anomaly_template.py | 5 ++- + anteater/template/sys_anomaly_template.py | 2 +- + anteater/utils/data_load.py | 14 ++++++-- + config/module/app_sli_rtt.json | 2 ++ + config/module/disk_throughput.json | 6 ++++ + config/module/proc_io_latency.json | 31 +++++++++++----- + config/module/sys_io_latency.json | 25 ++++++++----- + config/module/sys_nic_loss.json | 21 +++++++---- + config/module/sys_tcp_establish.json | 4 ++- + .../module/sys_tcp_transmission_latency.json | 36 ++++++++++++------- + 11 files changed, 104 insertions(+), 44 deletions(-) + +diff --git a/anteater/core/kpi.py b/anteater/core/kpi.py +index 3480139..f83b666 100644 +--- a/anteater/core/kpi.py ++++ b/anteater/core/kpi.py +@@ -23,7 +23,7 @@ class KPI: + kpi_type: str + entity_name: str + enable: bool +- description: str = "" ++ description: str + params: dict = field(default=dict) + atrend: AnomalyTrend = AnomalyTrend.DEFAULT + +diff --git a/anteater/template/app_anomaly_template.py b/anteater/template/app_anomaly_template.py +index 4df4a35..3770d2e 100644 +--- a/anteater/template/app_anomaly_template.py ++++ b/anteater/template/app_anomaly_template.py +@@ -33,7 +33,7 @@ class AppAnomalyTemplate(Template): + 'event_type': 'app', + 'event_source': 'gala-anteater', + 'keywords': self.keywords, +- 'cause_metric': self.cause_metrics[0] if self.cause_metrics else {'description': 'Unknown'} ++ 'cause_metric': self.cause_metrics[0] if self.cause_metrics else {'description': self.description} + }, + 'Resource': { + 'metric': self.metric, +@@ -46,8 +46,7 @@ class AppAnomalyTemplate(Template): + 'SeverityNumber': 13, + 'Body': f'{self.timestamp.strftime("%c")} WARN, APP may be impacting sli performance issues.', + 'event_id': f'{timestamp}_{self.entity_id}', +- "keywords": self.keywords, +- 'cause_metrics': self.cause_metrics[0] if self.cause_metrics else {'description': 'Unknown'} ++ "keywords": self.keywords + } + + return result +diff --git a/anteater/template/sys_anomaly_template.py b/anteater/template/sys_anomaly_template.py +index aec6ea0..d3c7e82 100644 +--- a/anteater/template/sys_anomaly_template.py ++++ b/anteater/template/sys_anomaly_template.py +@@ -33,7 +33,7 @@ class SysAnomalyTemplate(Template): + 'event_type': 'sys', + 'event_source': 'gala-anteater', + 'keywords': self.keywords, +- 'cause_metric': self.cause_metrics[0] if self.cause_metrics else {'description': 'Unknown'} ++ 'cause_metric': self.cause_metrics[0] if self.cause_metrics else {'description': self.description} + }, + 'Resource': { + 'metric': self.metric, +diff --git a/anteater/utils/data_load.py b/anteater/utils/data_load.py +index b6991c6..730c9c6 100644 +--- a/anteater/utils/data_load.py ++++ b/anteater/utils/data_load.py +@@ -47,8 +47,9 @@ def load_job_config(file_name) -> JobConfig: + job_type = config['job_type'] + keywords = config['keywords'] + root_cause_number = config['root_cause_number'] +- kpis = [KPI(**_conf) for _conf in config['KPI']] +- features = [Feature(**_conf) for _conf in config['Features']] ++ ++ kpis = [KPI(**update_description(_conf)) for _conf in config['KPI']] ++ features = [Feature(**update_description(_conf)) for _conf in config['Features']] + + model_config = None + if 'OnlineModel' in config: +@@ -81,3 +82,12 @@ def load_job_config(file_name) -> JobConfig: + features=features, + model_config=model_config + ) ++ ++ ++def update_description(conf: dict): ++ """Changes description to zh""" ++ if 'description-zh' in conf: ++ conf['description'] = conf['description-zh'] ++ del conf['description-zh'] ++ ++ return conf +diff --git a/config/module/app_sli_rtt.json b/config/module/app_sli_rtt.json +index db29392..0146883 100644 +--- a/config/module/app_sli_rtt.json ++++ b/config/module/app_sli_rtt.json +@@ -12,6 +12,7 @@ + "entity_name": "sli", + "enable": false, + "description": "sli rtt 异常", ++ "description-zh": "应用级请求往返时延(RTT)异常", + "params": { + "look_back": 10, + "obs_size": 25, +@@ -28,6 +29,7 @@ + "entity_name": "sli", + "enable": true, + "description": "sli tps 异常", ++ "description-zh": "应用级请求吞吐量(TPS)异常", + "params": { + "look_back": 10, + "obs_size": 25, +diff --git a/config/module/disk_throughput.json b/config/module/disk_throughput.json +index 00276c0..f6244f6 100644 +--- a/config/module/disk_throughput.json ++++ b/config/module/disk_throughput.json +@@ -12,6 +12,7 @@ + "entity_name": "disk", + "enable": true, + "description": "Disk read await time is increasing!", ++ "description-zh": "磁盘读响应时间升高,性能发生劣化", + "params": { + "look_back": 20, + "obs_size": 25, +@@ -28,6 +29,7 @@ + "entity_name": "disk", + "enable": true, + "description": "Disk write await time is increasing!", ++ "description-zh": "磁盘写响应时间升高,性能发生劣化", + "params": { + "look_back": 20, + "obs_size": 25, +@@ -68,24 +70,28 @@ + "metric": "gala_gopher_disk_rspeed_kB", + "priority": 0, + "description": "The disk I/O await time performance deteriorates due to read throughput rise (read kbytes/second).(Disk = {})", ++ "description-zh": "磁盘读吞吐量异常升高,导致I/O等待时间性能劣化(Disk = {})", + "atrend": "rise" + }, + { + "metric": "gala_gopher_disk_wspeed_kB", + "priority": 0, + "description": "The disk I/O await time performance deteriorates due to write throughput rise (write kbytes/second).(Disk = {})", ++ "description-zh": "磁盘写吞吐量异常升高,导致I/O等待时间性能劣化(Disk = {})", + "atrend": "rise" + }, + { + "metric": "gala_gopher_disk_rareq", + "priority": 0, + "description": "The disk I/O await time performance deteriorates due to read saturation rise.(Disk = {})", ++ "description-zh": "磁盘读饱和度量异常升高,导致I/O等待时间性能劣化(Disk = {})", + "atrend": "rise" + }, + { + "metric": "gala_gopher_disk_wareq", + "priority": 0, + "description": "The disk I/O await time performance deteriorates due to write saturation rise.(Disk = {})", ++ "description-zh": "磁盘读写饱和度量异常升高,导致I/O等待时间性能劣化(Disk = {})", + "atrend": "rise" + } + ] +diff --git a/config/module/proc_io_latency.json b/config/module/proc_io_latency.json +index c6c03c1..f086b87 100644 +--- a/config/module/proc_io_latency.json ++++ b/config/module/proc_io_latency.json +@@ -12,6 +12,7 @@ + "entity_name": "proc", + "enable": true, + "description": "I/O operation delay at the BIO layer (unit: us)", ++ "description-zh": "BIO层I/O操作延时高(单位:us)", + "params": { + "look_back": 20, + "obs_size": 37, +@@ -28,6 +29,7 @@ + "entity_name": "proc", + "enable": true, + "description": "Number of small I/O (less than 4 KB) read operations at the BIO layer.", ++ "description-zh": "BIO层小数据I/O读操作数量异常(小于4KB)", + "params": { + "look_back": 20, + "obs_size": 25, +@@ -44,6 +46,7 @@ + "entity_name": "proc", + "enable": true, + "description": "Number of small I/O (less than 4 KB) write operations at the BIO layer.", ++ "description-zh": "BIO层小数据I/O写操作数量异常(小于4KB)", + "params": { + "look_back": 20, + "obs_size": 25, +@@ -61,6 +64,7 @@ + "entity_name": "proc", + "enable": true, + "description": "Number of big I/O (greater than 4 KB) read operations at the BIO layer.", ++ "description-zh": "BIO层大数据I/O读操作数量异常(大于4KB)", + "params": { + "look_back": 20, + "obs_size": 25, +@@ -76,7 +80,8 @@ + "kpi_type": "", + "entity_name": "proc", + "enable": true, +- "description": "Number of big I/O (greater than 4 KB) read operations at the BIO layer.", ++ "description": "Number of big I/O (greater than 4 KB) write operations at the BIO layer.", ++ "description-zh": "BIO层大数据写操作数量异常(大于4KB)", + "params": { + "look_back": 20, + "obs_size": 25, +@@ -116,42 +121,50 @@ + { + "metric": "gala_gopher_block_latency_req_max", + "priority": 4, +- "description": "The system I/O performance deteriorates due to a drive failure.(Disk = {})" ++ "description": "Process I/O performance deteriorates due to system I/O bandwidth insufficient.(Disk = {})", ++ "description-zh": "系统I/O带宽不足引起进程I/O性能劣化(Disk={})" + }, + { + "metric": "gala_gopher_block_latency_device_max", + "priority": 3, +- "description": "Degraded system I/O performance due to device (disk) failure.(Disk = {})" ++ "description": "Process I/O performance deteriorates due to device I/O bandwidth insufficient.(Disk = {})", ++ "description-zh": "设备I/O带宽不足引起进程I/O性能劣化(Disk={})" + }, + { + "metric": "gala_gopher_block_read_bytes", + "priority": 2, +- "description": "System performance deteriorates due to frequent read I/O operations.(Disk = {})" ++ "description": "Process I/O performance deteriorates due to frequent read I/O operations.(Disk = {})", ++ "description-zh": "频繁I/O读操作引起进程I/O性能劣化(Disk={})" + }, + { + "metric": "gala_gopher_block_write_bytes", + "priority": 2, +- "description": "System performance deteriorates due to frequent write I/O operations.(Disk = {})" ++ "description": "Process I/O performance deteriorates due to frequent write I/O operations.(Disk = {})", ++ "description-zh": "频繁写操作引起进程I/O性能劣化(Disk={})" + }, + { + "metric": "gala_gopher_proc_less_4k_io_read", + "priority": 0, +- "description": "System performance degrades due to frequent small I/O read operations.(Disk = {}, PID = {}, comm = {})" ++ "description": "Process I/O performance degrades due to frequent small I/O read operations.(Disk = {}, PID = {}, comm = {})", ++ "description-zh": "频繁小数据量(小于4KB)读操作引起进程I/O性能劣化(Disk={},PID={},comm={})" + }, + { + "metric": "gala_gopher_proc_less_4k_io_write", + "priority": 0, +- "description": "System performance degrades due to frequent small I/O write operations.(Disk = {}, PID = {}, comm = {})" ++ "description": "Process I/O performance degrades due to frequent small I/O write operations.(Disk = {}, PID = {}, comm = {})", ++ "description-zh": "频繁小数据量(小于4KB)写操作引起进程I/O性能劣化(Disk={},PID={},comm={})" + }, + { + "metric": "gala_gopher_proc_greater_4k_io_read", + "priority": 1, +- "description": "System performance degrades due to frequent big I/O read operations.(Disk = {}, PID = {}, comm = {})" ++ "description": "Process I/O performance degrades due to frequent big I/O read operations.(Disk = {}, PID = {}, comm = {})", ++ "description-zh": "频繁大数据量(大于4KB)读操作引起进程I/O性能劣化(Disk={},PID={},comm={})" + }, + { + "metric": "gala_gopher_proc_greater_4k_io_write", + "priority": 1, +- "description": "System performance degrades due to frequent big I/O write operations.(Disk = {}, PID = {}, comm = {})" ++ "description": "Process I/O performance degrades due to frequent big I/O write operations.(Disk = {}, PID = {}, comm = {})", ++ "description-zh": "频繁大数据量(大于4KB)写操作引起进程I/O性能劣化(Disk={},PID={},comm={})" + } + ] + } +\ No newline at end of file +diff --git a/config/module/sys_io_latency.json b/config/module/sys_io_latency.json +index e58990d..bdf17d3 100644 +--- a/config/module/sys_io_latency.json ++++ b/config/module/sys_io_latency.json +@@ -12,6 +12,7 @@ + "entity_name": "block", + "enable": true, + "description": "Block I/O latency performance is deteriorating!", ++ "description-zh": "Block层I/O操作时延性能劣化", + "params": { + "look_back": 20, + "obs_size": 25, +@@ -51,42 +52,50 @@ + { + "metric": "gala_gopher_block_latency_driver_max", + "priority": 4, +- "description": "The system I/O performance deteriorates due to a drive failure.(Disk = {})" ++ "description": "The system I/O performance deteriorates due to a drive failure.(Disk = {})", ++ "description-zh": "驱动异常引起系统I/O性能劣化(Disk={})" + }, + { + "metric": "gala_gopher_block_latency_device_max", + "priority": 3, +- "description": "Degraded system I/O performance due to device (disk) failure.(Disk = {})" ++ "description": "Degraded system I/O performance due to device (disk) failure.(Disk = {})", ++ "description-zh": "设备(磁盘)异常引起系统I/O性能劣化(Disk={})" + }, + { + "metric": "gala_gopher_block_read_bytes", + "priority": 2, +- "description": "System performance deteriorates due to frequent read I/O operations.(Disk = {})" ++ "description": "System performance deteriorates due to frequent read I/O operations.(Disk = {})", ++ "description-zh": "频繁读操作引起系统I/O性能劣化(Disk={})" + }, + { + "metric": "gala_gopher_block_write_bytes", + "priority": 2, +- "description": "System performance deteriorates due to frequent write I/O operations.(Disk = {})" ++ "description": "System performance deteriorates due to frequent write I/O operations.(Disk = {})", ++ "description-zh": "频繁写操作引起系统I/O性能劣化(Disk={})" + }, + { + "metric": "gala_gopher_proc_less_4k_io_read", + "priority": 0, +- "description": "System performance degrades due to frequent small I/O read operations.(Disk = {}, PID = {}, comm = {})" ++ "description": "System performance degrades due to frequent small I/O read operations.(Disk = {}, PID = {}, comm = {})", ++ "description-zh": "频繁小数据量(小于4KB)读操作引起系统I/O性能劣化(Disk={},PID={},comm={})" + }, + { + "metric": "gala_gopher_proc_less_4k_io_write", + "priority": 0, +- "description": "System performance degrades due to frequent small I/O write operations.(Disk = {}, PID = {}, comm = {})" ++ "description": "System performance degrades due to frequent small I/O write operations.(Disk = {}, PID = {}, comm = {})", ++ "description-zh": "频繁小数据量(小于4KB)写操作引起系统I/O性能劣化(Disk={},PID={},comm={})" + }, + { + "metric": "gala_gopher_proc_greater_4k_io_read", + "priority": 1, +- "description": "System performance degrades due to frequent big I/O read operations.(Disk = {}, PID = {}, comm = {})" ++ "description": "System performance degrades due to frequent big I/O read operations.(Disk = {}, PID = {}, comm = {})", ++ "description-zh": "频繁大数据量(大于4KB)读操作引起系统I/O性能劣化(Disk={},PID={},comm={})" + }, + { + "metric": "gala_gopher_proc_greater_4k_io_write", + "priority": 1, +- "description": "System performance degrades due to frequent big I/O write operations.(Disk = {}, PID = {}, comm = {})" ++ "description": "System performance degrades due to frequent big I/O write operations.(Disk = {}, PID = {}, comm = {})", ++ "description-zh": "频繁大数据量(大于4KB)写操作引起系统I/O性能劣化(Disk={},PID={},comm={})" + } + ] + } +\ No newline at end of file +diff --git a/config/module/sys_nic_loss.json b/config/module/sys_nic_loss.json +index 793f82f..8a1feb8 100644 +--- a/config/module/sys_nic_loss.json ++++ b/config/module/sys_nic_loss.json +@@ -11,7 +11,8 @@ + "kpi_type": "", + "entity_name": "nic", + "enable": true, +- "description": "TC发送丢包数异常", ++ "description": "TC sent dropped packets", ++ "description-zh": "TC发送丢包数异常", + "params": { + "look_back": 2, + "th": 1 +@@ -22,32 +23,38 @@ + { + "metric": "gala_gopher_nic_tx_dropped", + "priority": 0, +- "description": "网卡发送丢弃的数据包数异常。(dev_name = {})" ++ "description": "The number of lost packets sent by the nic card are increasing and the NIC performance deteriorates.(dev_name = {})", ++ "description-zh": "网卡发送丢弃的数据包数增加,导致网卡性能劣化(dev_name={})" + }, + { + "metric": "gala_gopher_nic_rx_dropped", + "priority": 0, +- "description": "网卡接收丢弃的数据包数异常。(dev_name = {})" ++ "description": "The number of lost packets received by the nic card are increasing and the NIC performance deteriorates.(dev_name = {})", ++ "description-zh": "网卡接收丢弃的数据包数增加,导致网卡性能劣化(dev_name={})" + }, + { + "metric": "gala_gopher_tcp_link_sk_drops", + "priority": 3, +- "description": "Packets are lost in the host protocol stack due to unknown causes, and the TCP performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})" ++ "description": "Packets are lost in the host protocol stack due to unknown causes, and the NIC performance deteriorates. (PID = {}, client IP = {}, Server IP = {}, Port = {})", ++ "description-zh": "由于未知原因,数据包在主机协议栈中丢失,导致网卡性能劣化(PID={},client IP={},Server IP={},Port={})" + }, + { + "metric": "gala_gopher_tcp_link_retran_packets", + "priority": 1, +- "description": "TCP retransmission is triggered due to network faults, resulting in TCP performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})" ++ "description": "TCP retransmission is triggered due to network faults, resulting in the NIC performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})", ++ "description-zh": "网络故障触发TCP重传,导致网卡性能下降(PID={},client IP={},Server IP={},Port={})" + }, + { + "metric": "gala_gopher_tcp_link_lost_out", + "priority": 3, +- "description": "The network may be congested, causing abnormal TCP packet loss and performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})" ++ "description": "The network may be congested, causing abnormal NIC packet loss and performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})", ++ "description-zh": "网络拥塞,导致网卡异常丢包,性能劣化(PID={},client IP={},Server IP={},Port={})" + }, + { + "metric": "gala_gopher_tcp_link_notsent_bytes", + "priority": 4, +- "description": "Due to network delay or peer application performance, too many packets to be sent are accumulated in the sliding window. As a result, TCP performance deteriorates. (PID = {}, client IP = {}, Server IP = {}, Port = {})" ++ "description": "Due to network delay or peer application performance, too many packets to be sent are accumulated in the sliding window. As a result, TCP performance deteriorates. (PID = {}, client IP = {}, Server IP = {}, Port = {})", ++ "description-zh": "由于网络延迟或对端应用程序性能,滑动窗口中累积了太多要发送的数据包,导致网卡性能劣化(PID={},client IP={},Server IP={},Port={})" + } + ] + } +\ No newline at end of file +diff --git a/config/module/sys_tcp_establish.json b/config/module/sys_tcp_establish.json +index 2c158c0..7cd2369 100644 +--- a/config/module/sys_tcp_establish.json ++++ b/config/module/sys_tcp_establish.json +@@ -12,6 +12,7 @@ + "entity_name": "tcp_link", + "enable": true, + "description": "RTT of syn packet(us): the max syn packets rtt is {:.0f} us", ++ "description-zh": "SYN数据包时延异常:最大SYN数据包时延为:{:.0f}us。", + "params": { + "look_back": 30, + "outlier_ratio_th": 0.5, +@@ -24,7 +25,8 @@ + { + "metric": "gala_gopher_endpoint_retran_synacks", + "priority": 0, +- "description": "TCP established performance deteriorates due to loss of SYN/ACK packets.(PID = {}, TCP Listen Port = {})" ++ "description": "TCP established performance deteriorates due to loss of SYN/ACK packets.(PID = {}, TCP Listen Port = {})", ++ "description-zh": "由于SYN/ACK数据包丢失,TCP建链性能劣化(PID={},TCP Listen Port={})" + } + ] + } +\ No newline at end of file +diff --git a/config/module/sys_tcp_transmission_latency.json b/config/module/sys_tcp_transmission_latency.json +index d9e7f80..0527487 100644 +--- a/config/module/sys_tcp_transmission_latency.json ++++ b/config/module/sys_tcp_transmission_latency.json +@@ -12,10 +12,11 @@ + "entity_name": "tcp_link", + "enable": true, + "description": "Smoothed Round Trip Time(us)", ++ "description-zh": "TCP链接往返时延异常,性能劣化", + "params": { + "look_back": 20, + "obs_size": 25, +- "n": 4, ++ "n": 3, + "outlier_ratio_th": 0.4, + "smooth_params": { + "method": "conv_smooth", +@@ -52,57 +53,68 @@ + { + "metric": "gala_gopher_tcp_link_notsent_bytes", + "priority": 4, +- "description": "Due to network delay or peer application performance, too many packets to be sent are accumulated in the sliding window. As a result, TCP performance deteriorates. (PID = {}, client IP = {}, Server IP = {}, Port = {})" ++ "description": "Due to network delay or peer application performance, too many packets to be sent are accumulated in the sliding window. As a result, TCP performance deteriorates. (PID = {}, client IP = {}, Server IP = {}, Port = {})", ++ "description-zh": "由于网络延迟或对端应用程序性能,滑动窗口中累积了太多要发送的数据包,导致TCP传输性能劣化(PID={},client IP={},Server IP={},Port={})" + }, + { + "metric": "gala_gopher_tcp_link_notack_bytes", + "priority": 4, +- "description": "Due to network delay or peer application performance, too many NO ACK packets are accumulated in the sliding window. As a result, TCP performance deteriorates. (PID = {}, client IP = {}, Server IP = {}, Port = {})" ++ "description": "Due to network delay or peer application performance, too many NO ACK packets are accumulated in the sliding window. As a result, TCP performance deteriorates. (PID = {}, client IP = {}, Server IP = {}, Port = {})", ++ "description-zh": "由于网络延迟或对端应用程序性能,滑动窗口中累积了过多的NO ACK数据包,导致TCP传输性能劣化(PID={},client IP={},Server IP={},Port={})" + }, + { + "metric": "gala_gopher_tcp_link_snd_wnd", + "priority": 4, +- "description": "The TCP send window is abnormal due to peer application performance or network congestion. As a result, the TCP performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})" ++ "description": "The TCP send window is abnormal due to peer application performance or network congestion. As a result, the TCP performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})", ++ "description-zh": "对端应用性能或网络拥塞导致TCP发送窗口异常,导致TCP传输性能劣化(PID={},client IP={},Server IP={},Port={})" + }, + { + "metric": "gala_gopher_tcp_link_rcv_wnd", + "priority": 4, +- "description": "The TCP receive window becomes abnormal due to the local application performance. As a result, the performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})" ++ "description": "The TCP receive window becomes abnormal due to the local application performance. As a result, the performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})", ++ "description-zh": "本地应用性能导致TCP接收窗口异常,传输性能变差(PID={},client IP={},Server IP={},Port={})" + }, + { + "metric": "gala_gopher_tcp_link_avl_snd_wnd", + "priority": 4, +- "description": "The available TCP send window may be abnormal due to network congestion and the performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})" ++ "description": "The available TCP send window may be abnormal due to network congestion and the performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})", ++ "description-zh": "可用的TCP发送窗口可能因网络拥塞而异常,传输性能劣化(PID={},client IP={},Server IP={},Port={})" + }, + { + "metric": "gala_gopher_tcp_link_lost_out", + "priority": 3, +- "description": "The network may be congested, causing abnormal TCP packet loss and performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})" ++ "description": "The network may be congested, causing abnormal TCP packet loss and performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})", ++ "description-zh": "网络可能拥塞,导致TCP异常丢包,传输性能劣化(PID={},client IP={},Server IP={},Port={})" + }, + { + "metric": "gala_gopher_tcp_link_sk_drops", + "priority": 3, +- "description": "Packets are lost in the host protocol stack due to unknown causes, and the TCP performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})" ++ "description": "Packets are lost in the host protocol stack due to unknown causes, and the TCP performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})", ++ "description-zh": "主机协议栈不明原因丢包,导致TCP传输性能劣化(PID={},client IP={},Server IP={},Port={})" + }, + { + "metric": "gala_gopher_tcp_link_retran_packets", + "priority": 1, +- "description": "TCP retransmission is triggered due to network faults, resulting in TCP performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})" ++ "description": "TCP retransmission is triggered due to network faults, resulting in TCP performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})", ++ "description-zh": "网络故障触发TCP重传,导致TCP传输性能劣化(PID={},client IP={},Server IP={},Port={})" + }, + { + "metric": "gala_gopher_tcp_link_backlog_drops", + "priority": 0, +- "description": "TCP backlog overflows due to local application performance. As a result, TCP performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})" ++ "description": "TCP backlog overflows due to local application performance. As a result, TCP performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})", ++ "description-zh": "由于本地应用程序性能问题,TCP积压溢出,导致TCP传输性能劣化(PID={},client IP={},Server IP={},Port={})" + }, + { + "metric": "gala_gopher_tcp_link_sacked_out", + "priority": 2, +- "description": "TCP performance deteriorates due to network out-of-order. (PID ={}, client IP = {}, Server IP = {}, Port = {})" ++ "description": "TCP performance deteriorates due to network out-of-order. (PID ={}, client IP = {}, Server IP = {}, Port = {})", ++ "description-zh": "网络乱序导致TCP传输性能劣化(PID={},client IP={},Server IP={},Port={})" + }, + { + "metric": "gala_gopher_tcp_link_sk_backlog_size", + "priority": 0, +- "description": "The TCP backlog queue length is abnormal due to the local application performance. As a result, the TCP performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})" ++ "description": "The TCP backlog queue length is abnormal due to the local application performance. As a result, the TCP performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})", ++ "description-zh": "本地应用性能导致TCP backlog队列长度异常,TCP传输性能劣化(PID={},client IP={},Server IP={},Port={})" + } + ] + } +\ No newline at end of file +-- +2.33.0 + diff --git a/add-systemd-service-for-anteater.patch b/add-systemd-service-for-anteater.patch new file mode 100644 index 0000000..cfee1e6 --- /dev/null +++ b/add-systemd-service-for-anteater.patch @@ -0,0 +1,81 @@ +From 2ef581e4960dd0ba49bbe371496933841da001fe Mon Sep 17 00:00:00 2001 +From: lizhenxing11 +Date: Mon, 9 Jan 2023 15:08:01 +0800 +Subject: [PATCH] add systemd service for anteater + +add manifest.in +--- + MANIFEST.in | 11 +++++++++++ + service/gala-anteater.service | 12 ++++++++++++ + setup.py | 5 +++-- + 3 files changed, 26 insertions(+), 2 deletions(-) + create mode 100644 MANIFEST.in + create mode 100644 service/gala-anteater.service + +diff --git a/MANIFEST.in b/MANIFEST.in +new file mode 100644 +index 0000000..7120af9 +--- /dev/null ++++ b/MANIFEST.in +@@ -0,0 +1,11 @@ ++include LICENSE ++include README.en.md ++include README.md ++include requirements.txt ++ ++recursive-include service * ++recursive-include tests * ++recursive-include docs * ++ ++recursive-exclude * __pycache__ ++recursive-exclude * *.py[co] +\ No newline at end of file +diff --git a/service/gala-anteater.service b/service/gala-anteater.service +new file mode 100644 +index 0000000..24af354 +--- /dev/null ++++ b/service/gala-anteater.service +@@ -0,0 +1,12 @@ ++[Unit] ++Description=A-Ops gala-anteater service ++After=network.target ++ ++[Service] ++Type=exec ++ExecStart=/usr/bin/gala-anteater ++Restart=on-failure ++RestartSec=1 ++ ++[Install] ++WantedBy=multi-user.target +\ No newline at end of file +diff --git a/setup.py b/setup.py +index 4471a0f..e075391 100644 +--- a/setup.py ++++ b/setup.py +@@ -23,11 +23,12 @@ setup( + description="Times Series Anomaly Detection Platform on Operating System", + url="https://gitee.com/openeuler/A-Ops/tree/master/gala-anteater", + keywords=["Anomaly Detection", "Time Series Analysis", "Operating System"], +- packages=find_packages(where="."), ++ packages=find_packages(where=".", exclude=("tests",)), + data_files=[ + ('/etc/gala-anteater/config/', glob('config/gala-anteater.yaml')), + ('/etc/gala-anteater/config/', glob('config/log.settings.ini')), + ('/etc/gala-anteater/config/module/', glob('config/module/*')), ++ ('/usr/lib/systemd/system/', glob('service/*')), + ], + install_requires=[ + "APScheduler", +@@ -42,7 +43,7 @@ setup( + ], + entry_points={ + "console_scripts": [ +- "gala-anteater = anteater.main:main", ++ "gala-anteater=anteater.main:main", + ] + } + ) +-- +2.33.0 + diff --git a/fix-str2enum-bug-data-query-refactor.patch b/fix-str2enum-bug-data-query-refactor.patch new file mode 100644 index 0000000..2f0bf7f --- /dev/null +++ b/fix-str2enum-bug-data-query-refactor.patch @@ -0,0 +1,737 @@ +From 27bb7cdd80f76bfc7ebb0f3041544740aa2fa91b Mon Sep 17 00:00:00 2001 +From: lizhenxing11 +Date: Tue, 10 Jan 2023 15:31:44 +0800 +Subject: [PATCH] fix str2enum bug & data query refactor + +--- + anteater/core/anomaly.py | 10 ++++ + anteater/core/kpi.py | 14 ++++++ + anteater/model/algorithms/slope.py | 11 +++-- + anteater/model/detector/base.py | 20 ++++---- + anteater/model/detector/n_sigma_detector.py | 15 +++--- + .../model/detector/online_vae_detector.py | 3 +- + .../tcp_establish_n_sigma_detector.py | 3 +- + .../tcp_trans_latency_n_sigma_detector.py | 48 +++++++++++++++++-- + anteater/model/detector/th_base_detector.py | 3 +- + anteater/module/app/app_sli_detector.py | 4 +- + anteater/module/sys/disk_throughput.py | 4 +- + anteater/module/sys/proc_io_latency.py | 4 +- + anteater/module/sys/sys_io_latency.py | 4 +- + .../module/sys/tcp_transmission_latency.py | 4 +- + .../module/sys/tcp_transmission_throughput.py | 4 +- + anteater/source/metric_loader.py | 41 +++++++++++++++- + anteater/utils/data_load.py | 4 +- + config/module/app_sli_rtt.json | 6 ++- + config/module/disk_throughput.json | 6 ++- + config/module/proc_io_latency.json | 15 ++++-- + config/module/sys_io_latency.json | 2 +- + config/module/sys_tcp_establish.json | 2 +- + .../module/sys_tcp_transmission_latency.json | 4 +- + 23 files changed, 172 insertions(+), 59 deletions(-) + +diff --git a/anteater/core/anomaly.py b/anteater/core/anomaly.py +index 45c4fc3..fdee3d1 100644 +--- a/anteater/core/anomaly.py ++++ b/anteater/core/anomaly.py +@@ -52,3 +52,13 @@ class AnomalyTrend(Enum): + DEFAULT = 0 + RISE = 1 + FALL = 2 ++ ++ @staticmethod ++ def from_str(label: str): ++ """Trans str to Enum type""" ++ if label.upper() == 'RISE': ++ return AnomalyTrend.RISE ++ elif label.upper() == 'FALL': ++ return AnomalyTrend.FALL ++ else: ++ return AnomalyTrend.DEFAULT +diff --git a/anteater/core/kpi.py b/anteater/core/kpi.py +index f83b666..70cc9ee 100644 +--- a/anteater/core/kpi.py ++++ b/anteater/core/kpi.py +@@ -27,6 +27,13 @@ class KPI: + params: dict = field(default=dict) + atrend: AnomalyTrend = AnomalyTrend.DEFAULT + ++ @classmethod ++ def from_dict(cls, **data): ++ if 'atrend' in data: ++ data['atrend'] = AnomalyTrend.from_str(data.get('atrend')) ++ ++ return cls(**data) ++ + + @dataclass + class Feature: +@@ -35,6 +42,13 @@ class Feature: + priority: int = 0 + atrend: AnomalyTrend = AnomalyTrend.DEFAULT + ++ @classmethod ++ def from_dict(cls, **data): ++ if 'atrend' in data: ++ data['atrend'] = AnomalyTrend.from_str(data.get('atrend')) ++ ++ return cls(**data) ++ + + @dataclass + class ModelConfig: +diff --git a/anteater/model/algorithms/slope.py b/anteater/model/algorithms/slope.py +index d324d58..e546183 100644 +--- a/anteater/model/algorithms/slope.py ++++ b/anteater/model/algorithms/slope.py +@@ -17,6 +17,7 @@ import numpy as np + + from anteater.core.anomaly import AnomalyTrend + from anteater.model.algorithms.smooth import conv_smooth ++from anteater.utils.common import divide + + + def slope(y, win_len): +@@ -36,13 +37,15 @@ def smooth_slope(time_series, windows_length): + + def trend(y, win_len=None): + """Gets the trend for the y""" ++ y = conv_smooth(y, box_pts=7) ++ + if not win_len: + win_len = len(y) // 2 + +- if np.mean(y[:win_len]) < np.mean(y[-win_len:]): ++ if divide(np.mean(y[:win_len]), np.mean(y[-win_len:])) < 0.9: + return 1 + +- elif np.mean(y[:win_len]) > np.mean(y[-win_len:]): ++ elif divide(np.mean(y[:win_len]), np.mean(y[-win_len:])) > 1.1: + return -1 + + else: +@@ -51,10 +54,10 @@ def trend(y, win_len=None): + + def check_trend(values: List[float], atrend: AnomalyTrend): + """Checks the values with an 'atrend' trend""" +- if atrend == AnomalyTrend.RISE and trend(values) < 0: ++ if atrend == AnomalyTrend.RISE and trend(values) != 1: + return False + +- if atrend == AnomalyTrend.FALL and trend(values) > 0: ++ if atrend == AnomalyTrend.FALL and trend(values) != -1: + return False + + return True +diff --git a/anteater/model/detector/base.py b/anteater/model/detector/base.py +index 2b2dafe..a23b6d9 100644 +--- a/anteater/model/detector/base.py ++++ b/anteater/model/detector/base.py +@@ -11,6 +11,7 @@ + # See the Mulan PSL v2 for more details. + # ******************************************************************************/ + ++import logging + import math + from abc import abstractmethod + from typing import List +@@ -39,12 +40,6 @@ class Detector: + """Executes anomaly detection on kpis""" + pass + +- def get_unique_machine_id(self, start, end, kpis: List[KPI]) -> List[str]: +- """Gets unique machine ids during past minutes""" +- metrics = [_kpi.metric for _kpi in kpis] +- machine_ids = self.data_loader.get_unique_machines(start, end, metrics) +- return machine_ids +- + def execute(self, job_config: JobConfig) -> List[Anomaly]: + """The main function of the detector""" + kpis = job_config.kpis +@@ -56,6 +51,12 @@ class Detector: + + return self._execute(kpis, features, top_n=n) + ++ def get_unique_machine_id(self, start, end, kpis: List[KPI]) -> List[str]: ++ """Gets unique machine ids during past minutes""" ++ metrics = [_kpi.metric for _kpi in kpis] ++ machine_ids = self.data_loader.get_unique_machines(start, end, metrics) ++ return machine_ids ++ + def find_root_causes(self, anomalies: List[Anomaly], features: List[Feature], top_n=3)\ + -> List[Anomaly]: + """Finds root causes for each anomaly events""" +@@ -82,6 +83,7 @@ class Detector: + tmp_ts_scores = self.cal_anomaly_score(f.metric, f.description, machine_id=machine_id) + for _ts_score in tmp_ts_scores: + if not check_trend(_ts_score.ts.values, f.atrend): ++ logging.info(f"Trends Filtered: {f.metric}") + _ts_score.score = 0 + if same_intersection_key_value(_ts_score.ts.labels, filters): + ts_scores.append(_ts_score) +@@ -101,6 +103,7 @@ class Detector: + for _ts_s in ts_scores: + if same_intersection_key_value(_ts_s.ts.labels, anomaly.labels): + if not check_trend(_ts_s.ts.values, kpi_atrends[anomaly.metric]): ++ logging.info(f"Trends Filtered: {anomaly.metric}") + anomaly.score = 0 + else: + anomaly.score = _ts_s.score +@@ -115,12 +118,11 @@ class Detector: + machine_id: str)\ + -> List[TimeSeriesScore]: + """Calculates metric anomaly scores based on sr model""" +- start, end = dt.last(minutes=6) ++ start, end = dt.last(minutes=10) + point_count = self.data_loader.expected_point_length(start, end) + model = SpectralResidual(12, 24, 50) + ts_scores = [] +- ts_list = self.data_loader.\ +- get_metric(start, end, metric, label_name='machine_id', label_value=machine_id) ++ ts_list = self.data_loader.get_metric(start, end, metric, machine_id=machine_id) + for _ts in ts_list: + if sum(_ts.values) == 0 or \ + len(_ts.values) < point_count * 0.9 or\ +diff --git a/anteater/model/detector/n_sigma_detector.py b/anteater/model/detector/n_sigma_detector.py +index 3a2ab01..dbf83c6 100644 +--- a/anteater/model/detector/n_sigma_detector.py ++++ b/anteater/model/detector/n_sigma_detector.py +@@ -29,10 +29,9 @@ from anteater.utils.log import logger + class NSigmaDetector(Detector): + """The three sigma anomaly detector""" + +- def __init__(self, data_loader: MetricLoader, method: str): ++ def __init__(self, data_loader: MetricLoader): + """The detector base class initializer""" + super().__init__(data_loader) +- self.method = method + + def detect_kpis(self, kpis: List[KPI]): + """Executes anomaly detection on kpis""" +@@ -48,7 +47,7 @@ class NSigmaDetector(Detector): + def detect_signal_kpi(self, kpi, machine_id: str) -> List[Anomaly]: + """Detects kpi based on signal time series anomaly detection model""" + outlier_ratio_th = kpi.params['outlier_ratio_th'] +- ts_scores = self.calculate_metric_three_sigma_score( ++ ts_scores = self.calculate_n_sigma_score( + kpi.metric, kpi.description, machine_id, **kpi.params) + if not ts_scores: + logger.warning(f'Key metric {kpi.metric} is null on the target machine {machine_id}!') +@@ -68,17 +67,17 @@ class NSigmaDetector(Detector): + + return anomalies + +- def calculate_metric_three_sigma_score(self, metric, description, machine_id: str, **kwargs)\ ++ def calculate_n_sigma_score(self, metric, description, machine_id: str, **kwargs)\ + -> List[TimeSeriesScore]: + """Calculate kpi anomaly scores based on three sigma scores""" ++ method = kwargs.get('method', 'abs') + look_back = kwargs.get('look_back') + smooth_params = kwargs.get('smooth_params') + obs_size = kwargs.get('obs_size') + n = kwargs.get('n', 3) + start, end = dt.last(minutes=look_back) + point_count = self.data_loader.expected_point_length(start, end) +- ts_list = self.data_loader.\ +- get_metric(start, end, metric, label_name='machine_id', label_value=machine_id) ++ ts_list = self.data_loader.get_metric(start, end, metric, machine_id=machine_id) + ts_scores = [] + for _ts in ts_list: + dedup_values = [k for k, g in groupby(_ts.values)] +@@ -87,12 +86,12 @@ class NSigmaDetector(Detector): + len(_ts.values) > point_count * 1.5 or \ + all(x == _ts.values[0] for x in _ts.values): + ratio = 0 +- elif len(dedup_values) < point_count * 0.3: ++ elif len(dedup_values) < point_count * 0.6: + ratio = 0 + else: + smoothed_val = smoothing(_ts.values, **smooth_params) + outlier, mean, std = n_sigma( +- smoothed_val, obs_size=obs_size, n=n, method=self.method) ++ smoothed_val, obs_size=obs_size, n=n, method=method) + ratio = divide(len(outlier), obs_size) + + ts_scores.append(TimeSeriesScore(ts=_ts, score=ratio, description=description)) +diff --git a/anteater/model/detector/online_vae_detector.py b/anteater/model/detector/online_vae_detector.py +index 63a7b09..0f91576 100644 +--- a/anteater/model/detector/online_vae_detector.py ++++ b/anteater/model/detector/online_vae_detector.py +@@ -110,8 +110,7 @@ class OnlineVAEDetector(Detector): + metric_dfs = [] + for metric in metrics: + _ts_list = self.data_loader.\ +- get_metric(start, end, metric, label_name="machine_id", +- label_value=machine_id, operator_name='avg') ++ get_metric(start, end, metric, operator='avg', keys="machine_id", machine_id=machine_id) + + if len(_ts_list) > 1: + raise ValueError(f'Got multiple time_series based on machine id: {len(_ts_list)}') +diff --git a/anteater/model/detector/tcp_establish_n_sigma_detector.py b/anteater/model/detector/tcp_establish_n_sigma_detector.py +index 82d7837..3720069 100644 +--- a/anteater/model/detector/tcp_establish_n_sigma_detector.py ++++ b/anteater/model/detector/tcp_establish_n_sigma_detector.py +@@ -73,8 +73,7 @@ class TcpEstablishNSigmaDetector(Detector): + min_rtt = kpi.params.get('min_rtt') + + start, end = dt.last(minutes=look_back) +- ts_list = self.data_loader.\ +- get_metric(start, end, kpi.metric, label_name='machine_id', label_value=machine_id) ++ ts_list = self.data_loader.get_metric(start, end, kpi.metric, machine_id=machine_id) + + anomalies = [] + for _ts in ts_list: +diff --git a/anteater/model/detector/tcp_trans_latency_n_sigma_detector.py b/anteater/model/detector/tcp_trans_latency_n_sigma_detector.py +index 1eeb95f..6d41775 100644 +--- a/anteater/model/detector/tcp_trans_latency_n_sigma_detector.py ++++ b/anteater/model/detector/tcp_trans_latency_n_sigma_detector.py +@@ -11,20 +11,61 @@ + # See the Mulan PSL v2 for more details. + # ******************************************************************************/ + ++from itertools import groupby + from typing import List + ++import numpy as np ++ + from anteater.core.time_series import TimeSeriesScore ++from anteater.model.algorithms.smooth import smoothing ++from anteater.model.algorithms.three_sigma import n_sigma + from anteater.model.detector.n_sigma_detector import NSigmaDetector + from anteater.source.metric_loader import MetricLoader ++from anteater.utils.common import divide + from anteater.utils.datetime import DateTimeManager as dt + + + class TcpTransLatencyNSigmaDetector(NSigmaDetector): + """The three sigma anomaly detector""" + +- def __init__(self, data_loader: MetricLoader, method: str): ++ def __init__(self, data_loader: MetricLoader): + """The detector base class initializer""" +- super().__init__(data_loader, method) ++ super().__init__(data_loader) ++ ++ def calculate_n_sigma_score(self, metric, description, machine_id: str, **kwargs)\ ++ -> List[TimeSeriesScore]: ++ """Calculates anomaly scores based on n sigma scores""" ++ method = kwargs.get('method', 'abs') ++ look_back = kwargs.get('look_back') ++ smooth_params = kwargs.get('smooth_params') ++ obs_size = kwargs.get('obs_size') ++ min_srtt = kwargs.get("min_srtt") ++ n = kwargs.get('n', 3) ++ start, end = dt.last(minutes=look_back) ++ point_count = self.data_loader.expected_point_length(start, end) ++ ts_list = self.data_loader.get_metric(start, end, metric, machine_id=machine_id) ++ ts_scores = [] ++ for _ts in ts_list: ++ dedup_values = [k for k, g in groupby(_ts.values)] ++ if sum(_ts.values) == 0 or \ ++ len(_ts.values) < point_count * 0.6 or \ ++ len(_ts.values) > point_count * 1.5 or \ ++ all(x == _ts.values[0] for x in _ts.values): ++ ratio = 0 ++ elif len(dedup_values) < point_count * 0.6: ++ ratio = 0 ++ else: ++ smoothed_val = smoothing(_ts.values, **smooth_params) ++ outlier, mean, std = n_sigma( ++ smoothed_val, obs_size=obs_size, n=n, method=method) ++ if outlier and np.average(outlier) <= min_srtt: ++ ratio = 0 ++ else: ++ ratio = divide(len(outlier), obs_size) ++ ++ ts_scores.append(TimeSeriesScore(ts=_ts, score=ratio, description=description)) ++ ++ return ts_scores + + def cal_anomaly_score(self, metric, description, machine_id: str) \ + -> List[TimeSeriesScore]: +@@ -32,8 +73,7 @@ class TcpTransLatencyNSigmaDetector(NSigmaDetector): + start, end = dt.last(minutes=2) + point_count = self.data_loader.expected_point_length(start, end) + ts_scores = [] +- ts_list = self.data_loader. \ +- get_metric(start, end, metric, label_name='machine_id', label_value=machine_id) ++ ts_list = self.data_loader.get_metric(start, end, metric, machine_id=machine_id) + for _ts in ts_list: + if sum(_ts.values) == 0 or \ + len(_ts.values) < point_count * 0.5 or \ +diff --git a/anteater/model/detector/th_base_detector.py b/anteater/model/detector/th_base_detector.py +index bec9705..0af4f22 100644 +--- a/anteater/model/detector/th_base_detector.py ++++ b/anteater/model/detector/th_base_detector.py +@@ -44,8 +44,7 @@ class ThBaseDetector(Detector): + look_back = kpi.params.get('look_back') + th = kpi.params.get('th') + start, end = dt.last(minutes=look_back) +- ts_list = self.data_loader.\ +- get_metric(start, end, kpi.metric, label_name='machine_id', label_value=machine_id) ++ ts_list = self.data_loader.get_metric(start, end, kpi.metric, machine_id=machine_id) + + if not ts_list: + logger.warning(f'Key metric {kpi.metric} is null on the target machine {machine_id}!') +diff --git a/anteater/module/app/app_sli_detector.py b/anteater/module/app/app_sli_detector.py +index 102ed11..e506332 100644 +--- a/anteater/module/app/app_sli_detector.py ++++ b/anteater/module/app/app_sli_detector.py +@@ -44,12 +44,12 @@ class APPSliDetector(E2EDetector): + def init_detectors(self, data_loader): + if self.job_config.model_config.enable: + detectors = [ +- NSigmaDetector(data_loader, method='min'), ++ NSigmaDetector(data_loader), + OnlineVAEDetector(data_loader, self.job_config.model_config) + ] + else: + detectors = [ +- NSigmaDetector(data_loader, method='min') ++ NSigmaDetector(data_loader) + ] + + return detectors +diff --git a/anteater/module/sys/disk_throughput.py b/anteater/module/sys/disk_throughput.py +index 9a192fb..7971505 100644 +--- a/anteater/module/sys/disk_throughput.py ++++ b/anteater/module/sys/disk_throughput.py +@@ -38,12 +38,12 @@ class DiskThroughputDetector(E2EDetector): + def init_detectors(self, data_loader): + if self.job_config.model_config.enable: + detectors = [ +- NSigmaDetector(data_loader, method='max'), ++ NSigmaDetector(data_loader), + OnlineVAEDetector(data_loader, self.job_config.model_config) + ] + else: + detectors = [ +- NSigmaDetector(data_loader, method='max') ++ NSigmaDetector(data_loader) + ] + + return detectors +diff --git a/anteater/module/sys/proc_io_latency.py b/anteater/module/sys/proc_io_latency.py +index a34c48d..b76acea 100644 +--- a/anteater/module/sys/proc_io_latency.py ++++ b/anteater/module/sys/proc_io_latency.py +@@ -38,12 +38,12 @@ class ProcIOLatencyDetector(E2EDetector): + def init_detectors(self, data_loader): + if self.job_config.model_config.enable: + detectors = [ +- NSigmaDetector(data_loader, method='max'), ++ NSigmaDetector(data_loader), + OnlineVAEDetector(data_loader, self.job_config.model_config) + ] + else: + detectors = [ +- NSigmaDetector(data_loader, method='max') ++ NSigmaDetector(data_loader) + ] + + return detectors +diff --git a/anteater/module/sys/sys_io_latency.py b/anteater/module/sys/sys_io_latency.py +index a6f01c2..17a34c9 100644 +--- a/anteater/module/sys/sys_io_latency.py ++++ b/anteater/module/sys/sys_io_latency.py +@@ -38,12 +38,12 @@ class SysIOLatencyDetector(E2EDetector): + def init_detectors(self, data_loader): + if self.job_config.model_config.enable: + detectors = [ +- NSigmaDetector(data_loader, method='abs'), ++ NSigmaDetector(data_loader), + OnlineVAEDetector(data_loader, self.job_config.model_config) + ] + else: + detectors = [ +- NSigmaDetector(data_loader, method='abs') ++ NSigmaDetector(data_loader) + ] + + return detectors +diff --git a/anteater/module/sys/tcp_transmission_latency.py b/anteater/module/sys/tcp_transmission_latency.py +index cf0f406..e085ec3 100644 +--- a/anteater/module/sys/tcp_transmission_latency.py ++++ b/anteater/module/sys/tcp_transmission_latency.py +@@ -39,12 +39,12 @@ class SysTcpTransmissionLatencyDetector(E2EDetector): + def init_detectors(self, data_loader): + if self.job_config.model_config.enable: + detectors = [ +- TcpTransLatencyNSigmaDetector(data_loader, method='max'), ++ TcpTransLatencyNSigmaDetector(data_loader), + OnlineVAEDetector(data_loader, self.job_config.model_config) + ] + else: + detectors = [ +- TcpTransLatencyNSigmaDetector(data_loader, method='max') ++ TcpTransLatencyNSigmaDetector(data_loader) + ] + + return detectors +diff --git a/anteater/module/sys/tcp_transmission_throughput.py b/anteater/module/sys/tcp_transmission_throughput.py +index 86ecc9e..2921602 100644 +--- a/anteater/module/sys/tcp_transmission_throughput.py ++++ b/anteater/module/sys/tcp_transmission_throughput.py +@@ -38,12 +38,12 @@ class SysTcpTransmissionThroughputDetector(E2EDetector): + def init_detectors(self, data_loader): + if self.job_config.model_config.enable: + detectors = [ +- NSigmaDetector(data_loader, method='abs'), ++ NSigmaDetector(data_loader), + OnlineVAEDetector(data_loader, self.job_config.model_config) + ] + else: + detectors = [ +- NSigmaDetector(data_loader, method='abs') ++ NSigmaDetector(data_loader) + ] + + return detectors +diff --git a/anteater/source/metric_loader.py b/anteater/source/metric_loader.py +index ef2d012..4745d87 100644 +--- a/anteater/source/metric_loader.py ++++ b/anteater/source/metric_loader.py +@@ -65,6 +65,43 @@ def get_query(metric: str, + return query + + ++def get_query2( ++ metric: str, operator: str = None, value: float = None, keys: Union[str, List] = None, **labels): ++ """Gets aggregated query patterns ++ ++ Format: [operator]([value,] metric{[**labels]}) by (keys) ++ ++ Such as: ++ - 1. gala_gopher_bind_sends{machine_id="1234"} ++ - 2. sum(gala_gopher_bind_sends) by (machine_id) ++ - 2. sum(gala_gopher_bind_sends) by (machine_id) ++ - 3. sum(gala_gopher_bind_sends{machine_id="1234"}) by (machine_id) ++ - 4. quantile(0.7, gala_gopher_bind_sends{machine_id="1234"}) by (machine_id) ++ """ ++ if operator and not keys: ++ raise ValueError("Please provide param 'keys' when specified 'operator'!") ++ ++ rule = "" ++ if labels: ++ pairs = ",".join([f"{n}='{v}'" for n, v in labels.items()]) ++ rule = f"{{{pairs}}}" ++ ++ group = "" ++ if isinstance(keys, list): ++ group = ",".join([k for k in keys]) ++ elif isinstance(keys, str): ++ group = keys ++ ++ if operator and value: ++ query = f"{operator}({value}, {metric}{rule}) by ({group})" ++ elif operator: ++ query = f"{operator}({metric}{rule}) by ({group})" ++ else: ++ query = f"{metric}{rule}" ++ ++ return query ++ ++ + class MetricLoader: + """ + The metric loader that consumes raw data from PrometheusAdapter, +@@ -87,7 +124,7 @@ class MetricLoader: + + :return List of TimeSeries + """ +- query = get_query(metric, **kwargs) ++ query = get_query2(metric, **kwargs) + time_series = self.provider.range_query(start, end, metric, query) + + return time_series +@@ -109,7 +146,7 @@ class MetricLoader: + """Gets unique labels of all metrics""" + unique_labels = set() + for metric in metrics: +- time_series = self.get_metric(start, end, metric, label_name=label_name) ++ time_series = self.get_metric(start, end, metric) + unique_labels.update([item.labels.get(label_name, "") for item in time_series]) + + return list([lbl for lbl in unique_labels if lbl]) +diff --git a/anteater/utils/data_load.py b/anteater/utils/data_load.py +index 730c9c6..60c28e5 100644 +--- a/anteater/utils/data_load.py ++++ b/anteater/utils/data_load.py +@@ -48,8 +48,8 @@ def load_job_config(file_name) -> JobConfig: + keywords = config['keywords'] + root_cause_number = config['root_cause_number'] + +- kpis = [KPI(**update_description(_conf)) for _conf in config['KPI']] +- features = [Feature(**update_description(_conf)) for _conf in config['Features']] ++ kpis = [KPI.from_dict(**update_description(_conf)) for _conf in config['KPI']] ++ features = [Feature.from_dict(**update_description(_conf)) for _conf in config['Features']] + + model_config = None + if 'OnlineModel' in config: +diff --git a/config/module/app_sli_rtt.json b/config/module/app_sli_rtt.json +index 0146883..5027b8d 100644 +--- a/config/module/app_sli_rtt.json ++++ b/config/module/app_sli_rtt.json +@@ -10,13 +10,14 @@ + "metric": "gala_gopher_sli_rtt_nsec", + "kpi_type": "rtt", + "entity_name": "sli", +- "enable": false, ++ "enable": true, + "description": "sli rtt 异常", + "description-zh": "应用级请求往返时延(RTT)异常", + "params": { ++ "method": "max", + "look_back": 10, + "obs_size": 25, +- "outlier_ratio_th": 0.3, ++ "outlier_ratio_th": 0.5, + "smooth_params": { + "method": "conv_smooth", + "box_pts": 3 +@@ -31,6 +32,7 @@ + "description": "sli tps 异常", + "description-zh": "应用级请求吞吐量(TPS)异常", + "params": { ++ "method": "min", + "look_back": 10, + "obs_size": 25, + "outlier_ratio_th": 0.3, +diff --git a/config/module/disk_throughput.json b/config/module/disk_throughput.json +index f6244f6..e3bcf68 100644 +--- a/config/module/disk_throughput.json ++++ b/config/module/disk_throughput.json +@@ -14,9 +14,10 @@ + "description": "Disk read await time is increasing!", + "description-zh": "磁盘读响应时间升高,性能发生劣化", + "params": { ++ "method": "max", + "look_back": 20, + "obs_size": 25, +- "outlier_ratio_th": 0.3, ++ "outlier_ratio_th": 0.5, + "smooth_params": { + "method": "conv_smooth", + "box_pts": 3 +@@ -31,9 +32,10 @@ + "description": "Disk write await time is increasing!", + "description-zh": "磁盘写响应时间升高,性能发生劣化", + "params": { ++ "method": "max", + "look_back": 20, + "obs_size": 25, +- "outlier_ratio_th": 0.3, ++ "outlier_ratio_th": 0.5, + "smooth_params": { + "method": "conv_smooth", + "box_pts": 3 +diff --git a/config/module/proc_io_latency.json b/config/module/proc_io_latency.json +index f086b87..171c5f4 100644 +--- a/config/module/proc_io_latency.json ++++ b/config/module/proc_io_latency.json +@@ -14,9 +14,10 @@ + "description": "I/O operation delay at the BIO layer (unit: us)", + "description-zh": "BIO层I/O操作延时高(单位:us)", + "params": { ++ "method":"max", + "look_back": 20, + "obs_size": 37, +- "outlier_ratio_th": 0.4, ++ "outlier_ratio_th": 0.5, + "smooth_params": { + "method": "conv_smooth", + "box_pts": 3 +@@ -31,9 +32,10 @@ + "description": "Number of small I/O (less than 4 KB) read operations at the BIO layer.", + "description-zh": "BIO层小数据I/O读操作数量异常(小于4KB)", + "params": { ++ "method":"max", + "look_back": 20, + "obs_size": 25, +- "outlier_ratio_th": 0.3, ++ "outlier_ratio_th": 0.4, + "smooth_params": { + "method": "conv_smooth", + "box_pts": 3 +@@ -48,9 +50,10 @@ + "description": "Number of small I/O (less than 4 KB) write operations at the BIO layer.", + "description-zh": "BIO层小数据I/O写操作数量异常(小于4KB)", + "params": { ++ "method":"max", + "look_back": 20, + "obs_size": 25, +- "outlier_ratio_th": 0.3, ++ "outlier_ratio_th": 0.4, + "smooth_params": { + "method": "savgol_smooth", + "window_length": 13, +@@ -66,9 +69,10 @@ + "description": "Number of big I/O (greater than 4 KB) read operations at the BIO layer.", + "description-zh": "BIO层大数据I/O读操作数量异常(大于4KB)", + "params": { ++ "method":"max", + "look_back": 20, + "obs_size": 25, +- "outlier_ratio_th": 0.3, ++ "outlier_ratio_th": 0.4, + "smooth_params": { + "method": "conv_smooth", + "box_pts": 3 +@@ -83,9 +87,10 @@ + "description": "Number of big I/O (greater than 4 KB) write operations at the BIO layer.", + "description-zh": "BIO层大数据写操作数量异常(大于4KB)", + "params": { ++ "method":"max", + "look_back": 20, + "obs_size": 25, +- "outlier_ratio_th": 0.3, ++ "outlier_ratio_th": 0.4, + "smooth_params": { + "method": "conv_smooth", + "box_pts": 3 +diff --git a/config/module/sys_io_latency.json b/config/module/sys_io_latency.json +index bdf17d3..3fa1266 100644 +--- a/config/module/sys_io_latency.json ++++ b/config/module/sys_io_latency.json +@@ -16,7 +16,7 @@ + "params": { + "look_back": 20, + "obs_size": 25, +- "outlier_ratio_th": 0.3, ++ "outlier_ratio_th": 0.4, + "smooth_params": { + "method": "conv_smooth", + "box_pts": 3 +diff --git a/config/module/sys_tcp_establish.json b/config/module/sys_tcp_establish.json +index 7cd2369..9bd2a46 100644 +--- a/config/module/sys_tcp_establish.json ++++ b/config/module/sys_tcp_establish.json +@@ -17,7 +17,7 @@ + "look_back": 30, + "outlier_ratio_th": 0.5, + "obs_size": 3, +- "min_rtt": 500000 ++ "min_rtt": 100000 + } + } + ], +diff --git a/config/module/sys_tcp_transmission_latency.json b/config/module/sys_tcp_transmission_latency.json +index 0527487..3ba8113 100644 +--- a/config/module/sys_tcp_transmission_latency.json ++++ b/config/module/sys_tcp_transmission_latency.json +@@ -14,10 +14,12 @@ + "description": "Smoothed Round Trip Time(us)", + "description-zh": "TCP链接往返时延异常,性能劣化", + "params": { ++ "method": "max", + "look_back": 20, + "obs_size": 25, + "n": 3, +- "outlier_ratio_th": 0.4, ++ "min_srtt": 20000, ++ "outlier_ratio_th": 0.6, + "smooth_params": { + "method": "conv_smooth", + "box_pts": 3 +-- +2.33.0 + diff --git a/gala-anteater-1.0.1.tar.gz b/gala-anteater-1.0.1.tar.gz index b309b88..e19781b 100644 Binary files a/gala-anteater-1.0.1.tar.gz and b/gala-anteater-1.0.1.tar.gz differ diff --git a/gala-anteater.spec b/gala-anteater.spec index a6d7685..6f825e3 100644 --- a/gala-anteater.spec +++ b/gala-anteater.spec @@ -2,7 +2,7 @@ Name: gala-anteater Version: 1.0.1 -Release: 1 +Release: 2 Summary: A time-series anomaly detection platform for operating system. License: MulanPSL2 URL: https://gitee.com/openeuler/gala-anteater @@ -11,6 +11,13 @@ BuildRoot: %{_builddir}/%{name}-%{version} BuildRequires: procps-ng python3-setuptools Requires: python3-gala-anteater = %{version}-%{release} +Patch1: Add-disk-throughput-detector.patch +Patch2: Update-TCP-Establish-Model-Add-Nic-Loss-Detector.patch +Patch3: add-chinese-descriptions.patch +Patch4: remove-sys-level-config-param.patch +Patch5: add-systemd-service-for-anteater.patch +Patch6: fix-str2enum-bug-data-query-refactor.patch + %description Abnormal detection module for A-Ops project @@ -31,11 +38,19 @@ Python3 package of gala-anteater %install %py3_install +%pre +if [ -f "%{_unitdir}/gala-anteater.service" ] ; then + systemctl enable gala-anteater.service || : +fi + %post +%systemd_post gala-anteater.service %preun +%systemd_preun gala-anteater.service %postun +%systemd_postun_with_restart gala-anteater.service %files %doc README.md @@ -49,7 +64,9 @@ Python3 package of gala-anteater %config(noreplace) %{_sysconfdir}/%{name}/config/module/sys_tcp_establish.json %config(noreplace) %{_sysconfdir}/%{name}/config/module/sys_tcp_transmission_latency.json %config(noreplace) %{_sysconfdir}/%{name}/config/module/sys_tcp_transmission_throughput.json - +%config(noreplace) %{_sysconfdir}/%{name}/config/module/disk_throughput.json +%config(noreplace) %{_sysconfdir}/%{name}/config/module/sys_nic_loss.json +/usr/lib/systemd/system/gala-anteater.service %files -n python3-gala-anteater %{python3_sitelib}/anteater/* @@ -57,6 +74,14 @@ Python3 package of gala-anteater %changelog +* Tue Jan 17 2023 Zhen Chen - 1.0.1-2 +- fix str2enum bug & data query refactor +- add systemd service for anteater +- remove 'sys-level' config param +- add chinese descriptions +- Update TCP Establish Model & Add Nic Loss Detector +- Add disk throughput detector + * Wed Nov 30 2022 Li Zhenxing - 1.0.1-1 - Add sys level anomaly detection and cause inference diff --git a/remove-sys-level-config-param.patch b/remove-sys-level-config-param.patch new file mode 100644 index 0000000..4d0c7fd --- /dev/null +++ b/remove-sys-level-config-param.patch @@ -0,0 +1,98 @@ +From 5c6b03a49a49ddc56574e906f959d5fe34c1debc Mon Sep 17 00:00:00 2001 +From: lizhenxing11 +Date: Fri, 6 Jan 2023 10:59:12 +0800 +Subject: [PATCH] remove 'sys-level' config param + +--- + anteater/config.py | 1 - + anteater/main.py | 29 +++++++++++------------------ + config/gala-anteater.yaml | 1 - + docs/conf_introduction.md | 1 - + 4 files changed, 11 insertions(+), 21 deletions(-) + +diff --git a/anteater/config.py b/anteater/config.py +index e9ab557..caeceec 100644 +--- a/anteater/config.py ++++ b/anteater/config.py +@@ -27,7 +27,6 @@ import yaml + class GlobalConf: + """The global config""" + data_source: str +- sys_level: bool + + + @dataclass +diff --git a/anteater/main.py b/anteater/main.py +index 4de72f9..87aae95 100644 +--- a/anteater/main.py ++++ b/anteater/main.py +@@ -26,7 +26,6 @@ from anteater.module.sys.nic_loss import NICLossDetector + from anteater.module.sys.proc_io_latency import ProcIOLatencyDetector + from anteater.module.sys.sys_io_latency import SysIOLatencyDetector + from anteater.module.sys.tcp_establish import SysTcpEstablishDetector +-from anteater.module.sys.tcp_transmission_throughput import SysTcpTransmissionThroughputDetector + from anteater.module.sys.tcp_transmission_latency import SysTcpTransmissionLatencyDetector + from anteater.provider.kafka import KafkaProvider + from anteater.source.anomaly_report import AnomalyReport +@@ -49,24 +48,18 @@ def main(): + kafka_provider = KafkaProvider(conf.kafka) + loader = MetricLoader(conf) + report = AnomalyReport(kafka_provider) +- if conf.global_conf.sys_level: +- detectors = [ +- # APP sli anomaly detection +- APPSliDetector(loader, report), ++ detectors = [ ++ # APP sli anomaly detection ++ APPSliDetector(loader, report), + +- # SYS tcp/io detection +- SysTcpEstablishDetector(loader, report), +- SysTcpTransmissionLatencyDetector(loader, report), +- SysIOLatencyDetector(loader, report), +- ProcIOLatencyDetector(loader, report), +- DiskThroughputDetector(loader, report), +- NICLossDetector(loader, report), +- ] +- else: +- detectors = [ +- # APP sli anomaly detection +- APPSliDetector(loader, report) +- ] ++ # SYS tcp/io detection ++ SysTcpEstablishDetector(loader, report), ++ SysTcpTransmissionLatencyDetector(loader, report), ++ SysIOLatencyDetector(loader, report), ++ ProcIOLatencyDetector(loader, report), ++ DiskThroughputDetector(loader, report), ++ NICLossDetector(loader, report), ++ ] + + anomaly_detect = AnomalyDetection(detectors, conf) + +diff --git a/config/gala-anteater.yaml b/config/gala-anteater.yaml +index c4c54a0..72ffc31 100644 +--- a/config/gala-anteater.yaml ++++ b/config/gala-anteater.yaml +@@ -1,6 +1,5 @@ + Global: + data_source: "prometheus" +- sys_level: false + + Kafka: + server: "localhost" +diff --git a/docs/conf_introduction.md b/docs/conf_introduction.md +index 09a7284..869d3e9 100644 +--- a/docs/conf_introduction.md ++++ b/docs/conf_introduction.md +@@ -16,7 +16,6 @@ gala-anteater # gala-anteater 主目录 + 在文件`gala-anteater.yaml`中,配置`gala-anteater`启动时所需的参数。该配置项中,主要包含: + - Global: 配置启动时的全局变量 + - data_source: 时序数据的来源,目前支持`"prometheus"`(Prometheus)和`"aom"`(AOM)两种数据来源; +- - sys_level: 是否支持`系统级`异常检测,可选:`true`、`false`。 + + - Kafka: 配置中间件Kafka所需的参数 + - server: Kafak对应的`server ip`,如:"10.xxx.xxx.xxx"; +-- +2.33.0 +