From dd870b17120f3c7961c4613d454f1653fbd42214 Mon Sep 17 00:00:00 2001 From: lizhenxing11 Date: Tue, 27 Dec 2022 18:39:32 +0800 Subject: [PATCH] Update TCP Establish Model & Add Nic Loss Detector change method 'abs' to 'max' --- anteater/main.py | 2 + anteater/model/algorithms/three_sigma.py | 4 +- anteater/model/detector/n_sigma_detector.py | 4 +- .../tcp_establish_n_sigma_detector.py | 12 +++- anteater/model/detector/th_base_detector.py | 66 +++++++++++++++++++ anteater/module/sys/nic_loss.py | 59 +++++++++++++++++ anteater/module/sys/proc_io_latency.py | 4 +- anteater/template/app_anomaly_template.py | 2 + anteater/template/sys_anomaly_template.py | 1 + config/module/sys_nic_loss.json | 53 +++++++++++++++ config/module/sys_tcp_establish.json | 3 +- 11 files changed, 200 insertions(+), 10 deletions(-) create mode 100644 anteater/model/detector/th_base_detector.py create mode 100644 anteater/module/sys/nic_loss.py create mode 100644 config/module/sys_nic_loss.json diff --git a/anteater/main.py b/anteater/main.py index ba7be70..4de72f9 100644 --- a/anteater/main.py +++ b/anteater/main.py @@ -22,6 +22,7 @@ from anteater.anomaly_detection import AnomalyDetection from anteater.config import AnteaterConf from anteater.module.app.app_sli_detector import APPSliDetector from anteater.module.sys.disk_throughput import DiskThroughputDetector +from anteater.module.sys.nic_loss import NICLossDetector from anteater.module.sys.proc_io_latency import ProcIOLatencyDetector from anteater.module.sys.sys_io_latency import SysIOLatencyDetector from anteater.module.sys.tcp_establish import SysTcpEstablishDetector @@ -59,6 +60,7 @@ def main(): SysIOLatencyDetector(loader, report), ProcIOLatencyDetector(loader, report), DiskThroughputDetector(loader, report), + NICLossDetector(loader, report), ] else: detectors = [ diff --git a/anteater/model/algorithms/three_sigma.py b/anteater/model/algorithms/three_sigma.py index 49b9952..0865417 100644 --- a/anteater/model/algorithms/three_sigma.py 
+++ b/anteater/model/algorithms/three_sigma.py @@ -14,8 +14,8 @@ import numpy as np -def three_sigma(values, obs_size, n=3, method="abs"): - """The '3-sigma rule' outlier detect function""" +def n_sigma(values, obs_size, n=3, method="abs"): + """The 'N-sigma rule' outlier detect function""" if obs_size <= 0: raise ValueError("The obs_size should great than zero!") if len(values) <= obs_size: diff --git a/anteater/model/detector/n_sigma_detector.py b/anteater/model/detector/n_sigma_detector.py index f632326..3a2ab01 100644 --- a/anteater/model/detector/n_sigma_detector.py +++ b/anteater/model/detector/n_sigma_detector.py @@ -19,7 +19,7 @@ from anteater.core.kpi import KPI from anteater.core.time_series import TimeSeriesScore from anteater.model.detector.base import Detector from anteater.model.algorithms.smooth import smoothing -from anteater.model.algorithms.three_sigma import three_sigma +from anteater.model.algorithms.three_sigma import n_sigma from anteater.source.metric_loader import MetricLoader from anteater.utils.common import divide from anteater.utils.datetime import DateTimeManager as dt @@ -91,7 +91,7 @@ class NSigmaDetector(Detector): ratio = 0 else: smoothed_val = smoothing(_ts.values, **smooth_params) - outlier, mean, std = three_sigma( + outlier, mean, std = n_sigma( smoothed_val, obs_size=obs_size, n=n, method=self.method) ratio = divide(len(outlier), obs_size) diff --git a/anteater/model/detector/tcp_establish_n_sigma_detector.py b/anteater/model/detector/tcp_establish_n_sigma_detector.py index 8dcf9ae..82d7837 100644 --- a/anteater/model/detector/tcp_establish_n_sigma_detector.py +++ b/anteater/model/detector/tcp_establish_n_sigma_detector.py @@ -42,8 +42,13 @@ class TcpEstablishNSigmaDetector(Detector): start, _ = dt.last(minutes=look_back) mid, _ = dt.last(minutes=3) + filtered_ts_list = [] ts_list = self.data_loader.get_metric(start, mid, kpi.metric) - establish_time = reduce(lambda x, y: x + y, [list(set(_ts.values)) for _ts in ts_list]) + for 
_ts in ts_list: + if sum(_ts.values) > 0: + filtered_ts_list.append(_ts) + + establish_time = reduce(lambda x, y: x + y, [list(set(_ts.values)) for _ts in filtered_ts_list]) self.mean = np.mean(establish_time) self.std = np.std(establish_time) @@ -65,6 +70,7 @@ class TcpEstablishNSigmaDetector(Detector): """Detects kpi based on signal time series anomaly detection model""" outlier_ratio_th = kpi.params.get('outlier_ratio_th') look_back = kpi.params.get('obs_size') + min_rtt = kpi.params.get('min_rtt') start, end = dt.last(minutes=look_back) ts_list = self.data_loader.\ @@ -72,9 +78,9 @@ class TcpEstablishNSigmaDetector(Detector): anomalies = [] for _ts in ts_list: - outlier = [val for val in _ts.values if abs(val - self.mean) > 3 * self.std] + outlier = [val for val in _ts.values if val > self.mean + 5 * self.std] ratio = divide(len(outlier), len(_ts.values)) - if outlier and ratio > outlier_ratio_th: + if outlier and ratio > outlier_ratio_th and np.average(outlier) >= min_rtt: anomalies.append( Anomaly( machine_id=machine_id, diff --git a/anteater/model/detector/th_base_detector.py b/anteater/model/detector/th_base_detector.py new file mode 100644 index 0000000..bec9705 --- /dev/null +++ b/anteater/model/detector/th_base_detector.py @@ -0,0 +1,66 @@ +#!/usr/bin/python3 +# ****************************************************************************** +# Copyright (c) 2022 Huawei Technologies Co., Ltd. +# gala-anteater is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. 
+# ******************************************************************************/ + +from typing import List + +from anteater.core.anomaly import Anomaly +from anteater.core.kpi import KPI +from anteater.model.detector.base import Detector +from anteater.source.metric_loader import MetricLoader +from anteater.utils.datetime import DateTimeManager as dt +from anteater.utils.log import logger + + +class ThBaseDetector(Detector): + """The threshold-based anomaly detector""" + + def __init__(self, data_loader: MetricLoader): + """The detector base class initializer""" + super().__init__(data_loader) + + def detect_kpis(self, kpis: List[KPI]): + """Executes anomaly detection on kpis""" + start, end = dt.last(minutes=1) + machine_ids = self.get_unique_machine_id(start, end, kpis) + anomalies = [] + for _id in machine_ids: + for kpi in kpis: + anomalies.extend(self.detect_signal_kpi(kpi, _id)) + + return anomalies + + def detect_signal_kpi(self, kpi, machine_id: str) -> List[Anomaly]: + """Detects kpi based on threshold based anomaly detection model""" + look_back = kpi.params.get('look_back') + th = kpi.params.get('th') + start, end = dt.last(minutes=look_back) + ts_list = self.data_loader.\ + get_metric(start, end, kpi.metric, label_name='machine_id', label_value=machine_id) + + if not ts_list: + logger.warning(f'Key metric {kpi.metric} is null on the target machine {machine_id}!') + return [] + + anomalies = [ + Anomaly( + machine_id=machine_id, + metric=_ts.metric, + labels=_ts.labels, + score=1, + entity_name=kpi.entity_name, + description=kpi.description) + for _ts in ts_list + if sum(_ts.values) >= th + ] + + return anomalies diff --git a/anteater/module/sys/nic_loss.py b/anteater/module/sys/nic_loss.py new file mode 100644 index 0000000..d24e06f --- /dev/null +++ b/anteater/module/sys/nic_loss.py @@ -0,0 +1,59 @@ +#!/usr/bin/python3 +# ****************************************************************************** +# Copyright (c) 2022 Huawei Technologies Co., 
Ltd. +# gala-anteater is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. +# ******************************************************************************/ + +from typing import List, Dict + +from anteater.core.anomaly import Anomaly +from anteater.model.detector.th_base_detector import ThBaseDetector +from anteater.module.base import E2EDetector +from anteater.source.anomaly_report import AnomalyReport +from anteater.source.metric_loader import MetricLoader +from anteater.template.sys_anomaly_template import SysAnomalyTemplate + + +class NICLossDetector(E2EDetector): + """SYS nic loss e2e detector which detects the network loss. 
+ """ + + config_file = 'sys_nic_loss.json' + + def __init__(self, data_loader: MetricLoader, reporter: AnomalyReport): + """The system nic loss e2e detector initializer""" + super().__init__(reporter, SysAnomalyTemplate) + + self.detectors = [ + ThBaseDetector(data_loader) + ] + + def parse_cause_metrics(self, anomaly: Anomaly) -> List[Dict]: + """Parses the cause metrics into the specific formats""" + cause_metrics = [] + for _cs in anomaly.root_causes: + tmp = { + 'metric': _cs.ts.metric, + 'labels': _cs.ts.labels, + 'score': _cs.score, + } + if 'tcp' in _cs.ts.metric: + tmp['description'] = _cs.description.format( + _cs.ts.labels.get('tgid', ''), + _cs.ts.labels.get('client_port', ''), + _cs.ts.labels.get('server_ip', ''), + _cs.ts.labels.get('server_port', '')) + else: + tmp['description'] = _cs.description.format( + _cs.ts.labels.get('dev_name', '')) + + cause_metrics.append(tmp) + + return cause_metrics diff --git a/anteater/module/sys/proc_io_latency.py b/anteater/module/sys/proc_io_latency.py index 43e069f..a34c48d 100644 --- a/anteater/module/sys/proc_io_latency.py +++ b/anteater/module/sys/proc_io_latency.py @@ -38,12 +38,12 @@ class ProcIOLatencyDetector(E2EDetector): def init_detectors(self, data_loader): if self.job_config.model_config.enable: detectors = [ - NSigmaDetector(data_loader, method='abs'), + NSigmaDetector(data_loader, method='max'), OnlineVAEDetector(data_loader, self.job_config.model_config) ] else: detectors = [ - NSigmaDetector(data_loader, method='abs') + NSigmaDetector(data_loader, method='max') ] return detectors diff --git a/anteater/template/app_anomaly_template.py b/anteater/template/app_anomaly_template.py index a509c96..4df4a35 100644 --- a/anteater/template/app_anomaly_template.py +++ b/anteater/template/app_anomaly_template.py @@ -46,6 +46,8 @@ class AppAnomalyTemplate(Template): 'SeverityNumber': 13, 'Body': f'{self.timestamp.strftime("%c")} WARN, APP may be impacting sli performance issues.', 'event_id': 
f'{timestamp}_{self.entity_id}', + "keywords": self.keywords, + 'cause_metrics': self.cause_metrics[0] if self.cause_metrics else {'description': 'Unknown'} } return result diff --git a/anteater/template/sys_anomaly_template.py b/anteater/template/sys_anomaly_template.py index 4ac6abb..aec6ea0 100644 --- a/anteater/template/sys_anomaly_template.py +++ b/anteater/template/sys_anomaly_template.py @@ -46,6 +46,7 @@ class SysAnomalyTemplate(Template): 'SeverityNumber': 13, 'Body': f'{self.timestamp.strftime("%c")} WARN, SYS may be impacting performance issues.', 'event_id': f'{timestamp}_{self.entity_id}', + "keywords": self.keywords } return result diff --git a/config/module/sys_nic_loss.json b/config/module/sys_nic_loss.json new file mode 100644 index 0000000..793f82f --- /dev/null +++ b/config/module/sys_nic_loss.json @@ -0,0 +1,53 @@ +{ + "name": "sys_tcp_transmission_latency", + "job_type": "sys", + "keywords": [ + "net" + ], + "root_cause_number": 3, + "KPI": [ + { + "metric": "gala_gopher_nic_tc_sent_drop", + "kpi_type": "", + "entity_name": "nic", + "enable": true, + "description": "TC发送丢包数异常", + "params": { + "look_back": 2, + "th": 1 + } + } + ], + "Features": [ + { + "metric": "gala_gopher_nic_tx_dropped", + "priority": 0, + "description": "网卡发送丢弃的数据包数异常。(dev_name = {})" + }, + { + "metric": "gala_gopher_nic_rx_dropped", + "priority": 0, + "description": "网卡接收丢弃的数据包数异常。(dev_name = {})" + }, + { + "metric": "gala_gopher_tcp_link_sk_drops", + "priority": 3, + "description": "Packets are lost in the host protocol stack due to unknown causes, and the TCP performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})" + }, + { + "metric": "gala_gopher_tcp_link_retran_packets", + "priority": 1, + "description": "TCP retransmission is triggered due to network faults, resulting in TCP performance deterioration. 
(PID ={}, client IP = {}, Server IP = {}, Port = {})" + }, + { + "metric": "gala_gopher_tcp_link_lost_out", + "priority": 3, + "description": "The network may be congested, causing abnormal TCP packet loss and performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})" + }, + { + "metric": "gala_gopher_tcp_link_notsent_bytes", + "priority": 4, + "description": "Due to network delay or peer application performance, too many packets to be sent are accumulated in the sliding window. As a result, TCP performance deteriorates. (PID = {}, client IP = {}, Server IP = {}, Port = {})" + } + ] +} \ No newline at end of file diff --git a/config/module/sys_tcp_establish.json b/config/module/sys_tcp_establish.json index 61ae72d..2c158c0 100644 --- a/config/module/sys_tcp_establish.json +++ b/config/module/sys_tcp_establish.json @@ -15,7 +15,8 @@ "params": { "look_back": 30, "outlier_ratio_th": 0.5, - "obs_size": 3 + "obs_size": 3, + "min_rtt": 500000 } } ], -- 2.33.0