Fix str2enum bug and refactor data query; add systemd service for anteater; remove 'sys-level' config param; add Chinese descriptions; update TCP Establish Model and add NIC Loss Detector; add disk throughput detector (cherry picked from commit f3c17e8c6a619a7803afd89b945ae3f36d17f9b0)
378 lines
15 KiB
Diff
378 lines
15 KiB
Diff
From dd870b17120f3c7961c4613d454f1653fbd42214 Mon Sep 17 00:00:00 2001
|
|
From: lizhenxing11 <lizhenxing11@huawei.com>
|
|
Date: Tue, 27 Dec 2022 18:39:32 +0800
|
|
Subject: [PATCH] Update TCP Establish Model & Add Nic Loss Detector
|
|
|
|
change method 'abs' to 'max'
|
|
---
|
|
anteater/main.py | 2 +
|
|
anteater/model/algorithms/three_sigma.py | 4 +-
|
|
anteater/model/detector/n_sigma_detector.py | 4 +-
|
|
.../tcp_establish_n_sigma_detector.py | 12 +++-
|
|
anteater/model/detector/th_base_detector.py | 66 +++++++++++++++++++
|
|
anteater/module/sys/nic_loss.py | 59 +++++++++++++++++
|
|
anteater/module/sys/proc_io_latency.py | 4 +-
|
|
anteater/template/app_anomaly_template.py | 2 +
|
|
anteater/template/sys_anomaly_template.py | 1 +
|
|
config/module/sys_nic_loss.json | 53 +++++++++++++++
|
|
config/module/sys_tcp_establish.json | 3 +-
|
|
11 files changed, 200 insertions(+), 10 deletions(-)
|
|
create mode 100644 anteater/model/detector/th_base_detector.py
|
|
create mode 100644 anteater/module/sys/nic_loss.py
|
|
create mode 100644 config/module/sys_nic_loss.json
|
|
|
|
diff --git a/anteater/main.py b/anteater/main.py
|
|
index ba7be70..4de72f9 100644
|
|
--- a/anteater/main.py
|
|
+++ b/anteater/main.py
|
|
@@ -22,6 +22,7 @@ from anteater.anomaly_detection import AnomalyDetection
|
|
from anteater.config import AnteaterConf
|
|
from anteater.module.app.app_sli_detector import APPSliDetector
|
|
from anteater.module.sys.disk_throughput import DiskThroughputDetector
|
|
+from anteater.module.sys.nic_loss import NICLossDetector
|
|
from anteater.module.sys.proc_io_latency import ProcIOLatencyDetector
|
|
from anteater.module.sys.sys_io_latency import SysIOLatencyDetector
|
|
from anteater.module.sys.tcp_establish import SysTcpEstablishDetector
|
|
@@ -59,6 +60,7 @@ def main():
|
|
SysIOLatencyDetector(loader, report),
|
|
ProcIOLatencyDetector(loader, report),
|
|
DiskThroughputDetector(loader, report),
|
|
+ NICLossDetector(loader, report),
|
|
]
|
|
else:
|
|
detectors = [
|
|
diff --git a/anteater/model/algorithms/three_sigma.py b/anteater/model/algorithms/three_sigma.py
|
|
index 49b9952..0865417 100644
|
|
--- a/anteater/model/algorithms/three_sigma.py
|
|
+++ b/anteater/model/algorithms/three_sigma.py
|
|
@@ -14,8 +14,8 @@
|
|
import numpy as np
|
|
|
|
|
|
-def three_sigma(values, obs_size, n=3, method="abs"):
|
|
- """The '3-sigma rule' outlier detect function"""
|
|
+def n_sigma(values, obs_size, n=3, method="abs"):
|
|
+ """The 'N-sigma rule' outlier detect function"""
|
|
if obs_size <= 0:
|
|
raise ValueError("The obs_size should great than zero!")
|
|
if len(values) <= obs_size:
|
|
diff --git a/anteater/model/detector/n_sigma_detector.py b/anteater/model/detector/n_sigma_detector.py
|
|
index f632326..3a2ab01 100644
|
|
--- a/anteater/model/detector/n_sigma_detector.py
|
|
+++ b/anteater/model/detector/n_sigma_detector.py
|
|
@@ -19,7 +19,7 @@ from anteater.core.kpi import KPI
|
|
from anteater.core.time_series import TimeSeriesScore
|
|
from anteater.model.detector.base import Detector
|
|
from anteater.model.algorithms.smooth import smoothing
|
|
-from anteater.model.algorithms.three_sigma import three_sigma
|
|
+from anteater.model.algorithms.three_sigma import n_sigma
|
|
from anteater.source.metric_loader import MetricLoader
|
|
from anteater.utils.common import divide
|
|
from anteater.utils.datetime import DateTimeManager as dt
|
|
@@ -91,7 +91,7 @@ class NSigmaDetector(Detector):
|
|
ratio = 0
|
|
else:
|
|
smoothed_val = smoothing(_ts.values, **smooth_params)
|
|
- outlier, mean, std = three_sigma(
|
|
+ outlier, mean, std = n_sigma(
|
|
smoothed_val, obs_size=obs_size, n=n, method=self.method)
|
|
ratio = divide(len(outlier), obs_size)
|
|
|
|
diff --git a/anteater/model/detector/tcp_establish_n_sigma_detector.py b/anteater/model/detector/tcp_establish_n_sigma_detector.py
|
|
index 8dcf9ae..82d7837 100644
|
|
--- a/anteater/model/detector/tcp_establish_n_sigma_detector.py
|
|
+++ b/anteater/model/detector/tcp_establish_n_sigma_detector.py
|
|
@@ -42,8 +42,13 @@ class TcpEstablishNSigmaDetector(Detector):
|
|
start, _ = dt.last(minutes=look_back)
|
|
mid, _ = dt.last(minutes=3)
|
|
|
|
+ filtered_ts_list = []
|
|
ts_list = self.data_loader.get_metric(start, mid, kpi.metric)
|
|
- establish_time = reduce(lambda x, y: x + y, [list(set(_ts.values)) for _ts in ts_list])
|
|
+ for _ts in ts_list:
|
|
+ if sum(_ts.values) > 0:
|
|
+ filtered_ts_list.append(_ts)
|
|
+
|
|
+ establish_time = reduce(lambda x, y: x + y, [list(set(_ts.values)) for _ts in filtered_ts_list])
|
|
|
|
self.mean = np.mean(establish_time)
|
|
self.std = np.std(establish_time)
|
|
@@ -65,6 +70,7 @@ class TcpEstablishNSigmaDetector(Detector):
|
|
"""Detects kpi based on signal time series anomaly detection model"""
|
|
outlier_ratio_th = kpi.params.get('outlier_ratio_th')
|
|
look_back = kpi.params.get('obs_size')
|
|
+ min_rtt = kpi.params.get('min_rtt')
|
|
|
|
start, end = dt.last(minutes=look_back)
|
|
ts_list = self.data_loader.\
|
|
@@ -72,9 +78,9 @@ class TcpEstablishNSigmaDetector(Detector):
|
|
|
|
anomalies = []
|
|
for _ts in ts_list:
|
|
- outlier = [val for val in _ts.values if abs(val - self.mean) > 3 * self.std]
|
|
+ outlier = [val for val in _ts.values if val > self.mean + 5 * self.std]
|
|
ratio = divide(len(outlier), len(_ts.values))
|
|
- if outlier and ratio > outlier_ratio_th:
|
|
+ if outlier and ratio > outlier_ratio_th and np.average(outlier) >= min_rtt:
|
|
anomalies.append(
|
|
Anomaly(
|
|
machine_id=machine_id,
|
|
diff --git a/anteater/model/detector/th_base_detector.py b/anteater/model/detector/th_base_detector.py
|
|
new file mode 100644
|
|
index 0000000..bec9705
|
|
--- /dev/null
|
|
+++ b/anteater/model/detector/th_base_detector.py
|
|
@@ -0,0 +1,66 @@
|
|
+#!/usr/bin/python3
|
|
+# ******************************************************************************
|
|
+# Copyright (c) 2022 Huawei Technologies Co., Ltd.
|
|
+# gala-anteater is licensed under Mulan PSL v2.
|
|
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
+# You may obtain a copy of Mulan PSL v2 at:
|
|
+# http://license.coscl.org.cn/MulanPSL2
|
|
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
+# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
+# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
+# See the Mulan PSL v2 for more details.
|
|
+# ******************************************************************************/
|
|
+
|
|
+from typing import List
|
|
+
|
|
+from anteater.core.anomaly import Anomaly
|
|
+from anteater.core.kpi import KPI
|
|
+from anteater.model.detector.base import Detector
|
|
+from anteater.source.metric_loader import MetricLoader
|
|
+from anteater.utils.datetime import DateTimeManager as dt
|
|
+from anteater.utils.log import logger
|
|
+
|
|
+
|
|
+class ThBaseDetector(Detector):
|
|
+ """The threshold-based anomaly detector"""
|
|
+
|
|
+ def __init__(self, data_loader: MetricLoader):
|
|
+ """The threshold-based detector initializer"""
|
|
+ super().__init__(data_loader)
|
|
+
|
|
+ def detect_kpis(self, kpis: List[KPI]):
|
|
+ """Executes anomaly detection on kpis"""
|
|
+ start, end = dt.last(minutes=1)
|
|
+ machine_ids = self.get_unique_machine_id(start, end, kpis)
|
|
+ anomalies = []
|
|
+ for _id in machine_ids:
|
|
+ for kpi in kpis:
|
|
+ anomalies.extend(self.detect_signal_kpi(kpi, _id))
|
|
+
|
|
+ return anomalies
|
|
+
|
|
+ def detect_signal_kpi(self, kpi, machine_id: str) -> List[Anomaly]:
|
|
+ """Detects kpi based on threshold based anomaly detection model"""
|
|
+ look_back = kpi.params.get('look_back')
|
|
+ th = kpi.params.get('th')
|
|
+ start, end = dt.last(minutes=look_back)
|
|
+ ts_list = self.data_loader.\
|
|
+ get_metric(start, end, kpi.metric, label_name='machine_id', label_value=machine_id)
|
|
+
|
|
+ if not ts_list:
|
|
+ logger.warning(f'Key metric {kpi.metric} is null on the target machine {machine_id}!')
|
|
+ return []
|
|
+
|
|
+ anomalies = [
|
|
+ Anomaly(
|
|
+ machine_id=machine_id,
|
|
+ metric=_ts.metric,
|
|
+ labels=_ts.labels,
|
|
+ score=1,
|
|
+ entity_name=kpi.entity_name,
|
|
+ description=kpi.description)
|
|
+ for _ts in ts_list
|
|
+ if sum(_ts.values) >= th
|
|
+ ]
|
|
+
|
|
+ return anomalies
|
|
diff --git a/anteater/module/sys/nic_loss.py b/anteater/module/sys/nic_loss.py
|
|
new file mode 100644
|
|
index 0000000..d24e06f
|
|
--- /dev/null
|
|
+++ b/anteater/module/sys/nic_loss.py
|
|
@@ -0,0 +1,59 @@
|
|
+#!/usr/bin/python3
|
|
+# ******************************************************************************
|
|
+# Copyright (c) 2022 Huawei Technologies Co., Ltd.
|
|
+# gala-anteater is licensed under Mulan PSL v2.
|
|
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
+# You may obtain a copy of Mulan PSL v2 at:
|
|
+# http://license.coscl.org.cn/MulanPSL2
|
|
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
+# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
+# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
+# See the Mulan PSL v2 for more details.
|
|
+# ******************************************************************************/
|
|
+
|
|
+from typing import List, Dict
|
|
+
|
|
+from anteater.core.anomaly import Anomaly
|
|
+from anteater.model.detector.th_base_detector import ThBaseDetector
|
|
+from anteater.module.base import E2EDetector
|
|
+from anteater.source.anomaly_report import AnomalyReport
|
|
+from anteater.source.metric_loader import MetricLoader
|
|
+from anteater.template.sys_anomaly_template import SysAnomalyTemplate
|
|
+
|
|
+
|
|
+class NICLossDetector(E2EDetector):
|
|
+ """SYS nic loss e2e detector which detects the network loss.
|
|
+ """
|
|
+
|
|
+ config_file = 'sys_nic_loss.json'
|
|
+
|
|
+ def __init__(self, data_loader: MetricLoader, reporter: AnomalyReport):
|
|
+ """The system nic loss e2e detector initializer"""
|
|
+ super().__init__(reporter, SysAnomalyTemplate)
|
|
+
|
|
+ self.detectors = [
|
|
+ ThBaseDetector(data_loader)
|
|
+ ]
|
|
+
|
|
+ def parse_cause_metrics(self, anomaly: Anomaly) -> List[Dict]:
|
|
+ """Parses the cause metrics into the specific formats"""
|
|
+ cause_metrics = []
|
|
+ for _cs in anomaly.root_causes:
|
|
+ tmp = {
|
|
+ 'metric': _cs.ts.metric,
|
|
+ 'labels': _cs.ts.labels,
|
|
+ 'score': _cs.score,
|
|
+ }
|
|
+ if 'tcp' in _cs.ts.metric:
|
|
+ tmp['description'] = _cs.description.format(
|
|
+ _cs.ts.labels.get('tgid', ''),
|
|
+ _cs.ts.labels.get('client_port', ''),
|
|
+ _cs.ts.labels.get('server_ip', ''),
|
|
+ _cs.ts.labels.get('server_port', ''))
|
|
+ else:
|
|
+ tmp['description'] = _cs.description.format(
|
|
+ _cs.ts.labels.get('dev_name', ''))
|
|
+
|
|
+ cause_metrics.append(tmp)
|
|
+
|
|
+ return cause_metrics
|
|
diff --git a/anteater/module/sys/proc_io_latency.py b/anteater/module/sys/proc_io_latency.py
|
|
index 43e069f..a34c48d 100644
|
|
--- a/anteater/module/sys/proc_io_latency.py
|
|
+++ b/anteater/module/sys/proc_io_latency.py
|
|
@@ -38,12 +38,12 @@ class ProcIOLatencyDetector(E2EDetector):
|
|
def init_detectors(self, data_loader):
|
|
if self.job_config.model_config.enable:
|
|
detectors = [
|
|
- NSigmaDetector(data_loader, method='abs'),
|
|
+ NSigmaDetector(data_loader, method='max'),
|
|
OnlineVAEDetector(data_loader, self.job_config.model_config)
|
|
]
|
|
else:
|
|
detectors = [
|
|
- NSigmaDetector(data_loader, method='abs')
|
|
+ NSigmaDetector(data_loader, method='max')
|
|
]
|
|
|
|
return detectors
|
|
diff --git a/anteater/template/app_anomaly_template.py b/anteater/template/app_anomaly_template.py
|
|
index a509c96..4df4a35 100644
|
|
--- a/anteater/template/app_anomaly_template.py
|
|
+++ b/anteater/template/app_anomaly_template.py
|
|
@@ -46,6 +46,8 @@ class AppAnomalyTemplate(Template):
|
|
'SeverityNumber': 13,
|
|
'Body': f'{self.timestamp.strftime("%c")} WARN, APP may be impacting sli performance issues.',
|
|
'event_id': f'{timestamp}_{self.entity_id}',
|
|
+ "keywords": self.keywords,
|
|
+ 'cause_metrics': self.cause_metrics[0] if self.cause_metrics else {'description': 'Unknown'}
|
|
}
|
|
|
|
return result
|
|
diff --git a/anteater/template/sys_anomaly_template.py b/anteater/template/sys_anomaly_template.py
|
|
index 4ac6abb..aec6ea0 100644
|
|
--- a/anteater/template/sys_anomaly_template.py
|
|
+++ b/anteater/template/sys_anomaly_template.py
|
|
@@ -46,6 +46,7 @@ class SysAnomalyTemplate(Template):
|
|
'SeverityNumber': 13,
|
|
'Body': f'{self.timestamp.strftime("%c")} WARN, SYS may be impacting performance issues.',
|
|
'event_id': f'{timestamp}_{self.entity_id}',
|
|
+ "keywords": self.keywords
|
|
}
|
|
|
|
return result
|
|
diff --git a/config/module/sys_nic_loss.json b/config/module/sys_nic_loss.json
|
|
new file mode 100644
|
|
index 0000000..793f82f
|
|
--- /dev/null
|
|
+++ b/config/module/sys_nic_loss.json
|
|
@@ -0,0 +1,53 @@
|
|
+{
|
|
+ "name": "sys_tcp_transmission_latency",
|
|
+ "job_type": "sys",
|
|
+ "keywords": [
|
|
+ "net"
|
|
+ ],
|
|
+ "root_cause_number": 3,
|
|
+ "KPI": [
|
|
+ {
|
|
+ "metric": "gala_gopher_nic_tc_sent_drop",
|
|
+ "kpi_type": "",
|
|
+ "entity_name": "nic",
|
|
+ "enable": true,
|
|
+ "description": "TC发送丢包数异常",
|
|
+ "params": {
|
|
+ "look_back": 2,
|
|
+ "th": 1
|
|
+ }
|
|
+ }
|
|
+ ],
|
|
+ "Features": [
|
|
+ {
|
|
+ "metric": "gala_gopher_nic_tx_dropped",
|
|
+ "priority": 0,
|
|
+ "description": "网卡发送丢弃的数据包数异常。(dev_name = {})"
|
|
+ },
|
|
+ {
|
|
+ "metric": "gala_gopher_nic_rx_dropped",
|
|
+ "priority": 0,
|
|
+ "description": "网卡接收丢弃的数据包数异常。(dev_name = {})"
|
|
+ },
|
|
+ {
|
|
+ "metric": "gala_gopher_tcp_link_sk_drops",
|
|
+ "priority": 3,
|
|
+ "description": "Packets are lost in the host protocol stack due to unknown causes, and the TCP performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})"
|
|
+ },
|
|
+ {
|
|
+ "metric": "gala_gopher_tcp_link_retran_packets",
|
|
+ "priority": 1,
|
|
+ "description": "TCP retransmission is triggered due to network faults, resulting in TCP performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})"
|
|
+ },
|
|
+ {
|
|
+ "metric": "gala_gopher_tcp_link_lost_out",
|
|
+ "priority": 3,
|
|
+ "description": "The network may be congested, causing abnormal TCP packet loss and performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})"
|
|
+ },
|
|
+ {
|
|
+ "metric": "gala_gopher_tcp_link_notsent_bytes",
|
|
+ "priority": 4,
|
|
+ "description": "Due to network delay or peer application performance, too many packets to be sent are accumulated in the sliding window. As a result, TCP performance deteriorates. (PID = {}, client IP = {}, Server IP = {}, Port = {})"
|
|
+ }
|
|
+ ]
|
|
+}
|
|
\ No newline at end of file
|
|
diff --git a/config/module/sys_tcp_establish.json b/config/module/sys_tcp_establish.json
|
|
index 61ae72d..2c158c0 100644
|
|
--- a/config/module/sys_tcp_establish.json
|
|
+++ b/config/module/sys_tcp_establish.json
|
|
@@ -15,7 +15,8 @@
|
|
"params": {
|
|
"look_back": 30,
|
|
"outlier_ratio_th": 0.5,
|
|
- "obs_size": 3
|
|
+ "obs_size": 3,
|
|
+ "min_rtt": 500000
|
|
}
|
|
}
|
|
],
|
|
--
|
|
2.33.0
|
|
|