gala-anteater/Update-TCP-Establish-Model-Add-Nic-Loss-Detector.patch
Zhen Chen 1f46c219ad Model optimization and bugfix
- fix str2enum bug & data query refactor
- add systemd service for anteater
- remove 'sys-level' config param
- add chinese descriptions
- Update TCP Establish Model & Add Nic Loss Detector
- Add disk throughput detector

(cherry picked from commit f3c17e8c6a619a7803afd89b945ae3f36d17f9b0)
2023-01-17 22:40:46 +08:00

378 lines
15 KiB
Diff

From dd870b17120f3c7961c4613d454f1653fbd42214 Mon Sep 17 00:00:00 2001
From: lizhenxing11 <lizhenxing11@huawei.com>
Date: Tue, 27 Dec 2022 18:39:32 +0800
Subject: [PATCH] Update TCP Establish Model & Add Nic Loss Detector
change method 'abs' to 'max'
---
anteater/main.py | 2 +
anteater/model/algorithms/three_sigma.py | 4 +-
anteater/model/detector/n_sigma_detector.py | 4 +-
.../tcp_establish_n_sigma_detector.py | 12 +++-
anteater/model/detector/th_base_detector.py | 66 +++++++++++++++++++
anteater/module/sys/nic_loss.py | 59 +++++++++++++++++
anteater/module/sys/proc_io_latency.py | 4 +-
anteater/template/app_anomaly_template.py | 2 +
anteater/template/sys_anomaly_template.py | 1 +
config/module/sys_nic_loss.json | 53 +++++++++++++++
config/module/sys_tcp_establish.json | 3 +-
11 files changed, 200 insertions(+), 10 deletions(-)
create mode 100644 anteater/model/detector/th_base_detector.py
create mode 100644 anteater/module/sys/nic_loss.py
create mode 100644 config/module/sys_nic_loss.json
diff --git a/anteater/main.py b/anteater/main.py
index ba7be70..4de72f9 100644
--- a/anteater/main.py
+++ b/anteater/main.py
@@ -22,6 +22,7 @@ from anteater.anomaly_detection import AnomalyDetection
from anteater.config import AnteaterConf
from anteater.module.app.app_sli_detector import APPSliDetector
from anteater.module.sys.disk_throughput import DiskThroughputDetector
+from anteater.module.sys.nic_loss import NICLossDetector
from anteater.module.sys.proc_io_latency import ProcIOLatencyDetector
from anteater.module.sys.sys_io_latency import SysIOLatencyDetector
from anteater.module.sys.tcp_establish import SysTcpEstablishDetector
@@ -59,6 +60,7 @@ def main():
SysIOLatencyDetector(loader, report),
ProcIOLatencyDetector(loader, report),
DiskThroughputDetector(loader, report),
+ NICLossDetector(loader, report),
]
else:
detectors = [
diff --git a/anteater/model/algorithms/three_sigma.py b/anteater/model/algorithms/three_sigma.py
index 49b9952..0865417 100644
--- a/anteater/model/algorithms/three_sigma.py
+++ b/anteater/model/algorithms/three_sigma.py
@@ -14,8 +14,8 @@
import numpy as np
-def three_sigma(values, obs_size, n=3, method="abs"):
- """The '3-sigma rule' outlier detect function"""
+def n_sigma(values, obs_size, n=3, method="abs"):
+ """The 'N-sigma rule' outlier detect function"""
if obs_size <= 0:
raise ValueError("The obs_size should great than zero!")
if len(values) <= obs_size:
diff --git a/anteater/model/detector/n_sigma_detector.py b/anteater/model/detector/n_sigma_detector.py
index f632326..3a2ab01 100644
--- a/anteater/model/detector/n_sigma_detector.py
+++ b/anteater/model/detector/n_sigma_detector.py
@@ -19,7 +19,7 @@ from anteater.core.kpi import KPI
from anteater.core.time_series import TimeSeriesScore
from anteater.model.detector.base import Detector
from anteater.model.algorithms.smooth import smoothing
-from anteater.model.algorithms.three_sigma import three_sigma
+from anteater.model.algorithms.three_sigma import n_sigma
from anteater.source.metric_loader import MetricLoader
from anteater.utils.common import divide
from anteater.utils.datetime import DateTimeManager as dt
@@ -91,7 +91,7 @@ class NSigmaDetector(Detector):
ratio = 0
else:
smoothed_val = smoothing(_ts.values, **smooth_params)
- outlier, mean, std = three_sigma(
+ outlier, mean, std = n_sigma(
smoothed_val, obs_size=obs_size, n=n, method=self.method)
ratio = divide(len(outlier), obs_size)
diff --git a/anteater/model/detector/tcp_establish_n_sigma_detector.py b/anteater/model/detector/tcp_establish_n_sigma_detector.py
index 8dcf9ae..82d7837 100644
--- a/anteater/model/detector/tcp_establish_n_sigma_detector.py
+++ b/anteater/model/detector/tcp_establish_n_sigma_detector.py
@@ -42,8 +42,13 @@ class TcpEstablishNSigmaDetector(Detector):
start, _ = dt.last(minutes=look_back)
mid, _ = dt.last(minutes=3)
+ filtered_ts_list = []
ts_list = self.data_loader.get_metric(start, mid, kpi.metric)
- establish_time = reduce(lambda x, y: x + y, [list(set(_ts.values)) for _ts in ts_list])
+ for _ts in ts_list:
+ if sum(_ts.values) > 0:
+ filtered_ts_list.append(_ts)
+
+ establish_time = reduce(lambda x, y: x + y, [list(set(_ts.values)) for _ts in filtered_ts_list])
self.mean = np.mean(establish_time)
self.std = np.std(establish_time)
@@ -65,6 +70,7 @@ class TcpEstablishNSigmaDetector(Detector):
"""Detects kpi based on signal time series anomaly detection model"""
outlier_ratio_th = kpi.params.get('outlier_ratio_th')
look_back = kpi.params.get('obs_size')
+ min_rtt = kpi.params.get('min_rtt')
start, end = dt.last(minutes=look_back)
ts_list = self.data_loader.\
@@ -72,9 +78,9 @@ class TcpEstablishNSigmaDetector(Detector):
anomalies = []
for _ts in ts_list:
- outlier = [val for val in _ts.values if abs(val - self.mean) > 3 * self.std]
+ outlier = [val for val in _ts.values if val > self.mean + 5 * self.std]
ratio = divide(len(outlier), len(_ts.values))
- if outlier and ratio > outlier_ratio_th:
+ if outlier and ratio > outlier_ratio_th and np.average(outlier) >= min_rtt:
anomalies.append(
Anomaly(
machine_id=machine_id,
diff --git a/anteater/model/detector/th_base_detector.py b/anteater/model/detector/th_base_detector.py
new file mode 100644
index 0000000..bec9705
--- /dev/null
+++ b/anteater/model/detector/th_base_detector.py
@@ -0,0 +1,66 @@
+#!/usr/bin/python3
+# ******************************************************************************
+# Copyright (c) 2022 Huawei Technologies Co., Ltd.
+# gala-anteater is licensed under Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
+# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
+# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+# See the Mulan PSL v2 for more details.
+# ******************************************************************************/
+
+from typing import List
+
+from anteater.core.anomaly import Anomaly
+from anteater.core.kpi import KPI
+from anteater.model.detector.base import Detector
+from anteater.source.metric_loader import MetricLoader
+from anteater.utils.datetime import DateTimeManager as dt
+from anteater.utils.log import logger
+
+
+class ThBaseDetector(Detector):
+ """The threshold-based anomaly detector"""
+
+ def __init__(self, data_loader: MetricLoader):
+ """The threshold-based detector initializer"""
+ super().__init__(data_loader)
+
+ def detect_kpis(self, kpis: List[KPI]):
+ """Executes anomaly detection on kpis"""
+ start, end = dt.last(minutes=1)
+ machine_ids = self.get_unique_machine_id(start, end, kpis)
+ anomalies = []
+ for _id in machine_ids:
+ for kpi in kpis:
+ anomalies.extend(self.detect_signal_kpi(kpi, _id))
+
+ return anomalies
+
+ def detect_signal_kpi(self, kpi, machine_id: str) -> List[Anomaly]:
+ """Detects kpi based on the threshold-based anomaly detection model"""
+ look_back = kpi.params.get('look_back')
+ th = kpi.params.get('th')
+ start, end = dt.last(minutes=look_back)
+ ts_list = self.data_loader.\
+ get_metric(start, end, kpi.metric, label_name='machine_id', label_value=machine_id)
+
+ if not ts_list:
+ logger.warning(f'Key metric {kpi.metric} is null on the target machine {machine_id}!')
+ return []
+
+ anomalies = [
+ Anomaly(
+ machine_id=machine_id,
+ metric=_ts.metric,
+ labels=_ts.labels,
+ score=1,
+ entity_name=kpi.entity_name,
+ description=kpi.description)
+ for _ts in ts_list
+ if sum(_ts.values) >= th
+ ]
+
+ return anomalies
diff --git a/anteater/module/sys/nic_loss.py b/anteater/module/sys/nic_loss.py
new file mode 100644
index 0000000..d24e06f
--- /dev/null
+++ b/anteater/module/sys/nic_loss.py
@@ -0,0 +1,59 @@
+#!/usr/bin/python3
+# ******************************************************************************
+# Copyright (c) 2022 Huawei Technologies Co., Ltd.
+# gala-anteater is licensed under Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
+# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
+# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+# See the Mulan PSL v2 for more details.
+# ******************************************************************************/
+
+from typing import List, Dict
+
+from anteater.core.anomaly import Anomaly
+from anteater.model.detector.th_base_detector import ThBaseDetector
+from anteater.module.base import E2EDetector
+from anteater.source.anomaly_report import AnomalyReport
+from anteater.source.metric_loader import MetricLoader
+from anteater.template.sys_anomaly_template import SysAnomalyTemplate
+
+
+class NICLossDetector(E2EDetector):
+ """SYS nic loss e2e detector which detects the network loss.
+ """
+
+ config_file = 'sys_nic_loss.json'
+
+ def __init__(self, data_loader: MetricLoader, reporter: AnomalyReport):
+ """The system nic loss e2e detector initializer"""
+ super().__init__(reporter, SysAnomalyTemplate)
+
+ self.detectors = [
+ ThBaseDetector(data_loader)
+ ]
+
+ def parse_cause_metrics(self, anomaly: Anomaly) -> List[Dict]:
+ """Parses the cause metrics into the specific formats"""
+ cause_metrics = []
+ for _cs in anomaly.root_causes:
+ tmp = {
+ 'metric': _cs.ts.metric,
+ 'labels': _cs.ts.labels,
+ 'score': _cs.score,
+ }
+ if 'tcp' in _cs.ts.metric:
+ tmp['description'] = _cs.description.format(
+ _cs.ts.labels.get('tgid', ''),
+ _cs.ts.labels.get('client_port', ''),
+ _cs.ts.labels.get('server_ip', ''),
+ _cs.ts.labels.get('server_port', ''))
+ else:
+ tmp['description'] = _cs.description.format(
+ _cs.ts.labels.get('dev_name', ''))
+
+ cause_metrics.append(tmp)
+
+ return cause_metrics
diff --git a/anteater/module/sys/proc_io_latency.py b/anteater/module/sys/proc_io_latency.py
index 43e069f..a34c48d 100644
--- a/anteater/module/sys/proc_io_latency.py
+++ b/anteater/module/sys/proc_io_latency.py
@@ -38,12 +38,12 @@ class ProcIOLatencyDetector(E2EDetector):
def init_detectors(self, data_loader):
if self.job_config.model_config.enable:
detectors = [
- NSigmaDetector(data_loader, method='abs'),
+ NSigmaDetector(data_loader, method='max'),
OnlineVAEDetector(data_loader, self.job_config.model_config)
]
else:
detectors = [
- NSigmaDetector(data_loader, method='abs')
+ NSigmaDetector(data_loader, method='max')
]
return detectors
diff --git a/anteater/template/app_anomaly_template.py b/anteater/template/app_anomaly_template.py
index a509c96..4df4a35 100644
--- a/anteater/template/app_anomaly_template.py
+++ b/anteater/template/app_anomaly_template.py
@@ -46,6 +46,8 @@ class AppAnomalyTemplate(Template):
'SeverityNumber': 13,
'Body': f'{self.timestamp.strftime("%c")} WARN, APP may be impacting sli performance issues.',
'event_id': f'{timestamp}_{self.entity_id}',
+ "keywords": self.keywords,
+ 'cause_metrics': self.cause_metrics[0] if self.cause_metrics else {'description': 'Unknown'}
}
return result
diff --git a/anteater/template/sys_anomaly_template.py b/anteater/template/sys_anomaly_template.py
index 4ac6abb..aec6ea0 100644
--- a/anteater/template/sys_anomaly_template.py
+++ b/anteater/template/sys_anomaly_template.py
@@ -46,6 +46,7 @@ class SysAnomalyTemplate(Template):
'SeverityNumber': 13,
'Body': f'{self.timestamp.strftime("%c")} WARN, SYS may be impacting performance issues.',
'event_id': f'{timestamp}_{self.entity_id}',
+ "keywords": self.keywords
}
return result
diff --git a/config/module/sys_nic_loss.json b/config/module/sys_nic_loss.json
new file mode 100644
index 0000000..793f82f
--- /dev/null
+++ b/config/module/sys_nic_loss.json
@@ -0,0 +1,53 @@
+{
+ "name": "sys_tcp_transmission_latency",
+ "job_type": "sys",
+ "keywords": [
+ "net"
+ ],
+ "root_cause_number": 3,
+ "KPI": [
+ {
+ "metric": "gala_gopher_nic_tc_sent_drop",
+ "kpi_type": "",
+ "entity_name": "nic",
+ "enable": true,
+ "description": "TC发送丢包数异常",
+ "params": {
+ "look_back": 2,
+ "th": 1
+ }
+ }
+ ],
+ "Features": [
+ {
+ "metric": "gala_gopher_nic_tx_dropped",
+ "priority": 0,
+ "description": "网卡发送丢弃的数据包数异常。(dev_name = {})"
+ },
+ {
+ "metric": "gala_gopher_nic_rx_dropped",
+ "priority": 0,
+ "description": "网卡接收丢弃的数据包数异常。(dev_name = {})"
+ },
+ {
+ "metric": "gala_gopher_tcp_link_sk_drops",
+ "priority": 3,
+ "description": "Packets are lost in the host protocol stack due to unknown causes, and the TCP performance deteriorates. (PID ={}, client IP = {}, Server IP = {}, Port = {})"
+ },
+ {
+ "metric": "gala_gopher_tcp_link_retran_packets",
+ "priority": 1,
+ "description": "TCP retransmission is triggered due to network faults, resulting in TCP performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})"
+ },
+ {
+ "metric": "gala_gopher_tcp_link_lost_out",
+ "priority": 3,
+ "description": "The network may be congested, causing abnormal TCP packet loss and performance deterioration. (PID ={}, client IP = {}, Server IP = {}, Port = {})"
+ },
+ {
+ "metric": "gala_gopher_tcp_link_notsent_bytes",
+ "priority": 4,
+ "description": "Due to network delay or peer application performance, too many packets to be sent are accumulated in the sliding window. As a result, TCP performance deteriorates. (PID = {}, client IP = {}, Server IP = {}, Port = {})"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/config/module/sys_tcp_establish.json b/config/module/sys_tcp_establish.json
index 61ae72d..2c158c0 100644
--- a/config/module/sys_tcp_establish.json
+++ b/config/module/sys_tcp_establish.json
@@ -15,7 +15,8 @@
"params": {
"look_back": 30,
"outlier_ratio_th": 0.5,
- "obs_size": 3
+ "obs_size": 3,
+ "min_rtt": 500000
}
}
],
--
2.33.0