- fix str2enum bug & data query refactor
- add systemd service for anteater
- remove 'sys-level' config param
- add chinese descriptions
- Update TCP Establish Model & Add Nic Loss Detector
- Add disk throughput detector

(cherry picked from commit f3c17e8c6a619a7803afd89b945ae3f36d17f9b0)
From 27bb7cdd80f76bfc7ebb0f3041544740aa2fa91b Mon Sep 17 00:00:00 2001
From: lizhenxing11 <lizhenxing11@huawei.com>
Date: Tue, 10 Jan 2023 15:31:44 +0800
Subject: [PATCH] fix str2enum bug & data query refactor

---
 anteater/core/anomaly.py                        | 10 ++++
 anteater/core/kpi.py                            | 14 ++++++
 anteater/model/algorithms/slope.py              | 11 +++--
 anteater/model/detector/base.py                 | 20 ++++----
 anteater/model/detector/n_sigma_detector.py     | 15 +++---
 .../model/detector/online_vae_detector.py       |  3 +-
 .../tcp_establish_n_sigma_detector.py           |  3 +-
 .../tcp_trans_latency_n_sigma_detector.py       | 48 +++++++++++++++++--
 anteater/model/detector/th_base_detector.py     |  3 +-
 anteater/module/app/app_sli_detector.py         |  4 +-
 anteater/module/sys/disk_throughput.py          |  4 +-
 anteater/module/sys/proc_io_latency.py          |  4 +-
 anteater/module/sys/sys_io_latency.py           |  4 +-
 .../module/sys/tcp_transmission_latency.py      |  4 +-
 .../module/sys/tcp_transmission_throughput.py   |  4 +-
 anteater/source/metric_loader.py                | 41 +++++++++++++++-
 anteater/utils/data_load.py                     |  4 +-
 config/module/app_sli_rtt.json                  |  6 ++-
 config/module/disk_throughput.json              |  6 ++-
 config/module/proc_io_latency.json              | 15 ++++--
 config/module/sys_io_latency.json               |  2 +-
 config/module/sys_tcp_establish.json            |  2 +-
 .../module/sys_tcp_transmission_latency.json    |  4 +-
 23 files changed, 172 insertions(+), 59 deletions(-)
diff --git a/anteater/core/anomaly.py b/anteater/core/anomaly.py
index 45c4fc3..fdee3d1 100644
--- a/anteater/core/anomaly.py
+++ b/anteater/core/anomaly.py
@@ -52,3 +52,13 @@ class AnomalyTrend(Enum):
     DEFAULT = 0
     RISE = 1
     FALL = 2
+
+    @staticmethod
+    def from_str(label: str):
+        """Trans str to Enum type"""
+        if label.upper() == 'RISE':
+            return AnomalyTrend.RISE
+        elif label.upper() == 'FALL':
+            return AnomalyTrend.FALL
+        else:
+            return AnomalyTrend.DEFAULT
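Note: the new from_str helper is case-insensitive and deliberately forgiving, so any label other than 'RISE' or 'FALL' maps to DEFAULT instead of raising. A minimal usage sketch of the enum above:

    from anteater.core.anomaly import AnomalyTrend

    assert AnomalyTrend.from_str('rise') is AnomalyTrend.RISE
    assert AnomalyTrend.from_str('FALL') is AnomalyTrend.FALL
    # unrecognized labels fall back to DEFAULT rather than raising
    assert AnomalyTrend.from_str('sideways') is AnomalyTrend.DEFAULT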
diff --git a/anteater/core/kpi.py b/anteater/core/kpi.py
index f83b666..70cc9ee 100644
--- a/anteater/core/kpi.py
+++ b/anteater/core/kpi.py
@@ -27,6 +27,13 @@ class KPI:
     params: dict = field(default=dict)
     atrend: AnomalyTrend = AnomalyTrend.DEFAULT
 
+    @classmethod
+    def from_dict(cls, **data):
+        if 'atrend' in data:
+            data['atrend'] = AnomalyTrend.from_str(data.get('atrend'))
+
+        return cls(**data)
+
 
 @dataclass
 class Feature:
@@ -35,6 +42,13 @@ class Feature:
     priority: int = 0
     atrend: AnomalyTrend = AnomalyTrend.DEFAULT
 
+    @classmethod
+    def from_dict(cls, **data):
+        if 'atrend' in data:
+            data['atrend'] = AnomalyTrend.from_str(data.get('atrend'))
+
+        return cls(**data)
+
 
 @dataclass
 class ModelConfig:
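Note: KPI.from_dict (and the identical Feature.from_dict) lets the JSON configs keep 'atrend' as a plain string; the classmethod converts it to the enum before the dataclass is built. A sketch, assuming the omitted fields have defaults (the field set here is illustrative, not the full KPI signature):

    from anteater.core.anomaly import AnomalyTrend
    from anteater.core.kpi import KPI

    conf = {'metric': 'gala_gopher_sli_rtt_nsec', 'atrend': 'rise'}  # hypothetical entry
    kpi = KPI.from_dict(**conf)
    assert kpi.atrend is AnomalyTrend.RISE  # converted before __init__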
diff --git a/anteater/model/algorithms/slope.py b/anteater/model/algorithms/slope.py
index d324d58..e546183 100644
--- a/anteater/model/algorithms/slope.py
+++ b/anteater/model/algorithms/slope.py
@@ -17,6 +17,7 @@ import numpy as np
 
 from anteater.core.anomaly import AnomalyTrend
 from anteater.model.algorithms.smooth import conv_smooth
+from anteater.utils.common import divide
 
 
 def slope(y, win_len):
@@ -36,13 +37,15 @@ def smooth_slope(time_series, windows_length):
 
 def trend(y, win_len=None):
     """Gets the trend for the y"""
+    y = conv_smooth(y, box_pts=7)
+
     if not win_len:
         win_len = len(y) // 2
 
-    if np.mean(y[:win_len]) < np.mean(y[-win_len:]):
+    if divide(np.mean(y[:win_len]), np.mean(y[-win_len:])) < 0.9:
         return 1
 
-    elif np.mean(y[:win_len]) > np.mean(y[-win_len:]):
+    elif divide(np.mean(y[:win_len]), np.mean(y[-win_len:])) > 1.1:
         return -1
 
     else:
@@ -51,10 +54,10 @@ def trend(y, win_len=None):
 
 def check_trend(values: List[float], atrend: AnomalyTrend):
     """Checks the values with an 'atrend' trend"""
-    if atrend == AnomalyTrend.RISE and trend(values) < 0:
+    if atrend == AnomalyTrend.RISE and trend(values) != 1:
        return False
 
-    if atrend == AnomalyTrend.FALL and trend(values) > 0:
+    if atrend == AnomalyTrend.FALL and trend(values) != -1:
        return False
 
    return True
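Note: trend() now smooths the series first and compares the head/tail window means as a ratio, so only a change beyond a roughly +/-10% dead band counts as a rise (ratio below 0.9) or fall (ratio above 1.1); check_trend then requires an exact match with the expected direction. A sketch of the same logic in plain numpy, assuming divide() is a zero-safe division helper:

    import numpy as np

    def trend_sketch(y, win_len=None):
        # y is assumed pre-smoothed, e.g. by conv_smooth(y, box_pts=7)
        if not win_len:
            win_len = len(y) // 2
        ratio = np.mean(y[:win_len]) / np.mean(y[-win_len:])  # divide() also guards x/0
        if ratio < 0.9:
            return 1     # rising: tail mean exceeds head mean by >10%
        if ratio > 1.1:
            return -1    # falling
        return 0         # flat within the dead band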
diff --git a/anteater/model/detector/base.py b/anteater/model/detector/base.py
index 2b2dafe..a23b6d9 100644
--- a/anteater/model/detector/base.py
+++ b/anteater/model/detector/base.py
@@ -11,6 +11,7 @@
 # See the Mulan PSL v2 for more details.
 # ******************************************************************************/
 
+import logging
 import math
 from abc import abstractmethod
 from typing import List
@@ -39,12 +40,6 @@ class Detector:
         """Executes anomaly detection on kpis"""
         pass
 
-    def get_unique_machine_id(self, start, end, kpis: List[KPI]) -> List[str]:
-        """Gets unique machine ids during past minutes"""
-        metrics = [_kpi.metric for _kpi in kpis]
-        machine_ids = self.data_loader.get_unique_machines(start, end, metrics)
-        return machine_ids
-
     def execute(self, job_config: JobConfig) -> List[Anomaly]:
         """The main function of the detector"""
         kpis = job_config.kpis
@@ -56,6 +51,12 @@
 
         return self._execute(kpis, features, top_n=n)
 
+    def get_unique_machine_id(self, start, end, kpis: List[KPI]) -> List[str]:
+        """Gets unique machine ids during past minutes"""
+        metrics = [_kpi.metric for _kpi in kpis]
+        machine_ids = self.data_loader.get_unique_machines(start, end, metrics)
+        return machine_ids
+
     def find_root_causes(self, anomalies: List[Anomaly], features: List[Feature], top_n=3)\
             -> List[Anomaly]:
         """Finds root causes for each anomaly events"""
@@ -82,6 +83,7 @@ class Detector:
             tmp_ts_scores = self.cal_anomaly_score(f.metric, f.description, machine_id=machine_id)
             for _ts_score in tmp_ts_scores:
                 if not check_trend(_ts_score.ts.values, f.atrend):
+                    logging.info(f"Trends Filtered: {f.metric}")
                     _ts_score.score = 0
                 if same_intersection_key_value(_ts_score.ts.labels, filters):
                     ts_scores.append(_ts_score)
@@ -101,6 +103,7 @@
         for _ts_s in ts_scores:
             if same_intersection_key_value(_ts_s.ts.labels, anomaly.labels):
                 if not check_trend(_ts_s.ts.values, kpi_atrends[anomaly.metric]):
+                    logging.info(f"Trends Filtered: {anomaly.metric}")
                     anomaly.score = 0
                 else:
                     anomaly.score = _ts_s.score
@@ -115,12 +118,11 @@
                           machine_id: str)\
             -> List[TimeSeriesScore]:
         """Calculates metric anomaly scores based on sr model"""
-        start, end = dt.last(minutes=6)
+        start, end = dt.last(minutes=10)
         point_count = self.data_loader.expected_point_length(start, end)
         model = SpectralResidual(12, 24, 50)
         ts_scores = []
-        ts_list = self.data_loader.\
-            get_metric(start, end, metric, label_name='machine_id', label_value=machine_id)
+        ts_list = self.data_loader.get_metric(start, end, metric, machine_id=machine_id)
         for _ts in ts_list:
             if sum(_ts.values) == 0 or \
                     len(_ts.values) < point_count * 0.9 or\
diff --git a/anteater/model/detector/n_sigma_detector.py b/anteater/model/detector/n_sigma_detector.py
index 3a2ab01..dbf83c6 100644
--- a/anteater/model/detector/n_sigma_detector.py
+++ b/anteater/model/detector/n_sigma_detector.py
@@ -29,10 +29,9 @@ from anteater.utils.log import logger
 class NSigmaDetector(Detector):
     """The three sigma anomaly detector"""
 
-    def __init__(self, data_loader: MetricLoader, method: str):
+    def __init__(self, data_loader: MetricLoader):
         """The detector base class initializer"""
         super().__init__(data_loader)
-        self.method = method
 
     def detect_kpis(self, kpis: List[KPI]):
         """Executes anomaly detection on kpis"""
@@ -48,7 +47,7 @@ class NSigmaDetector(Detector):
     def detect_signal_kpi(self, kpi, machine_id: str) -> List[Anomaly]:
         """Detects kpi based on signal time series anomaly detection model"""
         outlier_ratio_th = kpi.params['outlier_ratio_th']
-        ts_scores = self.calculate_metric_three_sigma_score(
+        ts_scores = self.calculate_n_sigma_score(
             kpi.metric, kpi.description, machine_id, **kpi.params)
         if not ts_scores:
             logger.warning(f'Key metric {kpi.metric} is null on the target machine {machine_id}!')
@@ -68,17 +67,17 @@
 
         return anomalies
 
-    def calculate_metric_three_sigma_score(self, metric, description, machine_id: str, **kwargs)\
+    def calculate_n_sigma_score(self, metric, description, machine_id: str, **kwargs)\
             -> List[TimeSeriesScore]:
         """Calculate kpi anomaly scores based on three sigma scores"""
+        method = kwargs.get('method', 'abs')
         look_back = kwargs.get('look_back')
         smooth_params = kwargs.get('smooth_params')
         obs_size = kwargs.get('obs_size')
         n = kwargs.get('n', 3)
         start, end = dt.last(minutes=look_back)
         point_count = self.data_loader.expected_point_length(start, end)
-        ts_list = self.data_loader.\
-            get_metric(start, end, metric, label_name='machine_id', label_value=machine_id)
+        ts_list = self.data_loader.get_metric(start, end, metric, machine_id=machine_id)
         ts_scores = []
         for _ts in ts_list:
             dedup_values = [k for k, g in groupby(_ts.values)]
@@ -87,12 +86,12 @@ class NSigmaDetector(Detector):
                     len(_ts.values) > point_count * 1.5 or \
                     all(x == _ts.values[0] for x in _ts.values):
                 ratio = 0
-            elif len(dedup_values) < point_count * 0.3:
+            elif len(dedup_values) < point_count * 0.6:
                 ratio = 0
             else:
                 smoothed_val = smoothing(_ts.values, **smooth_params)
                 outlier, mean, std = n_sigma(
-                    smoothed_val, obs_size=obs_size, n=n, method=self.method)
+                    smoothed_val, obs_size=obs_size, n=n, method=method)
                 ratio = divide(len(outlier), obs_size)
 
             ts_scores.append(TimeSeriesScore(ts=_ts, score=ratio, description=description))
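Note: calculate_n_sigma_score now reads the n-sigma method from kpi.params (defaulting to 'abs') instead of a constructor argument, and the duplicate-density guard is tightened from 0.3 to 0.6: after collapsing consecutive repeats with groupby, at least 60% of the expected point count must remain, or the score is forced to 0. A small sketch of that guard (point_count value is illustrative):

    from itertools import groupby

    values = [1, 1, 1, 2, 2, 3]
    dedup_values = [k for k, g in groupby(values)]    # -> [1, 2, 3]
    point_count = 10                                  # expected points in the window
    too_flat = len(dedup_values) < point_count * 0.6  # True here, so ratio = 0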
diff --git a/anteater/model/detector/online_vae_detector.py b/anteater/model/detector/online_vae_detector.py
index 63a7b09..0f91576 100644
--- a/anteater/model/detector/online_vae_detector.py
+++ b/anteater/model/detector/online_vae_detector.py
@@ -110,8 +110,7 @@ class OnlineVAEDetector(Detector):
         metric_dfs = []
         for metric in metrics:
             _ts_list = self.data_loader.\
-                get_metric(start, end, metric, label_name="machine_id",
-                           label_value=machine_id, operator_name='avg')
+                get_metric(start, end, metric, operator='avg', keys="machine_id", machine_id=machine_id)
 
             if len(_ts_list) > 1:
                 raise ValueError(f'Got multiple time_series based on machine id: {len(_ts_list)}')
diff --git a/anteater/model/detector/tcp_establish_n_sigma_detector.py b/anteater/model/detector/tcp_establish_n_sigma_detector.py
index 82d7837..3720069 100644
--- a/anteater/model/detector/tcp_establish_n_sigma_detector.py
+++ b/anteater/model/detector/tcp_establish_n_sigma_detector.py
@@ -73,8 +73,7 @@ class TcpEstablishNSigmaDetector(Detector):
         min_rtt = kpi.params.get('min_rtt')
 
         start, end = dt.last(minutes=look_back)
-        ts_list = self.data_loader.\
-            get_metric(start, end, kpi.metric, label_name='machine_id', label_value=machine_id)
+        ts_list = self.data_loader.get_metric(start, end, kpi.metric, machine_id=machine_id)
 
         anomalies = []
         for _ts in ts_list:
diff --git a/anteater/model/detector/tcp_trans_latency_n_sigma_detector.py b/anteater/model/detector/tcp_trans_latency_n_sigma_detector.py
index 1eeb95f..6d41775 100644
--- a/anteater/model/detector/tcp_trans_latency_n_sigma_detector.py
+++ b/anteater/model/detector/tcp_trans_latency_n_sigma_detector.py
@@ -11,20 +11,61 @@
 # See the Mulan PSL v2 for more details.
 # ******************************************************************************/
 
+from itertools import groupby
 from typing import List
 
+import numpy as np
+
 from anteater.core.time_series import TimeSeriesScore
+from anteater.model.algorithms.smooth import smoothing
+from anteater.model.algorithms.three_sigma import n_sigma
 from anteater.model.detector.n_sigma_detector import NSigmaDetector
 from anteater.source.metric_loader import MetricLoader
+from anteater.utils.common import divide
 from anteater.utils.datetime import DateTimeManager as dt
 
 
 class TcpTransLatencyNSigmaDetector(NSigmaDetector):
     """The three sigma anomaly detector"""
 
-    def __init__(self, data_loader: MetricLoader, method: str):
+    def __init__(self, data_loader: MetricLoader):
         """The detector base class initializer"""
-        super().__init__(data_loader, method)
+        super().__init__(data_loader)
+
+    def calculate_n_sigma_score(self, metric, description, machine_id: str, **kwargs)\
+            -> List[TimeSeriesScore]:
+        """Calculates anomaly scores based on n sigma scores"""
+        method = kwargs.get('method', 'abs')
+        look_back = kwargs.get('look_back')
+        smooth_params = kwargs.get('smooth_params')
+        obs_size = kwargs.get('obs_size')
+        min_srtt = kwargs.get("min_srtt")
+        n = kwargs.get('n', 3)
+        start, end = dt.last(minutes=look_back)
+        point_count = self.data_loader.expected_point_length(start, end)
+        ts_list = self.data_loader.get_metric(start, end, metric, machine_id=machine_id)
+        ts_scores = []
+        for _ts in ts_list:
+            dedup_values = [k for k, g in groupby(_ts.values)]
+            if sum(_ts.values) == 0 or \
+                    len(_ts.values) < point_count * 0.6 or \
+                    len(_ts.values) > point_count * 1.5 or \
+                    all(x == _ts.values[0] for x in _ts.values):
+                ratio = 0
+            elif len(dedup_values) < point_count * 0.6:
+                ratio = 0
+            else:
+                smoothed_val = smoothing(_ts.values, **smooth_params)
+                outlier, mean, std = n_sigma(
+                    smoothed_val, obs_size=obs_size, n=n, method=method)
+                if outlier and np.average(outlier) <= min_srtt:
+                    ratio = 0
+                else:
+                    ratio = divide(len(outlier), obs_size)
+
+            ts_scores.append(TimeSeriesScore(ts=_ts, score=ratio, description=description))
+
+        return ts_scores
 
     def cal_anomaly_score(self, metric, description, machine_id: str) \
             -> List[TimeSeriesScore]:
@@ -32,8 +73,7 @@ class TcpTransLatencyNSigmaDetector(NSigmaDetector):
         start, end = dt.last(minutes=2)
         point_count = self.data_loader.expected_point_length(start, end)
         ts_scores = []
-        ts_list = self.data_loader. \
-            get_metric(start, end, metric, label_name='machine_id', label_value=machine_id)
+        ts_list = self.data_loader.get_metric(start, end, metric, machine_id=machine_id)
         for _ts in ts_list:
             if sum(_ts.values) == 0 or \
                     len(_ts.values) < point_count * 0.5 or \
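Note: the overridden calculate_n_sigma_score adds one latency-specific rule on top of the base detector: when outliers are found but their average is at or below min_srtt, the deviation is treated as noise and the score is zeroed. A sketch of that guard in isolation (min_srtt value taken from sys_tcp_transmission_latency.json below, in microseconds):

    import numpy as np

    min_srtt = 20000                       # 20 ms noise floor
    outlier = [15000.0, 18000.0, 21000.0]  # hypothetical outlier srtt samples
    if outlier and np.average(outlier) <= min_srtt:
        ratio = 0                          # outliers exist, but mean srtt is too small to report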
diff --git a/anteater/model/detector/th_base_detector.py b/anteater/model/detector/th_base_detector.py
index bec9705..0af4f22 100644
--- a/anteater/model/detector/th_base_detector.py
+++ b/anteater/model/detector/th_base_detector.py
@@ -44,8 +44,7 @@ class ThBaseDetector(Detector):
         look_back = kpi.params.get('look_back')
         th = kpi.params.get('th')
         start, end = dt.last(minutes=look_back)
-        ts_list = self.data_loader.\
-            get_metric(start, end, kpi.metric, label_name='machine_id', label_value=machine_id)
+        ts_list = self.data_loader.get_metric(start, end, kpi.metric, machine_id=machine_id)
 
         if not ts_list:
             logger.warning(f'Key metric {kpi.metric} is null on the target machine {machine_id}!')
diff --git a/anteater/module/app/app_sli_detector.py b/anteater/module/app/app_sli_detector.py
index 102ed11..e506332 100644
--- a/anteater/module/app/app_sli_detector.py
+++ b/anteater/module/app/app_sli_detector.py
@@ -44,12 +44,12 @@ class APPSliDetector(E2EDetector):
     def init_detectors(self, data_loader):
         if self.job_config.model_config.enable:
             detectors = [
-                NSigmaDetector(data_loader, method='min'),
+                NSigmaDetector(data_loader),
                 OnlineVAEDetector(data_loader, self.job_config.model_config)
             ]
         else:
             detectors = [
-                NSigmaDetector(data_loader, method='min')
+                NSigmaDetector(data_loader)
             ]
 
         return detectors
diff --git a/anteater/module/sys/disk_throughput.py b/anteater/module/sys/disk_throughput.py
index 9a192fb..7971505 100644
--- a/anteater/module/sys/disk_throughput.py
+++ b/anteater/module/sys/disk_throughput.py
@@ -38,12 +38,12 @@ class DiskThroughputDetector(E2EDetector):
     def init_detectors(self, data_loader):
         if self.job_config.model_config.enable:
             detectors = [
-                NSigmaDetector(data_loader, method='max'),
+                NSigmaDetector(data_loader),
                 OnlineVAEDetector(data_loader, self.job_config.model_config)
             ]
         else:
             detectors = [
-                NSigmaDetector(data_loader, method='max')
+                NSigmaDetector(data_loader)
             ]
 
         return detectors
diff --git a/anteater/module/sys/proc_io_latency.py b/anteater/module/sys/proc_io_latency.py
index a34c48d..b76acea 100644
--- a/anteater/module/sys/proc_io_latency.py
+++ b/anteater/module/sys/proc_io_latency.py
@@ -38,12 +38,12 @@ class ProcIOLatencyDetector(E2EDetector):
     def init_detectors(self, data_loader):
         if self.job_config.model_config.enable:
             detectors = [
-                NSigmaDetector(data_loader, method='max'),
+                NSigmaDetector(data_loader),
                 OnlineVAEDetector(data_loader, self.job_config.model_config)
            ]
         else:
             detectors = [
-                NSigmaDetector(data_loader, method='max')
+                NSigmaDetector(data_loader)
             ]
 
         return detectors
diff --git a/anteater/module/sys/sys_io_latency.py b/anteater/module/sys/sys_io_latency.py
index a6f01c2..17a34c9 100644
--- a/anteater/module/sys/sys_io_latency.py
+++ b/anteater/module/sys/sys_io_latency.py
@@ -38,12 +38,12 @@ class SysIOLatencyDetector(E2EDetector):
     def init_detectors(self, data_loader):
         if self.job_config.model_config.enable:
             detectors = [
-                NSigmaDetector(data_loader, method='abs'),
+                NSigmaDetector(data_loader),
                 OnlineVAEDetector(data_loader, self.job_config.model_config)
             ]
         else:
             detectors = [
-                NSigmaDetector(data_loader, method='abs')
+                NSigmaDetector(data_loader)
             ]
 
         return detectors
diff --git a/anteater/module/sys/tcp_transmission_latency.py b/anteater/module/sys/tcp_transmission_latency.py
index cf0f406..e085ec3 100644
--- a/anteater/module/sys/tcp_transmission_latency.py
+++ b/anteater/module/sys/tcp_transmission_latency.py
@@ -39,12 +39,12 @@ class SysTcpTransmissionLatencyDetector(E2EDetector):
     def init_detectors(self, data_loader):
         if self.job_config.model_config.enable:
             detectors = [
-                TcpTransLatencyNSigmaDetector(data_loader, method='max'),
+                TcpTransLatencyNSigmaDetector(data_loader),
                 OnlineVAEDetector(data_loader, self.job_config.model_config)
             ]
         else:
             detectors = [
-                TcpTransLatencyNSigmaDetector(data_loader, method='max')
+                TcpTransLatencyNSigmaDetector(data_loader)
             ]
 
         return detectors
diff --git a/anteater/module/sys/tcp_transmission_throughput.py b/anteater/module/sys/tcp_transmission_throughput.py
index 86ecc9e..2921602 100644
--- a/anteater/module/sys/tcp_transmission_throughput.py
+++ b/anteater/module/sys/tcp_transmission_throughput.py
@@ -38,12 +38,12 @@ class SysTcpTransmissionThroughputDetector(E2EDetector):
     def init_detectors(self, data_loader):
         if self.job_config.model_config.enable:
             detectors = [
-                NSigmaDetector(data_loader, method='abs'),
+                NSigmaDetector(data_loader),
                 OnlineVAEDetector(data_loader, self.job_config.model_config)
             ]
         else:
             detectors = [
-                NSigmaDetector(data_loader, method='abs')
+                NSigmaDetector(data_loader)
             ]
 
         return detectors
diff --git a/anteater/source/metric_loader.py b/anteater/source/metric_loader.py
index ef2d012..4745d87 100644
--- a/anteater/source/metric_loader.py
+++ b/anteater/source/metric_loader.py
@@ -65,6 +65,43 @@ def get_query(metric: str,
     return query
 
 
+def get_query2(
+        metric: str, operator: str = None, value: float = None, keys: Union[str, List] = None, **labels):
+    """Gets aggregated query patterns
+
+    Format: [operator]([value,] metric{[**labels]}) by (keys)
+
+    Such as:
+        - 1. gala_gopher_bind_sends{machine_id="1234"}
+        - 2. sum(gala_gopher_bind_sends) by (machine_id)
+        - 2. sum(gala_gopher_bind_sends) by (machine_id)
+        - 3. sum(gala_gopher_bind_sends{machine_id="1234"}) by (machine_id)
+        - 4. quantile(0.7, gala_gopher_bind_sends{machine_id="1234"}) by (machine_id)
+    """
+    if operator and not keys:
+        raise ValueError("Please provide param 'keys' when specified 'operator'!")
+
+    rule = ""
+    if labels:
+        pairs = ",".join([f"{n}='{v}'" for n, v in labels.items()])
+        rule = f"{{{pairs}}}"
+
+    group = ""
+    if isinstance(keys, list):
+        group = ",".join([k for k in keys])
+    elif isinstance(keys, str):
+        group = keys
+
+    if operator and value:
+        query = f"{operator}({value}, {metric}{rule}) by ({group})"
+    elif operator:
+        query = f"{operator}({metric}{rule}) by ({group})"
+    else:
+        query = f"{metric}{rule}"
+
+    return query
+
+
 class MetricLoader:
     """
     The metric loader that consumes raw data from PrometheusAdapter,
@@ -87,7 +124,7 @@ class MetricLoader:
 
         :return List of TimeSeries
         """
-        query = get_query(metric, **kwargs)
+        query = get_query2(metric, **kwargs)
         time_series = self.provider.range_query(start, end, metric, query)
 
         return time_series
@@ -109,7 +146,7 @@
         """Gets unique labels of all metrics"""
         unique_labels = set()
         for metric in metrics:
-            time_series = self.get_metric(start, end, metric, label_name=label_name)
+            time_series = self.get_metric(start, end, metric)
             unique_labels.update([item.labels.get(label_name, "") for item in time_series])
 
         return list([lbl for lbl in unique_labels if lbl])
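Note: get_query2 replaces the positional label_name/label_value pair with **labels keyword arguments, which is what lets every call site above collapse to get_metric(..., machine_id=machine_id). A few input/output sketches, following the docstring's own examples:

    get_query2('gala_gopher_bind_sends', machine_id='1234')
    # -> "gala_gopher_bind_sends{machine_id='1234'}"

    get_query2('gala_gopher_bind_sends', operator='sum', keys='machine_id')
    # -> "sum(gala_gopher_bind_sends) by (machine_id)"

    get_query2('gala_gopher_bind_sends', operator='quantile', value=0.7,
               keys='machine_id', machine_id='1234')
    # -> "quantile(0.7, gala_gopher_bind_sends{machine_id='1234'}) by (machine_id)"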
diff --git a/anteater/utils/data_load.py b/anteater/utils/data_load.py
index 730c9c6..60c28e5 100644
--- a/anteater/utils/data_load.py
+++ b/anteater/utils/data_load.py
@@ -48,8 +48,8 @@ def load_job_config(file_name) -> JobConfig:
     keywords = config['keywords']
     root_cause_number = config['root_cause_number']
 
-    kpis = [KPI(**update_description(_conf)) for _conf in config['KPI']]
-    features = [Feature(**update_description(_conf)) for _conf in config['Features']]
+    kpis = [KPI.from_dict(**update_description(_conf)) for _conf in config['KPI']]
+    features = [Feature.from_dict(**update_description(_conf)) for _conf in config['Features']]
 
     model_config = None
     if 'OnlineModel' in config:
diff --git a/config/module/app_sli_rtt.json b/config/module/app_sli_rtt.json
index 0146883..5027b8d 100644
--- a/config/module/app_sli_rtt.json
+++ b/config/module/app_sli_rtt.json
@@ -10,13 +10,14 @@
       "metric": "gala_gopher_sli_rtt_nsec",
       "kpi_type": "rtt",
       "entity_name": "sli",
-      "enable": false,
+      "enable": true,
       "description": "sli rtt 异常",
       "description-zh": "应用级请求往返时延(RTT)异常",
       "params": {
+        "method": "max",
         "look_back": 10,
         "obs_size": 25,
-        "outlier_ratio_th": 0.3,
+        "outlier_ratio_th": 0.5,
         "smooth_params": {
           "method": "conv_smooth",
           "box_pts": 3
@@ -31,6 +32,7 @@
       "description": "sli tps 异常",
       "description-zh": "应用级请求吞吐量(TPS)异常",
       "params": {
+        "method": "min",
         "look_back": 10,
         "obs_size": 25,
         "outlier_ratio_th": 0.3,
diff --git a/config/module/disk_throughput.json b/config/module/disk_throughput.json
index f6244f6..e3bcf68 100644
--- a/config/module/disk_throughput.json
+++ b/config/module/disk_throughput.json
@@ -14,9 +14,10 @@
       "description": "Disk read await time is increasing!",
       "description-zh": "磁盘读响应时间升高,性能发生劣化",
       "params": {
+        "method": "max",
         "look_back": 20,
         "obs_size": 25,
-        "outlier_ratio_th": 0.3,
+        "outlier_ratio_th": 0.5,
         "smooth_params": {
           "method": "conv_smooth",
           "box_pts": 3
@@ -31,9 +32,10 @@
       "description": "Disk write await time is increasing!",
       "description-zh": "磁盘写响应时间升高,性能发生劣化",
       "params": {
+        "method": "max",
         "look_back": 20,
         "obs_size": 25,
-        "outlier_ratio_th": 0.3,
+        "outlier_ratio_th": 0.5,
         "smooth_params": {
           "method": "conv_smooth",
           "box_pts": 3
diff --git a/config/module/proc_io_latency.json b/config/module/proc_io_latency.json
index f086b87..171c5f4 100644
--- a/config/module/proc_io_latency.json
+++ b/config/module/proc_io_latency.json
@@ -14,9 +14,10 @@
       "description": "I/O operation delay at the BIO layer (unit: us)",
       "description-zh": "BIO层I/O操作延时高(单位:us)",
       "params": {
+        "method":"max",
         "look_back": 20,
         "obs_size": 37,
-        "outlier_ratio_th": 0.4,
+        "outlier_ratio_th": 0.5,
         "smooth_params": {
           "method": "conv_smooth",
           "box_pts": 3
@@ -31,9 +32,10 @@
       "description": "Number of small I/O (less than 4 KB) read operations at the BIO layer.",
       "description-zh": "BIO层小数据I/O读操作数量异常(小于4KB)",
       "params": {
+        "method":"max",
         "look_back": 20,
         "obs_size": 25,
-        "outlier_ratio_th": 0.3,
+        "outlier_ratio_th": 0.4,
         "smooth_params": {
           "method": "conv_smooth",
           "box_pts": 3
@@ -48,9 +50,10 @@
       "description": "Number of small I/O (less than 4 KB) write operations at the BIO layer.",
       "description-zh": "BIO层小数据I/O写操作数量异常(小于4KB)",
       "params": {
+        "method":"max",
         "look_back": 20,
         "obs_size": 25,
-        "outlier_ratio_th": 0.3,
+        "outlier_ratio_th": 0.4,
         "smooth_params": {
           "method": "savgol_smooth",
           "window_length": 13,
@@ -66,9 +69,10 @@
       "description": "Number of big I/O (greater than 4 KB) read operations at the BIO layer.",
       "description-zh": "BIO层大数据I/O读操作数量异常(大于4KB)",
       "params": {
+        "method":"max",
         "look_back": 20,
         "obs_size": 25,
-        "outlier_ratio_th": 0.3,
+        "outlier_ratio_th": 0.4,
         "smooth_params": {
           "method": "conv_smooth",
           "box_pts": 3
@@ -83,9 +87,10 @@
       "description": "Number of big I/O (greater than 4 KB) write operations at the BIO layer.",
       "description-zh": "BIO层大数据写操作数量异常(大于4KB)",
       "params": {
+        "method":"max",
         "look_back": 20,
         "obs_size": 25,
-        "outlier_ratio_th": 0.3,
+        "outlier_ratio_th": 0.4,
         "smooth_params": {
           "method": "conv_smooth",
           "box_pts": 3
diff --git a/config/module/sys_io_latency.json b/config/module/sys_io_latency.json
index bdf17d3..3fa1266 100644
--- a/config/module/sys_io_latency.json
+++ b/config/module/sys_io_latency.json
@@ -16,7 +16,7 @@
       "params": {
         "look_back": 20,
         "obs_size": 25,
-        "outlier_ratio_th": 0.3,
+        "outlier_ratio_th": 0.4,
         "smooth_params": {
           "method": "conv_smooth",
           "box_pts": 3
diff --git a/config/module/sys_tcp_establish.json b/config/module/sys_tcp_establish.json
index 7cd2369..9bd2a46 100644
--- a/config/module/sys_tcp_establish.json
+++ b/config/module/sys_tcp_establish.json
@@ -17,7 +17,7 @@
         "look_back": 30,
         "outlier_ratio_th": 0.5,
         "obs_size": 3,
-        "min_rtt": 500000
+        "min_rtt": 100000
       }
     }
   ],
diff --git a/config/module/sys_tcp_transmission_latency.json b/config/module/sys_tcp_transmission_latency.json
index 0527487..3ba8113 100644
--- a/config/module/sys_tcp_transmission_latency.json
+++ b/config/module/sys_tcp_transmission_latency.json
@@ -14,10 +14,12 @@
       "description": "Smoothed Round Trip Time(us)",
       "description-zh": "TCP链接往返时延异常,性能劣化",
       "params": {
+        "method": "max",
         "look_back": 20,
         "obs_size": 25,
         "n": 3,
-        "outlier_ratio_th": 0.4,
+        "min_srtt": 20000,
+        "outlier_ratio_th": 0.6,
         "smooth_params": {
           "method": "conv_smooth",
           "box_pts": 3
-- 
2.33.0