gala-anteater/fix-str2enum-bug-data-query-refactor.patch
Zhen Chen 1f46c219ad Model optimization and bugfix
- fix str2enum bug & data query refactor
- add systemd service for anteater
- remove 'sys-level' config param
- add Chinese descriptions
- Update TCP Establish Model & Add Nic Loss Detector
- Add disk throughput detector

(cherry picked from commit f3c17e8c6a619a7803afd89b945ae3f36d17f9b0)
2023-01-17 22:40:46 +08:00

From 27bb7cdd80f76bfc7ebb0f3041544740aa2fa91b Mon Sep 17 00:00:00 2001
From: lizhenxing11 <lizhenxing11@huawei.com>
Date: Tue, 10 Jan 2023 15:31:44 +0800
Subject: [PATCH] fix str2enum bug & data query refactor
---
anteater/core/anomaly.py | 10 ++++
anteater/core/kpi.py | 14 ++++++
anteater/model/algorithms/slope.py | 11 +++--
anteater/model/detector/base.py | 20 ++++----
anteater/model/detector/n_sigma_detector.py | 15 +++---
.../model/detector/online_vae_detector.py | 3 +-
.../tcp_establish_n_sigma_detector.py | 3 +-
.../tcp_trans_latency_n_sigma_detector.py | 48 +++++++++++++++++--
anteater/model/detector/th_base_detector.py | 3 +-
anteater/module/app/app_sli_detector.py | 4 +-
anteater/module/sys/disk_throughput.py | 4 +-
anteater/module/sys/proc_io_latency.py | 4 +-
anteater/module/sys/sys_io_latency.py | 4 +-
.../module/sys/tcp_transmission_latency.py | 4 +-
.../module/sys/tcp_transmission_throughput.py | 4 +-
anteater/source/metric_loader.py | 41 +++++++++++++++-
anteater/utils/data_load.py | 4 +-
config/module/app_sli_rtt.json | 6 ++-
config/module/disk_throughput.json | 6 ++-
config/module/proc_io_latency.json | 15 ++++--
config/module/sys_io_latency.json | 2 +-
config/module/sys_tcp_establish.json | 2 +-
.../module/sys_tcp_transmission_latency.json | 4 +-
23 files changed, 172 insertions(+), 59 deletions(-)
diff --git a/anteater/core/anomaly.py b/anteater/core/anomaly.py
index 45c4fc3..fdee3d1 100644
--- a/anteater/core/anomaly.py
+++ b/anteater/core/anomaly.py
@@ -52,3 +52,13 @@ class AnomalyTrend(Enum):
DEFAULT = 0
RISE = 1
FALL = 2
+
+ @staticmethod
+ def from_str(label: str):
+ """Trans str to Enum type"""
+ if label.upper() == 'RISE':
+ return AnomalyTrend.RISE
+ elif label.upper() == 'FALL':
+ return AnomalyTrend.FALL
+ else:
+ return AnomalyTrend.DEFAULT
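
The parser added above is case-insensitive and falls back to DEFAULT instead of raising on unknown labels. A quick behavioral check (assuming the anteater package is importable):

    from anteater.core.anomaly import AnomalyTrend

    # config files spell trends as plain strings; matching is case-insensitive
    assert AnomalyTrend.from_str('rise') is AnomalyTrend.RISE
    assert AnomalyTrend.from_str('FALL') is AnomalyTrend.FALL
    assert AnomalyTrend.from_str('sideways') is AnomalyTrend.DEFAULT  # unknown -> DEFAULT
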
diff --git a/anteater/core/kpi.py b/anteater/core/kpi.py
index f83b666..70cc9ee 100644
--- a/anteater/core/kpi.py
+++ b/anteater/core/kpi.py
@@ -27,6 +27,13 @@ class KPI:
params: dict = field(default=dict)
atrend: AnomalyTrend = AnomalyTrend.DEFAULT
+ @classmethod
+ def from_dict(cls, **data):
+ if 'atrend' in data:
+ data['atrend'] = AnomalyTrend.from_str(data.get('atrend'))
+
+ return cls(**data)
+
@dataclass
class Feature:
@@ -35,6 +42,13 @@ class Feature:
priority: int = 0
atrend: AnomalyTrend = AnomalyTrend.DEFAULT
+ @classmethod
+ def from_dict(cls, **data):
+ if 'atrend' in data:
+ data['atrend'] = AnomalyTrend.from_str(data.get('atrend'))
+
+ return cls(**data)
+
@dataclass
class ModelConfig:
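
Together with AnomalyTrend.from_str, these from_dict constructors let a JSON config carry 'atrend' as a plain string. A minimal sketch of the round trip — note the real configs carry more fields (see config/module/*.json); this hypothetical entry is reduced to the two keys that matter here:

    from anteater.core.anomaly import AnomalyTrend
    from anteater.core.kpi import KPI

    conf = {'metric': 'gala_gopher_sli_rtt_nsec', 'atrend': 'rise'}
    kpi = KPI.from_dict(**conf)
    assert kpi.atrend is AnomalyTrend.RISE  # parsed to the Enum, not left as a str
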
diff --git a/anteater/model/algorithms/slope.py b/anteater/model/algorithms/slope.py
index d324d58..e546183 100644
--- a/anteater/model/algorithms/slope.py
+++ b/anteater/model/algorithms/slope.py
@@ -17,6 +17,7 @@ import numpy as np
from anteater.core.anomaly import AnomalyTrend
from anteater.model.algorithms.smooth import conv_smooth
+from anteater.utils.common import divide
def slope(y, win_len):
@@ -36,13 +37,15 @@ def smooth_slope(time_series, windows_length):
def trend(y, win_len=None):
"""Gets the trend for the y"""
+ y = conv_smooth(y, box_pts=7)
+
if not win_len:
win_len = len(y) // 2
- if np.mean(y[:win_len]) < np.mean(y[-win_len:]):
+ if divide(np.mean(y[:win_len]), np.mean(y[-win_len:])) < 0.9:
return 1
- elif np.mean(y[:win_len]) > np.mean(y[-win_len:]):
+ elif divide(np.mean(y[:win_len]), np.mean(y[-win_len:])) > 1.1:
return -1
else:
@@ -51,10 +54,10 @@ def trend(y, win_len=None):
def check_trend(values: List[float], atrend: AnomalyTrend):
"""Checks the values with an 'atrend' trend"""
- if atrend == AnomalyTrend.RISE and trend(values) < 0:
+ if atrend == AnomalyTrend.RISE and trend(values) != 1:
return False
- if atrend == AnomalyTrend.FALL and trend(values) > 0:
+ if atrend == AnomalyTrend.FALL and trend(values) != -1:
return False
return True
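
The rewritten trend() smooths first and replaces the exact mean comparison with a ratio deadband, so tiny drifts no longer register as a direction; check_trend() then demands an exact match (1 for RISE, -1 for FALL) rather than merely a non-negative sign. A standalone sketch of the new logic, with conv_smooth re-stubbed as a boxcar filter and the flat-case return value (0) assumed, since the hunk elides it:

    import numpy as np

    def divide(a, b):
        # stand-in for anteater.utils.common.divide: zero-safe division (assumption)
        return a / b if b else 0

    def trend(y, win_len=None):
        y = np.convolve(y, np.ones(7) / 7, mode='same')  # crude conv_smooth(box_pts=7)
        if not win_len:
            win_len = len(y) // 2
        ratio = divide(np.mean(y[:win_len]), np.mean(y[-win_len:]))
        if ratio < 0.9:
            return 1    # head mean < 0.9 * tail mean: rising
        elif ratio > 1.1:
            return -1   # head mean > 1.1 * tail mean: falling
        return 0        # within the deadband: flat

    print(trend([1.0] * 30 + [2.0] * 30))  # 1  (rise)
    print(trend([1.0] * 60))               # 0  (flat)
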
diff --git a/anteater/model/detector/base.py b/anteater/model/detector/base.py
index 2b2dafe..a23b6d9 100644
--- a/anteater/model/detector/base.py
+++ b/anteater/model/detector/base.py
@@ -11,6 +11,7 @@
# See the Mulan PSL v2 for more details.
# ******************************************************************************/
+import logging
import math
from abc import abstractmethod
from typing import List
@@ -39,12 +40,6 @@ class Detector:
"""Executes anomaly detection on kpis"""
pass
- def get_unique_machine_id(self, start, end, kpis: List[KPI]) -> List[str]:
- """Gets unique machine ids during past minutes"""
- metrics = [_kpi.metric for _kpi in kpis]
- machine_ids = self.data_loader.get_unique_machines(start, end, metrics)
- return machine_ids
-
def execute(self, job_config: JobConfig) -> List[Anomaly]:
"""The main function of the detector"""
kpis = job_config.kpis
@@ -56,6 +51,12 @@ class Detector:
return self._execute(kpis, features, top_n=n)
+ def get_unique_machine_id(self, start, end, kpis: List[KPI]) -> List[str]:
+ """Gets unique machine ids during past minutes"""
+ metrics = [_kpi.metric for _kpi in kpis]
+ machine_ids = self.data_loader.get_unique_machines(start, end, metrics)
+ return machine_ids
+
def find_root_causes(self, anomalies: List[Anomaly], features: List[Feature], top_n=3)\
-> List[Anomaly]:
"""Finds root causes for each anomaly events"""
@@ -82,6 +83,7 @@ class Detector:
tmp_ts_scores = self.cal_anomaly_score(f.metric, f.description, machine_id=machine_id)
for _ts_score in tmp_ts_scores:
if not check_trend(_ts_score.ts.values, f.atrend):
+ logging.info(f"Trends Filtered: {f.metric}")
_ts_score.score = 0
if same_intersection_key_value(_ts_score.ts.labels, filters):
ts_scores.append(_ts_score)
@@ -101,6 +103,7 @@ class Detector:
for _ts_s in ts_scores:
if same_intersection_key_value(_ts_s.ts.labels, anomaly.labels):
if not check_trend(_ts_s.ts.values, kpi_atrends[anomaly.metric]):
+ logging.info(f"Trends Filtered: {anomaly.metric}")
anomaly.score = 0
else:
anomaly.score = _ts_s.score
@@ -115,12 +118,11 @@ class Detector:
machine_id: str)\
-> List[TimeSeriesScore]:
"""Calculates metric anomaly scores based on sr model"""
- start, end = dt.last(minutes=6)
+ start, end = dt.last(minutes=10)
point_count = self.data_loader.expected_point_length(start, end)
model = SpectralResidual(12, 24, 50)
ts_scores = []
- ts_list = self.data_loader.\
- get_metric(start, end, metric, label_name='machine_id', label_value=machine_id)
+ ts_list = self.data_loader.get_metric(start, end, metric, machine_id=machine_id)
for _ts in ts_list:
if sum(_ts.values) == 0 or \
len(_ts.values) < point_count * 0.9 or\
diff --git a/anteater/model/detector/n_sigma_detector.py b/anteater/model/detector/n_sigma_detector.py
index 3a2ab01..dbf83c6 100644
--- a/anteater/model/detector/n_sigma_detector.py
+++ b/anteater/model/detector/n_sigma_detector.py
@@ -29,10 +29,9 @@ from anteater.utils.log import logger
class NSigmaDetector(Detector):
"""The three sigma anomaly detector"""
- def __init__(self, data_loader: MetricLoader, method: str):
+ def __init__(self, data_loader: MetricLoader):
"""The detector base class initializer"""
super().__init__(data_loader)
- self.method = method
def detect_kpis(self, kpis: List[KPI]):
"""Executes anomaly detection on kpis"""
@@ -48,7 +47,7 @@ class NSigmaDetector(Detector):
def detect_signal_kpi(self, kpi, machine_id: str) -> List[Anomaly]:
"""Detects kpi based on signal time series anomaly detection model"""
outlier_ratio_th = kpi.params['outlier_ratio_th']
- ts_scores = self.calculate_metric_three_sigma_score(
+ ts_scores = self.calculate_n_sigma_score(
kpi.metric, kpi.description, machine_id, **kpi.params)
if not ts_scores:
logger.warning(f'Key metric {kpi.metric} is null on the target machine {machine_id}!')
@@ -68,17 +67,17 @@ class NSigmaDetector(Detector):
return anomalies
- def calculate_metric_three_sigma_score(self, metric, description, machine_id: str, **kwargs)\
+ def calculate_n_sigma_score(self, metric, description, machine_id: str, **kwargs)\
-> List[TimeSeriesScore]:
"""Calculate kpi anomaly scores based on three sigma scores"""
+ method = kwargs.get('method', 'abs')
look_back = kwargs.get('look_back')
smooth_params = kwargs.get('smooth_params')
obs_size = kwargs.get('obs_size')
n = kwargs.get('n', 3)
start, end = dt.last(minutes=look_back)
point_count = self.data_loader.expected_point_length(start, end)
- ts_list = self.data_loader.\
- get_metric(start, end, metric, label_name='machine_id', label_value=machine_id)
+ ts_list = self.data_loader.get_metric(start, end, metric, machine_id=machine_id)
ts_scores = []
for _ts in ts_list:
dedup_values = [k for k, g in groupby(_ts.values)]
@@ -87,12 +86,12 @@ class NSigmaDetector(Detector):
len(_ts.values) > point_count * 1.5 or \
all(x == _ts.values[0] for x in _ts.values):
ratio = 0
- elif len(dedup_values) < point_count * 0.3:
+ elif len(dedup_values) < point_count * 0.6:
ratio = 0
else:
smoothed_val = smoothing(_ts.values, **smooth_params)
outlier, mean, std = n_sigma(
- smoothed_val, obs_size=obs_size, n=n, method=self.method)
+ smoothed_val, obs_size=obs_size, n=n, method=method)
ratio = divide(len(outlier), obs_size)
ts_scores.append(TimeSeriesScore(ts=_ts, score=ratio, description=description))
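
With 'method' now read from kwargs (defaulting to 'abs'), a single NSigmaDetector can apply a different sigma direction per metric — the json changes below set 'max' for latency-style KPIs and 'min' for throughput. n_sigma itself is not part of this patch; the sketch below is an assumption matching the call site, not the project's implementation — fit mean/std on the history and flag observation-window points beyond n sigmas in the selected direction:

    import numpy as np

    def n_sigma(values, obs_size=25, n=3, method='abs'):
        """Assumed contract: returns (outliers, mean, std), where outliers are
        the last obs_size points lying beyond n sigmas of the earlier history."""
        train = np.array(values[:-obs_size])
        obs = np.array(values[-obs_size:])
        mean, std = train.mean(), train.std()
        if method == 'max':      # only upward excursions count
            mask = obs > mean + n * std
        elif method == 'min':    # only downward excursions count
            mask = obs < mean - n * std
        else:                    # 'abs': both directions
            mask = np.abs(obs - mean) > n * std
        return obs[mask].tolist(), mean, std

    vals = [10.0] * 100 + [10.0, 10.0, 80.0]
    outliers, _, _ = n_sigma(vals, obs_size=3, n=3, method='max')
    print(outliers)  # [80.0]
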
diff --git a/anteater/model/detector/online_vae_detector.py b/anteater/model/detector/online_vae_detector.py
index 63a7b09..0f91576 100644
--- a/anteater/model/detector/online_vae_detector.py
+++ b/anteater/model/detector/online_vae_detector.py
@@ -110,8 +110,7 @@ class OnlineVAEDetector(Detector):
metric_dfs = []
for metric in metrics:
_ts_list = self.data_loader.\
- get_metric(start, end, metric, label_name="machine_id",
- label_value=machine_id, operator_name='avg')
+ get_metric(start, end, metric, operator='avg', keys="machine_id", machine_id=machine_id)
if len(_ts_list) > 1:
raise ValueError(f'Got multiple time_series based on machine id: {len(_ts_list)}')
diff --git a/anteater/model/detector/tcp_establish_n_sigma_detector.py b/anteater/model/detector/tcp_establish_n_sigma_detector.py
index 82d7837..3720069 100644
--- a/anteater/model/detector/tcp_establish_n_sigma_detector.py
+++ b/anteater/model/detector/tcp_establish_n_sigma_detector.py
@@ -73,8 +73,7 @@ class TcpEstablishNSigmaDetector(Detector):
min_rtt = kpi.params.get('min_rtt')
start, end = dt.last(minutes=look_back)
- ts_list = self.data_loader.\
- get_metric(start, end, kpi.metric, label_name='machine_id', label_value=machine_id)
+ ts_list = self.data_loader.get_metric(start, end, kpi.metric, machine_id=machine_id)
anomalies = []
for _ts in ts_list:
diff --git a/anteater/model/detector/tcp_trans_latency_n_sigma_detector.py b/anteater/model/detector/tcp_trans_latency_n_sigma_detector.py
index 1eeb95f..6d41775 100644
--- a/anteater/model/detector/tcp_trans_latency_n_sigma_detector.py
+++ b/anteater/model/detector/tcp_trans_latency_n_sigma_detector.py
@@ -11,20 +11,61 @@
# See the Mulan PSL v2 for more details.
# ******************************************************************************/
+from itertools import groupby
from typing import List
+import numpy as np
+
from anteater.core.time_series import TimeSeriesScore
+from anteater.model.algorithms.smooth import smoothing
+from anteater.model.algorithms.three_sigma import n_sigma
from anteater.model.detector.n_sigma_detector import NSigmaDetector
from anteater.source.metric_loader import MetricLoader
+from anteater.utils.common import divide
from anteater.utils.datetime import DateTimeManager as dt
class TcpTransLatencyNSigmaDetector(NSigmaDetector):
"""The three sigma anomaly detector"""
- def __init__(self, data_loader: MetricLoader, method: str):
+ def __init__(self, data_loader: MetricLoader):
"""The detector base class initializer"""
- super().__init__(data_loader, method)
+ super().__init__(data_loader)
+
+ def calculate_n_sigma_score(self, metric, description, machine_id: str, **kwargs)\
+ -> List[TimeSeriesScore]:
+ """Calculates anomaly scores based on n sigma scores"""
+ method = kwargs.get('method', 'abs')
+ look_back = kwargs.get('look_back')
+ smooth_params = kwargs.get('smooth_params')
+ obs_size = kwargs.get('obs_size')
+ min_srtt = kwargs.get("min_srtt")
+ n = kwargs.get('n', 3)
+ start, end = dt.last(minutes=look_back)
+ point_count = self.data_loader.expected_point_length(start, end)
+ ts_list = self.data_loader.get_metric(start, end, metric, machine_id=machine_id)
+ ts_scores = []
+ for _ts in ts_list:
+ dedup_values = [k for k, g in groupby(_ts.values)]
+ if sum(_ts.values) == 0 or \
+ len(_ts.values) < point_count * 0.6 or \
+ len(_ts.values) > point_count * 1.5 or \
+ all(x == _ts.values[0] for x in _ts.values):
+ ratio = 0
+ elif len(dedup_values) < point_count * 0.6:
+ ratio = 0
+ else:
+ smoothed_val = smoothing(_ts.values, **smooth_params)
+ outlier, mean, std = n_sigma(
+ smoothed_val, obs_size=obs_size, n=n, method=method)
+ if outlier and np.average(outlier) <= min_srtt:
+ ratio = 0
+ else:
+ ratio = divide(len(outlier), obs_size)
+
+ ts_scores.append(TimeSeriesScore(ts=_ts, score=ratio, description=description))
+
+ return ts_scores
def cal_anomaly_score(self, metric, description, machine_id: str) \
-> List[TimeSeriesScore]:
@@ -32,8 +73,7 @@ class TcpTransLatencyNSigmaDetector(NSigmaDetector):
start, end = dt.last(minutes=2)
point_count = self.data_loader.expected_point_length(start, end)
ts_scores = []
- ts_list = self.data_loader. \
- get_metric(start, end, metric, label_name='machine_id', label_value=machine_id)
+ ts_list = self.data_loader.get_metric(start, end, metric, machine_id=machine_id)
for _ts in ts_list:
if sum(_ts.values) == 0 or \
len(_ts.values) < point_count * 0.5 or \
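
Beyond moving 'method' into kwargs, the subclass's one behavioral addition is the min_srtt gate: a window can be statistically anomalous yet operationally uninteresting when the outlying srtt values are still small in absolute terms. A standalone illustration of the gate (20000 us taken from the sys_tcp_transmission_latency.json change below):

    import numpy as np

    def gated_ratio(outlier, obs_size, min_srtt=20000):
        # mirrors the hunk above: suppress the score when the mean outlier
        # srtt sits at or below the absolute floor, else report len/obs_size
        if outlier and np.average(outlier) <= min_srtt:
            return 0
        return len(outlier) / obs_size if obs_size else 0

    print(gated_ratio([15000, 18000], 25))  # 0     (anomalous but tiny: suppressed)
    print(gated_ratio([45000, 52000], 25))  # 0.08  (kept)
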
diff --git a/anteater/model/detector/th_base_detector.py b/anteater/model/detector/th_base_detector.py
index bec9705..0af4f22 100644
--- a/anteater/model/detector/th_base_detector.py
+++ b/anteater/model/detector/th_base_detector.py
@@ -44,8 +44,7 @@ class ThBaseDetector(Detector):
look_back = kpi.params.get('look_back')
th = kpi.params.get('th')
start, end = dt.last(minutes=look_back)
- ts_list = self.data_loader.\
- get_metric(start, end, kpi.metric, label_name='machine_id', label_value=machine_id)
+ ts_list = self.data_loader.get_metric(start, end, kpi.metric, machine_id=machine_id)
if not ts_list:
logger.warning(f'Key metric {kpi.metric} is null on the target machine {machine_id}!')
diff --git a/anteater/module/app/app_sli_detector.py b/anteater/module/app/app_sli_detector.py
index 102ed11..e506332 100644
--- a/anteater/module/app/app_sli_detector.py
+++ b/anteater/module/app/app_sli_detector.py
@@ -44,12 +44,12 @@ class APPSliDetector(E2EDetector):
def init_detectors(self, data_loader):
if self.job_config.model_config.enable:
detectors = [
- NSigmaDetector(data_loader, method='min'),
+ NSigmaDetector(data_loader),
OnlineVAEDetector(data_loader, self.job_config.model_config)
]
else:
detectors = [
- NSigmaDetector(data_loader, method='min')
+ NSigmaDetector(data_loader)
]
return detectors
diff --git a/anteater/module/sys/disk_throughput.py b/anteater/module/sys/disk_throughput.py
index 9a192fb..7971505 100644
--- a/anteater/module/sys/disk_throughput.py
+++ b/anteater/module/sys/disk_throughput.py
@@ -38,12 +38,12 @@ class DiskThroughputDetector(E2EDetector):
def init_detectors(self, data_loader):
if self.job_config.model_config.enable:
detectors = [
- NSigmaDetector(data_loader, method='max'),
+ NSigmaDetector(data_loader),
OnlineVAEDetector(data_loader, self.job_config.model_config)
]
else:
detectors = [
- NSigmaDetector(data_loader, method='max')
+ NSigmaDetector(data_loader)
]
return detectors
diff --git a/anteater/module/sys/proc_io_latency.py b/anteater/module/sys/proc_io_latency.py
index a34c48d..b76acea 100644
--- a/anteater/module/sys/proc_io_latency.py
+++ b/anteater/module/sys/proc_io_latency.py
@@ -38,12 +38,12 @@ class ProcIOLatencyDetector(E2EDetector):
def init_detectors(self, data_loader):
if self.job_config.model_config.enable:
detectors = [
- NSigmaDetector(data_loader, method='max'),
+ NSigmaDetector(data_loader),
OnlineVAEDetector(data_loader, self.job_config.model_config)
]
else:
detectors = [
- NSigmaDetector(data_loader, method='max')
+ NSigmaDetector(data_loader)
]
return detectors
diff --git a/anteater/module/sys/sys_io_latency.py b/anteater/module/sys/sys_io_latency.py
index a6f01c2..17a34c9 100644
--- a/anteater/module/sys/sys_io_latency.py
+++ b/anteater/module/sys/sys_io_latency.py
@@ -38,12 +38,12 @@ class SysIOLatencyDetector(E2EDetector):
def init_detectors(self, data_loader):
if self.job_config.model_config.enable:
detectors = [
- NSigmaDetector(data_loader, method='abs'),
+ NSigmaDetector(data_loader),
OnlineVAEDetector(data_loader, self.job_config.model_config)
]
else:
detectors = [
- NSigmaDetector(data_loader, method='abs')
+ NSigmaDetector(data_loader)
]
return detectors
diff --git a/anteater/module/sys/tcp_transmission_latency.py b/anteater/module/sys/tcp_transmission_latency.py
index cf0f406..e085ec3 100644
--- a/anteater/module/sys/tcp_transmission_latency.py
+++ b/anteater/module/sys/tcp_transmission_latency.py
@@ -39,12 +39,12 @@ class SysTcpTransmissionLatencyDetector(E2EDetector):
def init_detectors(self, data_loader):
if self.job_config.model_config.enable:
detectors = [
- TcpTransLatencyNSigmaDetector(data_loader, method='max'),
+ TcpTransLatencyNSigmaDetector(data_loader),
OnlineVAEDetector(data_loader, self.job_config.model_config)
]
else:
detectors = [
- TcpTransLatencyNSigmaDetector(data_loader, method='max')
+ TcpTransLatencyNSigmaDetector(data_loader)
]
return detectors
diff --git a/anteater/module/sys/tcp_transmission_throughput.py b/anteater/module/sys/tcp_transmission_throughput.py
index 86ecc9e..2921602 100644
--- a/anteater/module/sys/tcp_transmission_throughput.py
+++ b/anteater/module/sys/tcp_transmission_throughput.py
@@ -38,12 +38,12 @@ class SysTcpTransmissionThroughputDetector(E2EDetector):
def init_detectors(self, data_loader):
if self.job_config.model_config.enable:
detectors = [
- NSigmaDetector(data_loader, method='abs'),
+ NSigmaDetector(data_loader),
OnlineVAEDetector(data_loader, self.job_config.model_config)
]
else:
detectors = [
- NSigmaDetector(data_loader, method='abs')
+ NSigmaDetector(data_loader)
]
return detectors
diff --git a/anteater/source/metric_loader.py b/anteater/source/metric_loader.py
index ef2d012..4745d87 100644
--- a/anteater/source/metric_loader.py
+++ b/anteater/source/metric_loader.py
@@ -65,6 +65,43 @@ def get_query(metric: str,
return query
+def get_query2(
+ metric: str, operator: str = None, value: float = None, keys: Union[str, List] = None, **labels):
+ """Gets aggregated query patterns
+
+ Format: [operator]([value,] metric{[**labels]}) by (keys)
+
+ Such as:
+ - 1. gala_gopher_bind_sends{machine_id="1234"}
+ - 2. sum(gala_gopher_bind_sends) by (machine_id)
+ - 2. sum(gala_gopher_bind_sends) by (machine_id)
+ - 4. quantile(0.7, gala_gopher_bind_sends{machine_id="1234"}) by (machine_id)
+ """
+ if operator and not keys:
+ raise ValueError("Please provide param 'keys' when 'operator' is specified!")
+
+ rule = ""
+ if labels:
+ pairs = ",".join([f"{n}='{v}'" for n, v in labels.items()])
+ rule = f"{{{pairs}}}"
+
+ group = ""
+ if isinstance(keys, list):
+ group = ",".join([k for k in keys])
+ elif isinstance(keys, str):
+ group = keys
+
+ if operator and value is not None:
+ query = f"{operator}({value}, {metric}{rule}) by ({group})"
+ elif operator:
+ query = f"{operator}({metric}{rule}) by ({group})"
+ else:
+ query = f"{metric}{rule}"
+
+ return query
+
+
class MetricLoader:
"""
The metric loader that consumes raw data from PrometheusAdapter,
@@ -87,7 +124,7 @@ class MetricLoader:
:return List of TimeSeries
"""
- query = get_query(metric, **kwargs)
+ query = get_query2(metric, **kwargs)
time_series = self.provider.range_query(start, end, metric, query)
return time_series
@@ -109,7 +146,7 @@ class MetricLoader:
"""Gets unique labels of all metrics"""
unique_labels = set()
for metric in metrics:
- time_series = self.get_metric(start, end, metric, label_name=label_name)
+ time_series = self.get_metric(start, end, metric)
unique_labels.update([item.labels.get(label_name, "") for item in time_series])
return list([lbl for lbl in unique_labels if lbl])
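
Because every call site now passes machine_id as a label kwarg (and operator/keys for aggregation), the docstring's examples can be verified directly against the new builder:

    from anteater.source.metric_loader import get_query2

    # plain selector: label kwargs become a {k='v'} matcher
    print(get_query2('gala_gopher_bind_sends', machine_id='1234'))
    # gala_gopher_bind_sends{machine_id='1234'}

    # aggregation: 'keys' is mandatory once 'operator' is given
    print(get_query2('gala_gopher_bind_sends', operator='sum', keys='machine_id'))
    # sum(gala_gopher_bind_sends) by (machine_id)

    # parameterized aggregation, e.g. a quantile
    print(get_query2('gala_gopher_bind_sends', operator='quantile', value=0.7,
                     keys='machine_id', machine_id='1234'))
    # quantile(0.7, gala_gopher_bind_sends{machine_id='1234'}) by (machine_id)
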
diff --git a/anteater/utils/data_load.py b/anteater/utils/data_load.py
index 730c9c6..60c28e5 100644
--- a/anteater/utils/data_load.py
+++ b/anteater/utils/data_load.py
@@ -48,8 +48,8 @@ def load_job_config(file_name) -> JobConfig:
keywords = config['keywords']
root_cause_number = config['root_cause_number']
- kpis = [KPI(**update_description(_conf)) for _conf in config['KPI']]
- features = [Feature(**update_description(_conf)) for _conf in config['Features']]
+ kpis = [KPI.from_dict(**update_description(_conf)) for _conf in config['KPI']]
+ features = [Feature.from_dict(**update_description(_conf)) for _conf in config['Features']]
model_config = None
if 'OnlineModel' in config:
diff --git a/config/module/app_sli_rtt.json b/config/module/app_sli_rtt.json
index 0146883..5027b8d 100644
--- a/config/module/app_sli_rtt.json
+++ b/config/module/app_sli_rtt.json
@@ -10,13 +10,14 @@
"metric": "gala_gopher_sli_rtt_nsec",
"kpi_type": "rtt",
"entity_name": "sli",
- "enable": false,
+ "enable": true,
"description": "sli rtt 异常",
"description-zh": "应用级请求往返时延RTT异常",
"params": {
+ "method": "max",
"look_back": 10,
"obs_size": 25,
- "outlier_ratio_th": 0.3,
+ "outlier_ratio_th": 0.5,
"smooth_params": {
"method": "conv_smooth",
"box_pts": 3
@@ -31,6 +32,7 @@
"description": "sli tps 异常",
"description-zh": "应用级请求吞吐量TPS异常",
"params": {
+ "method": "min",
"look_back": 10,
"obs_size": 25,
"outlier_ratio_th": 0.3,
diff --git a/config/module/disk_throughput.json b/config/module/disk_throughput.json
index f6244f6..e3bcf68 100644
--- a/config/module/disk_throughput.json
+++ b/config/module/disk_throughput.json
@@ -14,9 +14,10 @@
"description": "Disk read await time is increasing!",
"description-zh": "磁盘读响应时间升高,性能发生劣化",
"params": {
+ "method": "max",
"look_back": 20,
"obs_size": 25,
- "outlier_ratio_th": 0.3,
+ "outlier_ratio_th": 0.5,
"smooth_params": {
"method": "conv_smooth",
"box_pts": 3
@@ -31,9 +32,10 @@
"description": "Disk write await time is increasing!",
"description-zh": "磁盘写响应时间升高,性能发生劣化",
"params": {
+ "method": "max",
"look_back": 20,
"obs_size": 25,
- "outlier_ratio_th": 0.3,
+ "outlier_ratio_th": 0.5,
"smooth_params": {
"method": "conv_smooth",
"box_pts": 3
diff --git a/config/module/proc_io_latency.json b/config/module/proc_io_latency.json
index f086b87..171c5f4 100644
--- a/config/module/proc_io_latency.json
+++ b/config/module/proc_io_latency.json
@@ -14,9 +14,10 @@
"description": "I/O operation delay at the BIO layer (unit: us)",
"description-zh": "BIO层I/O操作延时高(单位us)",
"params": {
+ "method":"max",
"look_back": 20,
"obs_size": 37,
- "outlier_ratio_th": 0.4,
+ "outlier_ratio_th": 0.5,
"smooth_params": {
"method": "conv_smooth",
"box_pts": 3
@@ -31,9 +32,10 @@
"description": "Number of small I/O (less than 4 KB) read operations at the BIO layer.",
"description-zh": "BIO层小数据I/O读操作数量异常小于4KB",
"params": {
+ "method":"max",
"look_back": 20,
"obs_size": 25,
- "outlier_ratio_th": 0.3,
+ "outlier_ratio_th": 0.4,
"smooth_params": {
"method": "conv_smooth",
"box_pts": 3
@@ -48,9 +50,10 @@
"description": "Number of small I/O (less than 4 KB) write operations at the BIO layer.",
"description-zh": "BIO层小数据I/O写操作数量异常小于4KB",
"params": {
+ "method":"max",
"look_back": 20,
"obs_size": 25,
- "outlier_ratio_th": 0.3,
+ "outlier_ratio_th": 0.4,
"smooth_params": {
"method": "savgol_smooth",
"window_length": 13,
@@ -66,9 +69,10 @@
"description": "Number of big I/O (greater than 4 KB) read operations at the BIO layer.",
"description-zh": "BIO层大数据I/O读操作数量异常大于4KB",
"params": {
+ "method":"max",
"look_back": 20,
"obs_size": 25,
- "outlier_ratio_th": 0.3,
+ "outlier_ratio_th": 0.4,
"smooth_params": {
"method": "conv_smooth",
"box_pts": 3
@@ -83,9 +87,10 @@
"description": "Number of big I/O (greater than 4 KB) write operations at the BIO layer.",
"description-zh": "BIO层大数据写操作数量异常大于4KB",
"params": {
+ "method":"max",
"look_back": 20,
"obs_size": 25,
- "outlier_ratio_th": 0.3,
+ "outlier_ratio_th": 0.4,
"smooth_params": {
"method": "conv_smooth",
"box_pts": 3
diff --git a/config/module/sys_io_latency.json b/config/module/sys_io_latency.json
index bdf17d3..3fa1266 100644
--- a/config/module/sys_io_latency.json
+++ b/config/module/sys_io_latency.json
@@ -16,7 +16,7 @@
"params": {
"look_back": 20,
"obs_size": 25,
- "outlier_ratio_th": 0.3,
+ "outlier_ratio_th": 0.4,
"smooth_params": {
"method": "conv_smooth",
"box_pts": 3
diff --git a/config/module/sys_tcp_establish.json b/config/module/sys_tcp_establish.json
index 7cd2369..9bd2a46 100644
--- a/config/module/sys_tcp_establish.json
+++ b/config/module/sys_tcp_establish.json
@@ -17,7 +17,7 @@
"look_back": 30,
"outlier_ratio_th": 0.5,
"obs_size": 3,
- "min_rtt": 500000
+ "min_rtt": 100000
}
}
],
diff --git a/config/module/sys_tcp_transmission_latency.json b/config/module/sys_tcp_transmission_latency.json
index 0527487..3ba8113 100644
--- a/config/module/sys_tcp_transmission_latency.json
+++ b/config/module/sys_tcp_transmission_latency.json
@@ -14,10 +14,12 @@
"description": "Smoothed Round Trip Time(us)",
"description-zh": "TCP链接往返时延异常性能劣化",
"params": {
+ "method": "max",
"look_back": 20,
"obs_size": 25,
"n": 3,
- "outlier_ratio_th": 0.4,
+ "min_srtt": 20000,
+ "outlier_ratio_th": 0.6,
"smooth_params": {
"method": "conv_smooth",
"box_pts": 3
--
2.33.0