gala-anteater/add_metrics_anomaly_trends_indicator.patch
lizhenxing11 8ec215e3c4 updates the model and improves cause inference
update changelog

update prep config

update changelog
2022-11-22 17:29:25 +08:00

285 lines
9.3 KiB
Diff
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

From ece4a0551bd81f64158ab465a865e31e97b63562 Mon Sep 17 00:00:00 2001
From: lizhenxing11 <lizhenxing11@huawei.com>
Date: Mon, 21 Nov 2022 14:54:20 +0800
Subject: [PATCH 2/2] Add Metrics Anomaly Trends Indicator
update config file
---
anteater/core/feature.py | 8 +++++
anteater/model/slope.py | 15 +++++++++
anteater/module/app_sli_detector.py | 20 ++++++++---
anteater/utils/data_load.py | 18 ++++++++--
config/module/app_sli_rtt.json | 52 ++++++++++++++++++++---------
5 files changed, 90 insertions(+), 23 deletions(-)
diff --git a/anteater/core/feature.py b/anteater/core/feature.py
index 306d835..6db764d 100644
--- a/anteater/core/feature.py
+++ b/anteater/core/feature.py
@@ -12,6 +12,13 @@
# ******************************************************************************/
from dataclasses import dataclass
+from enum import Enum
+
+
+class AnomalyTrend(Enum):  # direction tag stored in Feature.atrend and read by APPSliDetector.detect_features
+ DEFAULT = 0  # no trend filter: detect_features never skips the metric because of its trend
+ RISE = 1  # detect_features skips the metric when trend() of its values is negative
+ FALL = 2  # detect_features skips the metric when trend() of its values is positive
@dataclass
@@ -19,3 +26,4 @@ class Feature:
metric: str
description: str
priority: int = 0
+ atrend: AnomalyTrend = AnomalyTrend.DEFAULT
diff --git a/anteater/model/slope.py b/anteater/model/slope.py
index 422d6bc..08c4211 100644
--- a/anteater/model/slope.py
+++ b/anteater/model/slope.py
@@ -29,3 +29,18 @@ def smooth_slope(time_series, windows_length):
val = conv_smooth(time_series.to_df(), box_pts=13)
val = slope(val, win_len=13)
return val[-windows_length:]
+
+
+def trend(y, win_len=None):
+ """Compares the mean of the first and the last win_len points of y: returns 1 if the tail mean is larger, -1 if it is smaller, 0 otherwise."""
+ if not win_len:
+ win_len = len(y) // 2  # default window: half of the series (also applied when win_len is 0)
+
+ if np.mean(y[:win_len]) < np.mean(y[-win_len:]):  # NOTE(review): if win_len is still 0 (len(y) < 2), y[:0] is empty and y[-0:] is all of y
+ return 1
+
+ elif np.mean(y[:win_len]) > np.mean(y[-win_len:]):
+ return -1
+
+ else:
+ return 0
diff --git a/anteater/module/app_sli_detector.py b/anteater/module/app_sli_detector.py
index b69f73c..b63f5e2 100644
--- a/anteater/module/app_sli_detector.py
+++ b/anteater/module/app_sli_detector.py
@@ -20,7 +20,9 @@ import math
from typing import List
from anteater.core.anomaly import Anomaly
+from anteater.core.feature import AnomalyTrend
from anteater.model.algorithms.spectral_residual import SpectralResidual
+from anteater.model.slope import trend
from anteater.model.smoother import conv_smooth
from anteater.model.three_sigma import three_sigma
from anteater.module.detector import Detector
@@ -134,10 +136,11 @@ class APPSliDetector(Detector):
return anomalies
- def detect_features(self, metrics, machine_id: str, top_n):
+ def detect_features(self, machine_id: str, top_n):
+ metric_atrend = {f.metric: f.atrend for f in self.features}
start, end = dt.last(minutes=6)
time_series_list = []
- for metric in metrics:
+ for metric in metric_atrend.keys():
time_series = self.data_loader.get_metric(
start, end, metric, label_name='machine_id', label_value=machine_id)
time_series_list.extend(time_series)
@@ -156,8 +159,16 @@ class APPSliDetector(Detector):
if all(x == values[0] for x in values):
continue
+ if trend(time_series.values) < 0 and \
+ metric_atrend[time_series.metric] == AnomalyTrend.RISE:
+ continue
+
+ if trend(time_series.values) > 0 and \
+ metric_atrend[time_series.metric] == AnomalyTrend.FALL:
+ continue
+
scores = sr_model.compute_score(values)
- score = max(scores[-13:])
+ score = max(scores[-25:])
if math.isnan(score) or math.isinf(score):
continue
@@ -170,9 +181,8 @@ class APPSliDetector(Detector):
def report(self, anomaly: Anomaly, machine_id: str):
"""Reports a single anomaly at each time"""
- feature_metrics = [f.metric for f in self.features]
description = {f.metric: f.description for f in self.features}
- cause_metrics = self.detect_features(feature_metrics, machine_id, top_n=60)
+ cause_metrics = self.detect_features(machine_id, top_n=60)
cause_metrics = [
{'metric': cause[0].metric,
'label': cause[0].labels,
diff --git a/anteater/utils/data_load.py b/anteater/utils/data_load.py
index 108d5ed..f8ce277 100644
--- a/anteater/utils/data_load.py
+++ b/anteater/utils/data_load.py
@@ -17,7 +17,7 @@ from os import path, sep
from json import JSONDecodeError
from typing import List, Tuple
-from anteater.core.feature import Feature
+from anteater.core.feature import AnomalyTrend, Feature
from anteater.core.kpi import KPI
from anteater.utils.log import logger
@@ -76,7 +76,21 @@ def load_kpi_feature(file_name) -> Tuple[List[KPI], List[Feature]]:
raise e
kpis = [KPI(**param) for param in params.get('KPI')]
- features = [Feature(**param) for param in params.get('Features')]
+
+ features = []
+ for param in params.get('Features'):
+ parsed_param = {}
+ for key, value in param.items():
+ if key == 'atrend':
+ if value.lower() == 'rise':
+ value = AnomalyTrend.RISE
+ elif value.lower() == 'fall':
+ value = AnomalyTrend.FALL
+ else:
+ value = AnomalyTrend.DEFAULT
+ parsed_param[key] = value
+
+ features.append(Feature(**parsed_param))
if duplicated_metric([kpi.metric for kpi in kpis]) or \
duplicated_metric([f.metric for f in features]):
diff --git a/config/module/app_sli_rtt.json b/config/module/app_sli_rtt.json
index 0744416..b7f78b7 100644
--- a/config/module/app_sli_rtt.json
+++ b/config/module/app_sli_rtt.json
@@ -34,19 +34,23 @@
},
{
"metric": "gala_gopher_block_latency_req_jitter",
- "description": "block层request时延抖动异常"
+ "description": "block层request时延抖动异常",
+ "atrend": "rise"
},
{
"metric": "gala_gopher_block_latency_req_last",
- "description": "block层request时延最近值异常"
+ "description": "block层request时延最近值异常",
+ "atrend": "rise"
},
{
"metric": "gala_gopher_block_latency_req_max",
- "description": "block层request时延最大值异常"
+ "description": "block层request时延最大值异常",
+ "atrend": "rise"
},
{
"metric": "gala_gopher_block_latency_req_sum",
- "description": "block层request时延总计值异常"
+ "description": "block层request时延总计值异常",
+ "atrend": "rise"
},
{
"metric": "gala_gopher_cpu_iowait_total_second",
@@ -54,11 +58,13 @@
},
{
"metric": "gala_gopher_cpu_user_total_second",
- "description": "用户态cpu占用时间不包括nice异常"
+ "description": "用户态cpu占用时间不包括nice异常",
+ "atrend": "rise"
},
{
"metric": "gala_gopher_cpu_total_used_per",
- "description": "CPU总利用率异常"
+ "description": "CPU总利用率异常",
+ "atrend": "rise"
},
{
"metric": "gala_gopher_cpu_backlog_drops",
@@ -86,7 +92,8 @@
},
{
"metric": "gala_gopher_disk_r_await",
- "description": "读响应时间异常"
+ "description": "读响应时间异常",
+ "atrend": "rise"
},
{
"metric": "gala_gopher_disk_rareq",
@@ -94,19 +101,23 @@
},
{
"metric": "gala_gopher_disk_rspeed",
- "description": "读速率IOPS异常"
+ "description": "读速率IOPS异常",
+ "atrend": "rise"
},
{
"metric": "gala_gopher_disk_rspeed_kB",
- "description": "读吞吐量异常"
+ "description": "读吞吐量异常",
+ "atrend": "rise"
},
{
"metric": "gala_gopher_disk_util",
- "description": "磁盘使用率异常"
+ "description": "磁盘使用率异常",
+ "atrend": "rise"
},
{
"metric": "gala_gopher_disk_w_await",
- "description": "写响应时间异常"
+ "description": "写响应时间异常",
+ "atrend": "rise"
},
{
"metric": "gala_gopher_disk_wareq",
@@ -114,19 +125,23 @@
},
{
"metric": "gala_gopher_disk_wspeed",
- "description": "写速率IOPS异常"
+ "description": "写速率IOPS异常",
+ "atrend": "rise"
},
{
"metric": "gala_gopher_disk_wspeed_kB",
- "description": "写吞吐量异常"
+ "description": "写吞吐量异常",
+ "atrend": "rise"
},
{
"metric": "gala_gopher_proc_read_bytes",
- "description": "进程实际从磁盘读取的字节数异常"
+ "description": "进程实际从磁盘读取的字节数异常",
+ "atrend": "rise"
},
{
"metric": "gala_gopher_proc_write_bytes",
- "description": "进程实际从磁盘写入的字节数异常"
+ "description": "进程实际从磁盘写入的字节数异常",
+ "atrend": "rise"
},
{
"metric": "gala_gopher_net_tcp_retrans_segs",
@@ -134,7 +149,12 @@
},
{
"metric": "gala_gopher_tcp_link_lost_out",
- "description": "TPC丢包数异常"
+ "description": "TCP丢包数异常"
+ },
+ {
+ "metric": "gala_gopher_tcp_link_srtt",
+ "description": "TCP超时",
+ "atrend": "rise"
},
{
"metric": "gala_gopher_tcp_link_notack_bytes",
--
2.37.0.windows.1