285 lines
9.3 KiB
Diff
285 lines
9.3 KiB
Diff
From ece4a0551bd81f64158ab465a865e31e97b63562 Mon Sep 17 00:00:00 2001
|
||
From: lizhenxing11 <lizhenxing11@huawei.com>
|
||
Date: Mon, 21 Nov 2022 14:54:20 +0800
|
||
Subject: [PATCH 2/2] Add Metrics Anomaly Trends Indicator
|
||
|
||
update config file
|
||
---
|
||
anteater/core/feature.py | 8 +++++
|
||
anteater/model/slope.py | 15 +++++++++
|
||
anteater/module/app_sli_detector.py | 20 ++++++++---
|
||
anteater/utils/data_load.py | 18 ++++++++--
|
||
config/module/app_sli_rtt.json | 52 ++++++++++++++++++++---------
|
||
5 files changed, 90 insertions(+), 23 deletions(-)
|
||
|
||
diff --git a/anteater/core/feature.py b/anteater/core/feature.py
|
||
index 306d835..6db764d 100644
|
||
--- a/anteater/core/feature.py
|
||
+++ b/anteater/core/feature.py
|
||
@@ -12,6 +12,13 @@
|
||
# ******************************************************************************/
|
||
|
||
from dataclasses import dataclass
|
||
+from enum import Enum
|
||
+
|
||
+
|
||
+class AnomalyTrend(Enum):
|
||
+ DEFAULT = 0
|
||
+ RISE = 1
|
||
+ FALL = 2
|
||
|
||
|
||
@dataclass
|
||
@@ -19,3 +26,4 @@ class Feature:
|
||
metric: str
|
||
description: str
|
||
priority: int = 0
|
||
+ atrend: AnomalyTrend = AnomalyTrend.DEFAULT
|
||
diff --git a/anteater/model/slope.py b/anteater/model/slope.py
|
||
index 422d6bc..08c4211 100644
|
||
--- a/anteater/model/slope.py
|
||
+++ b/anteater/model/slope.py
|
||
@@ -29,3 +29,18 @@ def smooth_slope(time_series, windows_length):
|
||
val = conv_smooth(time_series.to_df(), box_pts=13)
|
||
val = slope(val, win_len=13)
|
||
return val[-windows_length:]
|
||
+
|
||
+
|
||
+def trend(y, win_len=None):
|
||
+ """Returns 1 if the mean of the last win_len points of y is above that of the first win_len points, -1 if below, else 0"""
|
||
+ if not win_len:
|
||
+ win_len = len(y) // 2
|
||
+
|
||
+ if np.mean(y[:win_len]) < np.mean(y[-win_len:]):
|
||
+ return 1
|
||
+
|
||
+ elif np.mean(y[:win_len]) > np.mean(y[-win_len:]):
|
||
+ return -1
|
||
+
|
||
+ else:
|
||
+ return 0
|
||
diff --git a/anteater/module/app_sli_detector.py b/anteater/module/app_sli_detector.py
|
||
index b69f73c..b63f5e2 100644
|
||
--- a/anteater/module/app_sli_detector.py
|
||
+++ b/anteater/module/app_sli_detector.py
|
||
@@ -20,7 +20,9 @@ import math
|
||
from typing import List
|
||
|
||
from anteater.core.anomaly import Anomaly
|
||
+from anteater.core.feature import AnomalyTrend
|
||
from anteater.model.algorithms.spectral_residual import SpectralResidual
|
||
+from anteater.model.slope import trend
|
||
from anteater.model.smoother import conv_smooth
|
||
from anteater.model.three_sigma import three_sigma
|
||
from anteater.module.detector import Detector
|
||
@@ -134,10 +136,11 @@ class APPSliDetector(Detector):
|
||
|
||
return anomalies
|
||
|
||
- def detect_features(self, metrics, machine_id: str, top_n):
|
||
+ def detect_features(self, machine_id: str, top_n):
|
||
+ metric_atrend = {f.metric: f.atrend for f in self.features}
|
||
start, end = dt.last(minutes=6)
|
||
time_series_list = []
|
||
- for metric in metrics:
|
||
+ for metric in metric_atrend.keys():
|
||
time_series = self.data_loader.get_metric(
|
||
start, end, metric, label_name='machine_id', label_value=machine_id)
|
||
time_series_list.extend(time_series)
|
||
@@ -156,8 +159,16 @@ class APPSliDetector(Detector):
|
||
if all(x == values[0] for x in values):
|
||
continue
|
||
|
||
+ if trend(time_series.values) < 0 and \
|
||
+ metric_atrend[time_series.metric] == AnomalyTrend.RISE:
|
||
+ continue
|
||
+
|
||
+ if trend(time_series.values) > 0 and \
|
||
+ metric_atrend[time_series.metric] == AnomalyTrend.FALL:
|
||
+ continue
|
||
+
|
||
scores = sr_model.compute_score(values)
|
||
- score = max(scores[-13:])
|
||
+ score = max(scores[-25:])
|
||
|
||
if math.isnan(score) or math.isinf(score):
|
||
continue
|
||
@@ -170,9 +181,8 @@ class APPSliDetector(Detector):
|
||
|
||
def report(self, anomaly: Anomaly, machine_id: str):
|
||
"""Reports a single anomaly at each time"""
|
||
- feature_metrics = [f.metric for f in self.features]
|
||
description = {f.metric: f.description for f in self.features}
|
||
- cause_metrics = self.detect_features(feature_metrics, machine_id, top_n=60)
|
||
+ cause_metrics = self.detect_features(machine_id, top_n=60)
|
||
cause_metrics = [
|
||
{'metric': cause[0].metric,
|
||
'label': cause[0].labels,
|
||
diff --git a/anteater/utils/data_load.py b/anteater/utils/data_load.py
|
||
index 108d5ed..f8ce277 100644
|
||
--- a/anteater/utils/data_load.py
|
||
+++ b/anteater/utils/data_load.py
|
||
@@ -17,7 +17,7 @@ from os import path, sep
|
||
from json import JSONDecodeError
|
||
from typing import List, Tuple
|
||
|
||
-from anteater.core.feature import Feature
|
||
+from anteater.core.feature import AnomalyTrend, Feature
|
||
from anteater.core.kpi import KPI
|
||
from anteater.utils.log import logger
|
||
|
||
@@ -76,7 +76,21 @@ def load_kpi_feature(file_name) -> Tuple[List[KPI], List[Feature]]:
|
||
raise e
|
||
|
||
kpis = [KPI(**param) for param in params.get('KPI')]
|
||
- features = [Feature(**param) for param in params.get('Features')]
|
||
+
|
||
+ features = []
|
||
+ for param in params.get('Features'):
|
||
+ parsed_param = {}
|
||
+ for key, value in param.items():
|
||
+ if key == 'atrend':
|
||
+ if value.lower() == 'rise':
|
||
+ value = AnomalyTrend.RISE
|
||
+ elif value.lower() == 'fall':
|
||
+ value = AnomalyTrend.FALL
|
||
+ else:
|
||
+ value = AnomalyTrend.DEFAULT
|
||
+ parsed_param[key] = value
|
||
+
|
||
+ features.append(Feature(**parsed_param))
|
||
|
||
if duplicated_metric([kpi.metric for kpi in kpis]) or \
|
||
duplicated_metric([f.metric for f in features]):
|
||
diff --git a/config/module/app_sli_rtt.json b/config/module/app_sli_rtt.json
|
||
index 0744416..b7f78b7 100644
|
||
--- a/config/module/app_sli_rtt.json
|
||
+++ b/config/module/app_sli_rtt.json
|
||
@@ -34,19 +34,23 @@
|
||
},
|
||
{
|
||
"metric": "gala_gopher_block_latency_req_jitter",
|
||
- "description": "block层request时延抖动异常"
|
||
+ "description": "block层request时延抖动异常",
|
||
+ "atrend": "rise"
|
||
},
|
||
{
|
||
"metric": "gala_gopher_block_latency_req_last",
|
||
- "description": "block层request时延最近值异常"
|
||
+ "description": "block层request时延最近值异常",
|
||
+ "atrend": "rise"
|
||
},
|
||
{
|
||
"metric": "gala_gopher_block_latency_req_max",
|
||
- "description": "block层request时延最大值异常"
|
||
+ "description": "block层request时延最大值异常",
|
||
+ "atrend": "rise"
|
||
},
|
||
{
|
||
"metric": "gala_gopher_block_latency_req_sum",
|
||
- "description": "block层request时延总计值异常"
|
||
+ "description": "block层request时延总计值异常",
|
||
+ "atrend": "rise"
|
||
},
|
||
{
|
||
"metric": "gala_gopher_cpu_iowait_total_second",
|
||
@@ -54,11 +58,13 @@
|
||
},
|
||
{
|
||
"metric": "gala_gopher_cpu_user_total_second",
|
||
- "description": "用户态cpu占用时间(不包括nice)异常"
|
||
+ "description": "用户态cpu占用时间(不包括nice)异常",
|
||
+ "atrend": "rise"
|
||
},
|
||
{
|
||
"metric": "gala_gopher_cpu_total_used_per",
|
||
- "description": "CPU总利用率异常"
|
||
+ "description": "CPU总利用率异常",
|
||
+ "atrend": "rise"
|
||
},
|
||
{
|
||
"metric": "gala_gopher_cpu_backlog_drops",
|
||
@@ -86,7 +92,8 @@
|
||
},
|
||
{
|
||
"metric": "gala_gopher_disk_r_await",
|
||
- "description": "读响应时间异常"
|
||
+ "description": "读响应时间异常",
|
||
+ "atrend": "rise"
|
||
},
|
||
{
|
||
"metric": "gala_gopher_disk_rareq",
|
||
@@ -94,19 +101,23 @@
|
||
},
|
||
{
|
||
"metric": "gala_gopher_disk_rspeed",
|
||
- "description": "读速率(IOPS)异常"
|
||
+ "description": "读速率(IOPS)异常",
|
||
+ "atrend": "rise"
|
||
},
|
||
{
|
||
"metric": "gala_gopher_disk_rspeed_kB",
|
||
- "description": "读吞吐量异常"
|
||
+ "description": "读吞吐量异常",
|
||
+ "atrend": "rise"
|
||
},
|
||
{
|
||
"metric": "gala_gopher_disk_util",
|
||
- "description": "磁盘使用率异常"
|
||
+ "description": "磁盘使用率异常",
|
||
+ "atrend": "rise"
|
||
},
|
||
{
|
||
"metric": "gala_gopher_disk_w_await",
|
||
- "description": "写响应时间异常"
|
||
+ "description": "写响应时间异常",
|
||
+ "atrend": "rise"
|
||
},
|
||
{
|
||
"metric": "gala_gopher_disk_wareq",
|
||
@@ -114,19 +125,23 @@
|
||
},
|
||
{
|
||
"metric": "gala_gopher_disk_wspeed",
|
||
- "description": "写速率(IOPS)异常"
|
||
+ "description": "写速率(IOPS)异常",
|
||
+ "atrend": "rise"
|
||
},
|
||
{
|
||
"metric": "gala_gopher_disk_wspeed_kB",
|
||
- "description": "写吞吐量异常"
|
||
+ "description": "写吞吐量异常",
|
||
+ "atrend": "rise"
|
||
},
|
||
{
|
||
"metric": "gala_gopher_proc_read_bytes",
|
||
- "description": "进程实际从磁盘读取的字节数异常"
|
||
+ "description": "进程实际从磁盘读取的字节数异常",
|
||
+ "atrend": "rise"
|
||
},
|
||
{
|
||
"metric": "gala_gopher_proc_write_bytes",
|
||
- "description": "进程实际从磁盘写入的字节数异常"
|
||
+ "description": "进程实际从磁盘写入的字节数异常",
|
||
+ "atrend": "rise"
|
||
},
|
||
{
|
||
"metric": "gala_gopher_net_tcp_retrans_segs",
|
||
@@ -134,7 +149,12 @@
|
||
},
|
||
{
|
||
"metric": "gala_gopher_tcp_link_lost_out",
|
||
- "description": "TPC丢包数异常"
|
||
+ "description": "TCP丢包数异常"
|
||
+ },
|
||
+ {
|
||
+ "metric": "gala_gopher_tcp_link_srtt",
|
||
+ "description": "TCP超时",
|
||
+ "atrend": "rise"
|
||
},
|
||
{
|
||
"metric": "gala_gopher_tcp_link_notack_bytes",
|
||
--
|
||
2.37.0.windows.1
|
||
|