From ece4a0551bd81f64158ab465a865e31e97b63562 Mon Sep 17 00:00:00 2001 From: lizhenxing11 Date: Mon, 21 Nov 2022 14:54:20 +0800 Subject: [PATCH 2/2] Add Metrics Anomaly Trends Indicator update config file --- anteater/core/feature.py | 8 +++++ anteater/model/slope.py | 15 +++++++++ anteater/module/app_sli_detector.py | 20 ++++++++--- anteater/utils/data_load.py | 18 ++++++++-- config/module/app_sli_rtt.json | 52 ++++++++++++++++++++--------- 5 files changed, 90 insertions(+), 23 deletions(-) diff --git a/anteater/core/feature.py b/anteater/core/feature.py index 306d835..6db764d 100644 --- a/anteater/core/feature.py +++ b/anteater/core/feature.py @@ -12,6 +12,13 @@ # ******************************************************************************/ from dataclasses import dataclass +from enum import Enum + + +class AnomalyTrend(Enum): + DEFAULT = 0 + RISE = 1 + FALL = 2 @dataclass @@ -19,3 +26,4 @@ class Feature: metric: str description: str priority: int = 0 + atrend: AnomalyTrend = AnomalyTrend.DEFAULT diff --git a/anteater/model/slope.py b/anteater/model/slope.py index 422d6bc..08c4211 100644 --- a/anteater/model/slope.py +++ b/anteater/model/slope.py @@ -29,3 +29,18 @@ def smooth_slope(time_series, windows_length): val = conv_smooth(time_series.to_df(), box_pts=13) val = slope(val, win_len=13) return val[-windows_length:] + + +def trend(y, win_len=None): + """Gets the trend for the y""" + if not win_len: + win_len = len(y) // 2 + + if np.mean(y[:win_len]) < np.mean(y[-win_len:]): + return 1 + + elif np.mean(y[:win_len]) > np.mean(y[-win_len:]): + return -1 + + else: + return 0 diff --git a/anteater/module/app_sli_detector.py b/anteater/module/app_sli_detector.py index b69f73c..b63f5e2 100644 --- a/anteater/module/app_sli_detector.py +++ b/anteater/module/app_sli_detector.py @@ -20,7 +20,9 @@ import math from typing import List from anteater.core.anomaly import Anomaly +from anteater.core.feature import AnomalyTrend from anteater.model.algorithms.spectral_residual import SpectralResidual +from anteater.model.slope import trend from anteater.model.smoother import conv_smooth from anteater.model.three_sigma import three_sigma from anteater.module.detector import Detector @@ -134,10 +136,11 @@ class APPSliDetector(Detector): return anomalies - def detect_features(self, metrics, machine_id: str, top_n): + def detect_features(self, machine_id: str, top_n): + metric_atrend = {f.metric: f.atrend for f in self.features} start, end = dt.last(minutes=6) time_series_list = [] - for metric in metrics: + for metric in metric_atrend.keys(): time_series = self.data_loader.get_metric( start, end, metric, label_name='machine_id', label_value=machine_id) time_series_list.extend(time_series) @@ -156,8 +159,16 @@ class APPSliDetector(Detector): if all(x == values[0] for x in values): continue + if trend(time_series.values) < 0 and \ + metric_atrend[time_series.metric] == AnomalyTrend.RISE: + continue + + if trend(time_series.values) > 0 and \ + metric_atrend[time_series.metric] == AnomalyTrend.FALL: + continue + scores = sr_model.compute_score(values) - score = max(scores[-13:]) + score = max(scores[-25:]) if math.isnan(score) or math.isinf(score): continue @@ -170,9 +181,8 @@ class APPSliDetector(Detector): def report(self, anomaly: Anomaly, machine_id: str): """Reports a single anomaly at each time""" - feature_metrics = [f.metric for f in self.features] description = {f.metric: f.description for f in self.features} - cause_metrics = self.detect_features(feature_metrics, machine_id, top_n=60) + cause_metrics = self.detect_features(machine_id, top_n=60) cause_metrics = [ {'metric': cause[0].metric, 'label': cause[0].labels, diff --git a/anteater/utils/data_load.py b/anteater/utils/data_load.py index 108d5ed..f8ce277 100644 --- a/anteater/utils/data_load.py +++ b/anteater/utils/data_load.py @@ -17,7 +17,7 @@ from os import path, sep from json import JSONDecodeError from typing import List, Tuple -from anteater.core.feature import Feature +from anteater.core.feature import AnomalyTrend, Feature from anteater.core.kpi import KPI from anteater.utils.log import logger @@ -76,7 +76,21 @@ def load_kpi_feature(file_name) -> Tuple[List[KPI], List[Feature]]: raise e kpis = [KPI(**param) for param in params.get('KPI')] - features = [Feature(**param) for param in params.get('Features')] + + features = [] + for param in params.get('Features'): + parsed_param = {} + for key, value in param.items(): + if key == 'atrend': + if value.lower() == 'rise': + value = AnomalyTrend.RISE + elif value.lower() == 'fall': + value = AnomalyTrend.FALL + else: + value = AnomalyTrend.DEFAULT + parsed_param[key] = value + + features.append(Feature(**parsed_param)) if duplicated_metric([kpi.metric for kpi in kpis]) or \ duplicated_metric([f.metric for f in features]): diff --git a/config/module/app_sli_rtt.json b/config/module/app_sli_rtt.json index 0744416..b7f78b7 100644 --- a/config/module/app_sli_rtt.json +++ b/config/module/app_sli_rtt.json @@ -34,19 +34,23 @@ }, { "metric": "gala_gopher_block_latency_req_jitter", - "description": "block层request时延抖动异常" + "description": "block层request时延抖动异常", + "atrend": "rise" }, { "metric": "gala_gopher_block_latency_req_last", - "description": "block层request时延最近值异常" + "description": "block层request时延最近值异常", + "atrend": "rise" }, { "metric": "gala_gopher_block_latency_req_max", - "description": "block层request时延最大值异常" + "description": "block层request时延最大值异常", + "atrend": "rise" }, { "metric": "gala_gopher_block_latency_req_sum", - "description": "block层request时延总计值异常" + "description": "block层request时延总计值异常", + "atrend": "rise" }, { "metric": "gala_gopher_cpu_iowait_total_second", @@ -54,11 +58,13 @@ }, { "metric": "gala_gopher_cpu_user_total_second", - "description": "用户态cpu占用时间(不包括nice)异常" + "description": "用户态cpu占用时间(不包括nice)异常", + "atrend": "rise" }, { "metric": "gala_gopher_cpu_total_used_per", - "description": "CPU总利用率异常" + "description": "CPU总利用率异常", + "atrend": "rise" }, { "metric": "gala_gopher_cpu_backlog_drops", @@ -86,7 +92,8 @@ }, { "metric": "gala_gopher_disk_r_await", - "description": "读响应时间异常" + "description": "读响应时间异常", + "atrend": "rise" }, { "metric": "gala_gopher_disk_rareq", @@ -94,19 +101,23 @@ }, { "metric": "gala_gopher_disk_rspeed", - "description": "读速率(IOPS)异常" + "description": "读速率(IOPS)异常", + "atrend": "rise" }, { "metric": "gala_gopher_disk_rspeed_kB", - "description": "读吞吐量异常" + "description": "读吞吐量异常", + "atrend": "rise" }, { "metric": "gala_gopher_disk_util", - "description": "磁盘使用率异常" + "description": "磁盘使用率异常", + "atrend": "rise" }, { "metric": "gala_gopher_disk_w_await", - "description": "写响应时间异常" + "description": "写响应时间异常", + "atrend": "rise" }, { "metric": "gala_gopher_disk_wareq", @@ -114,19 +125,23 @@ }, { "metric": "gala_gopher_disk_wspeed", - "description": "写速率(IOPS)异常" + "description": "写速率(IOPS)异常", + "atrend": "rise" }, { "metric": "gala_gopher_disk_wspeed_kB", - "description": "写吞吐量异常" + "description": "写吞吐量异常", + "atrend": "rise" }, { "metric": "gala_gopher_proc_read_bytes", - "description": "进程实际从磁盘读取的字节数异常" + "description": "进程实际从磁盘读取的字节数异常", + "atrend": "rise" }, { "metric": "gala_gopher_proc_write_bytes", - "description": "进程实际从磁盘写入的字节数异常" + "description": "进程实际从磁盘写入的字节数异常", + "atrend": "rise" }, { "metric": "gala_gopher_net_tcp_retrans_segs", @@ -134,7 +149,12 @@ }, { "metric": "gala_gopher_tcp_link_lost_out", - "description": "TPC丢包数异常" + "description": "TCP丢包数异常" + }, + { + "metric": "gala_gopher_tcp_link_srtt", + "description": "TCP超时", + "atrend": "rise" }, { "metric": "gala_gopher_tcp_link_notack_bytes", -- 2.37.0.windows.1