gala-anteater/update_sys_io_latency_detector_model.patch
From 0b2243b61fe5083784e634db0d97c2888330eb0a Mon Sep 17 00:00:00 2001
From: lizhenxing11 <lizhenxing11@huawei.com>
Date: Mon, 14 Nov 2022 19:43:17 +0800
Subject: [PATCH 1/2] Update sys io latency detector model
Add tcp detector and update model
Fix code check issue
---
anteater/core/anomaly.py | 8 ++
anteater/core/kpi.py | 2 +-
anteater/model/three_sigma.py | 37 ++++++
anteater/module/app_sli_detector.py | 93 ++++++++-------
anteater/module/detector.py | 54 +++++++--
anteater/module/proc_io_latency_detector.py | 49 +++++---
anteater/module/sys_io_latency_detector.py | 109 +++++++++---------
anteater/module/sys_tcp_establish_detector.py | 108 +++++++++--------
.../module/sys_tcp_transmission_detector.py | 76 ++++++------
anteater/provider/base.py | 14 ++-
anteater/source/anomaly_report.py | 1 +
anteater/template/template.py | 2 +-
anteater/utils/common.py | 12 ++
config/gala-anteater.yaml | 6 +-
config/module/app_sli_rtt.json | 15 ++-
config/module/proc_io_latency.json | 14 ++-
config/module/sys_io_latency.json | 9 +-
config/module/sys_tcp_establish.json | 5 +-
config/module/sys_tcp_transmission.json | 33 +++++-
19 files changed, 416 insertions(+), 231 deletions(-)
create mode 100644 anteater/model/three_sigma.py
diff --git a/anteater/core/anomaly.py b/anteater/core/anomaly.py
index cce7767..b95eeab 100644
--- a/anteater/core/anomaly.py
+++ b/anteater/core/anomaly.py
@@ -13,6 +13,8 @@
from dataclasses import dataclass
+from anteater.utils.time_series import TimeSeries
+
@dataclass
class Anomaly:
@@ -21,3 +23,9 @@ class Anomaly:
score: float = None
entity_name: str = None
description: str = None
+
+
+@dataclass
+class CauseMetric:
+ ts: TimeSeries
+ score: float
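
A minimal sketch of how the new dataclass can be constructed (the TimeSeries
constructor signature -- metric, labels, time_stamps, values -- is taken from
its use in anteater/provider/base.py below; the concrete metric name and
label values are illustrative):

    from anteater.core.anomaly import CauseMetric
    from anteater.utils.time_series import TimeSeries

    ts = TimeSeries('gala_gopher_disk_r_await', {'disk_name': 'sda'},
                    [1668416597], [3.2])
    cause = CauseMetric(ts=ts, score=max(ts.values))
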
diff --git a/anteater/core/kpi.py b/anteater/core/kpi.py
index db4c046..620ffdf 100644
--- a/anteater/core/kpi.py
+++ b/anteater/core/kpi.py
@@ -21,5 +21,5 @@ class KPI:
entity_name: str = None
enable: bool = False
description: str = ""
- parameter: dict = field(default=dict)
+ params: dict = field(default_factory=dict)
diff --git a/anteater/model/three_sigma.py b/anteater/model/three_sigma.py
new file mode 100644
index 0000000..08d05ba
--- /dev/null
+++ b/anteater/model/three_sigma.py
@@ -0,0 +1,37 @@
+#!/usr/bin/python3
+# ******************************************************************************
+# Copyright (c) 2022 Huawei Technologies Co., Ltd.
+# gala-anteater is licensed under Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
+# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
+# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+# See the Mulan PSL v2 for more details.
+# ******************************************************************************/
+
+
+import numpy as np
+
+
+def three_sigma(values, obs_size, method="abs"):
+ """The '3-sigma rule' outlier detect function"""
+ if obs_size <= 0:
+ raise ValueError("The obs_size should great than zero!")
+ if len(values) <= obs_size:
+ raise ValueError("The obs_size should be great than values' length")
+ train_val = values[:-obs_size]
+ obs_val = values[-obs_size:]
+ mean = np.mean(train_val)
+ std = np.std(train_val)
+ if method == "abs":
+ outlier = [val for val in obs_val if abs(val - mean) > 3 * std]
+ elif method == 'min':
+ outlier = [val for val in obs_val if val < mean - 3 * std]
+ elif method == 'max':
+ outlier = [val for val in obs_val if val > mean + 3 * std]
+ else:
+ raise ValueError(f'Unknown method {method}')
+
+ return outlier, mean, std
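
A usage sketch of the new helper (assumes this patch is applied so that
anteater.model.three_sigma is importable; the data is synthetic):

    import numpy as np
    from anteater.model.three_sigma import three_sigma

    rng = np.random.default_rng(0)
    values = list(rng.normal(loc=100, scale=5, size=120))
    values[-10:] = [200] * 10  # inject a spike into the observation window

    # Train on the first 95 points, then flag values among the last 25 points
    # that exceed mean + 3 * std of the training prefix.
    outlier, mean, std = three_sigma(values, obs_size=25, method='max')
    print(len(outlier), round(mean, 1), round(std, 1))  # -> 10 outliers

The detectors below pair this with conv_smooth() and the divide() utility:
the outlier ratio len(outlier) / obs_size is compared against the
outlier_ratio_th parameter from each module's JSON config.
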
diff --git a/anteater/module/app_sli_detector.py b/anteater/module/app_sli_detector.py
index e38d53e..b69f73c 100644
--- a/anteater/module/app_sli_detector.py
+++ b/anteater/module/app_sli_detector.py
@@ -19,17 +19,15 @@ Description: The anomaly detector implementation on APP Sli
import math
from typing import List
-import numpy as np
-
from anteater.core.anomaly import Anomaly
from anteater.model.algorithms.spectral_residual import SpectralResidual
-from anteater.model.slope import smooth_slope
from anteater.model.smoother import conv_smooth
+from anteater.model.three_sigma import three_sigma
from anteater.module.detector import Detector
from anteater.source.anomaly_report import AnomalyReport
from anteater.source.metric_loader import MetricLoader
from anteater.template.app_anomaly_template import AppAnomalyTemplate
-from anteater.utils.data_load import load_kpi_feature
+from anteater.utils.common import divide
from anteater.utils.datetime import DateTimeManager as dt
from anteater.utils.log import logger
@@ -40,48 +38,52 @@ class APPSliDetector(Detector):
"""
def __init__(self, data_loader: MetricLoader, anomaly_report: AnomalyReport):
- super().__init__(data_loader, anomaly_report)
- self.kpis, self.features = load_kpi_feature('app_sli_rtt.json')
+ file_name = 'app_sli_rtt.json'
+ super().__init__(data_loader, anomaly_report, file_name)
def execute_detect(self, machine_id: str):
for kpi in self.kpis:
- parameter = kpi.parameter
if kpi.kpi_type == 'rtt':
- anomalies = self.detect_rtt(kpi, machine_id, parameter)
+ anomalies = self.detect_rtt(kpi, machine_id)
else:
- anomalies = self.detect_tps(kpi, machine_id, parameter)
+ anomalies = self.detect_tps(kpi, machine_id)
for anomaly in anomalies:
- self.report(anomaly, kpi.entity_name, machine_id)
+ self.report(anomaly, machine_id)
- def detect_rtt(self, kpi, machine_id: str, parameter: dict) -> List[Anomaly]:
+ def detect_rtt(self, kpi, machine_id: str) -> List[Anomaly]:
"""Detects rtt by rule-based model"""
- start, end = dt.last(minutes=10)
- time_series_list = self.data_loader.get_metric(
+ look_back = kpi.params.get('look_back', None)
+ box_pts = kpi.params.get('box_pts', None)
+ obs_size = kpi.params.get('obs_size', None)
+ outlier_ratio_th = kpi.params.get('outlier_ratio_th', None)
+
+ start, end = dt.last(minutes=look_back)
+ ts_list = self.data_loader.get_metric(
start, end, kpi.metric, label_name='machine_id', label_value=machine_id)
- if not time_series_list:
+ if not ts_list:
logger.warning(f'Key metric {kpi.metric} is null on the target machine {machine_id}!')
return []
point_count = self.data_loader.expected_point_length(start, end)
anomalies = []
- threshold = parameter['threshold']
- min_nsec = parameter['min_nsec']
- for time_series in time_series_list:
- if len(time_series.values) < point_count * 0.9 or len(time_series.values) > point_count * 1.5:
+ for _ts in ts_list:
+ if len(_ts.values) < point_count * 0.9 or len(_ts.values) > point_count * 1.5:
continue
- score = max(smooth_slope(time_series, windows_length=13))
- if math.isnan(score) or math.isinf(score):
- continue
+ smoothed_val = conv_smooth(_ts.values, box_pts=box_pts)
+ outlier, mean, std = three_sigma(smoothed_val, obs_size=obs_size, method='min')
+ ratio = divide(len(outlier), obs_size)
- avg_nsec = np.mean(time_series.values[-13:])
- if score >= threshold and avg_nsec >= min_nsec:
+ if outlier and ratio >= outlier_ratio_th:
+ logger.info(f'Ratio: {ratio}, Outlier Ratio TH: {outlier_ratio_th}, '
+ f'Mean: {mean}, Std: {std}')
anomalies.append(
- Anomaly(metric=time_series.metric,
- labels=time_series.labels,
- score=score,
+ Anomaly(metric=_ts.metric,
+ labels=_ts.labels,
+ score=ratio,
+ entity_name=kpi.entity_name,
description=kpi.description))
anomalies = sorted(anomalies, key=lambda x: x.score, reverse=True)
@@ -91,9 +93,14 @@ class APPSliDetector(Detector):
return anomalies
- def detect_tps(self, kpi, machine_id: str, parameter: dict) -> List[Anomaly]:
+ def detect_tps(self, kpi, machine_id: str) -> List[Anomaly]:
"""Detects tps by rule based model"""
- start, end = dt.last(minutes=10)
+ look_back = kpi.params.get('look_back', None)
+ box_pts = kpi.params.get('box_pts', None)
+ obs_size = kpi.params.get('obs_size', None)
+ outlier_ratio_th = kpi.params.get('outlier_ratio_th', None)
+
+ start, end = dt.last(minutes=look_back)
time_series_list = self.data_loader.get_metric(
start, end, kpi.metric, label_name='machine_id', label_value=machine_id)
@@ -103,21 +110,21 @@ class APPSliDetector(Detector):
point_count = self.data_loader.expected_point_length(start, end)
anomalies = []
- threshold = parameter['threshold']
- for time_series in time_series_list:
- if len(time_series.values) < point_count * 0.9 or len(time_series.values) > point_count * 1.5:
+ for _ts in time_series_list:
+ if len(_ts.values) < point_count * 0.9 or len(_ts.values) > point_count * 1.5:
continue
- pre_values = time_series.values[:-25]
- cur_values = time_series.values[-25:]
- mean = np.mean(pre_values)
- std = np.std(pre_values)
- outlier = [val for val in cur_values if val < mean - 3 * std]
+ smoothed_val = conv_smooth(_ts.values, box_pts=box_pts)
+ outlier, mean, std = three_sigma(smoothed_val, obs_size=obs_size, method='min')
+ ratio = divide(len(outlier), obs_size)
- if outlier and len(outlier) >= len(cur_values) * 0.3:
+ if outlier and ratio >= outlier_ratio_th:
+ logger.info(f'Ratio: {ratio}, Outlier Ratio TH: {outlier_ratio_th}, '
+ f'Mean: {mean}, Std: {std}')
anomalies.append(
- Anomaly(metric=time_series.metric,
- labels=time_series.labels,
- score=1,
+ Anomaly(metric=_ts.metric,
+ labels=_ts.labels,
+ score=ratio,
+ entity_name=kpi.entity_name,
description=kpi.description))
anomalies = sorted(anomalies, key=lambda x: x.score, reverse=True)
@@ -161,7 +168,8 @@ class APPSliDetector(Detector):
return result[0: top_n]
- def report(self, anomaly: Anomaly, entity_name: str, machine_id: str):
+ def report(self, anomaly: Anomaly, machine_id: str):
+ """Reports a single anomaly at each time"""
feature_metrics = [f.metric for f in self.features]
description = {f.metric: f.description for f in self.features}
cause_metrics = self.detect_features(feature_metrics, machine_id, top_n=60)
@@ -172,6 +180,5 @@ class APPSliDetector(Detector):
'description': description.get(cause[0].metric, '')}
for cause in cause_metrics]
timestamp = dt.utc_now()
- template = AppAnomalyTemplate(timestamp, machine_id, anomaly.metric, entity_name)
- template.labels = anomaly.labels
+ template = AppAnomalyTemplate(timestamp, machine_id, anomaly.metric, anomaly.entity_name)
self.anomaly_report.sent_anomaly(anomaly, cause_metrics, template)
diff --git a/anteater/module/detector.py b/anteater/module/detector.py
index 51dabbd..bfe516e 100644
--- a/anteater/module/detector.py
+++ b/anteater/module/detector.py
@@ -11,37 +11,69 @@
# See the Mulan PSL v2 for more details.
# ******************************************************************************/
-from abc import abstractmethod
+from abc import abstractmethod, ABC
from anteater.core.anomaly import Anomaly
from anteater.source.anomaly_report import AnomalyReport
from anteater.source.metric_loader import MetricLoader
+from anteater.utils.common import same_intersection_key_value
+from anteater.utils.data_load import load_kpi_feature
from anteater.utils.datetime import DateTimeManager as dt
from anteater.utils.log import logger
from anteater.utils.timer import timer
-class Detector:
- """The base detector class"""
- def __init__(self, data_loader: MetricLoader, anomaly_report: AnomalyReport):
+class Detector(ABC):
+ """The anomaly detector base class"""
+ def __init__(
+ self,
+ data_loader: MetricLoader,
+ anomaly_report: AnomalyReport,
+ file_name: str):
+ """The detector base class initializer"""
self.data_loader = data_loader
self.anomaly_report = anomaly_report
+ self.kpis, self.features = load_kpi_feature(file_name)
+
+ @staticmethod
+ def filter_ts(ts_list, filters):
+ result = []
+ for _ts in ts_list:
+ if same_intersection_key_value(_ts.labels, filters):
+ result.append(_ts)
+
+ return result
@abstractmethod
def execute_detect(self, machine_id):
+ """Executes anomaly detection on specified machine id"""
+ pass
+
+ @abstractmethod
+ def report(self, anomaly: Anomaly, machine_id: str):
+ """Reports a single anomaly at each time"""
pass
@timer
def detect(self):
+ """The main function of detector"""
+ if not self.kpis:
+ logger.debug(f"Null kpis in detector: {self.__class__.__name__}!")
+ return
+
logger.info(f"Run detector: {self.__class__.__name__}!")
- start, end = dt.last(minutes=1)
- metrics_kpi = [k.metric for k in self.kpis]
- metrics_feat = [f.metric for f in self.features]
- metrics = metrics_kpi + metrics_feat
- machine_ids = self.data_loader.get_unique_machines(start, end, metrics)
+ self.pre_process()
+ machine_ids = self.get_unique_machine_id()
for _id in machine_ids:
self.execute_detect(_id)
- @abstractmethod
- def report(self, anomaly: Anomaly, entity_name: str, machine_id: str):
+ def pre_process(self):
+ """Executes pre-process for generating necessary parameters"""
pass
+
+ def get_unique_machine_id(self):
+ """Gets unique machine ids during past minutes"""
+ start, end = dt.last(minutes=1)
+ metrics = [_kpi.metric for _kpi in self.kpis]
+ machine_ids = self.data_loader.get_unique_machines(start, end, metrics)
+ return machine_ids
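
With the new base class, a concrete detector only supplies its config file
name and the two abstract hooks. A hypothetical minimal subclass
('demo.json' is an illustrative config name, not one shipped by this patch):

    from anteater.core.anomaly import Anomaly
    from anteater.module.detector import Detector

    class DemoDetector(Detector):
        def __init__(self, data_loader, anomaly_report):
            super().__init__(data_loader, anomaly_report, 'demo.json')

        def execute_detect(self, machine_id):
            """Run the per-machine detection logic here."""

        def report(self, anomaly: Anomaly, machine_id: str):
            """Send the anomaly through self.anomaly_report here."""

pre_process() is an optional hook: the base detect() always calls it once
per round, and SysTcpEstablishDetector below overrides it to compute the
mean/std baseline once per detection round instead of once per machine.
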
diff --git a/anteater/module/proc_io_latency_detector.py b/anteater/module/proc_io_latency_detector.py
index ee1d7c6..3ea2c51 100644
--- a/anteater/module/proc_io_latency_detector.py
+++ b/anteater/module/proc_io_latency_detector.py
@@ -16,11 +16,13 @@ import math
from anteater.core.anomaly import Anomaly
from anteater.model.algorithms.spectral_residual import SpectralResidual
from anteater.model.slope import smooth_slope
+from anteater.model.smoother import conv_smooth
+from anteater.model.three_sigma import three_sigma
from anteater.module.detector import Detector
from anteater.source.anomaly_report import AnomalyReport
from anteater.source.metric_loader import MetricLoader
from anteater.template.sys_anomaly_template import SysAnomalyTemplate
-from anteater.utils.data_load import load_kpi_feature
+from anteater.utils.common import divide
from anteater.utils.datetime import DateTimeManager as dt
from anteater.utils.log import logger
@@ -31,40 +33,50 @@ class ProcIOLatencyDetector(Detector):
"""
def __init__(self, data_loader: MetricLoader, anomaly_report: AnomalyReport):
- super().__init__(data_loader, anomaly_report)
- self.kpis, self.features = load_kpi_feature('proc_io_latency.json')
+ file_name = 'proc_io_latency.json'
+ super().__init__(data_loader, anomaly_report, file_name)
def execute_detect(self, machine_id):
for kpi in self.kpis:
- parameter = kpi.parameter
- start, end = dt.last(minutes=10)
- time_series_list = self.data_loader.get_metric(
+ look_back = kpi.params.get('look_back', None)
+ box_pts = kpi.params.get('box_pts', None)
+ obs_size = kpi.params.get('obs_size', None)
+ outlier_ratio_th = kpi.params.get('outlier_ratio_th', None)
+
+ start, end = dt.last(minutes=look_back)
+ ts_list = self.data_loader.get_metric(
start, end, kpi.metric, label_name='machine_id', label_value=machine_id)
- if not time_series_list:
+ if not ts_list:
logger.warning(f'Key metric {kpi.metric} is null on the target machine {machine_id}!')
return
point_count = self.data_loader.expected_point_length(start, end)
anomalies = []
- threshold = parameter['threshold']
- for time_series in time_series_list:
- if len(time_series.values) < point_count * 0.9 or len(time_series.values) > point_count * 1.5:
+ for _ts in ts_list:
+ if len(_ts.values) < point_count * 0.9 or len(_ts.values) > point_count * 1.5:
continue
- if sum(time_series.values) == 0:
+ if sum(_ts.values) == 0:
continue
- score = max(smooth_slope(time_series, windows_length=13))
+ score = max(smooth_slope(_ts, windows_length=13))
if math.isnan(score) or math.isinf(score):
continue
- if score > threshold:
+ smoothed_val = conv_smooth(_ts.values, box_pts=box_pts)
+ outlier, mean, std = three_sigma(smoothed_val, obs_size=obs_size, method='min')
+ ratio = divide(len(outlier), obs_size)
+
+ if outlier and ratio >= outlier_ratio_th:
+ logger.info(f'Ratio: {ratio}, Outlier Ratio TH: {outlier_ratio_th}, '
+ f'Mean: {mean}, Std: {std}')
anomalies.append(
- Anomaly(metric=time_series.metric,
- labels=time_series.labels,
+ Anomaly(metric=_ts.metric,
+ labels=_ts.labels,
score=score,
+ entity_name=kpi.entity_name,
description=kpi.description))
anomalies = sorted(anomalies, key=lambda x: x.score, reverse=True)
@@ -72,7 +84,7 @@ class ProcIOLatencyDetector(Detector):
if anomalies:
logger.info('Sys io latency anomalies was detected.')
for anomaly in anomalies:
- self.report(anomaly, kpi.entity_name, machine_id)
+ self.report(anomaly, machine_id)
def detect_features(self, machine_id: str, top_n=3):
priorities = {f.metric: f.priority for f in self.features}
@@ -110,7 +122,7 @@ class ProcIOLatencyDetector(Detector):
return result
- def report(self, anomaly: Anomaly, entity_name: str, machine_id: str):
+ def report(self, anomaly: Anomaly, machine_id: str):
description = {f.metric: f.description for f in self.features}
cause_metrics = self.detect_features(machine_id, top_n=3)
cause_metrics = [
@@ -123,6 +135,5 @@ class ProcIOLatencyDetector(Detector):
cause[0].labels.get('comm', ''))}
for cause in cause_metrics]
timestamp = dt.utc_now()
- template = SysAnomalyTemplate(timestamp, machine_id, anomaly.metric, entity_name)
- template.labels = anomaly.labels
+ template = SysAnomalyTemplate(timestamp, machine_id, anomaly.metric, anomaly.entity_name)
self.anomaly_report.sent_anomaly(anomaly, cause_metrics, template)
diff --git a/anteater/module/sys_io_latency_detector.py b/anteater/module/sys_io_latency_detector.py
index f459a57..c29ce72 100644
--- a/anteater/module/sys_io_latency_detector.py
+++ b/anteater/module/sys_io_latency_detector.py
@@ -15,12 +15,13 @@ import math
from anteater.core.anomaly import Anomaly
from anteater.model.algorithms.spectral_residual import SpectralResidual
-from anteater.model.slope import smooth_slope
+from anteater.model.smoother import conv_smooth
+from anteater.model.three_sigma import three_sigma
from anteater.module.detector import Detector
from anteater.source.anomaly_report import AnomalyReport
from anteater.source.metric_loader import MetricLoader
from anteater.template.sys_anomaly_template import SysAnomalyTemplate
-from anteater.utils.data_load import load_kpi_feature
+from anteater.utils.common import divide, same_intersection_key_value
from anteater.utils.datetime import DateTimeManager as dt
from anteater.utils.log import logger
@@ -31,100 +32,104 @@ class SysIOLatencyDetector(Detector):
"""
def __init__(self, data_loader: MetricLoader, anomaly_report: AnomalyReport):
- super().__init__(data_loader, anomaly_report)
- self.kpis, self.features = load_kpi_feature('sys_io_latency.json')
+ """The system i/o latency detector initializer"""
+ file_name = 'sys_io_latency.json'
+ super().__init__(data_loader, anomaly_report, file_name)
def execute_detect(self, machine_id: str):
+ """Executes the detector based on machine id"""
kpi = self.kpis[0]
- parameter = kpi.parameter
+ look_back = kpi.params.get('look_back', None)
+ box_pts = kpi.params.get('box_pts', None)
+ obs_size = kpi.params.get('obs_size', None)
+ outlier_ratio_th = kpi.params.get('outlier_ratio_th', None)
- start, end = dt.last(minutes=10)
- time_series_list = self.data_loader.get_metric(
- start, end, kpi.metric, label_name='machine_id', label_value=machine_id)
+ start, end = dt.last(minutes=look_back)
+ ts_list = self.data_loader.\
+ get_metric(start, end, kpi.metric, label_name='machine_id', label_value=machine_id)
- if not time_series_list:
+ if not ts_list:
logger.warning(f'Key metric {kpi.metric} is null on the target machine {machine_id}!')
return
point_count = self.data_loader.expected_point_length(start, end)
anomalies = []
- threshold = parameter['threshold']
- for time_series in time_series_list:
- if len(time_series.values) < point_count * 0.9 or len(time_series.values) > point_count * 1.5:
+ for _ts in ts_list:
+ if len(_ts.values) < point_count * 0.9 or len(_ts.values) > point_count * 1.5:
continue
- if sum(time_series.values) == 0:
+ if sum(_ts.values) == 0:
continue
- score = max(smooth_slope(time_series, windows_length=13))
+ smoothed_val = conv_smooth(_ts.values, box_pts=box_pts)
+ outlier, mean, std = three_sigma(smoothed_val, obs_size=obs_size, method='max')
+ ratio = divide(len(outlier), obs_size)
- if math.isnan(score) or math.isinf(score):
- continue
-
- if score > threshold:
+ if outlier and ratio >= outlier_ratio_th:
+ logger.info(f'Ratio: {ratio}, Outlier Ratio TH: {outlier_ratio_th}, '
+ f'Mean: {mean}, Std: {std}')
anomalies.append(
- Anomaly(metric=time_series.metric,
- labels=time_series.labels,
- score=score,
+ Anomaly(metric=_ts.metric,
+ labels=_ts.labels,
+ score=ratio,
+ entity_name=kpi.entity_name,
description=kpi.description))
- anomalies = sorted(anomalies, key=lambda x: x.score, reverse=True)
-
if anomalies:
logger.info('Sys io latency anomalies was detected.')
+ anomalies = sorted(anomalies, key=lambda x: x.score, reverse=True)
for anomaly in anomalies:
- self.report(anomaly, kpi.entity_name, machine_id)
+ self.report(anomaly, machine_id)
- def detect_features(self, machine_id: str, top_n: int):
+ def find_cause_metrics(self, machine_id: str, filters: dict, top_n: int):
+ """Detects the abnormal features and reports the caused metrics"""
priorities = {f.metric: f.priority for f in self.features}
start, end = dt.last(minutes=6)
- time_series_list = []
+ ts_list = []
for metric in priorities.keys():
- time_series = self.data_loader.get_metric(
+ _ts_list = self.data_loader.get_metric(
start, end, metric, label_name='machine_id', label_value=machine_id)
- time_series_list.extend(time_series)
+ for _ts in _ts_list:
+ if same_intersection_key_value(_ts.labels, filters):
+ ts_list.append(_ts)
point_count = self.data_loader.expected_point_length(start, end)
- sr_model = SpectralResidual(12, 24, 50)
-
+ model = SpectralResidual(12, 24, 50)
result = []
- for time_series in time_series_list:
- if len(time_series.values) < point_count * 0.9 or \
- len(time_series.values) > point_count * 1.5:
+ for _ts in ts_list:
+ if len(_ts.values) < point_count * 0.9 or \
+ len(_ts.values) > point_count * 1.5:
continue
- values = time_series.values
-
- if all(x == values[0] for x in values):
+ if all(x == _ts.values[0] for x in _ts.values):
continue
- scores = sr_model.compute_score(values)
- score = max(scores[-13:])
-
+ score = max(model.compute_score(_ts.values)[-13:])
if math.isnan(score) or math.isinf(score):
continue
- result.append((time_series, score))
+ result.append((_ts, score))
result = sorted(result, key=lambda x: x[1], reverse=True)[0: top_n]
result = sorted(result, key=lambda x: priorities[x[0].metric])
return result
- def report(self, anomaly: Anomaly, entity_name: str, machine_id: str):
- feature_metrics = [f.metric for f in self.features]
+ def report(self, anomaly: Anomaly, machine_id: str):
+ """Reports the anomaly with it's caused metrics"""
description = {f.metric: f.description for f in self.features}
- cause_metrics = self.detect_features(machine_id, top_n=3)
+ cause_metrics = self.find_cause_metrics(machine_id, anomaly.labels, top_n=3)
cause_metrics = [
- {'metric': cause[0].metric,
- 'label': cause[0].labels,
- 'score': cause[1],
- 'description': description.get(cause[0].metric, '').format(
- cause[0].labels.get('disk_name', ''),
- cause[0].labels.get('tgid', ''),
- cause[0].labels.get('comm', ''))}
+ {
+ 'metric': cause[0].metric,
+ 'label': cause[0].labels,
+ 'score': cause[1],
+ 'description': description.get(cause[0].metric, '').format(
+ cause[0].labels.get('disk_name', ''),
+ cause[0].labels.get('tgid', ''),
+ cause[0].labels.get('comm', ''))
+ }
for cause in cause_metrics]
timestamp = dt.utc_now()
- template = SysAnomalyTemplate(timestamp, machine_id, anomaly.metric, entity_name)
- template.labels = anomaly.labels
+ template = SysAnomalyTemplate(timestamp, machine_id, anomaly.metric, anomaly.entity_name)
self.anomaly_report.sent_anomaly(anomaly, cause_metrics, template)
diff --git a/anteater/module/sys_tcp_establish_detector.py b/anteater/module/sys_tcp_establish_detector.py
index 4b49b25..3ca61c5 100644
--- a/anteater/module/sys_tcp_establish_detector.py
+++ b/anteater/module/sys_tcp_establish_detector.py
@@ -11,15 +11,17 @@
# See the Mulan PSL v2 for more details.
# ******************************************************************************/
+from functools import reduce
+from typing import List
+
import numpy as np
-from anteater.core.anomaly import Anomaly
+from anteater.core.anomaly import Anomaly, CauseMetric
from anteater.module.detector import Detector
from anteater.source.anomaly_report import AnomalyReport
from anteater.source.metric_loader import MetricLoader
from anteater.template.sys_anomaly_template import SysAnomalyTemplate
-from anteater.utils.common import divide
-from anteater.utils.data_load import load_kpi_feature
+from anteater.utils.common import divide, same_intersection_key_value
from anteater.utils.datetime import DateTimeManager as dt
from anteater.utils.log import logger
@@ -36,69 +38,85 @@ class SysTcpEstablishDetector(Detector):
"""
def __init__(self, data_loader: MetricLoader, anomaly_report: AnomalyReport):
- super().__init__(data_loader, anomaly_report)
- self.kpis, self.features = load_kpi_feature('sys_tcp_establish.json')
+ file_name = 'sys_tcp_establish.json'
+ super().__init__(data_loader, anomaly_report, file_name)
- def execute_detect(self, machine_id: str):
+ self.mean = None
+ self.std = None
+
+ def pre_process(self):
+ """Calculates ts values mean and std"""
kpi = self.kpis[0]
- start_30_minutes, _ = dt.last(minutes=30)
- start_3_minutes, end = dt.last(minutes=3)
+ look_back = kpi.params.get('look_back', None)
- pre_ts = self.data_loader.get_metric(
- start_30_minutes, start_3_minutes, kpi.metric, label_name='machine_id', label_value=machine_id)
- pre_establish_time = [t.values[0] for t in pre_ts if t.values]
+ start, _ = dt.last(minutes=look_back)
+ mid, _ = dt.last(minutes=3)
- ts = self.data_loader.get_metric(
- start_3_minutes, end, kpi.metric, label_name='machine_id', label_value=machine_id)
- establish_time = [t.values[0] for t in ts if t.values]
+ ts_list = self.data_loader.get_metric(start, mid, kpi.metric)
+ establish_time = reduce(lambda x, y: x + y, [list(set(_ts.values)) for _ts in ts_list])
- mean = np.mean(pre_establish_time)
- std = np.std(pre_establish_time)
+ self.mean = np.mean(establish_time)
+ self.std = np.std(establish_time)
- outlier = [val for val in establish_time if abs(val - mean) > 3 * std]
+ def execute_detect(self, machine_id: str):
+ """Executes the detector based on machine id"""
+ kpi = self.kpis[0]
+ outlier_ratio_th = kpi.params.get('outlier_ratio_th', None)
- if outlier and len(outlier) > len(ts) * 0.3:
+ start, end = dt.last(minutes=3)
+ ts_list = self.data_loader. \
+ get_metric(start, end, kpi.metric, label_name='machine_id', label_value=machine_id)
+ establish_time = reduce(lambda x, y: x + y, [list(set(_ts.values)) for _ts in ts_list])
+
+ outlier = [val for val in establish_time if abs(val - self.mean) > 3 * self.std]
+ ratio = divide(len(outlier), len(establish_time))
+ if outlier and ratio > outlier_ratio_th:
+ logger.info(f'Ratio: {ratio}, Outlier Ratio TH: {outlier_ratio_th}, '
+ f'Mean: {self.mean}, Std: {self.std}')
logger.info('Sys tcp establish anomalies was detected.')
- if establish_time:
- percentile = divide(len(outlier), len(establish_time))
- else:
- percentile = 0
anomaly = Anomaly(
metric=kpi.metric,
labels={},
- description=kpi.description.format(percentile, min(outlier)))
- self.report(anomaly, kpi.entity_name, machine_id)
+ entity_name=kpi.entity_name,
+ description=kpi.description.format(ratio, min(outlier)))
+ self.report(anomaly, machine_id)
- def detect_features(self, machine_id: str):
+ def find_cause_metrics(self, machine_id: str, filters: dict) -> List[CauseMetric]:
+ """Detects the abnormal features and reports the caused metrics"""
+ priorities = {f.metric: f.priority for f in self.features}
start, end = dt.last(minutes=3)
- time_series_list = []
- metrics = [f.metric for f in self.features]
- for metric in metrics:
- time_series = self.data_loader.get_metric(
- start, end, metric, label_name='machine_id', label_value=machine_id)
- time_series_list.extend(time_series)
+ ts_list = []
+ for metric in priorities.keys():
+ _ts_list = self.data_loader.\
+ get_metric(start, end, metric, label_name='machine_id', label_value=machine_id)
+ filtered_ts_list = self.filter_ts(_ts_list, filters)
+ ts_list.extend(filtered_ts_list)
result = []
- for ts in time_series_list:
- if ts.values and max(ts.values) > 0:
- result.append((ts, max(ts.values)))
+ for _ts in ts_list:
+ if _ts.values and max(_ts.values) > 0:
+ cause_metric = CauseMetric(ts=_ts, score=max(_ts.values))
+ result.append(cause_metric)
- result = sorted(result, key=lambda x: x[1], reverse=True)
+ result = sorted(result, key=lambda x: x.score, reverse=True)
return result
- def report(self, anomaly: Anomaly, entity_name: str, machine_id: str):
+ def report(self, anomaly: Anomaly, machine_id: str):
+ """Reports a single anomaly at each time"""
description = {f.metric: f.description for f in self.features}
- cause_metrics = self.detect_features(machine_id)
+ cause_metrics = self.find_cause_metrics(machine_id, anomaly.labels)
cause_metrics = [
- {'metric': cause[0].metric,
- 'label': cause[0].labels,
- 'score': cause[1],
- 'description': description.get(cause[0].metric, '').format(
- cause[0].labels.get('ppid', ''),
- cause[0].labels.get('s_port', ''))}
+ {
+ 'metric': cause.ts.metric,
+ 'label': cause.ts.labels,
+ 'score': cause.score,
+ 'description': description.get(cause.ts.metric, '').format(
+ cause.ts.labels.get('ppid', ''),
+ cause.ts.labels.get('s_port', ''))
+ }
for cause in cause_metrics]
+
timestamp = dt.utc_now()
- template = SysAnomalyTemplate(timestamp, machine_id, anomaly.metric, entity_name)
- template.labels = anomaly.labels
+ template = SysAnomalyTemplate(timestamp, machine_id, anomaly.metric, anomaly.entity_name)
self.anomaly_report.sent_anomaly(anomaly, cause_metrics, template)
diff --git a/anteater/module/sys_tcp_transmission_detector.py b/anteater/module/sys_tcp_transmission_detector.py
index 9af4760..ad383b4 100644
--- a/anteater/module/sys_tcp_transmission_detector.py
+++ b/anteater/module/sys_tcp_transmission_detector.py
@@ -16,11 +16,13 @@ import math
from anteater.core.anomaly import Anomaly
from anteater.model.algorithms.spectral_residual import SpectralResidual
from anteater.model.slope import smooth_slope
+from anteater.model.smoother import conv_smooth
+from anteater.model.three_sigma import three_sigma
from anteater.module.detector import Detector
from anteater.source.anomaly_report import AnomalyReport
from anteater.source.metric_loader import MetricLoader
from anteater.template.sys_anomaly_template import SysAnomalyTemplate
-from anteater.utils.data_load import load_kpi_feature
+from anteater.utils.common import divide
from anteater.utils.datetime import DateTimeManager as dt
from anteater.utils.log import logger
@@ -31,40 +33,45 @@ class SysTcpTransmissionDetector(Detector):
"""
def __init__(self, data_loader: MetricLoader, anomaly_report: AnomalyReport):
- super().__init__(data_loader, anomaly_report)
- self.kpis, self.features = load_kpi_feature('sys_tcp_transmission.json')
+ file_name = 'sys_tcp_transmission.json'
+ super().__init__(data_loader, anomaly_report, file_name)
def execute_detect(self, machine_id: str):
for kpi in self.kpis:
- parameter = kpi.parameter
- start, end = dt.last(minutes=10)
- time_series_list = self.data_loader.get_metric(
- start, end, kpi.metric, label_name='machine_id', label_value=machine_id)
+ look_back = kpi.params.get('look_back', None)
+ box_pts = kpi.params.get('box_pts', None)
+ obs_size = kpi.params.get('obs_size', None)
+ outlier_ratio_th = kpi.params.get('outlier_ratio_th', None)
- if not time_series_list:
+ start, end = dt.last(minutes=look_back)
+ ts_list = self.data_loader.\
+ get_metric(start, end, kpi.metric, label_name='machine_id', label_value=machine_id)
+
+ if not ts_list:
logger.warning(f'Key metric {kpi.metric} is null on the target machine {machine_id}!')
return
point_count = self.data_loader.expected_point_length(start, end)
anomalies = []
- threshold = parameter['threshold']
- for time_series in time_series_list:
- if len(time_series.values) < point_count * 0.9 or len(time_series.values) > point_count * 1.5:
+ for _ts in ts_list:
+ if len(_ts.values) < point_count * 0.9 or len(_ts.values) > point_count * 1.5:
continue
- if sum(time_series.values) == 0:
+ if sum(_ts.values) == 0:
continue
- score = max(smooth_slope(time_series, windows_length=13))
-
- if math.isnan(score) or math.isinf(score):
- continue
+ smoothed_val = conv_smooth(_ts.values, box_pts=box_pts)
+ outlier, mean, std = three_sigma(smoothed_val, obs_size=obs_size, method='min')
+ ratio = divide(len(outlier), obs_size)
- if score > threshold:
+ if outlier and ratio >= outlier_ratio_th:
+ logger.info(f'Ratio: {ratio}, Outlier Ratio TH: {outlier_ratio_th}, '
+ f'Mean: {mean}, Std: {std}')
anomalies.append(
- Anomaly(metric=time_series.metric,
- labels=time_series.labels,
- score=score,
+ Anomaly(metric=_ts.metric,
+ labels=_ts.labels,
+ score=ratio,
+ entity_name=kpi.entity_name,
description=kpi.description))
anomalies = sorted(anomalies, key=lambda x: x.score, reverse=True)
@@ -72,45 +79,45 @@ class SysTcpTransmissionDetector(Detector):
if anomalies:
logger.info('Sys io latency anomalies was detected.')
for anomaly in anomalies:
- self.report(anomaly, kpi.entity_name, machine_id)
+ self.report(anomaly, machine_id)
def detect_features(self, machine_id: str, top_n=3):
priorities = {f.metric: f.priority for f in self.features}
start, end = dt.last(minutes=6)
- time_series_list = []
+ ts_list = []
for metric in priorities.keys():
- time_series = self.data_loader.get_metric(
- start, end, metric, label_name='machine_id', label_value=machine_id)
- time_series_list.extend(time_series)
+ _ts = self.data_loader.\
+ get_metric(start, end, metric, label_name='machine_id', label_value=machine_id)
+ ts_list.extend(_ts)
point_count = self.data_loader.expected_point_length(start, end)
- sr_model = SpectralResidual(12, 24, 50)
+ model = SpectralResidual(12, 24, 50)
result = []
- for time_series in time_series_list:
- if len(time_series.values) < point_count * 0.9 or \
- len(time_series.values) > point_count * 1.5:
+ for _ts in ts_list:
+ if len(_ts.values) < point_count * 0.9 or \
+ len(_ts.values) > point_count * 1.5:
continue
- values = time_series.values
+ values = _ts.values
if all(x == values[0] for x in values):
continue
- scores = sr_model.compute_score(values)
+ scores = model.compute_score(values)
score = max(scores[-13:])
if math.isnan(score) or math.isinf(score):
continue
- result.append((time_series, score))
+ result.append((_ts, score))
result = sorted(result, key=lambda x: x[1], reverse=True)[0: top_n]
result = sorted(result, key=lambda x: priorities[x[0].metric])
return result
- def report(self, anomaly: Anomaly, entity_name: str, machine_id: str):
+ def report(self, anomaly: Anomaly, machine_id: str):
description = {f.metric: f.description for f in self.features}
cause_metrics = self.detect_features(machine_id, top_n=3)
cause_metrics = [
@@ -124,6 +131,5 @@ class SysTcpTransmissionDetector(Detector):
cause[0].labels.get('server_port', ''))}
for cause in cause_metrics]
timestamp = dt.utc_now()
- template = SysAnomalyTemplate(timestamp, machine_id, anomaly.metric, entity_name)
- template.labels = anomaly.labels
+ template = SysAnomalyTemplate(timestamp, machine_id, anomaly.metric, anomaly.entity_name)
self.anomaly_report.sent_anomaly(anomaly, cause_metrics, template)
diff --git a/anteater/provider/base.py b/anteater/provider/base.py
index eda2d1b..35d1d1f 100644
--- a/anteater/provider/base.py
+++ b/anteater/provider/base.py
@@ -56,16 +56,16 @@ class TimeSeriesProvider:
def fetch(url, params: Dict, **args) -> List:
"""Fetches data from prometheus server by http request"""
try:
- response = requests.get(url, params, timeout=30, **args).json()
+ response = requests.get(url, params, timeout=30, **args)
except requests.RequestException as e:
logger.error(f"RequestException: {e}!")
return []
-
+ response = response.json()
result = []
if response and response.get("status") == 'success':
result = response.get('data', {}).get('result', [])
else:
- logger.error(f"PrometheusAdapter get data failed, "
+ logger.error(f"Prometheus get data failed, "
f"error: {response.get('error')}, query_url: {url}, params: {params}.")
return result
@@ -87,17 +87,19 @@ class TimeSeriesProvider:
data = self.fetch(self.url, params, headers=headers)
for item in data:
- zipped_values = list(zip(*item.get("values")))
+ zipped_values = list(zip(*item.get('values')))
time_stamps = list(zipped_values[0])
values = [float(v) for v in zipped_values[1]]
- key = tuple(sorted(item.get("metric").items()))
+ key = tuple(sorted(item.get('metric').items()))
if key in tmp_index:
result[tmp_index.get(key)].extend(time_stamps, values)
else:
+ labels = item.get('metric')
+ labels.pop('__name__', None)
time_series = TimeSeries(
metric,
- item.get("metric"),
+ labels,
time_stamps,
values)
tmp_index[key] = len(result)
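
For reference, the parser above consumes the standard Prometheus HTTP API
range-query result shape, where each entry carries a 'metric' label dict
(including the synthetic '__name__' label that the new code now drops) and
'values' as [timestamp, string] pairs (the label values here are
illustrative):

    item = {
        'metric': {'__name__': 'gala_gopher_disk_r_await',
                   'machine_id': 'm1', 'disk_name': 'sda'},
        'values': [[1668416597, '3.2'], [1668416602, '3.4']],
    }
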
diff --git a/anteater/source/anomaly_report.py b/anteater/source/anomaly_report.py
index 2205f44..41542b7 100644
--- a/anteater/source/anomaly_report.py
+++ b/anteater/source/anomaly_report.py
@@ -48,6 +48,7 @@ class AnomalyReport:
entity_name = template.entity_name
labels = anomaly.labels
+ template.labels = labels
template.entity_id = self.get_entity_id(machine_id, entity_name, labels, keys)
template.keys = keys
template.description = anomaly.description
diff --git a/anteater/template/template.py b/anteater/template/template.py
index 52befaa..d86a8cb 100644
--- a/anteater/template/template.py
+++ b/anteater/template/template.py
@@ -21,8 +21,8 @@ class Template:
self.machine_id = machine_id
self.metric_id = metric_id
self.entity_name = entity_name
- self.labels = {}
+ self.labels = {}
self.entity_id = ""
self.description = ""
self.cause_metrics = {}
diff --git a/anteater/utils/common.py b/anteater/utils/common.py
index d6c80ab..a99a1ef 100644
--- a/anteater/utils/common.py
+++ b/anteater/utils/common.py
@@ -170,3 +170,15 @@ def divide(x, y):
return x / y
else:
return 0
+
+
+def same_intersection_key_value(first: dict, second: dict):
+ """Checks there are same key value pairs between two dictionaries
+ intersections by the key
+ """
+ same_keys = set(first.keys()) & set(second.keys())
+ for key in same_keys:
+ if first[key] != second[key]:
+ return False
+
+ return True
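
Illustrative behavior of the new helper (hypothetical label dictionaries):

    from anteater.utils.common import same_intersection_key_value

    a = {'machine_id': 'm1', 'disk_name': 'sda', 'tgid': '42'}
    b = {'machine_id': 'm1', 'disk_name': 'sda'}
    c = {'machine_id': 'm2'}

    same_intersection_key_value(a, b)   # True: all shared keys agree
    same_intersection_key_value(a, c)   # False: 'machine_id' differs
    same_intersection_key_value(a, {})  # True: no shared keys, vacuously true

The detectors use this to filter candidate cause metrics down to time series
whose labels are consistent with the anomalous KPI's labels.
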
diff --git a/config/gala-anteater.yaml b/config/gala-anteater.yaml
index b771c16..a01eb47 100644
--- a/config/gala-anteater.yaml
+++ b/config/gala-anteater.yaml
@@ -3,7 +3,7 @@ Global:
is_sys: false
Kafka:
- server: "9.16.143.92"
+ server: "localhost"
port: "9092"
model_topic: "gala_anteater_hybrid_model"
model_group_id: "gala_anteater_1"
@@ -12,8 +12,8 @@ Kafka:
meta_entity_name: "sli"
Prometheus:
- server: "10.137.17.122"
- port: "29090"
+ server: "localhost"
+ port: "9090"
steps: 5
Aom:
diff --git a/config/module/app_sli_rtt.json b/config/module/app_sli_rtt.json
index 4372f07..0744416 100644
--- a/config/module/app_sli_rtt.json
+++ b/config/module/app_sli_rtt.json
@@ -6,9 +6,11 @@
"entity_name": "sli",
"enable": true,
"description": "sli rtt 异常",
- "parameter": {
- "min_nsec": 200000000,
- "threshold": 0.8
+ "params": {
+ "look_back": 10,
+ "box_pts": 3,
+ "obs_size": 25,
+ "outlier_ratio_th": 0.3
}
},
{
@@ -17,8 +19,11 @@
"entity_name": "sli",
"enable": true,
"description": "sli tps 异常",
- "parameter": {
- "threshold": -0.5
+ "params": {
+ "look_back": 10,
+ "box_pts": 3,
+ "obs_size": 25,
+ "outlier_ratio_th": 0.3
}
}
],
diff --git a/config/module/proc_io_latency.json b/config/module/proc_io_latency.json
index 05bcd81..967ee10 100644
--- a/config/module/proc_io_latency.json
+++ b/config/module/proc_io_latency.json
@@ -6,8 +6,11 @@
"entity_name": "thread",
"enable": true,
"description": "Process IO_wait time (unit: us)",
- "parameter": {
- "threshold": 0.5
+ "params": {
+ "look_back": 10,
+ "box_pts": 3,
+ "obs_size": 25,
+ "outlier_ratio_th": 0.3
}
},
{
@@ -16,8 +19,11 @@
"entity_name": "proc",
"enable": true,
"description": "I/O operation delay at the BIO layer (unit: us)",
- "parameter": {
- "threshold": 0.5
+ "params": {
+ "look_back": 10,
+ "box_pts": 3,
+ "obs_size": 25,
+ "outlier_ratio_th": 0.3
}
}
],
diff --git a/config/module/sys_io_latency.json b/config/module/sys_io_latency.json
index 66e5c5a..f790192 100644
--- a/config/module/sys_io_latency.json
+++ b/config/module/sys_io_latency.json
@@ -5,9 +5,12 @@
"kpi_type": "",
"entity_name": "block",
"enable": true,
- "description": "Block I/O latency performance",
- "parameter": {
- "threshold": 0.5
+ "description": "Block I/O latency performance is deteriorating!",
+ "params": {
+ "look_back": 20,
+ "box_pts": 7,
+ "obs_size": 25,
+ "outlier_ratio_th": 0.3
}
}
],
diff --git a/config/module/sys_tcp_establish.json b/config/module/sys_tcp_establish.json
index d695c2c..b6589ed 100644
--- a/config/module/sys_tcp_establish.json
+++ b/config/module/sys_tcp_establish.json
@@ -6,7 +6,10 @@
"entity_name": "tcp_link",
"enable": true,
"description": "RTT of syn packet(us): existing {:.0%} syn packets rtt are more than {:.0f} us",
- "parameter": {}
+ "params": {
+ "look_back": 30,
+ "outlier_ratio_th": 0.3
+ }
}
],
"Features": [
diff --git a/config/module/sys_tcp_transmission.json b/config/module/sys_tcp_transmission.json
index 5347cc9..522056f 100644
--- a/config/module/sys_tcp_transmission.json
+++ b/config/module/sys_tcp_transmission.json
@@ -6,8 +6,37 @@
"entity_name": "tcp_link",
"enable": true,
"description": "Smoothed Round Trip Time(us)",
- "parameter": {
- "threshold": 0.5
+ "params": {
+ "look_back": 10,
+ "box_pts": 3,
+ "obs_size": 25,
+ "outlier_ratio_th": 0.3
+ }
+ },
+ {
+ "metric": "gala_gopher_net_tcp_in_segs",
+ "kpi_type": "in_segs",
+ "entity_name": "tcp_link",
+ "enable": true,
+ "description": "Total number of segments received",
+ "params": {
+ "look_back": 10,
+ "box_pts": 3,
+ "obs_size": 25,
+ "outlier_ratio_th": 0.3
+ }
+ },
+ {
+ "metric": "gala_gopher_net_tcp_out_segs",
+ "kpi_type": "out_segs",
+ "entity_name": "tcp_link",
+ "enable": true,
+ "description": "Total number of segments sent",
+ "params": {
+ "look_back": 10,
+ "box_pts": 3,
+ "obs_size": 25,
+ "outlier_ratio_th": 0.3
}
}
],
--
2.37.0.windows.1
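
Taken together, the config changes replace the old single "threshold"
parameter with a "params" block (look_back, box_pts, obs_size,
outlier_ratio_th; sys_tcp_establish uses only the first and last). A sketch
of how a detector consumes one of these entries, mirroring the hunks above:

    from anteater.utils.data_load import load_kpi_feature

    kpis, features = load_kpi_feature('sys_io_latency.json')
    kpi = kpis[0]
    look_back = kpi.params.get('look_back', None)        # minutes of history
    box_pts = kpi.params.get('box_pts', None)            # conv_smooth window
    obs_size = kpi.params.get('obs_size', None)          # three_sigma window
    outlier_ratio_th = kpi.params.get('outlier_ratio_th', None)  # alarm threshold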