!20 Update to 1.0.1: support cross host cause location
From: @algorithmofdish Reviewed-by: @dowzyx Signed-off-by: @dowzyx
This commit is contained in:
commit
98b4ebef87
@ -1,679 +0,0 @@
|
||||
From ac48cfdcac8c93ba59754899237a37dac40659ac Mon Sep 17 00:00:00 2001
|
||||
From: algorithmofdish <hexiujun1@huawei.com>
|
||||
Date: Fri, 18 Nov 2022 11:07:47 +0800
|
||||
Subject: [PATCH] feat(infer): cause inference optimization
|
||||
|
||||
1. categorize metrics and select one as cause with max abnormal score;
|
||||
2. add virtual metric to complete the fault propagation path;
|
||||
3. use abnormal score without nomarlization;
|
||||
4. fix a bug where subgraph are incompletely obtained.
|
||||
---
|
||||
cause_inference/__main__.py | 4 +-
|
||||
cause_inference/arangodb.py | 24 +++--
|
||||
cause_inference/cause_infer.py | 11 +--
|
||||
cause_inference/infer_policy.py | 84 ++++++++++++++----
|
||||
cause_inference/model.py | 64 +++++++++++---
|
||||
cause_inference/rule_parser.py | 149 ++++++++++++++++++++++++--------
|
||||
config/infer-rule.yaml | 98 +++++++++++++++++----
|
||||
7 files changed, 337 insertions(+), 97 deletions(-)
|
||||
|
||||
diff --git a/cause_inference/__main__.py b/cause_inference/__main__.py
|
||||
index baddbe4..093f7ac 100644
|
||||
--- a/cause_inference/__main__.py
|
||||
+++ b/cause_inference/__main__.py
|
||||
@@ -15,7 +15,7 @@ from cause_inference.config import init_infer_config
|
||||
from cause_inference.model import AbnormalEvent
|
||||
from cause_inference.cause_infer import cause_locating
|
||||
from cause_inference.cause_infer import parse_abn_evt
|
||||
-from cause_inference.cause_infer import normalize_abn_score
|
||||
+from cause_inference.cause_infer import preprocess_abn_score
|
||||
from cause_inference.rule_parser import rule_engine
|
||||
from cause_inference.exceptions import InferenceException
|
||||
from cause_inference.exceptions import DataParseException
|
||||
@@ -172,7 +172,7 @@ def get_recommend_metric_evts(abn_kpi_data: dict) -> List[AbnormalEvent]:
|
||||
metric_evt = AbnormalEvent(
|
||||
timestamp=abn_kpi_data.get('Timestamp'),
|
||||
abnormal_metric_id=metric_data.get('metric', ''),
|
||||
- abnormal_score=normalize_abn_score(metric_data.get('score')),
|
||||
+ abnormal_score=preprocess_abn_score(metric_data.get('score')),
|
||||
metric_labels=metric_data.get('label', {}),
|
||||
desc=metric_data.get('description', '')
|
||||
)
|
||||
diff --git a/cause_inference/arangodb.py b/cause_inference/arangodb.py
|
||||
index 424e500..d7982b7 100644
|
||||
--- a/cause_inference/arangodb.py
|
||||
+++ b/cause_inference/arangodb.py
|
||||
@@ -22,6 +22,14 @@ def connect_to_arangodb(arango_url, db_name):
|
||||
return conn.databases[db_name]
|
||||
|
||||
|
||||
+def query_all(db, aql_query, bind_vars=None, raw_results=True):
|
||||
+ res = []
|
||||
+ query_hdl = db.AQLQuery(aql_query, bindVars=bind_vars, rawResults=raw_results)
|
||||
+ for item in query_hdl:
|
||||
+ res.append(item)
|
||||
+ return res
|
||||
+
|
||||
+
|
||||
def query_recent_topo_ts(db: Database, ts):
|
||||
bind_vars = {'@collection': _TIMESTAMP_COLL_NAME, 'ts': ts}
|
||||
aql_query = '''
|
||||
@@ -32,12 +40,12 @@ def query_recent_topo_ts(db: Database, ts):
|
||||
RETURN t._key
|
||||
'''
|
||||
try:
|
||||
- query_res = db.AQLQuery(aql_query, bindVars=bind_vars, rawResults=True).response
|
||||
+ query_res = query_all(db, aql_query, bind_vars)
|
||||
except AQLQueryError as ex:
|
||||
raise DBException(ex) from ex
|
||||
- if query_res.get('error') or not query_res.get('result'):
|
||||
+ if len(query_res) == 0:
|
||||
raise DBException('Can not find topological graph at the abnormal timestamp {}'.format(ts))
|
||||
- last_ts = query_res.get('result')[0]
|
||||
+ last_ts = query_res[0]
|
||||
return int(last_ts)
|
||||
|
||||
|
||||
@@ -56,12 +64,12 @@ def query_topo_entities(db: Database, ts, query_options=None):
|
||||
return v
|
||||
'''.format(filter_str)
|
||||
try:
|
||||
- query_res = db.AQLQuery(aql_query, bindVars=bind_vars, rawResults=True).response
|
||||
+ query_res = query_all(db, aql_query, bind_vars)
|
||||
except AQLQueryError as ex:
|
||||
raise DBException(ex) from ex
|
||||
- if query_res.get('error') or not query_res.get('result'):
|
||||
+ if len(query_res) == 0:
|
||||
raise DBException('Can not find observe entities satisfied.')
|
||||
- return query_res.get('result')
|
||||
+ return query_res
|
||||
|
||||
|
||||
def query_subgraph(db, ts, start_node_id, edge_collection, depth=1):
|
||||
@@ -83,12 +91,12 @@ def query_subgraph(db, ts, start_node_id, edge_collection, depth=1):
|
||||
return {{"vertex": v, "edge": e}}
|
||||
'''.format(edge_coll_str)
|
||||
try:
|
||||
- query_res = db.AQLQuery(aql_query, bindVars=bind_vars, rawResults=True).response
|
||||
+ query_res = query_all(db, aql_query, bind_vars)
|
||||
except AQLQueryError as ex:
|
||||
raise DBException(ex) from ex
|
||||
vertices = {}
|
||||
edges = {}
|
||||
- for item in query_res.get('result'):
|
||||
+ for item in query_res:
|
||||
vertex = item.get('vertex')
|
||||
edge = item.get('edge')
|
||||
vertices.setdefault(vertex.get('_id'), vertex)
|
||||
diff --git a/cause_inference/cause_infer.py b/cause_inference/cause_infer.py
|
||||
index 1954abf..dff26d0 100644
|
||||
--- a/cause_inference/cause_infer.py
|
||||
+++ b/cause_inference/cause_infer.py
|
||||
@@ -23,8 +23,8 @@ from cause_inference.infer_policy import InferPolicy
|
||||
from cause_inference.infer_policy import get_infer_policy
|
||||
|
||||
|
||||
-def normalize_abn_score(score):
|
||||
- return expit(score)
|
||||
+def preprocess_abn_score(score):
|
||||
+ return max(0, score)
|
||||
|
||||
|
||||
# 因果推理
|
||||
@@ -172,9 +172,10 @@ def cause_locating(abnormal_kpi: AbnormalEvent, abnormal_metrics: List[AbnormalE
|
||||
# 5. 生成异常指标之间的因果图
|
||||
causal_graph.init_metric_cause_graph()
|
||||
|
||||
- logger.logger.debug("Causal graph nodes are: {}".format(causal_graph.entity_cause_graph.nodes))
|
||||
- logger.logger.debug("Causal graph predecessors: {}".format(causal_graph.entity_cause_graph.pred))
|
||||
- logger.logger.debug("Causal graph successors: {}".format(causal_graph.entity_cause_graph.succ))
|
||||
+ logger.logger.debug("Entity cause graph nodes are: {}".format(causal_graph.entity_cause_graph.nodes))
|
||||
+ logger.logger.debug("Entity cause graph edges are: {}".format(causal_graph.entity_cause_graph.edges))
|
||||
+ logger.logger.debug("Metric cause graph nodes are: {}".format(causal_graph.metric_cause_graph.nodes))
|
||||
+ logger.logger.debug("Metric cause graph edges are: {}".format(causal_graph.metric_cause_graph.edges))
|
||||
|
||||
# 6. 以故障传播图 + 异常KPI为输入,执行根因推导算法,输出 topK 根因指标
|
||||
infer_policy = get_infer_policy(infer_config.infer_conf.get('infer_policy'))
|
||||
diff --git a/cause_inference/infer_policy.py b/cause_inference/infer_policy.py
|
||||
index c0952e7..0663e74 100644
|
||||
--- a/cause_inference/infer_policy.py
|
||||
+++ b/cause_inference/infer_policy.py
|
||||
@@ -5,9 +5,10 @@ from typing import List
|
||||
|
||||
import networkx as nx
|
||||
|
||||
-from cause_inference.cause_infer import CausalGraph
|
||||
-from cause_inference.cause_infer import Cause
|
||||
+from cause_inference.model import CausalGraph
|
||||
+from cause_inference.model import Cause
|
||||
from cause_inference.exceptions import InferenceException
|
||||
+from cause_inference import rule_parser
|
||||
|
||||
|
||||
class InferPolicy(ABC):
|
||||
@@ -114,22 +115,28 @@ class DfsPolicy(InferPolicy):
|
||||
if length < 1:
|
||||
return 0.0
|
||||
total_score = 0.0
|
||||
+ num_of_valid_node = 0
|
||||
for node in path[:length]:
|
||||
+ if is_virtual_node(node):
|
||||
+ continue
|
||||
total_score += cause_graph.nodes[node].get('abnormal_score', 0)
|
||||
- if length != 0:
|
||||
- total_score /= length
|
||||
+ num_of_valid_node += 1
|
||||
+ if num_of_valid_node != 0:
|
||||
+ total_score /= num_of_valid_node
|
||||
return total_score
|
||||
|
||||
- def infer(self, causal_graph: CausalGraph, top_k: int) -> List[Cause]:
|
||||
- cause_graph = causal_graph.metric_cause_graph
|
||||
- abn_node_id = (causal_graph.entity_id_of_abn_kpi, causal_graph.abnormal_kpi.abnormal_metric_id)
|
||||
-
|
||||
- reverse_graph = nx.DiGraph()
|
||||
- reverse_graph.add_nodes_from(cause_graph.nodes)
|
||||
+ @staticmethod
|
||||
+ def reverse_graph(cause_graph):
|
||||
+ reversed_graph = nx.DiGraph()
|
||||
+ reversed_graph.add_nodes_from(cause_graph.nodes)
|
||||
for from_, to in cause_graph.edges:
|
||||
- reverse_graph.add_edge(to, from_)
|
||||
+ reversed_graph.add_edge(to, from_)
|
||||
+ return reversed_graph
|
||||
|
||||
- successors = nx.dfs_successors(reverse_graph, abn_node_id)
|
||||
+ @staticmethod
|
||||
+ def get_all_paths_to_abn_node(abn_node_id, cause_graph):
|
||||
+ reversed_graph = DfsPolicy.reverse_graph(cause_graph)
|
||||
+ successors = nx.dfs_successors(reversed_graph, abn_node_id)
|
||||
paths = []
|
||||
path = []
|
||||
|
||||
@@ -145,24 +152,63 @@ class DfsPolicy(InferPolicy):
|
||||
|
||||
path.append(abn_node_id)
|
||||
dfs_path(abn_node_id)
|
||||
+ return paths
|
||||
|
||||
+ @staticmethod
|
||||
+ def get_scored_paths(cause_graph, paths) -> list:
|
||||
scored_paths = []
|
||||
- for p in paths:
|
||||
+ for path in paths:
|
||||
scored_paths.append({
|
||||
- 'score': self.calc_path_score(p, cause_graph),
|
||||
- 'path': p
|
||||
+ 'score': DfsPolicy.calc_path_score(path, cause_graph),
|
||||
+ 'path': path
|
||||
})
|
||||
- scored_paths = sorted(scored_paths, key=lambda k: k['score'], reverse=True)
|
||||
- scored_paths = scored_paths[:top_k]
|
||||
+ return scored_paths
|
||||
+
|
||||
+ @staticmethod
|
||||
+ def get_top_paths(scored_paths, top_k) -> list:
|
||||
+ top_paths = []
|
||||
+ node_selected = set()
|
||||
+ metric_selected = set()
|
||||
+ for path in scored_paths:
|
||||
+ if len(top_paths) == top_k:
|
||||
+ break
|
||||
+ cause_node_id = path.get('path')[0]
|
||||
+ if cause_node_id in node_selected:
|
||||
+ continue
|
||||
+ if cause_node_id[1] in metric_selected:
|
||||
+ continue
|
||||
+ if is_virtual_node(cause_node_id):
|
||||
+ continue
|
||||
+ node_selected.add(cause_node_id)
|
||||
+ metric_selected.add(cause_node_id[1])
|
||||
+ top_paths.append(path)
|
||||
+
|
||||
+ return top_paths
|
||||
|
||||
+ @staticmethod
|
||||
+ def get_top_causes(top_paths) -> List[Cause]:
|
||||
res = []
|
||||
- for item in scored_paths:
|
||||
+ for item in top_paths:
|
||||
cause_node_id = item.get('path')[0]
|
||||
cause = Cause(cause_node_id[1], cause_node_id[0], item.get('score'), item.get('path'))
|
||||
res.append(cause)
|
||||
-
|
||||
return res
|
||||
|
||||
+ def infer(self, causal_graph: CausalGraph, top_k: int) -> List[Cause]:
|
||||
+ cause_graph = causal_graph.metric_cause_graph
|
||||
+ abn_node_id = (causal_graph.entity_id_of_abn_kpi, causal_graph.abnormal_kpi.abnormal_metric_id)
|
||||
+
|
||||
+ paths = self.get_all_paths_to_abn_node(abn_node_id, cause_graph)
|
||||
+ scored_paths = self.get_scored_paths(cause_graph, paths)
|
||||
+ scored_paths = sorted(scored_paths, key=lambda k: k['score'], reverse=True)
|
||||
+ top_paths = self.get_top_paths(scored_paths, top_k)
|
||||
+
|
||||
+ return self.get_top_causes(top_paths)
|
||||
+
|
||||
+
|
||||
+def is_virtual_node(node_id) -> bool:
|
||||
+ return rule_parser.is_virtual_metric(node_id[1])
|
||||
+
|
||||
|
||||
def get_infer_policy(policy: str, **options) -> InferPolicy:
|
||||
if policy == 'dfs':
|
||||
diff --git a/cause_inference/model.py b/cause_inference/model.py
|
||||
index eb473da..136e4c3 100644
|
||||
--- a/cause_inference/model.py
|
||||
+++ b/cause_inference/model.py
|
||||
@@ -7,6 +7,7 @@ from spider.conf.observe_meta import ObserveMetaMgt
|
||||
from spider.util import logger
|
||||
from spider.util.entity import concate_entity_id
|
||||
from spider.util.entity import escape_entity_id
|
||||
+from cause_inference import rule_parser
|
||||
|
||||
|
||||
class AbnormalEvent:
|
||||
@@ -77,6 +78,10 @@ class CausalGraph:
|
||||
self.metric_cause_graph = nx.DiGraph()
|
||||
self.init_casual_graph()
|
||||
|
||||
+ @staticmethod
|
||||
+ def is_virtual_metric_group(metric_group) -> bool:
|
||||
+ return len(metric_group) == 1 and rule_parser.is_virtual_metric(metric_group[0])
|
||||
+
|
||||
def init_casual_graph(self):
|
||||
for node_id, node_attrs in self.topo_nodes.items():
|
||||
self.entity_cause_graph.add_node(node_id, **node_attrs)
|
||||
@@ -134,19 +139,54 @@ class CausalGraph:
|
||||
self.init_metric_edge(edge)
|
||||
|
||||
def init_metric_edge(self, entity_edge):
|
||||
- from_entity_id = entity_edge[0]
|
||||
- to_entity_id = entity_edge[1]
|
||||
+ f_entity_id = entity_edge[0]
|
||||
+ t_entity_id = entity_edge[1]
|
||||
+ avail_relations = self.get_avail_metric_causal_relations(entity_edge)
|
||||
+
|
||||
+ unique = set()
|
||||
+ for f_metric_group, t_metric_group in avail_relations:
|
||||
+ if self.is_virtual_metric_group(f_metric_group):
|
||||
+ self.add_virtual_metric_node(f_entity_id, f_metric_group[0])
|
||||
+ if self.is_virtual_metric_group(t_metric_group):
|
||||
+ self.add_virtual_metric_node(t_entity_id, t_metric_group[0])
|
||||
+
|
||||
+ f_metric_id = self.metric_with_largest_abn_score(f_metric_group, f_entity_id)
|
||||
+ t_metric_id = self.metric_with_largest_abn_score(t_metric_group, t_entity_id)
|
||||
+ if (f_metric_id, t_metric_id) not in unique:
|
||||
+ self.metric_cause_graph.add_edge((f_entity_id, f_metric_id), (t_entity_id, t_metric_id))
|
||||
+ unique.add((f_metric_id, t_metric_id))
|
||||
+
|
||||
+ def add_virtual_metric_node(self, entity_id, metric_id):
|
||||
+ self.metric_cause_graph.add_node((entity_id, metric_id))
|
||||
+
|
||||
+ def get_avail_metric_causal_relations(self, entity_edge):
|
||||
+ f_metric_ids = self.get_abn_metric_ids(entity_edge[0])
|
||||
+ t_metric_ids = self.get_abn_metric_ids(entity_edge[1])
|
||||
rule_meta = self.entity_cause_graph.edges[entity_edge].get('rule_meta')
|
||||
- for from_metric_evt in self.get_abnormal_metrics_of_node(from_entity_id):
|
||||
- for to_metric_evt in self.get_abnormal_metrics_of_node(to_entity_id):
|
||||
- from_metric_id = from_metric_evt.abnormal_metric_id
|
||||
- to_metric_id = to_metric_evt.abnormal_metric_id
|
||||
- if rule_meta is not None and not rule_meta.check_metric_pair(from_metric_id, to_metric_id):
|
||||
- continue
|
||||
- self.metric_cause_graph.add_edge(
|
||||
- (from_entity_id, from_metric_id),
|
||||
- (to_entity_id, to_metric_id)
|
||||
- )
|
||||
+ return rule_meta.get_avail_causal_relations(f_metric_ids, t_metric_ids)
|
||||
+
|
||||
+ def get_abn_metric_ids(self, entity_id):
|
||||
+ metric_evts = self.get_abnormal_metrics_of_node(entity_id)
|
||||
+ return [evt.abnormal_metric_id for evt in metric_evts]
|
||||
+
|
||||
+ def metric_with_largest_abn_score(self, metric_group: list, entity_id) -> str:
|
||||
+ if len(metric_group) == 1:
|
||||
+ return metric_group[0]
|
||||
+
|
||||
+ metric_evt_map = {}
|
||||
+ metric_evts = self.get_abnormal_metrics_of_node(entity_id)
|
||||
+ for metric_evt in metric_evts:
|
||||
+ metric_evt_map.setdefault(metric_evt.abnormal_metric_id, metric_evt)
|
||||
+
|
||||
+ metric_id_of_largest = metric_group[0]
|
||||
+ largest_abn_score = metric_evt_map.get(metric_id_of_largest).abnormal_score
|
||||
+ for metric_id in metric_group:
|
||||
+ abn_score = metric_evt_map.get(metric_id).abnormal_score
|
||||
+ if abn_score > largest_abn_score:
|
||||
+ metric_id_of_largest = metric_id
|
||||
+ largest_abn_score = abn_score
|
||||
+
|
||||
+ return metric_id_of_largest
|
||||
|
||||
|
||||
class Cause:
|
||||
diff --git a/cause_inference/rule_parser.py b/cause_inference/rule_parser.py
|
||||
index f08ed3c..eb6227c 100644
|
||||
--- a/cause_inference/rule_parser.py
|
||||
+++ b/cause_inference/rule_parser.py
|
||||
@@ -1,7 +1,7 @@
|
||||
import os
|
||||
from abc import ABCMeta
|
||||
from abc import abstractmethod
|
||||
-from typing import List
|
||||
+from typing import List, Dict, Tuple
|
||||
|
||||
import yaml
|
||||
|
||||
@@ -9,33 +9,86 @@ from spider.conf.observe_meta import RelationType
|
||||
from spider.conf.observe_meta import EntityType
|
||||
from spider.util import logger
|
||||
|
||||
+METRIC_CATEGORY_ALL = 'ALL'
|
||||
+METRIC_CATEGORY_OTHER = 'OTHER'
|
||||
+METRIC_CATEGORY_VIRTUAL = 'VIRTUAL'
|
||||
+METRIC_ID_OF_CATEGORY_VIRTUAL = 'virtual_metric'
|
||||
|
||||
-class MetricPairSet:
|
||||
- def __init__(self, from_: set, to_: set):
|
||||
+
|
||||
+def is_virtual_metric(metric_id: str) -> bool:
|
||||
+ return metric_id == METRIC_ID_OF_CATEGORY_VIRTUAL
|
||||
+
|
||||
+
|
||||
+class MetricCategoryPair:
|
||||
+ def __init__(self, from_: str, to_: str):
|
||||
self.from_ = from_
|
||||
self.to_ = to_
|
||||
|
||||
- def check_metric_pair(self, from_metric_id: str, to_metric_id: str) -> bool:
|
||||
- if self.from_ and from_metric_id not in self.from_:
|
||||
- return False
|
||||
- if self.to_ and to_metric_id not in self.to_:
|
||||
- return False
|
||||
- return True
|
||||
-
|
||||
|
||||
class RuleMeta:
|
||||
- def __init__(self, from_type, to_type, metric_range=None):
|
||||
+ def __init__(self, from_type, to_type, from_categories=None, to_categories=None, metric_range=None):
|
||||
self.from_type = from_type
|
||||
self.to_type = to_type
|
||||
- self.metric_range: List[MetricPairSet] = metric_range or []
|
||||
-
|
||||
- def check_metric_pair(self, from_metric_id: str, to_metric_id: str) -> bool:
|
||||
- if not self.metric_range:
|
||||
- return True
|
||||
- for item in self.metric_range:
|
||||
- if item.check_metric_pair(from_metric_id, to_metric_id):
|
||||
- return True
|
||||
- return False
|
||||
+ self.from_categories = from_categories or {}
|
||||
+ self.to_categories = to_categories or {}
|
||||
+ self.category_pairs: List[MetricCategoryPair] = metric_range or []
|
||||
+
|
||||
+ @staticmethod
|
||||
+ def aggregate_metric_from_groups(category_type, metric_groups) -> List[list]:
|
||||
+ res = []
|
||||
+ if category_type == METRIC_CATEGORY_ALL:
|
||||
+ for cate_type, metric_group in metric_groups.items():
|
||||
+ if cate_type == METRIC_CATEGORY_VIRTUAL:
|
||||
+ continue
|
||||
+ elif cate_type == METRIC_CATEGORY_OTHER:
|
||||
+ res.extend([metric] for metric in metric_group)
|
||||
+ else:
|
||||
+ res.append(metric_group)
|
||||
+ else:
|
||||
+ metric_group = metric_groups.get(category_type)
|
||||
+ if metric_group:
|
||||
+ res.append(metric_group)
|
||||
+
|
||||
+ return res
|
||||
+
|
||||
+ @staticmethod
|
||||
+ def _group_metric_by_category(metrics, categories) -> Dict[str, list]:
|
||||
+ parts = {}
|
||||
+ parted_metrics = set()
|
||||
+ for cate_type, cate_metrics in categories.items():
|
||||
+ part = []
|
||||
+ for metric in metrics:
|
||||
+ if metric in cate_metrics:
|
||||
+ part.append(metric)
|
||||
+ parted_metrics.add(metric)
|
||||
+ if len(part) > 0:
|
||||
+ parts.setdefault(cate_type, part)
|
||||
+
|
||||
+ other_part = []
|
||||
+ for metric in metrics:
|
||||
+ if metric not in parted_metrics:
|
||||
+ other_part.append(metric)
|
||||
+ if len(other_part) > 0:
|
||||
+ parts.setdefault(METRIC_CATEGORY_OTHER, other_part)
|
||||
+
|
||||
+ virtual_part = [METRIC_ID_OF_CATEGORY_VIRTUAL]
|
||||
+ parts.setdefault(METRIC_CATEGORY_VIRTUAL, virtual_part)
|
||||
+
|
||||
+ return parts
|
||||
+
|
||||
+ def get_avail_causal_relations(self, real_from_metrics, real_to_metrics) -> List[Tuple[list, list]]:
|
||||
+ causal_relations = []
|
||||
+
|
||||
+ from_groups = self._group_metric_by_category(real_from_metrics, self.from_categories)
|
||||
+ to_groups = self._group_metric_by_category(real_to_metrics, self.to_categories)
|
||||
+ for cate_pair in self.category_pairs:
|
||||
+ all_from_metrics = self.aggregate_metric_from_groups(cate_pair.from_, from_groups)
|
||||
+ all_to_metrics = self.aggregate_metric_from_groups(cate_pair.to_, to_groups)
|
||||
+ for from_metrics in all_from_metrics:
|
||||
+ for to_metrics in all_to_metrics:
|
||||
+ causal_relations.append((from_metrics, to_metrics))
|
||||
+
|
||||
+ return causal_relations
|
||||
|
||||
|
||||
class Rule(metaclass=ABCMeta):
|
||||
@@ -180,7 +233,8 @@ class NicRule1(Rule):
|
||||
class RuleEngine:
|
||||
def __init__(self):
|
||||
self.rules: List[Rule] = []
|
||||
- self.rule_metas = {}
|
||||
+ self.metric_categories = {}
|
||||
+ self.rule_metas: Dict[tuple, RuleMeta] = {}
|
||||
|
||||
def add_rule(self, rule: Rule):
|
||||
self.rules.append(rule)
|
||||
@@ -190,33 +244,58 @@ class RuleEngine:
|
||||
rule.rule_parsing(causal_graph)
|
||||
|
||||
def load_rule_meta_from_yaml(self, rule_path: str) -> bool:
|
||||
- abs_rule_path = os.path.abspath(rule_path)
|
||||
- if not os.path.exists(abs_rule_path):
|
||||
- logger.logger.warning("Rule meta path '{}' not exist", abs_rule_path)
|
||||
- return True
|
||||
try:
|
||||
- with open(abs_rule_path, 'r') as file:
|
||||
+ with open(os.path.abspath(rule_path), 'r') as file:
|
||||
data = yaml.safe_load(file)
|
||||
except IOError as ex:
|
||||
logger.logger.warning(ex)
|
||||
return False
|
||||
|
||||
- infer_rules = data.get("infer_rules", [])
|
||||
- for rule_meta in infer_rules:
|
||||
- saved_metric_range = []
|
||||
- for item in rule_meta.get("metric_range", []):
|
||||
- saved_metric_range.append(MetricPairSet(set(item.get('from', [])), set(item.get('to', []))))
|
||||
- saved_rule_meta = RuleMeta(rule_meta.get('from_type'), rule_meta.get('to_type'), saved_metric_range)
|
||||
- self.rule_metas.setdefault((rule_meta.get("from_type"), rule_meta.get("to_type")), saved_rule_meta)
|
||||
-
|
||||
+ self.load_rule_meta_from_dict(data)
|
||||
return True
|
||||
|
||||
+ def create_default_rule_meta(self, from_type, to_type):
|
||||
+ return RuleMeta(
|
||||
+ from_type,
|
||||
+ to_type,
|
||||
+ self.metric_categories.get(from_type),
|
||||
+ self.metric_categories.get(to_type),
|
||||
+ [MetricCategoryPair(METRIC_CATEGORY_ALL, METRIC_CATEGORY_ALL)]
|
||||
+ )
|
||||
+
|
||||
def add_rule_meta(self, causal_graph):
|
||||
entity_cause_graph = causal_graph.entity_cause_graph
|
||||
for edge in entity_cause_graph.edges:
|
||||
from_type = entity_cause_graph.nodes[edge[0]].get('type')
|
||||
to_type = entity_cause_graph.nodes[edge[1]].get('type')
|
||||
- entity_cause_graph.edges[edge]["rule_meta"] = self.rule_metas.get((from_type, to_type))
|
||||
+ rule_meta = self.rule_metas.get((from_type, to_type))
|
||||
+ if not rule_meta:
|
||||
+ rule_meta = self.create_default_rule_meta(from_type, to_type)
|
||||
+ entity_cause_graph.edges[edge]["rule_meta"] = rule_meta
|
||||
+
|
||||
+ def load_rule_meta_from_dict(self, data: dict):
|
||||
+ self.load_metric_categories(data.get('metric_categories', {}))
|
||||
+ self.load_infer_rules(data.get("infer_rules", []))
|
||||
+
|
||||
+ def load_metric_categories(self, metric_categories: dict):
|
||||
+ for entity_type, categories in metric_categories.items():
|
||||
+ category_dict = {}
|
||||
+ for category in categories:
|
||||
+ category_dict.setdefault(category.get('category'), category.get('metrics'))
|
||||
+ self.metric_categories.setdefault(entity_type, category_dict)
|
||||
+
|
||||
+ def load_infer_rules(self, infer_rules: list):
|
||||
+ for rule_meta in infer_rules:
|
||||
+ from_entity_type = rule_meta.get('from_type')
|
||||
+ to_entity_type = rule_meta.get('to_type')
|
||||
+ saved_metric_range = []
|
||||
+ for item in rule_meta.get("metric_range", []):
|
||||
+ from_category = item.get('from')
|
||||
+ to_category = item.get('to')
|
||||
+ saved_metric_range.append(MetricCategoryPair(from_category, to_category))
|
||||
+ saved_rule_meta = RuleMeta(from_entity_type, to_entity_type, self.metric_categories.get(from_entity_type),
|
||||
+ self.metric_categories.get(to_entity_type), saved_metric_range)
|
||||
+ self.rule_metas.setdefault((from_entity_type, to_entity_type), saved_rule_meta)
|
||||
|
||||
|
||||
rule_engine = RuleEngine()
|
||||
diff --git a/config/infer-rule.yaml b/config/infer-rule.yaml
|
||||
index c3bc3f3..d287972 100644
|
||||
--- a/config/infer-rule.yaml
|
||||
+++ b/config/infer-rule.yaml
|
||||
@@ -1,32 +1,98 @@
|
||||
+metric_categories:
|
||||
+ proc:
|
||||
+ -
|
||||
+ category: PROC_CPU
|
||||
+ metrics:
|
||||
+ - gala_gopher_proc_utime_jiffies
|
||||
+ - gala_gopher_proc_stime_jiffies
|
||||
+ -
|
||||
+ category: PROC_IO_LOAD
|
||||
+ metrics:
|
||||
+ - gala_gopher_proc_read_bytes
|
||||
+ - gala_gopher_proc_write_bytes
|
||||
+ - gala_gopher_proc_less_4k_io_read
|
||||
+ - gala_gopher_proc_less_4k_io_write
|
||||
+ - gala_gopher_proc_greater_4k_io_read
|
||||
+ - gala_gopher_proc_greater_4k_io_write
|
||||
+ disk:
|
||||
+ -
|
||||
+ category: DISK_LOAD
|
||||
+ metrics:
|
||||
+ - gala_gopher_disk_rspeed_kB
|
||||
+ - gala_gopher_disk_wspeed_kB
|
||||
+ - gala_gopher_disk_rspeed
|
||||
+ - gala_gopher_disk_wspeed
|
||||
+ -
|
||||
+ category: DISK_DELAY
|
||||
+ metrics:
|
||||
+ - gala_gopher_disk_r_await
|
||||
+ - gala_gopher_disk_w_await
|
||||
+ - gala_gopher_disk_rareq
|
||||
+ - gala_gopher_disk_wareq
|
||||
+ block:
|
||||
+ -
|
||||
+ category: BLOCK_DELAY
|
||||
+ metrics:
|
||||
+ - gala_gopher_block_latency_req_max
|
||||
+ - gala_gopher_block_latency_req_last
|
||||
+ - gala_gopher_block_latency_req_sum
|
||||
+ - gala_gopher_block_latency_req_jitter
|
||||
+ nic:
|
||||
+ -
|
||||
+ category: NIC_DROP
|
||||
+ metrics:
|
||||
+ - gala_gopher_nic_tc_sent_drop
|
||||
+ - gala_gopher_nic_tx_dropped
|
||||
+ - gala_gopher_nic_rx_dropped
|
||||
+ cpu:
|
||||
+ -
|
||||
+ category: CPU_TOTAL
|
||||
+ metrics:
|
||||
+ - gala_gopher_cpu_total_used_per
|
||||
+
|
||||
infer_rules:
|
||||
-
|
||||
from_type: cpu
|
||||
to_type: proc
|
||||
metric_range:
|
||||
-
|
||||
- from: []
|
||||
- to:
|
||||
- - gala_gopher_proc_utime_jiffies
|
||||
- - gala_gopher_proc_stime_jiffies
|
||||
+ from: CPU_TOTAL
|
||||
+ to: PROC_CPU
|
||||
-
|
||||
from_type: block
|
||||
to_type: proc
|
||||
metric_range:
|
||||
-
|
||||
- from: []
|
||||
- to:
|
||||
- - gala_gopher_proc_ns_ext4_read
|
||||
- - gala_gopher_proc_ns_ext4_write
|
||||
- - gala_gopher_proc_ns_ext4_flush
|
||||
- - gala_gopher_proc_ns_overlay_read
|
||||
- - gala_gopher_proc_ns_overlay_write
|
||||
- - gala_gopher_proc_ns_overlay_flush
|
||||
+ from: BLOCK_DELAY
|
||||
+ to: VIRTUAL
|
||||
-
|
||||
from_type: proc
|
||||
to_type: disk
|
||||
metric_range:
|
||||
-
|
||||
- from:
|
||||
- - gala_gopher_proc_read_bytes
|
||||
- - gala_gopher_proc_write_bytes
|
||||
- to: []
|
||||
\ No newline at end of file
|
||||
+ from: PROC_IO_LOAD
|
||||
+ to: ALL
|
||||
+ -
|
||||
+ from_type: disk
|
||||
+ to_type: block
|
||||
+ metric_range:
|
||||
+ -
|
||||
+ from: DISK_DELAY
|
||||
+ to: BLOCK_DELAY
|
||||
+ -
|
||||
+ from_type: proc
|
||||
+ to_type: sli
|
||||
+ metric_range:
|
||||
+ -
|
||||
+ from: VIRTUAL
|
||||
+ to: ALL
|
||||
+ -
|
||||
+ from: PROC_CPU
|
||||
+ to: ALL
|
||||
+ -
|
||||
+ from_type: tcp_link
|
||||
+ to_type: proc
|
||||
+ metric_range:
|
||||
+ -
|
||||
+ from: ALL
|
||||
+ to: VIRTUAL
|
||||
\ No newline at end of file
|
||||
--
|
||||
2.21.0.windows.1
|
||||
|
||||
@ -1,108 +0,0 @@
|
||||
From 8a89c59b0c6194afdd6ed1c9bbd949b2cc62aeb0 Mon Sep 17 00:00:00 2001
|
||||
From: algorithmofdish <hexiujun1@huawei.com>
|
||||
Date: Tue, 29 Nov 2022 17:07:12 +0800
|
||||
Subject: [PATCH] refactor(infer): adaptation for abnormal event output change
|
||||
|
||||
---
|
||||
cause_inference/__main__.py | 25 +++++++++++++++++++++----
|
||||
cause_inference/cause_infer.py | 21 ---------------------
|
||||
2 files changed, 21 insertions(+), 25 deletions(-)
|
||||
|
||||
diff --git a/cause_inference/__main__.py b/cause_inference/__main__.py
|
||||
index 093f7ac..d586b1f 100644
|
||||
--- a/cause_inference/__main__.py
|
||||
+++ b/cause_inference/__main__.py
|
||||
@@ -14,7 +14,6 @@ from cause_inference.config import infer_config
|
||||
from cause_inference.config import init_infer_config
|
||||
from cause_inference.model import AbnormalEvent
|
||||
from cause_inference.cause_infer import cause_locating
|
||||
-from cause_inference.cause_infer import parse_abn_evt
|
||||
from cause_inference.cause_infer import preprocess_abn_score
|
||||
from cause_inference.rule_parser import rule_engine
|
||||
from cause_inference.exceptions import InferenceException
|
||||
@@ -164,16 +163,34 @@ def init_obsv_meta_coll_thd():
|
||||
return obsv_meta_coll_thread
|
||||
|
||||
|
||||
+def parse_abn_evt(data) -> AbnormalEvent:
|
||||
+ resource = data.get('Resource', {})
|
||||
+ attrs = data.get('Attributes', {})
|
||||
+ if not resource.get('metric'):
|
||||
+ raise DataParseException('Attribute "Resource.metric" required in abnormal event')
|
||||
+ if not attrs.get('entity_id') and not resource.get('labels'):
|
||||
+ raise DataParseException('metric_label or entity_id info need in abnormal event')
|
||||
+ abn_evt = AbnormalEvent(
|
||||
+ timestamp=data.get('Timestamp'),
|
||||
+ abnormal_metric_id=resource.get('metric'),
|
||||
+ abnormal_score=preprocess_abn_score(resource.get('score', 0.0)),
|
||||
+ metric_labels=resource.get('labels'),
|
||||
+ abnormal_entity_id=attrs.get('entity_id'),
|
||||
+ desc=resource.get('description', '') or data.get('Body', '')
|
||||
+ )
|
||||
+ return abn_evt
|
||||
+
|
||||
+
|
||||
def get_recommend_metric_evts(abn_kpi_data: dict) -> List[AbnormalEvent]:
|
||||
metric_evts = []
|
||||
obsv_meta_mgt = ObserveMetaMgt()
|
||||
- recommend_metrics = abn_kpi_data.get('Resource', {}).get('recommend_metrics', {})
|
||||
+ recommend_metrics = abn_kpi_data.get('Resource', {}).get('cause_metrics', {})
|
||||
for metric_data in recommend_metrics:
|
||||
metric_evt = AbnormalEvent(
|
||||
timestamp=abn_kpi_data.get('Timestamp'),
|
||||
abnormal_metric_id=metric_data.get('metric', ''),
|
||||
- abnormal_score=preprocess_abn_score(metric_data.get('score')),
|
||||
- metric_labels=metric_data.get('label', {}),
|
||||
+ abnormal_score=preprocess_abn_score(metric_data.get('score', 0.0)),
|
||||
+ metric_labels=metric_data.get('labels', {}),
|
||||
desc=metric_data.get('description', '')
|
||||
)
|
||||
if not metric_evt.update_entity_id(obsv_meta_mgt):
|
||||
diff --git a/cause_inference/cause_infer.py b/cause_inference/cause_infer.py
|
||||
index dff26d0..6873c1e 100644
|
||||
--- a/cause_inference/cause_infer.py
|
||||
+++ b/cause_inference/cause_infer.py
|
||||
@@ -1,7 +1,5 @@
|
||||
from typing import List
|
||||
|
||||
-from scipy.special import expit
|
||||
-
|
||||
from cause_inference.model import Cause
|
||||
from cause_inference.model import CausalGraph
|
||||
from cause_inference.model import AbnormalEvent
|
||||
@@ -12,7 +10,6 @@ from spider.collector.data_collector import DataCollector
|
||||
from spider.collector.prometheus_collector import PrometheusCollector
|
||||
from cause_inference.exceptions import InferenceException
|
||||
from cause_inference.exceptions import DBException
|
||||
-from cause_inference.exceptions import DataParseException
|
||||
from cause_inference.config import infer_config
|
||||
from cause_inference.rule_parser import rule_engine
|
||||
from cause_inference.arangodb import connect_to_arangodb
|
||||
@@ -68,24 +65,6 @@ def query_abnormal_topo_subgraph(abnormal_event: AbnormalEvent):
|
||||
return subgraph
|
||||
|
||||
|
||||
-def parse_abn_evt(data) -> AbnormalEvent:
|
||||
- resource = data.get('Resource', {})
|
||||
- attrs = data.get('Attributes', {})
|
||||
- if not resource.get('metrics'):
|
||||
- raise DataParseException('Atribute "Resource.metrics" required in abnormal event')
|
||||
- if not attrs.get('entity_id') and not resource.get('metric_label'):
|
||||
- raise DataParseException('metric_label or entity_id info need in abnormal event')
|
||||
- abn_evt = AbnormalEvent(
|
||||
- timestamp=data.get('Timestamp'),
|
||||
- abnormal_metric_id=resource.get('metrics'),
|
||||
- abnormal_score=1.0,
|
||||
- metric_labels=resource.get('metric_label'),
|
||||
- abnormal_entity_id=attrs.get('entity_id'),
|
||||
- desc=resource.get('description', '') or data.get('Body', '')
|
||||
- )
|
||||
- return abn_evt
|
||||
-
|
||||
-
|
||||
def parse_entity_id(orig_entity_id: str) -> str:
|
||||
fs_idx = orig_entity_id.index('/')
|
||||
return orig_entity_id[fs_idx+1:]
|
||||
--
|
||||
2.21.0.windows.1
|
||||
|
||||
Binary file not shown.
BIN
gala-spider-1.0.1.tar.gz
Normal file
BIN
gala-spider-1.0.1.tar.gz
Normal file
Binary file not shown.
@ -1,8 +1,8 @@
|
||||
%define debug_package %{nil}
|
||||
|
||||
Name: gala-spider
|
||||
Version: 1.0.0
|
||||
Release: 6
|
||||
Version: 1.0.1
|
||||
Release: 1
|
||||
Summary: OS topological graph storage service and cause inference service for gala-ops project
|
||||
License: MulanPSL2
|
||||
URL: https://gitee.com/openeuler/gala-spider
|
||||
@ -11,9 +11,6 @@ Source0: %{name}-%{version}.tar.gz
|
||||
BuildRequires: python3-setuptools systemd
|
||||
Requires: python3-%{name} = %{version}-%{release}
|
||||
|
||||
patch0: 0001-cause-inference-optimization.patch
|
||||
patch1: 0002-adaptation-for-abnormal-event-output-change.patch
|
||||
|
||||
|
||||
%description
|
||||
OS topological graph storage service for gala-ops project
|
||||
@ -114,6 +111,7 @@ fi
|
||||
%config(noreplace) %{_sysconfdir}/gala-inference/gala-inference.yaml
|
||||
%config(noreplace) %{_sysconfdir}/gala-inference/ext-observe-meta.yaml
|
||||
%config(noreplace) %{_sysconfdir}/gala-inference/infer-rule.yaml
|
||||
%config(noreplace) %{_sysconfdir}/gala-inference/cause-keyword.yaml
|
||||
%{_bindir}/gala-inference
|
||||
%{_unitdir}/gala-inference.service
|
||||
|
||||
@ -124,6 +122,9 @@ fi
|
||||
|
||||
|
||||
%changelog
|
||||
* Wed Dec 14 2022 algorithmofdish <hexiujun1@huawei.com> - 1.0.1-1
|
||||
- Update to 1.0.1: support cross host cause location
|
||||
|
||||
* Sat Dec 10 2022 algorithmofdish <hexiujun1@huawei.com> - 1.0.0-6
|
||||
- Adaptation for abnormal event output change
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user