cause inference optimization
This commit is contained in:
parent
1161d56324
commit
ccc0e7ad28
679
0001-cause-inference-optimization.patch
Normal file
679
0001-cause-inference-optimization.patch
Normal file
@ -0,0 +1,679 @@
|
|||||||
|
From ac48cfdcac8c93ba59754899237a37dac40659ac Mon Sep 17 00:00:00 2001
|
||||||
|
From: algorithmofdish <hexiujun1@huawei.com>
|
||||||
|
Date: Fri, 18 Nov 2022 11:07:47 +0800
|
||||||
|
Subject: [PATCH] feat(infer): cause inference optimization
|
||||||
|
|
||||||
|
1. categorize metrics and select one as cause with max abnormal score;
|
||||||
|
2. add virtual metric to complete the fault propagation path;
|
||||||
|
3. use abnormal score without nomarlization;
|
||||||
|
4. fix a bug where subgraph are incompletely obtained.
|
||||||
|
---
|
||||||
|
cause_inference/__main__.py | 4 +-
|
||||||
|
cause_inference/arangodb.py | 24 +++--
|
||||||
|
cause_inference/cause_infer.py | 11 +--
|
||||||
|
cause_inference/infer_policy.py | 84 ++++++++++++++----
|
||||||
|
cause_inference/model.py | 64 +++++++++++---
|
||||||
|
cause_inference/rule_parser.py | 149 ++++++++++++++++++++++++--------
|
||||||
|
config/infer-rule.yaml | 98 +++++++++++++++++----
|
||||||
|
7 files changed, 337 insertions(+), 97 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/cause_inference/__main__.py b/cause_inference/__main__.py
|
||||||
|
index baddbe4..093f7ac 100644
|
||||||
|
--- a/cause_inference/__main__.py
|
||||||
|
+++ b/cause_inference/__main__.py
|
||||||
|
@@ -15,7 +15,7 @@ from cause_inference.config import init_infer_config
|
||||||
|
from cause_inference.model import AbnormalEvent
|
||||||
|
from cause_inference.cause_infer import cause_locating
|
||||||
|
from cause_inference.cause_infer import parse_abn_evt
|
||||||
|
-from cause_inference.cause_infer import normalize_abn_score
|
||||||
|
+from cause_inference.cause_infer import preprocess_abn_score
|
||||||
|
from cause_inference.rule_parser import rule_engine
|
||||||
|
from cause_inference.exceptions import InferenceException
|
||||||
|
from cause_inference.exceptions import DataParseException
|
||||||
|
@@ -172,7 +172,7 @@ def get_recommend_metric_evts(abn_kpi_data: dict) -> List[AbnormalEvent]:
|
||||||
|
metric_evt = AbnormalEvent(
|
||||||
|
timestamp=abn_kpi_data.get('Timestamp'),
|
||||||
|
abnormal_metric_id=metric_data.get('metric', ''),
|
||||||
|
- abnormal_score=normalize_abn_score(metric_data.get('score')),
|
||||||
|
+ abnormal_score=preprocess_abn_score(metric_data.get('score')),
|
||||||
|
metric_labels=metric_data.get('label', {}),
|
||||||
|
desc=metric_data.get('description', '')
|
||||||
|
)
|
||||||
|
diff --git a/cause_inference/arangodb.py b/cause_inference/arangodb.py
|
||||||
|
index 424e500..d7982b7 100644
|
||||||
|
--- a/cause_inference/arangodb.py
|
||||||
|
+++ b/cause_inference/arangodb.py
|
||||||
|
@@ -22,6 +22,14 @@ def connect_to_arangodb(arango_url, db_name):
|
||||||
|
return conn.databases[db_name]
|
||||||
|
|
||||||
|
|
||||||
|
+def query_all(db, aql_query, bind_vars=None, raw_results=True):
|
||||||
|
+ res = []
|
||||||
|
+ query_hdl = db.AQLQuery(aql_query, bindVars=bind_vars, rawResults=raw_results)
|
||||||
|
+ for item in query_hdl:
|
||||||
|
+ res.append(item)
|
||||||
|
+ return res
|
||||||
|
+
|
||||||
|
+
|
||||||
|
def query_recent_topo_ts(db: Database, ts):
|
||||||
|
bind_vars = {'@collection': _TIMESTAMP_COLL_NAME, 'ts': ts}
|
||||||
|
aql_query = '''
|
||||||
|
@@ -32,12 +40,12 @@ def query_recent_topo_ts(db: Database, ts):
|
||||||
|
RETURN t._key
|
||||||
|
'''
|
||||||
|
try:
|
||||||
|
- query_res = db.AQLQuery(aql_query, bindVars=bind_vars, rawResults=True).response
|
||||||
|
+ query_res = query_all(db, aql_query, bind_vars)
|
||||||
|
except AQLQueryError as ex:
|
||||||
|
raise DBException(ex) from ex
|
||||||
|
- if query_res.get('error') or not query_res.get('result'):
|
||||||
|
+ if len(query_res) == 0:
|
||||||
|
raise DBException('Can not find topological graph at the abnormal timestamp {}'.format(ts))
|
||||||
|
- last_ts = query_res.get('result')[0]
|
||||||
|
+ last_ts = query_res[0]
|
||||||
|
return int(last_ts)
|
||||||
|
|
||||||
|
|
||||||
|
@@ -56,12 +64,12 @@ def query_topo_entities(db: Database, ts, query_options=None):
|
||||||
|
return v
|
||||||
|
'''.format(filter_str)
|
||||||
|
try:
|
||||||
|
- query_res = db.AQLQuery(aql_query, bindVars=bind_vars, rawResults=True).response
|
||||||
|
+ query_res = query_all(db, aql_query, bind_vars)
|
||||||
|
except AQLQueryError as ex:
|
||||||
|
raise DBException(ex) from ex
|
||||||
|
- if query_res.get('error') or not query_res.get('result'):
|
||||||
|
+ if len(query_res) == 0:
|
||||||
|
raise DBException('Can not find observe entities satisfied.')
|
||||||
|
- return query_res.get('result')
|
||||||
|
+ return query_res
|
||||||
|
|
||||||
|
|
||||||
|
def query_subgraph(db, ts, start_node_id, edge_collection, depth=1):
|
||||||
|
@@ -83,12 +91,12 @@ def query_subgraph(db, ts, start_node_id, edge_collection, depth=1):
|
||||||
|
return {{"vertex": v, "edge": e}}
|
||||||
|
'''.format(edge_coll_str)
|
||||||
|
try:
|
||||||
|
- query_res = db.AQLQuery(aql_query, bindVars=bind_vars, rawResults=True).response
|
||||||
|
+ query_res = query_all(db, aql_query, bind_vars)
|
||||||
|
except AQLQueryError as ex:
|
||||||
|
raise DBException(ex) from ex
|
||||||
|
vertices = {}
|
||||||
|
edges = {}
|
||||||
|
- for item in query_res.get('result'):
|
||||||
|
+ for item in query_res:
|
||||||
|
vertex = item.get('vertex')
|
||||||
|
edge = item.get('edge')
|
||||||
|
vertices.setdefault(vertex.get('_id'), vertex)
|
||||||
|
diff --git a/cause_inference/cause_infer.py b/cause_inference/cause_infer.py
|
||||||
|
index 1954abf..dff26d0 100644
|
||||||
|
--- a/cause_inference/cause_infer.py
|
||||||
|
+++ b/cause_inference/cause_infer.py
|
||||||
|
@@ -23,8 +23,8 @@ from cause_inference.infer_policy import InferPolicy
|
||||||
|
from cause_inference.infer_policy import get_infer_policy
|
||||||
|
|
||||||
|
|
||||||
|
-def normalize_abn_score(score):
|
||||||
|
- return expit(score)
|
||||||
|
+def preprocess_abn_score(score):
|
||||||
|
+ return max(0, score)
|
||||||
|
|
||||||
|
|
||||||
|
# 因果推理
|
||||||
|
@@ -172,9 +172,10 @@ def cause_locating(abnormal_kpi: AbnormalEvent, abnormal_metrics: List[AbnormalE
|
||||||
|
# 5. 生成异常指标之间的因果图
|
||||||
|
causal_graph.init_metric_cause_graph()
|
||||||
|
|
||||||
|
- logger.logger.debug("Causal graph nodes are: {}".format(causal_graph.entity_cause_graph.nodes))
|
||||||
|
- logger.logger.debug("Causal graph predecessors: {}".format(causal_graph.entity_cause_graph.pred))
|
||||||
|
- logger.logger.debug("Causal graph successors: {}".format(causal_graph.entity_cause_graph.succ))
|
||||||
|
+ logger.logger.debug("Entity cause graph nodes are: {}".format(causal_graph.entity_cause_graph.nodes))
|
||||||
|
+ logger.logger.debug("Entity cause graph edges are: {}".format(causal_graph.entity_cause_graph.edges))
|
||||||
|
+ logger.logger.debug("Metric cause graph nodes are: {}".format(causal_graph.metric_cause_graph.nodes))
|
||||||
|
+ logger.logger.debug("Metric cause graph edges are: {}".format(causal_graph.metric_cause_graph.edges))
|
||||||
|
|
||||||
|
# 6. 以故障传播图 + 异常KPI为输入,执行根因推导算法,输出 topK 根因指标
|
||||||
|
infer_policy = get_infer_policy(infer_config.infer_conf.get('infer_policy'))
|
||||||
|
diff --git a/cause_inference/infer_policy.py b/cause_inference/infer_policy.py
|
||||||
|
index c0952e7..0663e74 100644
|
||||||
|
--- a/cause_inference/infer_policy.py
|
||||||
|
+++ b/cause_inference/infer_policy.py
|
||||||
|
@@ -5,9 +5,10 @@ from typing import List
|
||||||
|
|
||||||
|
import networkx as nx
|
||||||
|
|
||||||
|
-from cause_inference.cause_infer import CausalGraph
|
||||||
|
-from cause_inference.cause_infer import Cause
|
||||||
|
+from cause_inference.model import CausalGraph
|
||||||
|
+from cause_inference.model import Cause
|
||||||
|
from cause_inference.exceptions import InferenceException
|
||||||
|
+from cause_inference import rule_parser
|
||||||
|
|
||||||
|
|
||||||
|
class InferPolicy(ABC):
|
||||||
|
@@ -114,22 +115,28 @@ class DfsPolicy(InferPolicy):
|
||||||
|
if length < 1:
|
||||||
|
return 0.0
|
||||||
|
total_score = 0.0
|
||||||
|
+ num_of_valid_node = 0
|
||||||
|
for node in path[:length]:
|
||||||
|
+ if is_virtual_node(node):
|
||||||
|
+ continue
|
||||||
|
total_score += cause_graph.nodes[node].get('abnormal_score', 0)
|
||||||
|
- if length != 0:
|
||||||
|
- total_score /= length
|
||||||
|
+ num_of_valid_node += 1
|
||||||
|
+ if num_of_valid_node != 0:
|
||||||
|
+ total_score /= num_of_valid_node
|
||||||
|
return total_score
|
||||||
|
|
||||||
|
- def infer(self, causal_graph: CausalGraph, top_k: int) -> List[Cause]:
|
||||||
|
- cause_graph = causal_graph.metric_cause_graph
|
||||||
|
- abn_node_id = (causal_graph.entity_id_of_abn_kpi, causal_graph.abnormal_kpi.abnormal_metric_id)
|
||||||
|
-
|
||||||
|
- reverse_graph = nx.DiGraph()
|
||||||
|
- reverse_graph.add_nodes_from(cause_graph.nodes)
|
||||||
|
+ @staticmethod
|
||||||
|
+ def reverse_graph(cause_graph):
|
||||||
|
+ reversed_graph = nx.DiGraph()
|
||||||
|
+ reversed_graph.add_nodes_from(cause_graph.nodes)
|
||||||
|
for from_, to in cause_graph.edges:
|
||||||
|
- reverse_graph.add_edge(to, from_)
|
||||||
|
+ reversed_graph.add_edge(to, from_)
|
||||||
|
+ return reversed_graph
|
||||||
|
|
||||||
|
- successors = nx.dfs_successors(reverse_graph, abn_node_id)
|
||||||
|
+ @staticmethod
|
||||||
|
+ def get_all_paths_to_abn_node(abn_node_id, cause_graph):
|
||||||
|
+ reversed_graph = DfsPolicy.reverse_graph(cause_graph)
|
||||||
|
+ successors = nx.dfs_successors(reversed_graph, abn_node_id)
|
||||||
|
paths = []
|
||||||
|
path = []
|
||||||
|
|
||||||
|
@@ -145,24 +152,63 @@ class DfsPolicy(InferPolicy):
|
||||||
|
|
||||||
|
path.append(abn_node_id)
|
||||||
|
dfs_path(abn_node_id)
|
||||||
|
+ return paths
|
||||||
|
|
||||||
|
+ @staticmethod
|
||||||
|
+ def get_scored_paths(cause_graph, paths) -> list:
|
||||||
|
scored_paths = []
|
||||||
|
- for p in paths:
|
||||||
|
+ for path in paths:
|
||||||
|
scored_paths.append({
|
||||||
|
- 'score': self.calc_path_score(p, cause_graph),
|
||||||
|
- 'path': p
|
||||||
|
+ 'score': DfsPolicy.calc_path_score(path, cause_graph),
|
||||||
|
+ 'path': path
|
||||||
|
})
|
||||||
|
- scored_paths = sorted(scored_paths, key=lambda k: k['score'], reverse=True)
|
||||||
|
- scored_paths = scored_paths[:top_k]
|
||||||
|
+ return scored_paths
|
||||||
|
+
|
||||||
|
+ @staticmethod
|
||||||
|
+ def get_top_paths(scored_paths, top_k) -> list:
|
||||||
|
+ top_paths = []
|
||||||
|
+ node_selected = set()
|
||||||
|
+ metric_selected = set()
|
||||||
|
+ for path in scored_paths:
|
||||||
|
+ if len(top_paths) == top_k:
|
||||||
|
+ break
|
||||||
|
+ cause_node_id = path.get('path')[0]
|
||||||
|
+ if cause_node_id in node_selected:
|
||||||
|
+ continue
|
||||||
|
+ if cause_node_id[1] in metric_selected:
|
||||||
|
+ continue
|
||||||
|
+ if is_virtual_node(cause_node_id):
|
||||||
|
+ continue
|
||||||
|
+ node_selected.add(cause_node_id)
|
||||||
|
+ metric_selected.add(cause_node_id[1])
|
||||||
|
+ top_paths.append(path)
|
||||||
|
+
|
||||||
|
+ return top_paths
|
||||||
|
|
||||||
|
+ @staticmethod
|
||||||
|
+ def get_top_causes(top_paths) -> List[Cause]:
|
||||||
|
res = []
|
||||||
|
- for item in scored_paths:
|
||||||
|
+ for item in top_paths:
|
||||||
|
cause_node_id = item.get('path')[0]
|
||||||
|
cause = Cause(cause_node_id[1], cause_node_id[0], item.get('score'), item.get('path'))
|
||||||
|
res.append(cause)
|
||||||
|
-
|
||||||
|
return res
|
||||||
|
|
||||||
|
+ def infer(self, causal_graph: CausalGraph, top_k: int) -> List[Cause]:
|
||||||
|
+ cause_graph = causal_graph.metric_cause_graph
|
||||||
|
+ abn_node_id = (causal_graph.entity_id_of_abn_kpi, causal_graph.abnormal_kpi.abnormal_metric_id)
|
||||||
|
+
|
||||||
|
+ paths = self.get_all_paths_to_abn_node(abn_node_id, cause_graph)
|
||||||
|
+ scored_paths = self.get_scored_paths(cause_graph, paths)
|
||||||
|
+ scored_paths = sorted(scored_paths, key=lambda k: k['score'], reverse=True)
|
||||||
|
+ top_paths = self.get_top_paths(scored_paths, top_k)
|
||||||
|
+
|
||||||
|
+ return self.get_top_causes(top_paths)
|
||||||
|
+
|
||||||
|
+
|
||||||
|
+def is_virtual_node(node_id) -> bool:
|
||||||
|
+ return rule_parser.is_virtual_metric(node_id[1])
|
||||||
|
+
|
||||||
|
|
||||||
|
def get_infer_policy(policy: str, **options) -> InferPolicy:
|
||||||
|
if policy == 'dfs':
|
||||||
|
diff --git a/cause_inference/model.py b/cause_inference/model.py
|
||||||
|
index eb473da..136e4c3 100644
|
||||||
|
--- a/cause_inference/model.py
|
||||||
|
+++ b/cause_inference/model.py
|
||||||
|
@@ -7,6 +7,7 @@ from spider.conf.observe_meta import ObserveMetaMgt
|
||||||
|
from spider.util import logger
|
||||||
|
from spider.util.entity import concate_entity_id
|
||||||
|
from spider.util.entity import escape_entity_id
|
||||||
|
+from cause_inference import rule_parser
|
||||||
|
|
||||||
|
|
||||||
|
class AbnormalEvent:
|
||||||
|
@@ -77,6 +78,10 @@ class CausalGraph:
|
||||||
|
self.metric_cause_graph = nx.DiGraph()
|
||||||
|
self.init_casual_graph()
|
||||||
|
|
||||||
|
+ @staticmethod
|
||||||
|
+ def is_virtual_metric_group(metric_group) -> bool:
|
||||||
|
+ return len(metric_group) == 1 and rule_parser.is_virtual_metric(metric_group[0])
|
||||||
|
+
|
||||||
|
def init_casual_graph(self):
|
||||||
|
for node_id, node_attrs in self.topo_nodes.items():
|
||||||
|
self.entity_cause_graph.add_node(node_id, **node_attrs)
|
||||||
|
@@ -134,19 +139,54 @@ class CausalGraph:
|
||||||
|
self.init_metric_edge(edge)
|
||||||
|
|
||||||
|
def init_metric_edge(self, entity_edge):
|
||||||
|
- from_entity_id = entity_edge[0]
|
||||||
|
- to_entity_id = entity_edge[1]
|
||||||
|
+ f_entity_id = entity_edge[0]
|
||||||
|
+ t_entity_id = entity_edge[1]
|
||||||
|
+ avail_relations = self.get_avail_metric_causal_relations(entity_edge)
|
||||||
|
+
|
||||||
|
+ unique = set()
|
||||||
|
+ for f_metric_group, t_metric_group in avail_relations:
|
||||||
|
+ if self.is_virtual_metric_group(f_metric_group):
|
||||||
|
+ self.add_virtual_metric_node(f_entity_id, f_metric_group[0])
|
||||||
|
+ if self.is_virtual_metric_group(t_metric_group):
|
||||||
|
+ self.add_virtual_metric_node(t_entity_id, t_metric_group[0])
|
||||||
|
+
|
||||||
|
+ f_metric_id = self.metric_with_largest_abn_score(f_metric_group, f_entity_id)
|
||||||
|
+ t_metric_id = self.metric_with_largest_abn_score(t_metric_group, t_entity_id)
|
||||||
|
+ if (f_metric_id, t_metric_id) not in unique:
|
||||||
|
+ self.metric_cause_graph.add_edge((f_entity_id, f_metric_id), (t_entity_id, t_metric_id))
|
||||||
|
+ unique.add((f_metric_id, t_metric_id))
|
||||||
|
+
|
||||||
|
+ def add_virtual_metric_node(self, entity_id, metric_id):
|
||||||
|
+ self.metric_cause_graph.add_node((entity_id, metric_id))
|
||||||
|
+
|
||||||
|
+ def get_avail_metric_causal_relations(self, entity_edge):
|
||||||
|
+ f_metric_ids = self.get_abn_metric_ids(entity_edge[0])
|
||||||
|
+ t_metric_ids = self.get_abn_metric_ids(entity_edge[1])
|
||||||
|
rule_meta = self.entity_cause_graph.edges[entity_edge].get('rule_meta')
|
||||||
|
- for from_metric_evt in self.get_abnormal_metrics_of_node(from_entity_id):
|
||||||
|
- for to_metric_evt in self.get_abnormal_metrics_of_node(to_entity_id):
|
||||||
|
- from_metric_id = from_metric_evt.abnormal_metric_id
|
||||||
|
- to_metric_id = to_metric_evt.abnormal_metric_id
|
||||||
|
- if rule_meta is not None and not rule_meta.check_metric_pair(from_metric_id, to_metric_id):
|
||||||
|
- continue
|
||||||
|
- self.metric_cause_graph.add_edge(
|
||||||
|
- (from_entity_id, from_metric_id),
|
||||||
|
- (to_entity_id, to_metric_id)
|
||||||
|
- )
|
||||||
|
+ return rule_meta.get_avail_causal_relations(f_metric_ids, t_metric_ids)
|
||||||
|
+
|
||||||
|
+ def get_abn_metric_ids(self, entity_id):
|
||||||
|
+ metric_evts = self.get_abnormal_metrics_of_node(entity_id)
|
||||||
|
+ return [evt.abnormal_metric_id for evt in metric_evts]
|
||||||
|
+
|
||||||
|
+ def metric_with_largest_abn_score(self, metric_group: list, entity_id) -> str:
|
||||||
|
+ if len(metric_group) == 1:
|
||||||
|
+ return metric_group[0]
|
||||||
|
+
|
||||||
|
+ metric_evt_map = {}
|
||||||
|
+ metric_evts = self.get_abnormal_metrics_of_node(entity_id)
|
||||||
|
+ for metric_evt in metric_evts:
|
||||||
|
+ metric_evt_map.setdefault(metric_evt.abnormal_metric_id, metric_evt)
|
||||||
|
+
|
||||||
|
+ metric_id_of_largest = metric_group[0]
|
||||||
|
+ largest_abn_score = metric_evt_map.get(metric_id_of_largest).abnormal_score
|
||||||
|
+ for metric_id in metric_group:
|
||||||
|
+ abn_score = metric_evt_map.get(metric_id).abnormal_score
|
||||||
|
+ if abn_score > largest_abn_score:
|
||||||
|
+ metric_id_of_largest = metric_id
|
||||||
|
+ largest_abn_score = abn_score
|
||||||
|
+
|
||||||
|
+ return metric_id_of_largest
|
||||||
|
|
||||||
|
|
||||||
|
class Cause:
|
||||||
|
diff --git a/cause_inference/rule_parser.py b/cause_inference/rule_parser.py
|
||||||
|
index f08ed3c..eb6227c 100644
|
||||||
|
--- a/cause_inference/rule_parser.py
|
||||||
|
+++ b/cause_inference/rule_parser.py
|
||||||
|
@@ -1,7 +1,7 @@
|
||||||
|
import os
|
||||||
|
from abc import ABCMeta
|
||||||
|
from abc import abstractmethod
|
||||||
|
-from typing import List
|
||||||
|
+from typing import List, Dict, Tuple
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
@@ -9,33 +9,86 @@ from spider.conf.observe_meta import RelationType
|
||||||
|
from spider.conf.observe_meta import EntityType
|
||||||
|
from spider.util import logger
|
||||||
|
|
||||||
|
+METRIC_CATEGORY_ALL = 'ALL'
|
||||||
|
+METRIC_CATEGORY_OTHER = 'OTHER'
|
||||||
|
+METRIC_CATEGORY_VIRTUAL = 'VIRTUAL'
|
||||||
|
+METRIC_ID_OF_CATEGORY_VIRTUAL = 'virtual_metric'
|
||||||
|
|
||||||
|
-class MetricPairSet:
|
||||||
|
- def __init__(self, from_: set, to_: set):
|
||||||
|
+
|
||||||
|
+def is_virtual_metric(metric_id: str) -> bool:
|
||||||
|
+ return metric_id == METRIC_ID_OF_CATEGORY_VIRTUAL
|
||||||
|
+
|
||||||
|
+
|
||||||
|
+class MetricCategoryPair:
|
||||||
|
+ def __init__(self, from_: str, to_: str):
|
||||||
|
self.from_ = from_
|
||||||
|
self.to_ = to_
|
||||||
|
|
||||||
|
- def check_metric_pair(self, from_metric_id: str, to_metric_id: str) -> bool:
|
||||||
|
- if self.from_ and from_metric_id not in self.from_:
|
||||||
|
- return False
|
||||||
|
- if self.to_ and to_metric_id not in self.to_:
|
||||||
|
- return False
|
||||||
|
- return True
|
||||||
|
-
|
||||||
|
|
||||||
|
class RuleMeta:
|
||||||
|
- def __init__(self, from_type, to_type, metric_range=None):
|
||||||
|
+ def __init__(self, from_type, to_type, from_categories=None, to_categories=None, metric_range=None):
|
||||||
|
self.from_type = from_type
|
||||||
|
self.to_type = to_type
|
||||||
|
- self.metric_range: List[MetricPairSet] = metric_range or []
|
||||||
|
-
|
||||||
|
- def check_metric_pair(self, from_metric_id: str, to_metric_id: str) -> bool:
|
||||||
|
- if not self.metric_range:
|
||||||
|
- return True
|
||||||
|
- for item in self.metric_range:
|
||||||
|
- if item.check_metric_pair(from_metric_id, to_metric_id):
|
||||||
|
- return True
|
||||||
|
- return False
|
||||||
|
+ self.from_categories = from_categories or {}
|
||||||
|
+ self.to_categories = to_categories or {}
|
||||||
|
+ self.category_pairs: List[MetricCategoryPair] = metric_range or []
|
||||||
|
+
|
||||||
|
+ @staticmethod
|
||||||
|
+ def aggregate_metric_from_groups(category_type, metric_groups) -> List[list]:
|
||||||
|
+ res = []
|
||||||
|
+ if category_type == METRIC_CATEGORY_ALL:
|
||||||
|
+ for cate_type, metric_group in metric_groups.items():
|
||||||
|
+ if cate_type == METRIC_CATEGORY_VIRTUAL:
|
||||||
|
+ continue
|
||||||
|
+ elif cate_type == METRIC_CATEGORY_OTHER:
|
||||||
|
+ res.extend([metric] for metric in metric_group)
|
||||||
|
+ else:
|
||||||
|
+ res.append(metric_group)
|
||||||
|
+ else:
|
||||||
|
+ metric_group = metric_groups.get(category_type)
|
||||||
|
+ if metric_group:
|
||||||
|
+ res.append(metric_group)
|
||||||
|
+
|
||||||
|
+ return res
|
||||||
|
+
|
||||||
|
+ @staticmethod
|
||||||
|
+ def _group_metric_by_category(metrics, categories) -> Dict[str, list]:
|
||||||
|
+ parts = {}
|
||||||
|
+ parted_metrics = set()
|
||||||
|
+ for cate_type, cate_metrics in categories.items():
|
||||||
|
+ part = []
|
||||||
|
+ for metric in metrics:
|
||||||
|
+ if metric in cate_metrics:
|
||||||
|
+ part.append(metric)
|
||||||
|
+ parted_metrics.add(metric)
|
||||||
|
+ if len(part) > 0:
|
||||||
|
+ parts.setdefault(cate_type, part)
|
||||||
|
+
|
||||||
|
+ other_part = []
|
||||||
|
+ for metric in metrics:
|
||||||
|
+ if metric not in parted_metrics:
|
||||||
|
+ other_part.append(metric)
|
||||||
|
+ if len(other_part) > 0:
|
||||||
|
+ parts.setdefault(METRIC_CATEGORY_OTHER, other_part)
|
||||||
|
+
|
||||||
|
+ virtual_part = [METRIC_ID_OF_CATEGORY_VIRTUAL]
|
||||||
|
+ parts.setdefault(METRIC_CATEGORY_VIRTUAL, virtual_part)
|
||||||
|
+
|
||||||
|
+ return parts
|
||||||
|
+
|
||||||
|
+ def get_avail_causal_relations(self, real_from_metrics, real_to_metrics) -> List[Tuple[list, list]]:
|
||||||
|
+ causal_relations = []
|
||||||
|
+
|
||||||
|
+ from_groups = self._group_metric_by_category(real_from_metrics, self.from_categories)
|
||||||
|
+ to_groups = self._group_metric_by_category(real_to_metrics, self.to_categories)
|
||||||
|
+ for cate_pair in self.category_pairs:
|
||||||
|
+ all_from_metrics = self.aggregate_metric_from_groups(cate_pair.from_, from_groups)
|
||||||
|
+ all_to_metrics = self.aggregate_metric_from_groups(cate_pair.to_, to_groups)
|
||||||
|
+ for from_metrics in all_from_metrics:
|
||||||
|
+ for to_metrics in all_to_metrics:
|
||||||
|
+ causal_relations.append((from_metrics, to_metrics))
|
||||||
|
+
|
||||||
|
+ return causal_relations
|
||||||
|
|
||||||
|
|
||||||
|
class Rule(metaclass=ABCMeta):
|
||||||
|
@@ -180,7 +233,8 @@ class NicRule1(Rule):
|
||||||
|
class RuleEngine:
|
||||||
|
def __init__(self):
|
||||||
|
self.rules: List[Rule] = []
|
||||||
|
- self.rule_metas = {}
|
||||||
|
+ self.metric_categories = {}
|
||||||
|
+ self.rule_metas: Dict[tuple, RuleMeta] = {}
|
||||||
|
|
||||||
|
def add_rule(self, rule: Rule):
|
||||||
|
self.rules.append(rule)
|
||||||
|
@@ -190,33 +244,58 @@ class RuleEngine:
|
||||||
|
rule.rule_parsing(causal_graph)
|
||||||
|
|
||||||
|
def load_rule_meta_from_yaml(self, rule_path: str) -> bool:
|
||||||
|
- abs_rule_path = os.path.abspath(rule_path)
|
||||||
|
- if not os.path.exists(abs_rule_path):
|
||||||
|
- logger.logger.warning("Rule meta path '{}' not exist", abs_rule_path)
|
||||||
|
- return True
|
||||||
|
try:
|
||||||
|
- with open(abs_rule_path, 'r') as file:
|
||||||
|
+ with open(os.path.abspath(rule_path), 'r') as file:
|
||||||
|
data = yaml.safe_load(file)
|
||||||
|
except IOError as ex:
|
||||||
|
logger.logger.warning(ex)
|
||||||
|
return False
|
||||||
|
|
||||||
|
- infer_rules = data.get("infer_rules", [])
|
||||||
|
- for rule_meta in infer_rules:
|
||||||
|
- saved_metric_range = []
|
||||||
|
- for item in rule_meta.get("metric_range", []):
|
||||||
|
- saved_metric_range.append(MetricPairSet(set(item.get('from', [])), set(item.get('to', []))))
|
||||||
|
- saved_rule_meta = RuleMeta(rule_meta.get('from_type'), rule_meta.get('to_type'), saved_metric_range)
|
||||||
|
- self.rule_metas.setdefault((rule_meta.get("from_type"), rule_meta.get("to_type")), saved_rule_meta)
|
||||||
|
-
|
||||||
|
+ self.load_rule_meta_from_dict(data)
|
||||||
|
return True
|
||||||
|
|
||||||
|
+ def create_default_rule_meta(self, from_type, to_type):
|
||||||
|
+ return RuleMeta(
|
||||||
|
+ from_type,
|
||||||
|
+ to_type,
|
||||||
|
+ self.metric_categories.get(from_type),
|
||||||
|
+ self.metric_categories.get(to_type),
|
||||||
|
+ [MetricCategoryPair(METRIC_CATEGORY_ALL, METRIC_CATEGORY_ALL)]
|
||||||
|
+ )
|
||||||
|
+
|
||||||
|
def add_rule_meta(self, causal_graph):
|
||||||
|
entity_cause_graph = causal_graph.entity_cause_graph
|
||||||
|
for edge in entity_cause_graph.edges:
|
||||||
|
from_type = entity_cause_graph.nodes[edge[0]].get('type')
|
||||||
|
to_type = entity_cause_graph.nodes[edge[1]].get('type')
|
||||||
|
- entity_cause_graph.edges[edge]["rule_meta"] = self.rule_metas.get((from_type, to_type))
|
||||||
|
+ rule_meta = self.rule_metas.get((from_type, to_type))
|
||||||
|
+ if not rule_meta:
|
||||||
|
+ rule_meta = self.create_default_rule_meta(from_type, to_type)
|
||||||
|
+ entity_cause_graph.edges[edge]["rule_meta"] = rule_meta
|
||||||
|
+
|
||||||
|
+ def load_rule_meta_from_dict(self, data: dict):
|
||||||
|
+ self.load_metric_categories(data.get('metric_categories', {}))
|
||||||
|
+ self.load_infer_rules(data.get("infer_rules", []))
|
||||||
|
+
|
||||||
|
+ def load_metric_categories(self, metric_categories: dict):
|
||||||
|
+ for entity_type, categories in metric_categories.items():
|
||||||
|
+ category_dict = {}
|
||||||
|
+ for category in categories:
|
||||||
|
+ category_dict.setdefault(category.get('category'), category.get('metrics'))
|
||||||
|
+ self.metric_categories.setdefault(entity_type, category_dict)
|
||||||
|
+
|
||||||
|
+ def load_infer_rules(self, infer_rules: list):
|
||||||
|
+ for rule_meta in infer_rules:
|
||||||
|
+ from_entity_type = rule_meta.get('from_type')
|
||||||
|
+ to_entity_type = rule_meta.get('to_type')
|
||||||
|
+ saved_metric_range = []
|
||||||
|
+ for item in rule_meta.get("metric_range", []):
|
||||||
|
+ from_category = item.get('from')
|
||||||
|
+ to_category = item.get('to')
|
||||||
|
+ saved_metric_range.append(MetricCategoryPair(from_category, to_category))
|
||||||
|
+ saved_rule_meta = RuleMeta(from_entity_type, to_entity_type, self.metric_categories.get(from_entity_type),
|
||||||
|
+ self.metric_categories.get(to_entity_type), saved_metric_range)
|
||||||
|
+ self.rule_metas.setdefault((from_entity_type, to_entity_type), saved_rule_meta)
|
||||||
|
|
||||||
|
|
||||||
|
rule_engine = RuleEngine()
|
||||||
|
diff --git a/config/infer-rule.yaml b/config/infer-rule.yaml
|
||||||
|
index c3bc3f3..d287972 100644
|
||||||
|
--- a/config/infer-rule.yaml
|
||||||
|
+++ b/config/infer-rule.yaml
|
||||||
|
@@ -1,32 +1,98 @@
|
||||||
|
+metric_categories:
|
||||||
|
+ proc:
|
||||||
|
+ -
|
||||||
|
+ category: PROC_CPU
|
||||||
|
+ metrics:
|
||||||
|
+ - gala_gopher_proc_utime_jiffies
|
||||||
|
+ - gala_gopher_proc_stime_jiffies
|
||||||
|
+ -
|
||||||
|
+ category: PROC_IO_LOAD
|
||||||
|
+ metrics:
|
||||||
|
+ - gala_gopher_proc_read_bytes
|
||||||
|
+ - gala_gopher_proc_write_bytes
|
||||||
|
+ - gala_gopher_proc_less_4k_io_read
|
||||||
|
+ - gala_gopher_proc_less_4k_io_write
|
||||||
|
+ - gala_gopher_proc_greater_4k_io_read
|
||||||
|
+ - gala_gopher_proc_greater_4k_io_write
|
||||||
|
+ disk:
|
||||||
|
+ -
|
||||||
|
+ category: DISK_LOAD
|
||||||
|
+ metrics:
|
||||||
|
+ - gala_gopher_disk_rspeed_kB
|
||||||
|
+ - gala_gopher_disk_wspeed_kB
|
||||||
|
+ - gala_gopher_disk_rspeed
|
||||||
|
+ - gala_gopher_disk_wspeed
|
||||||
|
+ -
|
||||||
|
+ category: DISK_DELAY
|
||||||
|
+ metrics:
|
||||||
|
+ - gala_gopher_disk_r_await
|
||||||
|
+ - gala_gopher_disk_w_await
|
||||||
|
+ - gala_gopher_disk_rareq
|
||||||
|
+ - gala_gopher_disk_wareq
|
||||||
|
+ block:
|
||||||
|
+ -
|
||||||
|
+ category: BLOCK_DELAY
|
||||||
|
+ metrics:
|
||||||
|
+ - gala_gopher_block_latency_req_max
|
||||||
|
+ - gala_gopher_block_latency_req_last
|
||||||
|
+ - gala_gopher_block_latency_req_sum
|
||||||
|
+ - gala_gopher_block_latency_req_jitter
|
||||||
|
+ nic:
|
||||||
|
+ -
|
||||||
|
+ category: NIC_DROP
|
||||||
|
+ metrics:
|
||||||
|
+ - gala_gopher_nic_tc_sent_drop
|
||||||
|
+ - gala_gopher_nic_tx_dropped
|
||||||
|
+ - gala_gopher_nic_rx_dropped
|
||||||
|
+ cpu:
|
||||||
|
+ -
|
||||||
|
+ category: CPU_TOTAL
|
||||||
|
+ metrics:
|
||||||
|
+ - gala_gopher_cpu_total_used_per
|
||||||
|
+
|
||||||
|
infer_rules:
|
||||||
|
-
|
||||||
|
from_type: cpu
|
||||||
|
to_type: proc
|
||||||
|
metric_range:
|
||||||
|
-
|
||||||
|
- from: []
|
||||||
|
- to:
|
||||||
|
- - gala_gopher_proc_utime_jiffies
|
||||||
|
- - gala_gopher_proc_stime_jiffies
|
||||||
|
+ from: CPU_TOTAL
|
||||||
|
+ to: PROC_CPU
|
||||||
|
-
|
||||||
|
from_type: block
|
||||||
|
to_type: proc
|
||||||
|
metric_range:
|
||||||
|
-
|
||||||
|
- from: []
|
||||||
|
- to:
|
||||||
|
- - gala_gopher_proc_ns_ext4_read
|
||||||
|
- - gala_gopher_proc_ns_ext4_write
|
||||||
|
- - gala_gopher_proc_ns_ext4_flush
|
||||||
|
- - gala_gopher_proc_ns_overlay_read
|
||||||
|
- - gala_gopher_proc_ns_overlay_write
|
||||||
|
- - gala_gopher_proc_ns_overlay_flush
|
||||||
|
+ from: BLOCK_DELAY
|
||||||
|
+ to: VIRTUAL
|
||||||
|
-
|
||||||
|
from_type: proc
|
||||||
|
to_type: disk
|
||||||
|
metric_range:
|
||||||
|
-
|
||||||
|
- from:
|
||||||
|
- - gala_gopher_proc_read_bytes
|
||||||
|
- - gala_gopher_proc_write_bytes
|
||||||
|
- to: []
|
||||||
|
\ No newline at end of file
|
||||||
|
+ from: PROC_IO_LOAD
|
||||||
|
+ to: ALL
|
||||||
|
+ -
|
||||||
|
+ from_type: disk
|
||||||
|
+ to_type: block
|
||||||
|
+ metric_range:
|
||||||
|
+ -
|
||||||
|
+ from: DISK_DELAY
|
||||||
|
+ to: BLOCK_DELAY
|
||||||
|
+ -
|
||||||
|
+ from_type: proc
|
||||||
|
+ to_type: sli
|
||||||
|
+ metric_range:
|
||||||
|
+ -
|
||||||
|
+ from: VIRTUAL
|
||||||
|
+ to: ALL
|
||||||
|
+ -
|
||||||
|
+ from: PROC_CPU
|
||||||
|
+ to: ALL
|
||||||
|
+ -
|
||||||
|
+ from_type: tcp_link
|
||||||
|
+ to_type: proc
|
||||||
|
+ metric_range:
|
||||||
|
+ -
|
||||||
|
+ from: ALL
|
||||||
|
+ to: VIRTUAL
|
||||||
|
\ No newline at end of file
|
||||||
|
--
|
||||||
|
2.21.0.windows.1
|
||||||
|
|
||||||
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
Name: gala-spider
|
Name: gala-spider
|
||||||
Version: 1.0.0
|
Version: 1.0.0
|
||||||
Release: 2
|
Release: 3
|
||||||
Summary: OS topological graph storage service and cause inference service for gala-ops project
|
Summary: OS topological graph storage service and cause inference service for gala-ops project
|
||||||
License: MulanPSL2
|
License: MulanPSL2
|
||||||
URL: https://gitee.com/openeuler/gala-spider
|
URL: https://gitee.com/openeuler/gala-spider
|
||||||
@ -11,6 +11,8 @@ Source0: %{name}-%{version}.tar.gz
|
|||||||
BuildRequires: python3-setuptools systemd
|
BuildRequires: python3-setuptools systemd
|
||||||
Requires: python3-%{name} = %{version}-%{release}
|
Requires: python3-%{name} = %{version}-%{release}
|
||||||
|
|
||||||
|
patch0: 0001-cause-inference-optimization.patch
|
||||||
|
|
||||||
|
|
||||||
%description
|
%description
|
||||||
OS topological graph storage service for gala-ops project
|
OS topological graph storage service for gala-ops project
|
||||||
@ -41,7 +43,7 @@ Python3 package of gala-inference
|
|||||||
|
|
||||||
|
|
||||||
%prep
|
%prep
|
||||||
%autosetup
|
%autosetup -p1
|
||||||
|
|
||||||
|
|
||||||
%build
|
%build
|
||||||
@ -113,6 +115,9 @@ fi
|
|||||||
|
|
||||||
|
|
||||||
%changelog
|
%changelog
|
||||||
|
* Tue Nov 22 2022 algorithmofdish <hexiujun1@huawei.com> - 1.0.0-3
|
||||||
|
- Cause inference optimization
|
||||||
|
|
||||||
* Mon Nov 14 2022 Zhen Chen <chenzhen126@huawei.com> - 1.0.0-2
|
* Mon Nov 14 2022 Zhen Chen <chenzhen126@huawei.com> - 1.0.0-2
|
||||||
- Update 1.0.0 tarball
|
- Update 1.0.0 tarball
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user