680 lines
26 KiB
Diff
680 lines
26 KiB
Diff
From ac48cfdcac8c93ba59754899237a37dac40659ac Mon Sep 17 00:00:00 2001
|
||
From: algorithmofdish <hexiujun1@huawei.com>
|
||
Date: Fri, 18 Nov 2022 11:07:47 +0800
|
||
Subject: [PATCH] feat(infer): cause inference optimization
|
||
|
||
1. categorize metrics and select one as cause with max abnormal score;
|
||
2. add virtual metric to complete the fault propagation path;
|
||
3. use abnormal score without normalization;
|
||
4. fix a bug where subgraphs are incompletely obtained.
|
||
---
|
||
cause_inference/__main__.py | 4 +-
|
||
cause_inference/arangodb.py | 24 +++--
|
||
cause_inference/cause_infer.py | 11 +--
|
||
cause_inference/infer_policy.py | 84 ++++++++++++++----
|
||
cause_inference/model.py | 64 +++++++++++---
|
||
cause_inference/rule_parser.py | 149 ++++++++++++++++++++++++--------
|
||
config/infer-rule.yaml | 98 +++++++++++++++++----
|
||
7 files changed, 337 insertions(+), 97 deletions(-)
|
||
|
||
diff --git a/cause_inference/__main__.py b/cause_inference/__main__.py
|
||
index baddbe4..093f7ac 100644
|
||
--- a/cause_inference/__main__.py
|
||
+++ b/cause_inference/__main__.py
|
||
@@ -15,7 +15,7 @@ from cause_inference.config import init_infer_config
|
||
from cause_inference.model import AbnormalEvent
|
||
from cause_inference.cause_infer import cause_locating
|
||
from cause_inference.cause_infer import parse_abn_evt
|
||
-from cause_inference.cause_infer import normalize_abn_score
|
||
+from cause_inference.cause_infer import preprocess_abn_score
|
||
from cause_inference.rule_parser import rule_engine
|
||
from cause_inference.exceptions import InferenceException
|
||
from cause_inference.exceptions import DataParseException
|
||
@@ -172,7 +172,7 @@ def get_recommend_metric_evts(abn_kpi_data: dict) -> List[AbnormalEvent]:
|
||
metric_evt = AbnormalEvent(
|
||
timestamp=abn_kpi_data.get('Timestamp'),
|
||
abnormal_metric_id=metric_data.get('metric', ''),
|
||
- abnormal_score=normalize_abn_score(metric_data.get('score')),
|
||
+ abnormal_score=preprocess_abn_score(metric_data.get('score')),
|
||
metric_labels=metric_data.get('label', {}),
|
||
desc=metric_data.get('description', '')
|
||
)
|
||
diff --git a/cause_inference/arangodb.py b/cause_inference/arangodb.py
|
||
index 424e500..d7982b7 100644
|
||
--- a/cause_inference/arangodb.py
|
||
+++ b/cause_inference/arangodb.py
|
||
@@ -22,6 +22,14 @@ def connect_to_arangodb(arango_url, db_name):
|
||
return conn.databases[db_name]
|
||
|
||
|
||
+def query_all(db, aql_query, bind_vars=None, raw_results=True):
|
||
+ res = []
|
||
+ query_hdl = db.AQLQuery(aql_query, bindVars=bind_vars, rawResults=raw_results)
|
||
+ for item in query_hdl:
|
||
+ res.append(item)
|
||
+ return res
|
||
+
|
||
+
|
||
def query_recent_topo_ts(db: Database, ts):
|
||
bind_vars = {'@collection': _TIMESTAMP_COLL_NAME, 'ts': ts}
|
||
aql_query = '''
|
||
@@ -32,12 +40,12 @@ def query_recent_topo_ts(db: Database, ts):
|
||
RETURN t._key
|
||
'''
|
||
try:
|
||
- query_res = db.AQLQuery(aql_query, bindVars=bind_vars, rawResults=True).response
|
||
+ query_res = query_all(db, aql_query, bind_vars)
|
||
except AQLQueryError as ex:
|
||
raise DBException(ex) from ex
|
||
- if query_res.get('error') or not query_res.get('result'):
|
||
+ if len(query_res) == 0:
|
||
raise DBException('Can not find topological graph at the abnormal timestamp {}'.format(ts))
|
||
- last_ts = query_res.get('result')[0]
|
||
+ last_ts = query_res[0]
|
||
return int(last_ts)
|
||
|
||
|
||
@@ -56,12 +64,12 @@ def query_topo_entities(db: Database, ts, query_options=None):
|
||
return v
|
||
'''.format(filter_str)
|
||
try:
|
||
- query_res = db.AQLQuery(aql_query, bindVars=bind_vars, rawResults=True).response
|
||
+ query_res = query_all(db, aql_query, bind_vars)
|
||
except AQLQueryError as ex:
|
||
raise DBException(ex) from ex
|
||
- if query_res.get('error') or not query_res.get('result'):
|
||
+ if len(query_res) == 0:
|
||
raise DBException('Can not find observe entities satisfied.')
|
||
- return query_res.get('result')
|
||
+ return query_res
|
||
|
||
|
||
def query_subgraph(db, ts, start_node_id, edge_collection, depth=1):
|
||
@@ -83,12 +91,12 @@ def query_subgraph(db, ts, start_node_id, edge_collection, depth=1):
|
||
return {{"vertex": v, "edge": e}}
|
||
'''.format(edge_coll_str)
|
||
try:
|
||
- query_res = db.AQLQuery(aql_query, bindVars=bind_vars, rawResults=True).response
|
||
+ query_res = query_all(db, aql_query, bind_vars)
|
||
except AQLQueryError as ex:
|
||
raise DBException(ex) from ex
|
||
vertices = {}
|
||
edges = {}
|
||
- for item in query_res.get('result'):
|
||
+ for item in query_res:
|
||
vertex = item.get('vertex')
|
||
edge = item.get('edge')
|
||
vertices.setdefault(vertex.get('_id'), vertex)
|
||
diff --git a/cause_inference/cause_infer.py b/cause_inference/cause_infer.py
|
||
index 1954abf..dff26d0 100644
|
||
--- a/cause_inference/cause_infer.py
|
||
+++ b/cause_inference/cause_infer.py
|
||
@@ -23,8 +23,8 @@ from cause_inference.infer_policy import InferPolicy
|
||
from cause_inference.infer_policy import get_infer_policy
|
||
|
||
|
||
-def normalize_abn_score(score):
|
||
- return expit(score)
|
||
+def preprocess_abn_score(score):
|
||
+ return max(0, score)
|
||
|
||
|
||
# 因果推理
|
||
@@ -172,9 +172,10 @@ def cause_locating(abnormal_kpi: AbnormalEvent, abnormal_metrics: List[AbnormalE
|
||
# 5. 生成异常指标之间的因果图
|
||
causal_graph.init_metric_cause_graph()
|
||
|
||
- logger.logger.debug("Causal graph nodes are: {}".format(causal_graph.entity_cause_graph.nodes))
|
||
- logger.logger.debug("Causal graph predecessors: {}".format(causal_graph.entity_cause_graph.pred))
|
||
- logger.logger.debug("Causal graph successors: {}".format(causal_graph.entity_cause_graph.succ))
|
||
+ logger.logger.debug("Entity cause graph nodes are: {}".format(causal_graph.entity_cause_graph.nodes))
|
||
+ logger.logger.debug("Entity cause graph edges are: {}".format(causal_graph.entity_cause_graph.edges))
|
||
+ logger.logger.debug("Metric cause graph nodes are: {}".format(causal_graph.metric_cause_graph.nodes))
|
||
+ logger.logger.debug("Metric cause graph edges are: {}".format(causal_graph.metric_cause_graph.edges))
|
||
|
||
# 6. 以故障传播图 + 异常KPI为输入,执行根因推导算法,输出 topK 根因指标
|
||
infer_policy = get_infer_policy(infer_config.infer_conf.get('infer_policy'))
|
||
diff --git a/cause_inference/infer_policy.py b/cause_inference/infer_policy.py
|
||
index c0952e7..0663e74 100644
|
||
--- a/cause_inference/infer_policy.py
|
||
+++ b/cause_inference/infer_policy.py
|
||
@@ -5,9 +5,10 @@ from typing import List
|
||
|
||
import networkx as nx
|
||
|
||
-from cause_inference.cause_infer import CausalGraph
|
||
-from cause_inference.cause_infer import Cause
|
||
+from cause_inference.model import CausalGraph
|
||
+from cause_inference.model import Cause
|
||
from cause_inference.exceptions import InferenceException
|
||
+from cause_inference import rule_parser
|
||
|
||
|
||
class InferPolicy(ABC):
|
||
@@ -114,22 +115,28 @@ class DfsPolicy(InferPolicy):
|
||
if length < 1:
|
||
return 0.0
|
||
total_score = 0.0
|
||
+ num_of_valid_node = 0
|
||
for node in path[:length]:
|
||
+ if is_virtual_node(node):
|
||
+ continue
|
||
total_score += cause_graph.nodes[node].get('abnormal_score', 0)
|
||
- if length != 0:
|
||
- total_score /= length
|
||
+ num_of_valid_node += 1
|
||
+ if num_of_valid_node != 0:
|
||
+ total_score /= num_of_valid_node
|
||
return total_score
|
||
|
||
- def infer(self, causal_graph: CausalGraph, top_k: int) -> List[Cause]:
|
||
- cause_graph = causal_graph.metric_cause_graph
|
||
- abn_node_id = (causal_graph.entity_id_of_abn_kpi, causal_graph.abnormal_kpi.abnormal_metric_id)
|
||
-
|
||
- reverse_graph = nx.DiGraph()
|
||
- reverse_graph.add_nodes_from(cause_graph.nodes)
|
||
+ @staticmethod
|
||
+ def reverse_graph(cause_graph):
|
||
+ reversed_graph = nx.DiGraph()
|
||
+ reversed_graph.add_nodes_from(cause_graph.nodes)
|
||
for from_, to in cause_graph.edges:
|
||
- reverse_graph.add_edge(to, from_)
|
||
+ reversed_graph.add_edge(to, from_)
|
||
+ return reversed_graph
|
||
|
||
- successors = nx.dfs_successors(reverse_graph, abn_node_id)
|
||
+ @staticmethod
|
||
+ def get_all_paths_to_abn_node(abn_node_id, cause_graph):
|
||
+ reversed_graph = DfsPolicy.reverse_graph(cause_graph)
|
||
+ successors = nx.dfs_successors(reversed_graph, abn_node_id)
|
||
paths = []
|
||
path = []
|
||
|
||
@@ -145,24 +152,63 @@ class DfsPolicy(InferPolicy):
|
||
|
||
path.append(abn_node_id)
|
||
dfs_path(abn_node_id)
|
||
+ return paths
|
||
|
||
+ @staticmethod
|
||
+ def get_scored_paths(cause_graph, paths) -> list:
|
||
scored_paths = []
|
||
- for p in paths:
|
||
+ for path in paths:
|
||
scored_paths.append({
|
||
- 'score': self.calc_path_score(p, cause_graph),
|
||
- 'path': p
|
||
+ 'score': DfsPolicy.calc_path_score(path, cause_graph),
|
||
+ 'path': path
|
||
})
|
||
- scored_paths = sorted(scored_paths, key=lambda k: k['score'], reverse=True)
|
||
- scored_paths = scored_paths[:top_k]
|
||
+ return scored_paths
|
||
+
|
||
+ @staticmethod
|
||
+ def get_top_paths(scored_paths, top_k) -> list:
|
||
+ top_paths = []
|
||
+ node_selected = set()
|
||
+ metric_selected = set()
|
||
+ for path in scored_paths:
|
||
+ if len(top_paths) == top_k:
|
||
+ break
|
||
+ cause_node_id = path.get('path')[0]
|
||
+ if cause_node_id in node_selected:
|
||
+ continue
|
||
+ if cause_node_id[1] in metric_selected:
|
||
+ continue
|
||
+ if is_virtual_node(cause_node_id):
|
||
+ continue
|
||
+ node_selected.add(cause_node_id)
|
||
+ metric_selected.add(cause_node_id[1])
|
||
+ top_paths.append(path)
|
||
+
|
||
+ return top_paths
|
||
|
||
+ @staticmethod
|
||
+ def get_top_causes(top_paths) -> List[Cause]:
|
||
res = []
|
||
- for item in scored_paths:
|
||
+ for item in top_paths:
|
||
cause_node_id = item.get('path')[0]
|
||
cause = Cause(cause_node_id[1], cause_node_id[0], item.get('score'), item.get('path'))
|
||
res.append(cause)
|
||
-
|
||
return res
|
||
|
||
+ def infer(self, causal_graph: CausalGraph, top_k: int) -> List[Cause]:
|
||
+ cause_graph = causal_graph.metric_cause_graph
|
||
+ abn_node_id = (causal_graph.entity_id_of_abn_kpi, causal_graph.abnormal_kpi.abnormal_metric_id)
|
||
+
|
||
+ paths = self.get_all_paths_to_abn_node(abn_node_id, cause_graph)
|
||
+ scored_paths = self.get_scored_paths(cause_graph, paths)
|
||
+ scored_paths = sorted(scored_paths, key=lambda k: k['score'], reverse=True)
|
||
+ top_paths = self.get_top_paths(scored_paths, top_k)
|
||
+
|
||
+ return self.get_top_causes(top_paths)
|
||
+
|
||
+
|
||
+def is_virtual_node(node_id) -> bool:
|
||
+ return rule_parser.is_virtual_metric(node_id[1])
|
||
+
|
||
|
||
def get_infer_policy(policy: str, **options) -> InferPolicy:
|
||
if policy == 'dfs':
|
||
diff --git a/cause_inference/model.py b/cause_inference/model.py
|
||
index eb473da..136e4c3 100644
|
||
--- a/cause_inference/model.py
|
||
+++ b/cause_inference/model.py
|
||
@@ -7,6 +7,7 @@ from spider.conf.observe_meta import ObserveMetaMgt
|
||
from spider.util import logger
|
||
from spider.util.entity import concate_entity_id
|
||
from spider.util.entity import escape_entity_id
|
||
+from cause_inference import rule_parser
|
||
|
||
|
||
class AbnormalEvent:
|
||
@@ -77,6 +78,10 @@ class CausalGraph:
|
||
self.metric_cause_graph = nx.DiGraph()
|
||
self.init_casual_graph()
|
||
|
||
+ @staticmethod
|
||
+ def is_virtual_metric_group(metric_group) -> bool:
|
||
+ return len(metric_group) == 1 and rule_parser.is_virtual_metric(metric_group[0])
|
||
+
|
||
def init_casual_graph(self):
|
||
for node_id, node_attrs in self.topo_nodes.items():
|
||
self.entity_cause_graph.add_node(node_id, **node_attrs)
|
||
@@ -134,19 +139,54 @@ class CausalGraph:
|
||
self.init_metric_edge(edge)
|
||
|
||
def init_metric_edge(self, entity_edge):
|
||
- from_entity_id = entity_edge[0]
|
||
- to_entity_id = entity_edge[1]
|
||
+ f_entity_id = entity_edge[0]
|
||
+ t_entity_id = entity_edge[1]
|
||
+ avail_relations = self.get_avail_metric_causal_relations(entity_edge)
|
||
+
|
||
+ unique = set()
|
||
+ for f_metric_group, t_metric_group in avail_relations:
|
||
+ if self.is_virtual_metric_group(f_metric_group):
|
||
+ self.add_virtual_metric_node(f_entity_id, f_metric_group[0])
|
||
+ if self.is_virtual_metric_group(t_metric_group):
|
||
+ self.add_virtual_metric_node(t_entity_id, t_metric_group[0])
|
||
+
|
||
+ f_metric_id = self.metric_with_largest_abn_score(f_metric_group, f_entity_id)
|
||
+ t_metric_id = self.metric_with_largest_abn_score(t_metric_group, t_entity_id)
|
||
+ if (f_metric_id, t_metric_id) not in unique:
|
||
+ self.metric_cause_graph.add_edge((f_entity_id, f_metric_id), (t_entity_id, t_metric_id))
|
||
+ unique.add((f_metric_id, t_metric_id))
|
||
+
|
||
+ def add_virtual_metric_node(self, entity_id, metric_id):
|
||
+ self.metric_cause_graph.add_node((entity_id, metric_id))
|
||
+
|
||
+ def get_avail_metric_causal_relations(self, entity_edge):
|
||
+ f_metric_ids = self.get_abn_metric_ids(entity_edge[0])
|
||
+ t_metric_ids = self.get_abn_metric_ids(entity_edge[1])
|
||
rule_meta = self.entity_cause_graph.edges[entity_edge].get('rule_meta')
|
||
- for from_metric_evt in self.get_abnormal_metrics_of_node(from_entity_id):
|
||
- for to_metric_evt in self.get_abnormal_metrics_of_node(to_entity_id):
|
||
- from_metric_id = from_metric_evt.abnormal_metric_id
|
||
- to_metric_id = to_metric_evt.abnormal_metric_id
|
||
- if rule_meta is not None and not rule_meta.check_metric_pair(from_metric_id, to_metric_id):
|
||
- continue
|
||
- self.metric_cause_graph.add_edge(
|
||
- (from_entity_id, from_metric_id),
|
||
- (to_entity_id, to_metric_id)
|
||
- )
|
||
+ return rule_meta.get_avail_causal_relations(f_metric_ids, t_metric_ids)
|
||
+
|
||
+ def get_abn_metric_ids(self, entity_id):
|
||
+ metric_evts = self.get_abnormal_metrics_of_node(entity_id)
|
||
+ return [evt.abnormal_metric_id for evt in metric_evts]
|
||
+
|
||
+ def metric_with_largest_abn_score(self, metric_group: list, entity_id) -> str:
|
||
+ if len(metric_group) == 1:
|
||
+ return metric_group[0]
|
||
+
|
||
+ metric_evt_map = {}
|
||
+ metric_evts = self.get_abnormal_metrics_of_node(entity_id)
|
||
+ for metric_evt in metric_evts:
|
||
+ metric_evt_map.setdefault(metric_evt.abnormal_metric_id, metric_evt)
|
||
+
|
||
+ metric_id_of_largest = metric_group[0]
|
||
+ largest_abn_score = metric_evt_map.get(metric_id_of_largest).abnormal_score
|
||
+ for metric_id in metric_group:
|
||
+ abn_score = metric_evt_map.get(metric_id).abnormal_score
|
||
+ if abn_score > largest_abn_score:
|
||
+ metric_id_of_largest = metric_id
|
||
+ largest_abn_score = abn_score
|
||
+
|
||
+ return metric_id_of_largest
|
||
|
||
|
||
class Cause:
|
||
diff --git a/cause_inference/rule_parser.py b/cause_inference/rule_parser.py
|
||
index f08ed3c..eb6227c 100644
|
||
--- a/cause_inference/rule_parser.py
|
||
+++ b/cause_inference/rule_parser.py
|
||
@@ -1,7 +1,7 @@
|
||
import os
|
||
from abc import ABCMeta
|
||
from abc import abstractmethod
|
||
-from typing import List
|
||
+from typing import List, Dict, Tuple
|
||
|
||
import yaml
|
||
|
||
@@ -9,33 +9,86 @@ from spider.conf.observe_meta import RelationType
|
||
from spider.conf.observe_meta import EntityType
|
||
from spider.util import logger
|
||
|
||
+METRIC_CATEGORY_ALL = 'ALL'
|
||
+METRIC_CATEGORY_OTHER = 'OTHER'
|
||
+METRIC_CATEGORY_VIRTUAL = 'VIRTUAL'
|
||
+METRIC_ID_OF_CATEGORY_VIRTUAL = 'virtual_metric'
|
||
|
||
-class MetricPairSet:
|
||
- def __init__(self, from_: set, to_: set):
|
||
+
|
||
+def is_virtual_metric(metric_id: str) -> bool:
|
||
+ return metric_id == METRIC_ID_OF_CATEGORY_VIRTUAL
|
||
+
|
||
+
|
||
+class MetricCategoryPair:
|
||
+ def __init__(self, from_: str, to_: str):
|
||
self.from_ = from_
|
||
self.to_ = to_
|
||
|
||
- def check_metric_pair(self, from_metric_id: str, to_metric_id: str) -> bool:
|
||
- if self.from_ and from_metric_id not in self.from_:
|
||
- return False
|
||
- if self.to_ and to_metric_id not in self.to_:
|
||
- return False
|
||
- return True
|
||
-
|
||
|
||
class RuleMeta:
|
||
- def __init__(self, from_type, to_type, metric_range=None):
|
||
+ def __init__(self, from_type, to_type, from_categories=None, to_categories=None, metric_range=None):
|
||
self.from_type = from_type
|
||
self.to_type = to_type
|
||
- self.metric_range: List[MetricPairSet] = metric_range or []
|
||
-
|
||
- def check_metric_pair(self, from_metric_id: str, to_metric_id: str) -> bool:
|
||
- if not self.metric_range:
|
||
- return True
|
||
- for item in self.metric_range:
|
||
- if item.check_metric_pair(from_metric_id, to_metric_id):
|
||
- return True
|
||
- return False
|
||
+ self.from_categories = from_categories or {}
|
||
+ self.to_categories = to_categories or {}
|
||
+ self.category_pairs: List[MetricCategoryPair] = metric_range or []
|
||
+
|
||
+ @staticmethod
|
||
+ def aggregate_metric_from_groups(category_type, metric_groups) -> List[list]:
|
||
+ res = []
|
||
+ if category_type == METRIC_CATEGORY_ALL:
|
||
+ for cate_type, metric_group in metric_groups.items():
|
||
+ if cate_type == METRIC_CATEGORY_VIRTUAL:
|
||
+ continue
|
||
+ elif cate_type == METRIC_CATEGORY_OTHER:
|
||
+ res.extend([metric] for metric in metric_group)
|
||
+ else:
|
||
+ res.append(metric_group)
|
||
+ else:
|
||
+ metric_group = metric_groups.get(category_type)
|
||
+ if metric_group:
|
||
+ res.append(metric_group)
|
||
+
|
||
+ return res
|
||
+
|
||
+ @staticmethod
|
||
+ def _group_metric_by_category(metrics, categories) -> Dict[str, list]:
|
||
+ parts = {}
|
||
+ parted_metrics = set()
|
||
+ for cate_type, cate_metrics in categories.items():
|
||
+ part = []
|
||
+ for metric in metrics:
|
||
+ if metric in cate_metrics:
|
||
+ part.append(metric)
|
||
+ parted_metrics.add(metric)
|
||
+ if len(part) > 0:
|
||
+ parts.setdefault(cate_type, part)
|
||
+
|
||
+ other_part = []
|
||
+ for metric in metrics:
|
||
+ if metric not in parted_metrics:
|
||
+ other_part.append(metric)
|
||
+ if len(other_part) > 0:
|
||
+ parts.setdefault(METRIC_CATEGORY_OTHER, other_part)
|
||
+
|
||
+ virtual_part = [METRIC_ID_OF_CATEGORY_VIRTUAL]
|
||
+ parts.setdefault(METRIC_CATEGORY_VIRTUAL, virtual_part)
|
||
+
|
||
+ return parts
|
||
+
|
||
+ def get_avail_causal_relations(self, real_from_metrics, real_to_metrics) -> List[Tuple[list, list]]:
|
||
+ causal_relations = []
|
||
+
|
||
+ from_groups = self._group_metric_by_category(real_from_metrics, self.from_categories)
|
||
+ to_groups = self._group_metric_by_category(real_to_metrics, self.to_categories)
|
||
+ for cate_pair in self.category_pairs:
|
||
+ all_from_metrics = self.aggregate_metric_from_groups(cate_pair.from_, from_groups)
|
||
+ all_to_metrics = self.aggregate_metric_from_groups(cate_pair.to_, to_groups)
|
||
+ for from_metrics in all_from_metrics:
|
||
+ for to_metrics in all_to_metrics:
|
||
+ causal_relations.append((from_metrics, to_metrics))
|
||
+
|
||
+ return causal_relations
|
||
|
||
|
||
class Rule(metaclass=ABCMeta):
|
||
@@ -180,7 +233,8 @@ class NicRule1(Rule):
|
||
class RuleEngine:
|
||
def __init__(self):
|
||
self.rules: List[Rule] = []
|
||
- self.rule_metas = {}
|
||
+ self.metric_categories = {}
|
||
+ self.rule_metas: Dict[tuple, RuleMeta] = {}
|
||
|
||
def add_rule(self, rule: Rule):
|
||
self.rules.append(rule)
|
||
@@ -190,33 +244,58 @@ class RuleEngine:
|
||
rule.rule_parsing(causal_graph)
|
||
|
||
def load_rule_meta_from_yaml(self, rule_path: str) -> bool:
|
||
- abs_rule_path = os.path.abspath(rule_path)
|
||
- if not os.path.exists(abs_rule_path):
|
||
- logger.logger.warning("Rule meta path '{}' not exist", abs_rule_path)
|
||
- return True
|
||
try:
|
||
- with open(abs_rule_path, 'r') as file:
|
||
+ with open(os.path.abspath(rule_path), 'r') as file:
|
||
data = yaml.safe_load(file)
|
||
except IOError as ex:
|
||
logger.logger.warning(ex)
|
||
return False
|
||
|
||
- infer_rules = data.get("infer_rules", [])
|
||
- for rule_meta in infer_rules:
|
||
- saved_metric_range = []
|
||
- for item in rule_meta.get("metric_range", []):
|
||
- saved_metric_range.append(MetricPairSet(set(item.get('from', [])), set(item.get('to', []))))
|
||
- saved_rule_meta = RuleMeta(rule_meta.get('from_type'), rule_meta.get('to_type'), saved_metric_range)
|
||
- self.rule_metas.setdefault((rule_meta.get("from_type"), rule_meta.get("to_type")), saved_rule_meta)
|
||
-
|
||
+ self.load_rule_meta_from_dict(data)
|
||
return True
|
||
|
||
+ def create_default_rule_meta(self, from_type, to_type):
|
||
+ return RuleMeta(
|
||
+ from_type,
|
||
+ to_type,
|
||
+ self.metric_categories.get(from_type),
|
||
+ self.metric_categories.get(to_type),
|
||
+ [MetricCategoryPair(METRIC_CATEGORY_ALL, METRIC_CATEGORY_ALL)]
|
||
+ )
|
||
+
|
||
def add_rule_meta(self, causal_graph):
|
||
entity_cause_graph = causal_graph.entity_cause_graph
|
||
for edge in entity_cause_graph.edges:
|
||
from_type = entity_cause_graph.nodes[edge[0]].get('type')
|
||
to_type = entity_cause_graph.nodes[edge[1]].get('type')
|
||
- entity_cause_graph.edges[edge]["rule_meta"] = self.rule_metas.get((from_type, to_type))
|
||
+ rule_meta = self.rule_metas.get((from_type, to_type))
|
||
+ if not rule_meta:
|
||
+ rule_meta = self.create_default_rule_meta(from_type, to_type)
|
||
+ entity_cause_graph.edges[edge]["rule_meta"] = rule_meta
|
||
+
|
||
+ def load_rule_meta_from_dict(self, data: dict):
|
||
+ self.load_metric_categories(data.get('metric_categories', {}))
|
||
+ self.load_infer_rules(data.get("infer_rules", []))
|
||
+
|
||
+ def load_metric_categories(self, metric_categories: dict):
|
||
+ for entity_type, categories in metric_categories.items():
|
||
+ category_dict = {}
|
||
+ for category in categories:
|
||
+ category_dict.setdefault(category.get('category'), category.get('metrics'))
|
||
+ self.metric_categories.setdefault(entity_type, category_dict)
|
||
+
|
||
+ def load_infer_rules(self, infer_rules: list):
|
||
+ for rule_meta in infer_rules:
|
||
+ from_entity_type = rule_meta.get('from_type')
|
||
+ to_entity_type = rule_meta.get('to_type')
|
||
+ saved_metric_range = []
|
||
+ for item in rule_meta.get("metric_range", []):
|
||
+ from_category = item.get('from')
|
||
+ to_category = item.get('to')
|
||
+ saved_metric_range.append(MetricCategoryPair(from_category, to_category))
|
||
+ saved_rule_meta = RuleMeta(from_entity_type, to_entity_type, self.metric_categories.get(from_entity_type),
|
||
+ self.metric_categories.get(to_entity_type), saved_metric_range)
|
||
+ self.rule_metas.setdefault((from_entity_type, to_entity_type), saved_rule_meta)
|
||
|
||
|
||
rule_engine = RuleEngine()
|
||
diff --git a/config/infer-rule.yaml b/config/infer-rule.yaml
|
||
index c3bc3f3..d287972 100644
|
||
--- a/config/infer-rule.yaml
|
||
+++ b/config/infer-rule.yaml
|
||
@@ -1,32 +1,98 @@
|
||
+metric_categories:
|
||
+ proc:
|
||
+ -
|
||
+ category: PROC_CPU
|
||
+ metrics:
|
||
+ - gala_gopher_proc_utime_jiffies
|
||
+ - gala_gopher_proc_stime_jiffies
|
||
+ -
|
||
+ category: PROC_IO_LOAD
|
||
+ metrics:
|
||
+ - gala_gopher_proc_read_bytes
|
||
+ - gala_gopher_proc_write_bytes
|
||
+ - gala_gopher_proc_less_4k_io_read
|
||
+ - gala_gopher_proc_less_4k_io_write
|
||
+ - gala_gopher_proc_greater_4k_io_read
|
||
+ - gala_gopher_proc_greater_4k_io_write
|
||
+ disk:
|
||
+ -
|
||
+ category: DISK_LOAD
|
||
+ metrics:
|
||
+ - gala_gopher_disk_rspeed_kB
|
||
+ - gala_gopher_disk_wspeed_kB
|
||
+ - gala_gopher_disk_rspeed
|
||
+ - gala_gopher_disk_wspeed
|
||
+ -
|
||
+ category: DISK_DELAY
|
||
+ metrics:
|
||
+ - gala_gopher_disk_r_await
|
||
+ - gala_gopher_disk_w_await
|
||
+ - gala_gopher_disk_rareq
|
||
+ - gala_gopher_disk_wareq
|
||
+ block:
|
||
+ -
|
||
+ category: BLOCK_DELAY
|
||
+ metrics:
|
||
+ - gala_gopher_block_latency_req_max
|
||
+ - gala_gopher_block_latency_req_last
|
||
+ - gala_gopher_block_latency_req_sum
|
||
+ - gala_gopher_block_latency_req_jitter
|
||
+ nic:
|
||
+ -
|
||
+ category: NIC_DROP
|
||
+ metrics:
|
||
+ - gala_gopher_nic_tc_sent_drop
|
||
+ - gala_gopher_nic_tx_dropped
|
||
+ - gala_gopher_nic_rx_dropped
|
||
+ cpu:
|
||
+ -
|
||
+ category: CPU_TOTAL
|
||
+ metrics:
|
||
+ - gala_gopher_cpu_total_used_per
|
||
+
|
||
infer_rules:
|
||
-
|
||
from_type: cpu
|
||
to_type: proc
|
||
metric_range:
|
||
-
|
||
- from: []
|
||
- to:
|
||
- - gala_gopher_proc_utime_jiffies
|
||
- - gala_gopher_proc_stime_jiffies
|
||
+ from: CPU_TOTAL
|
||
+ to: PROC_CPU
|
||
-
|
||
from_type: block
|
||
to_type: proc
|
||
metric_range:
|
||
-
|
||
- from: []
|
||
- to:
|
||
- - gala_gopher_proc_ns_ext4_read
|
||
- - gala_gopher_proc_ns_ext4_write
|
||
- - gala_gopher_proc_ns_ext4_flush
|
||
- - gala_gopher_proc_ns_overlay_read
|
||
- - gala_gopher_proc_ns_overlay_write
|
||
- - gala_gopher_proc_ns_overlay_flush
|
||
+ from: BLOCK_DELAY
|
||
+ to: VIRTUAL
|
||
-
|
||
from_type: proc
|
||
to_type: disk
|
||
metric_range:
|
||
-
|
||
- from:
|
||
- - gala_gopher_proc_read_bytes
|
||
- - gala_gopher_proc_write_bytes
|
||
- to: []
|
||
\ No newline at end of file
|
||
+ from: PROC_IO_LOAD
|
||
+ to: ALL
|
||
+ -
|
||
+ from_type: disk
|
||
+ to_type: block
|
||
+ metric_range:
|
||
+ -
|
||
+ from: DISK_DELAY
|
||
+ to: BLOCK_DELAY
|
||
+ -
|
||
+ from_type: proc
|
||
+ to_type: sli
|
||
+ metric_range:
|
||
+ -
|
||
+ from: VIRTUAL
|
||
+ to: ALL
|
||
+ -
|
||
+ from: PROC_CPU
|
||
+ to: ALL
|
||
+ -
|
||
+ from_type: tcp_link
|
||
+ to_type: proc
|
||
+ metric_range:
|
||
+ -
|
||
+ from: ALL
|
||
+ to: VIRTUAL
|
||
\ No newline at end of file
|
||
--
|
||
2.21.0.windows.1
|
||
|