diff --git a/0001-cause-inference-optimization.patch b/0001-cause-inference-optimization.patch
deleted file mode 100644
index 9f43760..0000000
--- a/0001-cause-inference-optimization.patch
+++ /dev/null
@@ -1,679 +0,0 @@
-From ac48cfdcac8c93ba59754899237a37dac40659ac Mon Sep 17 00:00:00 2001
-From: algorithmofdish
-Date: Fri, 18 Nov 2022 11:07:47 +0800
-Subject: [PATCH] feat(infer): cause inference optimization
-
-1. categorize metrics and select the one with the max abnormal score as the cause;
-2. add a virtual metric to complete the fault propagation path;
-3. use the abnormal score without normalization;
-4. fix a bug where subgraphs are incompletely obtained.
----
- cause_inference/__main__.py     |   4 +-
- cause_inference/arangodb.py     |  24 +++--
- cause_inference/cause_infer.py  |  11 +--
- cause_inference/infer_policy.py |  84 ++++++++++++++----
- cause_inference/model.py        |  64 +++++++++++---
- cause_inference/rule_parser.py  | 149 ++++++++++++++++++++++++--------
- config/infer-rule.yaml          |  98 +++++++++++++++++----
- 7 files changed, 337 insertions(+), 97 deletions(-)
-
-diff --git a/cause_inference/__main__.py b/cause_inference/__main__.py
-index baddbe4..093f7ac 100644
---- a/cause_inference/__main__.py
-+++ b/cause_inference/__main__.py
-@@ -15,7 +15,7 @@ from cause_inference.config import init_infer_config
- from cause_inference.model import AbnormalEvent
- from cause_inference.cause_infer import cause_locating
- from cause_inference.cause_infer import parse_abn_evt
--from cause_inference.cause_infer import normalize_abn_score
-+from cause_inference.cause_infer import preprocess_abn_score
- from cause_inference.rule_parser import rule_engine
- from cause_inference.exceptions import InferenceException
- from cause_inference.exceptions import DataParseException
-@@ -172,7 +172,7 @@ def get_recommend_metric_evts(abn_kpi_data: dict) -> List[AbnormalEvent]:
-         metric_evt = AbnormalEvent(
-             timestamp=abn_kpi_data.get('Timestamp'),
-             abnormal_metric_id=metric_data.get('metric', ''),
--            abnormal_score=normalize_abn_score(metric_data.get('score')),
-+            abnormal_score=preprocess_abn_score(metric_data.get('score')),
-             metric_labels=metric_data.get('label', {}),
-             desc=metric_data.get('description', '')
-         )
-diff --git a/cause_inference/arangodb.py b/cause_inference/arangodb.py
-index 424e500..d7982b7 100644
---- a/cause_inference/arangodb.py
-+++ b/cause_inference/arangodb.py
-@@ -22,6 +22,14 @@ def connect_to_arangodb(arango_url, db_name):
-     return conn.databases[db_name]
- 
- 
-+def query_all(db, aql_query, bind_vars=None, raw_results=True):
-+    res = []
-+    query_hdl = db.AQLQuery(aql_query, bindVars=bind_vars, rawResults=raw_results)
-+    for item in query_hdl:
-+        res.append(item)
-+    return res
-+
-+
- def query_recent_topo_ts(db: Database, ts):
-     bind_vars = {'@collection': _TIMESTAMP_COLL_NAME, 'ts': ts}
-     aql_query = '''
-@@ -32,12 +40,12 @@
-         SORT t._key DESC
-         LIMIT 1
-         RETURN t._key
-     '''
-     try:
--        query_res = db.AQLQuery(aql_query, bindVars=bind_vars, rawResults=True).response
-+        query_res = query_all(db, aql_query, bind_vars)
-     except AQLQueryError as ex:
-         raise DBException(ex) from ex
--    if query_res.get('error') or not query_res.get('result'):
-+    if len(query_res) == 0:
-         raise DBException('Can not find topological graph at the abnormal timestamp {}'.format(ts))
--    last_ts = query_res.get('result')[0]
-+    last_ts = query_res[0]
-     return int(last_ts)
- 
- 
-@@ -56,12 +64,12 @@
-         return v
-     '''.format(filter_str)
-     try:
--        query_res = db.AQLQuery(aql_query, bindVars=bind_vars, rawResults=True).response
-+        query_res = query_all(db, aql_query, bind_vars)
-     except AQLQueryError as ex:
-         raise DBException(ex) from ex
--    if query_res.get('error') or not query_res.get('result'):
-+    if len(query_res) == 0:
-         raise DBException('Can not find observe entities satisfied.')
--    return query_res.get('result')
-+    return query_res
- 
- 
- def query_subgraph(db, ts, start_node_id, edge_collection, depth=1):
-@@ -83,12 +91,12 @@
-             return {{"vertex": v, "edge": e}}
-     '''.format(edge_coll_str)
-     try:
--        query_res = db.AQLQuery(aql_query, bindVars=bind_vars, rawResults=True).response
-+        query_res = query_all(db, aql_query, bind_vars)
-     except AQLQueryError as ex:
-         raise DBException(ex) from ex
-     vertices = {}
-     edges = {}
--    for item in query_res.get('result'):
-+    for item in query_res:
-         vertex = item.get('vertex')
-         edge = item.get('edge')
-         vertices.setdefault(vertex.get('_id'), vertex)
-diff --git a/cause_inference/cause_infer.py b/cause_inference/cause_infer.py
-index 1954abf..dff26d0 100644
---- a/cause_inference/cause_infer.py
-+++ b/cause_inference/cause_infer.py
-@@ -23,8 +23,8 @@ from cause_inference.infer_policy import InferPolicy
- from cause_inference.infer_policy import get_infer_policy
- 
- 
--def normalize_abn_score(score):
--    return expit(score)
-+def preprocess_abn_score(score):
-+    return max(0, score)
- 
- 
- # Cause inference
-@@ -172,9 +172,10 @@ def cause_locating(abnormal_kpi: AbnormalEvent, abnormal_metrics: List[AbnormalE
-     # 5. Build the causal graph between the abnormal metrics
-     causal_graph.init_metric_cause_graph()
- 
--    logger.logger.debug("Causal graph nodes are: {}".format(causal_graph.entity_cause_graph.nodes))
--    logger.logger.debug("Causal graph predecessors: {}".format(causal_graph.entity_cause_graph.pred))
--    logger.logger.debug("Causal graph successors: {}".format(causal_graph.entity_cause_graph.succ))
-+    logger.logger.debug("Entity cause graph nodes are: {}".format(causal_graph.entity_cause_graph.nodes))
-+    logger.logger.debug("Entity cause graph edges are: {}".format(causal_graph.entity_cause_graph.edges))
-+    logger.logger.debug("Metric cause graph nodes are: {}".format(causal_graph.metric_cause_graph.nodes))
-+    logger.logger.debug("Metric cause graph edges are: {}".format(causal_graph.metric_cause_graph.edges))
- 
-     # 6. Run the root cause inference algorithm with the fault propagation graph and the abnormal KPI as input, and output the top-K root cause metrics
-     infer_policy = get_infer_policy(infer_config.infer_conf.get('infer_policy'))
-diff --git a/cause_inference/infer_policy.py b/cause_inference/infer_policy.py
-index c0952e7..0663e74 100644
---- a/cause_inference/infer_policy.py
-+++ b/cause_inference/infer_policy.py
-@@ -5,9 +5,10 @@ from typing import List
- 
- import networkx as nx
- 
--from cause_inference.cause_infer import CausalGraph
--from cause_inference.cause_infer import Cause
-+from cause_inference.model import CausalGraph
-+from cause_inference.model import Cause
- from cause_inference.exceptions import InferenceException
-+from cause_inference import rule_parser
- 
- 
- class InferPolicy(ABC):
-@@ -114,22 +115,28 @@ class DfsPolicy(InferPolicy):
-         if length < 1:
-             return 0.0
-         total_score = 0.0
-+        num_of_valid_node = 0
-         for node in path[:length]:
-+            if is_virtual_node(node):
-+                continue
-             total_score += cause_graph.nodes[node].get('abnormal_score', 0)
--        if length != 0:
--            total_score /= length
-+            num_of_valid_node += 1
-+        if num_of_valid_node != 0:
-+            total_score /= num_of_valid_node
-         return total_score
- 
--    def infer(self, causal_graph: CausalGraph, top_k: int) -> List[Cause]:
--        cause_graph = causal_graph.metric_cause_graph
--        abn_node_id = (causal_graph.entity_id_of_abn_kpi, causal_graph.abnormal_kpi.abnormal_metric_id)
--
--        reverse_graph = nx.DiGraph()
--        reverse_graph.add_nodes_from(cause_graph.nodes)
-+    @staticmethod
-+    def reverse_graph(cause_graph):
-+        reversed_graph = nx.DiGraph()
-+        reversed_graph.add_nodes_from(cause_graph.nodes)
-         for from_, to in cause_graph.edges:
--            reverse_graph.add_edge(to, from_)
-+            reversed_graph.add_edge(to, from_)
-+        return reversed_graph
- 
--        successors = nx.dfs_successors(reverse_graph, abn_node_id)
-+    @staticmethod
-+    def get_all_paths_to_abn_node(abn_node_id, cause_graph):
-+        reversed_graph = DfsPolicy.reverse_graph(cause_graph)
-+        successors = nx.dfs_successors(reversed_graph, abn_node_id)
-         paths = []
-         path = []
- 
-@@ -145,24 +152,63 @@ class DfsPolicy(InferPolicy):
- 
-         path.append(abn_node_id)
-         dfs_path(abn_node_id)
-+        return paths
- 
-+    @staticmethod
-+    def get_scored_paths(cause_graph, paths) -> list:
-         scored_paths = []
--        for p in paths:
-+        for path in paths:
-             scored_paths.append({
--                'score': self.calc_path_score(p, cause_graph),
--                'path': p
-+                'score': DfsPolicy.calc_path_score(path, cause_graph),
-+                'path': path
-             })
--        scored_paths = sorted(scored_paths, key=lambda k: k['score'], reverse=True)
--        scored_paths = scored_paths[:top_k]
-+        return scored_paths
-+
-+    @staticmethod
-+    def get_top_paths(scored_paths, top_k) -> list:
-+        top_paths = []
-+        node_selected = set()
-+        metric_selected = set()
-+        for path in scored_paths:
-+            if len(top_paths) == top_k:
-+                break
-+            cause_node_id = path.get('path')[0]
-+            if cause_node_id in node_selected:
-+                continue
-+            if cause_node_id[1] in metric_selected:
-+                continue
-+            if is_virtual_node(cause_node_id):
-+                continue
-+            node_selected.add(cause_node_id)
-+            metric_selected.add(cause_node_id[1])
-+            top_paths.append(path)
-+
-+        return top_paths
- 
-+    @staticmethod
-+    def get_top_causes(top_paths) -> List[Cause]:
-         res = []
--        for item in scored_paths:
-+        for item in top_paths:
-             cause_node_id = item.get('path')[0]
-             cause = Cause(cause_node_id[1], cause_node_id[0], item.get('score'), item.get('path'))
-             res.append(cause)
--
-         return res
- 
-+    def infer(self, causal_graph: CausalGraph, top_k: int) -> List[Cause]:
-+        cause_graph = causal_graph.metric_cause_graph
-+        abn_node_id = (causal_graph.entity_id_of_abn_kpi, causal_graph.abnormal_kpi.abnormal_metric_id)
-+
-+        paths = self.get_all_paths_to_abn_node(abn_node_id, cause_graph)
-+        scored_paths = self.get_scored_paths(cause_graph, paths)
-+        scored_paths = sorted(scored_paths, key=lambda k: k['score'], reverse=True)
-+        top_paths = self.get_top_paths(scored_paths, top_k)
-+
-+        return self.get_top_causes(top_paths)
-+
-+
-+def is_virtual_node(node_id) -> bool:
-+    return rule_parser.is_virtual_metric(node_id[1])
-+
- 
- def get_infer_policy(policy: str, **options) -> InferPolicy:
-     if policy == 'dfs':
-diff --git a/cause_inference/model.py b/cause_inference/model.py
-index eb473da..136e4c3 100644
---- a/cause_inference/model.py
-+++ b/cause_inference/model.py
-@@ -7,6 +7,7 @@ from spider.conf.observe_meta import ObserveMetaMgt
- from spider.util import logger
- from spider.util.entity import concate_entity_id
- from spider.util.entity import escape_entity_id
-+from cause_inference import rule_parser
- 
- 
- class AbnormalEvent:
-@@ -77,6 +78,10 @@ class CausalGraph:
-         self.metric_cause_graph = nx.DiGraph()
-         self.init_casual_graph()
- 
-+    @staticmethod
-+    def is_virtual_metric_group(metric_group) -> bool:
-+        return len(metric_group) == 1 and rule_parser.is_virtual_metric(metric_group[0])
-+
-     def init_casual_graph(self):
-         for node_id, node_attrs in self.topo_nodes.items():
-             self.entity_cause_graph.add_node(node_id, **node_attrs)
-@@ -134,19 +139,54 @@ class CausalGraph:
-             self.init_metric_edge(edge)
- 
-     def init_metric_edge(self, entity_edge):
--        from_entity_id = entity_edge[0]
--        to_entity_id = entity_edge[1]
-+        f_entity_id = entity_edge[0]
-+        t_entity_id = entity_edge[1]
-+        avail_relations = self.get_avail_metric_causal_relations(entity_edge)
-+
-+        unique = set()
-+        for f_metric_group, t_metric_group in avail_relations:
-+            if self.is_virtual_metric_group(f_metric_group):
-+                self.add_virtual_metric_node(f_entity_id, f_metric_group[0])
-+            if self.is_virtual_metric_group(t_metric_group):
-+                self.add_virtual_metric_node(t_entity_id, t_metric_group[0])
-+
-+            f_metric_id = self.metric_with_largest_abn_score(f_metric_group, f_entity_id)
-+            t_metric_id = self.metric_with_largest_abn_score(t_metric_group, t_entity_id)
-+            if (f_metric_id, t_metric_id) not in unique:
-+                self.metric_cause_graph.add_edge((f_entity_id, f_metric_id), (t_entity_id, t_metric_id))
-+                unique.add((f_metric_id, t_metric_id))
-+
-+    def add_virtual_metric_node(self, entity_id, metric_id):
-+        self.metric_cause_graph.add_node((entity_id, metric_id))
-+
-+    def get_avail_metric_causal_relations(self, entity_edge):
-+        f_metric_ids = self.get_abn_metric_ids(entity_edge[0])
-+        t_metric_ids = self.get_abn_metric_ids(entity_edge[1])
-         rule_meta = self.entity_cause_graph.edges[entity_edge].get('rule_meta')
--        for from_metric_evt in self.get_abnormal_metrics_of_node(from_entity_id):
--            for to_metric_evt in self.get_abnormal_metrics_of_node(to_entity_id):
--                from_metric_id = from_metric_evt.abnormal_metric_id
--                to_metric_id = to_metric_evt.abnormal_metric_id
--                if rule_meta is not None and not rule_meta.check_metric_pair(from_metric_id, to_metric_id):
--                    continue
--                self.metric_cause_graph.add_edge(
--                    (from_entity_id, from_metric_id),
--                    (to_entity_id, to_metric_id)
--                )
-+        return rule_meta.get_avail_causal_relations(f_metric_ids, t_metric_ids)
-+
-+    def get_abn_metric_ids(self, entity_id):
-+        metric_evts = self.get_abnormal_metrics_of_node(entity_id)
-+        return [evt.abnormal_metric_id for evt in metric_evts]
-+
-+    def metric_with_largest_abn_score(self, metric_group: list, entity_id) -> str:
-+        if len(metric_group) == 1:
-+            return metric_group[0]
-+
-+        metric_evt_map = {}
-+        metric_evts = self.get_abnormal_metrics_of_node(entity_id)
-+        for metric_evt in metric_evts:
-+            metric_evt_map.setdefault(metric_evt.abnormal_metric_id, metric_evt)
-+
-+        metric_id_of_largest = metric_group[0]
-+        largest_abn_score = metric_evt_map.get(metric_id_of_largest).abnormal_score
-+        for metric_id in metric_group:
-+            abn_score = metric_evt_map.get(metric_id).abnormal_score
-+            if abn_score > largest_abn_score:
-+                metric_id_of_largest = metric_id
-+                largest_abn_score = abn_score
-+
-+        return metric_id_of_largest
- 
- 
- class Cause:
-diff --git a/cause_inference/rule_parser.py b/cause_inference/rule_parser.py
-index f08ed3c..eb6227c 100644
---- a/cause_inference/rule_parser.py
-+++ b/cause_inference/rule_parser.py
-@@ -1,7 +1,7 @@
- import os
- from abc import ABCMeta
- from abc import abstractmethod
--from typing import List
-+from typing import List, Dict, Tuple
- 
- import yaml
- 
-@@ -9,33 +9,86 @@ from spider.conf.observe_meta import RelationType
- from spider.conf.observe_meta import EntityType
- from spider.util import logger
- 
-+METRIC_CATEGORY_ALL = 'ALL'
-+METRIC_CATEGORY_OTHER = 'OTHER'
-+METRIC_CATEGORY_VIRTUAL = 'VIRTUAL'
-+METRIC_ID_OF_CATEGORY_VIRTUAL = 'virtual_metric'
- 
--class MetricPairSet:
--    def __init__(self, from_: set, to_: set):
-+
-+def is_virtual_metric(metric_id: str) -> bool:
-+    return metric_id == METRIC_ID_OF_CATEGORY_VIRTUAL
-+
-+
-+class MetricCategoryPair:
-+    def __init__(self, from_: str, to_: str):
-         self.from_ = from_
-         self.to_ = to_
- 
--    def check_metric_pair(self, from_metric_id: str, to_metric_id: str) -> bool:
--        if self.from_ and from_metric_id not in self.from_:
--            return False
--        if self.to_ and to_metric_id not in self.to_:
--            return False
--        return True
--
- 
- class RuleMeta:
--    def __init__(self, from_type, to_type, metric_range=None):
-+    def __init__(self, from_type, to_type, from_categories=None, to_categories=None, metric_range=None):
-         self.from_type = from_type
-         self.to_type = to_type
--        self.metric_range: List[MetricPairSet] = metric_range or []
--
--    def check_metric_pair(self, from_metric_id: str, to_metric_id: str) -> bool:
--        if not self.metric_range:
--            return True
--        for item in self.metric_range:
--            if item.check_metric_pair(from_metric_id, to_metric_id):
--                return True
--        return False
-+        self.from_categories = from_categories or {}
-+        self.to_categories = to_categories or {}
-+        self.category_pairs: List[MetricCategoryPair] = metric_range or []
-+
-+    @staticmethod
-+    def aggregate_metric_from_groups(category_type, metric_groups) -> List[list]:
-+        res = []
-+        if category_type == METRIC_CATEGORY_ALL:
-+            for cate_type, metric_group in metric_groups.items():
-+                if cate_type == METRIC_CATEGORY_VIRTUAL:
-+                    continue
-+                elif cate_type == METRIC_CATEGORY_OTHER:
-+                    res.extend([metric] for metric in metric_group)
-+                else:
-+                    res.append(metric_group)
-+        else:
-+            metric_group = metric_groups.get(category_type)
-+            if metric_group:
-+                res.append(metric_group)
-+
-+        return res
-+
-+    @staticmethod
-+    def _group_metric_by_category(metrics, categories) -> Dict[str, list]:
-+        parts = {}
-+        parted_metrics = set()
-+        for cate_type, cate_metrics in categories.items():
-+            part = []
-+            for metric in metrics:
-+                if metric in cate_metrics:
-+                    part.append(metric)
-+                    parted_metrics.add(metric)
-+            if len(part) > 0:
-+                parts.setdefault(cate_type, part)
-+
-+        other_part = []
-+        for metric in metrics:
-+            if metric not in parted_metrics:
-+                other_part.append(metric)
-+        if len(other_part) > 0:
-+            parts.setdefault(METRIC_CATEGORY_OTHER, other_part)
-+
-+        virtual_part = [METRIC_ID_OF_CATEGORY_VIRTUAL]
-+        parts.setdefault(METRIC_CATEGORY_VIRTUAL, virtual_part)
-+
-+        return parts
-+
-+    def get_avail_causal_relations(self, real_from_metrics, real_to_metrics) -> List[Tuple[list, list]]:
-+        causal_relations = []
-+
-+        from_groups = self._group_metric_by_category(real_from_metrics, self.from_categories)
-+        to_groups = self._group_metric_by_category(real_to_metrics, self.to_categories)
-+        for cate_pair in self.category_pairs:
-+            all_from_metrics = self.aggregate_metric_from_groups(cate_pair.from_, from_groups)
-+            all_to_metrics = self.aggregate_metric_from_groups(cate_pair.to_, to_groups)
-+            for from_metrics in all_from_metrics:
-+                for to_metrics in all_to_metrics:
-+                    causal_relations.append((from_metrics, to_metrics))
-+
-+        return causal_relations
- 
- 
- class Rule(metaclass=ABCMeta):
-@@ -180,7 +233,8 @@ class NicRule1(Rule):
- class RuleEngine:
-     def __init__(self):
-         self.rules: List[Rule] = []
--        self.rule_metas = {}
-+        self.metric_categories = {}
-+        self.rule_metas: Dict[tuple, RuleMeta] = {}
- 
-     def add_rule(self, rule: Rule):
-         self.rules.append(rule)
-@@ -190,33 +244,58 @@
-             rule.rule_parsing(causal_graph)
- 
-     def load_rule_meta_from_yaml(self, rule_path: str) -> bool:
--        abs_rule_path = os.path.abspath(rule_path)
--        if not os.path.exists(abs_rule_path):
--            logger.logger.warning("Rule meta path '{}' not exist", abs_rule_path)
--            return True
-         try:
--            with open(abs_rule_path, 'r') as file:
-+            with open(os.path.abspath(rule_path), 'r') as file:
-                 data = yaml.safe_load(file)
-         except IOError as ex:
-             logger.logger.warning(ex)
-             return False
- 
--        infer_rules = data.get("infer_rules", [])
--        for rule_meta in infer_rules:
--            saved_metric_range = []
--            for item in rule_meta.get("metric_range", []):
--                saved_metric_range.append(MetricPairSet(set(item.get('from', [])), set(item.get('to', []))))
--            saved_rule_meta = RuleMeta(rule_meta.get('from_type'), rule_meta.get('to_type'), saved_metric_range)
--            self.rule_metas.setdefault((rule_meta.get("from_type"), rule_meta.get("to_type")), saved_rule_meta)
--
-+        self.load_rule_meta_from_dict(data)
-         return True
- 
-+    def create_default_rule_meta(self, from_type, to_type):
-+        return RuleMeta(
-+            from_type,
-+            to_type,
-+            self.metric_categories.get(from_type),
-+            self.metric_categories.get(to_type),
-+            [MetricCategoryPair(METRIC_CATEGORY_ALL, METRIC_CATEGORY_ALL)]
-+        )
-+
-     def add_rule_meta(self, causal_graph):
-         entity_cause_graph = causal_graph.entity_cause_graph
-         for edge in entity_cause_graph.edges:
-             from_type = entity_cause_graph.nodes[edge[0]].get('type')
-             to_type = entity_cause_graph.nodes[edge[1]].get('type')
--            entity_cause_graph.edges[edge]["rule_meta"] = self.rule_metas.get((from_type, to_type))
-+            rule_meta = self.rule_metas.get((from_type, to_type))
-+            if not rule_meta:
-+                rule_meta = self.create_default_rule_meta(from_type, to_type)
-+            entity_cause_graph.edges[edge]["rule_meta"] = rule_meta
-+
-+    def load_rule_meta_from_dict(self, data: dict):
-+        self.load_metric_categories(data.get('metric_categories', {}))
-+        self.load_infer_rules(data.get("infer_rules", []))
-+
-+    def load_metric_categories(self, metric_categories: dict):
-+        for entity_type, categories in metric_categories.items():
-+            category_dict = {}
-+            for category in categories:
-+                category_dict.setdefault(category.get('category'), category.get('metrics'))
-+            self.metric_categories.setdefault(entity_type, category_dict)
-+
-+    def load_infer_rules(self, infer_rules: list):
-+        for rule_meta in infer_rules:
-+            from_entity_type = rule_meta.get('from_type')
-+            to_entity_type = rule_meta.get('to_type')
-+            saved_metric_range = []
-+            for item in rule_meta.get("metric_range", []):
-+                from_category = item.get('from')
-+                to_category = item.get('to')
-+                saved_metric_range.append(MetricCategoryPair(from_category, to_category))
-+            saved_rule_meta = RuleMeta(from_entity_type, to_entity_type, self.metric_categories.get(from_entity_type),
-+                                       self.metric_categories.get(to_entity_type), saved_metric_range)
-+            self.rule_metas.setdefault((from_entity_type, to_entity_type), saved_rule_meta)
- 
- 
- rule_engine = RuleEngine()
-diff --git a/config/infer-rule.yaml b/config/infer-rule.yaml
-index c3bc3f3..d287972 100644
---- a/config/infer-rule.yaml
-+++ b/config/infer-rule.yaml
-@@ -1,32 +1,98 @@
-+metric_categories:
-+  proc:
-+    -
-+      category: PROC_CPU
-+      metrics:
-+        - gala_gopher_proc_utime_jiffies
-+        - gala_gopher_proc_stime_jiffies
-+    -
-+      category: PROC_IO_LOAD
-+      metrics:
-+        - gala_gopher_proc_read_bytes
-+        - gala_gopher_proc_write_bytes
-+        - gala_gopher_proc_less_4k_io_read
-+        - gala_gopher_proc_less_4k_io_write
-+        - gala_gopher_proc_greater_4k_io_read
-+        - gala_gopher_proc_greater_4k_io_write
-+  disk:
-+    -
-+      category: DISK_LOAD
-+      metrics:
-+        - gala_gopher_disk_rspeed_kB
-+        - gala_gopher_disk_wspeed_kB
-+        - gala_gopher_disk_rspeed
-+        - gala_gopher_disk_wspeed
-+    -
-+      category: DISK_DELAY
-+      metrics:
-+        - gala_gopher_disk_r_await
-+        - gala_gopher_disk_w_await
-+        - gala_gopher_disk_rareq
-+        - gala_gopher_disk_wareq
-+  block:
-+    -
-+      category: BLOCK_DELAY
-+      metrics:
-+        - gala_gopher_block_latency_req_max
-+        - gala_gopher_block_latency_req_last
-+        - gala_gopher_block_latency_req_sum
-+        - gala_gopher_block_latency_req_jitter
-+  nic:
-+    -
-+      category: NIC_DROP
-+      metrics:
-+        - gala_gopher_nic_tc_sent_drop
-+        - gala_gopher_nic_tx_dropped
-+        - gala_gopher_nic_rx_dropped
-+  cpu:
-+    -
-+      category: CPU_TOTAL
-+      metrics:
-+        - gala_gopher_cpu_total_used_per
-+
- infer_rules:
-   -
-     from_type: cpu
-     to_type: proc
-     metric_range:
-       -
--        from: []
--        to:
--          - gala_gopher_proc_utime_jiffies
--          - gala_gopher_proc_stime_jiffies
-+        from: CPU_TOTAL
-+        to: PROC_CPU
-   -
-     from_type: block
-     to_type: proc
-     metric_range:
-       -
--        from: []
--        to:
--          - gala_gopher_proc_ns_ext4_read
--          - gala_gopher_proc_ns_ext4_write
--          - gala_gopher_proc_ns_ext4_flush
--          - gala_gopher_proc_ns_overlay_read
--          - gala_gopher_proc_ns_overlay_write
--          - gala_gopher_proc_ns_overlay_flush
-+        from: BLOCK_DELAY
-+        to: VIRTUAL
-   -
-     from_type: proc
-     to_type: disk
-     metric_range:
-       -
--        from:
--          - gala_gopher_proc_read_bytes
--          - gala_gopher_proc_write_bytes
--        to: []
-\ No newline at end of file
-+        from: PROC_IO_LOAD
-+        to: ALL
-+  -
-+    from_type: disk
-+    to_type: block
-+    metric_range:
-+      -
-+        from: DISK_DELAY
-+        to: BLOCK_DELAY
-+  -
-+    from_type: proc
-+    to_type: sli
-+    metric_range:
-+      -
-+        from: VIRTUAL
-+        to: ALL
-+      -
-+        from: PROC_CPU
-+        to: ALL
-+  -
-+    from_type: tcp_link
-+    to_type: proc
-+    metric_range:
-+      -
-+        from: ALL
-+        to: VIRTUAL
-\ No newline at end of file
---
-2.21.0.windows.1
-
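The DfsPolicy rework deleted above splits infer() into four steps: reverse the metric cause graph, enumerate DFS paths ending at the abnormal KPI node, score each path while skipping virtual nodes, and keep the top-k distinct non-virtual causes. Below is a minimal standalone sketch of that traversal and scoring. The entity and metric names are invented for illustration, and the simplified calc_path_score here averages every non-virtual node on a path rather than reproducing the patch's exact path[:length] slicing:

```python
import networkx as nx

VIRTUAL_METRIC = 'virtual_metric'  # mirrors METRIC_ID_OF_CATEGORY_VIRTUAL above


def is_virtual_node(node_id):
    # Metric nodes are (entity_id, metric_id) tuples; virtual metrics carry no score.
    return node_id[1] == VIRTUAL_METRIC


def calc_path_score(graph, path):
    # Average the abnormal scores of the non-virtual nodes on a candidate path.
    scores = [graph.nodes[n].get('abnormal_score', 0)
              for n in path if not is_virtual_node(n)]
    return sum(scores) / len(scores) if scores else 0.0


# Toy metric cause graph: edges point from cause to effect (the abnormal KPI).
g = nx.DiGraph()
kpi = ('sli_1', 'rtt')                          # hypothetical abnormal KPI node
cause = ('disk_1', 'gala_gopher_disk_w_await')  # hypothetical cause node
bridge = ('proc_1', VIRTUAL_METRIC)             # virtual node completing the path
g.add_node(kpi, abnormal_score=0.5)
g.add_node(cause, abnormal_score=0.9)
g.add_node(bridge)
g.add_edges_from([(cause, bridge), (bridge, kpi)])

# Walk backwards from the abnormal KPI, as DfsPolicy does on the reversed graph,
# then rank the candidate cause paths by score.
reversed_g = g.reverse()
paths = [list(reversed(p))
         for node in g.nodes if node != kpi
         for p in nx.all_simple_paths(reversed_g, kpi, node)]
for p in sorted(paths, key=lambda p: calc_path_score(g, p), reverse=True):
    print(round(calc_path_score(g, p), 2), ' -> '.join(str(n) for n in p))
```

The virtual bridge node lets the disk metric reach the KPI even though the intermediate process entity reported no abnormal metric, which is exactly the gap the patch's "virtual metric" change is meant to close.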
diff --git a/0002-adaptation-for-abnormal-event-output-change.patch b/0002-adaptation-for-abnormal-event-output-change.patch
deleted file mode 100644
index eb120ca..0000000
--- a/0002-adaptation-for-abnormal-event-output-change.patch
+++ /dev/null
@@ -1,108 +0,0 @@
-From 8a89c59b0c6194afdd6ed1c9bbd949b2cc62aeb0 Mon Sep 17 00:00:00 2001
-From: algorithmofdish
-Date: Tue, 29 Nov 2022 17:07:12 +0800
-Subject: [PATCH] refactor(infer): adaptation for abnormal event output change
-
----
- cause_inference/__main__.py    | 25 +++++++++++++++++++++----
- cause_inference/cause_infer.py | 21 ---------------------
- 2 files changed, 21 insertions(+), 25 deletions(-)
-
-diff --git a/cause_inference/__main__.py b/cause_inference/__main__.py
-index 093f7ac..d586b1f 100644
---- a/cause_inference/__main__.py
-+++ b/cause_inference/__main__.py
-@@ -14,7 +14,6 @@ from cause_inference.config import infer_config
- from cause_inference.config import init_infer_config
- from cause_inference.model import AbnormalEvent
- from cause_inference.cause_infer import cause_locating
--from cause_inference.cause_infer import parse_abn_evt
- from cause_inference.cause_infer import preprocess_abn_score
- from cause_inference.rule_parser import rule_engine
- from cause_inference.exceptions import InferenceException
-@@ -164,16 +163,34 @@ def init_obsv_meta_coll_thd():
-     return obsv_meta_coll_thread
- 
- 
-+def parse_abn_evt(data) -> AbnormalEvent:
-+    resource = data.get('Resource', {})
-+    attrs = data.get('Attributes', {})
-+    if not resource.get('metric'):
-+        raise DataParseException('Attribute "Resource.metric" required in abnormal event')
-+    if not attrs.get('entity_id') and not resource.get('labels'):
-+        raise DataParseException('metric_label or entity_id info need in abnormal event')
-+    abn_evt = AbnormalEvent(
-+        timestamp=data.get('Timestamp'),
-+        abnormal_metric_id=resource.get('metric'),
-+        abnormal_score=preprocess_abn_score(resource.get('score', 0.0)),
-+        metric_labels=resource.get('labels'),
-+        abnormal_entity_id=attrs.get('entity_id'),
-+        desc=resource.get('description', '') or data.get('Body', '')
-+    )
-+    return abn_evt
-+
-+
- def get_recommend_metric_evts(abn_kpi_data: dict) -> List[AbnormalEvent]:
-     metric_evts = []
-     obsv_meta_mgt = ObserveMetaMgt()
--    recommend_metrics = abn_kpi_data.get('Resource', {}).get('recommend_metrics', {})
-+    recommend_metrics = abn_kpi_data.get('Resource', {}).get('cause_metrics', {})
-     for metric_data in recommend_metrics:
-         metric_evt = AbnormalEvent(
-             timestamp=abn_kpi_data.get('Timestamp'),
-             abnormal_metric_id=metric_data.get('metric', ''),
--            abnormal_score=preprocess_abn_score(metric_data.get('score')),
--            metric_labels=metric_data.get('label', {}),
-+            abnormal_score=preprocess_abn_score(metric_data.get('score', 0.0)),
-+            metric_labels=metric_data.get('labels', {}),
-             desc=metric_data.get('description', '')
-         )
-         if not metric_evt.update_entity_id(obsv_meta_mgt):
-diff --git a/cause_inference/cause_infer.py b/cause_inference/cause_infer.py
-index dff26d0..6873c1e 100644
---- a/cause_inference/cause_infer.py
-+++ b/cause_inference/cause_infer.py
-@@ -1,7 +1,5 @@
- from typing import List
- 
--from scipy.special import expit
--
- from cause_inference.model import Cause
- from cause_inference.model import CausalGraph
- from cause_inference.model import AbnormalEvent
-@@ -12,7 +10,6 @@ from spider.collector.data_collector import DataCollector
- from spider.collector.prometheus_collector import PrometheusCollector
- from cause_inference.exceptions import InferenceException
- from cause_inference.exceptions import DBException
--from cause_inference.exceptions import DataParseException
- from cause_inference.config import infer_config
- from cause_inference.rule_parser import rule_engine
- from cause_inference.arangodb import connect_to_arangodb
-@@ -68,24 +65,6 @@ def query_abnormal_topo_subgraph(abnormal_event: AbnormalEvent):
-     return subgraph
- 
- 
---def parse_abn_evt(data) -> AbnormalEvent:
---    resource = data.get('Resource', {})
---    attrs = data.get('Attributes', {})
---    if not resource.get('metrics'):
---        raise DataParseException('Atribute "Resource.metrics" required in abnormal event')
---    if not attrs.get('entity_id') and not resource.get('metric_label'):
---        raise DataParseException('metric_label or entity_id info need in abnormal event')
---    abn_evt = AbnormalEvent(
---        timestamp=data.get('Timestamp'),
---        abnormal_metric_id=resource.get('metrics'),
---        abnormal_score=1.0,
---        metric_labels=resource.get('metric_label'),
---        abnormal_entity_id=attrs.get('entity_id'),
---        desc=resource.get('description', '') or data.get('Body', '')
---    )
---    return abn_evt
---
---
- def parse_entity_id(orig_entity_id: str) -> str:
-     fs_idx = orig_entity_id.index('/')
-     return orig_entity_id[fs_idx+1:]
---
-2.21.0.windows.1
-
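Patch 0002 above moves parse_abn_evt() into __main__.py and adapts it to the new abnormal-event schema: the metric id now arrives as Resource.metric (was Resource.metrics), labels as Resource.labels (was Resource.metric_label), a real Resource.score replaces the fixed 1.0, and candidate causes move from recommend_metrics to cause_metrics. A sketch of an event in the new shape — only the field names come from the patch, all values are invented:

```python
# Hypothetical abnormal KPI event in the new output format.
abn_kpi_event = {
    'Timestamp': 1669712832000,
    'Attributes': {'entity_id': 'HOST-1_sli_3305'},
    'Resource': {
        'metric': 'gala_gopher_sli_rtt_nsec',
        'labels': {'machine_id': 'HOST-1', 'tgid': '3305'},
        'score': 3.2,
        'description': 'sli rtt spike',
        'cause_metrics': [
            {'metric': 'gala_gopher_proc_write_bytes',
             'labels': {'machine_id': 'HOST-1', 'tgid': '4096'},
             'score': 1.8,
             'description': 'process write bytes spike'},
        ],
    },
    'Body': 'abnormal sli rtt detected',
}


def preprocess_abn_score(score):
    # As in patch 0001: clamp negative scores instead of squashing with expit(),
    # so the relative magnitude of large anomalies is preserved.
    return max(0, score)


resource = abn_kpi_event['Resource']
assert preprocess_abn_score(resource['score']) == 3.2
assert preprocess_abn_score(-1.5) == 0
```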
diff --git a/gala-spider-1.0.0.tar.gz b/gala-spider-1.0.0.tar.gz
deleted file mode 100644
index d4c266a..0000000
Binary files a/gala-spider-1.0.0.tar.gz and /dev/null differ
diff --git a/gala-spider-1.0.1.tar.gz b/gala-spider-1.0.1.tar.gz
new file mode 100644
index 0000000..88eefd1
Binary files /dev/null and b/gala-spider-1.0.1.tar.gz differ
diff --git a/gala-spider.spec b/gala-spider.spec
index b673ca5..9b490de 100644
--- a/gala-spider.spec
+++ b/gala-spider.spec
@@ -1,8 +1,8 @@
 %define debug_package %{nil}
 
 Name: gala-spider
-Version: 1.0.0
-Release: 6
+Version: 1.0.1
+Release: 1
 Summary: OS topological graph storage service and cause inference service for gala-ops project
 License: MulanPSL2
 URL: https://gitee.com/openeuler/gala-spider
@@ -11,9 +11,6 @@
 Source0: %{name}-%{version}.tar.gz
 BuildRequires: python3-setuptools systemd
 Requires: python3-%{name} = %{version}-%{release}
 
-patch0: 0001-cause-inference-optimization.patch
-patch1: 0002-adaptation-for-abnormal-event-output-change.patch
-
 %description
 OS topological graph storage service for gala-ops project
@@ -114,6 +111,7 @@
 fi
 %config(noreplace) %{_sysconfdir}/gala-inference/gala-inference.yaml
 %config(noreplace) %{_sysconfdir}/gala-inference/ext-observe-meta.yaml
 %config(noreplace) %{_sysconfdir}/gala-inference/infer-rule.yaml
+%config(noreplace) %{_sysconfdir}/gala-inference/cause-keyword.yaml
 %{_bindir}/gala-inference
 %{_unitdir}/gala-inference.service
@@ -124,6 +122,9 @@
 fi
 
 %changelog
+* Wed Dec 14 2022 algorithmofdish - 1.0.1-1
+- Update to 1.0.1: support cross-host cause location
+
 * Sat Dec 10 2022 algorithmofdish - 1.0.0-6
 - Adaptation for abnormal event output change
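For reference, the metric categorization that patch 0001 introduced in rule_parser.py — and that the rewritten infer-rule.yaml feeds — works roughly as follows. This is a self-contained sketch with a hypothetical helper name; the real logic lives in RuleMeta._group_metric_by_category and aggregate_metric_from_groups:

```python
OTHER, VIRTUAL = 'OTHER', 'VIRTUAL'
VIRTUAL_METRIC = 'virtual_metric'


def group_by_category(metrics, categories):
    # Bucket the observed abnormal metrics into the configured categories.
    # Uncategorized metrics fall into OTHER, and a VIRTUAL bucket is always
    # present so a rule like "from: BLOCK_DELAY, to: VIRTUAL" can still
    # complete a propagation path when the target entity reported nothing.
    parts, seen = {}, set()
    for cate, cate_metrics in categories.items():
        part = [m for m in metrics if m in cate_metrics]
        if part:
            parts[cate] = part
            seen.update(part)
    other = [m for m in metrics if m not in seen]
    if other:
        parts[OTHER] = other
    parts[VIRTUAL] = [VIRTUAL_METRIC]
    return parts


proc_categories = {
    'PROC_CPU': {'gala_gopher_proc_utime_jiffies', 'gala_gopher_proc_stime_jiffies'},
}
abnormal = ['gala_gopher_proc_utime_jiffies', 'gala_gopher_proc_write_bytes']
print(group_by_category(abnormal, proc_categories))
# -> {'PROC_CPU': ['gala_gopher_proc_utime_jiffies'],
#     'OTHER': ['gala_gopher_proc_write_bytes'],
#     'VIRTUAL': ['virtual_metric']}
```

Each category group then contributes a single representative edge to the metric cause graph — the member with the largest abnormal score, per metric_with_largest_abn_score in model.py — instead of the old all-pairs expansion.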