改造drain3，使其能处理包含中文的日志

dongdong9 · dongdong9 · commit 8baccd7de264 · 2022-09-29T11:59:59.000+08:00
diff --git a/.gitignore b/.gitignore
@@ -3,3 +3,4 @@ src/__pycache__/*
 .idea/
 src/drain3_examples/SSH.log
 src/drain3_examples/SSH.tar.gz
+config_ini/drain3_state.bin
diff --git a/config_ini/drain3.ini b/config_ini/drain3.ini
@@ -12,8 +12,10 @@ masking = [
           {"regex_pattern":"((?<=[^A-Za-z0-9])|^)([\\-\\+]?\\d+)((?=[^A-Za-z0-9])|$)", "mask_with": "NUM"},
           {"regex_pattern":"(?<=executed cmd )(\".+?\")", "mask_with": "CMD"}
           ]
-mask_prefix = <:
-mask_suffix = :>
+;mask_prefix = <:
+;mask_suffix = :>
+mask_prefix = <<
+mask_suffix = >>
 
 [DRAIN]
 sim_th = 0.4
diff --git a/data/chinese_english_logs_parse_by_drain3.csv b/data/chinese_english_logs_parse_by_drain3.csv
diff --git a/data/english_logs_parse_by_drain3.csv b/data/english_logs_parse_by_drain3.csv
diff --git a/drain3/drain.py b/drain3/drain.py
@@ -7,7 +7,7 @@
 from cachetools import LRUCache, Cache
 
 from drain3.simple_profiler import Profiler, NullProfiler
-
+from src.tool.tokenizer import get_token_list
 
 class LogCluster:
     __slots__ = ["log_template_tokens", "cluster_id", "size"]
@@ -20,7 +20,7 @@ def __init__(self, log_template_tokens: list, cluster_id: int):
         """
         self.log_template_tokens = tuple(log_template_tokens)
         self.cluster_id = cluster_id
-        self.size = 1
+        self.size = 1 #yd。用于统计当前cluster匹配的日志条数
 
     def get_template(self):
         return ' '.join(self.log_template_tokens)
@@ -138,6 +138,7 @@ def tree_search(self, root_node: Node, tokens: list, sim_th: float, include_para
 
         # find the leaf node for this log - a path of nodes matching the first N tokens (N=tree depth)
         cur_node_depth = 1
+
         for token in tokens:
             # at max depth
             if cur_node_depth >= self.max_node_depth:
@@ -235,7 +236,7 @@ def add_seq_to_prefix_tree(self, root_node, cluster: LogCluster):
     # seq1 is a template, seq2 is the log to match
     def get_seq_distance(self, seq1, seq2, include_params: bool):
         """
-
+        yd。功能：计算seq1与seq2的相似度，相似度 = 公共元素的个数/ seq1的长度
         :param seq1:
         :param seq2:
         :param include_params:
@@ -266,6 +267,7 @@ def get_seq_distance(self, seq1, seq2, include_params: bool):
 
     def fast_match(self, cluster_ids: Sequence, tokens: list, sim_th: float, include_params: bool):
         """
+        yd。功能：从cluster_ids对应的所有cluster中，找出cluster.log_template_tokens与tokens相似度最高的
         Find the best match for a log message (represented as tokens) versus a list of clusters
         :param cluster_ids: List of clusters to match against (represented by their IDs)
         :param tokens: the log message, separated to tokens.
@@ -332,17 +334,36 @@ def print_node(self, token, node, depth, file, max_clusters):
             out_str = '\t' * (depth + 1) + str(cluster)
             print(out_str, file=file)
 
-    def get_content_as_tokens(self, content):
+    def get_content_as_tokens_raw(self, content):
+        """
+        Original drain3 tokenizer: replace each extra delimiter with a space, then split on whitespace (English only, no Chinese handling).
+        :param content: log message text to tokenize
+        :return: list of tokens produced by ``str.split()``
+        """
         content = content.strip()
         for delimiter in self.extra_delimiters:
             content = content.replace(delimiter, " ")
         content_tokens = content.split()
         return content_tokens
 
+
+    def get_content_as_tokens(self, content):
+        """
+        Tokenize a log message that is either pure English or mixed Chinese/English.
+        :param content: log message text; leading/trailing whitespace is stripped first
+        :return: the ``token_list`` element of the tuple returned by ``get_token_list``
+        """
+        # Debug print removed: this method is called for every log message added.
+        # get_token_list returns a 5-tuple; only token_list (4th element) is used here.
+        content = content.strip()
+        _, _, _, token_list, _ = get_token_list(content)
+        return token_list
+
+
     def add_log_message(self, content: str):
         """
         yd。功能：根据传入的content，获取匹配的logCluster，该LogCluster可能是先前已经存在的，也可能是需要新生成的
-        :param content:被正则匹配mask后的日志内容
+        :param content: log content after regex masking, e.g. "connected to <<IP>>" (mask_prefix "<<" / mask_suffix ">>" as set in drain3.ini)
         :return:match_cluster：匹配的logCluster；update_type：表示更新match_cluster的原因
         """
         content_tokens = self.get_content_as_tokens(content)  # yd。对content进行分词
@@ -370,20 +391,20 @@ def add_log_message(self, content: str):
             if self.profiler:
                 self.profiler.start_section("cluster_exist")
             new_template_tokens = self.create_template(content_tokens, match_cluster.log_template_tokens)
-            if tuple(new_template_tokens) == match_cluster.log_template_tokens:
+            if tuple(new_template_tokens) == match_cluster.log_template_tokens: #yd。如果新创建的模板与最匹配的模板相同
                 update_type = "none"
-            else:
+            else:#yd。如果新创建的模板与最新的模板不相同，则用新创建的模板来更新最匹配的模板
                 match_cluster.log_template_tokens = tuple(new_template_tokens)
                 update_type = "cluster_template_changed"
             match_cluster.size += 1
             # Touch cluster to update its state in the cache.
             # noinspection PyStatementEffect
-            self.id_to_cluster[match_cluster.cluster_id]
+            self.id_to_cluster[match_cluster.cluster_id] #yd。因为使用了LRUCache机制来控制cluster个数，故这里需要访问一下match_cluster对应的id
 
         if self.profiler:
             self.profiler.end_section()
 
-        return match_cluster, update_type
+        return match_cluster, update_type,content_tokens
 
     def get_clusters_ids_for_seq_len(self, seq_len: int):
         """
diff --git a/drain3/template_miner.py b/drain3/template_miner.py
@@ -145,22 +145,33 @@ def add_log_message(self, log_message: str) -> dict:
         """
         self.profiler.start_section("total")
 
-        self.profiler.start_section("mask")
-        masked_content = self.masker.mask(log_message) #yd。将log_message字符串中正则匹配的子串，用特定符号替换，比如将content中的ip数字用"<:IP:>"替换
-        self.profiler.end_section()
+        if 0:
+            self.profiler.start_section("mask")
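+            # yd. Replace every regex-matched substring of log_message with its mask token.
+            # e.g. the IP in "connected to 10.0.0.1" is masked, giving "connected to <<IP>>" with the mask_prefix/mask_suffix from drain3.ini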
+            masked_content = self.masker.mask(log_message)
+            self.profiler.end_section()
+        else:
+            masked_content = log_message
 
         self.profiler.start_section("drain")
-        cluster, change_type = self.drain.add_log_message(masked_content) #yd。根据传入的masked_content，获取匹配的logCluster
+        # yd。根据传入的masked_content，获取匹配的logCluster
+        cluster, change_type, content_tokens = self.drain.add_log_message(masked_content)
         self.profiler.end_section("drain")
 
         result = {
+            "content_tokens":content_tokens,
             "change_type": change_type,
             "cluster_id": cluster.cluster_id,
-            "cluster_size": cluster.size,
+            "cluster_size": cluster.size, #yd。用于统计当前cluster匹配的日志条数
+            "log_template_tokens": cluster.log_template_tokens,
             "template_mined": cluster.get_template(), #yd。返回挖掘处理的日志模板
+
             "cluster_count": len(self.drain.clusters) #yd。统计当前已经挖掘的模板的 总数
+
         }
 
+        #yd。这里是将当前的日志模板信息的快照保存下来
         if self.persistence_handler is not None:
             self.profiler.start_section("save_state")
             snapshot_reason = self.get_snapshot_reason(change_type, cluster.cluster_id)
@@ -214,6 +225,16 @@ def get_parameter_list(self, log_template: str, log_message: str) -> List[str]:
             return []
         return [parameter.value for parameter in extracted_parameters]
 
+    def extract_parameters_by_compare(self, content_tokens, log_template_tokens):
+        """Return an ExtractedParameter for each log token that differs from the template token at the same position."""
+        # zip() pairs tokens by index and stops at the shorter sequence; mask_name is always the literal "-".
+        return [
+            ExtractedParameter(log_token, mask_name="-")
+            for log_token, template_token in zip(content_tokens, log_template_tokens)
+            if log_token != template_token
+        ]
+
+
     def extract_parameters(self,
                            log_template: str,
                            log_message: str,
@@ -244,8 +265,6 @@ def extract_parameters(self,
             log_template, exact_matching)
 
         # Parameters are represented by specific named groups inside template_regex.
-
-
         parameter_match = re.match(template_regex, log_message)
 
         # log template does not match template
@@ -254,7 +273,7 @@ def extract_parameters(self,
 
         # create list of extracted parameters
         extracted_parameters = []
-        for group_name, parameter in parameter_match.groupdict().items():
+        for group_name, parameter in parameter_match.groupdict().items(): #yd。对正则匹配的结果进行遍历
             if group_name in param_group_name_to_mask_name:
                 mask_name = param_group_name_to_mask_name[group_name]
                 extracted_parameter = ExtractedParameter(parameter, mask_name)
@@ -265,7 +284,7 @@ def extract_parameters(self,
     @cachedmethod(lambda self: self.parameter_extraction_cache)
     def _get_template_parameter_extraction_regex(self, log_template: str, exact_matching: bool):
         """
-
+        yd。功能：构建模板参数抽取的正则表达式
         :param log_template:
         :param exact_matching:
         :return: template_regex:
diff --git a/src/common_config.py b/src/common_config.py
@@ -9,6 +9,9 @@
 
 DEFAULT_STR_VALUE = " "
 
+SPACE_CHAR = " "
+
+USE_OLD_FUNCTION_EXTRACT_PARAMETER = False
 
 CHINESE_SUBSTR_TYPE = "中"
 SPACE_SUBSTR_TYPE = "空格"
diff --git a/src/drain3_examples/drain_stdin_demo.py b/src/drain3_examples/drain_stdin_demo.py
@@ -7,7 +7,9 @@
 
 from drain3 import TemplateMiner
 from drain3.template_miner_config import TemplateMinerConfig
-from src.common_config import CONFIG_DIR_PATH
+from src.common_config import CONFIG_DIR_PATH,USE_OLD_FUNCTION_EXTRACT_PARAMETER
+from src.tool.tokenizer import get_token_list
+
 # persistence_type = "NONE"
 # persistence_type = "REDIS"
 # persistence_type = "KAFKA"
@@ -53,14 +55,21 @@
     log_line = input("> ")
     if log_line == 'q':
         break
+    # is_contain_chinese, substr_type_pattern, substr_detail_list, token_list, token_join_str = get_token_list(log_line)
+    # log_line = token_join_str
     result = template_miner.add_log_message(log_line)
-    result_json = json.dumps(result)
+    result_json = json.dumps(result,ensure_ascii=False)
     print(result_json)
-    template = result["template_mined"]
-    params = template_miner.extract_parameters(template, log_line)
+    if USE_OLD_FUNCTION_EXTRACT_PARAMETER:
+        template = result["template_mined"]
+        params = template_miner.extract_parameters(template, log_line)
+    else:
+        content_tokens = result["content_tokens"]
+        log_template_tokens = result["log_template_tokens"]
+        params = template_miner.extract_parameters_by_compare(content_tokens, log_template_tokens)
     print("Parameters: " + str(params))
-
-print("Training done. Mined clusters:") #yd。训练完毕，打印挖掘的cluster
+#yd。训练完毕，打印挖掘的每个cluster
+print("Training done. Mined clusters:")
 for cluster in template_miner.drain.clusters:
     print(cluster)
 
diff --git a/src/log_parser_by_drain3.py b/src/log_parser_by_drain3.py
@@ -4,7 +4,7 @@
 from tqdm import tqdm  # 进度条
 from src.tool.read_save_file import open_excel, save_dataframe
 
-from src.common_config import DATA_DIR_PATH, CONNECTOR_CHAR, \
+from src.common_config import DATA_DIR_PATH, CONNECTOR_CHAR, USE_OLD_FUNCTION_EXTRACT_PARAMETER,\
     STAR_CHAR, CLUSTER_ID_KEY,CLUSTER_SIZE_KEY,TEMPLATE_MINED_KEY,CLUSTER_COUNT_KEY
 from src.tool.str_related import get_tow_set_diff
 from src.tool.tool import calculate_normalize_ratio
@@ -36,11 +36,19 @@ def __init__(self):
         print(f"Starting training mode. Reading from std-in ('q' to finish)") #yd。利用输入的一条条日志，训练得到模板
 
     def parse_log_content(self, log_line):
+        # is_contain_chinese, substr_type_pattern, substr_detail_list, token_list, token_join_str = get_token_list(
+        #     log_line)
+        # log_line = token_join_str
         result = self.template_miner.add_log_message(log_line)
-        result_json = json.dumps(result)
+        result_json = json.dumps(result,ensure_ascii=False)
         #print(result_json)
-        template = result["template_mined"] #yd。取出挖掘的日志模板
-        params = self.template_miner.extract_parameters(template, log_line)
+        if USE_OLD_FUNCTION_EXTRACT_PARAMETER:
+            template = result["template_mined"]
+            params = self.template_miner.extract_parameters(template, log_line)
+        else:
+            content_tokens = result["content_tokens"]
+            log_template_tokens = result["log_template_tokens"]
+            params = self.template_miner.extract_parameters_by_compare(content_tokens, log_template_tokens)
         #print("Parameters: " + str(params))
         return result, params
 
diff --git a/src/tool/tokenizer.py b/src/tool/tokenizer.py
@@ -1,6 +1,6 @@
 from src.common_config import DATA_DIR_PATH,CHINESE_REGEXP,CONNECTOR_CHAR,\
     PUNCTUATION_MARK_REGEXP,NONE_CHINESE_REGEXP, CHINESE_SUBSTR_TYPE,SPACE_SUBSTR_TYPE, ENGLISH_SUBSTR_TYPE,\
-    CHINESE_SPACE_CHINESE_PATTERN,STAR_CHAR,PUNCTUATION_MARK_TYPE
+    CHINESE_SPACE_CHINESE_PATTERN,SPACE_CHAR,PUNCTUATION_MARK_TYPE
 from src.tool.str_related import str_normalize, get_tow_set_diff
 import jieba
 
@@ -94,7 +94,8 @@ def get_token_list(content):
             token_list = split_substr(substr_detail_list, CHINESE_SUBSTR_TYPE, is_split_by_space=False)
     else:  # 即模式中不包含中文，则对英文按空格进行切分
         token_list = split_substr(substr_detail_list, ENGLISH_SUBSTR_TYPE, is_split_by_space=True)
-    return is_contain_chinese, substr_type_pattern, substr_detail_list, token_list
+    token_join_str = SPACE_CHAR.join(token_list)
+    return is_contain_chinese, substr_type_pattern, substr_detail_list, token_list,token_join_str
 
 
 if __name__ == '__main__':