
Commit edca7f4

Got the online log parsing in drain_stdin_demo.py working
1 parent ba46d8e commit edca7f4

12 files changed: +64,137 −64,123 lines

data/chinese_english_logs_parse_by_drain3.csv (+6,163 −6,163): large diff not rendered

data/chinese_english_logs_parse_by_statistic.csv (+6,096 −6,096): large diff not rendered

data/english_logs_parse_by_drain3.csv (+33,195 −33,195): large diff not rendered

data/english_logs_parse_by_statistic.csv (+18,634 −18,634): large diff not rendered
2 binary files changed (contents not shown)

drain3/drain.py (+3 −3)

@@ -462,17 +462,17 @@ def match(self, content: str, full_search_strategy="never"):
         def full_search():
             all_ids = self.get_clusters_ids_for_seq_len(len(content_tokens))
             cluster = self.fast_match(all_ids, content_tokens, required_sim_th, include_params=True)
-            return cluster
+            return cluster, tokenize_result

         if full_search_strategy == "always":
             return full_search()

         match_cluster = self.tree_search(self.root_node, content_tokens, required_sim_th, include_params=True)
         if match_cluster is not None:
-            return match_cluster
+            return match_cluster, tokenize_result

         if full_search_strategy == "never":
-            return None
+            return None, tokenize_result

         return full_search()

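
Note on this change: every exit path of Drain.match now returns a (cluster, tokenize_result) pair instead of a bare cluster, so existing callers have to unpack two values (tokenize_result is presumably the tokenization info built earlier in match from the input content). A minimal sketch of the new calling convention, assuming an already-configured Drain instance from this fork named drain; the log line is just an example:

# Sketch only: `drain` is assumed to be a configured Drain object from this fork.
cluster, tokenize_result = drain.match("connected to 10.0.0.1", full_search_strategy="never")
if cluster is None:
    print("no existing template matches this line")  # None is still possible; the tuple shape is not
else:
    print(cluster.get_template())  # template of the matched cluster
    print(tokenize_result)         # tokenization details now returned alongside the cluster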

drain3/template_miner.py (+34 −16)

@@ -16,7 +16,7 @@
 from drain3.simple_profiler import SimpleProfiler, NullProfiler, Profiler
 from drain3.template_miner_config import TemplateMinerConfig
 from src.common_config import CLUSTER_COUNT_KEY, DEFAULT_STR_VALUE, USE_OLD_FUNCTION_EXTRACT_PARAMETER,\
-    STAR_CHAR, CLUSTER_ID_KEY,CLUSTER_SIZE_KEY,TEMPLATE_MINED_KEY,LOG_TEMPLATE_TOKENS_KEY
+    TOKEN_LIST_KEY, CLUSTER_ID_KEY,CLUSTER_SIZE_KEY,TEMPLATE_MINED_KEY,LOG_TEMPLATE_TOKENS_KEY,ENABLE_MASK_CONTENT

 logger = logging.getLogger(__name__)

@@ -139,6 +139,15 @@ def get_snapshot_reason(self, change_type, cluster_id):

         return None

+    def make_result_dict(self,cluster, tokenize_result):
+        result_dict = { CLUSTER_ID_KEY: cluster.cluster_id,
+                        CLUSTER_SIZE_KEY: cluster.size,  # yd: how many log lines this cluster has matched
+                        LOG_TEMPLATE_TOKENS_KEY: cluster.log_template_tokens,
+                        TEMPLATE_MINED_KEY: cluster.get_template()  # yd: the mined log template
+                        }
+        result_dict.update(tokenize_result)
+        return result_dict
+
     def add_log_message(self, log_message: str) -> dict:
         """
         yd: given the incoming log message, find the LogCluster of the matching log template
@@ -147,7 +156,7 @@ def add_log_message(self, log_message: str) -> dict:
         """
         self.profiler.start_section("total")

-        if 0:
+        if ENABLE_MASK_CONTENT:
             self.profiler.start_section("mask")
             # yd: replace regex-matched substrings in log_message with placeholder tokens,
             # e.g. the IP in "connected to 10.0.0.1" is replaced with "<:IP:>", yielding "connected to <:IP:>"
@@ -163,19 +172,11 @@ def add_log_message(self, log_message: str) -> dict:

         result = {
             "change_type": change_type,
-            #"cluster_id": cluster.cluster_id,
-            CLUSTER_ID_KEY: cluster.cluster_id,
-            #"cluster_size": cluster.size,  # yd: how many log lines this cluster has matched
-            CLUSTER_SIZE_KEY: cluster.size,  # yd: how many log lines this cluster has matched
-            #"log_template_tokens": cluster.log_template_tokens,
-            LOG_TEMPLATE_TOKENS_KEY: cluster.log_template_tokens,
-            #"template_mined": cluster.get_template(),  # yd: the mined log template
-            TEMPLATE_MINED_KEY: cluster.get_template(),  # yd: the mined log template
-            #"cluster_count": len(self.drain.clusters)  # yd: total number of templates mined so far
             CLUSTER_COUNT_KEY: len(self.drain.clusters)  # yd: total number of templates mined so far
-
         }
-        result.update(tokenize_result)
+        result_dict = self.make_result_dict(cluster, tokenize_result)
+        result.update(result_dict)
+
         # yd: save a snapshot of the current log-template state here
         if self.persistence_handler is not None:
             self.profiler.start_section("save_state")
@@ -208,10 +209,15 @@ def match(self, log_message: str, full_search_strategy="never") -> LogCluster:
         count of wildcard matches.
         :return: Matched cluster or None if no match found.
         """
+        if ENABLE_MASK_CONTENT:
+            # yd: replace regex-matched substrings in log_message with placeholder tokens,
+            # e.g. the IP in "connected to 10.0.0.1" is replaced with "<:IP:>", yielding "connected to <:IP:>"
+            masked_content = self.masker.mask(log_message)
+        else:
+            masked_content = log_message

-        masked_content = self.masker.mask(log_message)
-        matched_cluster = self.drain.match(masked_content, full_search_strategy)
-        return matched_cluster
+        matched_cluster, tokenize_result = self.drain.match(masked_content, full_search_strategy)
+        return matched_cluster, tokenize_result

     def get_parameter_list(self, log_template: str, log_message: str) -> List[str]:
         """
@@ -230,6 +236,18 @@ def get_parameter_list(self, log_template: str, log_message: str) -> List[str]:
             return []
         return [parameter.value for parameter in extracted_parameters]

+    def get_parameter(self,result_dict, log_line):
+        if USE_OLD_FUNCTION_EXTRACT_PARAMETER:
+            # template = result["template_mined"]
+            template = result_dict.get(TEMPLATE_MINED_KEY, DEFAULT_STR_VALUE)
+            params = self.extract_parameters(template, log_line)
+            return params
+        content_tokens = result_dict.get(TOKEN_LIST_KEY, [])
+        # log_template_tokens = result["log_template_tokens"]
+        log_template_tokens = result_dict.get(LOG_TEMPLATE_TOKENS_KEY, [])
+        params = self.extract_parameters_by_compare(content_tokens, log_template_tokens)
+        return params
+
     def extract_parameters_by_compare(self, content_tokens, log_template_tokens):
         parameter_list = []
         for token1, token2 in zip(content_tokens, log_template_tokens):
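
The new get_parameter helper picks an extraction strategy based on USE_OLD_FUNCTION_EXTRACT_PARAMETER: the old path re-extracts parameters from the mined template string, while the new path compares the tokenized content against the template tokens. A toy illustration of that token-compare idea follows; the token values and the "<*>" wildcard are invented for demonstration, and the real comparison lives in extract_parameters_by_compare, of which this diff only shows the first lines:

# Hypothetical tokens, for illustration only.
content_tokens      = ["connected", "to", "10.0.0.1"]
log_template_tokens = ["connected", "to", "<*>"]
# Positions where the content token differs from the template token are treated as parameters.
params = [tok for tok, tmpl in zip(content_tokens, log_template_tokens) if tok != tmpl]
print(params)  # ['10.0.0.1']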

src/common_config.py (+1 −0)

@@ -11,6 +11,7 @@


 USE_OLD_FUNCTION_EXTRACT_PARAMETER = False
+ENABLE_MASK_CONTENT = False

 CHINESE_SUBSTR_TYPE = "中"
 SPACE_SUBSTR_TYPE = "空格"

src/drain3_examples/drain_stdin_demo.py (+7 −13)

@@ -60,15 +60,7 @@
     result = template_miner.add_log_message(log_line)
     result_json = json.dumps(result,ensure_ascii=False)
     print(result_json)
-    if USE_OLD_FUNCTION_EXTRACT_PARAMETER:
-        #template = result["template_mined"]
-        template = result.get(TEMPLATE_MINED_KEY, DEFAULT_STR_VALUE)
-        params = template_miner.extract_parameters(template, log_line)
-    else:
-        content_tokens = result.get(TOKEN_LIST_KEY, [])
-        #log_template_tokens = result["log_template_tokens"]
-        log_template_tokens = result.get(LOG_TEMPLATE_TOKENS_KEY,[])
-        params = template_miner.extract_parameters_by_compare(content_tokens, log_template_tokens)
+    params = template_miner.get_parameter(result, log_line)
     print("Parameters: " + str(params))
 # yd: training done; print every mined cluster
 print("Training done. Mined clusters:")
@@ -80,10 +72,12 @@
     log_line = input("> ")
     if log_line == 'q':
         break
-    cluster = template_miner.match(log_line)
+    cluster, tokenize_result = template_miner.match(log_line)
     if cluster is None:
         print(f"No match found")
     else:
-        template = cluster.get_template()
-        print(f"Matched template #{cluster.cluster_id}: {template}")
-        print(f"Parameters: {template_miner.get_parameter_list(template, log_line)}")
+        # template = cluster.get_template()
+        # print(f"Matched template #{cluster.cluster_id}: {template}")
+        result = template_miner.make_result_dict(cluster, tokenize_result)
+        params = template_miner.get_parameter(result, log_line)
+        print(f"Parameters: {params}")

src/log_parser_by_drain3.py (+2 −3)

@@ -5,9 +5,8 @@
 from src.tool.read_save_file import open_excel, save_dataframe

 from src.common_config import DATA_DIR_PATH, DEFAULT_STR_VALUE, USE_OLD_FUNCTION_EXTRACT_PARAMETER,\
-    STAR_CHAR, CLUSTER_ID_KEY,CLUSTER_SIZE_KEY,TEMPLATE_MINED_KEY
-from src.common_config import IS_CONTAIN_CHINESE_KEY, SUBSTR_TYPE_PATTERN_KEY, SUBSTR_DETAIL_LIST_KEY, \
-    TOKEN_LIST_KEY,LOG_TEMPLATE_TOKENS_KEY
+    CLUSTER_ID_KEY,CLUSTER_SIZE_KEY,TEMPLATE_MINED_KEY, IS_CONTAIN_CHINESE_KEY, SUBSTR_TYPE_PATTERN_KEY, \
+    SUBSTR_DETAIL_LIST_KEY, TOKEN_LIST_KEY, LOG_TEMPLATE_TOKENS_KEY

 from src.tool.str_related import get_tow_set_diff
 import json

src/tool/tokenizer.py (+2 −0)

@@ -77,6 +77,8 @@ def split_substr(substr_detail_list, need_split_substr_type, is_split_by_space):

             split_list.extend(temp_token_list)
         else:
+            if substr_type == SPACE_SUBSTR_TYPE:
+                continue
             split_list.append(match_str)
     return split_list

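
The two added lines mean that substrings tagged as SPACE_SUBSTR_TYPE are now dropped instead of being appended to the token list. A toy illustration of the filtering effect; the structure of substr_detail_list and the "other" type tag are assumptions for demonstration, not the real data model:

# Hypothetical data; only the "skip space-type entries" behaviour mirrors the change above.
SPACE_SUBSTR_TYPE = "空格"
substr_detail_list = [
    ("connected", "other"),       # hypothetical non-space type tag
    ("   ", SPACE_SUBSTR_TYPE),   # whitespace run: skipped after this change
    ("10.0.0.1", "other"),
]
split_list = [s for s, substr_type in substr_detail_list if substr_type != SPACE_SUBSTR_TYPE]
print(split_list)  # ['connected', '10.0.0.1']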
