
Commit edca7f4

Got the online log parsing in drain_stdin_demo.py working
1 parent ba46d8e commit edca7f4

12 files changed: +64,137 −64,123 lines

data/chinese_english_logs_parse_by_drain3.csv (+6,163 −6,163): large diff not rendered

data/chinese_english_logs_parse_by_statistic.csv (+6,096 −6,096): large diff not rendered

data/english_logs_parse_by_drain3.csv (+33,195 −33,195): large diff not rendered

data/english_logs_parse_by_statistic.csv (+18,634 −18,634): large diff not rendered
2 binary files changed (contents not shown)

drain3/drain.py (+3 −3)

@@ -462,17 +462,17 @@ def match(self, content: str, full_search_strategy="never"):
         def full_search():
             all_ids = self.get_clusters_ids_for_seq_len(len(content_tokens))
             cluster = self.fast_match(all_ids, content_tokens, required_sim_th, include_params=True)
-            return cluster
+            return cluster, tokenize_result

         if full_search_strategy == "always":
             return full_search()

         match_cluster = self.tree_search(self.root_node, content_tokens, required_sim_th, include_params=True)
         if match_cluster is not None:
-            return match_cluster
+            return match_cluster, tokenize_result

         if full_search_strategy == "never":
-            return None
+            return None, tokenize_result

         return full_search()

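
Note on this change: every exit path of Drain.match now returns a (cluster, tokenize_result) pair instead of a bare cluster, so existing callers have to unpack two values (tokenize_result is presumably the tokenization info built earlier in match from the input content). A minimal sketch of the new calling convention, assuming an already-configured Drain instance from this fork named drain; the log line is just an example:

# Sketch only: `drain` is assumed to be a configured Drain object from this fork.
cluster, tokenize_result = drain.match("connected to 10.0.0.1", full_search_strategy="never")
if cluster is None:
    print("no existing template matches this line")  # None is still possible; the tuple shape is not
else:
    print(cluster.get_template())  # template of the matched cluster
    print(tokenize_result)         # tokenization details now returned alongside the cluster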

drain3/template_miner.py (+34 −16)

@@ -16,7 +16,7 @@
 from drain3.simple_profiler import SimpleProfiler, NullProfiler, Profiler
 from drain3.template_miner_config import TemplateMinerConfig
 from src.common_config import CLUSTER_COUNT_KEY, DEFAULT_STR_VALUE, USE_OLD_FUNCTION_EXTRACT_PARAMETER,\
-    STAR_CHAR, CLUSTER_ID_KEY,CLUSTER_SIZE_KEY,TEMPLATE_MINED_KEY,LOG_TEMPLATE_TOKENS_KEY
+    TOKEN_LIST_KEY, CLUSTER_ID_KEY,CLUSTER_SIZE_KEY,TEMPLATE_MINED_KEY,LOG_TEMPLATE_TOKENS_KEY,ENABLE_MASK_CONTENT

 logger = logging.getLogger(__name__)

@@ -139,6 +139,15 @@ def get_snapshot_reason(self, change_type, cluster_id):

         return None

+    def make_result_dict(self,cluster, tokenize_result):
+        result_dict = { CLUSTER_ID_KEY: cluster.cluster_id,
+                        CLUSTER_SIZE_KEY: cluster.size,  # yd: how many log lines this cluster has matched
+                        LOG_TEMPLATE_TOKENS_KEY: cluster.log_template_tokens,
+                        TEMPLATE_MINED_KEY: cluster.get_template()  # yd: the mined log template
+                        }
+        result_dict.update(tokenize_result)
+        return result_dict
+
     def add_log_message(self, log_message: str) -> dict:
         """
         yd: given the incoming log message, find the LogCluster of the matching log template
@@ -147,7 +156,7 @@ def add_log_message(self, log_message: str) -> dict:
         """
         self.profiler.start_section("total")

-        if 0:
+        if ENABLE_MASK_CONTENT:
             self.profiler.start_section("mask")
             # yd: replace regex-matched substrings in log_message with placeholder tokens,
             # e.g. the IP in "connected to 10.0.0.1" is replaced with "<:IP:>", yielding "connected to <:IP:>"
@@ -163,19 +172,11 @@ def add_log_message(self, log_message: str) -> dict:

         result = {
             "change_type": change_type,
-            #"cluster_id": cluster.cluster_id,
-            CLUSTER_ID_KEY: cluster.cluster_id,
-            #"cluster_size": cluster.size,  # yd: how many log lines this cluster has matched
-            CLUSTER_SIZE_KEY: cluster.size,  # yd: how many log lines this cluster has matched
-            #"log_template_tokens": cluster.log_template_tokens,
-            LOG_TEMPLATE_TOKENS_KEY: cluster.log_template_tokens,
-            #"template_mined": cluster.get_template(),  # yd: the mined log template
-            TEMPLATE_MINED_KEY: cluster.get_template(),  # yd: the mined log template
-            #"cluster_count": len(self.drain.clusters)  # yd: total number of templates mined so far
             CLUSTER_COUNT_KEY: len(self.drain.clusters)  # yd: total number of templates mined so far
-
         }
-        result.update(tokenize_result)
+        result_dict = self.make_result_dict(cluster, tokenize_result)
+        result.update(result_dict)
+
         # yd: save a snapshot of the current log-template state here
         if self.persistence_handler is not None:
             self.profiler.start_section("save_state")
@@ -208,10 +209,15 @@ def match(self, log_message: str, full_search_strategy="never") -> LogCluster:
         count of wildcard matches.
         :return: Matched cluster or None if no match found.
         """
+        if ENABLE_MASK_CONTENT:
+            # yd: replace regex-matched substrings in log_message with placeholder tokens,
+            # e.g. the IP in "connected to 10.0.0.1" is replaced with "<:IP:>", yielding "connected to <:IP:>"
+            masked_content = self.masker.mask(log_message)
+        else:
+            masked_content = log_message

-        masked_content = self.masker.mask(log_message)
-        matched_cluster = self.drain.match(masked_content, full_search_strategy)
-        return matched_cluster
+        matched_cluster, tokenize_result = self.drain.match(masked_content, full_search_strategy)
+        return matched_cluster, tokenize_result

     def get_parameter_list(self, log_template: str, log_message: str) -> List[str]:
         """
@@ -230,6 +236,18 @@ def get_parameter_list(self, log_template: str, log_message: str) -> List[str]:
             return []
         return [parameter.value for parameter in extracted_parameters]

+    def get_parameter(self,result_dict, log_line):
+        if USE_OLD_FUNCTION_EXTRACT_PARAMETER:
+            # template = result["template_mined"]
+            template = result_dict.get(TEMPLATE_MINED_KEY, DEFAULT_STR_VALUE)
+            params = self.extract_parameters(template, log_line)
+            return params
+        content_tokens = result_dict.get(TOKEN_LIST_KEY, [])
+        # log_template_tokens = result["log_template_tokens"]
+        log_template_tokens = result_dict.get(LOG_TEMPLATE_TOKENS_KEY, [])
+        params = self.extract_parameters_by_compare(content_tokens, log_template_tokens)
+        return params
+
     def extract_parameters_by_compare(self, content_tokens, log_template_tokens):
         parameter_list = []
         for token1, token2 in zip(content_tokens, log_template_tokens):
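
The new get_parameter helper picks an extraction strategy based on USE_OLD_FUNCTION_EXTRACT_PARAMETER: the old path re-extracts parameters from the mined template string, while the new path compares the tokenized content against the template tokens. A toy illustration of that token-compare idea follows; the token values and the "<*>" wildcard are invented for demonstration, and the real comparison lives in extract_parameters_by_compare, of which this diff only shows the first lines:

# Hypothetical tokens, for illustration only.
content_tokens      = ["connected", "to", "10.0.0.1"]
log_template_tokens = ["connected", "to", "<*>"]
# Positions where the content token differs from the template token are treated as parameters.
params = [tok for tok, tmpl in zip(content_tokens, log_template_tokens) if tok != tmpl]
print(params)  # ['10.0.0.1']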

src/common_config.py (+1 −0)

@@ -11,6 +11,7 @@


 USE_OLD_FUNCTION_EXTRACT_PARAMETER = False
+ENABLE_MASK_CONTENT = False

 CHINESE_SUBSTR_TYPE = "中"
 SPACE_SUBSTR_TYPE = "空格"

src/drain3_examples/drain_stdin_demo.py (+7 −13)

@@ -60,15 +60,7 @@
     result = template_miner.add_log_message(log_line)
     result_json = json.dumps(result,ensure_ascii=False)
     print(result_json)
-    if USE_OLD_FUNCTION_EXTRACT_PARAMETER:
-        #template = result["template_mined"]
-        template = result.get(TEMPLATE_MINED_KEY, DEFAULT_STR_VALUE)
-        params = template_miner.extract_parameters(template, log_line)
-    else:
-        content_tokens = result.get(TOKEN_LIST_KEY, [])
-        #log_template_tokens = result["log_template_tokens"]
-        log_template_tokens = result.get(LOG_TEMPLATE_TOKENS_KEY,[])
-        params = template_miner.extract_parameters_by_compare(content_tokens, log_template_tokens)
+    params = template_miner.get_parameter(result, log_line)
     print("Parameters: " + str(params))
 # yd: training done; print every mined cluster
 print("Training done. Mined clusters:")
@@ -80,10 +72,12 @@
     log_line = input("> ")
     if log_line == 'q':
         break
-    cluster = template_miner.match(log_line)
+    cluster, tokenize_result = template_miner.match(log_line)
     if cluster is None:
         print(f"No match found")
     else:
-        template = cluster.get_template()
-        print(f"Matched template #{cluster.cluster_id}: {template}")
-        print(f"Parameters: {template_miner.get_parameter_list(template, log_line)}")
+        # template = cluster.get_template()
+        # print(f"Matched template #{cluster.cluster_id}: {template}")
+        result = template_miner.make_result_dict(cluster, tokenize_result)
+        params = template_miner.get_parameter(result, log_line)
+        print(f"Parameters: {params}")

src/log_parser_by_drain3.py (+2 −3)

@@ -5,9 +5,8 @@
 from src.tool.read_save_file import open_excel, save_dataframe

 from src.common_config import DATA_DIR_PATH, DEFAULT_STR_VALUE, USE_OLD_FUNCTION_EXTRACT_PARAMETER,\
-    STAR_CHAR, CLUSTER_ID_KEY,CLUSTER_SIZE_KEY,TEMPLATE_MINED_KEY
-from src.common_config import IS_CONTAIN_CHINESE_KEY, SUBSTR_TYPE_PATTERN_KEY, SUBSTR_DETAIL_LIST_KEY, \
-    TOKEN_LIST_KEY,LOG_TEMPLATE_TOKENS_KEY
+    CLUSTER_ID_KEY,CLUSTER_SIZE_KEY,TEMPLATE_MINED_KEY, IS_CONTAIN_CHINESE_KEY, SUBSTR_TYPE_PATTERN_KEY, \
+    SUBSTR_DETAIL_LIST_KEY, TOKEN_LIST_KEY, LOG_TEMPLATE_TOKENS_KEY

 from src.tool.str_related import get_tow_set_diff
 import json

src/tool/tokenizer.py (+2 −0)

@@ -77,6 +77,8 @@ def split_substr(substr_detail_list, need_split_substr_type, is_split_by_space):

             split_list.extend(temp_token_list)
         else:
+            if substr_type == SPACE_SUBSTR_TYPE:
+                continue
             split_list.append(match_str)
     return split_list

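
The two added lines mean that substrings tagged as SPACE_SUBSTR_TYPE are now dropped instead of being appended to the token list. A toy illustration of the filtering effect; the structure of substr_detail_list and the "other" type tag are assumptions for demonstration, not the real data model:

# Hypothetical data; only the "skip space-type entries" behaviour mirrors the change above.
SPACE_SUBSTR_TYPE = "空格"
substr_detail_list = [
    ("connected", "other"),       # hypothetical non-space type tag
    ("   ", SPACE_SUBSTR_TYPE),   # whitespace run: skipped after this change
    ("10.0.0.1", "other"),
]
split_list = [s for s, substr_type in substr_detail_list if substr_type != SPACE_SUBSTR_TYPE]
print(split_list)  # ['connected', '10.0.0.1']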
