Skip to content

Commit 8baccd7

Browse files
committed
改造drain3,使其能处理包含中文的日志
1 parent 69287c3 commit 8baccd7

10 files changed

+41090
-41026
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@ src/__pycache__/*
33
.idea/
44
src/drain3_examples/SSH.log
55
src/drain3_examples/SSH.tar.gz
6+
config_ini/drain3_state.bin

config_ini/drain3.ini

+4-2
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,10 @@ masking = [
1212
{"regex_pattern":"((?<=[^A-Za-z0-9])|^)([\\-\\+]?\\d+)((?=[^A-Za-z0-9])|$)", "mask_with": "NUM"},
1313
{"regex_pattern":"(?<=executed cmd )(\".+?\")", "mask_with": "CMD"}
1414
]
15-
mask_prefix = <:
16-
mask_suffix = :>
15+
;mask_prefix = <:
16+
;mask_suffix = :>
17+
mask_prefix = <<
18+
mask_suffix = >>
1719

1820
[DRAIN]
1921
sim_th = 0.4

data/chinese_english_logs_parse_by_drain3.csv

+6,163-6,163
Large diffs are not rendered by default.

data/english_logs_parse_by_drain3.csv

+34,831-34,831
Large diffs are not rendered by default.

drain3/drain.py

+30-9
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from cachetools import LRUCache, Cache
88

99
from drain3.simple_profiler import Profiler, NullProfiler
10-
10+
from src.tool.tokenizer import get_token_list
1111

1212
class LogCluster:
1313
__slots__ = ["log_template_tokens", "cluster_id", "size"]
@@ -20,7 +20,7 @@ def __init__(self, log_template_tokens: list, cluster_id: int):
2020
"""
2121
self.log_template_tokens = tuple(log_template_tokens)
2222
self.cluster_id = cluster_id
23-
self.size = 1
23+
self.size = 1 #yd。用于统计当前cluster匹配的日志条数
2424

2525
def get_template(self):
2626
return ' '.join(self.log_template_tokens)
@@ -138,6 +138,7 @@ def tree_search(self, root_node: Node, tokens: list, sim_th: float, include_para
138138

139139
# find the leaf node for this log - a path of nodes matching the first N tokens (N=tree depth)
140140
cur_node_depth = 1
141+
141142
for token in tokens:
142143
# at max depth
143144
if cur_node_depth >= self.max_node_depth:
@@ -235,7 +236,7 @@ def add_seq_to_prefix_tree(self, root_node, cluster: LogCluster):
235236
# seq1 is a template, seq2 is the log to match
236237
def get_seq_distance(self, seq1, seq2, include_params: bool):
237238
"""
238-
239+
yd。功能:计算seq1与seq2的相似度,相似度 = 公共元素的个数/ seq1的长度
239240
:param seq1:
240241
:param seq2:
241242
:param include_params:
@@ -266,6 +267,7 @@ def get_seq_distance(self, seq1, seq2, include_params: bool):
266267

267268
def fast_match(self, cluster_ids: Sequence, tokens: list, sim_th: float, include_params: bool):
268269
"""
270+
yd。功能:从cluster_ids对应的所有cluster中,找出cluster.log_template_tokens与tokens相似度最高的
269271
Find the best match for a log message (represented as tokens) versus a list of clusters
270272
:param cluster_ids: List of clusters to match against (represented by their IDs)
271273
:param tokens: the log message, separated to tokens.
@@ -332,17 +334,36 @@ def print_node(self, token, node, depth, file, max_clusters):
332334
out_str = '\t' * (depth + 1) + str(cluster)
333335
print(out_str, file=file)
334336

335-
def get_content_as_tokens(self, content):
337+
def get_content_as_tokens_raw(self, content):
    """
    Tokenize a log line the way stock drain3 does.

    This is the original drain3 tokenizer: it was written with English
    logs in mind and performs no Chinese word segmentation.

    :param content: the log line to tokenize
    :return: list of tokens obtained by stripping the line, turning every
        configured extra delimiter into a space, and splitting on whitespace
    """
    text = content.strip()
    # Normalise every extra delimiter to a plain space so that a single
    # whitespace split below handles all separators uniformly.
    for separator in self.extra_delimiters:
        text = text.replace(separator, " ")
    return text.split()
341348

349+
350+
def get_content_as_tokens(self, content):
    """
    Tokenize a log line that is either pure English or mixed Chinese/English.

    The actual segmentation is delegated to ``get_token_list``; only its
    token list is kept.

    :param content: the log line to tokenize
    :return: the token list produced by ``get_token_list`` for the stripped line
    """
    # get_token_list returns a 5-tuple:
    # (is_contain_chinese, substr_type_pattern, substr_detail_list,
    #  token_list, token_join_str). Only token_list is needed here.
    _, _, _, content_tokens, _ = get_token_list(content.strip())
    # Deliberately no debug print: this runs once per log line, and dumping
    # every token list to stdout floods the output of any caller.
    # NOTE(review): self.extra_delimiters is not applied on this path
    # (the raw tokenizer does apply it) - confirm that this is intended.
    return content_tokens
361+
362+
342363
def add_log_message(self, content: str):
343364
"""
344365
yd。功能:根据传入的content,获取匹配的logCluster,该LogCluster可能是先前已经存在的,也可能是需要新生成的
345-
:param content:被正则匹配mask后的日志内容
366+
:param content:被正则匹配mask后的日志内容,例如"connected to <:IP:>"
346367
:return:match_cluster:匹配的logCluster;update_type:表示更新match_cluster的原因
347368
"""
348369
content_tokens = self.get_content_as_tokens(content) # yd。对content进行分词
@@ -370,20 +391,20 @@ def add_log_message(self, content: str):
370391
if self.profiler:
371392
self.profiler.start_section("cluster_exist")
372393
new_template_tokens = self.create_template(content_tokens, match_cluster.log_template_tokens)
373-
if tuple(new_template_tokens) == match_cluster.log_template_tokens:
394+
if tuple(new_template_tokens) == match_cluster.log_template_tokens: #yd。如果新创建的模板与最匹配的模板相同
374395
update_type = "none"
375-
else:
396+
else:#yd。如果新创建的模板与最新的模板不相同,则用新创建的模板来更新最匹配的模板
376397
match_cluster.log_template_tokens = tuple(new_template_tokens)
377398
update_type = "cluster_template_changed"
378399
match_cluster.size += 1
379400
# Touch cluster to update its state in the cache.
380401
# noinspection PyStatementEffect
381-
self.id_to_cluster[match_cluster.cluster_id]
402+
self.id_to_cluster[match_cluster.cluster_id] #yd。因为使用了LRUCache机制来控制cluster个数,故这里需要访问一下match_cluster对应的id
382403

383404
if self.profiler:
384405
self.profiler.end_section()
385406

386-
return match_cluster, update_type
407+
return match_cluster, update_type,content_tokens
387408

388409
def get_clusters_ids_for_seq_len(self, seq_len: int):
389410
"""

drain3/template_miner.py

+28-9
Original file line numberDiff line numberDiff line change
@@ -145,22 +145,33 @@ def add_log_message(self, log_message: str) -> dict:
145145
"""
146146
self.profiler.start_section("total")
147147

148-
self.profiler.start_section("mask")
149-
masked_content = self.masker.mask(log_message) #yd。将log_message字符串中正则匹配的子串,用特定符号替换,比如将content中的ip数字用"<:IP:>"替换
150-
self.profiler.end_section()
148+
if 0:
149+
self.profiler.start_section("mask")
150+
# yd。将log_message字符串中正则匹配的子串,用特定符号替换。
151+
# 比如将"connected to 10.0.0.1"中的ip数字用"<:IP:>"替换,返回"connected to <:IP:>"
152+
masked_content = self.masker.mask(log_message)
153+
self.profiler.end_section()
154+
else:
155+
masked_content = log_message
151156

152157
self.profiler.start_section("drain")
153-
cluster, change_type = self.drain.add_log_message(masked_content) #yd。根据传入的masked_content,获取匹配的logCluster
158+
# yd。根据传入的masked_content,获取匹配的logCluster
159+
cluster, change_type, content_tokens = self.drain.add_log_message(masked_content)
154160
self.profiler.end_section("drain")
155161

156162
result = {
163+
"content_tokens":content_tokens,
157164
"change_type": change_type,
158165
"cluster_id": cluster.cluster_id,
159-
"cluster_size": cluster.size,
166+
"cluster_size": cluster.size, #yd。用于统计当前cluster匹配的日志条数
167+
"log_template_tokens": cluster.log_template_tokens,
160168
"template_mined": cluster.get_template(), #yd。返回挖掘处理的日志模板
169+
161170
"cluster_count": len(self.drain.clusters) #yd。统计当前已经挖掘的模板的 总数
171+
162172
}
163173

174+
#yd。这里是将当前的日志模板信息的快照保存下来
164175
if self.persistence_handler is not None:
165176
self.profiler.start_section("save_state")
166177
snapshot_reason = self.get_snapshot_reason(change_type, cluster.cluster_id)
@@ -214,6 +225,16 @@ def get_parameter_list(self, log_template: str, log_message: str) -> List[str]:
214225
return []
215226
return [parameter.value for parameter in extracted_parameters]
216227

228+
def extract_parameters_by_compare(self, content_tokens, log_template_tokens):
    """
    Extract parameters by comparing log tokens with template tokens.

    The two sequences are walked in lockstep; every log token that differs
    from the template token at the same position is reported as a parameter.
    Pairing stops at the end of the shorter sequence.

    :param content_tokens: tokens of the log line
    :param log_template_tokens: tokens of the matched cluster template
    :return: list of ExtractedParameter objects (mask name "-"), one per
        differing position, in positional order
    """
    return [
        ExtractedParameter(log_token, mask_name="-")
        for log_token, template_token in zip(content_tokens, log_template_tokens)
        if log_token != template_token
    ]
236+
237+
217238
def extract_parameters(self,
218239
log_template: str,
219240
log_message: str,
@@ -244,8 +265,6 @@ def extract_parameters(self,
244265
log_template, exact_matching)
245266

246267
# Parameters are represented by specific named groups inside template_regex.
247-
248-
249268
parameter_match = re.match(template_regex, log_message)
250269

251270
# log template does not match template
@@ -254,7 +273,7 @@ def extract_parameters(self,
254273

255274
# create list of extracted parameters
256275
extracted_parameters = []
257-
for group_name, parameter in parameter_match.groupdict().items():
276+
for group_name, parameter in parameter_match.groupdict().items(): #yd。对正则匹配的结果进行遍历
258277
if group_name in param_group_name_to_mask_name:
259278
mask_name = param_group_name_to_mask_name[group_name]
260279
extracted_parameter = ExtractedParameter(parameter, mask_name)
@@ -265,7 +284,7 @@ def extract_parameters(self,
265284
@cachedmethod(lambda self: self.parameter_extraction_cache)
266285
def _get_template_parameter_extraction_regex(self, log_template: str, exact_matching: bool):
267286
"""
268-
287+
yd。功能:构建模板参数抽取的正则表达式
269288
:param log_template:
270289
:param exact_matching:
271290
:return: template_regex:

src/common_config.py

+3
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99

1010
DEFAULT_STR_VALUE = " "
1111

12+
SPACE_CHAR = " "
13+
14+
USE_OLD_FUNCTION_EXTRACT_PARAMETER = False
1215

1316
CHINESE_SUBSTR_TYPE = "中"
1417
SPACE_SUBSTR_TYPE = "空格"

src/drain3_examples/drain_stdin_demo.py

+15-6
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@
77

88
from drain3 import TemplateMiner
99
from drain3.template_miner_config import TemplateMinerConfig
10-
from src.common_config import CONFIG_DIR_PATH
10+
from src.common_config import CONFIG_DIR_PATH,USE_OLD_FUNCTION_EXTRACT_PARAMETER
11+
from src.tool.tokenizer import get_token_list
12+
1113
# persistence_type = "NONE"
1214
# persistence_type = "REDIS"
1315
# persistence_type = "KAFKA"
@@ -53,14 +55,21 @@
5355
log_line = input("> ")
5456
if log_line == 'q':
5557
break
58+
# is_contain_chinese, substr_type_pattern, substr_detail_list, token_list, token_join_str = get_token_list(log_line)
59+
# log_line = token_join_str
5660
result = template_miner.add_log_message(log_line)
57-
result_json = json.dumps(result)
61+
result_json = json.dumps(result,ensure_ascii=False)
5862
print(result_json)
59-
template = result["template_mined"]
60-
params = template_miner.extract_parameters(template, log_line)
63+
if USE_OLD_FUNCTION_EXTRACT_PARAMETER:
64+
template = result["template_mined"]
65+
params = template_miner.extract_parameters(template, log_line)
66+
else:
67+
content_tokens = result["content_tokens"]
68+
log_template_tokens = result["log_template_tokens"]
69+
params = template_miner.extract_parameters_by_compare(content_tokens, log_template_tokens)
6170
print("Parameters: " + str(params))
62-
63-
print("Training done. Mined clusters:") #yd。训练完毕,打印挖掘的cluster
71+
#yd。训练完毕,打印挖掘的每个cluster
72+
print("Training done. Mined clusters:")
6473
for cluster in template_miner.drain.clusters:
6574
print(cluster)
6675

src/log_parser_by_drain3.py

+12-4
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from tqdm import tqdm # 进度条
55
from src.tool.read_save_file import open_excel, save_dataframe
66

7-
from src.common_config import DATA_DIR_PATH, CONNECTOR_CHAR, \
7+
from src.common_config import DATA_DIR_PATH, CONNECTOR_CHAR, USE_OLD_FUNCTION_EXTRACT_PARAMETER,\
88
STAR_CHAR, CLUSTER_ID_KEY,CLUSTER_SIZE_KEY,TEMPLATE_MINED_KEY,CLUSTER_COUNT_KEY
99
from src.tool.str_related import get_tow_set_diff
1010
from src.tool.tool import calculate_normalize_ratio
@@ -36,11 +36,19 @@ def __init__(self):
3636
print(f"Starting training mode. Reading from std-in ('q' to finish)") #yd。利用输入的一条条日志,训练得到模板
3737

3838
def parse_log_content(self, log_line):
    """
    Feed one log line to the template miner and extract its parameters.

    :param log_line: the log line to parse
    :return: tuple ``(result, params)`` where ``result`` is the dict returned
        by ``template_miner.add_log_message`` and ``params`` is the list of
        parameters extracted for this line
    """
    result = self.template_miner.add_log_message(log_line)
    # The result is intentionally not serialised to JSON here: the string was
    # never used, and building it for every log line is wasted work.
    if USE_OLD_FUNCTION_EXTRACT_PARAMETER:
        # Legacy path: regex-based extraction against the joined template string.
        params = self.template_miner.extract_parameters(
            result["template_mined"], log_line)
    else:
        # Token-level path: compare the tokens of this line with the tokens
        # of the matched template, position by position.
        params = self.template_miner.extract_parameters_by_compare(
            result["content_tokens"], result["log_template_tokens"])
    return result, params
4654

src/tool/tokenizer.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from src.common_config import DATA_DIR_PATH,CHINESE_REGEXP,CONNECTOR_CHAR,\
22
PUNCTUATION_MARK_REGEXP,NONE_CHINESE_REGEXP, CHINESE_SUBSTR_TYPE,SPACE_SUBSTR_TYPE, ENGLISH_SUBSTR_TYPE,\
3-
CHINESE_SPACE_CHINESE_PATTERN,STAR_CHAR,PUNCTUATION_MARK_TYPE
3+
CHINESE_SPACE_CHINESE_PATTERN,SPACE_CHAR,PUNCTUATION_MARK_TYPE
44
from src.tool.str_related import str_normalize, get_tow_set_diff
55
import jieba
66

@@ -94,7 +94,8 @@ def get_token_list(content):
9494
token_list = split_substr(substr_detail_list, CHINESE_SUBSTR_TYPE, is_split_by_space=False)
9595
else: # 即模式中不包含中文,则对英文按空格进行切分
9696
token_list = split_substr(substr_detail_list, ENGLISH_SUBSTR_TYPE, is_split_by_space=True)
97-
return is_contain_chinese, substr_type_pattern, substr_detail_list, token_list
97+
token_join_str = SPACE_CHAR.join(token_list)
98+
return is_contain_chinese, substr_type_pattern, substr_detail_list, token_list,token_join_str
9899

99100

100101
if __name__ == '__main__':

0 commit comments

Comments
 (0)