-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathGet-Comments.py
executable file
·284 lines (254 loc) · 11.6 KB
/
Get-Comments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
#!/usr/local/bin/python3.9
# -*- coding: utf-8 -*-
# @Author skillnull
# @Function 获取网易云音乐评论
import os
import random
import json
import time
import base64
import codecs
import requests
import multiprocessing # 多进程
from Crypto.Cipher import AES
import re # 正则表达式库
import numpy as np # numpy数据处理库
import collections # 词频统计库
import wordcloud # 词云展示库
import jieba # 结巴分词
import matplotlib.pyplot as plt
from PIL import Image # 图像处理库
# Module-level configuration shared by every function below.
# Both prompts run as soon as the module is executed or imported.
# NOTE(review): handler_comments starts multiprocessing children; under the
# "spawn" start method the main module is re-imported in each child, which
# would re-run these prompts - confirm this behaves as intended on the target OS.
ID = input('请输入歌曲ID:')
NAME = input('请输入歌曲名称:')
# HTTP headers sent with every comment request (see get_json).
HEADERS = {
'authority': 'music.163.com',
'Host': 'music.163.com',
'Origin': 'https://music.163.com',
'Referer': f'https://music.163.com/song?id={ID}',
"x-music-loc-site": "100_https://music.163.com/song",
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
'Accept-Encoding': 'gzip, deflate',
'Content-Type': 'application/x-www-form-urlencoded',
'Connection': 'keep-alive',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,da;q=0.7'
}
print(HEADERS)
# Endpoint that get_json posts the encrypted form to.
url = 'https://music.163.com/weapi/comment/resource/comments/get?csrf_token='
# Overwritten by get_aes_params with the serialized request payload (bytes).
first_param = ''
# Hex string passed to rsa_encrypt as the public exponent (0x010001).
second_param = '010001'
# Hex string passed to rsa_encrypt as the modulus.
third_param = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa' \
'76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee' \
'255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7'
# AES key used for the first encryption pass in get_aes_params.
forth_param = b'0CoJUm6Qyw8W8jud'
# "params" needs the first and fourth parameters; "encSecKey" needs a random
# 16-character string plus the second and third parameters.
# Here the 16-character string is the fixed value 'S' * 16 rather than a random one;
# it is used both as the second AES key and as the RSA plaintext.
strw = 'S' * 16
# Pagination cursor sent with each page request; -1 is used for the first request
# and it is then replaced by the "cursor" value returned in each response.
cursor_temp = -1
# AES encryption
def aes_encrypt(text, key):
    """Encrypt *text* with AES in CBC mode and return Base64-encoded bytes.

    Args:
        text: Plaintext as ``bytes`` (padding bytes are concatenated to it).
        key: AES key handed to ``AES.new``; callers in this file pass ``bytes``.

    Returns:
        The Base64 encoding (``bytes``) of the ciphertext.
    """
    init_vector = b'0102030405060708'  # fixed IV
    # Pad up to the next multiple of 16 bytes; every pad byte holds the pad
    # length, and a full 16-byte block is added when already aligned.
    fill = 16 - len(text) % 16
    padded = text + bytes([fill]) * fill
    cipher = AES.new(key, AES.MODE_CBC, init_vector)
    return base64.b64encode(cipher.encrypt(padded))
# RSA encryption
def rsa_encrypt(pubkey, text, mouduls):
    """Apply textbook (unpadded) RSA to the reversed *text*.

    The text is reversed, UTF-8 encoded, read as one big-endian integer and
    raised to ``pubkey`` modulo ``mouduls``.

    Args:
        pubkey: Public exponent as a hexadecimal string.
        text: Non-empty plaintext string.
        mouduls: Modulus as a hexadecimal string.

    Returns:
        The result as lowercase hexadecimal, left-padded with zeros to at
        least 256 characters.

    Raises:
        ValueError: If *text* is empty or a hex argument cannot be parsed.
    """
    message = int(text[::-1].encode('utf-8').hex(), 16)
    exponent = int(pubkey, 16)
    modulus = int(mouduls, 16)
    # Three-argument pow() reduces modulo `modulus` at every step; the
    # previous `message ** exponent % modulus` first built the full power
    # (millions of bits for exponent 0x10001) before reducing it.
    encrypted = pow(message, exponent, modulus)
    return format(encrypted, 'x').zfill(256)
# rid is the song id
# Build the AES-encrypted "params" form field
def get_aes_params(text, cursor):
    """Return the doubly AES-encrypted request payload for one comment page.

    The payload dict is serialized with ``str()`` and UTF-8 encoded, stored in
    the module-level ``first_param``, encrypted with ``forth_param`` as key,
    and the Base64 result is encrypted again with ``strw`` as key.

    Args:
        text: Page number, sent as ``pageNo``.
        cursor: Pagination cursor, sent as ``cursor``.

    Returns:
        Base64-encoded ``bytes`` from the second encryption pass.
    """
    global first_param
    thread_id = f"R_SO_4_{ID}"
    payload = {
        "rid": thread_id,
        "threadId": thread_id,
        "pageNo": f"{text}",
        "pageSize": "20",
        "cursor": f"{cursor}",
        "offset": "0",
        "orderType": "1",
        "csrf_token": "",
    }
    first_param = str(payload).encode('utf-8')
    first_pass = aes_encrypt(first_param, forth_param)
    return aes_encrypt(first_pass, strw.encode('utf-8'))
# Build the RSA-encrypted "encSecKey" form field
def get_rsa_params(text):
    """Return *text* encrypted by ``rsa_encrypt`` with the module-level key.

    ``second_param`` is used as the exponent and ``third_param`` as the modulus.
    """
    return rsa_encrypt(pubkey=second_param, text=text, mouduls=third_param)
# Fetch comments
def get_json(pm, esk, timeout=30):
    """POST the encrypted form to the comment endpoint and return the body.

    Args:
        pm: Value for the ``params`` form field (see ``get_aes_params``).
        esk: Value for the ``encSecKey`` form field (see ``get_rsa_params``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body as text (callers parse it as JSON).

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    form_data = {
        'params': pm,
        'encSecKey': esk,
    }
    response = requests.post(url, headers=HEADERS, data=form_data, timeout=timeout)
    return response.text
# Parse comments
def get_all_comment():
    """Look up the comment count, ask how many pages to fetch, and export them.

    Requests page 1 to read ``data.totalCount``, derives the page count at 20
    comments per page, creates ``musicComments/{ID}``, prompts for the number
    of pages and hands ``range(n)`` (capped at the available pages) to
    ``handler_comments``.
    """
    first_page = json.loads(get_json(get_aes_params(1, -1), get_rsa_params(strw)))
    total = int(first_page['data']['totalCount'])
    pages = -(-total // 20)  # ceiling division: 20 comments per page
    print(f'共有{total}条,{pages}页评论!')
    os.makedirs(f'musicComments/{ID}', mode=0o777, exist_ok=True)  # per-song output directory
    wanted = int(input(f'获取多少页评论,最多{pages}页:'))
    # Never request more pages than exist.
    handler_comments(range(min(pages, wanted)))
# Process the parsed comments
def handler_comments(comments):
    """Run the HTML export and then the text export, one child process each.

    Each child is started and joined before the next begins, with a
    3-second pause between the two.

    Args:
        comments: Passed unchanged to ``save_to_html`` and ``save_to_txt``
            (the caller supplies a ``range`` of page indices).
    """
    for position, exporter in enumerate((save_to_html, save_to_txt)):
        if position:
            time.sleep(3)
        worker = multiprocessing.Process(target=exporter, args=(comments,))
        worker.start()
        worker.join()
# Save the comments as an HTML table
def save_to_html(comments):
    """Fetch the requested comment pages and write them as an HTML table.

    Output goes to ``musicComments/{ID}/{ID}.html``. The file is written as
    UTF-8 to match the ``<meta charset="utf-8">`` it declares, and every value
    taken from the API response (and the user-supplied ``NAME``) is
    HTML-escaped so that comment text cannot inject markup into the page.

    The module-level ``cursor_temp`` is updated from each response and reset
    to -1 when the export finishes.

    Args:
        comments: Iterable of zero-based page indices; index ``i`` is
            requested as page number ``i + 1``.
    """
    # Function-scope import so this block does not rely on the file's import list.
    from html import escape

    global cursor_temp
    print(f'抓取{comments}页')

    header_style = 'min-width:65px;border: 1px solid #f4f4f4;padding: 5px;'
    cell_style = 'border: 1px solid #f4f4f4;padding: 5px;'
    column_titles = ('评论者id', '头像', '昵称', '评论内容', '点赞总数', '评论时间', 'IP属地')

    with codecs.open(f'musicComments/{ID}/{ID}.html', 'w', encoding='utf-8') as file:
        file.write('<html>')
        file.write('<head>')
        file.write('<meta charset="utf-8">')
        file.write(f'<title>{escape(NAME)}</title>')
        file.write('</head>')
        file.write('<body>')
        file.write('<table style="font-size:12px;font-weight:300;">')
        file.write('<thead>')
        file.write('<tr>')
        file.write(''.join(f'<td style="{header_style}">{title}</td>' for title in column_titles))
        file.write('</tr>')
        file.write('</thead>')
        for i in comments:  # fetch page by page
            params = get_aes_params(i + 1, cursor_temp)
            enc_seckey = get_rsa_params(strw)
            json_dict = json.loads(get_json(params, enc_seckey))
            cursor_temp = json_dict['data']['cursor']
            for item in json_dict['data']['comments']:
                user_id = item['user']['userId']  # commenter id
                nickname = item['user']['nickname']
                comment = item['content']
                time_temp = item['time']
                # A 13-digit timestamp is treated as milliseconds.
                if len(f'{time_temp}') == 13:
                    time_temp = float(time_temp / 1000)
                _time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time_temp))
                liked_count = item['likedCount']
                location = item['ipLocation']['location']  # IP location
                avatar = item['user']['avatarUrl']
                cells = (
                    escape(str(user_id)),
                    f'<img src="{escape(str(avatar))}" width="50" height="50" />',
                    escape(str(nickname)),
                    escape(str(comment)),
                    escape(str(liked_count)),
                    escape(_time),
                    escape(str(location)),
                )
                file.write('<tr>')
                file.write(''.join(f'<td style="{cell_style}">{cell}</td>' for cell in cells))
                file.write('</tr>')
            # Random 0-2 second pause between page requests.
            time.sleep(random.randint(0, 2))
        file.write('</table>')
        file.write('</body>')
        file.write('</html>')
    cursor_temp = -1
# Write the comments to a text file
def save_to_txt(comments):
    """Fetch the requested comment pages, write them as text, then build the word cloud.

    Output goes to ``musicComments/{ID}/{NAME}.txt`` (UTF-8). Each comment is
    one ``id | nickname | content | likes | time | location`` line with CR/LF
    stripped, followed by a dashed separator line. The module-level
    ``cursor_temp`` is updated from each response and reset to -1 afterwards;
    ``get_wordcloud`` is called once the file has been closed.

    Args:
        comments: Iterable of zero-based page indices; index ``i`` is
            requested as page number ``i + 1``.
    """
    global cursor_temp
    print(f'抓取{comments}页')
    separator = '\r\n-----------------------------------------------------\r\n'
    with codecs.open(f'musicComments/{ID}/{NAME}.txt', 'w', encoding='utf-8') as out:
        for page_index in comments:  # fetch page by page
            request_params = get_aes_params(page_index + 1, cursor_temp)
            request_key = get_rsa_params(strw)
            page = json.loads(get_json(request_params, request_key))
            cursor_temp = page['data']['cursor']
            entries = []
            for item in page['data']['comments']:
                comment = item['content']
                nickname = item['user']['nickname']
                user_id = item['user']['userId']
                liked_count = item['likedCount']
                stamp = item['time']
                # A 13-digit timestamp is treated as milliseconds.
                if len(f'{stamp}') == 13:
                    stamp = float(stamp / 1000)
                _time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(stamp))
                location = item['ipLocation']['location']
                record = f'{user_id} | {nickname} | {comment} | {liked_count} | {_time} | {location}'
                record = record.replace('\r', '').replace('\n', '')
                entries.append(record + separator)
            out.write(''.join(entries))
            # Random 0-3 second pause between page requests.
            time.sleep(random.randint(0, 3))
            print(f'第{page_index + 1}页写入文件成功!')
    cursor_temp = -1
    get_wordcloud()
# Generate the word cloud
def get_wordcloud():
    """Build a word cloud image from the exported comment text file.

    Reads ``musicComments/{ID}/{NAME}.txt``, strips punctuation, segments the
    text with jieba, drops stop words, prints the 50 most frequent words,
    shows the cloud with matplotlib and saves it to
    ``musicComments/{ID}/{NAME}.png``. Returns early, without producing an
    image, when there is no text or no word left to plot.
    """
    # Read the whole file; the context manager closes it even if read() fails.
    with open(f'musicComments/{ID}/{NAME}.txt', encoding="utf-8") as source:
        string_data = source.read()

    # Text pre-processing: remove tabs, newlines and punctuation.
    # Raw string so the regex escapes are not treated as (invalid) string escapes.
    pattern = re.compile(r'\t|\n|\.|\||\)|\(|\?|[-:;"?]')
    string_data = pattern.sub('', string_data)
    if not string_data:
        return

    # Custom stop-word list; a set gives O(1) membership tests per token.
    remove_words = {
        '在', '了', '通常', '如果', '我们', '需要', '的', ',', '和', '是', '随着', '对于', '对', '等',
        '能', '都', '。', ' ', '、', '中', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '2023',
    }
    # Precise-mode segmentation, keeping only words outside the stop list.
    object_list = [
        word for word in jieba.cut(string_data, cut_all=False)
        if word not in remove_words
    ]

    # Word-frequency statistics.
    word_counts = collections.Counter(object_list)
    word_counts_top = word_counts.most_common(50)
    print(word_counts_top)  # printed for inspection
    if not word_counts:
        # generate_from_frequencies raises ValueError on an empty table.
        return

    # Render the cloud using the background image as mask and colour source.
    mask = np.array(Image.open('musicComments/wordcloud_bg.jpg'))
    wc = wordcloud.WordCloud(
        font_path='/System/Library/fonts/PingFang.ttc',  # font file used for rendering
        mask=mask,  # background/mask image
        background_color="white",
        mode="RGBA",
        max_words=500,  # maximum number of words shown
        max_font_size=90,  # largest font size
        scale=1
    )
    wc.generate_from_frequencies(word_counts)
    image_colors = wordcloud.ImageColorGenerator(mask)  # colour scheme from the mask image
    wc.recolor(color_func=image_colors)
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
    wc.to_file(f'musicComments/{ID}/{NAME}.png')
if __name__ == '__main__':
    # Time the whole interactive run and report the elapsed seconds.
    began_at = time.time()
    get_all_comment()
    elapsed = time.time() - began_at
    print(f'程序耗时{elapsed}秒')