-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathXXOO3.py
94 lines (78 loc) · 2.63 KB
/
XXOO3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# -*- coding: utf-8 -*-
'''
multiprocessing crawler for jandan.net/ooxx
"pip install -r requirements.txt" and run it.
Lines that carry a trailing comment mark the settings you can adjust to control this script.
Have fun.
@author: B1u3Buf4
'''
import os
import re
import time
from multiprocessing import Process, Queue, Pool
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
class checkload(object):
    '''Wait condition: has the comment list received its images yet?

    An instance is a callable suitable for ``WebDriverWait(...).until``.
    It is truthy once the inner HTML of the element with id ``comments``
    contains the text ``sinaimg.cn``.
    '''

    def __init__(self, driver):
        # Stored on the instance, but __call__ only uses the driver it is given.
        self.driver = driver

    def __call__(self, driver):
        '''Return True when ``sinaimg.cn`` appears inside the ``comments`` element.'''
        # NOTE(review): find_element_by_id is the Selenium 3 spelling; newer
        # Selenium releases appear to have dropped it - confirm the pinned
        # version before upgrading.
        comments = driver.find_element_by_id('comments')
        markup = comments.get_attribute("innerHTML")
        position = markup.find('sinaimg.cn')
        return position != -1
def crawlurl():
    '''Collect full-size image URLs from every page of jandan.net/ooxx.

    A headless Chrome session opens the landing page, reads the page number
    shown in its ``current-comment-page`` marker, then visits page 1 up to
    that number.  On each page it waits (up to 5 s, polling every 0.5 s)
    for ``checkload`` to succeed and collects every ``src="..."`` value, up
    to its first ``jpg``/``gif``/``png``, from the ``comments`` element.

    Afterwards each hit has the path segment following ``cn/`` replaced by
    ``large``.  Links containing ``jandan.net`` are dropped, and anything
    that is not an http(s) URL is printed with ``[-]`` and skipped.

    Returns:
        list of str: the image URLs in the order they were found.

    Raises:
        IndexError: if the page-count marker is not found on the landing page.
        Exceptions raised by Selenium (e.g. when a page never passes the
        wait) propagate to the caller after the browser has been shut down.
    '''
    # NOTE(review): the chrome_options= keyword and find_element_by_id are
    # Selenium 3 spellings; newer Selenium releases appear to have removed
    # them - confirm the version pinned in requirements.txt before upgrading.
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(chrome_options=chrome_options)
    pics = []
    urls = []
    order = 1
    # [^"] keeps a match inside one attribute value.  The previous
    # 'src=".*?jpg|...' alternation could run from a gif's src= across the
    # following tags to a later "jpg" on the same line.
    img_src = re.compile(r'src="[^"]*?(?:jpg|gif|png)')
    try:
        # The first navigation happens inside the try block so that the
        # browser process is shut down even when it fails.
        driver.get(url='http://jandan.net/ooxx')
        time.sleep(1)
        page = driver.page_source
        counts = re.findall(r'current-comment-page">.*</s', page)[0]
        cou = int(re.findall(r'[0-9]{1,4}', counts)[0])
        for i in range(1, cou + 1):     #start-page to end-page
            order = i
            print(order, len(pics))
            url = 'http://jandan.net/ooxx/page-' + str(i) + '#comments'
            driver.get(url)
            WebDriverWait(driver, 5, 0.5).until(checkload(driver))
            page = driver.find_element_by_id('comments').get_attribute("innerHTML")
            pics.extend(img_src.findall(page))
        for j in pics:
            # j[5:] strips the leading 'src="' from the regex match.
            url = re.sub(r'cn/.*?/', 'cn/large/', j[5:])
            if 'jandan.net' in url:
                continue
            if url.startswith('//'):
                # Protocol-relative link: use the scheme the pages were fetched with.
                url = 'http:' + url
            if not url.startswith(('http://', 'https://')):
                print('[-]', url)
                continue
            urls.append(url)
        return urls
    finally:
        # Runs on success and on failure: report progress, then stop Chrome.
        print('[+]', order, len(urls))
        driver.quit()
def tinyreq(url, timeout=30):
    '''Download one image into ``./pics/``.

    The file name is the first ``<alphanumerics>.jpg|gif|png`` found in the
    last 36 characters of ``url``.  This function is best-effort: a URL
    without such a name, a failed or non-2xx HTTP response, or a disk error
    is reported on stdout with a ``[-]`` prefix and the image is skipped.

    Args:
        url: absolute URL of the image.
        timeout: seconds to wait for the server before giving up, so that a
            stalled connection cannot block a worker forever.

    Returns:
        None.
    '''
    # The dot is escaped so that only a real ".ext" suffix is accepted.
    name1 = re.findall(r'[a-zA-Z0-9]*?\.(?:jpg|gif|png)', url[-36:])
    if not name1:
        print('[-]', url)
        return
    print('Downloading', url)
    try:
        r = requests.get(url, timeout=timeout)
        # Do not store an HTML error page under an image file name.
        r.raise_for_status()
    except requests.RequestException as err:
        print('[-]', url, err)
        return
    # The file is only created once the download has succeeded.
    try:
        with open('./pics/' + name1[0], "wb") as code:
            code.write(r.content)
    except OSError as err:
        print('[-]', url, err)
def getpic(imgs, processes = 5):     #number of processes
    '''Download every URL in ``imgs`` with a pool of worker processes.

    Each URL is handed to ``tinyreq`` via ``Pool.apply_async``.  The call
    blocks until all downloads have finished.  An exception raised inside a
    worker is printed with a ``[-]`` prefix instead of being lost.

    Args:
        imgs: iterable of image URLs.
        processes: number of worker processes in the pool.

    Returns:
        None.
    '''
    # The with-block terminates the workers if submitting the jobs raises.
    with Pool(processes) as pool:
        pending = [(img, pool.apply_async(tinyreq, (img, ))) for img in imgs]
        pool.close()
        pool.join()
    # Every job has finished by now; get() returns at once and re-raises
    # whatever the worker raised, which would otherwise vanish silently.
    for img, result in pending:
        try:
            result.get()
        except Exception as err:
            print('[-]', img, err)
if __name__ == "__main__":
    # Script entry point: crawl the image URLs, download them into ./pics/
    # and print the elapsed wall-clock time in whole seconds.
    # exist_ok=True replaces the separate exists()/mkdir() pair, so there is
    # no gap between the check and the creation.
    os.makedirs('pics', exist_ok=True)
    start_time = time.time()
    pic = crawlurl()
    getpic(pic)
    print('[+] total:%s S' % (int(time.time() - start_time)))