forked from Exceen/4chan-downloader
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpicrel.py
228 lines (182 loc) · 8.51 KB
/
picrel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
#!/usr/bin/python3
import urllib.request, urllib.error, urllib.parse, argparse, logging
import os, re, time
import http.client
import fileinput
from multiprocessing import Process
import json
log = logging.getLogger('picrel')
workpath = os.path.dirname(os.path.realpath(__file__))
downldir = '' #user can have the 'download' folder here
args = None
def main():
global args
parser = argparse.ArgumentParser(description='picrel')
parser.add_argument('thread', nargs=1, help='url of the thread (or filename; one url per line)')
parser.add_argument('-c', '--with-counter', action='store_true', help='show a counter next the the image that has been downloaded')
parser.add_argument('-d', '--date', action='store_true', help='show date as well')
parser.add_argument('-l', '--less', action='store_true', help='show less information (suppresses checking messages)')
parser.add_argument('-n', '--use-names', action='store_true', help='use thread names instead of the thread ids (...4chan.org/board/thread/thread-id/thread-name)')
parser.add_argument('-r', '--reload', action='store_true', help='reload the queue file every 5 minutes')
parser.add_argument('-t', '--title', action='store_true', help='save original filenames')
parser.add_argument('-m', '--monitor', action='store_true', help='keep monitoring a fast thread')
args = parser.parse_args()
if args.date:
logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(message)s', datefmt='%Y-%m-%d %I:%M:%S %p')
else:
logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(message)s', datefmt='%I:%M:%S %p')
if args.title:
try:
import bs4
import django
except ImportError:
logging.error('Could not import the required modules! Disabling --title option...')
args.title = False
thread = args.thread[0].strip()
if thread[:4].lower() == 'http':
download_thread(thread, args, [])
else:
download_from_file(thread)
def load(url):
req = urllib.request.Request(url, headers={'User-Agent': '4chan Browser'})
return urllib.request.urlopen(req).read()
def get_title_list(html_content):
ret = list()
from bs4 import BeautifulSoup
parsed = BeautifulSoup(html_content, 'html.parser')
divs = parsed.find_all("div", {"class": "fileText"})
for i in divs:
current_child = i.findChildren("a", recursive = False)[0]
try:
ret.append(current_child["title"])
except KeyError:
ret.append(current_child.text)
return ret
def call_download_thread(thread_link, args, downloaded_files):
try:
download_thread(thread_link, args, downloaded_files)
except KeyboardInterrupt:
pass
def download_thread(thread_link, args, downloaded_files):
board = thread_link.split('/')[3]
thread = thread_link.split('/')[5].split('#')[0]
if len(thread_link.split('/')) > 6:
thread_tmp = thread_link.split('/')[6].split('#')[0]
if args.use_names or os.path.exists(os.path.join(workpath, downldir, board, thread_tmp)):
thread = thread_tmp
thread_folder = os.path.join(workpath, downldir, board, thread)
downloaded_files = load_downloaded_files(thread_folder) # Load the downloaded files from the JSON file
new_files_downloaded = False # Flag to track if new files were downloaded
while True:
try:
regex = '(\/\/i(?:s|)\d*\.(?:4cdn|4chan)\.org\/\w+\/(\d+\.(?:jpg|png|gif|webm)))'
html_result = load(thread_link).decode('utf-8')
regex_result = list(set(re.findall(regex, html_result)))
regex_result = sorted(regex_result, key=lambda tup: tup[1])
regex_result_len = len(regex_result)
regex_result_cnt = 1
directory = os.path.join(workpath, downldir, board, thread)
if not os.path.exists(directory):
os.makedirs(directory)
if args.title:
all_titles = get_title_list(html_result)
for enum_index, enum_tuple in enumerate(regex_result):
link, img = enum_tuple
if args.title:
img = all_titles[enum_index]
from django.utils.text import get_valid_filename
img_path = os.path.join(directory, get_valid_filename(img))
else:
img_path = os.path.join(directory, img)
if img_path not in downloaded_files:
data = load('https:' + link)
output_text = board + '/' + thread + '/' + img
if args.with_counter:
output_text = '[' + str(regex_result_cnt).rjust(len(str(regex_result_len))) + '/' + str(regex_result_len) + '] ' + output_text
log.info(output_text)
with open(img_path, 'wb') as f:
f.write(data)
# Add the downloaded file to the recorded list
downloaded_files.append(img_path)
new_files_downloaded = True # Set the flag to indicate new files were downloaded
save_downloaded_files(downloaded_files, thread_folder) # Save the updated list to the JSON file
regex_result_cnt += 1
except urllib.error.HTTPError:
time.sleep(10)
try:
load(thread_link)
except urllib.error.HTTPError:
log.info('%s 404\'d', thread_link)
break
continue
except (urllib.error.URLError, http.client.BadStatusLine, http.client.IncompleteRead):
log.fatal(thread_link + ' crashed!')
raise
if not new_files_downloaded:
log.info('No new media files to download. Operation stopped.')
save_downloaded_files(downloaded_files, thread_folder) # Save the downloaded files to the JSON file
break
if not args.less:
log.info('Checking ' + board + '/' + thread)
if args.monitor:
time.sleep(20)
else:
log.info('No new media files to download. Operation stopped.')
break
def download_from_file(filename):
running_links = []
while True:
processes = []
if os.path.isabs(filename):
queue_file = filename
else:
queue_file = os.path.join(workpath, filename)
log.info('your queue path:' + queue_file)
for link in [_f for _f in [line.strip() for line in open(queue_file) if line[:4] == 'http'] if _f]:
if link not in running_links:
running_links.append(link)
log.info('Added ' + link)
board = link.split('/')[3]
thread = link.split('/')[5].split('#')[0]
thread_folder = os.path.join(workpath, downldir, board, thread)
downloaded_files = load_downloaded_files(thread_folder)
process = Process(target=call_download_thread, args=(link, args, downloaded_files))
process.start()
processes.append([process, link])
if len(processes) == 0:
log.warning(filename + ' empty')
if args.reload:
time.sleep(60 * 5) # 5 minutes
links_to_remove = []
for process, link in processes:
if not process.is_alive():
links_to_remove.append(link)
else:
process.terminate()
for link in links_to_remove:
for line in fileinput.input(filename, inplace=True):
print(line.replace(link, '-' + link), end='')
running_links.remove(link)
log.info('Removed ' + link)
if not args.less:
log.info('Reloading ' + args.thread[0]) # thread = filename here; reloading on the next loop
else:
break
def load_downloaded_files(thread_folder):
downloaded_files = []
json_file = os.path.join(thread_folder, 'downloaded_files.json')
if os.path.exists(json_file):
with open(json_file, 'r') as f:
downloaded_files = json.load(f)
return downloaded_files
def save_downloaded_files(downloaded_files, thread_folder):
json_file = os.path.join(thread_folder, 'downloaded_files.json')
with open(json_file, 'w') as f:
json.dump(downloaded_files, f)
f.flush()
os.fsync(f)
if __name__ == '__main__':
try:
main()
except KeyboardInterrupt:
pass