|
21 | 21 |
|
22 | 22 | __version__ = '0.1.0.3'
|
23 | 23 |
|
| 24 | +import base64 |
24 | 25 | import re
|
25 | 26 | import sys
|
26 | 27 | import getopt
|
@@ -269,7 +270,7 @@ def pre_html_transform(doc, url):
|
269 | 270 | doc = fix_move_href_tags(doc)
|
270 | 271 | if config.remove_history:
|
271 | 272 | doc = html_remove_image_history(doc)
|
272 |
| - |
| 273 | + |
273 | 274 | doc = html_remove_translation_links(doc)
|
274 | 275 |
|
275 | 276 | return doc
|
@@ -449,7 +450,7 @@ def monobook_hack_skin_css(doc, url):
|
449 | 450 | '\n/* end edit by mw2html */\n')
|
450 | 451 |
|
451 | 452 | doc = doc.replace('h3 { font-size: 90%; }', 'h3 { font-size: 130%; }')
|
452 |
| - |
| 453 | + |
453 | 454 | # Remove external link icons.
|
454 | 455 | if config.remove_png:
|
455 | 456 | doc = re.sub(r'#bodyContent a\[href \^="https://"\][\s\S]+?\}', r'', doc)
|
@@ -681,14 +682,14 @@ def url_to_filename(url):
|
681 | 682 | if L[4].startswith('title=') and L[2].endswith('index.php'):
|
682 | 683 | L[4] = L[4][len('title='):]
|
683 | 684 | L[2] = L[2][:-len('index.php')]
|
684 |
| - |
| 685 | + |
685 | 686 | if lpath[-1]=='man':
|
686 | 687 | L[2] = INDEX_HTML
|
687 | 688 | if lpath[-1].lower().startswith( 'quick_help'):
|
688 | 689 | L[2] = QHELP_HTML
|
689 | 690 | L[3] = ''
|
690 |
| - |
691 |
| - |
| 691 | + |
| 692 | + |
692 | 693 |
|
693 | 694 | L[2] = L[2].strip('/')
|
694 | 695 |
|
@@ -881,7 +882,7 @@ def should_follow(url):
|
881 | 882 | # JKC: we do allow css from 'strange' places.
|
882 | 883 | if '.css' in L[-1]:
|
883 | 884 | return True
|
884 |
| - |
| 885 | + |
885 | 886 | forbidden_parents = ['.php', '.html', '.htm']
|
886 | 887 | for fp in forbidden_parents:
|
887 | 888 | if fp in L[-1]:
|
@@ -915,7 +916,7 @@ def parse_html(doc, url, filename):
|
915 | 916 |
|
916 | 917 | # in this code we change each absolute url in L
|
917 | 918 | # into a relative one.
|
918 |
| - # we also kick-off zillions of subthreads to collect |
| 919 | + # we also kick-off zillions of subthreads to collect |
919 | 920 | # more pages.
|
920 | 921 | for item in L:
|
921 | 922 | u = item.url
|
@@ -1059,14 +1060,14 @@ def run(out=sys.stdout):
|
1059 | 1060 | if config.debug:
|
1060 | 1061 | out.write(url + '\n => ' + filename + '\n\n')
|
1061 | 1062 | n += 1
|
1062 |
| - |
| 1063 | + |
1063 | 1064 | # Enqueue URLs that we haven't yet spidered.
|
1064 | 1065 | for u in new_urls:
|
1065 | 1066 | if normalize_url(u) not in complete:
|
1066 | 1067 | # Strip off any #section link.
|
1067 | 1068 | if '#' in u:
|
1068 | 1069 | u = u[:u.index('#')]
|
1069 |
| - pending.add(u) |
| 1070 | + pending.add(u) |
1070 | 1071 |
|
1071 | 1072 | conn.close()
|
1072 | 1073 | print("connection to", domain, "closed.")
|
@@ -1202,6 +1203,10 @@ def main():
|
1202 | 1203 | if opt in ['-i', '--index']:
|
1203 | 1204 | config.index = arg
|
1204 | 1205 |
|
| 1206 | + user = os.environ.get('MW2HTML_USER', '') |
| 1207 | + passw = os.environ.get('MW2HTML_PASSWORD', '') |
| 1208 | + if user != '' and passw != '': |
| 1209 | + headers["Authorization"] = "Basic {}".format(base64.b64encode(bytes(f"{user}:{passw}", "utf-8")).decode("ascii")) |
1205 | 1210 | # Run program
|
1206 | 1211 | run()
|
1207 | 1212 |
|
|
0 commit comments