Skip to content

Commit 08e9733

Browse files
authored
Merge pull request #78 from openzim/reliability
Many fixes for reliability of the scraper
2 parents 5236501 + ad99067 commit 08e9733

File tree

9 files changed

+362
-271
lines changed

9 files changed

+362
-271
lines changed

.github/workflows/PublishDockerDevImage.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ on:
44
push:
55
branches:
66
- main
7+
- reliability
78

89
jobs:
910
publish:

scraper/src/mindtouch2zim/asset.py

Lines changed: 56 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,21 @@
1+
import re
2+
import threading
13
from io import BytesIO
24
from typing import NamedTuple
35

46
import backoff
57
from kiwixstorage import KiwixStorage, NotFoundError
68
from pif import get_public_ip
79
from PIL import Image
8-
from requests import HTTPError
910
from requests.exceptions import RequestException
1011
from zimscraperlib.download import stream_file
1112
from zimscraperlib.image.optimization import optimize_webp
1213
from zimscraperlib.image.presets import WebpMedium
1314
from zimscraperlib.rewriting.url_rewriting import HttpUrl, ZimPath
1415
from zimscraperlib.zim import Creator
1516

16-
from mindtouch2zim.constants import logger, web_session
17+
from mindtouch2zim.constants import KNOWN_BAD_ASSETS_REGEX, logger, web_session
18+
from mindtouch2zim.errors import KnownBadAssetFailedError
1719
from mindtouch2zim.utils import backoff_hdlr
1820

1921
SUPPORTED_IMAGE_MIME_TYPES = {
@@ -47,9 +49,22 @@ class AssetDetails(NamedTuple):
4749

4850
class AssetProcessor:
4951

50-
def __init__(self, s3_url_with_credentials: str | None) -> None:
52+
def __init__(
53+
self,
54+
s3_url_with_credentials: str | None,
55+
bad_assets_regex: str | None,
56+
bad_assets_threshold: int,
57+
) -> None:
5158
self.s3_url_with_credentials = s3_url_with_credentials
59+
60+
bad_assets_regex = f"{bad_assets_regex}|{KNOWN_BAD_ASSETS_REGEX}"
61+
self.bad_assets_regex = (
62+
re.compile(bad_assets_regex, re.IGNORECASE) if bad_assets_regex else None
63+
)
64+
self.bad_assets_threshold = bad_assets_threshold
5265
self._setup_s3()
66+
self.bad_assets_count = 0
67+
self.lock = threading.Lock()
5368

5469
def process_asset(
5570
self,
@@ -62,12 +77,6 @@ def process_asset(
6277
asset_path=asset_path, asset_details=asset_details, creator=creator
6378
)
6479

65-
@backoff.on_exception(
66-
backoff.expo,
67-
RequestException,
68-
max_time=16,
69-
on_backoff=backoff_hdlr,
70-
)
7180
def _process_asset_internal(
7281
self,
7382
asset_path: ZimPath,
@@ -89,11 +98,28 @@ def _process_asset_internal(
8998
content=asset_content.getvalue(),
9099
)
91100
break # file found and added
92-
except HTTPError as exc:
93-
# would make more sense to be a warning, but this is just too
94-
# verbose, at least on geo.libretexts.org many assets are just
95-
# missing
96-
logger.debug(f"Ignoring {asset_path.value} due to {exc}")
101+
except KnownBadAssetFailedError as exc:
102+
logger.debug(f"Ignoring known bad asset for {asset_url.value}: {exc}")
103+
except RequestException as exc:
104+
with self.lock:
105+
self.bad_assets_count += 1
106+
if (
107+
self.bad_assets_threshold >= 0
108+
and self.bad_assets_count > self.bad_assets_threshold
109+
):
110+
logger.error(
111+
f"Exception while processing asset for {asset_url.value}: "
112+
f"{exc}"
113+
)
114+
raise Exception( # noqa: B904
115+
f"Asset failure threshold ({self.bad_assets_threshold}) "
116+
"reached, stopping execution"
117+
)
118+
else:
119+
logger.warning(
120+
f"Exception while processing asset for {asset_url.value}: "
121+
f"{exc}"
122+
)
97123

98124
def _get_header_data_for(self, url: HttpUrl) -> HeaderData:
99125
"""Get details from headers for a given url
@@ -204,6 +230,12 @@ def _download_from_online(self, asset_url: HttpUrl) -> BytesIO:
204230
)
205231
return asset_content
206232

233+
@backoff.on_exception(
234+
backoff.expo,
235+
RequestException,
236+
max_time=30, # secs
237+
on_backoff=backoff_hdlr,
238+
)
207239
def get_asset_content(
208240
self, asset_path: ZimPath, asset_url: HttpUrl, *, always_fetch_online: bool
209241
) -> BytesIO:
@@ -222,7 +254,16 @@ def get_asset_content(
222254
else:
223255
logger.debug(f"Not optimizing, unsupported mime type: {mime_type}")
224256

225-
return self._download_from_online(asset_url=asset_url)
257+
try:
258+
return self._download_from_online(asset_url=asset_url)
259+
except RequestException as exc:
260+
# check if the failing download match known bad assets regex early, and if
261+
# so raise a custom exception to escape backoff (always important to try
262+
# once even if asset is expected to not work, but no need to loose time on
263+
# retrying assets which are expected to be bad)
264+
if self.bad_assets_regex and self.bad_assets_regex.findall(asset_url.value):
265+
raise KnownBadAssetFailedError() from exc
266+
raise
226267

227268
def _setup_s3(self):
228269
if not self.s3_url_with_credentials:

scraper/src/mindtouch2zim/constants.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@
1616
HTTP_TIMEOUT_NORMAL_SECONDS = 15
1717
HTTP_TIMEOUT_LONG_SECONDS = 30
1818

19-
HTML_ISSUES_WARN_ONLY = False
19+
# Loading the CSS leads to many bad assets at these URLs, we just ignore them
20+
KNOWN_BAD_ASSETS_REGEX = r"https?:\/\/a\.mtstatic\.com/@(cache|style)"
2021

2122
logger = getLogger(NAME, level=logging.DEBUG, log_format=DEFAULT_FORMAT_WITH_THREADS)
2223

scraper/src/mindtouch2zim/entrypoint.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -226,13 +226,20 @@ def main(tmpdir: str) -> None:
226226
)
227227

228228
parser.add_argument(
229-
"--html-issues-warn-only",
230-
help="[dev] Only log a warning when unexpected HTML is encountered. Use with "
231-
"caution because activating this option means that ZIM HTML will probably lead "
232-
"to online resources without user noticing it.",
233-
action="store_true",
234-
default=False,
235-
dest="html_issues_warn_only",
229+
"--bad-assets-regex",
230+
help="Regular expression of asset URLs known to not be available. "
231+
"Case insensitive.",
232+
dest="bad_assets_regex",
233+
)
234+
235+
parser.add_argument(
236+
"--bad-assets-threshold",
237+
type=int,
238+
help="[dev] Number of assets allowed to fail to download before failing the"
239+
" scraper. Assets already excluded with --bad-assets-regex are not counted for"
240+
" this threshold. Defaults to 10 assets.",
241+
default=10,
242+
dest="bad_assets_threshold",
236243
)
237244

238245
args = parser.parse_args()
@@ -272,7 +279,8 @@ def main(tmpdir: str) -> None:
272279
illustration_url=args.illustration_url,
273280
s3_url_with_credentials=args.s3_url_with_credentials,
274281
assets_workers=args.assets_workers,
275-
html_issues_warn_only=args.html_issues_warn_only,
282+
bad_assets_regex=args.bad_assets_regex,
283+
bad_assets_threshold=args.bad_assets_threshold,
276284
).run()
277285
except SystemExit:
278286
logger.error("Generation failed, exiting")

scraper/src/mindtouch2zim/errors.py

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,13 @@ class InvalidFormatError(Exception):
44
pass
55

66

7-
class UnsupportedTagError(Exception):
8-
"""An exception raised when an HTML tag is not expected to be encountered"""
9-
10-
pass
11-
12-
13-
class UnsupportedHrefSrcError(Exception):
14-
"""An exception raised when an href or src is not expected to be encountered"""
7+
class NoIllustrationFoundError(Exception):
8+
"""An exception raised when no suitable illustration has been found"""
159

1610
pass
1711

1812

19-
class NoIllustrationFoundError(Exception):
20-
"""An exception raised when no suitable illustration has been found"""
13+
class KnownBadAssetFailedError(Exception):
14+
"""An exception raised when an asset known to be failing, failed as expected"""
2115

2216
pass

scraper/src/mindtouch2zim/html_rewriting.py

Lines changed: 18 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,8 @@
1313
ZimPath,
1414
)
1515

16-
import mindtouch2zim.constants
1716
from mindtouch2zim.client import LibraryPage
1817
from mindtouch2zim.constants import logger
19-
from mindtouch2zim.errors import UnsupportedHrefSrcError, UnsupportedTagError
2018
from mindtouch2zim.utils import is_better_srcset_descriptor
2119
from mindtouch2zim.vimeo import get_vimeo_thumbnail_url
2220

@@ -29,20 +27,20 @@
2927

3028

3129
@html_rules.rewrite_attribute()
32-
def rewrite_href_src_attributes(
30+
def rewrite_href_src_srcset_attributes(
3331
tag: str,
3432
attr_name: str,
3533
attr_value: str | None,
3634
url_rewriter: ArticleUrlRewriter,
3735
base_href: str | None,
3836
):
3937
"""Rewrite href and src attributes"""
40-
if attr_name not in ("href", "src") or not attr_value:
38+
if attr_name not in ("href", "src", "srcset") or not attr_value:
4139
return
4240
if not isinstance(url_rewriter, HtmlUrlsRewriter):
4341
raise Exception("Expecting HtmlUrlsRewriter")
4442
new_attr_value = None
45-
if tag == "a":
43+
if tag in ["a", "area"]:
4644
rewrite_result = url_rewriter(
4745
attr_value, base_href=base_href, rewrite_all_url=False
4846
)
@@ -53,36 +51,17 @@ def rewrite_href_src_attributes(
5351
if rewrite_result.rewriten_url.startswith(url_rewriter.library_path.value)
5452
else rewrite_result.rewriten_url
5553
)
56-
if not new_attr_value:
57-
# we do not (yet) support other tags / attributes so we fail the scraper
58-
msg = (
54+
else:
55+
# we remove the src/href/srcset which is not supported, to ensure we won't load
56+
# external assets
57+
new_attr_value = ""
58+
logger.warning(
5959
f"Unsupported '{attr_name}' encountered in '{tag}' tag (value: "
6060
f"'{attr_value}') while rewriting {rewriting_context}"
6161
)
62-
if not mindtouch2zim.constants.HTML_ISSUES_WARN_ONLY:
63-
raise UnsupportedHrefSrcError(msg)
64-
else:
65-
logger.warning(msg)
66-
return
6762
return (attr_name, new_attr_value)
6863

6964

70-
@html_rules.rewrite_tag()
71-
def refuse_unsupported_tags(tag: str):
72-
"""Stop scraper if unsupported tag is encountered"""
73-
if tag not in ["picture"]:
74-
return
75-
msg = (
76-
f"Tag {tag} is not yet supported in this scraper, found while rewriting "
77-
f"{rewriting_context}"
78-
)
79-
if not mindtouch2zim.constants.HTML_ISSUES_WARN_ONLY:
80-
raise UnsupportedTagError(msg)
81-
else:
82-
logger.warning(msg)
83-
return
84-
85-
8665
YOUTUBE_IFRAME_RE = re.compile(r".*youtube(?:-\w+)*\.\w+\/embed\/(?P<id>.*?)(?:\?.*)*$")
8766
VIMEO_IFRAME_RE = re.compile(r".*vimeo(?:-\w+)*\.\w+\/video\/(?:.*?)(?:\?.*)*$")
8867

@@ -101,15 +80,8 @@ def rewrite_iframe_tags(
10180
raise Exception("Expecting HtmlUrlsRewriter")
10281
src = get_attr_value_from(attrs=attrs, name="src")
10382
if not src:
104-
msg = (
105-
"Unsupported empty src in iframe, found while rewriting "
106-
f"{rewriting_context}"
107-
)
108-
if not mindtouch2zim.constants.HTML_ISSUES_WARN_ONLY:
109-
raise UnsupportedTagError(msg)
110-
else:
111-
logger.warning(msg)
112-
return
83+
logger.warning(f"Empty src found in iframe while rewriting {rewriting_context}")
84+
return
11385
image_rewriten_url = None
11486
try:
11587
if ytb_match := YOUTUBE_IFRAME_RE.match(src):
@@ -127,9 +99,15 @@ def rewrite_iframe_tags(
12799
url_rewriter.add_item_to_download(rewrite_result)
128100
image_rewriten_url = rewrite_result.rewriten_url
129101
else:
130-
logger.debug(f"iframe pointing to {src} will not have any preview")
102+
logger.debug(
103+
f"iframe pointing to {src} in {rewriting_context} will not "
104+
"have any preview"
105+
)
131106
except Exception as exc:
132-
logger.warning(f"Failed to rewrite iframe with src {src}", exc_info=exc)
107+
logger.warning(
108+
f"Failed to rewrite iframe with src {src} in {rewriting_context}",
109+
exc_info=exc,
110+
)
133111

134112
if image_rewriten_url:
135113
return (

0 commit comments

Comments
 (0)