Merge pull request #78 from openzim/reliability

benoit74 · web-flow · commit 08e9733c3f71 · 2024-11-25T09:25:02.000+01:00
Many fixes for reliability of the scraper
diff --git a/.github/workflows/PublishDockerDevImage.yaml b/.github/workflows/PublishDockerDevImage.yaml
@@ -4,6 +4,7 @@ on:
   push:
     branches:
       - main
+      - reliability
 
 jobs:
   publish:
diff --git a/scraper/src/mindtouch2zim/asset.py b/scraper/src/mindtouch2zim/asset.py
@@ -1,19 +1,21 @@
+import re
+import threading
 from io import BytesIO
 from typing import NamedTuple
 
 import backoff
 from kiwixstorage import KiwixStorage, NotFoundError
 from pif import get_public_ip
 from PIL import Image
-from requests import HTTPError
 from requests.exceptions import RequestException
 from zimscraperlib.download import stream_file
 from zimscraperlib.image.optimization import optimize_webp
 from zimscraperlib.image.presets import WebpMedium
 from zimscraperlib.rewriting.url_rewriting import HttpUrl, ZimPath
 from zimscraperlib.zim import Creator
 
-from mindtouch2zim.constants import logger, web_session
+from mindtouch2zim.constants import KNOWN_BAD_ASSETS_REGEX, logger, web_session
+from mindtouch2zim.errors import KnownBadAssetFailedError
 from mindtouch2zim.utils import backoff_hdlr
 
 SUPPORTED_IMAGE_MIME_TYPES = {
@@ -47,9 +49,22 @@ class AssetDetails(NamedTuple):
 
 class AssetProcessor:
 
-    def __init__(self, s3_url_with_credentials: str | None) -> None:
+    def __init__(
+        self,
+        s3_url_with_credentials: str | None,
+        bad_assets_regex: str | None,
+        bad_assets_threshold: int,
+    ) -> None:
         self.s3_url_with_credentials = s3_url_with_credentials
+
+        # drop unset parts: formatting None would add a "None" alternative matching
+        # (case-insensitively) any URL containing "none"
+        regex = "|".join(filter(None, (bad_assets_regex, KNOWN_BAD_ASSETS_REGEX)))
+        self.bad_assets_regex = re.compile(regex, re.IGNORECASE) if regex else None
+        self.bad_assets_threshold = bad_assets_threshold
         self._setup_s3()
+        self.bad_assets_count = 0
+        self.lock = threading.Lock()
 
     def process_asset(
         self,
@@ -62,12 +77,6 @@ def process_asset(
             asset_path=asset_path, asset_details=asset_details, creator=creator
         )
 
-    @backoff.on_exception(
-        backoff.expo,
-        RequestException,
-        max_time=16,
-        on_backoff=backoff_hdlr,
-    )
     def _process_asset_internal(
         self,
         asset_path: ZimPath,
@@ -89,11 +98,28 @@ def _process_asset_internal(
                     content=asset_content.getvalue(),
                 )
                 break  # file found and added
-            except HTTPError as exc:
-                # would make more sense to be a warning, but this is just too
-                # verbose, at least on geo.libretexts.org many assets are just
-                # missing
-                logger.debug(f"Ignoring {asset_path.value} due to {exc}")
+            except KnownBadAssetFailedError as exc:
+                logger.debug(f"Ignoring known bad asset for {asset_url.value}: {exc}")
+            except RequestException as exc:
+                with self.lock:
+                    self.bad_assets_count += 1
+                    if (
+                        self.bad_assets_threshold >= 0
+                        and self.bad_assets_count > self.bad_assets_threshold
+                    ):
+                        logger.error(
+                            f"Exception while processing asset for {asset_url.value}: "
+                            f"{exc}"
+                        )
+                        raise Exception(  # noqa: B904
+                            f"Asset failure threshold ({self.bad_assets_threshold}) "
+                            "reached, stopping execution"
+                        )
+                    else:
+                        logger.warning(
+                            f"Exception while processing asset for {asset_url.value}: "
+                            f"{exc}"
+                        )
 
     def _get_header_data_for(self, url: HttpUrl) -> HeaderData:
         """Get details from headers for a given url
@@ -204,6 +230,12 @@ def _download_from_online(self, asset_url: HttpUrl) -> BytesIO:
         )
         return asset_content
 
+    @backoff.on_exception(
+        backoff.expo,
+        RequestException,
+        max_time=30,  # secs
+        on_backoff=backoff_hdlr,
+    )
     def get_asset_content(
         self, asset_path: ZimPath, asset_url: HttpUrl, *, always_fetch_online: bool
     ) -> BytesIO:
@@ -222,7 +254,16 @@ def get_asset_content(
                 else:
                     logger.debug(f"Not optimizing, unsupported mime type: {mime_type}")
 
-        return self._download_from_online(asset_url=asset_url)
+        try:
+            return self._download_from_online(asset_url=asset_url)
+        except RequestException as exc:
+            # check early if the failing download matches the known bad assets regex,
+            # and if so raise a custom exception to escape backoff (it is always
+            # important to try once even if the asset is expected not to work, but no
+            # need to lose time retrying assets which are expected to be bad)
+            if self.bad_assets_regex and self.bad_assets_regex.findall(asset_url.value):
+                raise KnownBadAssetFailedError() from exc
+            raise
 
     def _setup_s3(self):
         if not self.s3_url_with_credentials:
diff --git a/scraper/src/mindtouch2zim/constants.py b/scraper/src/mindtouch2zim/constants.py
@@ -16,7 +16,8 @@
 HTTP_TIMEOUT_NORMAL_SECONDS = 15
 HTTP_TIMEOUT_LONG_SECONDS = 30
 
-HTML_ISSUES_WARN_ONLY = False
+# Loading the CSS leads to many bad assets at these URLs; we just ignore them
+KNOWN_BAD_ASSETS_REGEX = r"https?:\/\/a\.mtstatic\.com/@(cache|style)"
 
 logger = getLogger(NAME, level=logging.DEBUG, log_format=DEFAULT_FORMAT_WITH_THREADS)
 
diff --git a/scraper/src/mindtouch2zim/entrypoint.py b/scraper/src/mindtouch2zim/entrypoint.py
@@ -226,13 +226,20 @@ def main(tmpdir: str) -> None:
     )
 
     parser.add_argument(
-        "--html-issues-warn-only",
-        help="[dev] Only log a warning when unexpected HTML is encountered. Use with "
-        "caution because activating this option means that ZIM HTML will probably lead "
-        "to online resources without user noticing it.",
-        action="store_true",
-        default=False,
-        dest="html_issues_warn_only",
+        "--bad-assets-regex",
+        help="Regular expression of asset URLs known to not be available. "
+        "Case insensitive.",
+        dest="bad_assets_regex",
+    )
+
+    parser.add_argument(
+        "--bad-assets-threshold",
+        type=int,
+        help="[dev] Number of assets allowed to fail to download before failing the"
+        " scraper. Assets already excluded with --bad-assets-regex are not counted for"
+        " this threshold. Defaults to 10 assets.",
+        default=10,
+        dest="bad_assets_threshold",
     )
 
     args = parser.parse_args()
@@ -272,7 +279,8 @@ def main(tmpdir: str) -> None:
             illustration_url=args.illustration_url,
             s3_url_with_credentials=args.s3_url_with_credentials,
             assets_workers=args.assets_workers,
-            html_issues_warn_only=args.html_issues_warn_only,
+            bad_assets_regex=args.bad_assets_regex,
+            bad_assets_threshold=args.bad_assets_threshold,
         ).run()
     except SystemExit:
         logger.error("Generation failed, exiting")
diff --git a/scraper/src/mindtouch2zim/errors.py b/scraper/src/mindtouch2zim/errors.py
@@ -4,19 +4,13 @@ class InvalidFormatError(Exception):
     pass
 
 
-class UnsupportedTagError(Exception):
-    """An exception raised when an HTML tag is not expected to be encountered"""
-
-    pass
-
-
-class UnsupportedHrefSrcError(Exception):
-    """An exception raised when an href or src is not expected to be encountered"""
+class NoIllustrationFoundError(Exception):
+    """An exception raised when no suitable illustration has been found"""
 
     pass
 
 
-class NoIllustrationFoundError(Exception):
-    """An exception raised when no suitable illustration has been found"""
+class KnownBadAssetFailedError(Exception):
+    """An exception raised when an asset known to be failing, failed as expected"""
 
     pass
diff --git a/scraper/src/mindtouch2zim/html_rewriting.py b/scraper/src/mindtouch2zim/html_rewriting.py
@@ -13,10 +13,8 @@
     ZimPath,
 )
 
-import mindtouch2zim.constants
 from mindtouch2zim.client import LibraryPage
 from mindtouch2zim.constants import logger
-from mindtouch2zim.errors import UnsupportedHrefSrcError, UnsupportedTagError
 from mindtouch2zim.utils import is_better_srcset_descriptor
 from mindtouch2zim.vimeo import get_vimeo_thumbnail_url
 
@@ -29,20 +27,20 @@
 
 
 @html_rules.rewrite_attribute()
-def rewrite_href_src_attributes(
+def rewrite_href_src_srcset_attributes(
     tag: str,
     attr_name: str,
     attr_value: str | None,
     url_rewriter: ArticleUrlRewriter,
     base_href: str | None,
 ):
     """Rewrite href and src attributes"""
-    if attr_name not in ("href", "src") or not attr_value:
+    if attr_name not in ("href", "src", "srcset") or not attr_value:
         return
     if not isinstance(url_rewriter, HtmlUrlsRewriter):
         raise Exception("Expecting HtmlUrlsRewriter")
     new_attr_value = None
-    if tag == "a":
+    if tag in ["a", "area"]:
         rewrite_result = url_rewriter(
             attr_value, base_href=base_href, rewrite_all_url=False
         )
@@ -53,36 +51,17 @@ def rewrite_href_src_attributes(
             if rewrite_result.rewriten_url.startswith(url_rewriter.library_path.value)
             else rewrite_result.rewriten_url
         )
-    if not new_attr_value:
-        # we do not (yet) support other tags / attributes so we fail the scraper
-        msg = (
+    else:
+        # we remove the src/href/srcset which is not supported, to ensure we won't load
+        # external assets
+        new_attr_value = ""
+        logger.warning(
             f"Unsupported '{attr_name}' encountered in '{tag}' tag (value: "
             f"'{attr_value}') while rewriting {rewriting_context}"
         )
-        if not mindtouch2zim.constants.HTML_ISSUES_WARN_ONLY:
-            raise UnsupportedHrefSrcError(msg)
-        else:
-            logger.warning(msg)
-            return
     return (attr_name, new_attr_value)
 
 
-@html_rules.rewrite_tag()
-def refuse_unsupported_tags(tag: str):
-    """Stop scraper if unsupported tag is encountered"""
-    if tag not in ["picture"]:
-        return
-    msg = (
-        f"Tag {tag} is not yet supported in this scraper, found while rewriting "
-        f"{rewriting_context}"
-    )
-    if not mindtouch2zim.constants.HTML_ISSUES_WARN_ONLY:
-        raise UnsupportedTagError(msg)
-    else:
-        logger.warning(msg)
-        return
-
-
 YOUTUBE_IFRAME_RE = re.compile(r".*youtube(?:-\w+)*\.\w+\/embed\/(?P<id>.*?)(?:\?.*)*$")
 VIMEO_IFRAME_RE = re.compile(r".*vimeo(?:-\w+)*\.\w+\/video\/(?:.*?)(?:\?.*)*$")
 
@@ -101,15 +80,8 @@ def rewrite_iframe_tags(
         raise Exception("Expecting HtmlUrlsRewriter")
     src = get_attr_value_from(attrs=attrs, name="src")
     if not src:
-        msg = (
-            "Unsupported empty src in iframe, found while rewriting "
-            f"{rewriting_context}"
-        )
-        if not mindtouch2zim.constants.HTML_ISSUES_WARN_ONLY:
-            raise UnsupportedTagError(msg)
-        else:
-            logger.warning(msg)
-            return
+        logger.warning(f"Empty src found in iframe while rewriting {rewriting_context}")
+        return
     image_rewriten_url = None
     try:
         if ytb_match := YOUTUBE_IFRAME_RE.match(src):
@@ -127,9 +99,15 @@ def rewrite_iframe_tags(
             url_rewriter.add_item_to_download(rewrite_result)
             image_rewriten_url = rewrite_result.rewriten_url
         else:
-            logger.debug(f"iframe pointing to {src} will not have any preview")
+            logger.debug(
+                f"iframe pointing to {src} in {rewriting_context} will not "
+                "have any preview"
+            )
     except Exception as exc:
-        logger.warning(f"Failed to rewrite iframe with src {src}", exc_info=exc)
+        logger.warning(
+            f"Failed to rewrite iframe with src {src} in  {rewriting_context}",
+            exc_info=exc,
+        )
 
     if image_rewriten_url:
         return (
diff --git a/scraper/src/mindtouch2zim/processor.py b/scraper/src/mindtouch2zim/processor.py
diff --git a/scraper/src/mindtouch2zim/utils.py b/scraper/src/mindtouch2zim/utils.py
diff --git a/scraper/tests/test_html_rewriting.py b/scraper/tests/test_html_rewriting.py