Skip to content

Add Context class, pass User-Agent to all web requests, set async job timeout, and retry header retrieval #92

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Dec 6, 2024
3 changes: 2 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ COPY --from=zimui /src/dist /src/zimui

ENV MINDTOUCH_ZIMUI_DIST=/src/zimui \
MINDTOUCH_OUTPUT=/output \
MINDTOUCH_TMP=/tmp
MINDTOUCH_TMP=/tmp \
MINDTOUCH_CONTACT_INFO=https://www.kiwix.org

CMD ["mindtouch2zim", "--help"]
24 changes: 21 additions & 3 deletions scraper/src/mindtouch2zim/__main__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,29 @@
import sys

Check warning on line 1 in scraper/src/mindtouch2zim/__main__.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/__main__.py#L1

Added line #L1 was not covered by tests
import tempfile

from mindtouch2zim.entrypoint import main as entrypoint
from mindtouch2zim.constants import logger
from mindtouch2zim.entrypoint import prepare_context

Check warning on line 5 in scraper/src/mindtouch2zim/__main__.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/__main__.py#L4-L5

Added lines #L4 - L5 were not covered by tests


def main():
    """Scraper entry point.

    Initializes the global Context from CLI arguments and a temporary working
    directory, then runs the Processor. Any failure is logged and converted
    into a non-zero exit status.
    """
    try:
        with tempfile.TemporaryDirectory() as tmpdir:

            prepare_context(sys.argv[1:], tmpdir)

            # import this only once the Context has been initialized, so that it gets an
            # initialized context
            from mindtouch2zim.processor import Processor

            Processor().run()
    except SystemExit:
        logger.error("Scraper failed, exiting")
        raise
    except Exception as exc:
        logger.exception(exc)
        logger.error(f"Scraper failed with the following error: {exc}")
        raise SystemExit(1) from exc

Check warning on line 26 in scraper/src/mindtouch2zim/__main__.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/__main__.py#L21-L26

Added lines #L21 - L26 were not covered by tests


if __name__ == "__main__":
Expand Down
66 changes: 31 additions & 35 deletions scraper/src/mindtouch2zim/asset.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import re
import threading
from io import BytesIO
from typing import NamedTuple
Expand All @@ -8,13 +7,14 @@
from pif import get_public_ip
from PIL import Image
from requests.exceptions import RequestException
from zimscraperlib.download import stream_file
from zimscraperlib.image.optimization import optimize_webp
from zimscraperlib.image.presets import WebpMedium
from zimscraperlib.rewriting.url_rewriting import HttpUrl, ZimPath
from zimscraperlib.zim import Creator

from mindtouch2zim.constants import KNOWN_BAD_ASSETS_REGEX, logger, web_session
from mindtouch2zim.constants import logger
from mindtouch2zim.context import Context
from mindtouch2zim.download import stream_file
from mindtouch2zim.errors import (
KnownBadAssetFailedError,
S3CacheError,
Expand All @@ -40,6 +40,8 @@

WEBP_OPTIONS = WebpMedium().options

context = Context.get()


class HeaderData(NamedTuple):
ident: str # ~version~ of the URL data to use for comparisons
Expand All @@ -55,17 +57,7 @@

def __init__(
self,
s3_url_with_credentials: str | None,
bad_assets_regex: str | None,
bad_assets_threshold: int,
) -> None:
self.s3_url_with_credentials = s3_url_with_credentials

bad_assets_regex = f"{bad_assets_regex}|{KNOWN_BAD_ASSETS_REGEX}"
self.bad_assets_regex = (
re.compile(bad_assets_regex, re.IGNORECASE) if bad_assets_regex else None
)
self.bad_assets_threshold = bad_assets_threshold
self._setup_s3()
self.bad_assets_count = 0
self.lock = threading.Lock()
Expand All @@ -77,6 +69,7 @@
creator: Creator,
):
logger.debug(f"Processing asset for {asset_path}")
context.current_thread_workitem = f"processing asset {asset_path}"

Check warning on line 72 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L72

Added line #L72 was not covered by tests
self._process_asset_internal(
asset_path=asset_path, asset_details=asset_details, creator=creator
)
Expand Down Expand Up @@ -108,15 +101,15 @@
with self.lock:
self.bad_assets_count += 1
if (
self.bad_assets_threshold >= 0
and self.bad_assets_count > self.bad_assets_threshold
context.bad_assets_threshold >= 0
and self.bad_assets_count > context.bad_assets_threshold
):
logger.error(
f"Exception while processing asset for {asset_url.value}: "
f"{exc}"
)
raise OSError( # noqa: B904
f"Asset failure threshold ({self.bad_assets_threshold}) "
f"Asset failure threshold ({context.bad_assets_threshold}) "
"reached, stopping execution"
)
else:
Expand Down Expand Up @@ -163,7 +156,7 @@
meta = {"ident": header_data.ident, "version": str(WebpMedium.VERSION) + ".r"}
s3_key = f"medium/{asset_path.value}"

if self.s3_url_with_credentials:
if context.s3_url_with_credentials:
if s3_data := self._download_from_s3_cache(s3_key=s3_key, meta=meta):
logger.debug("Fetching directly from S3 cache")
return s3_data # found in cache
Expand All @@ -186,7 +179,7 @@
), # pyright: ignore[reportArgumentType]
)

if self.s3_url_with_credentials:
if context.s3_url_with_credentials:
# upload optimized to S3
logger.debug("Uploading to S3")
self._upload_to_s3_cache(
Expand Down Expand Up @@ -230,7 +223,6 @@
stream_file(
asset_url.value,
byte_stream=asset_content,
session=web_session,
)
return asset_content

Expand All @@ -245,35 +237,39 @@
) -> BytesIO:
"""Download of a given asset, optimize if needed, or download from S3 cache"""

if not always_fetch_online:
header_data = self._get_header_data_for(asset_url)
if header_data.content_type:
mime_type = header_data.content_type.split(";")[0].strip()
if mime_type in SUPPORTED_IMAGE_MIME_TYPES:
return self._get_image_content(
asset_path=asset_path,
asset_url=asset_url,
header_data=header_data,
)
else:
logger.debug(f"Not optimizing, unsupported mime type: {mime_type}")

try:
if not always_fetch_online:
header_data = self._get_header_data_for(asset_url)

Check warning on line 242 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L242

Added line #L242 was not covered by tests
if header_data.content_type:
mime_type = header_data.content_type.split(";")[0].strip()

Check warning on line 244 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L244

Added line #L244 was not covered by tests
if mime_type in SUPPORTED_IMAGE_MIME_TYPES:
return self._get_image_content(

Check warning on line 246 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L246

Added line #L246 was not covered by tests
asset_path=asset_path,
asset_url=asset_url,
header_data=header_data,
)
else:
logger.debug(

Check warning on line 252 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L252

Added line #L252 was not covered by tests
f"Not optimizing, unsupported mime type: {mime_type}"
)

return self._download_from_online(asset_url=asset_url)
except RequestException as exc:
# check if the failing download match known bad assets regex early, and if
# so raise a custom exception to escape backoff (always important to try
# once even if asset is expected to not work, but no need to loose time on
# retrying assets which are expected to be bad)
if self.bad_assets_regex and self.bad_assets_regex.findall(asset_url.value):
if context.bad_assets_regex and context.bad_assets_regex.findall(
asset_url.value
):
raise KnownBadAssetFailedError() from exc
raise

def _setup_s3(self):
if not self.s3_url_with_credentials:
if not context.s3_url_with_credentials:
return
logger.info("testing S3 Optimization Cache credentials")
self.s3_storage = KiwixStorage(self.s3_url_with_credentials)
self.s3_storage = KiwixStorage(context.s3_url_with_credentials)

Check warning on line 272 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L272

Added line #L272 was not covered by tests
if not self.s3_storage.check_credentials( # pyright: ignore[reportUnknownMemberType]
list_buckets=True, bucket=True, write=True, read=True, failsafe=True
):
Expand Down
48 changes: 24 additions & 24 deletions scraper/src/mindtouch2zim/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,13 @@
from pydantic import BaseModel
from requests import Response

from mindtouch2zim.constants import (
HTTP_TIMEOUT_LONG_SECONDS,
HTTP_TIMEOUT_NORMAL_SECONDS,
logger,
web_session,
)
from mindtouch2zim.constants import logger
from mindtouch2zim.context import Context
from mindtouch2zim.errors import APITokenRetrievalError, MindtouchParsingError
from mindtouch2zim.html import get_soup

context = Context.get()


class MindtouchHome(BaseModel):
home_url: str
Expand Down Expand Up @@ -99,27 +97,25 @@
class MindtouchClient:
"""Utility functions to read data from mindtouch instance."""

def __init__(self) -> None:
    """Initialize MindtouchClient.

    Configuration (library URL, cache folder, timeouts) is read from the
    global Context rather than passed as constructor arguments.
    """
    # Deki API token; not retrieved yet — presumably populated later before
    # authenticated API calls (TODO confirm retrieval path)
    self.deki_token = None

@property
def api_url(self) -> str:
    """Base URL of the Deki API, derived from the Context's library URL."""
    return f"{context.library_url}/@api/deki"

Check warning on line 111 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L111

Added line #L111 was not covered by tests

def _get_cache_file(self, url_subpath_and_query: str) -> Path:
    """Get location where HTTP result should be cached"""
    # strip a single leading slash so the result stays relative to cache_folder
    url_subpath_and_query = re.sub(r"^/", "", url_subpath_and_query)
    # a trailing slash cannot be a filename; cache such URLs as <dir>/index
    if url_subpath_and_query.endswith("/"):
        url_subpath_and_query += "index"
    return context.cache_folder / url_subpath_and_query

Check warning on line 118 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L118

Added line #L118 was not covered by tests

def _get_text(self, url_subpath_and_query: str) -> str:
"""Perform a GET request and return the response as decoded text."""
Expand All @@ -129,13 +125,13 @@
return cache_file.read_text()
cache_file.parent.mkdir(parents=True, exist_ok=True)

full_url = f"{self.library_url}{url_subpath_and_query}"
full_url = f"{context.library_url}{url_subpath_and_query}"

Check warning on line 128 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L128

Added line #L128 was not covered by tests
logger.debug(f"Fetching {full_url}")

resp = web_session.get(
resp = context.web_session.get(

Check warning on line 131 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L131

Added line #L131 was not covered by tests
url=full_url,
allow_redirects=True,
timeout=HTTP_TIMEOUT_NORMAL_SECONDS,
timeout=context.http_timeout_normal_seconds,
)
resp.raise_for_status()

Expand All @@ -145,7 +141,7 @@
def _get_api_resp(self, api_sub_path_and_query: str, timeout: float) -> Response:
api_url = f"{self.api_url}{api_sub_path_and_query}"
logger.debug(f"Calling API at {api_url}")
resp = web_session.get(
resp = context.web_session.get(

Check warning on line 144 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L144

Added line #L144 was not covered by tests
url=api_url,
headers={"x-deki-token": self.deki_token},
timeout=timeout,
Expand All @@ -157,7 +153,7 @@
self,
api_sub_path: str,
query_params: str = "",
timeout: float = HTTP_TIMEOUT_NORMAL_SECONDS,
timeout: float = context.http_timeout_normal_seconds,
) -> Any:
cache_file = self._get_cache_file(f"api_json{api_sub_path}{query_params}.dat")
if cache_file.exists():
Expand All @@ -173,7 +169,7 @@
return result

def _get_api_content(
self, api_sub_path: str, timeout: float = HTTP_TIMEOUT_NORMAL_SECONDS
self, api_sub_path: str, timeout: float = context.http_timeout_normal_seconds
) -> bytes | Any:
cache_file = self._get_cache_file(f"api_content{api_sub_path}")
if cache_file.exists():
Expand All @@ -196,7 +192,7 @@
screen_css_url=_get_screen_css_url_from_home(soup),
print_css_url=_get_print_css_url_from_home(soup),
inline_css=_get_inline_css_from_home(soup),
home_url=f"{self.library_url}/",
home_url=f"{context.library_url}/",
icons_urls=_get_icons_urls(soup),
)

Expand All @@ -214,7 +210,9 @@
def get_all_pages_ids(self) -> list[LibraryPageId]:
"""Returns the IDs of all pages on current website, exploring the whole tree"""

tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS)
tree = self._get_api_json(

Check warning on line 213 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L213

Added line #L213 was not covered by tests
"/pages/home/tree", timeout=context.http_timeout_long_seconds
)

page_ids: list[LibraryPageId] = []

Expand All @@ -235,13 +233,15 @@
def get_root_page_id(self) -> LibraryPageId:
    """Returns the ID of the root of the tree of pages"""

    tree = self._get_api_json(
        "/pages/home/tree", timeout=context.http_timeout_long_seconds
    )
    return tree["page"]["@id"]

def get_page_tree(self) -> LibraryTree:

tree_data = self._get_api_json(
"/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS
"/pages/home/tree", timeout=context.http_timeout_long_seconds
)

root = LibraryPage(
Expand Down Expand Up @@ -283,7 +283,7 @@
def get_page_content(self, page: LibraryPage) -> LibraryPageContent:
"""Returns the 'raw' content of a given page"""
tree = self._get_api_json(
f"/pages/{page.id}/contents", timeout=HTTP_TIMEOUT_NORMAL_SECONDS
f"/pages/{page.id}/contents", timeout=context.http_timeout_normal_seconds
)
if not isinstance(tree["body"][0], str):
raise MindtouchParsingError(
Expand Down Expand Up @@ -313,7 +313,7 @@
"""
if page.definition is None:
raw_definition = self._get_api_json(
f"/pages/{page.id}", timeout=HTTP_TIMEOUT_NORMAL_SECONDS
f"/pages/{page.id}", timeout=context.http_timeout_normal_seconds
)
raw_tags = raw_definition.get("tags", None)
if raw_tags is None:
Expand Down Expand Up @@ -369,7 +369,7 @@
tree = self._get_api_json(
f"/pages/{template}/contents",
query_params=f"pageid={page_id}",
timeout=HTTP_TIMEOUT_NORMAL_SECONDS,
timeout=context.http_timeout_normal_seconds,
)
if not tree.get("body", ""):
raise MindtouchParsingError(
Expand Down
17 changes: 6 additions & 11 deletions scraper/src/mindtouch2zim/constants.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import logging
import pathlib

from zimscraperlib.download import get_session
from zimscraperlib.logging import DEFAULT_FORMAT_WITH_THREADS, getLogger

from mindtouch2zim.__about__ import __version__
Expand All @@ -10,15 +9,11 @@
VERSION = __version__
ROOT_DIR = pathlib.Path(__file__).parent

# As of 2024-09-24, all libraries appears to be in English.
LANGUAGE_ISO_639_3 = "eng"

HTTP_TIMEOUT_NORMAL_SECONDS = 15
HTTP_TIMEOUT_LONG_SECONDS = 30

# Loading the CSS leads to many bad assets at these URLs, we just ignore them
KNOWN_BAD_ASSETS_REGEX = r"https?:\/\/a\.mtstatic\.com/@(cache|style)"

logger = getLogger(NAME, level=logging.DEBUG, log_format=DEFAULT_FORMAT_WITH_THREADS)
STANDARD_KNOWN_BAD_ASSETS_REGEX = r"https?:\/\/a\.mtstatic\.com/@(cache|style)"

web_session = get_session()
# logger to use everywhere (not part of Context class because we need it early, before
# Context has been initialized)
logger: logging.Logger = getLogger(
NAME, level=logging.DEBUG, log_format=DEFAULT_FORMAT_WITH_THREADS
)
Loading