Skip to content

Add Context class, pass User-Agent to all web requests, set async job timeout, and retry header retrieval #92

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Dec 6, 2024
3 changes: 2 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ COPY --from=zimui /src/dist /src/zimui

ENV MINDTOUCH_ZIMUI_DIST=/src/zimui \
MINDTOUCH_OUTPUT=/output \
MINDTOUCH_TMP=/tmp
MINDTOUCH_TMP=/tmp \
MINDTOUCH_CONTACT_INFO=https://www.kiwix.org

CMD ["mindtouch2zim", "--help"]
24 changes: 21 additions & 3 deletions scraper/src/mindtouch2zim/__main__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,29 @@
import sys

Check warning on line 1 in scraper/src/mindtouch2zim/__main__.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/__main__.py#L1

Added line #L1 was not covered by tests
import tempfile

from mindtouch2zim.entrypoint import main as entrypoint
from mindtouch2zim.constants import logger
from mindtouch2zim.entrypoint import prepare_context

Check warning on line 5 in scraper/src/mindtouch2zim/__main__.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/__main__.py#L4-L5

Added lines #L4 - L5 were not covered by tests


def main():
    """Scraper entry point.

    Initializes the global Context from CLI arguments and a temporary working
    directory, then runs the Processor. Any failure is logged and converted
    into a non-zero exit status.
    """
    try:
        with tempfile.TemporaryDirectory() as tmpdir:

            prepare_context(sys.argv[1:], tmpdir)

            # import this only once the Context has been initialized, so that it gets an
            # initialized context
            from mindtouch2zim.processor import Processor

            Processor().run()
    except SystemExit:
        logger.error("Scraper failed, exiting")
        raise
    except Exception as exc:
        logger.exception(exc)
        logger.error(f"Scraper failed with the following error: {exc}")
        raise SystemExit(1) from exc

Check warning on line 26 in scraper/src/mindtouch2zim/__main__.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/__main__.py#L21-L26

Added lines #L21 - L26 were not covered by tests


if __name__ == "__main__":
Expand Down
66 changes: 31 additions & 35 deletions scraper/src/mindtouch2zim/asset.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import re
import threading
from io import BytesIO
from typing import NamedTuple
Expand All @@ -8,13 +7,14 @@
from pif import get_public_ip
from PIL import Image
from requests.exceptions import RequestException
from zimscraperlib.download import stream_file
from zimscraperlib.image.optimization import optimize_webp
from zimscraperlib.image.presets import WebpMedium
from zimscraperlib.rewriting.url_rewriting import HttpUrl, ZimPath
from zimscraperlib.zim import Creator

from mindtouch2zim.constants import KNOWN_BAD_ASSETS_REGEX, logger, web_session
from mindtouch2zim.constants import logger
from mindtouch2zim.context import Context
from mindtouch2zim.download import stream_file
from mindtouch2zim.errors import (
KnownBadAssetFailedError,
S3CacheError,
Expand All @@ -40,6 +40,8 @@

WEBP_OPTIONS = WebpMedium().options

context = Context.get()


class HeaderData(NamedTuple):
ident: str # ~version~ of the URL data to use for comparisons
Expand All @@ -55,17 +57,7 @@

def __init__(
self,
s3_url_with_credentials: str | None,
bad_assets_regex: str | None,
bad_assets_threshold: int,
) -> None:
self.s3_url_with_credentials = s3_url_with_credentials

bad_assets_regex = f"{bad_assets_regex}|{KNOWN_BAD_ASSETS_REGEX}"
self.bad_assets_regex = (
re.compile(bad_assets_regex, re.IGNORECASE) if bad_assets_regex else None
)
self.bad_assets_threshold = bad_assets_threshold
self._setup_s3()
self.bad_assets_count = 0
self.lock = threading.Lock()
Expand All @@ -77,6 +69,7 @@
creator: Creator,
):
logger.debug(f"Processing asset for {asset_path}")
context.current_thread_workitem = f"processing asset {asset_path}"

Check warning on line 72 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L72

Added line #L72 was not covered by tests
self._process_asset_internal(
asset_path=asset_path, asset_details=asset_details, creator=creator
)
Expand Down Expand Up @@ -108,15 +101,15 @@
with self.lock:
self.bad_assets_count += 1
if (
self.bad_assets_threshold >= 0
and self.bad_assets_count > self.bad_assets_threshold
context.bad_assets_threshold >= 0
and self.bad_assets_count > context.bad_assets_threshold
):
logger.error(
f"Exception while processing asset for {asset_url.value}: "
f"{exc}"
)
raise OSError( # noqa: B904
f"Asset failure threshold ({self.bad_assets_threshold}) "
f"Asset failure threshold ({context.bad_assets_threshold}) "
"reached, stopping execution"
)
else:
Expand Down Expand Up @@ -163,7 +156,7 @@
meta = {"ident": header_data.ident, "version": str(WebpMedium.VERSION) + ".r"}
s3_key = f"medium/{asset_path.value}"

if self.s3_url_with_credentials:
if context.s3_url_with_credentials:
if s3_data := self._download_from_s3_cache(s3_key=s3_key, meta=meta):
logger.debug("Fetching directly from S3 cache")
return s3_data # found in cache
Expand All @@ -186,7 +179,7 @@
), # pyright: ignore[reportArgumentType]
)

if self.s3_url_with_credentials:
if context.s3_url_with_credentials:
# upload optimized to S3
logger.debug("Uploading to S3")
self._upload_to_s3_cache(
Expand Down Expand Up @@ -230,7 +223,6 @@
stream_file(
asset_url.value,
byte_stream=asset_content,
session=web_session,
)
return asset_content

Expand All @@ -245,35 +237,39 @@
) -> BytesIO:
"""Download of a given asset, optimize if needed, or download from S3 cache"""

if not always_fetch_online:
header_data = self._get_header_data_for(asset_url)
if header_data.content_type:
mime_type = header_data.content_type.split(";")[0].strip()
if mime_type in SUPPORTED_IMAGE_MIME_TYPES:
return self._get_image_content(
asset_path=asset_path,
asset_url=asset_url,
header_data=header_data,
)
else:
logger.debug(f"Not optimizing, unsupported mime type: {mime_type}")

try:
if not always_fetch_online:
header_data = self._get_header_data_for(asset_url)

Check warning on line 242 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L242

Added line #L242 was not covered by tests
if header_data.content_type:
mime_type = header_data.content_type.split(";")[0].strip()

Check warning on line 244 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L244

Added line #L244 was not covered by tests
if mime_type in SUPPORTED_IMAGE_MIME_TYPES:
return self._get_image_content(

Check warning on line 246 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L246

Added line #L246 was not covered by tests
asset_path=asset_path,
asset_url=asset_url,
header_data=header_data,
)
else:
logger.debug(

Check warning on line 252 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L252

Added line #L252 was not covered by tests
f"Not optimizing, unsupported mime type: {mime_type}"
)

return self._download_from_online(asset_url=asset_url)
except RequestException as exc:
# check if the failing download match known bad assets regex early, and if
# so raise a custom exception to escape backoff (always important to try
# once even if asset is expected to not work, but no need to loose time on
# retrying assets which are expected to be bad)
if self.bad_assets_regex and self.bad_assets_regex.findall(asset_url.value):
if context.bad_assets_regex and context.bad_assets_regex.findall(
asset_url.value
):
raise KnownBadAssetFailedError() from exc
raise

def _setup_s3(self):
if not self.s3_url_with_credentials:
if not context.s3_url_with_credentials:
return
logger.info("testing S3 Optimization Cache credentials")
self.s3_storage = KiwixStorage(self.s3_url_with_credentials)
self.s3_storage = KiwixStorage(context.s3_url_with_credentials)

Check warning on line 272 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L272

Added line #L272 was not covered by tests
if not self.s3_storage.check_credentials( # pyright: ignore[reportUnknownMemberType]
list_buckets=True, bucket=True, write=True, read=True, failsafe=True
):
Expand Down
48 changes: 24 additions & 24 deletions scraper/src/mindtouch2zim/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,13 @@
from pydantic import BaseModel
from requests import Response

from mindtouch2zim.constants import (
HTTP_TIMEOUT_LONG_SECONDS,
HTTP_TIMEOUT_NORMAL_SECONDS,
logger,
web_session,
)
from mindtouch2zim.constants import logger
from mindtouch2zim.context import Context
from mindtouch2zim.errors import APITokenRetrievalError, MindtouchParsingError
from mindtouch2zim.html import get_soup

context = Context.get()


class MindtouchHome(BaseModel):
home_url: str
Expand Down Expand Up @@ -99,27 +97,25 @@
class MindtouchClient:
"""Utility functions to read data from mindtouch instance."""

def __init__(self) -> None:
    """Initialize MindtouchClient.

    Configuration (library URL, cache folder, timeouts) is read from the
    global Context rather than passed as constructor arguments.
    """
    # Deki API token; not retrieved yet — presumably populated later before
    # authenticated API calls (TODO confirm retrieval path)
    self.deki_token = None

@property
def api_url(self) -> str:
    """Base URL of the Deki API, derived from the Context's library URL."""
    return f"{context.library_url}/@api/deki"

Check warning on line 111 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L111

Added line #L111 was not covered by tests

def _get_cache_file(self, url_subpath_and_query: str) -> Path:
    """Get location where HTTP result should be cached"""
    # strip a single leading slash so the result stays relative to cache_folder
    url_subpath_and_query = re.sub(r"^/", "", url_subpath_and_query)
    # a trailing slash cannot be a filename; cache such URLs as <dir>/index
    if url_subpath_and_query.endswith("/"):
        url_subpath_and_query += "index"
    return context.cache_folder / url_subpath_and_query

Check warning on line 118 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L118

Added line #L118 was not covered by tests

def _get_text(self, url_subpath_and_query: str) -> str:
"""Perform a GET request and return the response as decoded text."""
Expand All @@ -129,13 +125,13 @@
return cache_file.read_text()
cache_file.parent.mkdir(parents=True, exist_ok=True)

full_url = f"{self.library_url}{url_subpath_and_query}"
full_url = f"{context.library_url}{url_subpath_and_query}"

Check warning on line 128 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L128

Added line #L128 was not covered by tests
logger.debug(f"Fetching {full_url}")

resp = web_session.get(
resp = context.web_session.get(

Check warning on line 131 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L131

Added line #L131 was not covered by tests
url=full_url,
allow_redirects=True,
timeout=HTTP_TIMEOUT_NORMAL_SECONDS,
timeout=context.http_timeout_normal_seconds,
)
resp.raise_for_status()

Expand All @@ -145,7 +141,7 @@
def _get_api_resp(self, api_sub_path_and_query: str, timeout: float) -> Response:
api_url = f"{self.api_url}{api_sub_path_and_query}"
logger.debug(f"Calling API at {api_url}")
resp = web_session.get(
resp = context.web_session.get(

Check warning on line 144 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L144

Added line #L144 was not covered by tests
url=api_url,
headers={"x-deki-token": self.deki_token},
timeout=timeout,
Expand All @@ -157,7 +153,7 @@
self,
api_sub_path: str,
query_params: str = "",
timeout: float = HTTP_TIMEOUT_NORMAL_SECONDS,
timeout: float = context.http_timeout_normal_seconds,
) -> Any:
cache_file = self._get_cache_file(f"api_json{api_sub_path}{query_params}.dat")
if cache_file.exists():
Expand All @@ -173,7 +169,7 @@
return result

def _get_api_content(
self, api_sub_path: str, timeout: float = HTTP_TIMEOUT_NORMAL_SECONDS
self, api_sub_path: str, timeout: float = context.http_timeout_normal_seconds
) -> bytes | Any:
cache_file = self._get_cache_file(f"api_content{api_sub_path}")
if cache_file.exists():
Expand All @@ -196,7 +192,7 @@
screen_css_url=_get_screen_css_url_from_home(soup),
print_css_url=_get_print_css_url_from_home(soup),
inline_css=_get_inline_css_from_home(soup),
home_url=f"{self.library_url}/",
home_url=f"{context.library_url}/",
icons_urls=_get_icons_urls(soup),
)

Expand All @@ -214,7 +210,9 @@
def get_all_pages_ids(self) -> list[LibraryPageId]:
"""Returns the IDs of all pages on current website, exploring the whole tree"""

tree = self._get_api_json("/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS)
tree = self._get_api_json(

Check warning on line 213 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L213

Added line #L213 was not covered by tests
"/pages/home/tree", timeout=context.http_timeout_long_seconds
)

page_ids: list[LibraryPageId] = []

Expand All @@ -235,13 +233,15 @@
def get_root_page_id(self) -> LibraryPageId:
    """Returns the ID of the root of the tree of pages"""

    tree = self._get_api_json(
        "/pages/home/tree", timeout=context.http_timeout_long_seconds
    )
    return tree["page"]["@id"]

def get_page_tree(self) -> LibraryTree:

tree_data = self._get_api_json(
"/pages/home/tree", timeout=HTTP_TIMEOUT_LONG_SECONDS
"/pages/home/tree", timeout=context.http_timeout_long_seconds
)

root = LibraryPage(
Expand Down Expand Up @@ -283,7 +283,7 @@
def get_page_content(self, page: LibraryPage) -> LibraryPageContent:
"""Returns the 'raw' content of a given page"""
tree = self._get_api_json(
f"/pages/{page.id}/contents", timeout=HTTP_TIMEOUT_NORMAL_SECONDS
f"/pages/{page.id}/contents", timeout=context.http_timeout_normal_seconds
)
if not isinstance(tree["body"][0], str):
raise MindtouchParsingError(
Expand Down Expand Up @@ -313,7 +313,7 @@
"""
if page.definition is None:
raw_definition = self._get_api_json(
f"/pages/{page.id}", timeout=HTTP_TIMEOUT_NORMAL_SECONDS
f"/pages/{page.id}", timeout=context.http_timeout_normal_seconds
)
raw_tags = raw_definition.get("tags", None)
if raw_tags is None:
Expand Down Expand Up @@ -369,7 +369,7 @@
tree = self._get_api_json(
f"/pages/{template}/contents",
query_params=f"pageid={page_id}",
timeout=HTTP_TIMEOUT_NORMAL_SECONDS,
timeout=context.http_timeout_normal_seconds,
)
if not tree.get("body", ""):
raise MindtouchParsingError(
Expand Down
17 changes: 6 additions & 11 deletions scraper/src/mindtouch2zim/constants.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import logging
import pathlib

from zimscraperlib.download import get_session
from zimscraperlib.logging import DEFAULT_FORMAT_WITH_THREADS, getLogger

from mindtouch2zim.__about__ import __version__
Expand All @@ -10,15 +9,11 @@
VERSION = __version__
ROOT_DIR = pathlib.Path(__file__).parent

# As of 2024-09-24, all libraries appears to be in English.
LANGUAGE_ISO_639_3 = "eng"

HTTP_TIMEOUT_NORMAL_SECONDS = 15
HTTP_TIMEOUT_LONG_SECONDS = 30

# Loading the CSS leads to many bad assets at these URLs, we just ignore them
KNOWN_BAD_ASSETS_REGEX = r"https?:\/\/a\.mtstatic\.com/@(cache|style)"

logger = getLogger(NAME, level=logging.DEBUG, log_format=DEFAULT_FORMAT_WITH_THREADS)
STANDARD_KNOWN_BAD_ASSETS_REGEX = r"https?:\/\/a\.mtstatic\.com/@(cache|style)"

web_session = get_session()
# logger to use everywhere (not part of Context class because we need it early, before
# Context has been initialized)
logger: logging.Logger = getLogger(
NAME, level=logging.DEBUG, log_format=DEFAULT_FORMAT_WITH_THREADS
)
Loading