Skip to content

Commit 2b107da

Browse files
committed
Pass User-Agent to all web requests
1 parent 08e9733 commit 2b107da

File tree

7 files changed

+87
-17
lines changed

7 files changed

+87
-17
lines changed

Dockerfile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ COPY --from=zimui /src/dist /src/zimui
4343

4444
ENV MINDTOUCH_ZIMUI_DIST=/src/zimui \
4545
MINDTOUCH_OUTPUT=/output \
46-
MINDTOUCH_TMP=/tmp
46+
MINDTOUCH_TMP=/tmp\
47+
MINDTOUCH_CONTACT_INFO=https://www.kiwix.org
4748

4849
CMD ["mindtouch2zim", "--help"]

scraper/src/mindtouch2zim/asset.py

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,13 @@
88
from pif import get_public_ip
99
from PIL import Image
1010
from requests.exceptions import RequestException
11-
from zimscraperlib.download import stream_file
1211
from zimscraperlib.image.optimization import optimize_webp
1312
from zimscraperlib.image.presets import WebpMedium
1413
from zimscraperlib.rewriting.url_rewriting import HttpUrl, ZimPath
1514
from zimscraperlib.zim import Creator
1615

1716
from mindtouch2zim.constants import KNOWN_BAD_ASSETS_REGEX, logger, web_session
17+
from mindtouch2zim.download import stream_file
1818
from mindtouch2zim.errors import KnownBadAssetFailedError
1919
from mindtouch2zim.utils import backoff_hdlr
2020

@@ -241,20 +241,22 @@ def get_asset_content(
241241
) -> BytesIO:
242242
"""Download of a given asset, optimize if needed, or download from S3 cache"""
243243

244-
if not always_fetch_online:
245-
header_data = self._get_header_data_for(asset_url)
246-
if header_data.content_type:
247-
mime_type = header_data.content_type.split(";")[0].strip()
248-
if mime_type in SUPPORTED_IMAGE_MIME_TYPES:
249-
return self._get_image_content(
250-
asset_path=asset_path,
251-
asset_url=asset_url,
252-
header_data=header_data,
253-
)
254-
else:
255-
logger.debug(f"Not optimizing, unsupported mime type: {mime_type}")
256-
257244
try:
245+
if not always_fetch_online:
246+
header_data = self._get_header_data_for(asset_url)
247+
if header_data.content_type:
248+
mime_type = header_data.content_type.split(";")[0].strip()
249+
if mime_type in SUPPORTED_IMAGE_MIME_TYPES:
250+
return self._get_image_content(
251+
asset_path=asset_path,
252+
asset_url=asset_url,
253+
header_data=header_data,
254+
)
255+
else:
256+
logger.debug(
257+
f"Not optimizing, unsupported mime type: {mime_type}"
258+
)
259+
258260
return self._download_from_online(asset_url=asset_url)
259261
except RequestException as exc:
260262
# check if the failing download match known bad assets regex early, and if

scraper/src/mindtouch2zim/constants.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,6 @@
2222
logger = getLogger(NAME, level=logging.DEBUG, log_format=DEFAULT_FORMAT_WITH_THREADS)
2323

2424
web_session = get_session()
25+
26+
# info passed in User-Agent header of web requests
27+
CONTACT_INFO = "https://www.kiwix.org"

scraper/src/mindtouch2zim/download.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
import pathlib
2+
from typing import IO
3+
4+
import requests
5+
import requests.structures
6+
import urllib3
7+
import zimscraperlib.__about__
8+
import zimscraperlib.constants
9+
from zimscraperlib.download import stream_file as stream_file_orig
10+
11+
from mindtouch2zim.constants import CONTACT_INFO, NAME, VERSION
12+
13+
14+
def get_user_agent() -> str:
    """Build the User-Agent header value sent with web requests.

    The value has the form
    ``<NAME>/<VERSION> (<contact info>) <zimscraperlib name>/<version>
    requests/<version> urllib3/<version>``.

    Returns:
        The User-Agent string, built from the contact info currently set on
        ``mindtouch2zim.constants``.
    """
    # Read CONTACT_INFO from the constants module at call time. A
    # ``from mindtouch2zim.constants import CONTACT_INFO`` binding is a copy
    # taken at import time, so it would not see the value the processor assigns
    # to ``mindtouch2zim.constants.CONTACT_INFO`` from the CLI argument.
    from mindtouch2zim import constants as scraper_constants

    return (
        f"{NAME}/{VERSION} ({scraper_constants.CONTACT_INFO}) "
        f"{zimscraperlib.constants.NAME}/{zimscraperlib.__about__.__version__} "
        f"requests/{requests.__version__} "
        # urllib3 exposes its version publicly; avoid the private _version module
        f"urllib3/{urllib3.__version__}"
    )
21+
22+
23+
def stream_file(
    url: str,
    fpath: pathlib.Path | None = None,
    byte_stream: IO[bytes] | None = None,
    block_size: int | None = 1024,
    proxies: dict[str, str] | None = None,
    max_retries: int | None = 5,
    headers: dict[str, str] | None = None,
    session: requests.Session | None = None,
    *,
    only_first_block: bool | None = False,
) -> tuple[int, requests.structures.CaseInsensitiveDict[str]]:
    """Call zimscraperlib's ``stream_file`` with this scraper's User-Agent set.

    All arguments are forwarded unchanged to ``zimscraperlib.download.stream_file``
    except ``headers``, which always carries a ``User-Agent`` entry built by
    ``get_user_agent()``. Any ``User-Agent`` entry supplied by the caller under
    that exact key is replaced.

    Args:
        url: passed through to the wrapped function.
        fpath: passed through to the wrapped function.
        byte_stream: passed through to the wrapped function.
        block_size: passed through to the wrapped function.
        proxies: passed through to the wrapped function.
        max_retries: passed through to the wrapped function.
        headers: extra request headers; the mapping given by the caller is
            left untouched.
        session: passed through to the wrapped function.
        only_first_block: passed through to the wrapped function.

    Returns:
        Whatever the wrapped ``stream_file`` returns.
    """
    # Build a new dict instead of assigning into ``headers``: the caller may
    # reuse the same mapping elsewhere and must not see it modified.
    headers = {**(headers or {}), "User-Agent": get_user_agent()}
    return stream_file_orig(
        url=url,
        fpath=fpath,
        byte_stream=byte_stream,
        block_size=block_size,
        proxies=proxies,
        max_retries=max_retries,
        headers=headers,
        session=session,
        only_first_block=only_first_block,
    )

scraper/src/mindtouch2zim/entrypoint.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,12 @@ def main(tmpdir: str) -> None:
242242
dest="bad_assets_threshold",
243243
)
244244

245+
parser.add_argument(
246+
"--contact-info",
247+
help="Contact information to pass in User-Agent headers",
248+
default=os.getenv("MINDTOUCH_CONTACT_INFO", "https://www.kiwix.org"),
249+
)
250+
245251
args = parser.parse_args()
246252

247253
logger.setLevel(level=logging.DEBUG if args.debug else logging.INFO)
@@ -281,6 +287,7 @@ def main(tmpdir: str) -> None:
281287
assets_workers=args.assets_workers,
282288
bad_assets_regex=args.bad_assets_regex,
283289
bad_assets_threshold=args.bad_assets_threshold,
290+
contact_info=args.contact_info,
284291
).run()
285292
except SystemExit:
286293
logger.error("Generation failed, exiting")

scraper/src/mindtouch2zim/processor.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
from requests import RequestException
1414
from requests.exceptions import HTTPError
1515
from schedule import every, run_pending
16-
from zimscraperlib.download import stream_file
1716
from zimscraperlib.image import convert_image, resize_image
1817
from zimscraperlib.image.conversion import convert_svg2png
1918
from zimscraperlib.image.probing import format_for
@@ -47,6 +46,7 @@
4746
logger,
4847
web_session,
4948
)
49+
from mindtouch2zim.download import get_user_agent, stream_file
5050
from mindtouch2zim.errors import NoIllustrationFoundError
5151
from mindtouch2zim.html import get_text
5252
from mindtouch2zim.html_rewriting import HtmlUrlsRewriter
@@ -146,6 +146,7 @@ def __init__(
146146
bad_assets_regex: str | None,
147147
bad_assets_threshold: int,
148148
assets_workers: int,
149+
contact_info: str,
149150
*,
150151
overwrite_existing_zim: bool,
151152
) -> None:
@@ -176,6 +177,8 @@ def __init__(
176177
self.asset_executor = Parallel(
177178
n_jobs=assets_workers, return_as="generator_unordered", backend="threading"
178179
)
180+
mindtouch2zim.constants.CONTACT_INFO = contact_info
181+
logger.debug(f"User-Agent: { get_user_agent()}")
179182

180183
self.stats_items_done = 0
181184
# we add 1 more items to process so that progress is not 100% at the beginning

scraper/tests-integration/test_client.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
MindtouchClient,
1515
MindtouchHome,
1616
)
17+
from mindtouch2zim.download import get_user_agent
1718

1819

1920
@pytest.fixture(scope="module")
@@ -119,7 +120,11 @@ def test_get_home_image_url(home: MindtouchHome):
119120
def test_get_home_image_size(home: MindtouchHome, home_png_size: int):
120121
"""Ensures image url is retrievable"""
121122
dst = io.BytesIO()
122-
stream_file(home.welcome_image_url, byte_stream=dst)
123+
stream_file(
124+
home.welcome_image_url,
125+
byte_stream=dst,
126+
headers={"User-Agent": get_user_agent()},
127+
)
123128
assert format_for(dst, from_suffix=False) == "PNG"
124129
assert len(dst.getvalue()) == home_png_size
125130

0 commit comments

Comments
 (0)