From c59051759f33c5934e96b7448d223116b16cfafe Mon Sep 17 00:00:00 2001 From: ddungiii Date: Tue, 13 Feb 2024 14:30:16 +0000 Subject: [PATCH 1/3] fix(crawler): add login fail handling --- apps/core/management/scripts/portal_crawler.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/apps/core/management/scripts/portal_crawler.py b/apps/core/management/scripts/portal_crawler.py index dd527a1b..6a5f5059 100644 --- a/apps/core/management/scripts/portal_crawler.py +++ b/apps/core/management/scripts/portal_crawler.py @@ -208,11 +208,21 @@ def _get_board_today(page_num): linklist = [] links = soup.select("table > tbody > tr > td > a") dates = soup.select("table > tbody > tr > td:nth-child(5)") + total = soup.select("div > ul > li > em")[0].get_text() - if links: - log.info("------- portal login success!") - else: - log.info("------- portal login failed!") + if not links: + log.error("------- portal login failed!") + raise RuntimeError("portal login failed!") + + if int(total) < 10_000: + """ + If the total number of response articles is small, + all responses are public. 
(LOGIN FAILED) + """ + log.error("------- portal login cookie failed!") + raise RuntimeError(f"portal login cookie {COOKIES} failed!") + + log.info("------- portal login success!") today_date = str(day).replace("-", ".") for link, date in zip(links, dates): From 7da3204ea4a60c3db0a19a1920ee6dd3daca523e Mon Sep 17 00:00:00 2001 From: ddungiii Date: Tue, 13 Feb 2024 14:37:18 +0000 Subject: [PATCH 2/3] fix(crawler): fix bulk_create does not retrieve ids --- apps/core/management/scripts/portal_crawler.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/apps/core/management/scripts/portal_crawler.py b/apps/core/management/scripts/portal_crawler.py index 6a5f5059..23a3c608 100644 --- a/apps/core/management/scripts/portal_crawler.py +++ b/apps/core/management/scripts/portal_crawler.py @@ -326,7 +326,14 @@ def _get_board_today(page_num): last_portal_article_in_db.save() new_articles.pop() - created_articles = Article.objects.bulk_create(new_articles) + # @NOTE + # MySQL's bulk_create method does not return IDs. However, PortalViewCount requires the IDs of the created articles. + # Therefore, insert one article at a time and retrieve their IDs. 
+ # Reference: https://docs.djangoproject.com/en/5.0/ref/models/querysets/#bulk-create + created_articles = [] + for new_article in new_articles: + new_article.save() + created_articles.append(new_article) new_portal_view_counts = [] From 78700c5f550e1454d01ed85b01a9280925da6b7f Mon Sep 17 00:00:00 2001 From: ddungiii Date: Tue, 13 Feb 2024 15:29:34 +0000 Subject: [PATCH 3/3] fix(crawler): extract raw HTML content instead of prettier --- apps/core/management/scripts/portal_crawler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/core/management/scripts/portal_crawler.py b/apps/core/management/scripts/portal_crawler.py index 23a3c608..0a0c1646 100644 --- a/apps/core/management/scripts/portal_crawler.py +++ b/apps/core/management/scripts/portal_crawler.py @@ -163,13 +163,13 @@ def _save_portal_image(html, session): for tr in trs: if len(list(tr.children)) == 3: - html = tr.find("td").prettify() + html = str(tr.find("td")) break if html is None: for tr in trs: if len(list(tr.children)) == 2: - html = tr.find("td").prettify() + html = str(tr.find("td")) break html = _save_portal_image(html, session)