IQNewsClipScraper.py
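"""Scraper for the IQNewsClip news-clipping service (edu.iqnewsclip.com).

Searches news headlines by keyword and source and returns the results
as pandas DataFrames.
"""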
import datetime

import requests
import pandas as pd
from bs4 import BeautifulSoup

import logger  # local module providing create_rotating_log()
from utils import SOURCES_CODE  # local mapping of source names to IQNewsClip source codes


class IQNewsClipScraper:
    def __init__(self, cookies=None):
        self.session = requests.Session()
        self._has_next = None
        self.logger = logger.create_rotating_log()
        if cookies is not None:
            self.session.cookies.update(cookies)

    def login(self):
        """GET the authentication endpoint and return the response."""
        response = self.session.get('http://edu.iqnewsclip.com/ajax/authentication.aspx')
        return response
    def search_once(self, search_key, source, from_date=None, to_date=None):
        """Return a pandas.DataFrame with the first page of results for one keyword search."""
        # IQNewsClip expects dates as dd/mm/yyyy in the Buddhist Era (CE year + 543)
        if isinstance(from_date, (datetime.date, datetime.datetime)):
            from_date = f'{from_date.day:02d}/{from_date.month:02d}/{from_date.year + 543}'
        if isinstance(to_date, (datetime.date, datetime.datetime)):
            to_date = f'{to_date.day:02d}/{to_date.month:02d}/{to_date.year + 543}'
        payload = {
            # Thai for "all subscribed categories"
            'CtrlSearch1:txtCategory': 'ทุกหัวเรื่องที่รับบริการ',
            'CtrlSearch1:hdnews': SOURCES_CODE[source],
            'CtrlSearch1:txtSearch': search_key,
            'CtrlSearch1:txtDateFrom': from_date,
            'CtrlSearch1:txtDateTo': to_date,
        }
        r = self.session.post('http://edu.iqnewsclip.com/ajax/GetResult.aspx?stype=search&rbt=true', data=payload)
        return self.extract_html(r.content)
    def search_next(self):
        """Return a pandas.DataFrame for the next page of results, or None on a request error."""
        r = self.session.get('http://edu.iqnewsclip.com/ajax/GetResult.aspx?pg=next')
        try:
            r.raise_for_status()
            df = self.extract_html(r.content)
        except requests.exceptions.RequestException as e:
            self._has_next = False
            df = None
            self.logger.error(f'RequestException: {e}')
        return df
    def search_all(self, search_key: str, source: str, from_date=None, to_date=None):
        """Return a pandas.DataFrame of all result pages for the given search_key and source."""
        df = self.search_once(search_key, source, from_date, to_date)
        pages = 1
        while self.has_next():
            next_df = self.search_next()
            if next_df is not None:
                # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
                df = pd.concat([df, next_df], ignore_index=True)
                pages += 1
        # add a symbol column so results from different keywords can be combined
        df.insert(1, 'Symbol', search_key)
        self.logger.info(f'Searched {pages} pages of {search_key}-{source}')
        return df
    def extract_html(self, html):
        """Convert an HTML result page into a pandas.DataFrame."""
        soup = BeautifulSoup(html, features='lxml', from_encoding='windows-874')
        # each result row is a (date, source, headline) triple of <a>/<td> tags
        tags = soup.find_all(['a', 'td'], class_=['HeadlineBlue', 'normalGray'])
        data = {'Date': [], 'Source': [], 'HeadLine': []}
        it = iter(tags)
        for tag in it:
            data['Date'].append(tag.text.strip())
            data['Source'].append(next(it).text.strip())
            data['HeadLine'].append(next(it).text.strip())
        try:
            # the pager label contains 'Next>>' when more pages are available
            self._has_next = 'Next>>' in soup.find('label', id='lblNavigate1').text
        except AttributeError:
            # soup.find returned None: the page has no pager label
            self._has_next = False
            self.logger.error('An error occurred while extracting HTML')
        return pd.DataFrame(data)
    def has_next(self):
        """Return True if a next page of results exists."""
        return self._has_next
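

# Usage sketch, not part of the original module. Assumptions: 'Bangkok Post'
# is a key in the local SOURCES_CODE mapping, and the authentication endpoint
# grants a usable session; adjust both for your setup.
if __name__ == '__main__':
    scraper = IQNewsClipScraper()
    scraper.login()
    results = scraper.search_all(
        'PTT', 'Bangkok Post',
        from_date=datetime.date(2020, 1, 1),
        to_date=datetime.date(2020, 12, 31),
    )
    print(results.head())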