-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path func.py
98 lines (79 loc) · 3.12 KB
/
func.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import datetime
import pytz
import urllib.request
from bs4 import BeautifulSoup
from tika import parser
# some useful functions for parsing
# timestamp for parsing date
def create_timestamp():
    """Return the current time in Europe/Kiev formatted as 'DD-MM-YYYY_HH-MM'.

    Bug fix: the original format string was "%d-%m-%Y_%H-%I", i.e. the
    24-hour hour followed by the 12-hour hour — minutes ("%M") were
    clearly intended for the second field.
    """
    # Start from a timezone-aware "now" in UTC to avoid naive datetimes.
    now_utc = datetime.datetime.now(tz=pytz.UTC)
    # Convert to the local Kyiv timezone before formatting.
    local_now = now_utc.astimezone(pytz.timezone('Europe/Kiev'))
    return local_now.strftime("%d-%m-%Y_%H-%M")
# create results in a csv file
def create_csv(df, filename):
    """Write *df* to a timestamped CSV file under result_csv/.

    NOTE(review): pandas' DataFrame.to_csv returns None when given a
    path, so `csv_file` — and this function's return value — is always
    None; confirm no caller relies on it. The *filename* parameter is
    currently unused: the output name is hard-coded around the
    timestamp. The result_csv/ directory must already exist.
    """
    # Timestamp makes each export file name unique per run.
    file_timestamp = create_timestamp()
    csv_file = df.to_csv(f'result_csv/(unknown)_{file_timestamp}.csv', index=False, encoding='utf-8')
    return csv_file
# URL request function
def url_request(url):
    """Fetch *url* and return its body decoded as UTF-8.

    Sends a browser-like User-Agent and a Ukrainian Accept-Language
    header. Returns the decoded body for 2xx responses, otherwise None.
    Propagates urllib.error.URLError / HTTPError on network failures.

    Bug fix: the original never closed the response, leaking the
    underlying socket; the `with` block guarantees cleanup.
    """
    # http://www.networkinghowtos.com/howto/common-user-agent-list/
    HEADERS = ({'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0',
        'Accept-Language': 'uk-UA, ukr;q=0.5'})
    request = urllib.request.Request(url, headers=HEADERS)
    with urllib.request.urlopen(request, timeout=5) as response:
        status_code = response.status
        print(f'Status code {status_code}')
        # if no error, then read the response contents
        if 200 <= status_code < 300:
            return response.read().decode("utf8")
    return None
# Function for reading pdf files
def read_pdf(filename):
    """Parse *filename* with Apache Tika and return its result dict."""
    return parser.from_file(filename)
# Function for cleaning raw text
def cleaning_raw_text(text_strings):
    """Normalise raw extracted text.

    Drops bytes that do not survive a UTF-8 round-trip, squeezes runs
    of newlines and dashes via a fixed replacement sequence, and joins
    the first two lines by removing only the first newline.
    """
    # Round-trip through UTF-8, silently discarding invalid characters.
    cleaned = text_strings.encode('utf-8', errors='ignore').decode('utf-8')
    # Same replacement sequence as before, applied in order; each pass
    # is a full left-to-right scan of the current string.
    replacements = (
        ("\nn", "\n"),
        ("\nnn", "\n"),
        ("\n\n\n\n\n", "\n"),
        ("\n\n\n\n", "\n"),
        ("\n\n\n", "\n"),
        ("\n\n", "\n"),
        ("\n\n", "\n"),
        ("-----", "-"),
        ("----", "-"),
        ("---", "-"),
    )
    for old, new in replacements:
        cleaned = cleaned.replace(old, new)
    # Drop only the first newline, merging the first two lines.
    return ''.join(cleaned.split('\n', 1))
def htmlToText(html):
    """Strip markup from an HTML string and return its visible text.

    Idiom fix: the original abused a list comprehension purely for its
    side effects; a plain loop expresses the in-place tree mutation.
    """
    soup = BeautifulSoup(html, 'lxml')
    # removing scripts, styles and other useless tags
    for element in soup(['style', 'script', 'meta', '[document]', 'head', 'title']):
        element.extract()
    # getting text from html
    return soup.getText()
def read_html(filename):
    """Run Apache Tika over an HTML file and return its result dict."""
    parsed = parser.from_file(filename)
    return parsed
# Module-level accumulator: read_html_files appends cleaned document
# text here across calls.
html_content_list = []
def read_html_files(path=None):
    """Extract and clean text from each HTML file named in *path*.

    *path* is an iterable of filenames (despite the name — TODO confirm
    with callers). Each file is parsed with Tika, its content field
    cleaned, and the result appended to the shared html_content_list.
    Files that fail to parse are reported and skipped (best-effort).

    Returns html_content_list for convenience (the original implicitly
    returned None; callers ignoring the return value are unaffected).
    """
    if path is None:
        return html_content_list
    for i, file in enumerate(path):
        try:
            text = read_html(file)
            # Tika returns a dict; the second entry is assumed to be
            # the 'content' field — TODO confirm this ordering holds.
            pairs = [[k, v] for k, v in text.items()]
            text_strings = str(pairs[1][1])
        except Exception:
            # Best-effort: report which file failed and keep going.
            print(f"Something is wrong with reading HTML file #{i}")
            continue
        html_content_list.append(cleaning_raw_text(text_strings))
    return html_content_list