-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_vote_text.py
90 lines (83 loc) · 3.18 KB
/
get_vote_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import const
import csv
import pathlib
import time
import urllib.request
import urllib.robotparser
import utils
import progressbar as pb
from argparse import ArgumentParser
from typeguard import typechecked
@typechecked
def get_vote_details(file_in: pathlib.Path, folder_out: pathlib.Path) -> None:
    """
    Get the details of each vote

    The first row of the csv file is skipped (presumably a header) and the
    first column of every following row is used as the vote id. A vote is
    only downloaded when the robots.txt at const.ROBOTS_GOVTRACK allows its
    url for const.USER_AGENT; otherwise a message is printed and the vote
    is skipped.

    Parameters
    ----------
    file_in : pathlib.Path
        File containing the list of vote
    folder_out : pathlib.Path
        Folder to contain the downloaded vote details
    """
    utils.ensure_empty_folder(folder_out)

    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(const.ROBOTS_GOVTRACK)
    rp.read()

    # crawl_delay() returns None when robots.txt declares no (valid)
    # Crawl-delay for this user agent; time.sleep(None) would raise a
    # TypeError, so the value is looked up once and only used when set.
    crawl_delay = rp.crawl_delay(const.USER_AGENT)

    widgets = [ 'Retrieving Details # ', pb.Counter(), ' ', pb.Timer(), ' ', pb.BouncingBar(marker = '.', left = '[', right = ']')]
    with pb.ProgressBar(widgets = widgets) as bar:
        # The handle gets its own name so the file_in parameter is not shadowed.
        with open(file_in, 'r', encoding = 'utf-8', newline = '') as csv_file:
            reader = csv.reader(csv_file, delimiter = ',', quotechar = '"', quoting = csv.QUOTE_ALL)
            next(reader, None)
            # csv.reader yields [] for blank lines; those carry no vote id.
            rows = (row for row in reader if row)
            for bar_i, row in enumerate(rows, start = 1):
                bar.update(bar_i)
                vote_id = row[0]
                url = const.URL_VOTE_DETAILS.format(vote_id = vote_id)
                if rp.can_fetch(const.USER_AGENT, url):
                    if crawl_delay:
                        time.sleep(crawl_delay)
                    __download_vote_details(folder_out, vote_id)
                else:
                    print(f'robots.txt forbids url: {url}')
@typechecked
def __download_vote_details(folder_out : pathlib.Path, vote_id: str) -> None:
    """
    Download the csv export of a single vote's details.

    The response body is written to a .csv file inside folder_out when the
    server answers with status 200. Any other status, as well as an
    HTTPError / URLError raised while opening the url, is reported with
    print and nothing is written.

    Parameters
    ----------
    folder_out : pathlib.Path
        Folder to contain the downloaded documents
    vote_id : str
        The vote's id, {congress}-{year}/{vote #}
    """
    source_url = const.URL_VOTE_DETAILS.format(vote_id = vote_id)
    target_file = folder_out / f'./{utils.vote_id_to_path(vote_id)}.csv'
    request = urllib.request.Request(source_url, headers = {'User-Agent': const.USER_AGENT})

    try:
        with urllib.request.urlopen(request) as response:
            status = response.getcode()
            if status != 200:
                print(f'could not open ({status}) url: {source_url}')
                return
            payload = response.read()
            with open(target_file, 'wb') as out_file:
                out_file.write(payload)
    except urllib.error.URLError as err:
        # HTTPError is a subclass of URLError and is reported the same way,
        # so a single handler covers both cases.
        print(f'could not open ({err.reason}) url: {source_url}')
if __name__ == '__main__':
    # Command line entry point: both options are required and are
    # converted to pathlib.Path by argparse before being passed on.
    cli = ArgumentParser()
    for flags, description in (
        (('-in', '--file-in'), 'File containing the list of votes'),
        (('-out', '--folder-out'), 'Folder to contain the downloaded vote details'),
    ):
        cli.add_argument(*flags, help = description, type = pathlib.Path, required = True)
    options = cli.parse_args()

    # Echo the resolved arguments before starting the download run.
    print(f'file in: {options.file_in}')
    print(f'folder out: {options.folder_out}')
    get_vote_details(options.file_in, options.folder_out)