# The website we're scraping data from is: https://ags.aer.ca/data-maps-models/digital-data
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
from nltk.probability import FreqDist
source = requests.get('https://ags.aer.ca/data-maps-models/digital-data')
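# Optional robustness (my addition): fail fast if the request did not succeed
source.raise_for_status()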
content = source.content
# Creating a soup object based on the content of the source
soup = BeautifulSoup(content, features='lxml')
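# Note: the 'lxml' parser requires the third-party lxml package to be installed;
# BeautifulSoup's built-in 'html.parser' works as a dependency-free fallback.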
# Narrowing down the search scope
result = soup.find(class_="list-results row")
# HTML blocks that contain the year & id, title, and author data for each result
result_blocks = result.find_all(class_="unstyled item-content")
# HTML blocks that contain the publication date data
date_blocks = result.find_all(class_="pub-date")
# HTML list blocks that contain the author data
author_blocks = result.find_all(class_="author-list")
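# Sanity check (my addition): if the site's markup changes, these selectors come
# back empty; failing loudly here beats silently plotting nothing later.
if not (result_blocks and date_blocks and author_blocks):
    raise RuntimeError("Unexpected page structure; the CSS class names may have changed.")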
# Extracting all dates, titles, and author names
# Append the data to separate lists
year_list = []
for element in result_blocks:
    li_tag = element.find("li")
    # The 4-digit year sits at a fixed offset (characters 13-17) of the <li> text
    year_list.append(li_tag.get_text()[13:17])
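# A less brittle alternative (my sketch): the fixed [13:17] slice breaks if the
# <li> text shifts; grabbing the first 4-digit year with a regex would not, e.g.:
# import re
# match = re.search(r"\b(19|20)\d{2}\b", li_tag.get_text())
# year_list.append(match.group() if match else "")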
publication_date_list = []
for element in date_blocks:
    # strip() removes the whitespace on the left/right of the dates
    publication_date_list.append(element.get_text().strip())
title_list = []
for element in result_blocks:
    a_tag = element.find("a")
    title_list.append(a_tag.get_text())
author_list = []
for element in author_blocks:
    a_tags = element.find_all("a")
    # Append every author name; a publication can have several <a> tags (one per author)
    for a_tag in a_tags:
        author_list.append(a_tag.get_text())
# Extract the abstract for each article (needs a second GET request per article)
# Issue: fetching the pages one by one takes too much time (~25 min), so the block
# below is disabled
"""
abstract_list = []
for element in result_blocks:
    a_tag = element.find("a")
    link = a_tag.attrs['href']
    article = requests.get('https://ags.aer.ca/' + link)
    content = article.content
    soup = BeautifulSoup(content, 'lxml')
    summary = soup.find(id="summary")
    abstract = summary.find("p")
    abstract_list.append(abstract.get_text())
"""
# print(year_list)
# print(title_list)
# print(author_list)
# print(publication_date_list)
# print(abstract_list)
# Visualizing the number of data sets posted every year from 2002 to 2019
num_bins = 18
# Cast the year strings to ints so the histogram bins them numerically
years = [int(year) for year in year_list if year.isdigit()]
n, bins, patches = plt.hist(years, num_bins, facecolor='blue', alpha=0.5)
plt.xlabel('Years')
plt.ylabel('# of Sets')
plt.title('Number of Published Data Sets from 2002 - 2019')
# Creating a new plot for visualizing the most prolific authors
plt.figure()
# Declaring an fdist object of class FreqDist to count how often each author appears
fdist = FreqDist()
for word in author_list:
    fdist[word.lower()] += 1
plt.title('Number of Published Data Sets per Author')
fdist.plot(20)
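# Show both figures explicitly when running as a script (depending on the nltk
# version, fdist.plot() may or may not call plt.show() itself).
plt.show()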