-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpolice.py
35 lines (26 loc) · 1.14 KB
/
police.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import requests
from bs4 import BeautifulSoup
import json
ua = json.loads(open("config.json").read())["user_agent"]
def getPoliceReportsByYear():
r = requests.get("https://www.bart.gov/about/police/reports", headers={"User-agent": ua})
soup = BeautifulSoup(r.text, "html.parser")
# Big idea:
# We need to scrape all of the links between "<h2>Monthly Chief's Report</h2>" and "<h2>Annual Internal Affairs Reports</h2>"
start_element = soup.find("h2", string="Monthly Chief's Report")
end_element = soup.find("h2", string="Annual Internal Affairs Reports")
html = ""
if start_element and end_element:
# Extract content between these two H2 tags
for element in start_element.find_next_siblings():
if element == end_element:
break
html += str(element) # Get text only
# now, reparse the stringified html
newHTML = BeautifulSoup(html, "html.parser")
allLinks = newHTML.find_all("a")
allLinksHref = []
for link in allLinks:
allLinksHref.append(link.get("href"))
return allLinksHref
print(getPoliceReportsByYear())