Skip to content

Commit e4d96c1

Browse files
judtinzhang, vcai122, ashleyzhang01
authored
Penn Events Script Updates (#250)
* Add self to penn events script
* Add webdriver and use firefox for debugging
* fix some date parsing + refactor
* rate limit
* Lint
* uwsgi fix

---------

Co-authored-by: vcai122 <vincent.delaware@gmail.com>
Co-authored-by: ashleyzhang01 <ashleyzhang10@gmail.com>
1 parent 693f353 commit e4d96c1

File tree

3 files changed

+92
-50
lines changed

3 files changed

+92
-50
lines changed

backend/Pipfile

+2-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ django = "==5.0.2"
2828
django-cors-headers = "*"
2929
pyyaml = "*"
3030
uritemplate = "*"
31-
uwsgi = {version = "*", markers = "sys_platform== 'linux'"}
31+
uwsgi = "*"
3232
django-filter = "*"
3333
django-labs-accounts = "==0.9.5"
3434
django-debug-toolbar = "*"
@@ -45,6 +45,7 @@ django-redis = "*"
4545
redis = "*"
4646
python-dateutil = "*"
4747
selenium = "*"
48+
webdriver-manager = "*"
4849

4950
[requires]
5051
python_version = "3.11"

backend/Pipfile.lock

+39-13
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

backend/penndata/management/commands/get_penn_today_events.py

+51-36
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,11 @@
66
from django.utils import timezone
77
from selenium import webdriver
88
from selenium.webdriver.common.by import By
9+
from selenium.webdriver.firefox.options import Options
10+
from selenium.webdriver.firefox.service import Service as FirefoxService
911
from selenium.webdriver.support import expected_conditions as EC
1012
from selenium.webdriver.support.ui import WebDriverWait
13+
from webdriver_manager.firefox import GeckoDriverManager
1114

1215
from penndata.models import Event
1316

@@ -26,20 +29,12 @@ def handle(self, *args, **kwargs):
2629
# past_events.delete()
2730

2831
# Scrapes Penn Today
29-
try:
30-
driver = webdriver.Chrome()
31-
32-
driver.get(PENN_TODAY_WEBSITE)
33-
events_list = WebDriverWait(driver, 10).until(
34-
EC.presence_of_element_located((By.ID, "events-list"))
32+
if not (
33+
soup := self.connect_and_parse_html(
34+
PENN_TODAY_WEBSITE, EC.presence_of_element_located((By.ID, "events-list"))
3535
)
36-
37-
html_content = events_list.get_attribute("innerHTML")
38-
driver.quit()
39-
except ConnectionError:
40-
return None
41-
42-
soup = BeautifulSoup(html_content, "html.parser")
36+
):
37+
return
4338

4439
event_articles = soup.find_all("article", class_="tease")
4540

@@ -73,12 +68,16 @@ def handle(self, *args, **kwargs):
7368
if start_date.month < current_month:
7469
# If scraped month is before current month, increment year
7570
start_date = start_date.replace(year=current_year + 1)
76-
if start_time_str == ALL_DAY:
71+
print(start_date_str)
72+
if ALL_DAY in start_time_str.lower():
7773
start_time = datetime.time(0, 0)
7874
else:
7975
start_time = datetime.datetime.strptime(start_time_str, "%I:%M%p").time()
8076
start_date = datetime.datetime.combine(start_date, start_time)
8177

78+
if start_date > now + datetime.timedelta(days=31):
79+
continue
80+
8281
event_url = urljoin(PENN_TODAY_WEBSITE, article.find("a", class_="tease__link")["href"])
8382

8483
end_time = self.get_end_time(event_url)
@@ -95,47 +94,63 @@ def handle(self, *args, **kwargs):
9594
end_of_day = datetime.time(23, 59, 59)
9695
if end_date_elem: # end date but no end time
9796
end_date_str = end_date_elem.text.strip().split(" ")[-1]
98-
end_date = datetime.combine(
97+
end_date = datetime.datetime.combine(
9998
datetime.datetime.strptime(end_date_str, "%m/%d/%Y"), end_of_day
10099
)
100+
101101
else: # no end date or end time
102-
end_date = datetime.combine(start_date, end_of_day)
102+
end_date = datetime.datetime.combine(start_date, end_of_day)
103103

104104
Event.objects.update_or_create(
105105
name=name,
106106
defaults={
107-
"event_type": "",
107+
"event_type": "Penn Today",
108108
"image_url": "",
109-
"start": start_date,
110-
"end": end_date,
109+
"start": timezone.make_aware(start_date),
110+
"end": timezone.make_aware(end_date),
111111
"location": location,
112112
"website": event_url,
113113
"description": description,
114114
"email": "",
115115
},
116116
)
117117

118-
self.stdout.write("Uploaded Events!")
118+
self.stdout.write("Uploaded Penn Today Events!")
119+
120+
def connect_and_parse_html(self, event_url, condition):
    """Open ``event_url`` in headless Firefox and parse the awaited element.

    Args:
        event_url: URL of the page to load.
        condition: Callable handed to ``WebDriverWait(...).until``; it is
            expected to resolve to the element whose inner HTML is wanted.

    Returns:
        A ``BeautifulSoup`` tree built from the element's ``innerHTML``, or
        ``None`` if a ``ConnectionError`` is raised while driving the browser.

    Any other exception (for example the wait timing out after 10 seconds)
    propagates to the caller, but the browser is still shut down first.
    """
    driver = None
    try:
        options = Options()
        options.add_argument("--headless")
        driver = webdriver.Firefox(
            service=FirefoxService(GeckoDriverManager().install()), options=options
        )

        driver.get(event_url)
        print("WAITING FOR ELEMENT")
        element = WebDriverWait(driver, 10).until(condition)
        print("ELEMENT FOUND")

        html_content = element.get_attribute("innerHTML")
        return BeautifulSoup(html_content, "html.parser")
    except ConnectionError:
        print("Connection Error to webdriver")
        return None
    finally:
        # Quit on every exit path: this method is called once per event page,
        # so a browser left running after a failed wait would accumulate.
        if driver is not None:
            driver.quit()
119139

120-
def get_end_time(event_url):
121-
driver = webdriver.Chrome()
122-
driver.get(event_url)
123-
event_element = WebDriverWait(driver, 10).until(
124-
EC.presence_of_element_located((By.CLASS_NAME, "event__topper-content"))
140+
def get_end_time(self, event_url):
    """Scrape an event's end time from its detail page.

    Args:
        event_url: URL of the event's detail page.

    Returns:
        The text following the first ``" - "`` in the page's time range
        (periods stripped), or ``None`` when the page could not be loaded,
        has no time element, is marked all day, or lists no range.
    """
    end_time_soup = self.connect_and_parse_html(
        event_url, EC.presence_of_element_located((By.CLASS_NAME, "event__topper-content"))
    )
    if end_time_soup is None:
        # connect_and_parse_html returns None on a connection error.
        return None

    time_elem = end_time_soup.find("p", class_="event__meta event__time")
    if time_elem is None:
        # Page has no time paragraph; treat it like "no end time".
        return None

    end_time_range_str = time_elem.text.strip().replace(".", "")

    if (
        not end_time_range_str
        or ALL_DAY in end_time_range_str.lower()
        or len(times := end_time_range_str.split(" - ")) <= 1
    ):
        # Empty text, an all-day event, or a single time with no range.
        return None

    return times[1]

0 commit comments

Comments
 (0)