6
6
from django .utils import timezone
7
7
from selenium import webdriver
8
8
from selenium .webdriver .common .by import By
9
+ from selenium .webdriver .firefox .options import Options
10
+ from selenium .webdriver .firefox .service import Service as FirefoxService
9
11
from selenium .webdriver .support import expected_conditions as EC
10
12
from selenium .webdriver .support .ui import WebDriverWait
13
+ from webdriver_manager .firefox import GeckoDriverManager
11
14
12
15
from penndata .models import Event
13
16
@@ -26,20 +29,12 @@ def handle(self, *args, **kwargs):
26
29
# past_events.delete()
27
30
28
31
# Scrapes Penn Today
29
- try :
30
- driver = webdriver .Chrome ()
31
-
32
- driver .get (PENN_TODAY_WEBSITE )
33
- events_list = WebDriverWait (driver , 10 ).until (
34
- EC .presence_of_element_located ((By .ID , "events-list" ))
32
+ if not (
33
+ soup := self .connect_and_parse_html (
34
+ PENN_TODAY_WEBSITE , EC .presence_of_element_located ((By .ID , "events-list" ))
35
35
)
36
-
37
- html_content = events_list .get_attribute ("innerHTML" )
38
- driver .quit ()
39
- except ConnectionError :
40
- return None
41
-
42
- soup = BeautifulSoup (html_content , "html.parser" )
36
+ ):
37
+ return
43
38
44
39
event_articles = soup .find_all ("article" , class_ = "tease" )
45
40
@@ -73,12 +68,16 @@ def handle(self, *args, **kwargs):
73
68
if start_date .month < current_month :
74
69
# If scraped month is before current month, increment year
75
70
start_date = start_date .replace (year = current_year + 1 )
76
- if start_time_str == ALL_DAY :
71
+ print (start_date_str )
72
+ if ALL_DAY in start_time_str .lower ():
77
73
start_time = datetime .time (0 , 0 )
78
74
else :
79
75
start_time = datetime .datetime .strptime (start_time_str , "%I:%M%p" ).time ()
80
76
start_date = datetime .datetime .combine (start_date , start_time )
81
77
78
+ if start_date > now + datetime .timedelta (days = 31 ):
79
+ continue
80
+
82
81
event_url = urljoin (PENN_TODAY_WEBSITE , article .find ("a" , class_ = "tease__link" )["href" ])
83
82
84
83
end_time = self .get_end_time (event_url )
@@ -95,47 +94,63 @@ def handle(self, *args, **kwargs):
95
94
end_of_day = datetime .time (23 , 59 , 59 )
96
95
if end_date_elem : # end date but no end time
97
96
end_date_str = end_date_elem .text .strip ().split (" " )[- 1 ]
98
- end_date = datetime .combine (
97
+ end_date = datetime .datetime . combine (
99
98
datetime .datetime .strptime (end_date_str , "%m/%d/%Y" ), end_of_day
100
99
)
100
+
101
101
else : # no end date or end time
102
- end_date = datetime .combine (start_date , end_of_day )
102
+ end_date = datetime .datetime . combine (start_date , end_of_day )
103
103
104
104
Event .objects .update_or_create (
105
105
name = name ,
106
106
defaults = {
107
- "event_type" : "" ,
107
+ "event_type" : "Penn Today " ,
108
108
"image_url" : "" ,
109
- "start" : start_date ,
110
- "end" : end_date ,
109
+ "start" : timezone . make_aware ( start_date ) ,
110
+ "end" : timezone . make_aware ( end_date ) ,
111
111
"location" : location ,
112
112
"website" : event_url ,
113
113
"description" : description ,
114
114
"email" : "" ,
115
115
},
116
116
)
117
117
118
- self .stdout .write ("Uploaded Events!" )
118
+ self .stdout .write ("Uploaded Penn Today Events!" )
119
+
120
def connect_and_parse_html(self, event_url, condition):
    """Load a page in headless Firefox and parse one element's inner HTML.

    Starts a headless Firefox driver (geckodriver is resolved through
    ``GeckoDriverManager``), opens ``event_url``, waits up to 10 seconds for
    ``condition`` to produce an element, and parses that element's
    ``innerHTML`` with BeautifulSoup's ``html.parser``.

    Args:
        event_url: URL of the page to open.
        condition: Callable handed to ``WebDriverWait.until``; callers in
            this file pass ``EC.presence_of_element_located(...)``.

    Returns:
        A ``BeautifulSoup`` object for the element's inner HTML, or ``None``
        if a ``ConnectionError`` was raised while driving the browser.

    Raises:
        Any other exception raised by Selenium (for example when the wait
        expires) propagates to the caller unchanged.
    """
    # Track the driver outside the try so the finally clause can release it
    # even when start-up succeeded but a later step failed.
    driver = None
    try:
        options = Options()
        options.add_argument("--headless")
        driver = webdriver.Firefox(
            service=FirefoxService(GeckoDriverManager().install()), options=options
        )

        driver.get(event_url)
        print("WAITING FOR ELEMENT")
        element = WebDriverWait(driver, 10).until(condition)
        print("ELEMENT FOUND")

        # innerHTML is copied out as a plain string, so it stays valid after
        # the browser has been shut down.
        html_content = element.get_attribute("innerHTML")
        return BeautifulSoup(html_content, "html.parser")
    except ConnectionError:
        print("Connection Error to webdriver")
        return None
    finally:
        # Always shut the browser down: previously a wait timeout or a
        # connection error skipped driver.quit() and leaked a Firefox process.
        if driver is not None:
            driver.quit()
119
139
120
def get_end_time(self, event_url):
    """Return the end-time text shown on an event's detail page, if any.

    Loads ``event_url`` through ``self.connect_and_parse_html`` (waiting for
    the element with class ``event__topper-content``) and reads the
    ``<p class="event__meta event__time">`` element. Periods are stripped
    from its text, which is then split on ``" - "``.

    Args:
        event_url: URL of the event's detail page.

    Returns:
        The second part of the ``"start - end"`` time range as a string, or
        ``None`` when the page could not be loaded, the time element is
        missing, its text is empty, the lowercased text contains ``ALL_DAY``,
        or the text is not a ``" - "`` separated range.
    """
    end_time_soup = self.connect_and_parse_html(
        event_url, EC.presence_of_element_located((By.CLASS_NAME, "event__topper-content"))
    )
    # connect_and_parse_html returns None on a connection error; treat that
    # as "no end time" instead of failing with AttributeError below.
    if end_time_soup is None:
        return None

    # find() returns None when the page has no time element.
    time_elem = end_time_soup.find("p", class_="event__meta event__time")
    if time_elem is None:
        return None

    end_time_range_str = time_elem.text.strip().replace(".", "")

    # No usable text, or the event is marked as all day: no end time.
    if not end_time_range_str or ALL_DAY in end_time_range_str.lower():
        return None

    # A single time with no " - " separator means only a start time is given.
    times = end_time_range_str.split(" - ")
    if len(times) <= 1:
        return None

    return times[1]
0 commit comments