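"""Scrape the Open University degree page and build a list of Course objects.

The result is cached on disk as a pickle file (PICKLE_FILENAME); delete the
file to force a fresh scrape.
"""
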
import itertools
import os
import pickle
import re
from dataclasses import dataclass
from typing import Iterable, List

import requests

PICKLE_FILENAME = "courses.pickle"
DEGREE_PAGE_URL = "https://academic.openu.ac.il/CS/computer/program/AF.aspx?version=108"

# This splitting string only applies to the B.Sc. in Computer Science;
# change it to suit your degree page.
DEGREE_PAGE_CHOICE_SPLITTER = "בחירה - לפחות 27-31"


@dataclass
class Course:
    id: int = -1
    name: str = ""
    credits: int = -1
    advanced: bool = False
    domain: str = ""
    required: bool = False
    must_courses: Iterable[int] = ()
    recommend_courses: Iterable[int] = ()
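
# must_courses / recommend_courses hold the ids of prerequisite courses:
# "must" prerequisites are mandatory, while "recommend" prerequisites are
# only suggested (they appear after the word "מומלץ" on the course page).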


def get_course_by_id(courses: Iterable[Course], course_id: int) -> Course:
    '''Return the course with the given id, or an empty Course if not found.'''
    return next((course for course in courses if course.id == course_id),
                Course())


def cleanup_hebrew(string: str) -> str:
    '''Remove the invisible RTL embedding (U+202B) and pop (U+202C) marks.'''
    return string.replace("\u202c", "").replace("\u202b", "")


def manual_filter(courses: List[Course]) -> None:
    '''Make manual changes to the Course list.'''
    # these courses have very long names; shorten them
    get_course_by_id(courses, 20476).name = "מתמטיקה בדידה"
    get_course_by_id(courses, 20425).name = "הסתברות ומבוא לסטטיסטיקה למדמ\"ח"
    # course 20604 is worth 3 regular credits + 2 advanced credits
    get_course_by_id(courses, 20604).credits = 5


def scrape_data() -> List[Course]:
    '''Scrape the degree page and parse every course linked from it.'''
    # Download the degree page
    response = requests.get(DEGREE_PAGE_URL)
    content = response.content.decode('utf-8')
    # split it into the required ("must") part and the elective ("choice") part
    before, after = content.split(DEGREE_PAGE_CHOICE_SPLITTER, 1)
    must_url_matches = re.findall(
        r'https?://www\.openu\.ac\.il/courses/\d+\.htm', before)
    choice_url_matches = re.findall(
        r'https?://www\.openu\.ac\.il/courses/\d+\.htm', after)
    # tag each URL with the part it came from, so we still have that
    # information when we create the Course instance
    urls = itertools.chain([(url, True) for url in must_url_matches],
                           [(url, False) for url in choice_url_matches])
    # parse the urls
    courses = []
    for url_match, required in urls:
        response = requests.get(url_match)
        # no idea why the course pages use a different encoding
        page_content = response.content.decode('windows-1255')
        # get the title; it is in the format "{RTL}id name{LTR}"
        title_match = re.search(r'<title>(.*?)</title>', page_content)
        course_id, name = cleanup_hebrew(title_match.group(1)).split(" ", 1)
        # extract credits and course level
        credits_match = re.search(
            r'<strong>(\d+).*? נקודות זכות(?: ברמ(?:ת|ה) (רגילה|מתקדמת|פתיחה))*',
            page_content)
        if credits_match:
            credits = credits_match.group(1)
            level = credits_match.group(2)
        else:
            credits = 0
            level = ""
        advanced = level == "מתקדמת"  # "advanced" level
        # domain is "science / mathematics" or "science / computer science"
        domain = re.search(r'<strong>\s*שיוך:\s*</strong>(.*?)</p>',
                           page_content, re.DOTALL).group(1).split("/")[1].strip()
        # parse prerequisite courses
        requirements_match = re.search(
            r'<p>\s*<img src="gifs/triangle.jpg" \b.*?>(.*?)</p>',
            page_content, re.DOTALL)
        requirements = requirements_match.group(1) if requirements_match else ""
        try:
            # assume all courses mentioned before the word "מומלץ"
            # ("recommended") are required and all the ones after it are not
            before, after = requirements.split("מומלץ")
        except ValueError:
            before, after = requirements, ""
        # parse the prerequisite course ids out of the linked URLs
        must = [int(x) for x in re.findall(
            r'https?://www\.openu\.ac\.il/courses/(\d+)\.htm', before)]
        recommend = [int(x) for x in re.findall(
            r'https?://www\.openu\.ac\.il/courses/(\d+)\.htm', after)]
        courses.append(Course(id=int(course_id),
                              name=name,
                              credits=int(credits),
                              advanced=advanced,
                              domain=domain,
                              required=required,
                              must_courses=must,
                              recommend_courses=recommend))
    # filter out references to unknown courses
    all_ids = {course.id for course in courses}  # ids of all known courses
    for course in courses:
        # recreate the lists using only the known ids
        course.must_courses = [
            cid for cid in course.must_courses if cid in all_ids]
        course.recommend_courses = [
            cid for cid in course.recommend_courses if cid in all_ids]
    # hand-made modifications to the output
    manual_filter(courses)
    return courses


def load_courses() -> List[Course]:
    """
    Load the courses list from disk, or scrape it if the cache doesn't exist.

    Caution: there is no security built into this; verify the pickle file
    yourself.
    """
    try:
        with open(PICKLE_FILENAME, 'rb') as f:
            courses = pickle.load(f)
    except (FileNotFoundError, pickle.UnpicklingError):
        courses = scrape_data()
        with open(PICKLE_FILENAME, 'wb') as f:
            pickle.dump(courses, f)
    return courses
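
# Note: pickle.load can run arbitrary code from a maliciously crafted file,
# hence the caution above. If that matters for your setup, a JSON cache
# (e.g. json.dump([dataclasses.asdict(c) for c in courses], f)) could be
# used instead, at the cost of rebuilding the Course objects on load.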


if __name__ == "__main__":
    # delete the cache to force a fresh scrape
    if os.path.isfile(PICKLE_FILENAME):
        os.remove(PICKLE_FILENAME)
    print(load_courses())