online_scraper.py
#%%
import pandas as pd
import datetime
from time import gmtime, strftime
from selenium import webdriver
import logging
from linkedin_jobs_scraper import LinkedinScraper
from linkedin_jobs_scraper.events import Events, EventData
from linkedin_jobs_scraper.query import Query, QueryOptions, QueryFilters
from linkedin_jobs_scraper.filters import RelevanceFilters, TimeFilters, TypeFilters, ExperienceLevelFilters, OnSiteOrRemoteFilters
import json
import random
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
import chromedriver_autoinstaller
#%%
# Check if the current version of chromedriver exists; if it doesn't,
# download it automatically, then add chromedriver to PATH
chromedriver_autoinstaller.install()

chrome_options = webdriver.ChromeOptions()
options = [
    "--no-sandbox",
    # Define window size here
    "--window-size=1200,1200",
    "--ignore-certificate-errors",
    "--headless",
    #"--disable-gpu",
    #"--window-size=1920,1200",
    #"--ignore-certificate-errors",
    #"--disable-extensions",
    "--disable-dev-shm-usage",
    #'--remote-debugging-port=9222'
]
for option in options:
    chrome_options.add_argument(option)

# Note: this standalone driver is not referenced again below; LinkedinScraper
# spawns and manages its own Chrome instances (see max_workers in the scraper setup).
driver = webdriver.Chrome(options=chrome_options)
#%%
# Change root logger level (default is WARN)
logging.basicConfig(level=logging.INFO)
# cache = []
# def on_data(data: EventData):
#     scraped = {
#         "job_id": data.job_id,
#         "link": data.link,
#         "apply_link": data.apply_link,
#         "title": data.title,
#         "company": data.company,
#         "place": data.place,
#         "description": data.description,
#         "description_html": data.description_html,
#         "date": data.date
#         #"seniority_level": data.seniority_level,
#         #"job_function": data.job_function,
#         #"employment_type": data.employment_type,
#         #"industries": data.industries
#     }
#     cache.append(scraped)
job_postings = []

# Collect each posting as a flat row; the DataFrame columns below follow this order
def on_data(data: EventData):
    print(data)
    job_postings.append([
        data.job_id, data.link, data.title, data.company, data.place,
        data.description, data.description_html, data.date, data.skills
    ])

def on_error(error):
    print('[ON_ERROR]', error)

def on_end():
    print('[ON_END]')
scraper = LinkedinScraper(
    #chrome_executable_path=r'C:\Fonte\chromedriver.exe',  # Custom Chrome executable path (e.g. /foo/bar/bin/chromedriver)
    chrome_executable_path=None,
    chrome_options=None,  # Custom Chrome options here
    headless=True,  # Overrides headless mode only if chrome_options is None
    max_workers=1,  # How many threads will be spawned to run queries concurrently (one Chrome driver for each thread)
    slow_mo=1.2,  # Slow down the scraper to avoid 'Too many requests (429)' errors
)

# Add event listeners
scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)
#%%
# group_of_items = {'Seattle, Washington, United States',
# 'San Francisco, California, United States',
# 'United States Remote',
# 'European Union Remote',
# 'Boston, Massachusetts, United States',
# 'Berlin, Germany',
# 'London, England, United Kingdom',
# 'Canada Remote',
# 'Toronto, Ontario, Canada',
# 'Vancouver, British Columbia, Canada',
# 'Madrid, Community of Madrid, Spain',
# 'Barcelona, Catalonia, Spain',
# 'Spain',
# 'Lisbon, Portugal',
# 'Porto, Portugal',
# 'Milan, Lombardy, Italy',
# 'Brussels Region, Belgium',
# 'Switzerland',
# 'São Paulo, Brazil'
# }
#num_to_select = 6
#locationsToQuery = random.sample(sorted(group_of_items), num_to_select)
group_of_items = [
    'Seattle, Washington, United States',
    'San Francisco, California, United States',
    'United States Remote',
    'European Union Remote',
    'Boston, Massachusetts, United States',
    'Berlin, Germany',
    'London, England, United Kingdom',
    'Canada Remote',
    'Toronto, Ontario, Canada',
    'Vancouver, British Columbia, Canada',
    'Madrid, Community of Madrid, Spain',
    'Barcelona, Catalonia, Spain',
    'Spain',
    'Lisbon, Portugal',
    'Porto, Portugal',
    'Milan, Lombardy, Italy',
    'Brussels Region, Belgium',
    'Switzerland',
    'São Paulo, Brazil',
    'Rome, Latium, Italy',
    'Netherlands',
    'Stuttgart Region',
    'Stockholm, Stockholm County, Sweden',
    'Denmark',
    'United Kingdom'
]
#num_to_select = 1
#locationsToQuery = random.sample(sorted(group_of_items), num_to_select)
def select_items_by_day(item_list):
    # Get the current day of the week (0 = Monday, 6 = Sunday)
    today = datetime.datetime.today().weekday()
    # Each weekday gets its own slice of five locations
    ranges = {
        0: range(0, 5),    # Monday: items 1-5
        1: range(5, 10),   # Tuesday: items 6-10
        2: range(10, 15),  # Wednesday: items 11-15
        3: range(15, 20),  # Thursday: items 16-20
        4: range(20, 25),  # Friday: items 21-25
    }
    # Pick today's range; on weekends (5 = Saturday, 6 = Sunday) the modulo
    # wraps back to the Monday and Tuesday slices
    item_range = ranges[today % 5]
    # Select items from the list, skipping any index beyond its length
    selected_items = [item_list[i] for i in item_range if i < len(item_list)]
    return selected_items

# Select places based on the day of the week, five places for each day
locationsToQuery = select_items_by_day(group_of_items)
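#%%
# Optional sanity check (a sketch, not part of the original pipeline): log which
# weekday slice is being queried so the daily run is easy to audit.
print(f"Weekday {datetime.datetime.today().weekday()}: querying {locationsToQuery}")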
query_1 = [
    Query(
        query='Data Analyst',
        options=QueryOptions(
            locations=locationsToQuery,
            #optimize=True,  # Blocks requests for resources like images and stylesheets
            limit=60,  # Limit the number of jobs to scrape
            skip_promoted_jobs=True,
            filters=QueryFilters(
                relevance=RelevanceFilters.RECENT,
                time=TimeFilters.MONTH,
                on_site_or_remote=[OnSiteOrRemoteFilters.REMOTE],
                #type=[TypeFilters.FULL_TIME, TypeFilters.INTERNSHIP],
                experience=None,
            ),
        ),
    ),
]
scraper.run(query_1)
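#%%
# Hedged sketch (never passed to a query in this script): the filters above leave
# `type` commented out and `experience=None`. If narrower results were wanted they
# could be tightened like this, reusing only filter members already referenced in
# this file; the name `example_filters` is hypothetical.
example_filters = QueryFilters(
    relevance=RelevanceFilters.RECENT,
    time=TimeFilters.MONTH,
    type=[TypeFilters.FULL_TIME, TypeFilters.INTERNSHIP],
    on_site_or_remote=[OnSiteOrRemoteFilters.REMOTE],
    experience=None,
)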
# with open('data/jobs.json', 'w') as f:
#     json.dump(cache, f, indent=4)
# print(f"Operation completed. Scraped {len(cache)} jobs")
#%%
df = pd.DataFrame(job_postings, columns=['Job_ID', 'Link', 'Title', 'Company', 'Place', 'Description', 'HTML', 'Date', 'Skills'])
#%%
df.to_csv('./data/data_analyst_'+strftime("%Y%m%d", gmtime())+'.csv',index=False)
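#%%
# Optional sanity check (a sketch, not part of the original pipeline): the same
# posting can match several of the queried locations, so compare the row count
# with the number of distinct Job_IDs after saving.
print(f"Data Analyst rows: {len(df)}, unique Job_IDs: {df['Job_ID'].nunique()}")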
#%%
##### Data Scientist positions #####
job_postings = []  # reset the shared buffer before the second query
query_2 = [
    Query(
        query='Data Scientist',
        options=QueryOptions(
            locations=locationsToQuery,
            #optimize=True,  # Blocks requests for resources like images and stylesheets
            limit=60,  # Limit the number of jobs to scrape
            skip_promoted_jobs=True,
            filters=QueryFilters(
                relevance=RelevanceFilters.RECENT,
                time=TimeFilters.MONTH,
                on_site_or_remote=[OnSiteOrRemoteFilters.REMOTE],
                #type=[TypeFilters.FULL_TIME, TypeFilters.INTERNSHIP],
                experience=None,
            ),
        ),
    ),
]

scraper.run(query_2)
#%%
df2 = pd.DataFrame(job_postings, columns=['Job_ID', 'Link', 'Title', 'Company', 'Place', 'Description', 'HTML', 'Date', 'Skills'])
#%%
df2.to_csv('./data/data_scientist_'+strftime("%Y%m%d", gmtime())+'.csv',index=False)
#%%
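# A minimal refactoring sketch (not part of the original script): the repeated
# "define query -> run -> DataFrame -> CSV" steps above could be folded into a
# single helper. It reuses the scraper, the on_data buffer and locationsToQuery
# defined earlier; the helper name `scrape_and_save` and COLUMNS are hypothetical.
COLUMNS = ['Job_ID', 'Link', 'Title', 'Company', 'Place', 'Description', 'HTML', 'Date', 'Skills']

def scrape_and_save(job_title, out_prefix):
    """Run one query through the existing scraper and persist the rows to ./data/."""
    job_postings.clear()  # on_data appends to this module-level list
    scraper.run([
        Query(
            query=job_title,
            options=QueryOptions(
                locations=locationsToQuery,
                limit=60,
                skip_promoted_jobs=True,
                filters=QueryFilters(
                    relevance=RelevanceFilters.RECENT,
                    time=TimeFilters.MONTH,
                    on_site_or_remote=[OnSiteOrRemoteFilters.REMOTE],
                    experience=None,
                ),
            ),
        )
    ])
    frame = pd.DataFrame(job_postings, columns=COLUMNS)
    frame.to_csv('./data/' + out_prefix + '_' + strftime("%Y%m%d", gmtime()) + '.csv', index=False)
    return frame

# Example usage, equivalent to the two blocks above:
# df = scrape_and_save('Data Analyst', 'data_analyst')
# df2 = scrape_and_save('Data Scientist', 'data_scientist')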