Notebooks >> Scripts
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

102 lines
2.8 KiB

"""
A class to define the methods to scrape LinkedIn job web pages
"""
class JobScraper(object):
    """
    Extract job data from an already-parsed LinkedIn job page.

    The scraper performs no network access itself: it only queries the
    parsed document it is given by HTML class name.
    """

    def __init__(self, soup, url, query):
        """
        Initialize the class
        :param soup: BeautifulSoup instance
        :param url: str job URL to scrape
        :param query: str query to perform
        """
        self.soup = soup
        self.url = url
        self.query = query

    def _first_text(self, css_class):
        """
        Get the text of the first element carrying the given HTML class.
        :param css_class: str HTML class name to look up
        :return: str element text (not stripped), "" when nothing matches
        """
        # find() stops at the first match, so there is no list to build
        # and no IndexError to catch when the class is absent.
        node = self.soup.find(class_=css_class)
        if node is None:
            return ""
        return node.get_text()

    def get_job_skills(self):
        """
        Get the skills required by the job offer being scraped.
        :return: list of skills (text of every
                 "jobs-ppc-criteria__value" element, in document order)
        """
        return [
            node.get_text()
            for node in self.soup.find_all(class_="jobs-ppc-criteria__value")
        ]

    def get_job_title(self):
        """
        Get the job title of the job page being scraped.
        :return: str job title, "" when the page has no
                 "jobs-top-card__job-title" element
        """
        return self._first_text("jobs-top-card__job-title")

    def get_job_location(self):
        """
        Get the location of the job offer being scraped.

        The preferred "jobs-top-card__exact-location" HTML class is tried
        first; when it is missing or its text is blank, the location is
        taken from the "jobs-top-card__bullet" HTML class instead.
        :return: str location stripped of surrounding whitespace,
                 "" when neither class yields any text
        """
        exact = self._first_text("jobs-top-card__exact-location").strip()
        # `or` falls through on "" so a blank exact location triggers the
        # fallback exactly like a missing one.
        return exact or self._first_text("jobs-top-card__bullet").strip()

    def get_job_data(self):
        """
        Get the job data by using the get* methods of the class.
        :return: dict job data with the keys "URL", "query", "job_title",
                 "location" and "skills"; an empty dict when the page
                 lists no skills
        """
        skills = self.get_job_skills()
        if not skills:
            # A page without skills is treated as carrying no usable data.
            return {}
        return {
            "URL": self.url,
            "query": self.query,
            "job_title": self.get_job_title(),
            "location": self.get_job_location(),
            "skills": skills,
        }