Notebooks >> Scripts
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

226 lines
7.4 KiB

"""
A class to define the methods to scrape LinkedIn user-profile webpages
"""
from time import sleep
from bs4 import BeautifulSoup as bs
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from utils import (
filter_non_printable,
is_button_found,
scroll_profile_page,
validate_field,
validate_user_data,
)
class UserScraper(object):
def __init__(self, driver):
"""
Initialize the class
:param driver: selenium chrome driver object
"""
self.driver = driver
@staticmethod
def get_name(soup):
"""
Get the name of the user whose profile page is being scraped.
:param soup: BeautifulSoup object
:return: name: str name of the user
"""
try:
name_tag = soup.find_all(class_="pv-top-card-section__name")[0]
name = name_tag.get_text(strip=True)
return name
except IndexError:
return ""
@staticmethod
def get_job_title(soup):
"""
Get the job title of the user whose profile
page is being scraped
:param soup: BeautifulSoup object
:return: job_title: str
"""
try:
job_title_tag = soup.find_all(class_="pv-top-card-section__headline")[0]
job_title = job_title_tag.get_text(strip=True)
job_title = filter_non_printable(job_title)
return job_title
except IndexError:
return ""
@staticmethod
def get_location(soup):
"""
Get the location of the user whose profile
page is being scraped.
:param soup: BeautifulSoup object
:return: location: str
"""
try:
location_tag = soup.find_all(class_="pv-top-card-section__location")[0]
location = location_tag.get_text(strip=True)
return location
except IndexError:
return ""
@staticmethod
def get_degree(soup):
"""
Get the last degree of the user whose profile page
is being scraped.
:param soup: BeautifulSoup object
:return: degree: str
"""
degree_tags = soup.find_all(class_="pv-entity__degree-name")
if len(degree_tags) != 0:
degree = degree_tags[0].get_text().split("\n")[2]
degree = validate_field(degree)
else:
degree = ""
return degree
def get_skills(self):
"""
Get the skills of the user whose profile page is being scraped.
Scroll down the page by sending the PAGE_DOWN button
until either the "show more" button in the skills section
has been found, or the end of the page has been reached
Return a list of skills.
:return: list: skills
"""
skills = []
button_found = False
endofpage_reached = False
attempt = 0
max_attempts = 3
delay = 3 # seconds
body = self.driver.find_element_by_tag_name("body")
last_height = self.driver.execute_script("return document.body.scrollHeight")
while not button_found:
body.send_keys(Keys.PAGE_DOWN)
sleep(2)
new_height = self.driver.execute_script("return document.body.scrollHeight")
button_found, showmore_button = is_button_found(self.driver, delay)
if button_found:
self.driver.execute_script("arguments[0].click();", showmore_button)
sleep(2)
soup = bs(self.driver.page_source, "html.parser")
skills_tags = soup.find_all(
class_="pv-skill-category-entity__name-text"
)
skills = [item.get_text(strip=True) for item in skills_tags]
skills = [validate_field(skill) for skill in skills]
if new_height == last_height:
attempt += 1
if attempt == max_attempts:
endofpage_reached = True
else:
last_height = new_height
if button_found or endofpage_reached:
break
return skills
@staticmethod
def get_languages(soup):
"""
Get the languages in the "Accomplishments" section
of the user whose profile page is being scraped.
Look for the accomplishment tags first, and get all the language
elements from them.
Return a list of languages.
:param soup: BeautifulSoup object
:return: list: languages list
"""
languages = []
accomplishment_tags = soup.find_all(
class_="pv-accomplishments-block__list-container"
)
for a in accomplishment_tags:
try:
if a["id"] == "languages-expandable-content":
languages = [l.get_text() for l in a.find_all("li")]
except KeyError:
pass
return languages
def scrape_user(self, query, url):
"""
Get the user data for a given query and linkedin URL.
Call get_name() and get_job_title() to get name and
job title, respectively. Scroll down the given URL
to make the skill-section HTML code appear;
call get_skills() and get_degree() to extract the user skills
and their degree, respectively. Scroll down the page until its
end to extract the user languages by calling
get_languages().
Finally, return a dictionary with the extracted data.
:param query: str
:param url: str URL to scrape
:return:
"""
attempt = 0
max_attempts = 3
success = False
user_data = {}
while not success:
try:
attempt += 1
self.driver.get(url)
sleep(2)
self.driver.execute_script("document.body.style.zoom='50%'")
sleep(3)
skills = self.get_skills()
scroll_profile_page(self.driver)
soup = bs(self.driver.page_source, "html.parser")
name = self.get_name(soup)
job_title = self.get_job_title(soup)
location = self.get_location(soup)
degree = self.get_degree(soup)
languages = self.get_languages(soup)
user_data = {
"URL": url,
"name": name,
"query": query,
"job_title": job_title,
"degree": degree,
"location": location,
"languages": languages,
"skills": skills,
}
success = True
except TimeoutException:
print(
"\nINFO :: TimeoutException raised while " + "getting URL\n" + url
)
print(
"INFO :: Attempt n."
+ str(attempt)
+ " of "
+ str(max_attempts)
+ "\nNext attempt in 60 seconds"
)
sleep(60)
if success:
break
if attempt == max_attempts and not user_data:
print(
"INFO :: Max number of attempts reached. "
+ "Skipping URL"
+ "\nUser data will be empty."
)
return validate_user_data(user_data)