You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
226 lines
7.4 KiB
226 lines
7.4 KiB
""" |
|
A class to define the methods to scrape LinkedIn user-profile webpages |
|
|
|
""" |
|
from time import sleep |
|
|
|
from bs4 import BeautifulSoup as bs |
|
from selenium.common.exceptions import TimeoutException |
|
from selenium.webdriver.common.keys import Keys |
|
|
|
from utils import ( |
|
filter_non_printable, |
|
is_button_found, |
|
scroll_profile_page, |
|
validate_field, |
|
validate_user_data, |
|
) |
|
|
|
|
|
class UserScraper(object):
    """Scrape the data fields (name, job title, location, degree, skills,
    languages) from a LinkedIn user-profile page using a Selenium driver."""

    def __init__(self, driver):
        """
        Initialize the class.

        :param driver: selenium chrome driver object
        """
        self.driver = driver

    @staticmethod
    def get_name(soup):
        """
        Get the name of the user whose profile page is being scraped.

        :param soup: BeautifulSoup object
        :return: name: str name of the user ("" if the tag is absent)
        """
        try:
            name_tag = soup.find_all(class_="pv-top-card-section__name")[0]
            return name_tag.get_text(strip=True)
        except IndexError:
            # Tag not present on this profile: return an empty field.
            return ""

    @staticmethod
    def get_job_title(soup):
        """
        Get the job title of the user whose profile
        page is being scraped.

        :param soup: BeautifulSoup object
        :return: job_title: str ("" if the tag is absent)
        """
        try:
            job_title_tag = soup.find_all(class_="pv-top-card-section__headline")[0]
            job_title = job_title_tag.get_text(strip=True)
            # Strip non-printable characters sometimes embedded in headlines.
            return filter_non_printable(job_title)
        except IndexError:
            return ""

    @staticmethod
    def get_location(soup):
        """
        Get the location of the user whose profile
        page is being scraped.

        :param soup: BeautifulSoup object
        :return: location: str ("" if the tag is absent)
        """
        try:
            location_tag = soup.find_all(class_="pv-top-card-section__location")[0]
            return location_tag.get_text(strip=True)
        except IndexError:
            return ""

    @staticmethod
    def get_degree(soup):
        """
        Get the last degree of the user whose profile page
        is being scraped.

        :param soup: BeautifulSoup object
        :return: degree: str ("" if the tag is absent or malformed)
        """
        degree_tags = soup.find_all(class_="pv-entity__degree-name")
        if not degree_tags:
            return ""
        # The degree name sits on the third line of the tag text; guard
        # against layout changes that yield fewer lines (previously an
        # uncaught IndexError).
        lines = degree_tags[0].get_text().split("\n")
        if len(lines) < 3:
            return ""
        return validate_field(lines[2])

    def get_skills(self):
        """
        Get the skills of the user whose profile page is being scraped.
        Scroll down the page by sending the PAGE_DOWN button
        until either the "show more" button in the skills section
        has been found, or the end of the page has been reached
        (page height unchanged for max_attempts consecutive scrolls).
        Return a list of skills.

        :return: list: skills ([] if the button was never found)
        """
        skills = []
        endofpage_reached = False
        attempt = 0
        max_attempts = 3
        delay = 3  # seconds is_button_found waits for the button
        body = self.driver.find_element_by_tag_name("body")
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        while True:
            body.send_keys(Keys.PAGE_DOWN)
            sleep(2)
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            button_found, showmore_button = is_button_found(self.driver, delay)
            if button_found:
                # Expand the skills section via JS click (avoids
                # element-not-interactable issues), then parse the tags.
                self.driver.execute_script("arguments[0].click();", showmore_button)
                sleep(2)
                soup = bs(self.driver.page_source, "html.parser")
                skills_tags = soup.find_all(
                    class_="pv-skill-category-entity__name-text"
                )
                skills = [
                    validate_field(tag.get_text(strip=True)) for tag in skills_tags
                ]
            if new_height == last_height:
                # Height unchanged: count towards end-of-page detection.
                attempt += 1
                if attempt == max_attempts:
                    endofpage_reached = True
            else:
                last_height = new_height
            if button_found or endofpage_reached:
                break
        return skills

    @staticmethod
    def get_languages(soup):
        """
        Get the languages in the "Accomplishments" section
        of the user whose profile page is being scraped.
        Look for the accomplishment tags first, and get all the language
        elements from them.
        Return a list of languages.

        :param soup: BeautifulSoup object
        :return: list: languages list ([] if the section is absent)
        """
        languages = []
        accomplishment_tags = soup.find_all(
            class_="pv-accomplishments-block__list-container"
        )
        for tag in accomplishment_tags:
            # Only some accomplishment blocks carry an "id" attribute;
            # .get() skips the rest without the try/KeyError dance.
            if tag.get("id") == "languages-expandable-content":
                languages = [item.get_text() for item in tag.find_all("li")]
        return languages

    def scrape_user(self, query, url):
        """
        Get the user data for a given query and LinkedIn URL.
        Load the URL, zoom out so lazily-loaded sections render, then
        extract skills (get_skills), scroll to the page end, and parse
        name, job title, location, degree and languages from the final
        page source. Retry up to max_attempts times on TimeoutException.
        Finally, return the validated user-data dictionary.

        :param query: str search query that produced this profile URL
        :param url: str URL to scrape
        :return: dict validated user data (empty fields on failure)
        """
        attempt = 0
        max_attempts = 3
        user_data = {}
        while True:
            try:
                attempt += 1
                self.driver.get(url)
                sleep(2)
                # Zoom out so more of the profile fits in the viewport,
                # triggering LinkedIn's lazy loading.
                self.driver.execute_script("document.body.style.zoom='50%'")
                sleep(3)
                skills = self.get_skills()
                scroll_profile_page(self.driver)
                soup = bs(self.driver.page_source, "html.parser")
                user_data = {
                    "URL": url,
                    "name": self.get_name(soup),
                    "query": query,
                    "job_title": self.get_job_title(soup),
                    "degree": self.get_degree(soup),
                    "location": self.get_location(soup),
                    "languages": self.get_languages(soup),
                    "skills": skills,
                }
                break
            except TimeoutException:
                print(
                    "\nINFO :: TimeoutException raised while " + "getting URL\n" + url
                )
                # BUG FIX: the original loop never exited after the last
                # failed attempt (the max-attempts message printed once,
                # then it retried forever). Stop here and return empty data.
                if attempt == max_attempts:
                    print(
                        "INFO :: Max number of attempts reached. "
                        + "Skipping URL"
                        + "\nUser data will be empty."
                    )
                    break
                print(
                    "INFO :: Attempt n."
                    + str(attempt)
                    + " of "
                    + str(max_attempts)
                    + "\nNext attempt in 60 seconds"
                )
                sleep(60)
        return validate_user_data(user_data)
|
|
|