Notebooks >> Scripts
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

137 lines
5.4 KiB

import logging
import time
from abc import abstractmethod
from os import environ
import selenium.webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
logger = logging.getLogger(__name__)


class Scraper(object):
    """
    Abstract wrapper around a selenium webdriver with helpers to authenticate
    a LinkedIn session, scroll through a page, expand collapsed sections and
    wait for elements. Subclasses must override `scrape`.

    Params:
        - cookie {str}: li_at session cookie required to scrape linkedin
          profiles; when omitted, the LI_AT environment variable is used
        - scraperInstance {Scraper}: existing scraper whose driver, timeout
          and scroll settings are reused; no new driver is created and the
          shared driver is not closed by this instance's quit()
        - driver {callable}: factory called as driver(**driver_options) to
          build the webdriver; None (default) means selenium.webdriver.Chrome
        - driver_options {dict}: keyword arguments for the driver factory
        - scroll_pause {float}: amount of time to pause (s) while
          incrementally scrolling through the page
        - scroll_increment {int}: pixel increment for scrolling
        - timeout {float}: maximum time (s) that wait() polls for a condition

    If both LI_EMAIL and LI_PASS are set in the environment, the session is
    authenticated through the login form instead of the li_at cookie.

    Raises:
        - Exception: if Scraper itself (not a subclass) is instantiated
        - ValueError: if no credentials are available (no LI_EMAIL/LI_PASS,
          no cookie argument and no LI_AT environment variable)
    """

    def __init__(self, cookie=None, scraperInstance=None, driver=None,
                 driver_options=None, scroll_pause=0.1, scroll_increment=300,
                 timeout=10):
        if type(self) is Scraper:
            raise Exception(
                'Scraper is an abstract class and cannot be instantiated directly')

        if scraperInstance:
            # Share the browser session and settings of the given scraper.
            self.was_passed_instance = True
            self.driver = scraperInstance.driver
            self.scroll_increment = scraperInstance.scroll_increment
            self.timeout = scraperInstance.timeout
            self.scroll_pause = scraperInstance.scroll_pause
            return

        # Resolve credentials BEFORE launching a browser, so that a missing
        # cookie does not leave an orphaned driver process behind.
        use_login = 'LI_EMAIL' in environ and 'LI_PASS' in environ
        if not use_login and not cookie:
            if 'LI_AT' not in environ:
                raise ValueError(
                    'Must either define LI_AT environment variable, or pass a cookie string to the Scraper')
            cookie = environ['LI_AT']

        # Defaults are resolved here rather than in the signature: avoids a
        # shared mutable dict and defers the Chrome lookup to call time.
        if driver is None:
            driver = selenium.webdriver.Chrome
        if driver_options is None:
            driver_options = {}

        self.was_passed_instance = False
        self.scroll_pause = scroll_pause
        self.scroll_increment = scroll_increment
        self.timeout = timeout
        self.driver = driver(**driver_options)
        try:
            # The domain must be loaded before a cookie can be set for it.
            self.driver.get('https://www.linkedin.com')
            self.driver.set_window_size(1920, 1080)
            if use_login:
                self.login(environ['LI_EMAIL'], environ['LI_PASS'])
            else:
                self.driver.add_cookie({
                    'name': 'li_at',
                    'value': cookie,
                    'domain': '.linkedin.com'
                })
        except BaseException:
            # The caller never receives this object, so nobody else could
            # close the browser; release it here and propagate the error.
            self.driver.quit()
            raise

    # NOTE: the class does not use ABCMeta, so this decorator alone does not
    # block instantiation; the explicit checks in __init__ and below do.
    @abstractmethod
    def scrape(self):
        """Scrape the target page; must be overridden by subclasses."""
        raise NotImplementedError('Must override abstract method scrape')

    def login(self, email, password):
        """Fill in the login form on the current page and submit it.

        Params:
            - email {str}: account email typed into 'input.login-email'
            - password {str}: password typed into 'input.login-password'
        """
        # find_element(By..., ...) works on Selenium 3 and 4; the
        # find_element_by_* helpers were removed in Selenium 4.3.
        email_input = self.driver.find_element(
            By.CSS_SELECTOR, 'input.login-email')
        password_input = self.driver.find_element(
            By.CSS_SELECTOR, 'input.login-password')
        email_input.send_keys(email)
        password_input.send_keys(password)
        password_input.send_keys(Keys.ENTER)

    def get_html(self, url):
        """Load `url` and return the driver's current page source.

        Relies on `load_profile_page`, which is not defined in this class and
        is presumably supplied by the concrete subclass.
        """
        self.load_profile_page(url)
        return self.driver.page_source

    def scroll_to_bottom(self):
        """Scroll to the bottom of the page in steps of self.scroll_increment
        pixels, pausing self.scroll_pause seconds after each step and
        clicking expandable buttons before each step.
        """
        # NOTE: this starts scrolling from the current scroll position, not
        # the top of the page.
        current_height = self.driver.execute_script(
            "return document.documentElement.scrollTop")
        while True:
            self.click_expandable_buttons()
            # Next target position, capped at the current page height. The
            # height is re-read every step because it can grow as content
            # loads.
            new_height = self.driver.execute_script(
                "return Math.min({}, document.body.scrollHeight)".format(
                    current_height + self.scroll_increment))
            if new_height == current_height:
                # No further progress possible: the bottom was reached.
                break
            self.driver.execute_script(
                "window.scrollTo(0, {});".format(new_height))
            current_height = new_height
            # Wait to load page
            time.sleep(self.scroll_pause)

    def click_expandable_buttons(self):
        """Best-effort click on every known 'show more' style control."""
        expandable_button_selectors = [
            'button[aria-expanded="false"].pv-skills-section__additional-skills',
            'button[aria-expanded="false"].pv-profile-section__see-more-inline',
            'button[aria-expanded="false"].pv-top-card-section__summary-toggle-button',
            'button[aria-expanded="false"].inline-show-more-text__button',
            'button[data-control-name="contact_see_more"]'
        ]
        for name in expandable_button_selectors:
            try:
                self.driver.find_element(By.CSS_SELECTOR, name).click()
            except Exception:
                # A missing or unclickable button is expected on most pages.
                # Unlike a bare except, this lets KeyboardInterrupt and
                # SystemExit propagate.
                logger.debug('Could not click %r', name, exc_info=True)

        # Click the hidden 'see more...' elements directly through the DOM
        # (plain querySelectorAll, no jQuery required).
        self.driver.execute_script(
            'document.querySelectorAll(".lt-line-clamp__ellipsis:not(.lt-line-clamp__ellipsis--dummy) .lt-line-clamp__more").forEach(el => el.click())')

    def wait(self, condition):
        """Block until `condition` holds, for at most self.timeout seconds,
        and return whatever WebDriverWait.until returns.
        """
        return WebDriverWait(self.driver, self.timeout).until(condition)

    def wait_for_el(self, selector):
        """Wait until an element matching the CSS `selector` is present."""
        return self.wait(EC.presence_of_element_located((
            By.CSS_SELECTOR, selector
        )))

    def __enter__(self):
        return self

    def __exit__(self, *args, **kwargs):
        # Returns None, so exceptions raised inside the with-block propagate.
        self.quit()

    def quit(self):
        """Close the driver, unless it is borrowed from another scraper."""
        if self.driver and not self.was_passed_instance:
            self.driver.quit()