You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
206 lines
5.5 KiB
206 lines
5.5 KiB
import os
import time
from time import sleep

import pandas as pd
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from linkedin_api import Linkedin
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

from utils import api, password, user_name
|
|
|
# Credentials are read from a .env file so they never live in source control.
# NOTE(review): these assignments deliberately shadow the names imported from
# `utils` above — confirm which source of `api`/`user_name`/`password` is intended.
load_dotenv()

user_name = os.getenv("user_name")
password = os.getenv("password")

# Authenticated client for the unofficial LinkedIn API (used by get_employee).
api = Linkedin(user_name, password)
|
|
|
|
|
# Chrome options that keep the window usable and make the automated browser
# harder to detect (no infobars, no "controlled by automation" blink flag,
# and a fixed real-browser user agent).
options = Options()
options.add_argument("--disable-infobars")
options.add_argument("--start-maximized")
options.add_argument("--disable-extensions")
options.add_argument("disable-blink-features=AutomationControlled")
options.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
)

# NOTE(review): `executable_path` was removed in Selenium 4. If the selenium
# dependency is upgraded, switch to:
#   webdriver.Chrome(service=Service("drivers/chromedriver"), options=options)
d = webdriver.Chrome(options=options, executable_path="drivers/chromedriver")
|
|
|
|
|
def login(d):
    """Log into LinkedIn through the interactive login form.

    Types the module-level ``user_name``/``password`` into the form via
    keyboard navigation, submits, and — if LinkedIn answers with an e-mail
    verification challenge — prompts the operator on stdin for the PIN.

    Parameters
    ----------
    d : selenium WebDriver
        An already-constructed browser instance.

    Returns
    -------
    str
        A fixed success message; the login outcome itself is not verified.
    """
    d.get("https://www.linkedin.com/login")
    sleep(3)  # crude fixed wait for the form to render

    # Focus starts in the username field; TAB moves to the password field,
    # ENTER submits the form.
    actions = ActionChains(d)
    actions.send_keys(user_name)
    actions.send_keys(Keys.TAB)
    actions.send_keys(password)
    actions.send_keys(Keys.ENTER)
    actions.perform()

    # page_source is already a str — the old "".join(...) was a no-op.
    soup = BeautifulSoup(d.page_source, "html.parser")

    # Handle LinkedIn's e-mail PIN challenge when it appears.
    if "input__email_verification_pin" in soup.text:
        # find_element_by_id was removed in Selenium 4; find_element(By.ID, ...)
        # works on both Selenium 3 and 4.
        code = d.find_element(By.ID, "input__email_verification_pin")
        key = input("Enter key:")
        code.send_keys(key)
        code.send_keys(Keys.ENTER)

    return "Successfully logged in! "
|
|
|
|
|
def scroll_down(driver, url, keyword):
    """Open a company's people search and scroll until no new results load.

    Navigates to ``<url>/people/?keywords=<keyword>`` and repeatedly scrolls
    to the bottom, waiting for lazily-loaded results, until the page height
    stops growing.

    Parameters
    ----------
    driver : selenium WebDriver
        The browser to drive.
    url : str
        Company page URL, e.g. "https://www.linkedin.com/company/<name>".
    keyword : str
        Search keyword appended to the people filter.
    """
    print("Page loading.......")
    # Bug fix: the original navigated with the module-level global `d`
    # instead of the `driver` parameter it was given.
    driver.get(url + "/people/?keywords={}".format(keyword))

    # Get the initial scroll height.
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        # Scroll to the bottom and give lazy-loaded content time to arrive.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1.5)

        # Stop once scrolling no longer grows the document.
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    print("Page loaded!")
|
|
|
|
|
def get_links(d):
    """Collect profile link hrefs from the fully-scrolled people list.

    Parameters
    ----------
    d : selenium WebDriver
        Browser currently showing the people-search results
        (see ``scroll_down``).

    Returns
    -------
    list[str]
        The ``href`` of the first anchor in each result card; cards with
        no anchor or no ``href`` attribute are skipped.
    """
    soup = BeautifulSoup(d.page_source, "html.parser")
    ul = soup.find_all(
        "ul", attrs={"class": "org-people-profiles-module__profile-list"}
    )

    links = []
    for item in ul[0].find_all("li"):
        # The original bare `except: pass` hid every failure mode; the only
        # expected ones are a missing anchor or a missing href — test for
        # those explicitly and skip the card.
        anchor = item.find("a")
        if anchor is not None and anchor.has_attr("href"):
            links.append(anchor["href"])
    return links
|
|
|
|
|
def get_employee(id):
    """Fetch one employee's profile and flatten it into a row of strings.

    Parameters
    ----------
    id : str
        Relative profile path as scraped from the people list, e.g.
        "/in/<public-id>/" — the public id is its third path segment.
        (Name shadows the ``id`` builtin; kept for caller compatibility.)

    Returns
    -------
    list
        [full_name, company, current_position, current_position_start,
         header_location, city, country, education_1, education_1_year,
         education_2, education_2_year, link] — missing fields become "".

    Raises
    ------
    KeyError
        If the profile lacks ``firstName``/``lastName`` (propagated to the
        caller, which skips the employee).
    """

    def _get(getter, default=""):
        # Evaluate `getter`; map a missing key/index (or a None along the
        # path) to `default`. Replaces the original's six bare `except:`s.
        try:
            return getter()
        except (KeyError, IndexError, TypeError):
            return default

    temp = api.get_profile(id.split("/")[2])
    link = "https://www.linkedin.com/" + id

    full_name = "{} {}".format(temp["firstName"], temp["lastName"])

    # Bug fix: companyName was the only unguarded experience lookup — a
    # profile with no experience crashed here while every sibling field
    # degraded to "".
    company = _get(lambda: temp["experience"][0]["companyName"])
    current_position = _get(lambda: temp["experience"][0]["title"])
    current_position_start = _get(
        lambda: "{}/{}".format(
            temp["experience"][0]["timePeriod"]["startDate"]["month"],
            temp["experience"][0]["timePeriod"]["startDate"]["year"],
        )
    )

    header_location = _get(lambda: temp["locationName"])

    geo = _get(lambda: temp["experience"][0]["geoLocationName"], default=None)
    if geo:
        city = geo.split(", ")[0]
        country = geo.split(", ")[-1]
    else:
        city = ""
        country = ""

    # Bug fix: the original did len(temp["education"]) unguarded, raising
    # KeyError for profiles with no education section at all.
    education_1 = _get(lambda: temp["education"][0]["schoolName"])
    education_1_year = _get(
        lambda: "{}-{}".format(
            temp["education"][0]["timePeriod"]["startDate"]["year"],
            temp["education"][0]["timePeriod"]["endDate"]["year"],
        )
    )
    education_2 = _get(lambda: temp["education"][1]["schoolName"])
    education_2_year = _get(
        lambda: "{}-{}".format(
            temp["education"][1]["timePeriod"]["startDate"]["year"],
            temp["education"][1]["timePeriod"]["endDate"]["year"],
        )
    )

    return [
        full_name,
        company,
        current_position,
        current_position_start,
        header_location,
        city,
        country,
        education_1,
        education_1_year,
        education_2,
        education_2_year,
        link,
    ]
|
|
|
|
|
login(d)

# Interactive scrape loop: for each company URL/keyword pair, scroll the
# people list, extract every profile, and save the rows to output/<file>.csv.
# Runs until the operator kills the process (no exit prompt by design).
while True:
    url = input("Enter the URL:")
    keyword = input("Enter keyword:")
    file_name = input("Enter save file name:")

    scroll_down(d, url, keyword)
    links = get_links(d)
    print("{} employees found".format(len(links)))

    data = []
    for index, profile_link in enumerate(links, start=1):
        print(index, "extracting...")
        try:
            data.append(get_employee(profile_link))
        except Exception as e:
            # Report and skip profiles the API cannot fetch — the original
            # `pass` dropped them silently with no trace.
            print("  skipped {}: {}".format(profile_link, e))

    df = pd.DataFrame(
        data,
        columns=[
            "full_name",
            "company",
            "current_position",
            "current_position_start",
            "header_location",
            "city",
            "country",
            "education_1",
            "education_1_year",
            "education_2",
            "education_2_year",
            "linkedin_url",
        ],
    )
    df.to_csv("output/{}.csv".format(file_name), index=False)
|
|
|