You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
206 lines
5.5 KiB
206 lines
5.5 KiB
import os
import time
from time import sleep

import pandas as pd
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from linkedin_api import Linkedin
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

from utils import api, password, user_name
|
|
|
# Credentials are read from a .env file so they never live in source control.
# NOTE(review): these assignments deliberately shadow the names imported from
# `utils` above — confirm which source of `api`/`user_name`/`password` is intended.
load_dotenv()

user_name = os.getenv("user_name")
password = os.getenv("password")

# Authenticated client for the unofficial LinkedIn API (used by get_employee).
api = Linkedin(user_name, password)
|
|
|
|
|
# Chrome options that keep the window usable and make the automated browser
# harder to detect (no infobars, no "controlled by automation" blink flag,
# and a fixed real-browser user agent).
options = Options()
options.add_argument("--disable-infobars")
options.add_argument("--start-maximized")
options.add_argument("--disable-extensions")
options.add_argument("disable-blink-features=AutomationControlled")
options.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
)

# NOTE(review): `executable_path` was removed in Selenium 4. If the selenium
# dependency is upgraded, switch to:
#   webdriver.Chrome(service=Service("drivers/chromedriver"), options=options)
d = webdriver.Chrome(options=options, executable_path="drivers/chromedriver")
|
|
|
|
|
def login(d):
    """Log into LinkedIn through the interactive login form.

    Types the module-level ``user_name``/``password`` into the form via
    keyboard navigation, submits, and — if LinkedIn answers with an e-mail
    verification challenge — prompts the operator on stdin for the PIN.

    Parameters
    ----------
    d : selenium WebDriver
        An already-constructed browser instance.

    Returns
    -------
    str
        A fixed success message; the login outcome itself is not verified.
    """
    d.get("https://www.linkedin.com/login")
    sleep(3)  # crude fixed wait for the form to render

    # Focus starts in the username field; TAB moves to the password field,
    # ENTER submits the form.
    actions = ActionChains(d)
    actions.send_keys(user_name)
    actions.send_keys(Keys.TAB)
    actions.send_keys(password)
    actions.send_keys(Keys.ENTER)
    actions.perform()

    # page_source is already a str — the old "".join(...) was a no-op.
    soup = BeautifulSoup(d.page_source, "html.parser")

    # Handle LinkedIn's e-mail PIN challenge when it appears.
    if "input__email_verification_pin" in soup.text:
        # find_element_by_id was removed in Selenium 4; find_element(By.ID, ...)
        # works on both Selenium 3 and 4.
        code = d.find_element(By.ID, "input__email_verification_pin")
        key = input("Enter key:")
        code.send_keys(key)
        code.send_keys(Keys.ENTER)

    return "Successfully logged in! "
|
|
|
|
|
def scroll_down(driver, url, keyword):
    """Open a company's people search and scroll until no new results load.

    Navigates to ``<url>/people/?keywords=<keyword>`` and repeatedly scrolls
    to the bottom, waiting for lazily-loaded results, until the page height
    stops growing.

    Parameters
    ----------
    driver : selenium WebDriver
        The browser to drive.
    url : str
        Company page URL, e.g. "https://www.linkedin.com/company/<name>".
    keyword : str
        Search keyword appended to the people filter.
    """
    print("Page loading.......")
    # Bug fix: the original navigated with the module-level global `d`
    # instead of the `driver` parameter it was given.
    driver.get(url + "/people/?keywords={}".format(keyword))

    # Get the initial scroll height.
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        # Scroll to the bottom and give lazy-loaded content time to arrive.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1.5)

        # Stop once scrolling no longer grows the document.
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    print("Page loaded!")
|
|
|
|
|
def get_links(d):
    """Collect profile link hrefs from the fully-scrolled people list.

    Parameters
    ----------
    d : selenium WebDriver
        Browser currently showing the people-search results
        (see ``scroll_down``).

    Returns
    -------
    list[str]
        The ``href`` of the first anchor in each result card; cards with
        no anchor or no ``href`` attribute are skipped.
    """
    soup = BeautifulSoup(d.page_source, "html.parser")
    ul = soup.find_all(
        "ul", attrs={"class": "org-people-profiles-module__profile-list"}
    )

    links = []
    for item in ul[0].find_all("li"):
        # The original bare `except: pass` hid every failure mode; the only
        # expected ones are a missing anchor or a missing href — test for
        # those explicitly and skip the card.
        anchor = item.find("a")
        if anchor is not None and anchor.has_attr("href"):
            links.append(anchor["href"])
    return links
|
|
|
|
|
def get_employee(id):
    """Fetch one employee's profile and flatten it into a row of strings.

    Parameters
    ----------
    id : str
        Relative profile path as scraped from the people list, e.g.
        "/in/<public-id>/" — the public id is its third path segment.
        (Name shadows the ``id`` builtin; kept for caller compatibility.)

    Returns
    -------
    list
        [full_name, company, current_position, current_position_start,
         header_location, city, country, education_1, education_1_year,
         education_2, education_2_year, link] — missing fields become "".

    Raises
    ------
    KeyError
        If the profile lacks ``firstName``/``lastName`` (propagated to the
        caller, which skips the employee).
    """

    def _get(getter, default=""):
        # Evaluate `getter`; map a missing key/index (or a None along the
        # path) to `default`. Replaces the original's six bare `except:`s.
        try:
            return getter()
        except (KeyError, IndexError, TypeError):
            return default

    temp = api.get_profile(id.split("/")[2])
    link = "https://www.linkedin.com/" + id

    full_name = "{} {}".format(temp["firstName"], temp["lastName"])

    # Bug fix: companyName was the only unguarded experience lookup — a
    # profile with no experience crashed here while every sibling field
    # degraded to "".
    company = _get(lambda: temp["experience"][0]["companyName"])
    current_position = _get(lambda: temp["experience"][0]["title"])
    current_position_start = _get(
        lambda: "{}/{}".format(
            temp["experience"][0]["timePeriod"]["startDate"]["month"],
            temp["experience"][0]["timePeriod"]["startDate"]["year"],
        )
    )

    header_location = _get(lambda: temp["locationName"])

    geo = _get(lambda: temp["experience"][0]["geoLocationName"], default=None)
    if geo:
        city = geo.split(", ")[0]
        country = geo.split(", ")[-1]
    else:
        city = ""
        country = ""

    # Bug fix: the original did len(temp["education"]) unguarded, raising
    # KeyError for profiles with no education section at all.
    education_1 = _get(lambda: temp["education"][0]["schoolName"])
    education_1_year = _get(
        lambda: "{}-{}".format(
            temp["education"][0]["timePeriod"]["startDate"]["year"],
            temp["education"][0]["timePeriod"]["endDate"]["year"],
        )
    )
    education_2 = _get(lambda: temp["education"][1]["schoolName"])
    education_2_year = _get(
        lambda: "{}-{}".format(
            temp["education"][1]["timePeriod"]["startDate"]["year"],
            temp["education"][1]["timePeriod"]["endDate"]["year"],
        )
    )

    return [
        full_name,
        company,
        current_position,
        current_position_start,
        header_location,
        city,
        country,
        education_1,
        education_1_year,
        education_2,
        education_2_year,
        link,
    ]
|
|
|
|
|
login(d)

# Interactive scrape loop: for each company URL/keyword pair, scroll the
# people list, extract every profile, and save the rows to output/<file>.csv.
# Runs until the operator kills the process (no exit prompt by design).
while True:
    url = input("Enter the URL:")
    keyword = input("Enter keyword:")
    file_name = input("Enter save file name:")

    scroll_down(d, url, keyword)
    links = get_links(d)
    print("{} employees found".format(len(links)))

    data = []
    for index, profile_link in enumerate(links, start=1):
        print(index, "extracting...")
        try:
            data.append(get_employee(profile_link))
        except Exception as e:
            # Report and skip profiles the API cannot fetch — the original
            # `pass` dropped them silently with no trace.
            print("  skipped {}: {}".format(profile_link, e))

    df = pd.DataFrame(
        data,
        columns=[
            "full_name",
            "company",
            "current_position",
            "current_position_start",
            "header_location",
            "city",
            "country",
            "education_1",
            "education_1_year",
            "education_2",
            "education_2_year",
            "linkedin_url",
        ],
    )
    df.to_csv("output/{}.csv".format(file_name), index=False)
|
|
|