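"""Scrape employee data from a LinkedIn company's "People" page.

Logs in with Selenium, scrolls the people-search results for a keyword so
every profile is loaded into the DOM, collects the profile links, then pulls
structured profile data through the unofficial linkedin_api client and
writes one CSV per run.
"""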
import os
import time
from time import sleep

import pandas as pd
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from linkedin_api import Linkedin
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Read LinkedIn credentials from a .env file and authenticate the
# unofficial linkedin_api client used for profile lookups.
load_dotenv()
user_name = os.getenv("user_name")
password = os.getenv("password")
api = Linkedin(user_name, password)
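# Expected .env contents (keys must match the os.getenv calls above;
# values below are placeholders):
#   user_name=you@example.com
#   password=your-password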
# Configure Chrome so the automated session looks like a normal browser.
options = Options()
options.add_argument("--disable-infobars")
options.add_argument("--start-maximized")
options.add_argument("--disable-extensions")
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
)
d = webdriver.Chrome(service=Service("drivers/chromedriver"), options=options)
def login(d):
    """Log in to LinkedIn, handling the optional email-verification step."""
    d.get("https://www.linkedin.com/login")
    sleep(3)
    # Type the username, tab to the password field, and submit.
    actions = ActionChains(d)
    actions.send_keys(user_name)
    actions.send_keys(Keys.TAB)
    actions.send_keys(password)
    actions.send_keys(Keys.ENTER)
    actions.perform()
    # If LinkedIn shows an email-verification PIN form, prompt for the code.
    soup = BeautifulSoup(d.page_source, "html.parser")
    if soup.find(id="input__email_verification_pin"):
        code = d.find_element(By.ID, "input__email_verification_pin")
        key = input("Enter key:")
        code.send_keys(key)
        code.send_keys(Keys.ENTER)
    return "Successfully logged in!"
def scroll_down(driver, url, keyword):
    """Open the company's people search for `keyword` and scroll until every
    result is loaded into the DOM."""
    print("Page loading.......")
    driver.get(url + "/people/?keywords={}".format(keyword))
    # Get scroll height.
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Scroll down to the bottom.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait to load the page.
        time.sleep(1.5)
        # Calculate new scroll height and compare with last scroll height.
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    print("Page loaded!")
def get_links(d):
    """Collect profile URLs from the loaded people-search results."""
    links = []
    soup = BeautifulSoup(d.page_source, "html.parser")
    ul = soup.find_all(
        "ul", attrs={"class": "org-people-profiles-module__profile-list"}
    )
    for item in ul[0].find_all("li"):
        try:
            links.append(item.find("a")["href"])
        except (TypeError, KeyError):
            # Skip list items that do not contain a profile link.
            pass
    return links
def get_employee(profile_path):
    """Fetch one profile through the linkedin_api client and flatten the
    fields used in the output CSV; missing fields default to empty strings."""
    # profile_path looks like "/in/<public-id>/", so index 2 is the public id.
    temp = api.get_profile(profile_path.split("/")[2])
    link = "https://www.linkedin.com" + profile_path
    full_name = "{} {}".format(temp["firstName"], temp["lastName"])
    try:
        company = temp["experience"][0]["companyName"]
    except (KeyError, IndexError):
        company = ""
    try:
        current_position = temp["experience"][0]["title"]
    except (KeyError, IndexError):
        current_position = ""
    try:
        current_position_start = "{}/{}".format(
            temp["experience"][0]["timePeriod"]["startDate"]["month"],
            temp["experience"][0]["timePeriod"]["startDate"]["year"],
        )
    except (KeyError, IndexError):
        current_position_start = ""
    try:
        header_location = temp["locationName"]
    except KeyError:
        header_location = ""
    try:
        # geoLocationName is usually "City, Region, Country".
        geo = temp["experience"][0]["geoLocationName"]
        city = geo.split(", ")[0]
        country = geo.split(", ")[-1]
    except (KeyError, IndexError):
        city = ""
        country = ""
    educations = temp.get("education", [])
    try:
        education_1 = educations[0]["schoolName"]
    except (KeyError, IndexError):
        education_1 = ""
    try:
        education_1_year = "{}-{}".format(
            educations[0]["timePeriod"]["startDate"]["year"],
            educations[0]["timePeriod"]["endDate"]["year"],
        )
    except (KeyError, IndexError):
        education_1_year = ""
    if len(educations) > 1:
        try:
            education_2 = educations[1]["schoolName"]
        except KeyError:
            education_2 = ""
        try:
            education_2_year = "{}-{}".format(
                educations[1]["timePeriod"]["startDate"]["year"],
                educations[1]["timePeriod"]["endDate"]["year"],
            )
        except KeyError:
            education_2_year = ""
    else:
        education_2 = ""
        education_2_year = ""
    return [
        full_name,
        company,
        current_position,
        current_position_start,
        header_location,
        city,
        country,
        education_1,
        education_1_year,
        education_2,
        education_2_year,
        link,
    ]
login(d)

# Repeatedly prompt for a company URL and a search keyword, scrape the
# matching employees, and write one CSV per run (Ctrl-C to stop).
while True:
    url = input("Enter the URL:")
    keyword = input("Enter keyword:")
    file = input("Enter save file name:")
    scroll_down(d, url, keyword)
    links = get_links(d)
    print("{} employees found".format(len(links)))
    data = []
    for i in range(len(links)):
        print(i + 1, "extracting...")
        try:
            data.append(get_employee(links[i]))
        except Exception:
            # Skip profiles that fail to fetch or parse.
            pass
    df = pd.DataFrame(
        data,
        columns=[
            "full_name",
            "company",
            "current_position",
            "current_position_start",
            "header_location",
            "city",
            "country",
            "education_1",
            "education_1_year",
            "education_2",
            "education_2_year",
            "linkedin_url",
        ],
    )
    os.makedirs("output", exist_ok=True)  # ensure the output directory exists
    df.to_csv("output/{}.csv".format(file), index=False)
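
# Example session (prompt values are illustrative, not from the source):
#   Enter the URL: https://www.linkedin.com/company/example-co
#   Enter keyword: engineer
#   Enter save file name: example_co_engineers
# -> writes output/example_co_engineers.csv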