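"""Scrape LinkedIn job-search results.

Loads credentials and blacklist settings from config.yaml, drives a remote
Selenium browser through a LinkedIn jobs search, parses the result cards with
BeautifulSoup, and writes the postings to JSON/CSV staging files under
/data/data/staging/linkedin/.
"""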
import argparse
import csv
import datetime
import json
import logging as log
import os
import random
import time
from datetime import date
from time import sleep

import numpy as np
import pandas as pd
import requests
import yaml
from bs4 import BeautifulSoup

# import pdfkit
# from requests import Session

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait

# from utils import pretty_date, random_wait, scroll_down, write_to_csv
from utils.original import *
from utils.network import *
from utils.webdriver.anon import browser_capabilities, browser_options

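# Note: helpers such as pretty_date, random_wait, infinite_scoll, page_html_grab,
# blacklisted_title_check, generate_json, and generate_csv are assumed to be
# provided by the wildcard imports from utils.original / utils.network above.
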
def extract(
    soup, search_url, search_keyword, search_window, search_date, blackListTitles
):
    """Parse job result cards out of the page soup and write them to the
    JSON/CSV staging files, separating blacklisted titles from the rest."""
    job_objects = []
    blacklist_job_objects = []
    datas = []
    blacklist_datas = []

    # Column order for the CSV staging files (defined outside the loop so it
    # is still available even when no job cards are found).
    csv_object_header = [
        "id",
        "title",
        "employer",
        "company_url",
        "job_url",
        "location",
        "job_site",
        "data_posted_on",
        "quick_apply",
        "search_keyword",
        "search_window",
        "search_url",
        "pull_date",
        "blacklisted_title_ind",
    ]

    # snippet = soup.find_all("li", class_="result-card")
    # snippet_html_grab("/data/data/staging/linkedin/job-snippet.html", snippet)

    for job in soup.find_all("li", class_="result-card"):
        try:
            try:
                job_id = job.get("data-id")
            except AttributeError:
                job_id = ""
                log.error("id failed")

            try:
                title = (
                    job.find("h3", class_="job-result-card__title")
                    .text.replace("\n", "")
                    .strip()
                )
            except AttributeError:
                title = ""
                log.error("title failed")

            try:
                employer = (
                    job.find("a", class_="job-result-card__subtitle-link")
                    .text.replace("\n", "")
                    .strip()
                )
            except AttributeError:
                employer = ""
                log.error("employer failed")

            try:
                company_url = job.find(
                    "a", class_="job-result-card__subtitle-link"
                ).get("href")
            except AttributeError:
                company_url = ""
                log.error("company_url failed")

            try:
                job_url = job.find("a", class_="result-card__full-card-link").get(
                    "href"
                )
            except AttributeError:
                job_url = ""
                log.error("job_url failed")

            try:
                location = (
                    job.find("span", class_="job-result-card__location")
                    .text.replace("\n", "")
                    .strip()
                )
            except AttributeError:
                location = ""
                log.error("location failed")

            job_site = "linkedin"
            pull_date = search_date

            # Recently posted listings use the "--new" variant of the listdate
            # element; fall back to the regular one otherwise.
            try:
                data_posted_on = (
                    job.find("time", class_="job-result-card__listdate--new")
                    .text.replace("\n", "")
                    .strip()
                )
            except AttributeError:
                data_posted_on = (
                    job.find("time", class_="job-result-card__listdate")
                    .text.replace("\n", "")
                    .strip()
                )

            try:
                quick_apply = (
                    job.find("span", class_="job-result-card__easy-apply-label")
                    .text.replace("\n", "")
                    .strip()
                )
            except AttributeError:
                quick_apply = ""

            blacklisted_title_ind = blacklisted_title_check(
                title, employer, blackListTitles
            )
            job_object = {
                # data_job_id: {
                "id": job_id,
                "title": title,
                "location": location,
                "employer": employer,
                # "summary": job_snippet,
                "quick_apply": quick_apply,
                "data_posted_on": data_posted_on,
                # "apply_method": apply_method,
                # "data_job_id": data_job_id,
                "job_url": job_url,
                "employer_url": company_url,
                "search_url": search_url,
                "job_site": job_site,
                # "lat": geo_lat,
                # "lng": geo_lng
                # },
                # "search": {
                #     "site": job_site,
                #     "date": pull_date,
                #     "window": search_window,
                #     "keyword": search_keyword
                # }
            }
            csv_object_template = [
                job_id,
                title,
                employer,
                company_url,
                job_url,
                location,
                job_site,
                data_posted_on,
                quick_apply,
                search_keyword,
                search_window,
                search_url,
                pull_date,
                blacklisted_title_ind,
            ]

            if blacklisted_title_ind:
                log.warning(f"[BLACKLIST] {title} - {employer}")
                blacklist_job_objects.append(job_object)
                blacklist_datas.append(csv_object_template)
            else:
                log.info(f"[GRABBED] {title} - {employer}")
                job_objects.append(job_object)
                datas.append(csv_object_template)

        except Exception as exc:
            log.error("attempt failed, %s", exc)

    cwd = os.getcwd()
    log.info("Current working directory: {0}".format(cwd))

    generate_json(
        job_objects,
        "/data/data/staging/linkedin/jobs.json",
    )
    generate_json(
        blacklist_job_objects,
        "/data/data/staging/linkedin/jobs-blacklist.json",
    )

    generate_csv(
        datas,
        csv_object_header,
        "/data/data/staging/linkedin/jobs.csv",
    )
    generate_csv(
        blacklist_datas,
        csv_object_header,
        "/data/data/staging/linkedin/jobs-blacklist.csv",
    )

def LinkedinScraperCLI(search_keyword, search_window="5"):
    LinkedinScraper(search_keyword=search_keyword, search_window=search_window)

class LinkedinScraper:
    def __init__(self, search_keyword, search_window="5"):
        """Parameter initialization"""
        # Read settings from the deployed config path, falling back to a local
        # config.yaml when running outside the container layout.
        try:
            with open("/data/scripts/config.yaml", "r") as stream:
                try:
                    parameters = yaml.safe_load(stream)
                except yaml.YAMLError as exc:
                    raise exc
        except FileNotFoundError:
            with open("config.yaml", "r") as stream:
                try:
                    parameters = yaml.safe_load(stream)
                except yaml.YAMLError as exc:
                    raise exc

        username = parameters["linkedin_username"]
        password = parameters["linkedin_password"]
        search_date = pretty_date()
        module = "linkedin"
        blackListTitles = parameters.get("blackListTitles", [])
        # search_keyword = kwargs.get('templates_dict', None).get('search_keyword', None)
        chrome_options = browser_options()
        capabilities = browser_capabilities(module)

        log.info(
            "Searching linkedin for "
            + search_keyword
            + " in the last "
            + search_window
            + " day(s)"
        )
        dirpath = os.getcwd()
        log.info("current directory is : " + dirpath)

        url = (
            "https://www.linkedin.com/jobs/search/?f_L=United%20States&f_TPR=r86400&keywords="
            + search_keyword
            + "&location=United%20States"
        )

        # Run against a remote Selenium node (address is environment-specific).
        driver = webdriver.Remote(
            command_executor="http://192.168.1.101:4444/wd/hub",
            options=chrome_options,
            desired_capabilities=capabilities,
        )

# "https://www.linkedin.com/jobs/search/?f_CF=f_WRA&f_L=United%20States&f_TPR=r86400&geoId=103644278&keywords=selenium&location=United%20States" |
|
search_url = url |
|
print(url) |
|
driver.maximize_window() |
|
driver.get(url) |
|
driver.get(url) |
|
|
|
time.sleep(random_wait()) |
|
|
|
try: |
|
driver.find_element(By.LINK_TEXT, "Sign in to use advanced searches") |
|
# except NoSuchElementException: |
|
log.error("Need to login") |
|
except: |
|
log.warn("No Need to login") |
|
|
|
infinite_scoll( |
|
driver, module, log, "/data/data/staging/linkedin/infinite-scroll.png" |
|
) |
|
|
|
source_data = driver.page_source |
|
|
|
page_html_grab("/data/data/staging/linkedin/full-page.html", source_data) |
|
|
|
soup = BeautifulSoup(source_data, "lxml") |
|
|
|
log.info("Soup pulled and browser session ended") |
|
|
|
extract( |
|
soup, |
|
search_url, |
|
search_keyword, |
|
search_window, |
|
search_date, |
|
blackListTitles, |
|
) |
|
driver.close() |
|
|
|
|
|
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Scrape linkedin")

    parser.add_argument(
        "-k",
        "--keywords",
        # nargs="+",
        type=str,
        dest="keywords",
        help="List the keywords you want to scrape for",
    )
    parser.add_argument(
        "-d",
        "--days",
        action="store",
        dest="days",
        default="5",
        type=str,
        help="How many days",
    )
    args = parser.parse_args()

    if args.keywords:
        LinkedinScraperCLI(search_keyword=args.keywords, search_window=args.days)
        # C.login(EMAIL, PASSWORD)
        # C.collect("groups")

# with open("config.json") as config_file:
#     data = json.load(config_file)

# bot = LinkedinScraper()
# bot.apply()
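# Example invocation (the script filename is assumed here):
#   python linkedin_scraper.py --keywords "data engineer" --days 5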
|
|