import argparse
import csv
import datetime
import json
import logging as log
import os
import random
import time
from datetime import date
from time import sleep

import numpy as np
import pandas as pd
import requests
import yaml
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait

from utils.network import *
from utils.original import *
from utils.webdriver.anon import browser_capabilities, browser_options


def extract(
soup, search_url, search_keyword, search_window, search_date, blackListTitles
):
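    """Parse the job cards in the search-results soup into dicts and CSV rows,
    then write them to the LinkedIn staging area, separating blacklisted titles."""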
    job_objects = []
    blacklist_job_objects = []
    datas = []
    blacklist_datas = []
    # CSV column order for the staging files; defined once here so it exists
    # even when the page yields no result cards.
    csv_object_header = [
        "id",
        "title",
        "employer",
        "company_url",
        "job_url",
        "location",
        "job_site",
        "data_posted_on",
        "quick_apply",
        "search_keyword",
        "search_window",
        "search_url",
        "pull_date",
        "blacklisted_title_ind",
    ]
# snippet = soup.find_all("li", class_="result-card")
# snippet_html_grab("/data/data/staging/linkedin/job-snippet.html", snippet)
for job in soup.find_all("li", class_="result-card"):
try:
try:
id = job.get("data-id")
except:
log.error("id failed")
try:
title = (
job.find("h3", class_="job-result-card__title")
.text.replace("\n", "")
.strip()
)
            except:
                title = ""
                log.error("title failed")
try:
employer = (
job.find("a", class_="job-result-card__subtitle-link")
.text.replace("\n", "")
.strip()
)
            except:
                employer = ""
                log.error("employer failed")
try:
company_url = job.find(
"a", class_="job-result-card__subtitle-link"
).get("href")
except:
company_url = ""
log.error("company_url failed")
try:
job_url = job.find("a", class_="result-card__full-card-link").get(
"href"
)
except:
job_url = ""
log.error("job_url failed")
try:
location = (
job.find("span", class_="job-result-card__location")
.text.replace("\n", "")
.strip()
)
except:
location = ""
log.error("location failed")
            job_site = "linkedin"
try:
data_posted_on = (
job.find("time", class_="job-result-card__listdate--new")
.text.replace("\n", "")
.strip()
)
except:
data_posted_on = (
job.find("time", class_="job-result-card__listdate")
.text.replace("\n", "")
.strip()
)
try:
quick_apply = (
job.find("span", class_="job-result-card__easy-apply-label")
.text.replace("\n", "")
.strip()
)
except:
quick_apply = ""
            pull_date = search_date
blacklisted_title_ind = blacklisted_title_check(
title, employer, blackListTitles
)
try:
job_object = {
# data_job_id: {
"id": id,
"title": title,
"location": location,
"employer": employer,
# "summary": job_snippet,
"quick_apply": quick_apply,
"data_posted_on": data_posted_on,
# "apply_method": apply_method,
# "data_job_id": data_job_id,
"job_url": job_url,
"employer_url": company_url,
"search_url": search_url,
"job_site": job_site
# "lat": geo_lat,
# "lng": geo_lng
# },
# "search": {
# "site": job_site,
# "date": pull_date,
# "window": search_window,
# "keyword": search_keyword
# }
}
except:
log.error("job_object failed")
try:
csv_object_template = [
id,
title,
employer,
company_url,
job_url,
location,
job_site,
data_posted_on,
quick_apply,
search_keyword,
search_window,
search_url,
pull_date,
blacklisted_title_ind,
]
except:
log.error("csv_object_template failed")
if blacklisted_title_ind:
                log.warning(f"[BLACKLIST] {title} - {employer}")
blacklist_job_objects.append(job_object)
blacklist_datas.append(csv_object_template)
else:
log.info(f"[GRABBED] {title} - {employer}")
job_objects.append(job_object)
datas.append(csv_object_template)
except:
log.error("attempt failed, %s" % (job_url))
cwd = os.getcwd()
log.info("Current working directory: {0}".format(cwd))
generate_json(
job_objects,
"/data/data/staging/linkedin/jobs.json",
)
generate_json(
blacklist_job_objects,
"/data/data/staging/linkedin/jobs-blacklist.json",
)
generate_csv(
datas,
csv_object_header,
"/data/data/staging/linkedin/jobs.csv",
)
generate_csv(
blacklist_datas,
csv_object_header,
"/data/data/staging/linkedin/jobs-blacklist.csv",
)
def LinkedinScraperCLI(search_keyword, search_window="5"):
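    """Functional wrapper so the scraper can be invoked with plain keyword arguments."""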
LinkedinScraper(search_keyword=search_keyword, search_window=search_window)
class LinkedinScraper:
def __init__(self, search_keyword, search_window="5"):
"""Parameter initialization"""
try:
with open("/data/scripts/config.yaml", "r") as stream:
try:
parameters = yaml.safe_load(stream)
except yaml.YAMLError as exc:
raise exc
except:
with open("config.yaml", "r") as stream:
try:
parameters = yaml.safe_load(stream)
except yaml.YAMLError as exc:
raise exc
username = parameters["linkedin_username"]
password = parameters["linkedin_password"]
search_date = pretty_date()
module = "linkedin"
blackListTitles = parameters.get("blackListTitles", [])
# search_keyword = kwargs.get('templates_dict', None).get('search_keyword', None)
chrome_options = browser_options()
capabilities = browser_capabilities(module)
log.info(
"Searching linkedin for "
+ search_keyword
+ " in the last "
+ search_window
+ " day(s)"
)
dirpath = os.getcwd()
log.info("current directory is : " + dirpath)
url = (
"https://www.linkedin.com/jobs/search/?f_L=United%20States&f_TPR=r86400&keywords="
+ search_keyword
+ "&location=United%20States"
)
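        # Drive a remote browser (e.g. a Selenium Grid / standalone-chrome node).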
driver = webdriver.Remote(
command_executor="http://192.168.1.101:4444/wd/hub",
options=chrome_options,
desired_capabilities=capabilities,
)
# "https://www.linkedin.com/jobs/search/?f_CF=f_WRA&f_L=United%20States&f_TPR=r86400&geoId=103644278&keywords=selenium&location=United%20States"
search_url = url
print(url)
driver.maximize_window()
driver.get(url)
driver.get(url)
time.sleep(random_wait())
        try:
            driver.find_element(By.LINK_TEXT, "Sign in to use advanced searches")
            log.error("Need to login")
        except NoSuchElementException:
            log.warning("No need to login")
infinite_scoll(
driver, module, log, "/data/data/staging/linkedin/infinite-scroll.png"
)
source_data = driver.page_source
page_html_grab("/data/data/staging/linkedin/full-page.html", source_data)
soup = BeautifulSoup(source_data, "lxml")
log.info("Soup pulled and browser session ended")
extract(
soup,
search_url,
search_keyword,
search_window,
search_date,
blackListTitles,
)
        # Use quit() so the remote WebDriver session is fully torn down on the grid.
        driver.quit()
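
# Command-line entry point: -k/--keywords <keyword> [-d/--days N]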
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Scrape linkedin")
parser.add_argument(
"-k",
"--keywords",
# nargs="+",
type=str,
dest="keywords",
help="List the keywords you want to scrape for",
)
parser.add_argument(
"-d",
"--days",
action="store",
dest="days",
default="5",
type=str,
help="How many days",
)
args = parser.parse_args()
if args.keywords:
C = LinkedinScraperCLI(search_keyword=args.keywords, search_window=args.days)
# C.login(EMAIL, PASSWORD)
# C.collect("groups")
# with open("config.json") as config_file:
# data = json.load(config_file)
# bot = LinkedinScraper()
# bot.apply()