import argparse import csv import os import datetime import json import logging import logging as log import random import yaml # from requests import Session import time from datetime import date from time import sleep # import pdfkit import pandas as pd import requests from bs4 import BeautifulSoup from selenium import webdriver # from utils import pretty_date, random_wait, scroll_down, write_to_csv from selenium.common.exceptions import TimeoutException, NoSuchElementException from selenium.webdriver.chrome.options import Options from selenium.webdriver.common.action_chains import ActionChains from selenium.webdriver.common.by import By from selenium.webdriver.common.desired_capabilities import DesiredCapabilities from selenium.webdriver.common.keys import Keys from selenium.webdriver.support import expected_conditions from selenium.webdriver.support.wait import WebDriverWait from utils.original import * from utils.network import * from utils.webdriver.anon import browser_capabilities, browser_options import numpy as np def extract( soup, search_url, search_keyword, search_window, search_date, blackListTitles ): job_objects = [] blacklist_job_objects = [] datas = [] blacklist_datas = [] # snippet = soup.find_all("li", class_="result-card") # snippet_html_grab("/data/data/staging/linkedin/job-snippet.html", snippet) for job in soup.find_all("li", class_="result-card"): try: try: id = job.get("data-id") except: log.error("id failed") try: search_url = search_url except: log.error("search_url failed") try: title = ( job.find("h3", class_="job-result-card__title") .text.replace("\n", "") .strip() ) except: log.error("title failed") try: employer = ( job.find("a", class_="job-result-card__subtitle-link") .text.replace("\n", "") .strip() ) except: log.error("employer failed") try: company_url = job.find( "a", class_="job-result-card__subtitle-link" ).get("href") except: company_url = "" log.error("company_url failed") try: job_url = job.find("a", class_="result-card__full-card-link").get( "href" ) except: job_url = "" log.error("job_url failed") try: location = ( job.find("span", class_="job-result-card__location") .text.replace("\n", "") .strip() ) except: location = "" log.error("location failed") try: job_site = "linkedin" except: log.error("job_site failed") try: data_posted_on = ( job.find("time", class_="job-result-card__listdate--new") .text.replace("\n", "") .strip() ) except: data_posted_on = ( job.find("time", class_="job-result-card__listdate") .text.replace("\n", "") .strip() ) try: quick_apply = ( job.find("span", class_="job-result-card__easy-apply-label") .text.replace("\n", "") .strip() ) except: quick_apply = "" try: pull_date = search_date except: log.error("pull_date failed") try: search_keyword = search_keyword except: log.error("search_keyword failed") try: search_window = search_window except: log.error("search_window failed") blacklisted_title_ind = blacklisted_title_check( title, employer, blackListTitles ) try: job_object = { # data_job_id: { "id": id, "title": title, "location": location, "employer": employer, # "summary": job_snippet, "quick_apply": quick_apply, "data_posted_on": data_posted_on, # "apply_method": apply_method, # "data_job_id": data_job_id, "job_url": job_url, "employer_url": company_url, "search_url": search_url, "job_site": job_site # "lat": geo_lat, # "lng": geo_lng # }, # "search": { # "site": job_site, # "date": pull_date, # "window": search_window, # "keyword": search_keyword # } } except: log.error("job_object failed") try: csv_object_header = [ "id", "title", "employer", "company_url", "job_url", "location", "job_site", "data_posted_on", "quick_apply", "search_keyword", "search_window", "search_url", "pull_date", "blacklisted_title_ind", ] except: log.error("csv_object_header failed") try: csv_object_template = [ id, title, employer, company_url, job_url, location, job_site, data_posted_on, quick_apply, search_keyword, search_window, search_url, pull_date, blacklisted_title_ind, ] except: log.error("csv_object_template failed") if blacklisted_title_ind: log.warn(f"[BLACKLIST] {title} - {employer}") blacklist_job_objects.append(job_object) blacklist_datas.append(csv_object_template) else: log.info(f"[GRABBED] {title} - {employer}") job_objects.append(job_object) datas.append(csv_object_template) except: log.error("attempt failed, %s" % (job_url)) cwd = os.getcwd() log.info("Current working directory: {0}".format(cwd)) generate_json( job_objects, "/data/data/staging/linkedin/jobs.json", ) generate_json( blacklist_job_objects, "/data/data/staging/linkedin/jobs-blacklist.json", ) generate_csv( datas, csv_object_header, "/data/data/staging/linkedin/jobs.csv", ) generate_csv( blacklist_datas, csv_object_header, "/data/data/staging/linkedin/jobs-blacklist.csv", ) def LinkedinScraperCLI(search_keyword, search_window="5"): LinkedinScraper(search_keyword=search_keyword, search_window=search_window) class LinkedinScraper: def __init__(self, search_keyword, search_window="5"): """Parameter initialization""" try: with open("/data/scripts/config.yaml", "r") as stream: try: parameters = yaml.safe_load(stream) except yaml.YAMLError as exc: raise exc except: with open("config.yaml", "r") as stream: try: parameters = yaml.safe_load(stream) except yaml.YAMLError as exc: raise exc username = parameters["linkedin_username"] password = parameters["linkedin_password"] search_date = pretty_date() module = "linkedin" blackListTitles = parameters.get("blackListTitles", []) # search_keyword = kwargs.get('templates_dict', None).get('search_keyword', None) chrome_options = browser_options() capabilities = browser_capabilities(module) log.info( "Searching linkedin for " + search_keyword + " in the last " + search_window + " day(s)" ) dirpath = os.getcwd() log.info("current directory is : " + dirpath) url = ( "https://www.linkedin.com/jobs/search/?f_L=United%20States&f_TPR=r86400&keywords=" + search_keyword + "&location=United%20States" ) driver = webdriver.Remote( command_executor="http://192.168.1.101:4444/wd/hub", options=chrome_options, desired_capabilities=capabilities, ) # "https://www.linkedin.com/jobs/search/?f_CF=f_WRA&f_L=United%20States&f_TPR=r86400&geoId=103644278&keywords=selenium&location=United%20States" search_url = url print(url) driver.maximize_window() driver.get(url) driver.get(url) time.sleep(random_wait()) try: driver.find_element(By.LINK_TEXT, "Sign in to use advanced searches") # except NoSuchElementException: log.error("Need to login") except: log.warn("No Need to login") infinite_scoll( driver, module, log, "/data/data/staging/linkedin/infinite-scroll.png" ) source_data = driver.page_source page_html_grab("/data/data/staging/linkedin/full-page.html", source_data) soup = BeautifulSoup(source_data, "lxml") log.info("Soup pulled and browser session ended") extract( soup, search_url, search_keyword, search_window, search_date, blackListTitles, ) driver.close() if __name__ == "__main__": parser = argparse.ArgumentParser(description="Scrape linkedin") parser.add_argument( "-k", "--keywords", # nargs="+", type=str, dest="keywords", help="List the keywords you want to scrape for", ) parser.add_argument( "-d", "--days", action="store", dest="days", default="5", type=str, help="How many days", ) args = parser.parse_args() if args.keywords: C = LinkedinScraperCLI(search_keyword=args.keywords, search_window=args.days) # C.login(EMAIL, PASSWORD) # C.collect("groups") # with open("config.json") as config_file: # data = json.load(config_file) # bot = LinkedinScraper() # bot.apply()