#!/usr/bin/env python3
import argparse
import csv
import datetime
import json
import random

# from requests import Session
import time
from datetime import date
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait

from utils import pretty_date, random_wait, scroll_down, write_to_csv


def browser_options():
    """Chrome options for the remote WebDriver session."""
    return Options()


def browser_capabilities():
    """Desired capabilities for the remote Chrome session."""
    return DesiredCapabilities.CHROME.copy()


def extract(soup, search_keyword, search_window, search_date, blackListTitles, search_url=""):
    """Parse the result cards out of a LinkedIn search page and export them."""
    job_objects = []
    blacklist_job_objects = []
    datas = []
    blacklist_datas = []

    # Dump the first result card so the markup can be inspected offline.
    cards = soup.find_all("li", class_="result-card")
    with open("linkedin-snippet.html", "w") as snippet_file:
        snippet_file.write(cards[0].prettify())
    print("Snippet dumped")

    for job in cards:
        try:
            job_id = job.get("data-id")

            try:
                title = (
                    job.find("h3", class_="job-result-card__title")
                    .text.replace("\n", "")
                    .strip()
                )
            except AttributeError:
                title = ""
                print("no title")

            try:
                company = (
                    job.find("a", class_="job-result-card__subtitle-link")
                    .text.replace("\n", "")
                    .strip()
                )
            except AttributeError:
                company = ""

            try:
                company_url = job.find(
                    "a", class_="job-result-card__subtitle-link"
                ).get("href")
            except AttributeError:
                company_url = ""

            try:
                job_url = job.find("a", class_="result-card__full-card-link").get(
                    "href"
                )
            except AttributeError:
                job_url = ""

            try:
                location = (
                    job.find("span", class_="job-result-card__location")
                    .text.replace("\n", "")
                    .strip()
                )
            except AttributeError:
                location = ""

            job_site = "linkedin"

            # Freshly posted listings carry the "--new" variant of the date class.
            try:
                data_posted_on = (
                    job.find("time", class_="job-result-card__listdate--new")
                    .text.replace("\n", "")
                    .strip()
                )
            except AttributeError:
                data_posted_on = (
                    job.find("time", class_="job-result-card__listdate")
                    .text.replace("\n", "")
                    .strip()
                )

            try:
                quick_apply = (
                    job.find("span", class_="job-result-card__easy-apply-label")
                    .text.replace("\n", "")
                    .strip()
                )
            except AttributeError:
                quick_apply = ""

            job_object = {
                "id": job_id,
                "title": title,
                "location": location,
                "company": company,
                # "summary": job_snippet,
                "quick_apply": quick_apply,
                "data_posted_on": data_posted_on,
                # "apply_method": apply_method,
                "job_url": job_url,
                "employer_url": company_url,
                "search_url": search_url,
                "job_site": job_site,
                # "lat": geo_lat,
                # "lng": geo_lng,
            }

            print(f"Grabbed {title} - {company}")
            job_objects.append(job_object)
            datas.append(
                [
                    job_id,
                    title,
                    company,
                    location,
                    data_posted_on,
                    job_url,
                    job_site,
                    search_keyword,
                    quick_apply,
                    "",
                    "",
                    "USA_ENGLISH",
                    "",
                    "",
                    "",
                ]
            )
        except Exception:
            print("attempt failed")

    print("Exporting linkedin.json")
    with open("linkedin.json", "w") as outfile:
        json.dump(job_objects, outfile, indent=4)

    print("Exporting linkedin.csv")
with open("linkedin.csv", "w+", newline="", encoding="utf-8") as save_file: writer = csv.writer(save_file) writer.writerow( [ "id", "title", "company", "location", "date", "link", "provider", "query", "easy_apply", "status", "tags", "locale", "wage", "remoteness", "blurb", ] ) # writer.writerow(["id", "title", "company", "location", "data_posted_on", "url"]) for data in datas: writer.writerow(data) print("Done!") class LinkedinScraper: def __init__(self, search_keyword, search_window="5"): """Parameter initialization""" # search_date = pretty_date() # search_keyword = data["search_keyword"] # search_window = data["search_window"] # ip = driver.get('https://api.ipify.org') chrome_options = browser_options() capabilities = browser_capabilities() driver = webdriver.Remote( command_executor="http://192.168.1.101:4444/wd/hub", options=chrome_options, desired_capabilities=capabilities, ) # external_ip = driver.get('https://api.ipify.org').read().decode('utf8') # print(external_ip) url = ( "https://www.linkedin.com/jobs/search/?f_L=United%20States&f_TPR=r86400&keywords=" + search_keyword + "&location=United%20States" ) # "https://www.linkedin.com/jobs/search/?f_CF=f_WRA&f_L=United%20States&f_TPR=r86400&geoId=103644278&keywords=selenium&location=United%20States" print(url) driver.maximize_window() driver.get(url) driver.get(url) time.sleep(random_wait()) # element = driver.find_element_by_css_selector(".zrs_close_btn") # element.click() # driver.find_element(By.CSS_SELECTOR, ".load_more_jobs").click() scroll_down(driver) # h1 class headline source_data = driver.page_source soup = BeautifulSoup(source_data, "lxml") driver.save_screenshot("linkedin-scroll.png") driver.close() # print(soup.prettify()) # ps = soup # with open("test.json", "w") as outfile: # json.dump(ps, outfile, indent=4) # print("Exported linkedin.json") print("Soup pulled and browser session ended") job_objects = [] datas = [] for job in soup.find_all("li", class_="result-card"): try: id = job.get("data-id") search_url = url try: title = ( job.find("h3", class_="job-result-card__title") .text.replace("\n", "") .strip() ) except: title = "" print("no title") try: company = ( job.find("a", class_="job-result-card__subtitle-link") .text.replace("\n", "") .strip() ) except: company = "" try: company_url = job.find( "a", class_="job-result-card__subtitle-link" ).get("href") except: company_url = "" try: job_url = job.find("a", class_="result-card__full-card-link").get( "href" ) except: job_url = "" try: location = ( job.find("span", class_="job-result-card__location") .text.replace("\n", "") .strip() ) except: location = "" job_site = "linkedin" try: data_posted_on = ( job.find("time", class_="job-result-card__listdate--new") .text.replace("\n", "") .strip() ) except: data_posted_on = ( job.find("time", class_="job-result-card__listdate") .text.replace("\n", "") .strip() ) try: quick_apply = ( job.find("span", class_="job-result-card__easy-apply-label") .text.replace("\n", "") .strip() ) except: quick_apply = "" try: job_object = { # data_job_id: { "id": id, "title": title, "location": location, "company": company, # "summary": job_snippet, "quick_apply": quick_apply, "data_posted_on": data_posted_on, # "apply_method": apply_method, # "data_job_id": data_job_id, "job_url": job_url, "employer_url": company_url, "search_url": search_url, "job_site": job_site # "lat": geo_lat, # "lng": geo_lng # }, # "search": { # "site": job_site, # "date": pull_date, # "window": search_window, # "keyword": search_keyword # } } except: print("job_object 
failed") print(f"Grabbed {title} - {employer}") job_objects.append(job_object) datas.append( [ id, title, company, location, data_posted_on, job_url, job_site, search_keyword, quick_apply, "", "", "USA_ENGLISH", "", "", "", ] ) except: # print("attempt failed, %s" % (job_link)) print("attempt failed") # datas.append([data_listing_version_key, job_site, search_keyword]) # except: # # print("attempt failed, %s" % (job_link)) # print("attempt failed") # # print(job) # # print(job_objects) # # write_to_csv(ziprecruiter.csv, all_jobs) # # print( tabulate(dataframe[0], headers='columns', tablefmt='psql') ) # # # # with open("ziprecruiter" + search_date + ".json", "w") as outfile: print("Exporting linkedin.json") with open("linkedin.json", "w") as outfile: json.dump(job_objects, outfile, indent=4) print("Exporting linkedin.csv") with open("linkedin.csv", "w+", newline="", encoding="utf-8") as save_file: writer = csv.writer(save_file) writer.writerow( [ "id", "title", "company", "location", "date", "link", "provider", "query", "easy_apply", "status", "tags", "locale", "wage", "remoteness", "blurb", ] ) # writer.writerow(["id", "title", "company", "location", "data_posted_on", "url"]) for data in datas: writer.writerow(data) print("Done!") if __name__ == "__main__": parser = argparse.ArgumentParser(description="Scrape Ziprecruiter") parser.add_argument( "-k", "--keywords", # nargs="+", type=str, dest="keywords", help="List the keywords you want to scrape for", ) parser.add_argument( "-d", "--days", action="store", dest="days", default="5", type=str, help="How many days", ) args = parser.parse_args() if args.keywords: C = LinkedinScraper(search_keyword=args.keywords, search_window=args.days) # C.login(EMAIL, PASSWORD) # C.collect("groups") # with open("config.json") as config_file: # data = json.load(config_file) # bot = LinkedinScraper() # bot.apply()