#!/usr/bin/env python3

import argparse
import csv
import datetime
import json
import random

# from requests import Session
import time
from datetime import date
from time import sleep
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait

# browser_options() and browser_capabilities() are called in
# LinkedinScraper.__init__ below; they are assumed to live in the local
# utils module alongside the other helpers imported here.
from utils import (
    browser_options,
    browser_capabilities,
    pretty_date,
    random_wait,
    scroll_down,
    write_to_csv,
)
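
# extract() pulls each "result-card" <li> out of the search-results soup,
# collects the job fields for every listing, and exports them to
# linkedin.json and linkedin.csv.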
def extract(soup, search_keyword, search_window, search_date, blackListTitles, url):
    """Parse LinkedIn result cards out of `soup` and export them.

    `url` is the search URL the soup was fetched from; search_window,
    search_date and blackListTitles are accepted but currently unused.
    """
    job_objects = []
    blacklist_job_objects = []
    datas = []
    blacklist_datas = []

    # Dump the first result card so its markup can be inspected offline.
    test = soup.find_all("li", class_="result-card")
    if test:
        with open("linkedin-snippet.html", "w") as file:
            file.write(test[0].prettify())
        print("Snippet dumped")

    for job in soup.find_all("li", class_="result-card"):
        try:
            id = job.get("data-id")
            search_url = url
            try:
                title = (
                    job.find("h3", class_="job-result-card__title")
                    .text.replace("\n", "")
                    .strip()
                )
            except AttributeError:
                title = ""
                print("no title")

            try:
                company = (
                    job.find("a", class_="job-result-card__subtitle-link")
                    .text.replace("\n", "")
                    .strip()
                )
            except AttributeError:
                company = ""

            try:
                company_url = job.find(
                    "a", class_="job-result-card__subtitle-link"
                ).get("href")
            except AttributeError:
                company_url = ""

            try:
                job_url = job.find("a", class_="result-card__full-card-link").get(
                    "href"
                )
            except AttributeError:
                job_url = ""

            try:
                location = (
                    job.find("span", class_="job-result-card__location")
                    .text.replace("\n", "")
                    .strip()
                )
            except AttributeError:
                location = ""

            job_site = "linkedin"

            # Newly posted listings use a different class on the <time> tag,
            # so fall back to the regular listdate class when it is missing.
            try:
                data_posted_on = (
                    job.find("time", class_="job-result-card__listdate--new")
                    .text.replace("\n", "")
                    .strip()
                )
            except AttributeError:
                data_posted_on = (
                    job.find("time", class_="job-result-card__listdate")
                    .text.replace("\n", "")
                    .strip()
                )

            try:
                quick_apply = (
                    job.find("span", class_="job-result-card__easy-apply-label")
                    .text.replace("\n", "")
                    .strip()
                )
            except AttributeError:
                quick_apply = ""

            try:
                job_object = {
                    # data_job_id: {
                    "id": id,
                    "title": title,
                    "location": location,
                    "company": company,
                    # "summary": job_snippet,
                    "quick_apply": quick_apply,
                    "data_posted_on": data_posted_on,
                    # "apply_method": apply_method,
                    # "data_job_id": data_job_id,
                    "job_url": job_url,
                    "employer_url": company_url,
                    "search_url": search_url,
                    "job_site": job_site,
                    # "lat": geo_lat,
                    # "lng": geo_lng
                    # },
                    # "search": {
                    #     "site": job_site,
                    #     "date": pull_date,
                    #     "window": search_window,
                    #     "keyword": search_keyword
                    # }
                }
            except Exception:
                print("job_object failed")

            print(f"Grabbed {title} - {company}")
            job_objects.append(job_object)
            # The blank fields (status, tags, wage, remoteness, blurb) are
            # placeholders matching the CSV header written below.
            datas.append(
                [
                    id,
                    title,
                    company,
                    location,
                    data_posted_on,
                    job_url,
                    job_site,
                    search_keyword,
                    quick_apply,
                    "",
                    "",
                    "USA_ENGLISH",
                    "",
                    "",
                    "",
                ]
            )

        except Exception:
            # print("attempt failed, %s" % (job_link))
            print("attempt failed")

        # datas.append([data_listing_version_key, job_site, search_keyword])

    # except:
    #     # print("attempt failed, %s" % (job_link))
    #     print("attempt failed")
    #     # print(job)
    # # print(job_objects)

    # # write_to_csv(ziprecruiter.csv, all_jobs)
    # # print( tabulate(dataframe[0], headers='columns', tablefmt='psql') )
    # #
    # # with open("ziprecruiter" + search_date + ".json", "w") as outfile:
    print("Exporting linkedin.json")
    with open("linkedin.json", "w") as outfile:
        json.dump(job_objects, outfile, indent=4)

    print("Exporting linkedin.csv")
    with open("linkedin.csv", "w", newline="", encoding="utf-8") as save_file:
        writer = csv.writer(save_file)
        writer.writerow(
            [
                "id",
                "title",
                "company",
                "location",
                "date",
                "link",
                "provider",
                "query",
                "easy_apply",
                "status",
                "tags",
                "locale",
                "wage",
                "remoteness",
                "blurb",
            ]
        )
        # writer.writerow(["id", "title", "company", "location", "data_posted_on", "url"])
        for data in datas:
            writer.writerow(data)

    print("Done!")
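

# LinkedinScraper drives a remote Chrome session through the Selenium Grid hub
# configured below, loads the LinkedIn job-search URL for the given keyword,
# scrolls the results page so lazily loaded cards render, and hands the parsed
# soup to extract() for export.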
class LinkedinScraper:
    def __init__(self, search_keyword, search_window="5"):
        """Fetch LinkedIn search results for `search_keyword` and export them."""
        # Note: search_window is accepted but not yet applied to the URL;
        # the query below is hard-coded to the last 24 hours (f_TPR=r86400).
        search_date = pretty_date()
        # search_keyword = data["search_keyword"]
        # search_window = data["search_window"]
        # ip = driver.get('https://api.ipify.org')

        chrome_options = browser_options()
        capabilities = browser_capabilities()

        driver = webdriver.Remote(
            command_executor="http://192.168.1.101:4444/wd/hub",
            options=chrome_options,
            desired_capabilities=capabilities,
        )

        # external_ip = driver.get('https://api.ipify.org').read().decode('utf8')
        # print(external_ip)

        # quote_plus() keeps multi-word keywords URL-safe.
        url = (
            "https://www.linkedin.com/jobs/search/?f_L=United%20States&f_TPR=r86400&keywords="
            + quote_plus(search_keyword)
            + "&location=United%20States"
        )
        # "https://www.linkedin.com/jobs/search/?f_CF=f_WRA&f_L=United%20States&f_TPR=r86400&geoId=103644278&keywords=selenium&location=United%20States"

        print(url)
        driver.maximize_window()
        driver.get(url)
        driver.get(url)

        time.sleep(random_wait())
        # element = driver.find_element_by_css_selector(".zrs_close_btn")
        # element.click()

        # driver.find_element(By.CSS_SELECTOR, ".load_more_jobs").click()

        # Scroll to the bottom so lazily loaded result cards are rendered
        # before the page source is captured.
        scroll_down(driver)

        # h1 class headline

        source_data = driver.page_source
        soup = BeautifulSoup(source_data, "lxml")

        driver.save_screenshot("linkedin-scroll.png")
        driver.quit()  # end the remote session, not just the window

        # print(soup.prettify())

        # ps = soup
        # with open("test.json", "w") as outfile:
        #     json.dump(ps, outfile, indent=4)
        #     print("Exported linkedin.json")

        print("Soup pulled and browser session ended")

        # Parse the captured page and export the results. No blacklist is
        # configured in this script, so an empty list is passed.
        extract(soup, search_keyword, search_window, search_date, [], url)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Scrape LinkedIn")

    parser.add_argument(
        "-k",
        "--keywords",
        # nargs="+",
        type=str,
        dest="keywords",
        help="Keywords to search for",
    )
    parser.add_argument(
        "-d",
        "--days",
        action="store",
        dest="days",
        default="5",
        type=str,
        help="How many days back to search",
    )
    args = parser.parse_args()

    if args.keywords:
        C = LinkedinScraper(search_keyword=args.keywords, search_window=args.days)
        # C.login(EMAIL, PASSWORD)
        # C.collect("groups")

    # with open("config.json") as config_file:
    #     data = json.load(config_file)

    # bot = LinkedinScraper()
    # bot.apply()
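
    # Example invocation (script filename is assumed; requires a Selenium Grid
    # hub reachable at http://192.168.1.101:4444/wd/hub as configured above):
    #   python3 linkedin.py -k "python developer" -d 5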
|
|