#! /usr/bin/env python3
import argparse
import csv
import datetime
import json
import random
# from requests import Session
import time
from datetime import date
from time import sleep
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait
from utils import pretty_date, random_wait, scroll_down, write_to_csv
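
# browser_options() and browser_capabilities() are used below but are not imported
# from utils above. The two helpers that follow are a minimal sketch, assuming a
# plain Chrome session on the remote Selenium grid; swap in the real helpers if
# they live elsewhere.
def browser_options():
    """Return ChromeOptions for the remote session (sketch, assumed defaults)."""
    chrome_options = Options()
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    return chrome_options


def browser_capabilities():
    """Return DesiredCapabilities for the remote session (sketch, assumed defaults)."""
    return DesiredCapabilities.CHROME.copy()
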
def extract(
    soup, search_keyword, search_window, search_date, blackListTitles, search_url=""
):
    """Parse LinkedIn guest-search result cards out of an already-built soup."""
    job_objects = []
    blacklist_job_objects = []
    datas = []
    blacklist_datas = []
    # Dump the first result card so the markup can be inspected offline.
    test = soup.find_all("li", class_="result-card")
    rr = test[0].prettify()
    with open("linkedin-snippet.html", "w") as file:
        file.write(rr)
    print("Snippet dumped")
    for job in soup.find_all("li", class_="result-card"):
        try:
            id = job.get("data-id")
            try:
                title = (
                    job.find("h3", class_="job-result-card__title")
                    .text.replace("\n", "")
                    .strip()
                )
            except AttributeError:
                title = ""
                print("no title")
            try:
                company = (
                    job.find("a", class_="job-result-card__subtitle-link")
                    .text.replace("\n", "")
                    .strip()
                )
            except AttributeError:
                company = ""
            try:
                company_url = job.find(
                    "a", class_="job-result-card__subtitle-link"
                ).get("href")
            except AttributeError:
                company_url = ""
            try:
                job_url = job.find("a", class_="result-card__full-card-link").get(
                    "href"
                )
            except AttributeError:
                job_url = ""
            try:
                location = (
                    job.find("span", class_="job-result-card__location")
                    .text.replace("\n", "")
                    .strip()
                )
            except AttributeError:
                location = ""
            job_site = "linkedin"
            try:
                data_posted_on = (
                    job.find("time", class_="job-result-card__listdate--new")
                    .text.replace("\n", "")
                    .strip()
                )
            except AttributeError:
                data_posted_on = (
                    job.find("time", class_="job-result-card__listdate")
                    .text.replace("\n", "")
                    .strip()
                )
            try:
                quick_apply = (
                    job.find("span", class_="job-result-card__easy-apply-label")
                    .text.replace("\n", "")
                    .strip()
                )
            except AttributeError:
                quick_apply = ""
            try:
                job_object = {
                    # data_job_id: {
                    "id": id,
                    "title": title,
                    "location": location,
                    "company": company,
                    # "summary": job_snippet,
                    "quick_apply": quick_apply,
                    "data_posted_on": data_posted_on,
                    # "apply_method": apply_method,
                    # "data_job_id": data_job_id,
                    "job_url": job_url,
                    "employer_url": company_url,
                    "search_url": search_url,
                    "job_site": job_site
                    # "lat": geo_lat,
                    # "lng": geo_lng
                    # },
                    # "search": {
                    #     "site": job_site,
                    #     "date": pull_date,
                    #     "window": search_window,
                    #     "keyword": search_keyword
                    # }
                }
            except Exception:
                print("job_object failed")
            print(f"Grabbed {title} - {company}")
            job_objects.append(job_object)
            datas.append(
                [
                    id,
                    title,
                    company,
                    location,
                    data_posted_on,
                    job_url,
                    job_site,
                    search_keyword,
                    quick_apply,
                    "",
                    "",
                    "USA_ENGLISH",
                    "",
                    "",
                    "",
                ]
            )
        except Exception:
            # print("attempt failed, %s" % (job_link))
            print("attempt failed")
            # datas.append([data_listing_version_key, job_site, search_keyword])
    # except:
    #     # print("attempt failed, %s" % (job_link))
    #     print("attempt failed")
    #     # print(job)
    #     # print(job_objects)
    #     # write_to_csv(ziprecruiter.csv, all_jobs)
    #     # print( tabulate(dataframe[0], headers='columns', tablefmt='psql') )
    #     #
    #     # with open("ziprecruiter" + search_date + ".json", "w") as outfile:
print("Exporting linkedin.json")
with open("linkedin.json", "w") as outfile:
json.dump(job_objects, outfile, indent=4)
print("Exporting linkedin.csv")
with open("linkedin.csv", "w+", newline="", encoding="utf-8") as save_file:
writer = csv.writer(save_file)
writer.writerow(
[
"id",
"title",
"company",
"location",
"date",
"link",
"provider",
"query",
"easy_apply",
"status",
"tags",
"locale",
"wage",
"remoteness",
"blurb",
]
)
# writer.writerow(["id", "title", "company", "location", "data_posted_on", "url"])
for data in datas:
writer.writerow(data)
print("Done!")
class LinkedinScraper:
    def __init__(self, search_keyword, search_window="5"):
        """Parameter initialization"""
        # search_date = pretty_date()
        # search_keyword = data["search_keyword"]
        # search_window = data["search_window"]
        # ip = driver.get('https://api.ipify.org')
        chrome_options = browser_options()
        capabilities = browser_capabilities()
        # Remote Selenium grid endpoint; adjust the address for your environment.
        driver = webdriver.Remote(
            command_executor="http://192.168.1.101:4444/wd/hub",
            options=chrome_options,
            desired_capabilities=capabilities,
        )
        # external_ip = driver.get('https://api.ipify.org').read().decode('utf8')
        # print(external_ip)
        # Guest job search: United States, posted in the last 24 hours (f_TPR=r86400).
        url = (
            "https://www.linkedin.com/jobs/search/?f_L=United%20States&f_TPR=r86400&keywords="
            + search_keyword
            + "&location=United%20States"
        )
        # "https://www.linkedin.com/jobs/search/?f_CF=f_WRA&f_L=United%20States&f_TPR=r86400&geoId=103644278&keywords=selenium&location=United%20States"
        print(url)
        driver.maximize_window()
        driver.get(url)
        driver.get(url)
        time.sleep(random_wait())
        # element = driver.find_element_by_css_selector(".zrs_close_btn")
        # element.click()
        # driver.find_element(By.CSS_SELECTOR, ".load_more_jobs").click()
        scroll_down(driver)
        # h1 class headline
        source_data = driver.page_source
        soup = BeautifulSoup(source_data, "lxml")
        driver.save_screenshot("linkedin-scroll.png")
        driver.close()
        # print(soup.prettify())
        # ps = soup
        # with open("test.json", "w") as outfile:
        #     json.dump(ps, outfile, indent=4)
        # print("Exported linkedin.json")
        print("Soup pulled and browser session ended")
        job_objects = []
        datas = []
        for job in soup.find_all("li", class_="result-card"):
            try:
                id = job.get("data-id")
                search_url = url
                try:
                    title = (
                        job.find("h3", class_="job-result-card__title")
                        .text.replace("\n", "")
                        .strip()
                    )
                except AttributeError:
                    title = ""
                    print("no title")
                try:
                    company = (
                        job.find("a", class_="job-result-card__subtitle-link")
                        .text.replace("\n", "")
                        .strip()
                    )
                except AttributeError:
                    company = ""
                try:
                    company_url = job.find(
                        "a", class_="job-result-card__subtitle-link"
                    ).get("href")
                except AttributeError:
                    company_url = ""
                try:
                    job_url = job.find("a", class_="result-card__full-card-link").get(
                        "href"
                    )
                except AttributeError:
                    job_url = ""
                try:
                    location = (
                        job.find("span", class_="job-result-card__location")
                        .text.replace("\n", "")
                        .strip()
                    )
                except AttributeError:
                    location = ""
                job_site = "linkedin"
                try:
                    data_posted_on = (
                        job.find("time", class_="job-result-card__listdate--new")
                        .text.replace("\n", "")
                        .strip()
                    )
                except AttributeError:
                    data_posted_on = (
                        job.find("time", class_="job-result-card__listdate")
                        .text.replace("\n", "")
                        .strip()
                    )
                try:
                    quick_apply = (
                        job.find("span", class_="job-result-card__easy-apply-label")
                        .text.replace("\n", "")
                        .strip()
                    )
                except AttributeError:
                    quick_apply = ""
                try:
                    job_object = {
                        # data_job_id: {
                        "id": id,
                        "title": title,
                        "location": location,
                        "company": company,
                        # "summary": job_snippet,
                        "quick_apply": quick_apply,
                        "data_posted_on": data_posted_on,
                        # "apply_method": apply_method,
                        # "data_job_id": data_job_id,
                        "job_url": job_url,
                        "employer_url": company_url,
                        "search_url": search_url,
                        "job_site": job_site
                        # "lat": geo_lat,
                        # "lng": geo_lng
                        # },
                        # "search": {
                        #     "site": job_site,
                        #     "date": pull_date,
                        #     "window": search_window,
                        #     "keyword": search_keyword
                        # }
                    }
                except Exception:
                    print("job_object failed")
                print(f"Grabbed {title} - {company}")
                job_objects.append(job_object)
                datas.append(
                    [
                        id,
                        title,
                        company,
                        location,
                        data_posted_on,
                        job_url,
                        job_site,
                        search_keyword,
                        quick_apply,
                        "",
                        "",
                        "USA_ENGLISH",
                        "",
                        "",
                        "",
                    ]
                )
            except Exception:
                # print("attempt failed, %s" % (job_link))
                print("attempt failed")
                # datas.append([data_listing_version_key, job_site, search_keyword])
        # except:
        #     # print("attempt failed, %s" % (job_link))
        #     print("attempt failed")
        #     # print(job)
        #     # print(job_objects)
        #     # write_to_csv(ziprecruiter.csv, all_jobs)
        #     # print( tabulate(dataframe[0], headers='columns', tablefmt='psql') )
        #     #
        #     # with open("ziprecruiter" + search_date + ".json", "w") as outfile:
print("Exporting linkedin.json")
with open("linkedin.json", "w") as outfile:
json.dump(job_objects, outfile, indent=4)
print("Exporting linkedin.csv")
with open("linkedin.csv", "w+", newline="", encoding="utf-8") as save_file:
writer = csv.writer(save_file)
writer.writerow(
[
"id",
"title",
"company",
"location",
"date",
"link",
"provider",
"query",
"easy_apply",
"status",
"tags",
"locale",
"wage",
"remoteness",
"blurb",
]
)
# writer.writerow(["id", "title", "company", "location", "data_posted_on", "url"])
for data in datas:
writer.writerow(data)
print("Done!")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Scrape LinkedIn job listings")
    parser.add_argument(
        "-k",
        "--keywords",
        # nargs="+",
        type=str,
        dest="keywords",
        help="List the keywords you want to scrape for",
    )
    parser.add_argument(
        "-d",
        "--days",
        action="store",
        dest="days",
        default="5",
        type=str,
        help="How many days",
    )
    args = parser.parse_args()
    if args.keywords:
        C = LinkedinScraper(search_keyword=args.keywords, search_window=args.days)
        # C.login(EMAIL, PASSWORD)
        # C.collect("groups")
    # with open("config.json") as config_file:
    #     data = json.load(config_file)
    # bot = LinkedinScraper()
    # bot.apply()
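
# Example invocation (the script file name is a placeholder; assumes a Selenium grid
# is reachable at the command_executor address configured above):
#   python3 linkedin_scraper.py --keywords "data engineer" --days 3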