"""
Scrape linkedin jobs by using selenium, to simulate the navigation
(click, scroll) and BeautifulSoup to parse the HTML code of the page
Perform a number of queries and log a number of files
for each scraped job offer.
Write dataset to mongoDB with the scraped data
"""
import argparse
from time import sleep
from bs4 import BeautifulSoup
from classes.JobScraper import JobScraper
from selenium.common.exceptions import TimeoutException
from utils import (
    connect_mongo,
    get_job_urls,
    get_unseen_urls,
    init_driver,
    load_config,
    login,
    print_scraped_data,
    scroll_job_panel,
)

parser = argparse.ArgumentParser(
    description=("Scrape LinkedIn job offers based on the "
                 "queries specified in the conf file")
)
parser.add_argument('-c', '--conf',
                    type=str,
                    metavar='',
                    required=True,
                    help='Specify the path of the configuration file')
args = parser.parse_args()
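# Example invocation (file names are illustrative):
#   python scrape_jobs.py --conf conf.yaml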
conf = load_config(args.conf)
parameters = conf["parameters"]
credentials = conf["credentials"]
CHROME_PATH = parameters["CHROME_PATH"]
CHROMEDRIVER_PATH = parameters["CHROMEDRIVER_PATH"]
QUERIES = parameters["JOB_QUERIES"]
LINUSERNAME = credentials["LINUSERNAME"]
LINPWD = credentials["LINPWD"]
MONGOUSER = credentials["MONGOUSER"]
MONGOPWD = credentials["MONGOPWD"]
HOST = parameters["HOST"]
client = connect_mongo(HOST, MONGOUSER, MONGOPWD)
db = client["linkedin"]
jobs = db["jobs"]
driver = init_driver(CHROME_PATH, CHROMEDRIVER_PATH)
driver.get("https://www.linkedin.com")
login(driver, LINUSERNAME, LINPWD)
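# Authenticate first: the job search pages expect a logged-in session.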
JOB_SEARCH_URL = "https://www.linkedin.com/jobs/search/?keywords="
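# Each query is appended as the "keywords" parameter of the search URL.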
for query in QUERIES:
    driver.get(JOB_SEARCH_URL + query)
    sleep(0.5)
    scroll_job_panel(driver)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    n_results_element = soup.find(class_="t-12 t-black--light t-normal")
    n_results_string = n_results_element.get_text()
    n_results = int(n_results_string.split()[0].replace(',', ''))
    job_urls = get_job_urls(soup)
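    # The result count (e.g. "1,234 results") drives the pagination below;
    # LinkedIn serves results in pages of 25 via the "start" parameter.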
    start = 25
    while start < n_results:
        # Rebuild the URL on every iteration so it tracks the current offset.
        url = JOB_SEARCH_URL + query + "&start=" + str(start)
        try:
            driver.get(url)
            scroll_job_panel(driver)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            job_urls.extend(get_job_urls(soup))
        except TimeoutException:
            print(
                "\nINFO :: TimeoutException raised while getting " +
                "URL\n" + url
            )
        # Advance past this page even on a timeout, so the loop cannot stall.
        start += 25
    if len(job_urls) == 0:
        print()
        print("WARNING :: Could not get any URLs for the query\n" + query)
        print("Please double-check that LinkedIn is not " +
              "blocking the query")
        continue
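    # Keep only URLs not already stored in MongoDB, so interrupted runs
    # resume where they left off.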
    unseen_urls = get_unseen_urls(jobs, job_urls)
    if len(unseen_urls) != 0:
        print("INFO :: Resuming from URL", unseen_urls[0])
    else:
        print("INFO :: All job URLs for the query " + query +
              " have already been scraped. " +
              "Moving on to the next query if any.")
        continue
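    # Visit each unseen job page and extract its fields with JobScraper.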
    for url in unseen_urls:
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        js = JobScraper(soup, url, query)
        job_data = js.get_job_data()
        # Insert only documents that are not already in the collection.
        if job_data and \
                not jobs.count_documents(job_data, limit=1):
            print_scraped_data(job_data)
            jobs.insert_one(job_data)
driver.quit()