""" |
|
Scrape linkedin jobs by using selenium, to simulate the navigation |
|
(click, scroll) and BeautifulSoup to parse the HTML code of the page |
|
Perform a number of queries and log a number of files |
|
for each scraped job offer. |
|
Write dataset to mongoDB with the scraped data |
|
|
|
""" |
|
import argparse
from time import sleep

from bs4 import BeautifulSoup
from classes.JobScraper import JobScraper
from selenium.common.exceptions import TimeoutException

from utils import (
    connect_mongo,
    get_job_urls,
    get_unseen_urls,
    init_driver,
    load_config,
    login,
    print_scraped_data,
    scroll_job_panel,
)

parser = argparse.ArgumentParser(
    description=("Scrape LinkedIn job offers based on the " +
                 "queries specified in the conf file")
)
parser.add_argument('-c', '--conf',
                    type=str,
                    metavar='',
                    required=True,
                    help='Specify the path of the configuration file')
args = parser.parse_args()
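
# Load scraping parameters and credentials from the configuration file.
# A minimal sketch of the expected structure, assuming a YAML file (the
# actual format depends on utils.load_config); the key names below are the
# ones this script reads, the values are placeholders:
#
#   parameters:
#     CHROME_PATH: /usr/bin/google-chrome
#     CHROMEDRIVER_PATH: /usr/local/bin/chromedriver
#     HOST: localhost
#     JOB_QUERIES:
#       - data engineer
#       - machine learning engineer
#   credentials:
#     LINUSERNAME: user@example.com
#     LINPWD: linkedin_password
#     MONGOUSER: mongo_user
#     MONGOPWD: mongo_password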
conf = load_config(args.conf)
parameters = conf["parameters"]
credentials = conf["credentials"]

CHROME_PATH = parameters["CHROME_PATH"]
CHROMEDRIVER_PATH = parameters["CHROMEDRIVER_PATH"]
QUERIES = parameters["JOB_QUERIES"]
LINUSERNAME = credentials["LINUSERNAME"]
LINPWD = credentials["LINPWD"]
MONGOUSER = credentials["MONGOUSER"]
MONGOPWD = credentials["MONGOPWD"]
HOST = parameters["HOST"]
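
# Connect to MongoDB and select the collection that stores the scraped job offers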
client = connect_mongo(HOST, MONGOUSER, MONGOPWD)
db = client["linkedin"]
jobs = db["jobs"]
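
# Start the browser, open LinkedIn and log in with the configured credentials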
driver = init_driver(CHROME_PATH, CHROMEDRIVER_PATH)
driver.get("https://www.linkedin.com")
login(driver, LINUSERNAME, LINPWD)

JOB_SEARCH_URL = "https://www.linkedin.com/jobs/search/?keywords="
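
# Run one job search per query, collecting the job offer URLs page by page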
for query in QUERIES:
    driver.get(JOB_SEARCH_URL + query)
    sleep(0.5)
    scroll_job_panel(driver)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # Read the total number of results reported for the query
    n_results_element = soup.find(class_="t-12 t-black--light t-normal")
    n_results_string = n_results_element.get_text()
    n_results = int(n_results_string.split()[0].replace(',', ''))
    job_urls = get_job_urls(soup)
    # Results are paginated 25 offers at a time via the "start" parameter
    start = 25
    url = JOB_SEARCH_URL + query + "&start=" + str(start)
    while start < n_results:
        try:
            driver.get(url)
            scroll_job_panel(driver)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            job_urls.extend(get_job_urls(soup))
            start += 25
            # Rebuild the URL so the next iteration requests the next page
            url = JOB_SEARCH_URL + query + "&start=" + str(start)
        except TimeoutException:
            print(
                "\nINFO :: TimeoutException raised while getting " +
                "URL\n" + url
            )
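    # Skip the query if no job URLs could be collected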
    if len(job_urls) == 0:
        print()
        print("WARNING :: Could not get any URLs for the query\n" +
              query)
        print("Please double-check that LinkedIn is not " +
              "blocking the query")
        continue
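    # Keep only the URLs that are not already stored in the jobs collection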
    unseen_urls = get_unseen_urls(jobs, job_urls)
    if len(unseen_urls) != 0:
        print("INFO :: Resuming from URL", unseen_urls[0])
    else:
        print("INFO :: All job URLs for the query " + query +
              " have already been scraped. " +
              "Moving onto the next query if any.")
        continue
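    # Scrape each unseen job offer and store it in MongoDB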
    for url in unseen_urls:
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        js = JobScraper(soup, url, query)
        job_data = js.get_job_data()
        # Insert the offer only if an identical document is not already stored
        if job_data and \
                not db["jobs"].count_documents(job_data, limit=1):
            print_scraped_data(job_data)
            jobs.insert_one(job_data)
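
# Close the browser once all queries have been processed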
driver.quit()