""" Scrape linkedin jobs by using selenium, to simulate the navigation (click, scroll) and BeautifulSoup to parse the HTML code of the page Perform a number of queries and log a number of files for each scraped job offer. Write dataset to mongoDB with the scraped data """ import argparse from time import sleep from bs4 import BeautifulSoup from classes.JobScraper import JobScraper from selenium.common.exceptions import TimeoutException from utils import ( connect_mongo, get_job_urls, get_unseen_urls, init_driver, load_config, login, print_scraped_data, scroll_job_panel, ) parser = argparse.ArgumentParser( description=("Scrape linkedin job offers based on the " + "queries specified in the conf file") ) parser.add_argument('-c', '--conf', type=str, metavar='', required=True, help='Specify the path of the configuration file') args = parser.parse_args() conf = load_config(args.conf) parameters = conf["parameters"] credentials = conf["credentials"] CHROME_PATH = parameters["CHROME_PATH"] CHROMEDRIVER_PATH = parameters["CHROMEDRIVER_PATH"] QUERIES = parameters["JOB_QUERIES"] LINUSERNAME = credentials["LINUSERNAME"] LINPWD = credentials["LINPWD"] MONGOUSER = credentials["MONGOUSER"] MONGOPWD = credentials["MONGOPWD"] HOST = parameters["HOST"] client = connect_mongo(HOST, MONGOUSER, MONGOPWD) db = client["linkedin"] jobs = db["jobs"] driver = init_driver(CHROME_PATH, CHROMEDRIVER_PATH) driver.get("https://www.linkedin.com") login(driver, LINUSERNAME, LINPWD) JOB_SEARCH_URL = "https://www.linkedin.com/jobs/search/?keywords=" for query in QUERIES: driver.get(JOB_SEARCH_URL + query) sleep(0.5) scroll_job_panel(driver) soup = BeautifulSoup(driver.page_source, 'html.parser') n_results_element = soup.find(class_="t-12 t-black--light t-normal") n_results_string = n_results_element.get_text() n_results = int(n_results_string.split()[0].replace(',', '')) job_urls = get_job_urls(soup) start = 25 url = JOB_SEARCH_URL + query + "&start=" + str(start) while start < n_results: try: driver.get(url) scroll_job_panel(driver) soup = BeautifulSoup(driver.page_source, 'html.parser') job_urls.extend(get_job_urls(soup)) start += 25 except TimeoutException: print( "\nINFO :: TimeoutException raised while getting " + "URL\n" + url ) if len(job_urls) == 0: print() print("WARNING :: Could not get any URLs for the query\n" + query) print("Please double-check that LinkedIn is not " + "blocking the query") continue unseen_urls = get_unseen_urls(jobs, job_urls) if len(unseen_urls) != 0: print("INFO :: Resuming from URL", unseen_urls[0]) else: print("INFO :: All job URLs for the query " + query + " have already been scraped. " + "Moving onto the next query if any.") continue for url in unseen_urls: driver.get(url) class Resume:r.page_source, 'html.parser') js = JobScraper(soup, url, query) job_data = js.get_job_data() if job_data and\ not db["jobs"].count_documents(job_data, limit=1): print_scraped_data(job_data) jobs.insert_one(job_data) driver.quit() class Resume: