""" |
|
Scrape linkedin jobs by using selenium, to simulate the navigation |
|
(click, scroll) and BeautifulSoup to parse the HTML code of the page |
|
Perform a number of queries and log a number of files |
|
for each scraped job offer. |
|
Write dataset to mongoDB with the scraped data |
|
|
|
""" |
|
import argparse
from time import sleep

from bs4 import BeautifulSoup
from classes.JobScraper import JobScraper
from selenium.common.exceptions import TimeoutException

from utils import (
    connect_mongo,
    get_job_urls,
    get_unseen_urls,
    init_driver,
    load_config,
    login,
    print_scraped_data,
    scroll_job_panel,
)

parser = argparse.ArgumentParser(
    description=("Scrape LinkedIn job offers based on the " +
                 "queries specified in the conf file")
)
parser.add_argument('-c', '--conf',
                    type=str,
                    metavar='',
                    required=True,
                    help='Specify the path of the configuration file')
args = parser.parse_args()
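
# Load scraping parameters and credentials from the configuration file.
# A minimal sketch of the expected structure, assuming a YAML file (the
# actual format depends on utils.load_config); the key names below are the
# ones this script reads, the values are placeholders:
#
#   parameters:
#     CHROME_PATH: /usr/bin/google-chrome
#     CHROMEDRIVER_PATH: /usr/local/bin/chromedriver
#     HOST: localhost
#     JOB_QUERIES:
#       - data engineer
#       - machine learning engineer
#   credentials:
#     LINUSERNAME: user@example.com
#     LINPWD: linkedin_password
#     MONGOUSER: mongo_user
#     MONGOPWD: mongo_password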
conf = load_config(args.conf)
parameters = conf["parameters"]
credentials = conf["credentials"]

CHROME_PATH = parameters["CHROME_PATH"]
CHROMEDRIVER_PATH = parameters["CHROMEDRIVER_PATH"]
QUERIES = parameters["JOB_QUERIES"]
LINUSERNAME = credentials["LINUSERNAME"]
LINPWD = credentials["LINPWD"]
MONGOUSER = credentials["MONGOUSER"]
MONGOPWD = credentials["MONGOPWD"]
HOST = parameters["HOST"]
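
# Connect to MongoDB and select the collection that stores the scraped job offers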
client = connect_mongo(HOST, MONGOUSER, MONGOPWD)
db = client["linkedin"]
jobs = db["jobs"]
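
# Start the browser, open LinkedIn and log in with the configured credentials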
driver = init_driver(CHROME_PATH, CHROMEDRIVER_PATH)
driver.get("https://www.linkedin.com")
login(driver, LINUSERNAME, LINPWD)

JOB_SEARCH_URL = "https://www.linkedin.com/jobs/search/?keywords="
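
# Run one job search per query, collecting the job offer URLs page by page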
for query in QUERIES:
    driver.get(JOB_SEARCH_URL + query)
    sleep(0.5)
    scroll_job_panel(driver)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # Read the total number of results reported for the query
    n_results_element = soup.find(class_="t-12 t-black--light t-normal")
    n_results_string = n_results_element.get_text()
    n_results = int(n_results_string.split()[0].replace(',', ''))
    job_urls = get_job_urls(soup)
    # Results are paginated 25 offers at a time via the "start" parameter
    start = 25
    url = JOB_SEARCH_URL + query + "&start=" + str(start)
    while start < n_results:
        try:
            driver.get(url)
            scroll_job_panel(driver)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            job_urls.extend(get_job_urls(soup))
            start += 25
            # Rebuild the URL so the next iteration requests the next page
            url = JOB_SEARCH_URL + query + "&start=" + str(start)
        except TimeoutException:
            print(
                "\nINFO :: TimeoutException raised while getting " +
                "URL\n" + url
            )
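    # Skip the query if no job URLs could be collected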
    if len(job_urls) == 0:
        print()
        print("WARNING :: Could not get any URLs for the query\n" +
              query)
        print("Please double-check that LinkedIn is not " +
              "blocking the query")
        continue
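    # Keep only the URLs that are not already stored in the jobs collection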
    unseen_urls = get_unseen_urls(jobs, job_urls)
    if len(unseen_urls) != 0:
        print("INFO :: Resuming from URL", unseen_urls[0])
    else:
        print("INFO :: All job URLs for the query " + query +
              " have already been scraped. " +
              "Moving onto the next query if any.")
        continue
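    # Scrape each unseen job offer and store it in MongoDB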
    for url in unseen_urls:
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        js = JobScraper(soup, url, query)
        job_data = js.get_job_data()
        # Insert the offer only if an identical document is not already stored
        if job_data and \
                not db["jobs"].count_documents(job_data, limit=1):
            print_scraped_data(job_data)
            jobs.insert_one(job_data)
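
# Close the browser once all queries have been processed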
driver.quit()