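"""A simple job-postings scraper for Indeed; see the usage example at the bottom of this file."""
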
import random
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm


class JobsScraper:
    """JobsScraper is a simple job postings scraper for Indeed."""

    def __init__(
        self,
        country: str,
        position: str,
        location: str,
        pages: int,
        max_delay: int = 0,
        full_urls: bool = False,
    ):
""" |
|
Create a JobsScraper object. |
|
Parameters |
|
------------ |
|
country: str |
|
Prefix country. |
|
Available countries: |
|
AE, AQ, AR, AT, AU, BE, BH, BR, CA, CH, CL, CO, |
|
CZ, DE, DK, ES, FI, FR, GB, GR, HK, HU, ID, IE, |
|
IL, IN, IT, KW, LU, MX, MY, NL, NO, NZ, OM, PE, |
|
PH, PK, PL, PT, QA, RO, RU, SA, SE, SG, TR, TW, |
|
US, VE, ZA. |
|
position: str |
|
Job position. |
|
location: str |
|
Job location. |
|
pages: int |
|
Number of pages to be scraped. Each page contains 15 results. |
|
max_delay: int, default = 0 |
|
Max number of seconds of delay for the scraping of a single posting. |
|
full_urls: bool, default = False |
|
If set to True, it shows the job url column not truncated in the DataFrame. |
|
""" |
|
        if country.upper() == "US":
            self._url = f"https://indeed.com/jobs?q={position}&l={location}"
        else:
            self._url = f"https://{country}.indeed.com/jobs?q={position}&l={location}"
        self._country = country
        self._headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
        }
        self._pages = pages
        self._max_delay = max_delay
        self._jobs = []

        # Optionally widen the pandas column display so full job urls stay visible.
        if full_urls:
            pd.set_option("display.max_colwidth", None)
        else:
            pd.reset_option("display.max_colwidth")

    def _extract_page(self, page):
        # Fetch a single results page; `page` is the value of the `start`
        # query parameter used by Indeed for pagination.
        with requests.Session() as session:
            r = session.get(
                url=f"{self._url}&start={page}",
                headers=self._headers,
            )

        soup = BeautifulSoup(r.content, "html.parser")

        return soup

    def _transform_page(self, soup):
        # Each job card sits in a div with this class; every field is
        # extracted defensively because any of them can be missing.
        jobs = soup.find_all("div", class_="jobsearch-SerpJobCard")

        for job in jobs:

            try:
                title = job.find("a", class_="jobtitle").text.strip().replace("\n", "")
            except AttributeError:
                title = None
            try:
                company = job.find("span", class_="company").text.strip().replace("\n", "")
            except AttributeError:
                company = None
            try:
                summary = job.find("div", {"class": "summary"}).text.strip().replace("\n", "")
            except AttributeError:
                summary = None

            # The location can appear either as a div or as a span.
            if job.find("div", class_="location"):
                try:
                    location = job.find("div", class_="location").text.strip().replace("\n", "")
                except AttributeError:
                    location = None
            else:
                try:
                    location = job.find("span", class_="location").text.strip().replace("\n", "")
                except AttributeError:
                    location = None
            try:
                href = job.h2.a.get("href")
                if self._country.upper() == "US":
                    job_url = f"https://indeed.com{href}"
                else:
                    job_url = f"https://{self._country}.indeed.com{href}"
            except AttributeError:
                job_url = None
            try:
                salary = job.find("span", class_="salary").text.strip().replace("\n", "")
            except AttributeError:
                salary = None

            self._jobs.append(
                {
                    "title": title,
                    "location": location,
                    "company": company,
                    "summary": summary,
                    "salary": salary,
                    "url": job_url,
                }
            )

            print(f"Scraping {title}...")

            # Optional random delay between postings to avoid hammering the site.
            if self._max_delay > 0:
                sleep(random.randint(0, self._max_delay))

    def scrape(self) -> pd.DataFrame:
        """
        Perform the scraping for the parameters provided in the class constructor.
        If duplicates are found, they get dropped.

        Returns
        ------------
        df: pd.DataFrame
            The scraped postings as a DataFrame.
        """

        # Indeed paginates with the `start` query parameter in steps of 10;
        # overlapping results across pages are removed by drop_duplicates below.
        for i in tqdm(
            range(0, self._pages * 10, 10),
            desc="Scraping in progress...",
            total=self._pages,
        ):
            page = self._extract_page(i)
            self._transform_page(page)

        df = pd.DataFrame(self._jobs)
        df.drop_duplicates(inplace=True)

        return df
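

# A minimal usage sketch. The search terms, location, and page count below are
# illustrative assumptions, not part of the class; any country prefix from the
# constructor docstring works in place of "GB".
if __name__ == "__main__":
    scraper = JobsScraper(
        country="GB",               # one of the supported country prefixes
        position="data scientist",  # hypothetical search term
        location="London",          # hypothetical location
        pages=2,                    # 2 pages -> up to ~30 postings before dedup
        max_delay=3,                # random delay of up to 3 s per posting
        full_urls=True,             # show untruncated urls in the DataFrame
    )
    df = scraper.scrape()
    print(df.head())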