import random
from time import sleep
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
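
# NOTE: Indeed's markup changes over time; the CSS class names used below
# (jobsearch-SerpJobCard, jobtitle, company, summary, location, salary)
# reflect the page layout this scraper was written against and may need
# updating if the scraper starts returning empty results.
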
class JobsScraper:
"""JobsScraper is a simple job postings scraper for Indeed."""
def __init__(
self,
country: str,
position: str,
location: str,
pages: int,
max_delay: int = 0,
full_urls: bool = False,
):
"""
Create a JobsScraper object.
Parameters
------------
        country: str
            Country prefix for the Indeed domain ("US" maps to
            indeed.com, any other code to <country>.indeed.com).
Available countries:
AE, AQ, AR, AT, AU, BE, BH, BR, CA, CH, CL, CO,
CZ, DE, DK, ES, FI, FR, GB, GR, HK, HU, ID, IE,
IL, IN, IT, KW, LU, MX, MY, NL, NO, NZ, OM, PE,
PH, PK, PL, PT, QA, RO, RU, SA, SE, SG, TR, TW,
US, VE, ZA.
position: str
Job position.
location: str
Job location.
        pages: int
            Number of result pages to scrape; the start offset advances
            by 10 per page.
        max_delay: int, default = 0
            Maximum delay, in seconds, after scraping a single posting;
            a random delay between 0 and max_delay is applied.
        full_urls: bool, default = False
            If set to True, the url column of the returned DataFrame is
            shown untruncated.
"""
        # quote_plus guards against spaces and special characters in the query terms.
        position = quote_plus(position)
        location = quote_plus(location)
        if country.upper() == "US":
            self._url = f"https://indeed.com/jobs?q={position}&l={location}"
        else:
            self._url = f"https://{country}.indeed.com/jobs?q={position}&l={location}"
self._country = country
self._headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
}
self._pages = pages
self._max_delay = max_delay
self._jobs = []
if full_urls:
pd.set_option("display.max_colwidth", None)
else:
pd.reset_option("display.max_colwidth")

    def _extract_page(self, start):
        """Fetch a single results page and return its parsed HTML."""
        with requests.Session() as session:
            r = session.get(
                url=f"{self._url}&start={start}",
                headers=self._headers,
            )
        return BeautifulSoup(r.content, "html.parser")

    @staticmethod
    def _get_text(job, tag, class_name):
        """Return the cleaned text of the first matching element, or None."""
        try:
            return job.find(tag, class_=class_name).text.strip().replace("\n", "")
        except AttributeError:
            return None

    def _transform_page(self, soup):
        jobs = soup.find_all("div", class_="jobsearch-SerpJobCard")
        for job in jobs:
            title = self._get_text(job, "a", "jobtitle")
            company = self._get_text(job, "span", "company")
            summary = self._get_text(job, "div", "summary")
            # The location is rendered as either a div or a span, depending on the card.
            if job.find("div", class_="location"):
                location = self._get_text(job, "div", "location")
            else:
                location = self._get_text(job, "span", "location")
            try:
                href = job.h2.a.get("href")
                if self._country.upper() == "US":
                    job_url = f"https://indeed.com{href}"
                else:
                    job_url = f"https://{self._country}.indeed.com{href}"
            except AttributeError:
                job_url = None
            salary = self._get_text(job, "span", "salary")
            self._jobs.append(
                {
                    "title": title,
                    "location": location,
                    "company": company,
                    "summary": summary,
                    "salary": salary,
                    "url": job_url,
                }
            )
            print(f"Scraping {title}...")
            if self._max_delay > 0:
                # Random per-posting delay to avoid hammering the server.
                sleep(random.randint(0, self._max_delay))

def scrape(self) -> pd.DataFrame:
"""
Perform the scraping for the parameters provided in the class constructor.
If duplicates are found, they get dropped.
Returns
------------
df: pd.DataFrame
Return a scraped Dataframe.
"""
        # Indeed paginates via the "start" query parameter in steps of 10.
        for start in tqdm(
            range(0, self._pages * 10, 10),
            desc="Scraping in progress...",
            total=self._pages,
        ):
            soup = self._extract_page(start)
            self._transform_page(soup)
        df = pd.DataFrame(self._jobs)
        df.drop_duplicates(inplace=True)
        return df
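

# A minimal usage sketch (not part of the original class): the query values
# below are illustrative, and running it requires network access to Indeed.
if __name__ == "__main__":
    scraper = JobsScraper(
        country="US",
        position="data scientist",
        location="Boston",
        pages=2,         # start offsets 0 and 10
        max_delay=5,     # sleep up to 5 seconds between postings
        full_urls=True,  # show untruncated URLs in the DataFrame
    )
    postings = scraper.scrape()
    print(postings.head())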