import random
from time import sleep
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
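
# NOTE: Indeed's markup changes over time; the CSS class names used below
# (jobsearch-SerpJobCard, jobtitle, company, summary, location, salary)
# reflect the page layout this scraper was written against and may need
# updating if the scraper starts returning empty results.
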
class JobsScraper:
"""JobsScraper is a simple job postings scraper for Indeed."""
def __init__(
self,
country: str,
position: str,
location: str,
pages: int,
max_delay: int = 0,
full_urls: bool = False,
):
"""
Create a JobsScraper object.
Parameters
------------
        country: str
            Country prefix for the Indeed domain ("US" maps to
            indeed.com, any other code to <country>.indeed.com).
Available countries:
AE, AQ, AR, AT, AU, BE, BH, BR, CA, CH, CL, CO,
CZ, DE, DK, ES, FI, FR, GB, GR, HK, HU, ID, IE,
IL, IN, IT, KW, LU, MX, MY, NL, NO, NZ, OM, PE,
PH, PK, PL, PT, QA, RO, RU, SA, SE, SG, TR, TW,
US, VE, ZA.
position: str
Job position.
location: str
Job location.
        pages: int
            Number of result pages to scrape; the start offset advances
            by 10 per page.
        max_delay: int, default = 0
            Maximum delay, in seconds, after scraping a single posting;
            a random delay between 0 and max_delay is applied.
        full_urls: bool, default = False
            If set to True, the url column of the returned DataFrame is
            shown untruncated.
"""
        # quote_plus guards against spaces and special characters in the query terms.
        position = quote_plus(position)
        location = quote_plus(location)
        if country.upper() == "US":
            self._url = f"https://indeed.com/jobs?q={position}&l={location}"
        else:
            self._url = f"https://{country}.indeed.com/jobs?q={position}&l={location}"
self._country = country
self._headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
}
self._pages = pages
self._max_delay = max_delay
self._jobs = []
if full_urls:
pd.set_option("display.max_colwidth", None)
else:
pd.reset_option("display.max_colwidth")

    def _extract_page(self, start):
        """Fetch a single results page and return its parsed HTML."""
        with requests.Session() as session:
            r = session.get(
                url=f"{self._url}&start={start}",
                headers=self._headers,
            )
        return BeautifulSoup(r.content, "html.parser")

    @staticmethod
    def _get_text(job, tag, class_name):
        """Return the cleaned text of the first matching element, or None."""
        try:
            return job.find(tag, class_=class_name).text.strip().replace("\n", "")
        except AttributeError:
            return None

    def _transform_page(self, soup):
        jobs = soup.find_all("div", class_="jobsearch-SerpJobCard")
        for job in jobs:
            title = self._get_text(job, "a", "jobtitle")
            company = self._get_text(job, "span", "company")
            summary = self._get_text(job, "div", "summary")
            # The location is rendered as either a div or a span, depending on the card.
            if job.find("div", class_="location"):
                location = self._get_text(job, "div", "location")
            else:
                location = self._get_text(job, "span", "location")
            try:
                href = job.h2.a.get("href")
                if self._country.upper() == "US":
                    job_url = f"https://indeed.com{href}"
                else:
                    job_url = f"https://{self._country}.indeed.com{href}"
            except AttributeError:
                job_url = None
            salary = self._get_text(job, "span", "salary")
            self._jobs.append(
                {
                    "title": title,
                    "location": location,
                    "company": company,
                    "summary": summary,
                    "salary": salary,
                    "url": job_url,
                }
            )
            print(f"Scraping {title}...")
            if self._max_delay > 0:
                # Random per-posting delay to avoid hammering the server.
                sleep(random.randint(0, self._max_delay))

def scrape(self) -> pd.DataFrame:
"""
Perform the scraping for the parameters provided in the class constructor.
If duplicates are found, they get dropped.
Returns
------------
df: pd.DataFrame
Return a scraped Dataframe.
"""
        # Indeed paginates via the "start" query parameter in steps of 10.
        for start in tqdm(
            range(0, self._pages * 10, 10),
            desc="Scraping in progress...",
            total=self._pages,
        ):
            soup = self._extract_page(start)
            self._transform_page(soup)
        df = pd.DataFrame(self._jobs)
        df.drop_duplicates(inplace=True)
        return df
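

# A minimal usage sketch (not part of the original class): the query values
# below are illustrative, and running it requires network access to Indeed.
if __name__ == "__main__":
    scraper = JobsScraper(
        country="US",
        position="data scientist",
        location="Boston",
        pages=2,         # start offsets 0 and 10
        max_delay=5,     # sleep up to 5 seconds between postings
        full_urls=True,  # show untruncated URLs in the DataFrame
    )
    postings = scraper.scrape()
    print(postings.head())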