#! /usr/bin/env python3
import argparse
import csv
import datetime
import json
import random
# from requests import Session
import time
from datetime import date
from time import sleep
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait
from utils import pretty_date, random_wait, scroll_down, write_to_csv
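
# browser_options() and browser_capabilities() are used below but are not imported
# from utils above. The two helpers that follow are a minimal sketch, assuming a
# plain Chrome session on the remote Selenium grid; swap in the real helpers if
# they live elsewhere.
def browser_options():
    """Return ChromeOptions for the remote session (sketch, assumed defaults)."""
    chrome_options = Options()
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    return chrome_options


def browser_capabilities():
    """Return DesiredCapabilities for the remote session (sketch, assumed defaults)."""
    return DesiredCapabilities.CHROME.copy()
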
def extract(
    soup, search_keyword, search_window, search_date, blackListTitles, search_url=""
):
    """Parse LinkedIn guest-search result cards out of an already-built soup."""
    job_objects = []
    blacklist_job_objects = []
    datas = []
    blacklist_datas = []
    # Dump the first result card so the markup can be inspected offline.
    test = soup.find_all("li", class_="result-card")
    rr = test[0].prettify()
    with open("linkedin-snippet.html", "w") as file:
        file.write(rr)
    print("Snippet dumped")
    for job in soup.find_all("li", class_="result-card"):
        try:
            id = job.get("data-id")
            try:
                title = (
                    job.find("h3", class_="job-result-card__title")
                    .text.replace("\n", "")
                    .strip()
                )
            except AttributeError:
                title = ""
                print("no title")
            try:
                company = (
                    job.find("a", class_="job-result-card__subtitle-link")
                    .text.replace("\n", "")
                    .strip()
                )
            except AttributeError:
                company = ""
            try:
                company_url = job.find(
                    "a", class_="job-result-card__subtitle-link"
                ).get("href")
            except AttributeError:
                company_url = ""
            try:
                job_url = job.find("a", class_="result-card__full-card-link").get(
                    "href"
                )
            except AttributeError:
                job_url = ""
            try:
                location = (
                    job.find("span", class_="job-result-card__location")
                    .text.replace("\n", "")
                    .strip()
                )
            except AttributeError:
                location = ""
            job_site = "linkedin"
            try:
                data_posted_on = (
                    job.find("time", class_="job-result-card__listdate--new")
                    .text.replace("\n", "")
                    .strip()
                )
            except AttributeError:
                data_posted_on = (
                    job.find("time", class_="job-result-card__listdate")
                    .text.replace("\n", "")
                    .strip()
                )
            try:
                quick_apply = (
                    job.find("span", class_="job-result-card__easy-apply-label")
                    .text.replace("\n", "")
                    .strip()
                )
            except AttributeError:
                quick_apply = ""
            try:
                job_object = {
                    # data_job_id: {
                    "id": id,
                    "title": title,
                    "location": location,
                    "company": company,
                    # "summary": job_snippet,
                    "quick_apply": quick_apply,
                    "data_posted_on": data_posted_on,
                    # "apply_method": apply_method,
                    # "data_job_id": data_job_id,
                    "job_url": job_url,
                    "employer_url": company_url,
                    "search_url": search_url,
                    "job_site": job_site
                    # "lat": geo_lat,
                    # "lng": geo_lng
                    # },
                    # "search": {
                    #     "site": job_site,
                    #     "date": pull_date,
                    #     "window": search_window,
                    #     "keyword": search_keyword
                    # }
                }
            except Exception:
                print("job_object failed")
            print(f"Grabbed {title} - {company}")
            job_objects.append(job_object)
            datas.append(
                [
                    id,
                    title,
                    company,
                    location,
                    data_posted_on,
                    job_url,
                    job_site,
                    search_keyword,
                    quick_apply,
                    "",
                    "",
                    "USA_ENGLISH",
                    "",
                    "",
                    "",
                ]
            )
        except Exception:
            # print("attempt failed, %s" % (job_link))
            print("attempt failed")
            # datas.append([data_listing_version_key, job_site, search_keyword])
    # except:
    #     # print("attempt failed, %s" % (job_link))
    #     print("attempt failed")
    #     # print(job)
    #     # print(job_objects)
    #     # write_to_csv(ziprecruiter.csv, all_jobs)
    #     # print( tabulate(dataframe[0], headers='columns', tablefmt='psql') )
    #     #
    #     # with open("ziprecruiter" + search_date + ".json", "w") as outfile:
print("Exporting linkedin.json")
with open("linkedin.json", "w") as outfile:
json.dump(job_objects, outfile, indent=4)
print("Exporting linkedin.csv")
with open("linkedin.csv", "w+", newline="", encoding="utf-8") as save_file:
writer = csv.writer(save_file)
writer.writerow(
[
"id",
"title",
"company",
"location",
"date",
"link",
"provider",
"query",
"easy_apply",
"status",
"tags",
"locale",
"wage",
"remoteness",
"blurb",
]
)
# writer.writerow(["id", "title", "company", "location", "data_posted_on", "url"])
for data in datas:
writer.writerow(data)
print("Done!")
class LinkedinScraper:
    def __init__(self, search_keyword, search_window="5"):
        """Parameter initialization"""
        # search_date = pretty_date()
        # search_keyword = data["search_keyword"]
        # search_window = data["search_window"]
        # ip = driver.get('https://api.ipify.org')
        chrome_options = browser_options()
        capabilities = browser_capabilities()
        # Remote Selenium grid endpoint; adjust the address for your environment.
        driver = webdriver.Remote(
            command_executor="http://192.168.1.101:4444/wd/hub",
            options=chrome_options,
            desired_capabilities=capabilities,
        )
        # external_ip = driver.get('https://api.ipify.org').read().decode('utf8')
        # print(external_ip)
        # Guest job search: United States, posted in the last 24 hours (f_TPR=r86400).
        url = (
            "https://www.linkedin.com/jobs/search/?f_L=United%20States&f_TPR=r86400&keywords="
            + search_keyword
            + "&location=United%20States"
        )
        # "https://www.linkedin.com/jobs/search/?f_CF=f_WRA&f_L=United%20States&f_TPR=r86400&geoId=103644278&keywords=selenium&location=United%20States"
        print(url)
        driver.maximize_window()
        driver.get(url)
        driver.get(url)
        time.sleep(random_wait())
        # element = driver.find_element_by_css_selector(".zrs_close_btn")
        # element.click()
        # driver.find_element(By.CSS_SELECTOR, ".load_more_jobs").click()
        scroll_down(driver)
        # h1 class headline
        source_data = driver.page_source
        soup = BeautifulSoup(source_data, "lxml")
        driver.save_screenshot("linkedin-scroll.png")
        driver.close()
        # print(soup.prettify())
        # ps = soup
        # with open("test.json", "w") as outfile:
        #     json.dump(ps, outfile, indent=4)
        # print("Exported linkedin.json")
        print("Soup pulled and browser session ended")
        job_objects = []
        datas = []
        for job in soup.find_all("li", class_="result-card"):
            try:
                id = job.get("data-id")
                search_url = url
                try:
                    title = (
                        job.find("h3", class_="job-result-card__title")
                        .text.replace("\n", "")
                        .strip()
                    )
                except AttributeError:
                    title = ""
                    print("no title")
                try:
                    company = (
                        job.find("a", class_="job-result-card__subtitle-link")
                        .text.replace("\n", "")
                        .strip()
                    )
                except AttributeError:
                    company = ""
                try:
                    company_url = job.find(
                        "a", class_="job-result-card__subtitle-link"
                    ).get("href")
                except AttributeError:
                    company_url = ""
                try:
                    job_url = job.find("a", class_="result-card__full-card-link").get(
                        "href"
                    )
                except AttributeError:
                    job_url = ""
                try:
                    location = (
                        job.find("span", class_="job-result-card__location")
                        .text.replace("\n", "")
                        .strip()
                    )
                except AttributeError:
                    location = ""
                job_site = "linkedin"
                try:
                    data_posted_on = (
                        job.find("time", class_="job-result-card__listdate--new")
                        .text.replace("\n", "")
                        .strip()
                    )
                except AttributeError:
                    data_posted_on = (
                        job.find("time", class_="job-result-card__listdate")
                        .text.replace("\n", "")
                        .strip()
                    )
                try:
                    quick_apply = (
                        job.find("span", class_="job-result-card__easy-apply-label")
                        .text.replace("\n", "")
                        .strip()
                    )
                except AttributeError:
                    quick_apply = ""
                try:
                    job_object = {
                        # data_job_id: {
                        "id": id,
                        "title": title,
                        "location": location,
                        "company": company,
                        # "summary": job_snippet,
                        "quick_apply": quick_apply,
                        "data_posted_on": data_posted_on,
                        # "apply_method": apply_method,
                        # "data_job_id": data_job_id,
                        "job_url": job_url,
                        "employer_url": company_url,
                        "search_url": search_url,
                        "job_site": job_site
                        # "lat": geo_lat,
                        # "lng": geo_lng
                        # },
                        # "search": {
                        #     "site": job_site,
                        #     "date": pull_date,
                        #     "window": search_window,
                        #     "keyword": search_keyword
                        # }
                    }
                except Exception:
                    print("job_object failed")
                print(f"Grabbed {title} - {company}")
                job_objects.append(job_object)
                datas.append(
                    [
                        id,
                        title,
                        company,
                        location,
                        data_posted_on,
                        job_url,
                        job_site,
                        search_keyword,
                        quick_apply,
                        "",
                        "",
                        "USA_ENGLISH",
                        "",
                        "",
                        "",
                    ]
                )
            except Exception:
                # print("attempt failed, %s" % (job_link))
                print("attempt failed")
                # datas.append([data_listing_version_key, job_site, search_keyword])
        # except:
        #     # print("attempt failed, %s" % (job_link))
        #     print("attempt failed")
        #     # print(job)
        #     # print(job_objects)
        #     # write_to_csv(ziprecruiter.csv, all_jobs)
        #     # print( tabulate(dataframe[0], headers='columns', tablefmt='psql') )
        #     #
        #     # with open("ziprecruiter" + search_date + ".json", "w") as outfile:
print("Exporting linkedin.json")
with open("linkedin.json", "w") as outfile:
json.dump(job_objects, outfile, indent=4)
print("Exporting linkedin.csv")
with open("linkedin.csv", "w+", newline="", encoding="utf-8") as save_file:
writer = csv.writer(save_file)
writer.writerow(
[
"id",
"title",
"company",
"location",
"date",
"link",
"provider",
"query",
"easy_apply",
"status",
"tags",
"locale",
"wage",
"remoteness",
"blurb",
]
)
# writer.writerow(["id", "title", "company", "location", "data_posted_on", "url"])
for data in datas:
writer.writerow(data)
print("Done!")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Scrape LinkedIn job listings")
    parser.add_argument(
        "-k",
        "--keywords",
        # nargs="+",
        type=str,
        dest="keywords",
        help="List the keywords you want to scrape for",
    )
    parser.add_argument(
        "-d",
        "--days",
        action="store",
        dest="days",
        default="5",
        type=str,
        help="How many days",
    )
    args = parser.parse_args()
    if args.keywords:
        C = LinkedinScraper(search_keyword=args.keywords, search_window=args.days)
        # C.login(EMAIL, PASSWORD)
        # C.collect("groups")
    # with open("config.json") as config_file:
    #     data = json.load(config_file)
    # bot = LinkedinScraper()
    # bot.apply()
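
# Example invocation (the script file name is a placeholder; assumes a Selenium grid
# is reachable at the command_executor address configured above):
#   python3 linkedin_scraper.py --keywords "data engineer" --days 3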