#!/usr/bin/env python3
import argparse
import datetime
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

from utils import pretty_date, random_wait, scroll_down, write_to_csv
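
# The utils module is not shown in this repo view. A minimal sketch of the
# assumed helper behavior (signatures and bodies are assumptions, not the
# actual utils code):
#
#   def pretty_date() -> str:        # today's date as a readable string
#       return datetime.date.today().isoformat()
#
#   def random_wait() -> float:      # jittered delay for time.sleep()
#       return random.uniform(2, 8)
#
#   def scroll_down(driver) -> None: # scroll to the bottom of the page
#       driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
#
#   def write_to_csv(rows, path):    # persist scraped rows to disk
#       ...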

class LinkedinScraper:
    def __init__(self, search_keyword="devops", search_window="5"):
        """Scrape a LinkedIn job posting through a remote Selenium grid.

        NOTE: search_keyword and search_window are accepted but not yet
        used; the job URL below is currently hardcoded.
        """
        search_date = pretty_date()  # currently unused
        PROXY = "192.168.1.101:8889"

        chrome_options = Options()
        # chrome_options.add_argument("--headless")  # keep the browser visible for VNC debugging
        chrome_options.add_argument("--disable-infobars")
        chrome_options.add_argument("--disable-extensions")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--incognito")
        chrome_options.add_argument("--start-maximized")
        chrome_options.add_argument("--verbose")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--no-default-browser-check")  # overrides default choices
        chrome_options.add_argument("--no-first-run")
        chrome_options.add_argument("--disable-default-apps")
        # Hide the navigator.webdriver automation flag from the page
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_argument(
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"
        )
        chrome_options.page_load_strategy = "normal"

        # Selenoid-specific capabilities: enable VNC, video recording, and
        # session logs, and route all traffic through the manual proxy above.
        capabilities = {
            "browserVersion": "latest",
            "selenoid:options": {
                "enableVNC": True,
                "enableVideo": True,
                "enableLog": True,
                "videoName": f"Linkedin-{datetime.datetime.now()}.mp4",
                "logName": f"Linkedin-{datetime.datetime.now()}.log",
                "name": "Chrome",
            },
            "proxy": {
                "httpProxy": PROXY,
                "ftpProxy": PROXY,
                "sslProxy": PROXY,
                "proxyType": "MANUAL",
            },
        }
        # Selenium 4 dropped the desired_capabilities keyword; attach the
        # extra capabilities to the options object instead.
        for name, value in capabilities.items():
            chrome_options.set_capability(name, value)

        url = "https://www.linkedin.com/jobs/view/linux-systems-kubernetes-docker-at-motion-recruitment-2438784506"
        driver = webdriver.Remote(
            command_executor="http://192.168.1.101:4444/wd/hub",
            options=chrome_options,
        )
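
        # The grid above is assumed to be a Selenoid instance already
        # running on 192.168.1.101. A minimal sketch of one common way to
        # start it (paths and image tags are assumptions, not part of this
        # repo):
        #
        #   docker run -d --name selenoid -p 4444:4444 \
        #       -v /var/run/docker.sock:/var/run/docker.sock \
        #       -v $PWD/config/:/etc/selenoid/:ro \
        #       -v $PWD/video/:/opt/selenoid/video/ \
        #       -e OVERRIDE_VIDEO_OUTPUT_DIR=$PWD/video/ \
        #       aerokube/selenoid:latest-release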

        # Sanity-check that the proxy is in effect. driver.get() returns
        # None, so navigate first and read the response body from the DOM.
        driver.get("https://api.ipify.org")
        ip = driver.find_element(By.TAG_NAME, "body").text
        print("My public IP address is: {}".format(ip))

        driver.maximize_window()
        # Load the job page twice: the first request often lands on
        # LinkedIn's login overlay, and the reload settles past it.
        driver.get(url)
        driver.get(url)
        time.sleep(random_wait())

        # Expand the truncated job description before grabbing the source
        element = driver.find_element(
            By.CSS_SELECTOR, ".show-more-less-html__button--more"
        )
        element.click()
        # scroll_down(driver)

        source_data = driver.page_source
        soup = BeautifulSoup(source_data, "lxml")
        driver.save_screenshot("linkedin-deep.png")
        # quit() rather than close() so the remote session ends and
        # Selenoid finalizes its video and log files.
        driver.quit()

        # Pull the fields of interest out of the parsed page
        description = soup.find("section", class_="description").text.strip()
        features = soup.find("ul", class_="job-criteria__list").text
        external_link = soup.find("a", class_="apply-button--link").get("href")
        print(description)
        print(features)
        print(external_link)
        print("Soup pulled and browser session ended")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Scrape LinkedIn job postings")
    parser.add_argument(
        "-k",
        "--keywords",
        type=str,
        dest="keywords",
        help="Keywords to scrape for",
    )
    parser.add_argument(
        "-d",
        "--days",
        action="store",
        dest="days",
        default="5",
        type=str,
        help="How many days back to search",
    )
    args = parser.parse_args()

    if args.keywords:
        bot = LinkedinScraper(search_keyword=args.keywords, search_window=args.days)
    else:
        # No keywords given: fall back to the class defaults
        bot = LinkedinScraper()
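
# Example invocation (the filename is an assumption; the flags match the
# argparse definitions above):
#
#   python3 linkedin_scraper.py --keywords devops --days 7
#
# Without flags, the scraper runs with its defaults ("devops", 5 days).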