#!/usr/bin/env python3
import argparse
import csv
import datetime
import json
import random

# from requests import Session
import time
from datetime import date
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait

from utils import pretty_date, random_wait, scroll_down, write_to_csv


class LinkedinScraper:
    def __init__(self, search_keyword="devops", search_window="5"):
        """Parameter initialization"""
        self.search_keyword = search_keyword
        self.search_window = search_window
        search_date = pretty_date()
        # search_keyword = data["search_keyword"]
        # search_window = data["search_window"]
        PROXY = "192.168.1.101:8889"

        # def get_default_chrome_options():
        # chrome_options = webdriver.ChromeOptions()
        chrome_options = Options()
        # chrome_options.add_argument('--headless')
        chrome_options.add_argument("--disable-infobars")
        chrome_options.add_argument("--disable-extensions")
        # chrome_options.add_argument('disable-blink-features=AutomationControlled')
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--incognito")
        chrome_options.add_argument("--start-maximized")
        chrome_options.add_argument("--disable-blink-features")
        chrome_options.add_argument("--verbose")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--no-default-browser-check")  # Overrides default choices
        chrome_options.add_argument("--no-first-run")
        chrome_options.add_argument("--disable-default-apps")
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_argument(
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"
        )
        chrome_options.page_load_strategy = "normal"

        # Capabilities for a remote Selenoid/Selenium Grid session, routing traffic
        # through the proxy defined above.
        capabilities = {
            "browserName": "chrome",
            "browserVersion": "latest",
            "pageLoadStrategy": "normal",
            "javascriptEnabled": True,
            "selenoid:options": {
                "enableVNC": True,
                "enableVideo": True,
                "enableLog": True,
                "videoName": f"Linkedin-{datetime.datetime.now()}.mp4",
                "logName": f"Linkedin-{datetime.datetime.now()}.log",
                "name": "Chrome",
            },
            "proxy": {
                "httpProxy": PROXY,
                "ftpProxy": PROXY,
                "sslProxy": PROXY,
                "proxyType": "MANUAL",
            },
        }

        url = "https://www.linkedin.com/jobs/view/linux-systems-kubernetes-docker-at-motion-recruitment-2438784506"

        # Note: desired_capabilities was removed in Selenium 4.10+; on newer
        # releases, pass these values via chrome_options.set_capability() instead.
        driver = webdriver.Remote(
            command_executor="http://192.168.1.101:4444/wd/hub",
            options=chrome_options,
            desired_capabilities=capabilities,
        )

        # driver.get() returns None, so read the page body to get the public IP
        # as seen by the browser (and therefore by the configured proxy).
        driver.get("https://api.ipify.org")
        ip = driver.find_element(By.TAG_NAME, "body").text
        print("My public IP address is: {}".format(ip))

        driver.maximize_window()
        driver.get(url)
        time.sleep(random_wait())

        # element = driver.find_element_by_css_selector(".zrs_close_btn")
        # element.click()

        # Expand the truncated job description before grabbing the page source.
        element = driver.find_element(
            By.CSS_SELECTOR, ".show-more-less-html__button--more"
        )
        element.click()
        # scroll_down(driver)

        source_data = driver.page_source
        soup = BeautifulSoup(source_data, "lxml")
        driver.save_screenshot("linkedin-deep.png")
        driver.close()
        # element = driver.find_element_by_css_selector('.some-css.selector')
        # element.screenshot_as_png('element.png')
        # print(soup.prettify())

        # description = soup.find("section", class_="description").text.replace("\n", "").strip()
        description = soup.find("section", class_="description").text
        # ps = soup
        features = soup.find("ul", class_="job-criteria__list").text
        external_link = soup.find("a", class_="apply-button--link").get("href")
        print(description)
        print(features)
        print(external_link)
        # with open("test.json", "w") as outfile:
        #     json.dump(ps, outfile, indent=4)
        #     print("Exported linkedin.json")
        print("Soup pulled and browser session ended")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Scrape LinkedIn job postings")
    parser.add_argument(
        "-k",
        "--keywords",
        # nargs="+",
        type=str,
        dest="keywords",
        help="List the keywords you want to scrape for",
    )
    parser.add_argument(
        "-d",
        "--days",
        action="store",
        dest="days",
        default="5",
        type=str,
        help="How many days back to search",
    )
    args = parser.parse_args()

    if args.keywords:
        C = LinkedinScraper(search_keyword=args.keywords, search_window=args.days)
        # C.login(EMAIL, PASSWORD)
        # C.collect("groups")
    else:
        # with open("config.json") as config_file:
        #     data = json.load(config_file)
        bot = LinkedinScraper()
        # bot.apply()