#!/usr/bin/env python3
import argparse
import datetime
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

from utils import pretty_date, random_wait, scroll_down, write_to_csv
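
# The utils module is not shown in this repo view. A minimal sketch of the
# assumed helper behavior (signatures and bodies are assumptions, not the
# actual utils code):
#
#   def pretty_date() -> str:        # today's date as a readable string
#       return datetime.date.today().isoformat()
#
#   def random_wait() -> float:      # jittered delay for time.sleep()
#       return random.uniform(2, 8)
#
#   def scroll_down(driver) -> None: # scroll to the bottom of the page
#       driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
#
#   def write_to_csv(rows, path):    # persist scraped rows to disk
#       ...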

class LinkedinScraper:
    def __init__(self, search_keyword="devops", search_window="5"):
        """Scrape a LinkedIn job posting through a remote Selenium grid.

        NOTE: search_keyword and search_window are accepted but not yet
        used; the job URL below is currently hardcoded.
        """
        search_date = pretty_date()  # currently unused
        PROXY = "192.168.1.101:8889"

        chrome_options = Options()
        # chrome_options.add_argument("--headless")  # keep the browser visible for VNC debugging
        chrome_options.add_argument("--disable-infobars")
        chrome_options.add_argument("--disable-extensions")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--incognito")
        chrome_options.add_argument("--start-maximized")
        chrome_options.add_argument("--verbose")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--no-default-browser-check")  # overrides default choices
        chrome_options.add_argument("--no-first-run")
        chrome_options.add_argument("--disable-default-apps")
        # Hide the navigator.webdriver automation flag from the page
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_argument(
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"
        )
        chrome_options.page_load_strategy = "normal"

        # Selenoid-specific capabilities: enable VNC, video recording, and
        # session logs, and route all traffic through the manual proxy above.
        capabilities = {
            "browserVersion": "latest",
            "selenoid:options": {
                "enableVNC": True,
                "enableVideo": True,
                "enableLog": True,
                "videoName": f"Linkedin-{datetime.datetime.now()}.mp4",
                "logName": f"Linkedin-{datetime.datetime.now()}.log",
                "name": "Chrome",
            },
            "proxy": {
                "httpProxy": PROXY,
                "ftpProxy": PROXY,
                "sslProxy": PROXY,
                "proxyType": "MANUAL",
            },
        }
        # Selenium 4 dropped the desired_capabilities keyword; attach the
        # extra capabilities to the options object instead.
        for name, value in capabilities.items():
            chrome_options.set_capability(name, value)

        url = "https://www.linkedin.com/jobs/view/linux-systems-kubernetes-docker-at-motion-recruitment-2438784506"
        driver = webdriver.Remote(
            command_executor="http://192.168.1.101:4444/wd/hub",
            options=chrome_options,
        )
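
        # The grid above is assumed to be a Selenoid instance already
        # running on 192.168.1.101. A minimal sketch of one common way to
        # start it (paths and image tags are assumptions, not part of this
        # repo):
        #
        #   docker run -d --name selenoid -p 4444:4444 \
        #       -v /var/run/docker.sock:/var/run/docker.sock \
        #       -v $PWD/config/:/etc/selenoid/:ro \
        #       -v $PWD/video/:/opt/selenoid/video/ \
        #       -e OVERRIDE_VIDEO_OUTPUT_DIR=$PWD/video/ \
        #       aerokube/selenoid:latest-release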

        # Sanity-check that the proxy is in effect. driver.get() returns
        # None, so navigate first and read the response body from the DOM.
        driver.get("https://api.ipify.org")
        ip = driver.find_element(By.TAG_NAME, "body").text
        print("My public IP address is: {}".format(ip))

        driver.maximize_window()
        # Load the job page twice: the first request often lands on
        # LinkedIn's login overlay, and the reload settles past it.
        driver.get(url)
        driver.get(url)
        time.sleep(random_wait())

        # Expand the truncated job description before grabbing the source
        element = driver.find_element(
            By.CSS_SELECTOR, ".show-more-less-html__button--more"
        )
        element.click()
        # scroll_down(driver)

        source_data = driver.page_source
        soup = BeautifulSoup(source_data, "lxml")
        driver.save_screenshot("linkedin-deep.png")
        # quit() rather than close() so the remote session ends and
        # Selenoid finalizes its video and log files.
        driver.quit()

        # Pull the fields of interest out of the parsed page
        description = soup.find("section", class_="description").text.strip()
        features = soup.find("ul", class_="job-criteria__list").text
        external_link = soup.find("a", class_="apply-button--link").get("href")
        print(description)
        print(features)
        print(external_link)
        print("Soup pulled and browser session ended")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Scrape LinkedIn job postings")
    parser.add_argument(
        "-k",
        "--keywords",
        type=str,
        dest="keywords",
        help="Keywords to scrape for",
    )
    parser.add_argument(
        "-d",
        "--days",
        action="store",
        dest="days",
        default="5",
        type=str,
        help="How many days back to search",
    )
    args = parser.parse_args()

    if args.keywords:
        bot = LinkedinScraper(search_keyword=args.keywords, search_window=args.days)
    else:
        # No keywords given: fall back to the class defaults
        bot = LinkedinScraper()
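
# Example invocation (the filename is an assumption; the flags match the
# argparse definitions above):
#
#   python3 linkedin_scraper.py --keywords devops --days 7
#
# Without flags, the scraper runs with its defaults ("devops", 5 days).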