{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException\n", "from selenium import webdriver\n", "import time\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def get_jobs(keyword, num_jobs, verbose):\n", " \n", " '''Gathers jobs as a dataframe, scraped from Glassdoor'''\n", " \n", " #Initializing the webdriver\n", " options = webdriver.ChromeOptions()\n", " \n", " #Uncomment the line below if you'd like to scrape without a new Chrome window every time.\n", " #options.add_argument('headless')\n", " \n", " #Change the path to where chromedriver is in your home folder.\n", " driver = webdriver.Chrome(executable_path=\"/home/jovyan/chromedriver\", options=options)\n", " driver.set_window_size(1120, 1000)\n", "\n", " url = 'https://www.glassdoor.com/Job/jobs.htm?sc.keyword=\"' + keyword + '\"&locT=C&locId=1147401&locKeyword=San%20Francisco,%20CA&jobType=all&fromAge=-1&minSalary=0&includeNoSalaryJobs=true&radius=100&cityId=-1&minRating=0.0&industryId=-1&sgocId=-1&seniorityType=all&companyId=-1&employerSizes=0&applicationType=0&remoteWorkType=0'\n", " driver.get(url)\n", " jobs = []\n", "\n", " while len(jobs) < num_jobs: #If true, should be still looking for new jobs.\n", "\n", " #Let the page load. Change this number based on your internet speed.\n", " #Or, wait until the webpage is loaded, instead of hardcoding it.\n", " time.sleep(4)\n", "\n", " #Test for the \"Sign Up\" prompt and get rid of it.\n", " try:\n", " driver.find_element_by_class_name(\"selected\").click()\n", " except ElementClickInterceptedException:\n", " pass\n", "\n", " time.sleep(.1)\n", "\n", " try:\n", " driver.find_element_by_class_name(\"ModalStyle__xBtn___29PT9\").click() #clicking to the X.\n", " except NoSuchElementException:\n", " pass\n", "\n", " \n", " #Going through each job in this page\n", " job_buttons = driver.find_elements_by_class_name(\"jl\") #jl for Job Listing. These are the buttons we're going to click.\n", " for job_button in job_buttons: \n", "\n", " print(\"Progress: {}\".format(\"\" + str(len(jobs)) + \"/\" + str(num_jobs)))\n", " if len(jobs) >= num_jobs:\n", " break\n", "\n", " job_button.click() #You might \n", " time.sleep(1)\n", " collected_successfully = False\n", " \n", " while not collected_successfully:\n", " try:\n", " company_name = driver.find_element_by_xpath('.//div[@class=\"employerName\"]').text\n", " location = driver.find_element_by_xpath('.//div[@class=\"location\"]').text\n", " job_title = driver.find_element_by_xpath('.//div[contains(@class, \"title\")]').text\n", " job_description = driver.find_element_by_xpath('.//div[@class=\"jobDescriptionContent desc\"]').text\n", " collected_successfully = True\n", " except:\n", " time.sleep(5)\n", "\n", " try:\n", " salary_estimate = driver.find_element_by_xpath('.//span[@class=\"gray small salary\"]').text\n", " except NoSuchElementException:\n", " salary_estimate = -1 #You need to set a \"not found value. It's important.\"\n", " \n", " try:\n", " rating = driver.find_element_by_xpath('.//span[@class=\"rating\"]').text\n", " except NoSuchElementException:\n", " rating = -1 #You need to set a \"not found value. It's important.\"\n", "\n", " #Printing for debugging\n", " if verbose:\n", " print(\"Job Title: {}\".format(job_title))\n", " print(\"Salary Estimate: {}\".format(salary_estimate))\n", " print(\"Job Description: {}\".format(job_description[:500]))\n", " print(\"Rating: {}\".format(rating))\n", " print(\"Company Name: {}\".format(company_name))\n", " print(\"Location: {}\".format(location))\n", "\n", " #Going to the Company tab...\n", " #clicking on this:\n", " #