{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Glassdoor job scraper\n",
    "\n",
    "Gathers job listings from Glassdoor with Selenium and returns them as a pandas DataFrame.\n",
    "\n",
    "Run the cells top to bottom. Scraping needs Chrome and a chromedriver binary (see the configuration cell)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "from urllib.parse import quote\n",
    "\n",
    "import pandas as pd\n",
    "from selenium import webdriver\n",
    "from selenium.common.exceptions import (\n",
    "    ElementClickInterceptedException,\n",
    "    NoSuchElementException,\n",
    "    WebDriverException,\n",
    ")\n",
    "from selenium.webdriver.common.by import By"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Change the path to where chromedriver is in your home folder.\n",
    "#A binary that cannot be executed on this machine makes webdriver.Chrome fail with\n",
    "#'OSError: [Errno 8] Exec format error' (usually a build for another OS or CPU).\n",
    "CHROMEDRIVER_PATH = '/home/jovyan/chromedriver'\n",
    "\n",
    "#Seconds to let a results page load. Change this number based on your internet speed.\n",
    "PAGE_LOAD_WAIT = 4\n",
    "\n",
    "#Value stored in the output when a field is not present on the page.\n",
    "NOT_FOUND = -1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Search URL template. {keyword} receives the double-quoted, URL-encoded search phrase.\n",
    "SEARCH_URL = ('https://www.glassdoor.com/Job/jobs.htm?sc.keyword={keyword}'\n",
    "              '&locT=C&locId=1147401&locKeyword=San%20Francisco,%20CA&jobType=all&fromAge=-1'\n",
    "              '&minSalary=0&includeNoSalaryJobs=true&radius=100&cityId=-1&minRating=0.0'\n",
    "              '&industryId=-1&sgocId=-1&seniorityType=all&companyId=-1&employerSizes=0'\n",
    "              '&applicationType=0&remoteWorkType=0')\n",
    "\n",
    "#XPaths of the fields every listing must provide, keyed by output column.\n",
    "REQUIRED_XPATHS = {\n",
    "    'Company Name': './/div[@class=\"employerName\"]',\n",
    "    'Location': './/div[@class=\"location\"]',\n",
    "    'Job Title': './/div[contains(@class, \"title\")]',\n",
    "    'Job Description': './/div[@class=\"jobDescriptionContent desc\"]',\n",
    "}\n",
    "\n",
    "#Output column -> label text looked up on the \"Company\" tab.\n",
    "COMPANY_LABELS = {\n",
    "    'Headquarters': 'Headquarters',\n",
    "    'Size': 'Size',\n",
    "    'Founded': 'Founded',\n",
    "    'Type of ownership': 'Type',\n",
    "    'Industry': 'Industry',\n",
    "    'Sector': 'Sector',\n",
    "    'Revenue': 'Revenue',\n",
    "    'Competitors': 'Competitors',\n",
    "}\n",
    "\n",
    "#Column order of the DataFrame returned by get_jobs.\n",
    "JOB_COLUMNS = ['Job Title', 'Salary Estimate', 'Job Description', 'Rating',\n",
    "               'Company Name', 'Location'] + list(COMPANY_LABELS)\n",
    "\n",
    "\n",
    "def _text_or_not_found(driver, xpath):\n",
    "    '''Returns the text of the first element matching xpath, or NOT_FOUND if there is none.'''\n",
    "    try:\n",
    "        return driver.find_element(By.XPATH, xpath).text\n",
    "    except NoSuchElementException:\n",
    "        return NOT_FOUND\n",
    "\n",
    "\n",
    "def _dismiss_signup_prompt(driver):\n",
    "    '''Tests for the \"Sign Up\" prompt and gets rid of it; does nothing when it is not shown.'''\n",
    "    #Both clicks are best effort: a missing or covered element must not stop the scrape.\n",
    "    try:\n",
    "        driver.find_element(By.CLASS_NAME, 'selected').click()\n",
    "    except (NoSuchElementException, ElementClickInterceptedException):\n",
    "        pass\n",
    "\n",
    "    time.sleep(.1)\n",
    "\n",
    "    try:\n",
    "        driver.find_element(By.CLASS_NAME, 'ModalStyle__xBtn___29PT9').click() #clicking to the X.\n",
    "    except (NoSuchElementException, ElementClickInterceptedException):\n",
    "        pass\n",
    "\n",
    "\n",
    "def _read_required_fields(driver, max_attempts, retry_wait=5):\n",
    "    '''Reads the fields listed in REQUIRED_XPATHS from the currently open listing.\n",
    "\n",
    "    The read is tried up to max_attempts times (at least once), sleeping retry_wait seconds\n",
    "    between tries. Returns a dict keyed by column name, or None when every try failed, so\n",
    "    the caller can skip the listing instead of waiting forever.\n",
    "    '''\n",
    "    attempts = max(1, max_attempts)\n",
    "    for attempt in range(1, attempts + 1):\n",
    "        try:\n",
    "            return {column: driver.find_element(By.XPATH, xpath).text\n",
    "                    for column, xpath in REQUIRED_XPATHS.items()}\n",
    "        except WebDriverException: #Selenium errors only; KeyboardInterrupt still stops the run.\n",
    "            if attempt < attempts:\n",
    "                time.sleep(retry_wait)\n",
    "    return None\n",
    "\n",
    "\n",
    "def _read_company_fields(driver):\n",
    "    '''Opens the \"Company\" tab and returns its fields as a dict keyed by column name.\n",
    "\n",
    "    Every value defaults to NOT_FOUND. Rarely, some job postings do not have the tab at all.\n",
    "    '''\n",
    "    company = dict.fromkeys(COMPANY_LABELS, NOT_FOUND)\n",
    "\n",
    "    #Going to the Company tab: a div with class \"tab\" and data-tab-type \"overview\".\n",
    "    try:\n",
    "        driver.find_element(By.XPATH, './/div[@class=\"tab\" and @data-tab-type=\"overview\"]').click()\n",
    "    except NoSuchElementException:\n",
    "        return company\n",
    "\n",
    "    #Each entry is a div with class \"infoEntity\" holding a label (e.g. Headquarters)\n",
    "    #followed by a sibling element with the value (e.g. San Francisco, CA).\n",
    "    for column, label in COMPANY_LABELS.items():\n",
    "        xpath = './/div[@class=\"infoEntity\"]//label[text()=\"{}\"]//following-sibling::*'.format(label)\n",
    "        company[column] = _text_or_not_found(driver, xpath)\n",
    "    return company\n",
    "\n",
    "\n",
    "def _print_job(job):\n",
    "    '''Prints one collected listing for debugging; the description is cut at 500 characters.'''\n",
    "    print('Job Title: {}'.format(job['Job Title']))\n",
    "    print('Salary Estimate: {}'.format(job['Salary Estimate']))\n",
    "    print('Job Description: {}'.format(job['Job Description'][:500]))\n",
    "    print('Rating: {}'.format(job['Rating']))\n",
    "    print('Company Name: {}'.format(job['Company Name']))\n",
    "    print('Location: {}'.format(job['Location']))\n",
    "    print('Headquarters: {}'.format(job['Headquarters']))\n",
    "    print('Size: {}'.format(job['Size']))\n",
    "    print('Founded: {}'.format(job['Founded']))\n",
    "    print('Type of Ownership: {}'.format(job['Type of ownership']))\n",
    "    print('Industry: {}'.format(job['Industry']))\n",
    "    print('Sector: {}'.format(job['Sector']))\n",
    "    print('Revenue: {}'.format(job['Revenue']))\n",
    "    print('Competitors: {}'.format(job['Competitors']))\n",
    "    print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@')\n",
    "\n",
    "\n",
    "def get_jobs(keyword, num_jobs, verbose, driver_path=CHROMEDRIVER_PATH,\n",
    "             page_wait=PAGE_LOAD_WAIT, max_attempts=3):\n",
    "    '''Gathers jobs as a dataframe, scraped from Glassdoor.\n",
    "\n",
    "    keyword      -- search phrase; it is wrapped in double quotes and URL-encoded.\n",
    "    num_jobs     -- number of listings to collect before stopping.\n",
    "    verbose      -- if True, every collected listing is printed for debugging.\n",
    "    driver_path  -- location of the chromedriver binary.\n",
    "    page_wait    -- seconds to sleep while a results page loads.\n",
    "    max_attempts -- tries at reading a listing's required fields before it is skipped.\n",
    "\n",
    "    Returns a pandas DataFrame with one row per listing and the columns in JOB_COLUMNS.\n",
    "    Optional fields that are missing hold NOT_FOUND. Fewer than num_jobs rows come back\n",
    "    when there is no further results page. The Chrome window is closed before returning,\n",
    "    also when an error interrupts the scrape.\n",
    "    '''\n",
    "\n",
    "    #Initializing the webdriver\n",
    "    options = webdriver.ChromeOptions()\n",
    "\n",
    "    #Uncomment the line below if you'd like to scrape without a new Chrome window every time.\n",
    "    #options.add_argument('headless')\n",
    "\n",
    "    #NOTE(review): executable_path is kept from the original call; newer Selenium releases\n",
    "    #expect a Service object instead. Confirm against the installed version.\n",
    "    driver = webdriver.Chrome(executable_path=driver_path, options=options)\n",
    "    jobs = []\n",
    "\n",
    "    try:\n",
    "        driver.set_window_size(1120, 1000)\n",
    "        driver.get(SEARCH_URL.format(keyword=quote('\"' + keyword + '\"', safe='')))\n",
    "\n",
    "        while len(jobs) < num_jobs: #If true, should be still looking for new jobs.\n",
    "\n",
    "            #Let the page load instead of reading it right away.\n",
    "            time.sleep(page_wait)\n",
    "            _dismiss_signup_prompt(driver)\n",
    "\n",
    "            #Going through each job in this page\n",
    "            #jl for Job Listing. These are the buttons we're going to click.\n",
    "            job_buttons = driver.find_elements(By.CLASS_NAME, 'jl')\n",
    "            for job_button in job_buttons:\n",
    "\n",
    "                print('Progress: {}/{}'.format(len(jobs), num_jobs))\n",
    "                if len(jobs) >= num_jobs:\n",
    "                    break\n",
    "\n",
    "                job_button.click() #Opens the details of this listing.\n",
    "                time.sleep(1)\n",
    "\n",
    "                required = _read_required_fields(driver, max_attempts)\n",
    "                if required is None:\n",
    "                    print('Skipping a listing: its details could not be read after {} attempts.'.format(max_attempts))\n",
    "                    continue\n",
    "\n",
    "                #Optional fields fall back to NOT_FOUND so that every row has every column.\n",
    "                job = dict(required)\n",
    "                job['Salary Estimate'] = _text_or_not_found(driver, './/span[@class=\"gray small salary\"]')\n",
    "                job['Rating'] = _text_or_not_found(driver, './/span[@class=\"rating\"]')\n",
    "                job.update(_read_company_fields(driver))\n",
    "\n",
    "                #Printing for debugging\n",
    "                if verbose:\n",
    "                    _print_job(job)\n",
    "\n",
    "                #add job to jobs\n",
    "                jobs.append(job)\n",
    "\n",
    "            if len(jobs) >= num_jobs:\n",
    "                break #Enough jobs collected; no need to load another page.\n",
    "\n",
    "            #Clicking on the \"next page\" button\n",
    "            try:\n",
    "                driver.find_element(By.XPATH, './/li[@class=\"next\"]//a').click()\n",
    "            except NoSuchElementException:\n",
    "                print('Scraping terminated before reaching target number of jobs. Needed {}, got {}.'.format(num_jobs, len(jobs)))\n",
    "                break\n",
    "    finally:\n",
    "        driver.quit() #Always release the browser, even when scraping fails half way.\n",
    "\n",
    "    #This line converts the list of job dictionaries into a pandas DataFrame.\n",
    "    return pd.DataFrame(jobs, columns=JOB_COLUMNS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#This line will open a new chrome window and start the scraping.\n",
    "df = get_jobs(\"data scientist\", 5, False)\n",
    "df"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}