Notebooks >> Scripts
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

255 lines
18 KiB

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException\n",
"from selenium import webdriver\n",
"import time\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def _text_or_default(driver, xpath, default=-1):\n",
"    '''Return the text of the first element matching xpath, or default if there is none.'''\n",
"    try:\n",
"        return driver.find_element_by_xpath(xpath).text\n",
"    except NoSuchElementException:\n",
"        return default\n",
"\n",
"\n",
"def _read_core_fields(driver, max_retries, retry_delay=5):\n",
"    '''Read the mandatory fields of the currently opened posting.\n",
"\n",
"    Tries up to max_retries times, sleeping retry_delay seconds after each\n",
"    failure (presumably the detail pane is still loading). Returns a dict keyed\n",
"    by output column name, or None when every attempt failed.\n",
"    '''\n",
"    #Imported here so this cell does not depend on a change to the imports cell.\n",
"    from selenium.common.exceptions import WebDriverException\n",
"\n",
"    for _ in range(max_retries):\n",
"        try:\n",
"            return {\n",
"                \"Company Name\": driver.find_element_by_xpath('.//div[@class=\"employerName\"]').text,\n",
"                \"Location\": driver.find_element_by_xpath('.//div[@class=\"location\"]').text,\n",
"                \"Job Title\": driver.find_element_by_xpath('.//div[contains(@class, \"title\")]').text,\n",
"                \"Job Description\": driver.find_element_by_xpath('.//div[@class=\"jobDescriptionContent desc\"]').text,\n",
"            }\n",
"        except WebDriverException:\n",
"            #Only Selenium errors are retried; KeyboardInterrupt and programming errors propagate.\n",
"            time.sleep(retry_delay)\n",
"    return None\n",
"\n",
"\n",
"def _read_company_fields(driver, not_found=-1):\n",
"    '''Open the \"Company\" tab of the current posting and read its labelled values.\n",
"\n",
"    Returns a dict keyed by output column name. Every value is not_found when\n",
"    the tab itself is missing; a single value is not_found when its label is absent.\n",
"    '''\n",
"    #(output column, label shown on the page)\n",
"    fields = ((\"Headquarters\", \"Headquarters\"),\n",
"              (\"Size\", \"Size\"),\n",
"              (\"Founded\", \"Founded\"),\n",
"              (\"Type of ownership\", \"Type\"),\n",
"              (\"Industry\", \"Industry\"),\n",
"              (\"Sector\", \"Sector\"),\n",
"              (\"Revenue\", \"Revenue\"),\n",
"              (\"Competitors\", \"Competitors\"))\n",
"\n",
"    #Going to the Company tab, i.e. clicking on this:\n",
"    #<div class=\"tab\" data-tab-type=\"overview\"><span>Company</span></div>\n",
"    try:\n",
"        driver.find_element_by_xpath('.//div[@class=\"tab\" and @data-tab-type=\"overview\"]').click()\n",
"    except NoSuchElementException: #Rarely, some job postings do not have the \"Company\" tab.\n",
"        return {column: not_found for column, _ in fields}\n",
"\n",
"    #Each value is the sibling of its label:\n",
"    #<div class=\"infoEntity\">\n",
"    #    <label>Headquarters</label>\n",
"    #    <span class=\"value\">San Francisco, CA</span>\n",
"    #</div>\n",
"    xpath = './/div[@class=\"infoEntity\"]//label[text()=\"{}\"]//following-sibling::*'\n",
"    return {column: _text_or_default(driver, xpath.format(label), not_found) for column, label in fields}\n",
"\n",
"\n",
"def get_jobs(keyword, num_jobs, verbose, driver_path=\"/home/jovyan/chromedriver\", max_retries=5):\n",
"    '''Gathers jobs as a dataframe, scraped from Glassdoor.\n",
"\n",
"    keyword     -- search phrase; sent wrapped in double quotes and URL-encoded.\n",
"    num_jobs    -- number of postings to collect before stopping.\n",
"    verbose     -- when true, print every scraped field for debugging.\n",
"    driver_path -- path to the chromedriver executable (defaults to the previously hard-coded location).\n",
"    max_retries -- attempts to read a posting's mandatory fields before that posting is skipped.\n",
"\n",
"    Returns a pandas DataFrame with one row per posting; values that could not\n",
"    be found hold -1. Fewer than num_jobs rows are returned when there is no\n",
"    further result page. The browser is closed before returning or raising.\n",
"    '''\n",
"    from urllib.parse import quote\n",
"\n",
"    #Initializing the webdriver\n",
"    options = webdriver.ChromeOptions()\n",
"\n",
"    #Uncomment the line below if you'd like to scrape without a new Chrome window every time.\n",
"    #options.add_argument('headless')\n",
"\n",
"    driver = webdriver.Chrome(executable_path=driver_path, options=options)\n",
"    jobs = []\n",
"\n",
"    try:\n",
"        driver.set_window_size(1120, 1000)\n",
"\n",
"        #Percent-encode the quoted keyword so characters such as & or # cannot break the query string.\n",
"        url = 'https://www.glassdoor.com/Job/jobs.htm?sc.keyword=' + quote('\"' + keyword + '\"', safe='') + '&locT=C&locId=1147401&locKeyword=San%20Francisco,%20CA&jobType=all&fromAge=-1&minSalary=0&includeNoSalaryJobs=true&radius=100&cityId=-1&minRating=0.0&industryId=-1&sgocId=-1&seniorityType=all&companyId=-1&employerSizes=0&applicationType=0&remoteWorkType=0'\n",
"        driver.get(url)\n",
"\n",
"        while len(jobs) < num_jobs: #If true, should be still looking for new jobs.\n",
"\n",
"            #Let the page load. Change this number based on your internet speed.\n",
"            #Or, wait until the webpage is loaded, instead of hardcoding it.\n",
"            time.sleep(4)\n",
"\n",
"            #Test for the \"Sign Up\" prompt and get rid of it.\n",
"            #A page without a \"selected\" element must not abort the whole scrape.\n",
"            try:\n",
"                driver.find_element_by_class_name(\"selected\").click()\n",
"            except (NoSuchElementException, ElementClickInterceptedException):\n",
"                pass\n",
"\n",
"            time.sleep(.1)\n",
"\n",
"            try:\n",
"                driver.find_element_by_class_name(\"ModalStyle__xBtn___29PT9\").click() #clicking to the X.\n",
"            except NoSuchElementException:\n",
"                pass\n",
"\n",
"            #Going through each job in this page\n",
"            job_buttons = driver.find_elements_by_class_name(\"jl\") #jl for Job Listing. These are the buttons we're going to click.\n",
"            for job_button in job_buttons:\n",
"\n",
"                print(\"Progress: {}/{}\".format(len(jobs), num_jobs))\n",
"                if len(jobs) >= num_jobs:\n",
"                    break\n",
"\n",
"                job_button.click()\n",
"                time.sleep(1)\n",
"\n",
"                #Bounded retry: a posting whose fields never appear is skipped instead of hanging forever.\n",
"                core = _read_core_fields(driver, max_retries)\n",
"                if core is None:\n",
"                    print(\"Skipping a posting whose details could not be read after {} attempts.\".format(max_retries))\n",
"                    continue\n",
"\n",
"                #-1 is the \"not found\" value for optional fields.\n",
"                salary_estimate = _text_or_default(driver, './/span[@class=\"gray small salary\"]')\n",
"                rating = _text_or_default(driver, './/span[@class=\"rating\"]')\n",
"\n",
"                #Printing for debugging\n",
"                if verbose:\n",
"                    print(\"Job Title: {}\".format(core[\"Job Title\"]))\n",
"                    print(\"Salary Estimate: {}\".format(salary_estimate))\n",
"                    print(\"Job Description: {}\".format(core[\"Job Description\"][:500]))\n",
"                    print(\"Rating: {}\".format(rating))\n",
"                    print(\"Company Name: {}\".format(core[\"Company Name\"]))\n",
"                    print(\"Location: {}\".format(core[\"Location\"]))\n",
"\n",
"                company = _read_company_fields(driver)\n",
"\n",
"                if verbose:\n",
"                    print(\"Headquarters: {}\".format(company[\"Headquarters\"]))\n",
"                    print(\"Size: {}\".format(company[\"Size\"]))\n",
"                    print(\"Founded: {}\".format(company[\"Founded\"]))\n",
"                    print(\"Type of Ownership: {}\".format(company[\"Type of ownership\"]))\n",
"                    print(\"Industry: {}\".format(company[\"Industry\"]))\n",
"                    print(\"Sector: {}\".format(company[\"Sector\"]))\n",
"                    print(\"Revenue: {}\".format(company[\"Revenue\"]))\n",
"                    print(\"Competitors: {}\".format(company[\"Competitors\"]))\n",
"                    print(\"@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\")\n",
"\n",
"                #Key order fixes the column order of the resulting DataFrame.\n",
"                job = {\"Job Title\" : core[\"Job Title\"],\n",
"                       \"Salary Estimate\" : salary_estimate,\n",
"                       \"Job Description\" : core[\"Job Description\"],\n",
"                       \"Rating\" : rating,\n",
"                       \"Company Name\" : core[\"Company Name\"],\n",
"                       \"Location\" : core[\"Location\"]}\n",
"                job.update(company)\n",
"                jobs.append(job)\n",
"\n",
"            #Target reached: stop here so we neither load another page nor report a false shortfall.\n",
"            if len(jobs) >= num_jobs:\n",
"                break\n",
"\n",
"            #Clicking on the \"next page\" button\n",
"            try:\n",
"                driver.find_element_by_xpath('.//li[@class=\"next\"]//a').click()\n",
"            except NoSuchElementException:\n",
"                print(\"Scraping terminated before reaching target number of jobs. Needed {}, got {}.\".format(num_jobs, len(jobs)))\n",
"                break\n",
"    finally:\n",
"        #Close the browser even when scraping fails, so no Chrome process is left behind.\n",
"        driver.quit()\n",
"\n",
"    return pd.DataFrame(jobs) #Converts the list of per-job dictionaries into a pandas DataFrame."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"ename": "OSError",
"evalue": "[Errno 8] Exec format error: '/home/jovyan/chromedriver'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/tmp/ipykernel_1022/156947742.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m#This line will open a new chrome window and start the scraping.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_jobs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"data scientist\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/tmp/ipykernel_1022/3679239868.py\u001b[0m in \u001b[0;36mget_jobs\u001b[0;34m(keyword, num_jobs, verbose)\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;31m#Change the path to where chromedriver is in your home folder.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0mdriver\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mwebdriver\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mChrome\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexecutable_path\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"/home/jovyan/chromedriver\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moptions\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 13\u001b[0m \u001b[0mdriver\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_window_size\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1120\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1000\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/opt/conda/lib/python3.9/site-packages/selenium/webdriver/chrome/webdriver.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, executable_path, port, options, service_args, desired_capabilities, service_log_path, chrome_options, keep_alive)\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0mservice_args\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mservice_args\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 72\u001b[0m log_path=service_log_path)\n\u001b[0;32m---> 73\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mservice\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 74\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/opt/conda/lib/python3.9/site-packages/selenium/webdriver/common/service.py\u001b[0m in \u001b[0;36mstart\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 70\u001b[0m \u001b[0mcmd\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0mcmd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcommand_line_args\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 72\u001b[0;31m self.process = subprocess.Popen(cmd, env=self.env,\n\u001b[0m\u001b[1;32m 73\u001b[0m \u001b[0mclose_fds\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mplatform\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msystem\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;34m'Windows'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 74\u001b[0m \u001b[0mstdout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlog_file\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/opt/conda/lib/python3.9/subprocess.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, user, group, extra_groups, encoding, errors, text, umask)\u001b[0m\n\u001b[1;32m 949\u001b[0m encoding=encoding, errors=errors)\n\u001b[1;32m 950\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 951\u001b[0;31m self._execute_child(args, executable, preexec_fn, close_fds,\n\u001b[0m\u001b[1;32m 952\u001b[0m \u001b[0mpass_fds\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcwd\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 953\u001b[0m \u001b[0mstartupinfo\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreationflags\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mshell\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/opt/conda/lib/python3.9/subprocess.py\u001b[0m in \u001b[0;36m_execute_child\u001b[0;34m(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, gid, gids, uid, umask, start_new_session)\u001b[0m\n\u001b[1;32m 1819\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0merrno_num\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1820\u001b[0m \u001b[0merr_msg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstrerror\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merrno_num\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1821\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mchild_exception_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merrno_num\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_msg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr_filename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1822\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mchild_exception_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merr_msg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1823\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mOSError\u001b[0m: [Errno 8] Exec format error: '/home/jovyan/chromedriver'"
]
}
],
"source": [
"#Scrape 5 \"data scientist\" postings with verbose output off; the call below opens a new Chrome window.\n",
"df = get_jobs(\"data scientist\", 5, False)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}