"""Utility helpers for a job-scraping script: JSON/CSV export, wait-time
jitter, date stamps, HTML fetching, and URL-to-PDF capture."""

import csv
import json
import logging as log
from datetime import date

import numpy as np
import pdfkit
import requests

# Configure the root logger so the info/error messages below are emitted.
log.basicConfig(level=log.INFO)


def soup2json(soup):
    """Append the attributes of every <area> tag under <weatherForecast>
    in the parsed `soup` to nea.json, one dict repr per line."""
    with open("nea.json", "a+") as fs:
        for area in soup.find("weatherForecast").find_all("area"):
            fs.write(str(area.attrs) + "\n")


def random_wait():
    """Return a wait time in seconds drawn from a normal distribution
    (mean 5 s, s.d. 1 s), clamped at zero since a draw can be negative."""
    moment = max(0.0, np.random.normal(5.0, 1.0))
    log.debug("waiting %.2f s", moment)
    return moment


def pretty_date():
    """Return today's date as an MM_DD_YYYY string for use in filenames."""
    today = date.today()
    search_date = today.strftime("%m_%d_%Y")
    return search_date


def strip(string):
    """Drop hashtag words, then remove every character that is not
    alphanumeric, a space, a period, or a comma."""
    words = string.split()
    words = [word for word in words if "#" not in word]
    string = " ".join(words)
    clean = ""
    for c in string:
        if c.isalnum() or c in (" ", ".", ","):
            clean += c
    return clean


def generate_json(job_objects, filename):
    """Dump the scraped job objects to `filename` as indented JSON."""
    try:
        with open(filename, "w") as outfile:
            json.dump(job_objects, outfile, indent=4)
        log.info("Created %s!", filename)
    except Exception:
        log.error("Failed to create %s!", filename)


def generate_csv(datas, csv_object_header, filename):
    """Write the scraped job rows to `filename` as CSV, header row first.

    A typical header: data_listing_version_key, data_job_id, data_location,
    data_posted_on, title, job_url, geo_lat, geo_lng, apply_method,
    external_apply, employer, job_snippet, status, search_keyword,
    search_window, pull_date, job_site, blacklisted_title_ind.
    """
    try:
        with open(filename, "w+", newline="", encoding="utf-8") as save_file:
            writer = csv.writer(save_file)
            writer.writerow(csv_object_header)
            for data in datas:
                writer.writerow(data)

        log.info("Exported %s", filename)
    except Exception:
        log.error("Could not generate %s", filename)


def blacklisted_title_check(title, employer, blackListTitles):
    """Return True if any blacklisted phrase occurs in `title`.

    `employer` is accepted for call-site compatibility but is not used
    in the check.
    """
    for blacklist in blackListTitles:
        if blacklist in title:
            return True
    return False


def getdata(url):
    """Fetch a URL and return the raw HTML as text."""
    r = requests.get(url)
    return r.text


# Example (needs: from bs4 import BeautifulSoup):
# htmldata = getdata("https://www.geeksforgeeks.org/how-to-automate-an-excel-sheet-in-python/?ref=feed")
# soup = BeautifulSoup(htmldata, "html.parser")
# for data in soup.find_all("p"):
#     print(data.get_text())


def urltopdf(url, filename):
    """Render the page at `url` to a PDF saved as `filename`.

    pdfkit wraps the wkhtmltopdf binary, which must be installed separately.
    """
    pdfkit.from_url(url, filename)
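

# A minimal smoke test, assuming illustrative sample rows; the real scraper
# assembles these from parsed job postings, and the filenames below are
# placeholders rather than anything this module mandates.
if __name__ == "__main__":
    header = ["data_job_id", "title", "employer"]
    rows = [["123", "Data Analyst", "Acme Corp"]]

    stamp = pretty_date()
    generate_json([dict(zip(header, row)) for row in rows], f"jobs_{stamp}.json")
    generate_csv(rows, header, f"jobs_{stamp}.csv")

    # "Senior" appears in the title, so this prints True.
    print(blacklisted_title_check("Senior Data Analyst", "Acme Corp", ["Senior"]))
    print(random_wait())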