import csv
import json
import logging as log
from datetime import date

import numpy as np
import pdfkit
import requests

def soup2json(soup):
    """Append each weatherForecast <area> tag's attributes to nea.json, one JSON object per line."""
    with open("nea.json", "a+") as fs:
        for area in soup.find("weatherForecast").find_all("area"):
            # json.dumps emits valid JSON lines; the old str(area.attrs) wrote Python dict reprs
            fs.write(json.dumps(area.attrs) + "\n")

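# Usage sketch: this file never builds the soup itself, so the parser choice is
# an assumption. An XML parser preserves the camelCase <weatherForecast> tag
# (html.parser would lowercase it).
#   from bs4 import BeautifulSoup
#   soup = BeautifulSoup(xml_text, "xml")  # xml_text: hypothetical NEA forecast payload
#   soup2json(soup)
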
def random_wait():
    """Return a wait duration in seconds, drawn from a normal distribution (mean 5.0, sd 1.0)."""
    moment = np.random.normal(5.0, 1.0)
    print(moment)
    return moment

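# Usage sketch: nothing in this file calls random_wait(), so pairing it with
# time.sleep between scraper requests is an assumption. A normal draw can be
# negative, hence the clamp.
#   import time
#   time.sleep(max(0.0, random_wait()))
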
def pretty_date():
    """Return today's date as an MM_DD_YYYY string."""
    today = date.today()
    search_date = today.strftime("%m_%d_%Y")
    return search_date

# Clean all non-alphanumeric characters (restored from commented-out code).
def strip(string):
    """Drop words containing '#', then keep only alphanumerics, spaces, periods, and commas."""
    words = string.split()
    words = [word for word in words if "#" not in word]
    string = " ".join(words)
    clean = ""
    for c in string:
        if c.isalnum() or c in (" ", ".", ","):
            clean += c
    return clean

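# Example (illustrative input, not taken from this repo's scrapers):
#   strip("Senior #Python Engineer (Remote)")  # -> "Senior Engineer Remote"
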
def generate_json(job_objects, filename):
    """Serialize scraped job objects to a JSON file."""
    try:
        with open(filename, "w") as outfile:
            json.dump(job_objects, outfile, indent=4)
        log.info("Created %s!", filename)
    except (OSError, TypeError):
        # OSError: file not writable; TypeError: object not JSON-serializable
        log.error("Failed to create %s!", filename)

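# Usage sketch (the field names are assumptions; see the CSV header comment in
# generate_csv for the columns the scrapers actually track):
#   generate_json([{"title": "Data Engineer", "employer": "Acme"}], "jobs.json")
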
def generate_csv(datas, csv_object_header, filename):
    """Write one CSV row per scraped job under the given header row."""
    try:
        with open(
            filename,
            "w+",
            newline="",
            encoding="utf-8",
        ) as save_file:
            writer = csv.writer(save_file)
            writer.writerow(csv_object_header)
            # Header the callers historically passed in:
            # data_listing_version_key, data_job_id, data_location, data_posted_on,
            # title, job_url, geo_lat, geo_lng, apply_method, external_apply,
            # employer, job_snippet, status, search_keyword, search_window,
            # pull_date, job_site, blacklisted_title_ind
            for data in datas:
                writer.writerow(data)
        log.info("Exported %s", filename)
    except OSError:
        log.error("Could not generate %s", filename)

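# Usage sketch (three illustrative columns; real callers pass the full header
# commented above):
#   generate_csv(
#       [["job123", "indeed", "python"]],
#       ["data_job_id", "job_site", "search_keyword"],
#       "jobs.csv",
#   )
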
def blacklisted_title_check(title, employer, blackListTitles):
    """Return True if any blacklisted phrase appears in the job title.

    `employer` is accepted for call-site compatibility but is not checked here.
    """
    for blacklist in blackListTitles:
        if blacklist in title:
            return True
    return False

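# Usage sketch (the blacklist terms are illustrative):
#   blacklisted_title_check("Senior Sales Engineer", "Acme", ["Sales", "Recruiter"])  # -> True
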
# Restored from commented-out scratch code; fetches raw HTML for a URL.
def getdata(url):
    r = requests.get(url)
    return r.text

# Original ad-hoc demo, left disabled (it additionally needs
# `from bs4 import BeautifulSoup`):
# htmldata = getdata("https://www.geeksforgeeks.org/how-to-automate-an-excel-sheet-in-python/?ref=feed")
# soup = BeautifulSoup(htmldata, "html.parser")
# for p in soup.find_all("p"):
#     print(p.get_text())

def urltopdf(url, filename):
    """Render a web page to a PDF file via pdfkit (requires the wkhtmltopdf binary on PATH)."""
    pdfkit.from_url(url, filename)

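# Usage sketch (the output name is illustrative):
#   urltopdf("https://example.com", f"page_{pretty_date()}.pdf")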