import csv
import json
import logging as log
import os
import random
import shutil
import time
from datetime import date, datetime, timezone
from hashlib import blake2b
from pprint import pprint
from time import sleep

import numpy as np
import pdfkit
import yaml


def hash_key_gen(session_id, data_listing_version_key, search_keyword):
    """Return a 16-byte blake2b hex digest of the session id, listing version key and search keyword."""
    h = blake2b(digest_size=16)
    h.update(str(session_id).encode("utf-8"))
    h.update(str(data_listing_version_key).encode("utf-8"))
    h.update(str(search_keyword).encode("utf-8"))
    key = h.hexdigest()
    print(key)
    return key


def soup2json(soup):
    """Append the attrs of each weatherForecast area in the parsed page to nea.json."""
    with open("nea.json", "a+") as fs:
        for area in soup.find("weatherForecast").find_all("area"):
            fs.write(str(area.attrs) + "\n")


# os.chmod('test.txt', stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO)


def ensure_dir_exists(directory):
    """Create directory (and any parents) if it does not already exist."""
    os.makedirs(directory, mode=0o777, exist_ok=True)
    log.info("{} exists".format(directory))


def make_new_dir(directory):
    """Create directory, logging an error instead of raising if it already exists."""
    try:
        os.makedirs(directory, mode=0o777)
    except FileExistsError:
        log.error("could not make {}: directory already exists".format(directory))


def scroll_to_bottom(driver, scroll_pause_time, scroll_increment):
    """Scroll to the bottom of the page in fixed increments.

    Params:
    - driver: selenium webdriver controlling the page
    - scroll_pause_time {float}: time to wait (s) between page scroll increments
    - scroll_increment {int}: increment size of page scrolls (pixels)
    """
    # NOTE: this starts scrolling from the current scroll position, not the top of the page.
    current_height = driver.execute_script("return document.documentElement.scrollTop")
    print(current_height)
    while True:
        # click_expandable_buttons()
        # Scroll down to the bottom in increments of scroll_increment.
        new_height = driver.execute_script(
            "return Math.min({}, document.body.scrollHeight)".format(
                current_height + scroll_increment
            )
        )
        if new_height == current_height:
            break
        driver.execute_script("window.scrollTo(0, {});".format(new_height))
        current_height = new_height
        print(current_height)
        # Wait for the page to load.
        time.sleep(scroll_pause_time)


# def soup2json(soup):
#     with open("ziprecruiter-items.json", "a+") as fs:
#         for area in soup.find("weatherForecast").find_all("area"):
#             fs.write(str(area.attrs) + "\n")

# job_posts = soup.find_all("article", class_="job_result")


def config_file_settings():
    """Load scraper parameters from config.yaml, preferring the deployed copy."""
    try:
        with open("/data/scripts/config.yaml", "r") as stream:
            parameters = yaml.safe_load(stream)
    except OSError:
        # Fall back to a config.yaml in the current working directory.
        with open("config.yaml", "r") as stream:
            parameters = yaml.safe_load(stream)
    return parameters
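
# Usage sketch for config_file_settings(); the key names below are hypothetical
# placeholders for illustration, not the actual schema of config.yaml:
#
#     parameters = config_file_settings()
#     module = parameters.get("module")                  # hypothetical key
#     search_keyword = parameters.get("search_keyword")  # hypothetical key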


def prep_folders(module, search_keyword):
    """Create the csv, data and staging directory tree for the current scrape session."""
    session_id = os.environ["AIRFLOW_CTX_DAG_RUN_ID"]
    root_csv_dir = "/data/data/csv/{0}/".format(module)
    root_data_dir = "/data/data/{0}/".format(module)
    staging_base_dir = "/data/data/staging/"
    staging_data_dir = "/data/data/staging/{0}/{1}{2}/".format(module, session_id, search_keyword)
    ensure_dir_exists(root_data_dir)
    ensure_dir_exists(root_csv_dir)
    ensure_dir_exists(root_csv_dir + "jobs")
    ensure_dir_exists(root_csv_dir + "sessions")
    ensure_dir_exists(staging_base_dir)
    ensure_dir_exists(staging_data_dir + "snippets")


def move_file_bundle(module, search_keyword):
    """Archive the session's jobs.csv, then move the staging bundle into the data directory."""
    session_id = os.environ["AIRFLOW_CTX_DAG_RUN_ID"]
    target = "/data/data/{0}/".format(module)
    original = "/data/data/staging/{0}/{1}{2}".format(module, session_id, search_keyword)
    staging_jobs_csv = "/data/data/staging/{0}/{1}{2}/jobs.csv".format(module, session_id, search_keyword)
    archive_jobs_csv = "/data/data/csv/{0}/jobs/jobs_{1}{2}.csv".format(module, session_id, search_keyword)
    try:
        shutil.copy(staging_jobs_csv, archive_jobs_csv)
        log.info(staging_jobs_csv + " copied to " + archive_jobs_csv)
    except OSError:
        log.error(staging_jobs_csv + " failed to copy to " + archive_jobs_csv)
        raise
    try:
        shutil.move(original, target)
        log.info(original + " moved to " + target)
    except OSError:
        log.error(original + " failed to move to " + target)
        raise


def random_wait():
    """Return a randomised pause in seconds, drawn from N(5.0, 2.5) and clamped to at least 2.5 s."""
    moment = np.random.normal(5.0, 2.5)
    adj = max(moment, 2.5)
    return adj


# def session_timestamp():
#     session_timestamp = str(int(time.time()))
#     # session_id = "{0}{1}".format(prefix, run_id)
#     log.info(session_timestamp)
#     return session_timestamp


def session_id(module):
    session_timestamp = str(int(time.time()))
    session_id = "{0}{1}".format(module, session_timestamp)
    log.info(session_id)
    return {"session_id": session_id}


def pretty_date():
    """Return today's date formatted as mm_dd_YYYY."""
    today = date.today()
    search_date = today.strftime("%m_%d_%Y")
    return search_date


def infinite_scroll(driver, module, log, filename):
    """Scroll an infinitely-loading page to the bottom, then save a screenshot to filename."""
    log.info("Infinite scrolling...")

    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        log.info("Scrolled a page")

        rand_pause = random_wait()
        sleep(rand_pause)
        log.info("Paused for {} secs".format(rand_pause))

        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    driver.save_screenshot(filename)
    log.info("Infinite scroll finished, screen grabbed")


# Backward-compatible alias for the original (misspelled) name.
infinite_scoll = infinite_scroll
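
# Example invocation sketch (assumes a selenium webdriver supplied by the calling
# scraper; selenium itself is not imported by this module, and the URL and
# filename below are hypothetical):
#
#     from selenium import webdriver
#     driver = webdriver.Chrome()
#     driver.get("https://www.example.com/jobs")
#     infinite_scroll(driver, module="example", log=log, filename="scroll_end.png")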


# Strip all non-alphanumeric characters (keeping spaces, periods and commas)
# and drop hashtag words.
# def strip(string):
#     words = string.split()
#     words = [word for word in words if "#" not in word]
#     string = " ".join(words)
#     clean = ""
#     for c in string:
#         if str.isalnum(c) or (c in [" ", ".", ","]):
#             clean += c
#     return clean


def print_dag_context(*args, **kwargs):
    """Print the Airflow context and ds variable from the context."""
    pprint(kwargs)
    return "Whatever you return gets printed in the logs"


def blacklisted_title_check(title, employer, blackListTitles):
    """Return True if the title contains any blacklisted substring.

    employer is accepted for call-site compatibility but is not checked.
    """
    for blacklist in blackListTitles:
        if blacklist in title:
            return True
    return False


def unix_timestamp(now):
    """Return the unix timestamp for the given naive datetime, treated as UTC."""
    timestamp = now.replace(tzinfo=timezone.utc).timestamp()
    print(timestamp)
    return timestamp


def datestring(now):
    # dd/mm/YYYY
    dt_string = now.strftime("%d/%m/%Y")
    return dt_string


def timestring(now):
    # HH:MM:SS
    tm_string = now.strftime("%H:%M:%S")
    return tm_string


# creating CSV header


# link for extract html data
# def getdata(url):
#     r = requests.get(url)
#     return r.text

# htmldata = getdata("https://www.geeksforgeeks.org/how-to-automate-an-excel-sheet-in-python/?ref=feed")
# soup = BeautifulSoup(htmldata, 'html.parser')
# data = ''
# for data in soup.find_all("p"):
#     print(data.get_text())


def urltopdf(url, filename):
    """Render the page at url to a PDF at filename using pdfkit/wkhtmltopdf."""
    pdfkit.from_url(url, filename)
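

# Minimal smoke-test sketch (an editorial addition, not part of the original
# pipeline): exercises a few of the pure helpers above when the module is run
# directly. All argument values are hypothetical examples.
if __name__ == "__main__":
    log.basicConfig(level=log.INFO)
    demo_key = hash_key_gen("manual__2020-01-01", "v1", "data engineer")
    log.info("hash key: {}".format(demo_key))
    log.info("session: {}".format(session_id("demo")))
    log.info("pretty date: {}".format(pretty_date()))
    log.info("random wait: {:.2f}s".format(random_wait()))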