import csv
import json
import logging as log
import os
import random
import shutil
import time
from datetime import date, datetime, timezone
from hashlib import blake2b
from pprint import pprint
from time import sleep

import numpy as np
import pdfkit
import requests  # needed by getdata() below
import yaml

def hash_key_gen(session_id, data_listing_version_key, search_keyword):
    """Derive a stable 32-hex-char key from the session, listing version, and search term."""
    h = blake2b(digest_size=16)
    for part in (session_id, data_listing_version_key, search_keyword):
        h.update(str(part).encode("utf-8"))
    key = h.hexdigest()
    log.info(key)
    return key

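# Usage sketch (hypothetical inputs): the key is deterministic, so re-running
# with the same session/version/keyword reproduces the same key, e.g. for
# idempotent file or cache names:
#
#   key = hash_key_gen("manual__2021-01-01", "v1", "data engineer")
#   snippet_path = "/data/data/staging/{}.json".format(key)
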
def soup2json(soup):
    """Append the attrs of each forecast area in the parsed page to nea.json."""
    with open("nea.json", "a+") as fs:
        for area in soup.find("weatherForecast").find_all("area"):
            fs.write(str(area.attrs) + "\n")

def ensure_dir_exists(directory):
    """Create the directory (and any parents) if it does not already exist."""
    os.makedirs(directory, mode=0o777, exist_ok=True)
    log.info("{} exists".format(directory))

def make_new_dir(directory):
    """Create the directory, logging an error if it already exists."""
    try:
        os.makedirs(directory, mode=0o777)
    except FileExistsError:
        # directory already exists
        log.error("couldn't make {}".format(directory))

def scroll_to_bottom(driver, scroll_increment=300, scroll_pause=1.0):
    """Scroll to the bottom of the page in fixed increments.

    Params:
    - driver: Selenium WebDriver controlling the page
    - scroll_increment {int}: increment size of page scrolls (pixels)
    - scroll_pause {float}: time to wait (s) between page scroll increments

    NOTE: this starts scrolling from the current scroll position, not the top
    of the page. The default increment/pause values are assumptions.
    """
    current_height = driver.execute_script("return document.documentElement.scrollTop")
    log.info(current_height)
    while True:
        # advance by one increment, capped at the full document height
        new_height = driver.execute_script(
            "return Math.min({}, document.body.scrollHeight)".format(
                current_height + scroll_increment
            )
        )
        if new_height == current_height:
            break
        driver.execute_script("window.scrollTo(0, {});".format(new_height))
        current_height = new_height
        log.info(current_height)
        # wait for newly loaded content
        time.sleep(scroll_pause)

def config_file_settings():
    """Load scraper parameters from config.yaml, preferring the deployed path."""
    try:
        with open("/data/scripts/config.yaml", "r") as stream:
            parameters = yaml.safe_load(stream)
    except FileNotFoundError:
        # fall back to a config.yaml in the working directory
        with open("config.yaml", "r") as stream:
            parameters = yaml.safe_load(stream)
    return parameters

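# Sketch of a config.yaml this loader would accept (the key names below are
# illustrative assumptions, not the real schema):
#
#   modules:
#     ziprecruiter:
#       search_keywords: ["data engineer", "etl"]
#   scroll_increment: 300
#   scroll_pause: 1.5
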
def prep_folders(module, search_keyword):
    """Create the archive, CSV, and per-session staging directories for a module."""
    session_id = os.environ["AIRFLOW_CTX_DAG_RUN_ID"]
    root_csv_dir = "/data/data/csv/{0}/".format(module)
    root_data_dir = "/data/data/{0}/".format(module)
    staging_base_dir = "/data/data/staging/"
    staging_data_dir = "/data/data/staging/{0}/{1}{2}/".format(module, session_id, search_keyword)
    ensure_dir_exists(root_data_dir)
    ensure_dir_exists(root_csv_dir)
    ensure_dir_exists(root_csv_dir + "jobs")
    ensure_dir_exists(root_csv_dir + "sessions")
    ensure_dir_exists(staging_base_dir)
    ensure_dir_exists(staging_data_dir + "snippets")

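# For module="ziprecruiter", session_id="run1", search_keyword="etl", the
# format strings above yield (illustrative values, real layout):
#   /data/data/ziprecruiter/
#   /data/data/csv/ziprecruiter/jobs/
#   /data/data/csv/ziprecruiter/sessions/
#   /data/data/staging/ziprecruiter/run1etl/snippets/
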
def move_file_bundle(module, search_keyword):
    """Archive the session's jobs.csv, then move the staging bundle into the module's data dir."""
    session_id = os.environ["AIRFLOW_CTX_DAG_RUN_ID"]
    target = "/data/data/{0}/".format(module)
    original = "/data/data/staging/{0}/{1}{2}".format(module, session_id, search_keyword)
    staging_jobs_csv = "/data/data/staging/{0}/{1}{2}/jobs.csv".format(module, session_id, search_keyword)
    archive_jobs_csv = "/data/data/csv/{0}/jobs/jobs_{1}{2}.csv".format(module, session_id, search_keyword)
    shutil.copy(staging_jobs_csv, archive_jobs_csv)
    log.info(staging_jobs_csv + " copied to " + archive_jobs_csv)
    shutil.move(original, target)
    log.info(original + " moved to " + target)

def random_wait():
    """Return a human-ish delay in seconds: N(5.0, 2.5), floored at 2.5."""
    moment = np.random.normal(5.0, 2.5)
    return max(moment, 2.5)

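# Most waits land roughly between 2.5s and 10s. For reproducible runs, seed
# NumPy first (illustrative):
#
#   np.random.seed(42)
#   delays = [random_wait() for _ in range(3)]
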
def session_id(module):
    """Build a session id from the module name plus the current unix time."""
    sid = "{0}{1}".format(module, str(int(time.time())))
    log.info(sid)
    return {"session_id": sid}

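# Returning a dict rather than a bare string is presumably so the value can be
# consumed as structured task output (e.g. pulled from Airflow XCom by a
# downstream task); that intended use is an assumption, not confirmed here.
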
def pretty_date():
    """Return today's date formatted as mm_dd_YYYY."""
    today = date.today()
    search_date = today.strftime("%m_%d_%Y")
    return search_date

def infinite_scroll(driver, module, log, filename):
    """Scroll to the bottom of the page until its height stops growing,
    then save a screenshot to filename. (module is currently unused.)"""
    log.info("Infinite scrolling...")
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        log.info("Scrolled a page")
        rand_pause = random_wait()
        sleep(rand_pause)
        log.info("Paused for {} secs".format(rand_pause))
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    driver.save_screenshot(filename)
    log.info("Infinite scroll finished, screen grabbed")
    return

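# Usage sketch (assumes a configured Selenium driver; headless Chrome shown,
# URL and paths are placeholders):
#
#   from selenium import webdriver
#   opts = webdriver.ChromeOptions()
#   opts.add_argument("--headless")
#   driver = webdriver.Chrome(options=opts)
#   driver.get("https://example.com/jobs")
#   infinite_scroll(driver, "ziprecruiter", log, "/tmp/jobs.png")
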
def strip(string):
    """Clean all non-alphanumeric characters: drop hashtag words, then remove
    every character that is not alphanumeric, a space, '.', or ','."""
    words = [word for word in string.split() if "#" not in word]
    string = " ".join(words)
    return "".join(c for c in string if c.isalnum() or c in (" ", ".", ","))

def print_dag_context(*args, **kwargs):
    """Print the Airflow context and ds variable from the context."""
    pprint(kwargs)
    return "Whatever you return gets printed in the logs"

def blacklisted_title_check(title, employer, blackListTitles):
    """Return True if any blacklist entry appears in the job title.
    (employer is currently unused.)"""
    return any(blacklist in title for blacklist in blackListTitles)

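# Usage sketch (illustrative blacklist): matching is a case-sensitive
# substring test against the title only.
#
#   blacklisted_title_check("Senior Java Developer", "Acme", ["Java"])  # True
#   blacklisted_title_check("Data Engineer", "Acme", ["Java"])          # False
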
def unix_timestamp(now):
    """Return the POSIX timestamp for a naive datetime, interpreted as UTC."""
    timestamp = now.replace(tzinfo=timezone.utc).timestamp()
    log.info(timestamp)
    return timestamp

def datestring(now):
    """Format a datetime as dd/mm/YYYY."""
    return now.strftime("%d/%m/%Y")


def timestring(now):
    """Format a datetime as HH:MM:SS."""
    return now.strftime("%H:%M:%S")

def getdata(url):
    """Fetch the raw HTML for a URL (parse downstream, e.g. with BeautifulSoup)."""
    r = requests.get(url)
    return r.text

def urltopdf(url, filename):
    """Render a URL to a PDF file via pdfkit (requires the wkhtmltopdf binary)."""
    pdfkit.from_url(url, filename)

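if __name__ == "__main__":
    # Minimal smoke test of the pure helpers (no Selenium/Airflow required);
    # the inputs below are illustrative only.
    log.basicConfig(level=log.INFO)
    now = datetime.now()
    print(pretty_date())        # e.g. 05_17_2024
    print(datestring(now))      # e.g. 17/05/2024
    print(timestring(now))      # e.g. 14:03:59
    print(unix_timestamp(now))  # seconds since the epoch (naive time treated as UTC)
    print(hash_key_gen("demo-session", "v1", "data engineer"))
    print(random_wait())        # delay in seconds, >= 2.5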