import csv
import json
import logging as log
import os
import random
import shutil
import time
from datetime import date, datetime, timezone
from hashlib import blake2b
from pprint import pprint
from time import sleep

import numpy as np
import pdfkit
import yaml


def hash_key_gen(session_id, data_listing_version_key, search_keyword):
    """Return a 16-byte blake2b hex digest of the session id, listing version key and search keyword."""
    h = blake2b(digest_size=16)
    h.update(str(session_id).encode("utf-8"))
    h.update(str(data_listing_version_key).encode("utf-8"))
    h.update(str(search_keyword).encode("utf-8"))
    key = h.hexdigest()
    print(key)
    return key


def soup2json(soup):
    """Append the attrs of each weatherForecast area in the parsed page to nea.json."""
    with open("nea.json", "a+") as fs:
        for area in soup.find("weatherForecast").find_all("area"):
            fs.write(str(area.attrs) + "\n")


# os.chmod('test.txt', stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO)


def ensure_dir_exists(directory):
    """Create directory (and any parents) if it does not already exist."""
    os.makedirs(directory, mode=0o777, exist_ok=True)
    log.info("{} exists".format(directory))


def make_new_dir(directory):
    """Create directory, logging an error instead of raising if it already exists."""
    try:
        os.makedirs(directory, mode=0o777)
    except FileExistsError:
        log.error("could not make {}: directory already exists".format(directory))


def scroll_to_bottom(driver, scroll_pause_time, scroll_increment):
    """Scroll to the bottom of the page in fixed increments.

    Params:
    - driver: selenium webdriver controlling the page
    - scroll_pause_time {float}: time to wait (s) between page scroll increments
    - scroll_increment {int}: increment size of page scrolls (pixels)
    """
    # NOTE: this starts scrolling from the current scroll position, not the top of the page.
    current_height = driver.execute_script("return document.documentElement.scrollTop")
    print(current_height)
    while True:
        # click_expandable_buttons()
        # Scroll down to the bottom in increments of scroll_increment.
        new_height = driver.execute_script(
            "return Math.min({}, document.body.scrollHeight)".format(
                current_height + scroll_increment
            )
        )
        if new_height == current_height:
            break
        driver.execute_script("window.scrollTo(0, {});".format(new_height))
        current_height = new_height
        print(current_height)
        # Wait for the page to load.
        time.sleep(scroll_pause_time)


# def soup2json(soup):
#     with open("ziprecruiter-items.json", "a+") as fs:
#         for area in soup.find("weatherForecast").find_all("area"):
#             fs.write(str(area.attrs) + "\n")

# job_posts = soup.find_all("article", class_="job_result")


def config_file_settings():
    """Load scraper parameters from config.yaml, preferring the deployed copy."""
    try:
        with open("/data/scripts/config.yaml", "r") as stream:
            parameters = yaml.safe_load(stream)
    except OSError:
        # Fall back to a config.yaml in the current working directory.
        with open("config.yaml", "r") as stream:
            parameters = yaml.safe_load(stream)
    return parameters
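
# Usage sketch for config_file_settings(); the key names below are hypothetical
# placeholders for illustration, not the actual schema of config.yaml:
#
#     parameters = config_file_settings()
#     module = parameters.get("module")                  # hypothetical key
#     search_keyword = parameters.get("search_keyword")  # hypothetical key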


def prep_folders(module, search_keyword):
    """Create the csv, data and staging directory tree for the current scrape session."""
    session_id = os.environ["AIRFLOW_CTX_DAG_RUN_ID"]
    root_csv_dir = "/data/data/csv/{0}/".format(module)
    root_data_dir = "/data/data/{0}/".format(module)
    staging_base_dir = "/data/data/staging/"
    staging_data_dir = "/data/data/staging/{0}/{1}{2}/".format(module, session_id, search_keyword)
    ensure_dir_exists(root_data_dir)
    ensure_dir_exists(root_csv_dir)
    ensure_dir_exists(root_csv_dir + "jobs")
    ensure_dir_exists(root_csv_dir + "sessions")
    ensure_dir_exists(staging_base_dir)
    ensure_dir_exists(staging_data_dir + "snippets")


def move_file_bundle(module, search_keyword):
    """Archive the session's jobs.csv, then move the staging bundle into the data directory."""
    session_id = os.environ["AIRFLOW_CTX_DAG_RUN_ID"]
    target = "/data/data/{0}/".format(module)
    original = "/data/data/staging/{0}/{1}{2}".format(module, session_id, search_keyword)
    staging_jobs_csv = "/data/data/staging/{0}/{1}{2}/jobs.csv".format(module, session_id, search_keyword)
    archive_jobs_csv = "/data/data/csv/{0}/jobs/jobs_{1}{2}.csv".format(module, session_id, search_keyword)
    try:
        shutil.copy(staging_jobs_csv, archive_jobs_csv)
        log.info(staging_jobs_csv + " copied to " + archive_jobs_csv)
    except OSError:
        log.error(staging_jobs_csv + " failed to copy to " + archive_jobs_csv)
        raise
    try:
        shutil.move(original, target)
        log.info(original + " moved to " + target)
    except OSError:
        log.error(original + " failed to move to " + target)
        raise


def random_wait():
    """Return a randomised pause in seconds, drawn from N(5.0, 2.5) and clamped to at least 2.5 s."""
    moment = np.random.normal(5.0, 2.5)
    adj = max(moment, 2.5)
    return adj


# def session_timestamp():
#     session_timestamp = str(int(time.time()))
#     # session_id = "{0}{1}".format(prefix, run_id)
#     log.info(session_timestamp)
#     return session_timestamp


def session_id(module):
    session_timestamp = str(int(time.time()))
    session_id = "{0}{1}".format(module, session_timestamp)
    log.info(session_id)
    return {"session_id": session_id}


def pretty_date():
    """Return today's date formatted as mm_dd_YYYY."""
    today = date.today()
    search_date = today.strftime("%m_%d_%Y")
    return search_date


def infinite_scroll(driver, module, log, filename):
    """Scroll an infinitely-loading page to the bottom, then save a screenshot to filename."""
    log.info("Infinite scrolling...")

    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        log.info("Scrolled a page")

        rand_pause = random_wait()
        sleep(rand_pause)
        log.info("Paused for {} secs".format(rand_pause))

        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    driver.save_screenshot(filename)
    log.info("Infinite scroll finished, screen grabbed")


# Backward-compatible alias for the original (misspelled) name.
infinite_scoll = infinite_scroll
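
# Example invocation sketch (assumes a selenium webdriver supplied by the calling
# scraper; selenium itself is not imported by this module, and the URL and
# filename below are hypothetical):
#
#     from selenium import webdriver
#     driver = webdriver.Chrome()
#     driver.get("https://www.example.com/jobs")
#     infinite_scroll(driver, module="example", log=log, filename="scroll_end.png")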


# Strip all non-alphanumeric characters (keeping spaces, periods and commas)
# and drop hashtag words.
# def strip(string):
#     words = string.split()
#     words = [word for word in words if "#" not in word]
#     string = " ".join(words)
#     clean = ""
#     for c in string:
#         if str.isalnum(c) or (c in [" ", ".", ","]):
#             clean += c
#     return clean


def print_dag_context(*args, **kwargs):
    """Print the Airflow context and ds variable from the context."""
    pprint(kwargs)
    return "Whatever you return gets printed in the logs"


def blacklisted_title_check(title, employer, blackListTitles):
    """Return True if the title contains any blacklisted substring.

    employer is accepted for call-site compatibility but is not checked.
    """
    for blacklist in blackListTitles:
        if blacklist in title:
            return True
    return False


def unix_timestamp(now):
    """Return the unix timestamp for the given naive datetime, treated as UTC."""
    timestamp = now.replace(tzinfo=timezone.utc).timestamp()
    print(timestamp)
    return timestamp


def datestring(now):
    # dd/mm/YYYY
    dt_string = now.strftime("%d/%m/%Y")
    return dt_string


def timestring(now):
    # HH:MM:SS
    tm_string = now.strftime("%H:%M:%S")
    return tm_string


# creating CSV header


# link for extract html data
# def getdata(url):
#     r = requests.get(url)
#     return r.text

# htmldata = getdata("https://www.geeksforgeeks.org/how-to-automate-an-excel-sheet-in-python/?ref=feed")
# soup = BeautifulSoup(htmldata, 'html.parser')
# data = ''
# for data in soup.find_all("p"):
#     print(data.get_text())


def urltopdf(url, filename):
    """Render the page at url to a PDF at filename using pdfkit/wkhtmltopdf."""
    pdfkit.from_url(url, filename)
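

# Minimal smoke-test sketch (an editorial addition, not part of the original
# pipeline): exercises a few of the pure helpers above when the module is run
# directly. All argument values are hypothetical examples.
if __name__ == "__main__":
    log.basicConfig(level=log.INFO)
    demo_key = hash_key_gen("manual__2020-01-01", "v1", "data engineer")
    log.info("hash key: {}".format(demo_key))
    log.info("session: {}".format(session_id("demo")))
    log.info("pretty date: {}".format(pretty_date()))
    log.info("random wait: {:.2f}s".format(random_wait()))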