import csv
import json
import logging as log
import os
import random
import shutil
import time
from datetime import date, datetime, timezone
from hashlib import blake2b
from pprint import pprint
from time import sleep

import numpy as np
import pdfkit
import yaml


def hash_key_gen(session_id, data_listing_version_key, search_keyword):
    """Derive a stable 16-byte hex key from the session id, listing version key, and search keyword."""
    h = blake2b(digest_size=16)
    # Successive update() calls are equivalent to hashing the concatenated bytes.
    h.update(str(session_id).encode("utf-8"))
    h.update(str(data_listing_version_key).encode("utf-8"))
    h.update(str(search_keyword).encode("utf-8"))
    key = h.hexdigest()
    log.info(key)
    return key


def soup2json(soup):
    with open("nea.json", "a+") as fs:
        for area in soup.find("weatherForecast").find_all("area"):
            fs.write(str(area.attrs) + "\n")


def ensure_dir_exists(directory):
    os.makedirs(directory, mode=0o777, exist_ok=True)
    log.info("{} exists".format(directory))


def make_new_dir(directory):
    try:
        os.makedirs(directory, mode=0o777)
    except FileExistsError:
        # Directory already exists.
        log.error("couldn't make {}".format(directory))


def scroll_to_bottom(driver, scroll_increment, scroll_pause):
    """Scroll to the bottom of the page.

    Params:
    - driver: Selenium WebDriver instance
    - scroll_increment {int}: increment size of page scrolls (pixels)
    - scroll_pause {float}: time to wait (s) between page scroll increments

    NOTE: this starts scrolling from the current scroll position, not the top of the page.
    """
    current_height = driver.execute_script("return document.documentElement.scrollTop")
    log.info(current_height)
    while True:
        # Scroll down in increments, capped at the full document height.
        new_height = driver.execute_script(
            "return Math.min({}, document.body.scrollHeight)".format(
                current_height + scroll_increment
            )
        )
        if new_height == current_height:
            break
        driver.execute_script("window.scrollTo(0, {});".format(new_height))
        current_height = new_height
        log.info(current_height)
        # Wait for the page to load.
        time.sleep(scroll_pause)


def config_file_settings():
    """Load parameters from the deployed config, falling back to the local copy."""
    try:
        with open("/data/scripts/config.yaml", "r") as stream:
            parameters = yaml.safe_load(stream)
    except FileNotFoundError:
        with open("config.yaml", "r") as stream:
            parameters = yaml.safe_load(stream)
    return parameters


def prep_folders(module, search_keyword):
    session_id = os.environ["AIRFLOW_CTX_DAG_RUN_ID"]
    root_csv_dir = "/data/data/csv/{0}/".format(module)
    root_data_dir = "/data/data/{0}/".format(module)
    staging_base_dir = "/data/data/staging/"
    staging_data_dir = "/data/data/staging/{0}/{1}{2}/".format(module, session_id, search_keyword)
    ensure_dir_exists(root_data_dir)
    ensure_dir_exists(root_csv_dir)
    ensure_dir_exists(root_csv_dir + "jobs")
    ensure_dir_exists(root_csv_dir + "sessions")
    ensure_dir_exists(staging_base_dir)
    ensure_dir_exists(staging_data_dir + "snippets")
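
# Illustrative sketch (not in the original module): how the folder and hashing
# helpers above might be exercised. Assumes AIRFLOW_CTX_DAG_RUN_ID is set, as it
# is inside an Airflow task; the module name and search keyword are hypothetical.
def _demo_prep_folders(module="ziprecruiter", search_keyword="data_engineer"):
    prep_folders(module, search_keyword)
    run_id = os.environ["AIRFLOW_CTX_DAG_RUN_ID"]
    # The key is deterministic for a given (run id, version key, keyword) triple.
    return hash_key_gen(run_id, 1, search_keyword)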
def move_file_bundle(module, search_keyword):
    session_id = os.environ["AIRFLOW_CTX_DAG_RUN_ID"]
    target = "/data/data/{0}/".format(module)
    original = "/data/data/staging/{0}/{1}{2}".format(module, session_id, search_keyword)
    staging_jobs_csv = "/data/data/staging/{0}/{1}{2}/jobs.csv".format(module, session_id, search_keyword)
    archive_jobs_csv = "/data/data/csv/{0}/jobs/jobs_{1}{2}.csv".format(module, session_id, search_keyword)
    # Archive a copy of jobs.csv, then move the whole staging bundle into the module's data dir.
    shutil.copy(staging_jobs_csv, archive_jobs_csv)
    log.info(staging_jobs_csv + " copied to " + archive_jobs_csv)
    shutil.move(original, target)
    log.info(original + " moved to " + target)


def random_wait():
    """Draw a wait time (s) from N(5.0, 2.5), floored at 2.5."""
    moment = np.random.normal(5.0, 2.5)
    return max(moment, 2.5)


def session_id(module):
    session_timestamp = str(int(time.time()))
    session_id = "{0}{1}".format(module, session_timestamp)
    log.info(session_id)
    return {"session_id": session_id}


def pretty_date():
    today = date.today()
    search_date = today.strftime("%m_%d_%Y")
    return search_date


def infinite_scroll(driver, module, log, filename):
    """Scroll until the page height stops growing, then save a screenshot."""
    log.info("Infinite scrolling...")
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        log.info("Scrolled a page")
        rand_pause = random_wait()
        sleep(rand_pause)
        log.info("Paused for {} secs".format(rand_pause))
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    driver.save_screenshot(filename)
    log.info("Infinite scroll finished, screen grabbed")


# Clean all non-alphanumeric characters (kept for reference):
# def strip(string):
#     words = string.split()
#     words = [word for word in words if "#" not in word]
#     string = " ".join(words)
#     clean = ""
#     for c in string:
#         if str.isalnum(c) or (c in [" ", ".", ","]):
#             clean += c
#     return clean


def print_dag_context(*args, **kwargs):
    """Print the Airflow context and ds variable from the context."""
    pprint(kwargs)
    return "Whatever you return gets printed in the logs"


def blacklisted_title_check(title, employer, blackListTitles):
    """Return True if the title contains any blacklisted term. `employer` is currently unused."""
    return any(blacklist in title for blacklist in blackListTitles)


def unix_timestamp(now):
    timestamp = now.replace(tzinfo=timezone.utc).timestamp()
    log.info(timestamp)
    return timestamp


def datestring(now):
    # dd/mm/YY
    return now.strftime("%d/%m/%Y")


def timestring(now):
    # H:M:S
    return now.strftime("%H:%M:%S")


# Example of extracting HTML data (kept for reference; requires `requests` and BeautifulSoup):
# def getdata(url):
#     r = requests.get(url)
#     return r.text
#
# htmldata = getdata("https://www.geeksforgeeks.org/how-to-automate-an-excel-sheet-in-python/?ref=feed")
# soup = BeautifulSoup(htmldata, 'html.parser')
# for data in soup.find_all("p"):
#     print(data.get_text())
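
# Illustrative sketch (not in the original module): format the current time with
# the datetime helpers above.
def _demo_timestamps():
    now = datetime.now()
    return {
        "date": datestring(now),      # e.g. "05/03/2021"
        "time": timestring(now),      # e.g. "14:07:33"
        "unix": unix_timestamp(now),  # seconds since epoch, treating `now` as UTC
    }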
def urltopdf(url, filename):
    pdfkit.from_url(url, filename)
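
# Minimal demo sketch (not in the original module): render a page to PDF.
# pdfkit shells out to wkhtmltopdf, which must be installed and on PATH; the URL
# and output filename here are illustrative.
if __name__ == "__main__":
    log.basicConfig(level=log.INFO)
    urltopdf("https://example.com", "example.pdf")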