"""
Headless Site Navigation and File Download (Using Selenium) to S3
This example demonstrates using Selenium (via Firefox/GeckoDriver) to:
1) Log into a website w/ credentials stored in connection labeled 'selenium_conn_id'
2) Download a file (initiated on login)
3) Transform the CSV into JSON formatting
4) Append the current data to each record
5) Load the corresponding file into S3
To use this DAG, you will need to have the following installed:
[XVFB](https://www.x.org/archive/X11R7.6/doc/man/man1/Xvfb.1.xhtml)
[GeckoDriver](https://github.com/mozilla/geckodriver/releases/download)
selenium==3.11.0
xvfbwrapper==0.2.9
"""
import csv
import json
import logging
import os
import time
from datetime import datetime, timedelta
import boa
import requests
from airflow import DAG
from airflow.models import Connection
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator, PythonVirtualenvOperator
from airflow.providers.docker.operators.docker import DockerOperator
from airflow.utils.dates import days_ago
from airflow.utils.db import provide_session
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from tools.network import (
    browser_config,
    browser_feature,
    driver_object,
    ip_status,
    proxy_ip,
    selenoid_status,
    uc_test,
    user_agent,
    vpn_settings,
)
from config.webdriver import browser_capabilities, browser_options

default_args = {
    "start_date": days_ago(2),
    "email": [],
    "email_on_failure": True,
    "email_on_retry": False,
    # 'retries': 2,
    "retry_delay": timedelta(minutes=5),
    # Note: catchup is a DAG-level argument and is ignored in default_args;
    # it is also set on the DAG itself below.
    "catchup": False,
}
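

# Sketch of the login/download step described in the module docstring. This is a
# minimal, untested illustration only: the login URL (taken from the connection's
# host field), the element IDs, the download directory, the fixed sleep, and the
# function name itself are placeholders/assumptions, not part of the original DAG.
def fetch_file_via_selenium(selenium_conn_id="selenium_conn_id", download_dir="/tmp/downloads"):
    """Log into a site with credentials from an Airflow connection and trigger a download."""
    from xvfbwrapper import Xvfb  # virtual framebuffer so Firefox can run without a display

    from airflow.hooks.base import BaseHook  # Airflow 2.x import path (assumption)

    conn = BaseHook.get_connection(selenium_conn_id)

    vdisplay = Xvfb()
    vdisplay.start()
    try:
        profile = webdriver.FirefoxProfile()
        # Save CSV downloads into download_dir without prompting.
        profile.set_preference("browser.download.folderList", 2)
        profile.set_preference("browser.download.dir", download_dir)
        profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/csv")
        driver = webdriver.Firefox(firefox_profile=profile)
        try:
            driver.get(conn.host)  # assumes the login URL is stored as the connection host
            driver.find_element(By.ID, "username").send_keys(conn.login)  # placeholder element IDs
            driver.find_element(By.ID, "password").send_keys(conn.password)
            driver.find_element(By.ID, "password").send_keys(Keys.RETURN)
            time.sleep(30)  # crude wait for the post-login download to complete
        finally:
            driver.quit()
    finally:
        vdisplay.stop()
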
# def hello_world_py():
#     # selenium_conn_id = kwargs.get('templates_dict', None).get('selenium_conn_id', None)
#     # filename = kwargs.get('templates_dict', None).get('filename', None)
#     # s3_conn_id = kwargs.get('templates_dict', None).get('s3_conn_id', None)
#     # s3_bucket = kwargs.get('templates_dict', None).get('s3_bucket', None)
#     # s3_key = kwargs.get('templates_dict', None).get('s3_key', None)
#     # date = kwargs.get('templates_dict', None).get('date', None)
#     # module_name = kwargs.get('templates_dict', None).get('module', None)
#     module = "anon_browser_test"
#     chrome_options = browser_options()
#     capabilities = browser_capabilities(module)
#     logging.info('Assembling driver')
#     driver = webdriver.Remote(
#         command_executor="http://192.168.1.101:4444/wd/hub",
#         options=chrome_options,
#         desired_capabilities=capabilities,
#     )
#     logging.info('proxy IP')
#     proxy_ip()
#     logging.info('VPN settings')
#     vpn_settings()
#     logging.info('Selenoid status')
#     selenoid_status()
#     logging.info('IP status')
#     ip_status(driver)
#     logging.info('browser config')
#     browser_config(driver)
#     logging.info('user agent')
#     user_agent(driver)
#     logging.info('driver object')
#     driver_object(driver)
#     logging.info('browser features')
#     browser_feature(driver)
#     driver.quit()
#     logging.info('uc test')
#     uc_test()
#     # print("Finished")
#     return 'Whatever you return gets printed in the logs'
# The virtualenv example task at the bottom of this file attaches to this DAG.
dag = DAG(
    "anon_browser_test",
    schedule_interval="@daily",
    default_args=default_args,
    catchup=False,
)
# dummy_operator = DummyOperator(task_id="dummy_task", retries=3, dag=dag)
# selenium = PythonOperator(
#     task_id='anon_browser_test',
#     python_callable=hello_world_py,
#     templates_dict={"module": "anon_browser_test"},
#     # templates_dict could also carry "s3_bucket": S3_BUCKET, "s3_key": S3_KEY, "date": date
#     # provide_context=True,
#     dag=dag,
# )
# t1 = DockerOperator(
#     # api_version='1.19',
#     # docker_url='tcp://localhost:2375',  # Set your docker URL
#     command='/bin/sleep 30',
#     image='selenoid/chrome:latest',
#     # network_mode='bridge',
#     task_id='chrome',
#     dag=dag,
# )
# t2 = DockerOperator(
#     # api_version='1.19',
#     # docker_url='tcp://localhost:2375',  # Set your docker URL
#     command='/bin/sleep 30',
#     image='selenoid/video-recorder:latest-release',
#     # network_mode='bridge',
#     task_id='video_recorder',
#     dag=dag,
# )
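

# Sketch of steps 3-5 of the module docstring (CSV -> JSON, stamp each record with the
# current date, upload to S3), which are not implemented in the active code. This is a
# minimal, untested illustration: the S3Hook import path assumes the Amazon provider is
# installed, the file paths, connection id, bucket, and key are placeholders, and the
# use of boa.constrict() to snake_case column names is an assumption.
def csv_to_json_to_s3(
    csv_path="/tmp/downloads/export.csv",  # placeholder path
    s3_conn_id="s3_conn_id",               # placeholder connection id
    s3_bucket="my-bucket",                 # placeholder bucket
    s3_key="selenium/export.json",         # placeholder key
):
    """Convert a downloaded CSV to newline-delimited JSON, add the run date, load to S3."""
    from airflow.providers.amazon.aws.hooks.s3 import S3Hook

    run_date = datetime.now().strftime("%Y-%m-%d")
    json_path = os.path.splitext(csv_path)[0] + ".json"

    with open(csv_path, newline="") as src, open(json_path, "w") as dst:
        reader = csv.DictReader(src)
        for row in reader:
            # snake_case the column names and stamp each record with the current date
            record = {boa.constrict(k): v for k, v in row.items()}
            record["date"] = run_date
            dst.write(json.dumps(record) + "\n")

    S3Hook(aws_conn_id=s3_conn_id).load_file(
        filename=json_path, key=s3_key, bucket_name=s3_bucket, replace=True
    )
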
# [START howto_operator_python_venv]
def callable_virtualenv():
    """
    Example function that will be performed in a virtual environment.

    Importing at the module level ensures that it will not attempt to import the
    library before it is installed.
    """
    from time import sleep

    from colorama import Back, Fore, Style

    print(Fore.RED + "some red text")
    print(Back.GREEN + "and with a green background")
    print(Style.DIM + "and in dim text")
    print(Style.RESET_ALL)
    for _ in range(10):
        print(Style.DIM + "Please wait...", flush=True)
        sleep(10)
    print("Finished")


virtualenv_task = PythonVirtualenvOperator(
    task_id="virtualenv_python",
    python_callable=callable_virtualenv,
    requirements=["colorama==0.4.0"],
    system_site_packages=False,
    dag=dag,
)
# [END howto_operator_python_venv]
# selenium >> dummy_operator
# dummy_operator >> virtualenv_task
# t1 >> selenium
# t2 >> selenium
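
# If the sketched functions above were turned into tasks, the wiring might look roughly
# like this (task ids and ordering are illustrative only, not part of the original DAG):
# download = PythonOperator(task_id="download_csv", python_callable=fetch_file_via_selenium, dag=dag)
# upload = PythonOperator(task_id="csv_to_json_to_s3", python_callable=csv_to_json_to_s3, dag=dag)
# download >> upload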