""" Headless Site Navigation and File Download (Using Selenium) to S3 This example demonstrates using Selenium (via Firefox/GeckoDriver) to: 1) Log into a website w/ credentials stored in connection labeled 'selenium_conn_id' 2) Download a file (initiated on login) 3) Transform the CSV into JSON formatting 4) Append the current data to each record 5) Load the corresponding file into S3 To use this DAG, you will need to have the following installed: [XVFB](https://www.x.org/archive/X11R7.6/doc/man/man1/Xvfb.1.xhtml) [GeckoDriver](https://github.com/mozilla/geckodriver/releases/download) selenium==3.11.0 xvfbwrapper==0.2.9 """ import csv import datetime import json import logging import os import time from datetime import datetime, timedelta import boa import requests from airflow import DAG from airflow.models import Connection from airflow.operators.dummy_operator import DummyOperator from airflow.operators.python_operator import PythonOperator, PythonVirtualenvOperator from airflow.providers.docker.operators.docker import DockerOperator from airflow.utils.dates import days_ago from airflow.utils.db import provide_session from bs4 import BeautifulSoup from selenium import webdriver from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.service import Service from selenium.webdriver.common.by import By from selenium.webdriver.common.desired_capabilities import DesiredCapabilities from selenium.webdriver.common.keys import Keys from tools.network import ( browser_config, browser_feature, driver_object, ip_status, proxy_ip, selenoid_status, uc_test, user_agent, vpn_settings, ) from config.webdriver import browser_capabilities, browser_options default_args = { "start_date": days_ago(2), "email": [], "email_on_failure": True, "email_on_retry": False, # 'retries': 2, "retry_delay": timedelta(minutes=5), "catchup": False, } # def hello_world_py(): # # selenium_conn_id = kwargs.get('templates_dict', None).get('selenium_conn_id', None) # # 
#     # filename = kwargs.get('templates_dict', None).get('filename', None)
#     # s3_conn_id = kwargs.get('templates_dict', None).get('s3_conn_id', None)
#     # s3_bucket = kwargs.get('templates_dict', None).get('s3_bucket', None)
#     # s3_key = kwargs.get('templates_dict', None).get('s3_key', None)
#     # date = kwargs.get('templates_dict', None).get('date', None)
#     # module_name = kwargs.get('templates_dict', None).get('module', None)
#     module = "anon_browser_test"
#     chrome_options = browser_options()
#     capabilities = browser_capabilities(module)
#     logging.info('Assembling driver')
#     driver = webdriver.Remote(
#         command_executor="http://192.168.1.101:4444/wd/hub",
#         options=chrome_options,
#         desired_capabilities=capabilities,
#     )
#     logging.info('proxy IP')
#     proxy_ip()
#     logging.info('driver')
#     vpn_settings()
#     logging.info('driver')
#     selenoid_status()
#     logging.info('driver')
#     ip_status(driver)
#     logging.info('driver')
#     browser_config(driver)
#     logging.info('driver')
#     user_agent(driver)
#     logging.info('driver')
#     driver_object(driver)
#     logging.info('driver')
#     browser_feature(driver)
#     logging.info('driver')
#     driver.quit()
#     logging.info('driver')
#     uc_test()
#     # print("Finished")
#     return 'Whatever you return gets printed in the logs'

# dag = DAG(
#     'anon_browser_test',
#     schedule_interval='@daily',
#     default_args=default_args,
#     catchup=False
# )

# dummy_operator = DummyOperator(task_id="dummy_task", retries=3, dag=dag)

# selenium = PythonOperator(
#     task_id='anon_browser_test',
#     python_callable=hello_world_py,
#     templates_dict={"module": "anon_browser_test"},
#     dag=dag
#     # "s3_bucket": S3_BUCKET,
#     # "s3_key": S3_KEY,
#     # "date": date}
#     # provide_context=True
# )

# t1 = DockerOperator(
#     # api_version='1.19',
#     # docker_url='tcp://localhost:2375',  # Set your docker URL
#     command='/bin/sleep 30',
#     image='selenoid/chrome:latest',
#     # network_mode='bridge',
#     task_id='chrome',
#     dag=dag,
# )

# t2 = DockerOperator(
#     # api_version='1.19',
#     # docker_url='tcp://localhost:2375',  # Set your docker URL
#     command='/bin/sleep 30',
#     image='selenoid/video-recorder:latest-release',
#     # network_mode='bridge',
#     task_id='video_recorder',
#     dag=dag,
# )

# `virtualenv_task` below attaches itself to `dag`, but the only definition of
# `dag` in this file is commented out, so importing the module raised a
# NameError. The DAG is reinstated here with the same arguments as that
# commented-out definition.
dag = DAG(
    "anon_browser_test",
    schedule_interval="@daily",
    default_args=default_args,
    catchup=False,
)


# [START howto_operator_python_venv]
def callable_virtualenv():
    """
    Example function that will be performed in a virtual environment.

    Importing at the function level ensures that it will not attempt to import the
    library before it is installed.

    Prints a few colourised sample lines, then "Please wait..." ten times with
    a ten second pause after each, and finally "Finished". Takes no arguments
    and returns nothing.
    """
    # Both imports are deliberately local: `colorama` is only listed in the
    # operator's `requirements`, so it must not be imported by this module
    # itself.
    from time import sleep

    from colorama import Back, Fore, Style

    print(Fore.RED + "some red text")
    print(Back.GREEN + "and with a green background")
    print(Style.DIM + "and in dim text")
    print(Style.RESET_ALL)
    for _ in range(10):
        # flush=True pushes each progress line out immediately rather than
        # leaving it in the stdout buffer during the sleep.
        print(Style.DIM + "Please wait...", flush=True)
        sleep(10)
    print("Finished")


# Runs `callable_virtualenv` with colorama pinned via `requirements` and
# without access to the system site-packages.
virtualenv_task = PythonVirtualenvOperator(
    task_id="virtualenv_python",
    python_callable=callable_virtualenv,
    requirements=["colorama==0.4.0"],
    system_site_packages=False,
    dag=dag,
)
# [END howto_operator_python_venv]

# selenium >> dummy_operator
# dummy_operator >> virtualenv_task
# t1 >> selenium
# t2 >> selenium