You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
181 lines
5.4 KiB
181 lines
5.4 KiB
"""
Headless Site Navigation and File Download (Using Selenium) to S3

This example demonstrates using Selenium (via Firefox/GeckoDriver) to:

1) Log into a website with credentials stored in the connection labeled 'selenium_conn_id'
2) Download a file (initiated on login)
3) Transform the CSV into JSON formatting
4) Append the current date to each record
5) Load the corresponding file into S3

To use this DAG, you will need to have the following installed:

[XVFB](https://www.x.org/archive/X11R7.6/doc/man/man1/Xvfb.1.xhtml)
[GeckoDriver](https://github.com/mozilla/geckodriver/releases/download)
selenium==3.11.0
xvfbwrapper==0.2.9
"""
|
import csv |
|
import datetime |
|
import json |
|
import logging |
|
import os |
|
import time |
|
from datetime import datetime, timedelta |
|
|
|
import boa |
|
import requests |
|
from airflow import DAG |
|
from airflow.models import Connection |
|
from airflow.operators.dummy_operator import DummyOperator |
|
from airflow.operators.python_operator import PythonOperator, PythonVirtualenvOperator |
|
from airflow.providers.docker.operators.docker import DockerOperator |
|
from airflow.utils.dates import days_ago |
|
from airflow.utils.db import provide_session |
|
from bs4 import BeautifulSoup |
|
from selenium import webdriver |
|
from selenium.webdriver.chrome.options import Options |
|
from selenium.webdriver.chrome.service import Service |
|
from selenium.webdriver.common.by import By |
|
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities |
|
from selenium.webdriver.common.keys import Keys |
|
from tools.network import ( |
|
browser_config, |
|
browser_feature, |
|
driver_object, |
|
ip_status, |
|
proxy_ip, |
|
selenoid_status, |
|
uc_test, |
|
user_agent, |
|
vpn_settings, |
|
) |
|
|
|
from config.webdriver import browser_capabilities, browser_options |
|
|
|
# Shared settings dictionary; the commented-out DAG definition further down
# passes it to the DAG constructor as ``default_args``.
default_args = dict(
    start_date=days_ago(2),
    # No recipients are configured, even though failure e-mails are enabled.
    email=[],
    email_on_failure=True,
    email_on_retry=False,
    # retries=2,
    retry_delay=timedelta(minutes=5),
    # NOTE(review): ``catchup`` is normally given to the DAG constructor
    # itself; confirm that it has any effect when supplied through here.
    catchup=False,
)
|
|
|
|
|
# def hello_world_py(): |
|
# # selenium_conn_id = kwargs.get('templates_dict', None).get('selenium_conn_id', None) |
|
# # filename = kwargs.get('templates_dict', None).get('filename', None) |
|
# # s3_conn_id = kwargs.get('templates_dict', None).get('s3_conn_id', None) |
|
# # s3_bucket = kwargs.get('templates_dict', None).get('s3_bucket', None) |
|
# # s3_key = kwargs.get('templates_dict', None).get('s3_key', None) |
|
# # date = kwargs.get('templates_dict', None).get('date', None) |
|
# # module_name = kwargs.get('templates_dict', None).get('module', None) |
|
|
|
|
|
# module = "anon_browser_test" |
|
|
|
# chrome_options = browser_options() |
|
# capabilities = browser_capabilities(module) |
|
# logging.info('Assembling driver') |
|
# driver = webdriver.Remote( |
|
# command_executor="http://192.168.1.101:4444/wd/hub", |
|
# options=chrome_options, |
|
# desired_capabilities=capabilities, |
|
# ) |
|
# logging.info('proxy IP') |
|
# proxy_ip() |
|
# logging.info('driver') |
|
# vpn_settings() |
|
# logging.info('driver') |
|
# selenoid_status() |
|
# logging.info('driver') |
|
# ip_status(driver) |
|
# logging.info('driver') |
|
# browser_config(driver) |
|
# logging.info('driver') |
|
# user_agent(driver) |
|
# logging.info('driver') |
|
# driver_object(driver) |
|
# logging.info('driver') |
|
# browser_feature(driver) |
|
# logging.info('driver') |
|
|
|
# driver.quit() |
|
# logging.info('driver') |
|
# uc_test() |
|
# # print("Finished") |
|
# return 'Whatever you return gets printed in the logs' |
|
|
|
# dag = DAG( |
|
# 'anon_browser_test', |
|
# schedule_interval='@daily', |
|
# default_args=default_args, |
|
# catchup=False |
|
# ) |
|
|
|
# dummy_operator = DummyOperator(task_id="dummy_task", retries=3, dag=dag) |
|
|
|
# selenium = PythonOperator( |
|
# task_id='anon_browser_test', |
|
# python_callable=hello_world_py, |
|
# templates_dict={"module": "anon_browser_test"}, |
|
# dag=dag |
|
# # "s3_bucket": S3_BUCKET, |
|
# # "s3_key": S3_KEY, |
|
# # "date": date} |
|
# # provide_context=True |
|
# ) |
|
|
|
# t1 = DockerOperator( |
|
# # api_version='1.19', |
|
# # docker_url='tcp://localhost:2375', # Set your docker URL |
|
# command='/bin/sleep 30', |
|
# image='selenoid/chrome:latest', |
|
# # network_mode='bridge', |
|
# task_id='chrome', |
|
# dag=dag, |
|
# ) |
|
|
|
# t2 = DockerOperator( |
|
# # api_version='1.19', |
|
# # docker_url='tcp://localhost:2375', # Set your docker URL |
|
# command='/bin/sleep 30', |
|
# image='selenoid/video-recorder:latest-release', |
|
# # network_mode='bridge', |
|
# task_id='video_recorder', |
|
# dag=dag, |
|
# ) |
|
|
|
# [START howto_operator_python_venv] |
|
def callable_virtualenv():
    """
    Example function that will be performed in a virtual environment.

    The imports are deliberately placed inside the function body rather than
    at the top of the module, so that ``colorama`` is only imported when the
    function actually runs and not when this module is loaded.

    Prints a few coloured demo lines, then ten "Please wait..." messages
    with a ten-second pause after each, and finally "Finished".
    """
    import time

    import colorama

    fore, back, style = colorama.Fore, colorama.Back, colorama.Style

    print(f"{fore.RED}some red text")
    print(f"{back.GREEN}and with a green background")
    print(f"{style.DIM}and in dim text")
    print(style.RESET_ALL)

    remaining = 10
    while remaining > 0:
        # flush so each progress line is emitted before the pause begins
        print(f"{style.DIM}Please wait...", flush=True)
        time.sleep(10)
        remaining -= 1

    print("Finished")
|
|
|
|
|
# The task below is attached to ``dag``, but the only DAG construction in this
# file was commented out, so the name was undefined and the module raised
# NameError while being parsed. The definition is restored here with the same
# id, schedule and arguments as the commented-out version.
dag = DAG(
    "anon_browser_test",
    schedule_interval="@daily",
    default_args=default_args,
    catchup=False,
)

# Runs ``callable_virtualenv`` through PythonVirtualenvOperator. ``colorama``
# is pinned in ``requirements`` because the callable imports it, and
# ``system_site_packages=False`` asks for an environment that does not reuse
# the packages installed alongside Airflow.
virtualenv_task = PythonVirtualenvOperator(
    task_id="virtualenv_python",
    python_callable=callable_virtualenv,
    requirements=["colorama==0.4.0"],
    system_site_packages=False,
    dag=dag,
)
|
# [END howto_operator_python_venv] |
|
|
|
# selenium >> dummy_operator |
|
# dummy_operator >> virtualenv_task |
|
# t1 >> selenium |
|
# t2 >> selenium
|
|
|