#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from time import sleep

try:
    from functools import cached_property
except ImportError:
    from cached_property import cached_property

from airflow.exceptions import AirflowException
from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook

class AwsGlueCrawlerHook(AwsBaseHook):
    """
    Interacts with AWS Glue Crawler.

    Additional arguments (such as ``aws_conn_id``) may be specified and
    are passed down to the underlying AwsBaseHook.

    .. seealso::
        :class:`~airflow.providers.amazon.aws.hooks.base_aws.AwsBaseHook`
    """
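
    # A minimal construction sketch; ``aws_default`` is an illustrative
    # connection id, not something defined in this module:
    #
    #   hook = AwsGlueCrawlerHook(aws_conn_id="aws_default")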

    def __init__(self, *args, **kwargs):
        kwargs["client_type"] = "glue"
        super().__init__(*args, **kwargs)

    @cached_property
    def glue_client(self):
        """:return: AWS Glue client"""
        return self.get_conn()

    def has_crawler(self, crawler_name) -> bool:
        """
        Checks if the crawler already exists

        :param crawler_name: unique crawler name per AWS account
        :type crawler_name: str
        :return: Returns True if the crawler already exists and False if not.
        """
        self.log.info("Checking if crawler already exists: %s", crawler_name)

        try:
            self.get_crawler(crawler_name)
            return True
        except self.glue_client.exceptions.EntityNotFoundException:
            return False

    def get_crawler(self, crawler_name: str) -> dict:
        """
        Gets crawler configurations

        :param crawler_name: unique crawler name per AWS account
        :type crawler_name: str
        :return: Nested dictionary of crawler configurations
        """
        return self.glue_client.get_crawler(Name=crawler_name)["Crawler"]

    def update_crawler(self, **crawler_kwargs) -> bool:
        """
        Updates crawler configurations

        :param crawler_kwargs: Keyword args that define the configurations used for the crawler
        :type crawler_kwargs: any
        :return: True if the crawler was updated and False otherwise
        """
        crawler_name = crawler_kwargs["Name"]
        current_crawler = self.get_crawler(crawler_name)

        # Only push an update when at least one requested setting differs from
        # the crawler's current configuration.
        update_config = {
            key: value
            for key, value in crawler_kwargs.items()
            if current_crawler.get(key) != value
        }
        if update_config:
            self.log.info("Updating crawler: %s", crawler_name)
            self.glue_client.update_crawler(**crawler_kwargs)
            self.log.info("Updated configurations: %s", update_config)
            return True
        return False

    def create_crawler(self, **crawler_kwargs) -> str:
        """
        Creates an AWS Glue Crawler

        :param crawler_kwargs: Keyword args that define the configurations used to create the crawler
        :type crawler_kwargs: any
        :return: Name of the crawler
        """
        crawler_name = crawler_kwargs["Name"]
        self.log.info("Creating crawler: %s", crawler_name)
        # The Glue CreateCrawler response carries no crawler object, so return
        # the name that was passed in rather than indexing into the response.
        self.glue_client.create_crawler(**crawler_kwargs)
        return crawler_name

    def start_crawler(self, crawler_name: str) -> dict:
        """
        Triggers the AWS Glue crawler

        :param crawler_name: unique crawler name per AWS account
        :type crawler_name: str
        :return: Empty dictionary
        """
        self.log.info("Starting crawler %s", crawler_name)
        crawler = self.glue_client.start_crawler(Name=crawler_name)
        return crawler

    def wait_for_crawler_completion(self, crawler_name: str, poll_interval: int = 5) -> str:
        """
        Waits until Glue crawler completes and
        returns the status of the latest crawl run.
        Raises AirflowException if the crawler fails or is cancelled.

        :param crawler_name: unique crawler name per AWS account
        :type crawler_name: str
        :param poll_interval: Time (in seconds) to wait between two consecutive calls to check crawler status
        :type poll_interval: int
        :return: Crawler's status
        """
        failed_status = ["FAILED", "CANCELLED"]

        while True:
            crawler = self.get_crawler(crawler_name)
            crawler_state = crawler["State"]
            if crawler_state == "READY":
                self.log.info("State: %s", crawler_state)
                self.log.info("crawler_config: %s", crawler)
                crawler_status = crawler["LastCrawl"]["Status"]
                if crawler_status in failed_status:
                    raise AirflowException(f"Status: {crawler_status}")  # pylint: disable=raising-format-tuple
                else:
                    metrics = self.glue_client.get_crawler_metrics(CrawlerNameList=[crawler_name])[
                        "CrawlerMetricsList"
                    ][0]
                    self.log.info("Status: %s", crawler_status)
                    self.log.info("Last Runtime Duration (seconds): %s", metrics["LastRuntimeSeconds"])
                    self.log.info("Median Runtime Duration (seconds): %s", metrics["MedianRuntimeSeconds"])
                    self.log.info("Tables Created: %s", metrics["TablesCreated"])
                    self.log.info("Tables Updated: %s", metrics["TablesUpdated"])
                    self.log.info("Tables Deleted: %s", metrics["TablesDeleted"])

                    return crawler_status

            else:
                self.log.info("Polling for AWS Glue crawler: %s", crawler_name)
                self.log.info("State: %s", crawler_state)

                metrics = self.glue_client.get_crawler_metrics(CrawlerNameList=[crawler_name])[
                    "CrawlerMetricsList"
                ][0]
                time_left = int(metrics["TimeLeftSeconds"])

                if time_left > 0:
                    self.log.info("Estimated Time Left (seconds): %s", time_left)
                else:
                    self.log.info("Crawler should finish soon")

                sleep(poll_interval)
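

# Example end-to-end usage (a minimal sketch; the connection id, crawler name,
# IAM role, database, and S3 path below are illustrative assumptions, not
# values defined in this module):
#
#   hook = AwsGlueCrawlerHook(aws_conn_id="aws_default")
#   config = dict(
#       Name="example_crawler",
#       Role="example-glue-role",
#       DatabaseName="example_db",
#       Targets={"S3Targets": [{"Path": "s3://example-bucket/raw/"}]},
#   )
#   if hook.has_crawler("example_crawler"):
#       hook.update_crawler(**config)
#   else:
#       hook.create_crawler(**config)
#   hook.start_crawler("example_crawler")
#   status = hook.wait_for_crawler_completion("example_crawler", poll_interval=10)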