# Apache Airflow DAGs with backend configuration bundle.
import gzip
import json
import logging
import os
import time

import boa
from airflow.hooks.base_hook import BaseHook
from airflow.hooks.S3_hook import S3Hook
from airflow.models import BaseOperator, Variable
from six import BytesIO

from hooks.google import GoogleHook


class GoogleSheetsToS3Operator(BaseOperator):
    """
    Google Sheets To S3 Operator

    :param google_conn_id: The Google connection id.
    :type google_conn_id: string
    :param sheet_id: The id of the associated report (spreadsheet).
    :type sheet_id: string
    :param sheet_names: The names of the relevant sheets in the report.
    :type sheet_names: string/array
    :param range: The range of cells containing the relevant data.
                  This must be the same for all sheets if multiple
                  are being pulled together.
                  Example: Sheet1!A2:E80
    :type range: string
    :param include_schema: If set to true, infer the schema of the data and
                           output it to S3 as a separate file.
    :type include_schema: boolean
    :param s3_conn_id: The S3 connection id.
    :type s3_conn_id: string
    :param s3_key: The S3 key to be used to store the retrieved data.
    :type s3_key: string
    :param compression_bound: The size threshold in MB at or above which
                              the output is gzip-compressed before upload.
    :type compression_bound: integer
    :param output_format: The output format. Currently only "json" is
                          supported.
    :type output_format: string
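
    Example (illustrative sketch; the connection ids, spreadsheet id, S3 key,
    and DAG wiring below are assumptions, not values defined in this module)::

        sheets_to_s3 = GoogleSheetsToS3Operator(
            task_id="sheets_to_s3",
            google_conn_id="google_default",
            sheet_id="<spreadsheet-id>",
            s3_conn_id="s3_default",
            s3_key="google_sheets/report.json",
            compression_bound=10,
            include_schema=True,
            sheet_names=["Sheet1", "Sheet2"],
            dag=dag,
        )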
"""
template_fields = ("s3_key",)

    def __init__(
        self,
        google_conn_id,
        sheet_id,
        s3_conn_id,
        s3_key,
        compression_bound,
        include_schema=False,
        sheet_names=None,
        range=None,
        output_format="json",
        *args,
        **kwargs
    ):
        super().__init__(*args, **kwargs)
        self.google_conn_id = google_conn_id
        self.sheet_id = sheet_id
        self.sheet_names = sheet_names or []
        self.s3_conn_id = s3_conn_id
        self.s3_key = s3_key
        self.include_schema = include_schema
        self.range = range
        self.output_format = output_format.lower()
        self.compression_bound = compression_bound

        if self.output_format not in ("json",):
            raise Exception("Acceptable output formats are: json.")
        if self.sheet_names and not isinstance(self.sheet_names, (str, list, tuple)):
            raise Exception("Please specify the sheet names as a string or list.")

    def execute(self, context):
        g_conn = GoogleHook(self.google_conn_id)

        # Sheet names may be passed as a list or as a comma-delimited string.
        if isinstance(self.sheet_names, str) and "," in self.sheet_names:
            sheet_names = self.sheet_names.split(",")
        else:
            sheet_names = self.sheet_names

        sheets_object = g_conn.get_service_object("sheets", "v4")
        logging.info("Retrieved Sheets Object")

        # Pull the entire spreadsheet, including cell data, in a single call.
        response = (
            sheets_object.spreadsheets()
            .get(spreadsheetId=self.sheet_id, includeGridData=True)
            .execute()
        )

        title = response.get("properties").get("title")
        sheets = response.get("sheets")

        final_output = dict()
        total_sheets = []

        for sheet in sheets:
            name = sheet.get("properties").get("title")
            name = boa.constrict(name)
            total_sheets.append(name)

            # Skip any sheet that was not explicitly requested.
            if self.sheet_names:
                if name not in sheet_names:
                    logging.info(
                        "{} is not in the requested sheet names; skipping.".format(name)
                    )
                    continue

            table_name = name
            data = sheet.get("data")[0].get("rowData")
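            # Each element of rowData is expected to look roughly like this
            # (illustrative shape only, per the Sheets v4 API):
            #   {"values": [{"effectiveValue": {"stringValue": "Alice"}},
            #               {"effectiveValue": {"numberValue": 42}},
            #               {}]}
            # Cells without an effectiveValue are recorded as None below.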
            output = []
            for row in data:
                row_data = []
                values = row.get("values", [])  # blank rows may omit "values"
                for value in values:
                    ev = value.get("effectiveValue")
                    if ev is None:
                        row_data.append(None)
                    else:
                        for v in ev.values():
                            row_data.append(v)
                output.append(row_data)

            if self.output_format == "json":
                # Treat the first row as the header row.
                headers = output.pop(0)
                output = [dict(zip(headers, row)) for row in output]

            final_output[table_name] = output

        s3 = S3Hook(self.s3_conn_id)

        for sheet in final_output:
            output_data = final_output.get(sheet)

            # Write each sheet to its own key, e.g. "<key>_<sheet>.json",
            # with an optional companion "<key>_<sheet>_schema.json".
            file_name, file_extension = os.path.splitext(self.s3_key)
            output_name = "".join([file_name, "_", sheet, file_extension])

            schema_name = None
            if self.include_schema is True:
                schema_name = "".join(
                    [file_name, "_", sheet, "_schema", file_extension]
                )

            self.output_manager(
                s3, output_name, output_data, context, sheet, schema_name
            )

        # Record which sheets were found so downstream tasks can discover them.
        dag_id = context["ti"].dag_id
        var_key = "_".join([dag_id, self.sheet_id])
        Variable.set(key=var_key, value=json.dumps(total_sheets))

        time.sleep(10)
        return boa.constrict(title)

    def output_manager(
        self, s3, output_name, output_data, context, sheet_name, schema_name=None
    ):
        self.s3_bucket = BaseHook.get_connection(self.s3_conn_id).host

        if self.output_format == "json":
            # Serialize as newline-delimited JSON with snake_cased keys.
            output = "\n".join(
                [
                    json.dumps({boa.constrict(str(k)): v for k, v in record.items()})
                    for record in output_data
                ]
            )
            enc_output = output.encode("utf-8")

            # If the encoded payload is at least `compression_bound` MB
            # (e.g. a 12 MB payload with a bound of 10), gzip it before upload.
            if len(enc_output) / 1024 / 1024 >= self.compression_bound:
                logging.info(
                    "File is more than {}MB, gzip compression will be applied".format(
                        self.compression_bound
                    )
                )
                output = gzip.compress(enc_output, compresslevel=5)
                # Flag the compression state so downstream tasks can choose
                # the right reader.
                self.xcom_push(
                    context,
                    key="is_compressed_{}".format(sheet_name),
                    value="compressed",
                )
                self.load_bytes(
                    s3,
                    bytes_data=output,
                    key=output_name,
                    bucket_name=self.s3_bucket,
                    replace=True,
                )
            else:
                logging.info(
                    "File is less than {}MB, compression will not be applied".format(
                        self.compression_bound
                    )
                )
                self.xcom_push(
                    context,
                    key="is_compressed_{}".format(sheet_name),
                    value="non-compressed",
                )
                s3.load_string(
                    string_data=output,
                    key=output_name,
                    bucket_name=self.s3_bucket,
                    replace=True,
                )

            if self.include_schema is True:
                # Emit a simple schema file alongside the data, typing every
                # column as varchar(512).
                output_keys = output_data[0].keys()
                schema = [
                    {"name": boa.constrict(a), "type": "varchar(512)"}
                    for a in output_keys
                    if a is not None
                ]
                schema = {"columns": schema}
                s3.load_string(
                    string_data=json.dumps(schema),
                    key=schema_name,
                    bucket_name=self.s3_bucket,
                    replace=True,
                )

            logging.info('Successfully wrote "{}" to S3.'.format(output_name))

        # TODO -- Add support for csv output
        # elif self.output_format == 'csv':
        #     with NamedTemporaryFile("w") as f:
        #         writer = csv.writer(f)
        #         writer.writerows(output_data)
        #         s3.load_file(
        #             filename=f.name,
        #             key=output_name,
        #             bucket_name=self.s3_bucket,
        #             replace=True
        #         )
        #
        #         if self.include_schema is True:
        #             pass

    # TODO: remove when airflow version is upgraded to 1.10
    def load_bytes(self, s3, bytes_data, key, bucket_name=None, replace=False):
        # Local stand-in for S3Hook.load_bytes, which is only available
        # from Airflow 1.10 onwards.
        if not bucket_name:
            (bucket_name, key) = s3.parse_s3_url(key)
        if not replace and s3.check_for_key(key, bucket_name):
            raise ValueError("The key {key} already exists.".format(key=key))

        filelike_buffer = BytesIO(bytes_data)
        client = s3.get_conn()
        client.upload_fileobj(filelike_buffer, bucket_name, key, ExtraArgs={})
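

# Illustrative downstream read of the XCom/Variable written above (sketch only;
# the task id, SHEET_ID constant, and wiring are assumptions, not part of this
# module):
#
#     def check_compression(**context):
#         ti = context["ti"]
#         sheets = json.loads(Variable.get("{}_{}".format(ti.dag_id, SHEET_ID)))
#         for sheet in sheets:
#             flag = ti.xcom_pull(
#                 task_ids="sheets_to_s3", key="is_compressed_{}".format(sheet)
#             )
#             logging.info("%s is %s", sheet, flag)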