Spaces:

AmmarFahmy
/

AutoRAG_llama3_groq

Runtime error

File size: 12,558 Bytes

105b369

from typing import Optional, Any, Dict, List

from phi.aws.api_client import AwsApiClient
from phi.aws.resource.base import AwsResource
from phi.aws.resource.iam.role import IamRole
from phi.aws.resource.s3.bucket import S3Bucket
from phi.cli.console import print_info
from phi.utils.log import logger


class GlueS3Target(AwsResource):
    """Describes one S3 location for a Glue crawler to crawl.

    GlueCrawler.get_glue_crawler_targets() converts each instance into an
    entry of the "S3Targets" list: the "Path" is built as
    ``s3://{bucket.name}/{dir}`` and every optional field that is not None
    is added under its corresponding key.
    """

    # The directory path in the S3 bucket to target.
    # Appended to the bucket name to form the crawl path; the default ""
    # yields "s3://<bucket>/".
    dir: str = ""
    # The S3 bucket to target; its `name` is used to build the crawl path.
    bucket: S3Bucket
    # A list of glob patterns used to exclude from the crawl ("Exclusions").
    # For more information, see https://docs.aws.amazon.com/glue/latest/dg/add-crawler.html
    exclusions: Optional[List[str]] = None
    # The name of a connection which allows a job or crawler to access data in Amazon S3 within an
    # Amazon Virtual Private Cloud environment (Amazon VPC). Sent as "ConnectionName".
    connection_name: Optional[str] = None
    # Sets the number of files in each leaf folder to be crawled when crawling sample files in a dataset.
    # If not set, all the files are crawled. A valid value is an integer between 1 and 249.
    # Sent as "SampleSize".
    sample_size: Optional[int] = None
    # A valid Amazon SQS ARN. For example, arn:aws:sqs:region:account:sqs .
    # Sent as "EventQueueArn".
    event_queue_arn: Optional[str] = None
    # A valid Amazon dead-letter SQS ARN. For example, arn:aws:sqs:region:account:deadLetterQueue .
    # Sent as "DlqEventQueueArn".
    dlq_event_queue_arn: Optional[str] = None


class GlueCrawler(AwsResource):
    """AWS Glue crawler resource driven through the boto3 ``glue`` client.

    ``_create``, ``_read`` and ``_delete`` call ``create_crawler``,
    ``get_crawler`` and ``delete_crawler`` respectively; ``start_crawler``
    triggers a crawler run.

    Reference:
    - https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue.html
    """

    resource_type: Optional[str] = "GlueCrawler"
    service_name: str = "glue"

    # Name of the crawler.
    name: str
    # The IAM role for the crawler
    iam_role: IamRole
    # List of GlueS3Target to add to the targets dict
    s3_targets: Optional[List[GlueS3Target]] = None
    # The Glue database where results are written,
    # such as: arn:aws:daylight:us-east-1::database/sometable/* .
    database_name: Optional[str] = None
    # A description of the new crawler.
    description: Optional[str] = None
    # A list of collection of targets to crawl.
    targets: Optional[Dict[str, List[dict]]] = None
    # A cron expression used to specify the schedule
    # For example, to run something every day at 12:15 UTC,
    # you would specify: cron(15 12 * * ? *) .
    schedule: Optional[str] = None
    # A list of custom classifiers that the user has registered.
    # By default, all built-in classifiers are included in a crawl,
    # but these custom classifiers always override the default classifiers for a given classification.
    classifiers: Optional[List[str]] = None
    # The table prefix used for catalog tables that are created.
    table_prefix: Optional[str] = None
    # The policy for the crawler's update and deletion behavior.
    schema_change_policy: Optional[Dict[str, str]] = None
    # A policy that specifies whether to crawl the entire dataset again,
    # or to crawl only folders that were added since the last crawler run.
    recrawl_policy: Optional[Dict[str, str]] = None
    lineage_configuration: Optional[Dict[str, str]] = None
    lake_formation_configuration: Optional[Dict[str, str]] = None
    # Crawler configuration information. This versioned JSON string
    # allows users to specify aspects of a crawler's behavior.
    configuration: Optional[str] = None
    # The name of the SecurityConfiguration structure to be used by this crawler.
    crawler_security_configuration: Optional[str] = None
    # The tags to use with this crawler request.
    tags: Optional[Dict[str, str]] = None

    # provided by api on create
    creation_time: Optional[str] = None
    last_crawl: Optional[str] = None

    def get_glue_crawler_targets(self) -> Optional[Dict[str, List[dict]]]:
        """Build the ``Targets`` argument for ``create_crawler``.

        Starts from the user-provided ``targets`` and appends one entry per
        ``GlueS3Target`` in ``s3_targets`` to the "S3Targets" list.

        ``self.targets`` is never modified: the merge happens on a shallow
        copy, so calling this method repeatedly always yields the same result
        instead of accumulating duplicate S3 targets.

        Returns:
            The merged targets dict, or None when neither ``targets`` nor
            ``s3_targets`` is set.
        """
        # Work on a copy so the user-provided dict is left untouched.
        crawler_targets: Optional[Dict[str, List[dict]]] = (
            dict(self.targets) if self.targets is not None else None
        )

        # Add GlueS3Targets to crawler_targets
        if self.s3_targets is not None:
            new_s3_targets_list: List[dict] = []
            for s3_target in self.s3_targets:
                # "Path" is the only required argument
                s3_target_dict: Dict[str, Any] = {"Path": f"s3://{s3_target.bucket.name}/{s3_target.dir}"}
                # Optional arguments are only sent when explicitly set
                optional_values = (
                    ("Exclusions", s3_target.exclusions),
                    ("ConnectionName", s3_target.connection_name),
                    ("SampleSize", s3_target.sample_size),
                    ("EventQueueArn", s3_target.event_queue_arn),
                    ("DlqEventQueueArn", s3_target.dlq_event_queue_arn),
                )
                s3_target_dict.update({key: value for key, value in optional_values if value is not None})
                new_s3_targets_list.append(s3_target_dict)

            if crawler_targets is None:
                crawler_targets = {}
            # Concatenation creates a new list, so the user-provided
            # "S3Targets" list is not mutated either.
            crawler_targets["S3Targets"] = crawler_targets.get("S3Targets", []) + new_s3_targets_list

        # TODO: add more targets as needed
        logger.debug(f"GlueCrawler targets: {crawler_targets}")
        return crawler_targets

    def _create(self, aws_client: AwsApiClient) -> bool:
        """Creates the GlueCrawler

        Args:
            aws_client: The AwsApiClient for the current cluster

        Returns:
            True if ``create_crawler`` returned a response, False if the IAM
            role ARN is unavailable or the call raised.
        """
        from botocore.exceptions import ClientError

        print_info(f"Creating {self.get_resource_type()}: {self.get_resource_name()}")
        try:
            # create a dict of args which are not null, otherwise aws type validation fails
            optional_args: Dict[str, Any] = {
                "DatabaseName": self.database_name,
                "Description": self.description,
                "Schedule": self.schedule,
                "Classifiers": self.classifiers,
                "TablePrefix": self.table_prefix,
                "SchemaChangePolicy": self.schema_change_policy,
                "RecrawlPolicy": self.recrawl_policy,
                "LineageConfiguration": self.lineage_configuration,
                "LakeFormationConfiguration": self.lake_formation_configuration,
                "Configuration": self.configuration,
                "CrawlerSecurityConfiguration": self.crawler_security_configuration,
                "Tags": self.tags,
            }
            # Empty values (None, "", {}, []) are dropped as well
            not_null_args: Dict[str, Any] = {key: value for key, value in optional_args.items() if value}

            targets = self.get_glue_crawler_targets()
            if targets:
                not_null_args["Targets"] = targets

            # Create crawler
            # Get the service_client
            service_client = self.get_service_client(aws_client)
            iam_role_arn = self.iam_role.get_arn(aws_client)
            if iam_role_arn is None:
                logger.error("IamRole ARN unavailable.")
                return False
            create_response = service_client.create_crawler(
                Name=self.name,
                Role=iam_role_arn,
                **not_null_args,
            )
            logger.debug(f"GlueCrawler: {create_response}")
            logger.debug(f"GlueCrawler type: {type(create_response)}")

            if create_response is not None:
                print_info(f"GlueCrawler created: {self.name}")
                self.active_resource = create_response
                return True
        except ClientError as ce:
            # A rejected create request is a real failure: report it at error
            # level instead of hiding it in the debug log.
            logger.error(f"{self.get_resource_type()} could not be created.")
            logger.error(f"ClientError: {ce}")
        except Exception as e:
            logger.error(f"{self.get_resource_type()} could not be created.")
            logger.error(e)
        return False

    def _read(self, aws_client: AwsApiClient) -> Optional[Any]:
        """Returns the GlueCrawler

        Calls ``get_crawler`` and stores "CreationTime" and "LastCrawl" from
        the response. The response is cached as ``active_resource`` only when
        a creation time is present.

        Args:
            aws_client: The AwsApiClient for the current cluster

        Returns:
            The current ``active_resource`` (possibly None).
        """
        from botocore.exceptions import ClientError

        logger.debug(f"Reading {self.get_resource_type()}: {self.get_resource_name()}")
        try:
            service_client = self.get_service_client(aws_client)
            get_crawler_response = service_client.get_crawler(Name=self.name)
            # logger.debug(f"GlueCrawler: {get_crawler_response}")
            # logger.debug(f"GlueCrawler type: {type(get_crawler_response)}")

            crawler_details = get_crawler_response.get("Crawler", {})
            self.creation_time = crawler_details.get("CreationTime", None)
            self.last_crawl = crawler_details.get("LastCrawl", None)
            logger.debug(f"GlueCrawler creation_time: {self.creation_time}")
            logger.debug(f"GlueCrawler last_crawl: {self.last_crawl}")
            if self.creation_time is not None:
                logger.debug(f"GlueCrawler found: {self.name}")
                self.active_resource = get_crawler_response
        except ClientError as ce:
            # Kept at debug level: presumably raised when the crawler does
            # not exist yet, which is not an error while reading.
            logger.debug(f"ClientError: {ce}")
        except Exception as e:
            logger.error(f"Error reading {self.get_resource_type()}.")
            logger.error(e)
        return self.active_resource

    def _delete(self, aws_client: AwsApiClient) -> bool:
        """Deletes the GlueCrawler

        Args:
            aws_client: The AwsApiClient for the current cluster

        Returns:
            True if ``delete_crawler`` completed without raising, else False.
        """

        print_info(f"Deleting {self.get_resource_type()}: {self.get_resource_name()}")
        try:
            # Delete the GlueCrawler
            service_client = self.get_service_client(aws_client)
            # The cached resource is dropped before the delete call is issued
            self.active_resource = None
            service_client.delete_crawler(Name=self.name)
            # logger.debug(f"GlueCrawler: {delete_crawler_response}")
            # logger.debug(f"GlueCrawler type: {type(delete_crawler_response)}")
            print_info(f"GlueCrawler deleted: {self.name}")
            return True
        except Exception as e:
            logger.error(f"{self.get_resource_type()} could not be deleted.")
            logger.error("Please try again or delete resources manually.")
            logger.error(e)
        return False

    def start_crawler(self, aws_client: Optional[AwsApiClient] = None) -> bool:
        """Runs the GlueCrawler

        Args:
            aws_client: The AwsApiClient for the current cluster. When not
                provided, ``self.get_aws_client()`` is used.

        Returns:
            True if the crawler was started or is already running, else False.
        """

        print_info(f"Starting {self.get_resource_type()}: {self.get_resource_name()}")
        try:
            # Get the service_client
            client: AwsApiClient = aws_client or self.get_aws_client()
            service_client = self.get_service_client(client)
            # logger.debug(f"ServiceClient: {service_client}")
            # logger.debug(f"ServiceClient type: {type(service_client)}")

            try:
                start_crawler_response = service_client.start_crawler(Name=self.name)
                # logger.debug(f"start_crawler_response: {start_crawler_response}")
            except service_client.exceptions.CrawlerRunningException:
                # An already running crawler counts as success
                # reference: https://github.com/boto/boto3/issues/1606
                print_info(f"{self.get_resource_type()}: {self.get_resource_name()} already running")
                return True

            if start_crawler_response is not None:
                print_info(f"{self.get_resource_type()}: {self.get_resource_name()} started")
                return True

        except Exception as e:
            logger.error("GlueCrawler could not be started")
            # logger.exception records the message together with the
            # traceback, so a separate logger.error(e) is not needed.
            logger.exception(e)
        return False