from typing import Optional, Any, Dict, List

from phi.aws.api_client import AwsApiClient
from phi.aws.resource.base import AwsResource
from phi.aws.resource.iam.role import IamRole
from phi.aws.resource.s3.bucket import S3Bucket
from phi.cli.console import print_info
from phi.utils.log import logger


class GlueS3Target(AwsResource):
    # The directory path in the S3 bucket to target
    dir: str = ""
    # The S3 bucket to target
    bucket: S3Bucket
    # A list of glob patterns used to exclude files from the crawl.
    # For more information, see https://docs.aws.amazon.com/glue/latest/dg/add-crawler.html
    exclusions: Optional[List[str]] = None
    # The name of a connection which allows a job or crawler to access data in Amazon S3 within an
    # Amazon Virtual Private Cloud environment (Amazon VPC).
    connection_name: Optional[str] = None
    # Sets the number of files in each leaf folder to be crawled when crawling sample files in a dataset.
    # If not set, all the files are crawled. A valid value is an integer between 1 and 249.
    sample_size: Optional[int] = None
    # A valid Amazon SQS ARN. For example, arn:aws:sqs:region:account:sqs .
    event_queue_arn: Optional[str] = None
    # A valid Amazon dead-letter SQS ARN. For example, arn:aws:sqs:region:account:deadLetterQueue .
    dlq_event_queue_arn: Optional[str] = None


class GlueCrawler(AwsResource):
    """
    Reference:
    - https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue.html
    """

    resource_type: Optional[str] = "GlueCrawler"
    service_name: str = "glue"

    # Name of the crawler.
    name: str
    # The IAM role for the crawler
    iam_role: IamRole
    # List of GlueS3Target objects to add to the targets dict
    s3_targets: Optional[List[GlueS3Target]] = None
    # The Glue database where results are written,
    # such as: arn:aws:daylight:us-east-1::database/sometable/* .
    database_name: Optional[str] = None
    # A description of the new crawler.
    description: Optional[str] = None
    # A collection of targets to crawl.
    targets: Optional[Dict[str, List[dict]]] = None
    # A cron expression used to specify the schedule.
    # For example, to run something every day at 12:15 UTC,
    # you would specify: cron(15 12 * * ? *) .
    schedule: Optional[str] = None
    # A list of custom classifiers that the user has registered.
    # By default, all built-in classifiers are included in a crawl,
    # but these custom classifiers always override the default classifiers for a given classification.
    classifiers: Optional[List[str]] = None
    # The table prefix used for catalog tables that are created.
    table_prefix: Optional[str] = None
    # The policy for the crawler's update and deletion behavior.
    schema_change_policy: Optional[Dict[str, str]] = None
    # A policy that specifies whether to crawl the entire dataset again,
    # or to crawl only folders that were added since the last crawler run.
    recrawl_policy: Optional[Dict[str, str]] = None
    # Specifies data lineage configuration settings for the crawler.
    lineage_configuration: Optional[Dict[str, str]] = None
    # Specifies Lake Formation configuration settings for the crawler.
    lake_formation_configuration: Optional[Dict[str, str]] = None
    # Crawler configuration information. This versioned JSON string
    # allows users to specify aspects of a crawler's behavior.
    configuration: Optional[str] = None
    # The name of the SecurityConfiguration structure to be used by this crawler.
    crawler_security_configuration: Optional[str] = None
    # The tags to use with this crawler request.
    tags: Optional[Dict[str, str]] = None

    # provided by api on create
    creation_time: Optional[str] = None
    last_crawl: Optional[str] = None

    def get_glue_crawler_targets(self) -> Optional[Dict[str, List[dict]]]:
        # start with user provided targets
        crawler_targets: Optional[Dict[str, List[dict]]] = self.targets

        # Add GlueS3Targets to crawler_targets
        if self.s3_targets is not None:
            # create S3Targets dicts using s3_targets
            new_s3_targets_list: List[dict] = []
            for s3_target in self.s3_targets:
                _new_s3_target_path = f"s3://{s3_target.bucket.name}/{s3_target.dir}"
                # start with the only required argument
                _new_s3_target_dict: Dict[str, Any] = {"Path": _new_s3_target_path}
                # add any optional arguments
                if s3_target.exclusions is not None:
                    _new_s3_target_dict["Exclusions"] = s3_target.exclusions
                if s3_target.connection_name is not None:
                    _new_s3_target_dict["ConnectionName"] = s3_target.connection_name
                if s3_target.sample_size is not None:
                    _new_s3_target_dict["SampleSize"] = s3_target.sample_size
                if s3_target.event_queue_arn is not None:
                    _new_s3_target_dict["EventQueueArn"] = s3_target.event_queue_arn
                if s3_target.dlq_event_queue_arn is not None:
                    _new_s3_target_dict["DlqEventQueueArn"] = s3_target.dlq_event_queue_arn
                new_s3_targets_list.append(_new_s3_target_dict)

            # Add new S3Targets to crawler_targets
            if crawler_targets is None:
                crawler_targets = {}
            # logger.debug(f"new_s3_targets_list: {new_s3_targets_list}")
            existing_s3_targets = crawler_targets.get("S3Targets", [])
            # logger.debug(f"existing_s3_targets: {existing_s3_targets}")
            new_s3_targets = existing_s3_targets + new_s3_targets_list
            # logger.debug(f"new_s3_targets: {new_s3_targets}")
            crawler_targets["S3Targets"] = new_s3_targets

        # TODO: add more targets as needed

        logger.debug(f"GlueCrawler targets: {crawler_targets}")
        return crawler_targets

    def _create(self, aws_client: AwsApiClient) -> bool:
        """Creates the GlueCrawler

        Args:
            aws_client: The AwsApiClient for the current cluster
        """
        from botocore.exceptions import ClientError

        print_info(f"Creating {self.get_resource_type()}: {self.get_resource_name()}")
        try:
            # create a dict of args which are not null, otherwise aws type validation fails
            not_null_args: Dict[str, Any] = {}
            if self.database_name:
                not_null_args["DatabaseName"] = self.database_name
            if self.description:
                not_null_args["Description"] = self.description
            if self.schedule:
                not_null_args["Schedule"] = self.schedule
            if self.classifiers:
                not_null_args["Classifiers"] = self.classifiers
            if self.table_prefix:
                not_null_args["TablePrefix"] = self.table_prefix
            if self.schema_change_policy:
                not_null_args["SchemaChangePolicy"] = self.schema_change_policy
            if self.recrawl_policy:
                not_null_args["RecrawlPolicy"] = self.recrawl_policy
            if self.lineage_configuration:
                not_null_args["LineageConfiguration"] = self.lineage_configuration
            if self.lake_formation_configuration:
                not_null_args["LakeFormationConfiguration"] = self.lake_formation_configuration
            if self.configuration:
                not_null_args["Configuration"] = self.configuration
            if self.crawler_security_configuration:
                not_null_args["CrawlerSecurityConfiguration"] = self.crawler_security_configuration
            if self.tags:
                not_null_args["Tags"] = self.tags

            targets = self.get_glue_crawler_targets()
            if targets:
                not_null_args["Targets"] = targets

            # Create crawler
            # Get the service_client
            service_client = self.get_service_client(aws_client)
            iam_role_arn = self.iam_role.get_arn(aws_client)
            if iam_role_arn is None:
                logger.error("IamRole ARN unavailable.")
                return False
            create_response = service_client.create_crawler(
                Name=self.name,
                Role=iam_role_arn,
                **not_null_args,
            )
            logger.debug(f"GlueCrawler: {create_response}")
            logger.debug(f"GlueCrawler type: {type(create_response)}")

            if create_response is not None:
                print_info(f"GlueCrawler created: {self.name}")
                self.active_resource = create_response
                return True
        except ClientError as ce:
            logger.debug(f"ClientError: {ce}")
        except Exception as e:
            logger.error(f"{self.get_resource_type()} could not be created.")
            logger.error(e)
        return False

    def _read(self, aws_client: AwsApiClient) -> Optional[Any]:
        """Returns the GlueCrawler

        Args:
            aws_client: The AwsApiClient for the current cluster
        """
        from botocore.exceptions import ClientError

        logger.debug(f"Reading {self.get_resource_type()}: {self.get_resource_name()}")
        try:
            service_client = self.get_service_client(aws_client)
            get_crawler_response = service_client.get_crawler(Name=self.name)
            # logger.debug(f"GlueCrawler: {get_crawler_response}")
            # logger.debug(f"GlueCrawler type: {type(get_crawler_response)}")

            self.creation_time = get_crawler_response.get("Crawler", {}).get("CreationTime", None)
            self.last_crawl = get_crawler_response.get("Crawler", {}).get("LastCrawl", None)
            logger.debug(f"GlueCrawler creation_time: {self.creation_time}")
            logger.debug(f"GlueCrawler last_crawl: {self.last_crawl}")

            if self.creation_time is not None:
                logger.debug(f"GlueCrawler found: {self.name}")
                self.active_resource = get_crawler_response
        except ClientError as ce:
            logger.debug(f"ClientError: {ce}")
        except Exception as e:
            logger.error(f"Error reading {self.get_resource_type()}.")
            logger.error(e)
        return self.active_resource

    def _delete(self, aws_client: AwsApiClient) -> bool:
        """Deletes the GlueCrawler

        Args:
            aws_client: The AwsApiClient for the current cluster
        """
        print_info(f"Deleting {self.get_resource_type()}: {self.get_resource_name()}")
        try:
            # Delete the GlueCrawler
            service_client = self.get_service_client(aws_client)
            self.active_resource = None
            service_client.delete_crawler(Name=self.name)
            # logger.debug(f"GlueCrawler: {delete_crawler_response}")
            # logger.debug(f"GlueCrawler type: {type(delete_crawler_response)}")
            print_info(f"GlueCrawler deleted: {self.name}")
            return True
        except Exception as e:
            logger.error(f"{self.get_resource_type()} could not be deleted.")
            logger.error("Please try again or delete resources manually.")
            logger.error(e)
        return False

    def start_crawler(self, aws_client: Optional[AwsApiClient] = None) -> bool:
        """Runs the GlueCrawler

        Args:
            aws_client: The AwsApiClient for the current cluster
        """
        print_info(f"Starting {self.get_resource_type()}: {self.get_resource_name()}")
        try:
            # Get the service_client
            client: AwsApiClient = aws_client or self.get_aws_client()
            service_client = self.get_service_client(client)
            # logger.debug(f"ServiceClient: {service_client}")
            # logger.debug(f"ServiceClient type: {type(service_client)}")

            try:
                start_crawler_response = service_client.start_crawler(Name=self.name)
                # logger.debug(f"start_crawler_response: {start_crawler_response}")
            except service_client.exceptions.CrawlerRunningException:
                # reference: https://github.com/boto/boto3/issues/1606
                print_info(f"{self.get_resource_type()}: {self.get_resource_name()} already running")
                return True

            if start_crawler_response is not None:
                print_info(f"{self.get_resource_type()}: {self.get_resource_name()} started")
                return True
        except Exception as e:
            logger.error("GlueCrawler could not be started")
            logger.error(e)
            logger.exception(e)
        return False
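
# Example usage (a minimal sketch, not part of this module): the import path,
# the bucket/role/database names, and the `create(aws_client)` wrapper assumed to
# be inherited from AwsResource are illustrative assumptions; start_crawler() is
# defined above. The role is assumed to already exist, so only its name is given.
#
#   from phi.aws.resource.glue.crawler import GlueCrawler, GlueS3Target  # assumed module path
#   from phi.aws.resource.iam.role import IamRole
#   from phi.aws.resource.s3.bucket import S3Bucket
#
#   crawler = GlueCrawler(
#       name="raw-data-crawler",                              # hypothetical crawler name
#       iam_role=IamRole(name="glue-crawler-role"),           # hypothetical, pre-existing role
#       database_name="raw_data_db",                          # hypothetical Glue database
#       s3_targets=[
#           GlueS3Target(bucket=S3Bucket(name="raw-data-bucket"), dir="landing/"),
#       ],
#       schedule="cron(15 12 * * ? *)",                       # daily at 12:15 UTC
#   )
#
#   # get_glue_crawler_targets() would turn the s3_targets above into
#   # {"S3Targets": [{"Path": "s3://raw-data-bucket/landing/"}]} before create_crawler is called.
#   # crawler.create(aws_client) would invoke _create(); crawler.start_crawler() runs the crawler.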