# NOTE(review): the following header lines were repository-scrape residue and are
# not valid Python; preserved as a comment: "AmmarFahmy / adding all files / 105b369"
from typing import Optional, Any, Dict, List
from phi.aws.api_client import AwsApiClient
from phi.aws.resource.base import AwsResource
from phi.aws.resource.iam.role import IamRole
from phi.aws.resource.s3.bucket import S3Bucket
from phi.cli.console import print_info
from phi.utils.log import logger
class GlueS3Target(AwsResource):
    """Declarative configuration for a single Amazon S3 target of a Glue crawler.

    Instances of this model are converted into entries of the boto3
    ``Targets["S3Targets"]`` list by ``GlueCrawler.get_glue_crawler_targets``.
    Only ``bucket`` is required; every other field is optional.
    """

    # The directory path in the S3 bucket to target (appended to the bucket name
    # as "s3://<bucket>/<dir>"). Empty string means the bucket root.
    dir: str = ""
    # The s3 bucket to target
    bucket: S3Bucket
    # A list of glob patterns used to exclude from the crawl.
    # For more information, see https://docs.aws.amazon.com/glue/latest/dg/add-crawler.html
    exclusions: Optional[List[str]] = None
    # The name of a connection which allows a job or crawler to access data in Amazon S3 within an
    # Amazon Virtual Private Cloud environment (Amazon VPC).
    connection_name: Optional[str] = None
    # Sets the number of files in each leaf folder to be crawled when crawling sample files in a dataset.
    # If not set, all the files are crawled. A valid value is an integer between 1 and 249.
    sample_size: Optional[int] = None
    # A valid Amazon SQS ARN. For example, arn:aws:sqs:region:account:sqs .
    event_queue_arn: Optional[str] = None
    # A valid Amazon dead-letter SQS ARN. For example, arn:aws:sqs:region:account:deadLetterQueue .
    dlq_event_queue_arn: Optional[str] = None
class GlueCrawler(AwsResource):
    """An AWS Glue Crawler resource.

    Wraps create / read / delete / start of a Glue crawler using the boto3
    ``glue`` service client obtained via ``self.get_service_client``.

    Reference:
    - https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue.html
    """

    resource_type: Optional[str] = "GlueCrawler"
    service_name: str = "glue"

    # Name of the crawler.
    name: str
    # The IAM role for the crawler
    iam_role: IamRole
    # List of GlueS3Target to add to the targets dict
    s3_targets: Optional[List[GlueS3Target]] = None
    # The Glue database where results are written,
    # such as: arn:aws:daylight:us-east-1::database/sometable/* .
    database_name: Optional[str] = None
    # A description of the new crawler.
    description: Optional[str] = None
    # A collection of targets to crawl, keyed by target type (e.g. "S3Targets").
    # Merged with entries generated from s3_targets — see get_glue_crawler_targets.
    targets: Optional[Dict[str, List[dict]]] = None
    # A cron expression used to specify the schedule.
    # For example, to run something every day at 12:15 UTC,
    # you would specify: cron(15 12 * * ? *) .
    schedule: Optional[str] = None
    # A list of custom classifiers that the user has registered.
    # By default, all built-in classifiers are included in a crawl,
    # but these custom classifiers always override the default classifiers for a given classification.
    classifiers: Optional[List[str]] = None
    # The table prefix used for catalog tables that are created.
    table_prefix: Optional[str] = None
    # The policy for the crawler's update and deletion behavior.
    schema_change_policy: Optional[Dict[str, str]] = None
    # A policy that specifies whether to crawl the entire dataset again,
    # or to crawl only folders that were added since the last crawler run.
    recrawl_policy: Optional[Dict[str, str]] = None
    # Data-lineage settings for the crawler.
    lineage_configuration: Optional[Dict[str, str]] = None
    # AWS Lake Formation settings for the crawler.
    lake_formation_configuration: Optional[Dict[str, str]] = None
    # Crawler configuration information. This versioned JSON string
    # allows users to specify aspects of a crawler's behavior.
    configuration: Optional[str] = None
    # The name of the SecurityConfiguration structure to be used by this crawler.
    crawler_security_configuration: Optional[str] = None
    # The tags to use with this crawler request.
    tags: Optional[Dict[str, str]] = None

    # Populated from the API response on create/read.
    creation_time: Optional[str] = None
    last_crawl: Optional[str] = None

    def get_glue_crawler_targets(self) -> Optional[Dict[str, List[dict]]]:
        """Build the boto3 ``Targets`` argument for this crawler.

        Merges the user-provided ``self.targets`` with ``S3Targets`` entries
        generated from ``self.s3_targets``.

        Returns:
            The merged targets dict, or None when neither source is set.

        Note: works on a shallow copy of ``self.targets`` so that repeated
        calls neither mutate the user-provided dict nor duplicate S3 targets
        (the original implementation aliased and mutated ``self.targets``).
        """
        # Start with a copy of user-provided targets (copy fixes the aliasing bug).
        crawler_targets: Optional[Dict[str, List[dict]]] = (
            dict(self.targets) if self.targets is not None else None
        )
        if self.s3_targets is not None:
            new_s3_targets_list: List[dict] = []
            for s3_target in self.s3_targets:
                # "Path" is the only required key of an S3Target entry.
                _target: Dict[str, Any] = {"Path": f"s3://{s3_target.bucket.name}/{s3_target.dir}"}
                # Add optional keys only when set — AWS rejects nulls.
                if s3_target.exclusions is not None:
                    _target["Exclusions"] = s3_target.exclusions
                if s3_target.connection_name is not None:
                    _target["ConnectionName"] = s3_target.connection_name
                if s3_target.sample_size is not None:
                    _target["SampleSize"] = s3_target.sample_size
                if s3_target.event_queue_arn is not None:
                    _target["EventQueueArn"] = s3_target.event_queue_arn
                if s3_target.dlq_event_queue_arn is not None:
                    _target["DlqEventQueueArn"] = s3_target.dlq_event_queue_arn
                new_s3_targets_list.append(_target)
            if crawler_targets is None:
                crawler_targets = {}
            # Append generated entries after any user-provided S3Targets
            # (copy the existing list so the caller's list is never mutated).
            crawler_targets["S3Targets"] = (
                list(crawler_targets.get("S3Targets", [])) + new_s3_targets_list
            )
        # TODO: add more target types (JdbcTargets, DynamoDBTargets, ...) as needed
        logger.debug(f"GlueCrawler targets: {crawler_targets}")
        return crawler_targets

    def _create(self, aws_client: AwsApiClient) -> bool:
        """Creates the GlueCrawler

        Args:
            aws_client: The AwsApiClient for the current cluster

        Returns:
            True if the crawler was created, False otherwise.
        """
        from botocore.exceptions import ClientError

        print_info(f"Creating {self.get_resource_type()}: {self.get_resource_name()}")
        try:
            # Build a dict of only the truthy optional args, otherwise AWS type
            # validation fails on nulls.
            not_null_args: Dict[str, Any] = {
                api_key: value
                for api_key, value in (
                    ("DatabaseName", self.database_name),
                    ("Description", self.description),
                    ("Schedule", self.schedule),
                    ("Classifiers", self.classifiers),
                    ("TablePrefix", self.table_prefix),
                    ("SchemaChangePolicy", self.schema_change_policy),
                    ("RecrawlPolicy", self.recrawl_policy),
                    ("LineageConfiguration", self.lineage_configuration),
                    ("LakeFormationConfiguration", self.lake_formation_configuration),
                    ("Configuration", self.configuration),
                    ("CrawlerSecurityConfiguration", self.crawler_security_configuration),
                    ("Tags", self.tags),
                )
                if value
            }
            targets = self.get_glue_crawler_targets()
            if targets:
                not_null_args["Targets"] = targets

            # Create the crawler.
            service_client = self.get_service_client(aws_client)
            iam_role_arn = self.iam_role.get_arn(aws_client)
            if iam_role_arn is None:
                logger.error("IamRole ARN unavailable.")
                return False
            create_response = service_client.create_crawler(
                Name=self.name,
                Role=iam_role_arn,
                **not_null_args,
            )
            logger.debug(f"GlueCrawler: {create_response}")
            if create_response is not None:
                print_info(f"GlueCrawler created: {self.name}")
                self.active_resource = create_response
                return True
        except ClientError as ce:
            logger.debug(f"ClientError: {ce}")
        except Exception as e:
            logger.error(f"{self.get_resource_type()} could not be created.")
            logger.error(e)
        return False

    def _read(self, aws_client: AwsApiClient) -> Optional[Any]:
        """Returns the GlueCrawler

        Args:
            aws_client: The AwsApiClient for the current cluster

        Returns:
            The get_crawler response when the crawler exists, otherwise None.
        """
        from botocore.exceptions import ClientError

        logger.debug(f"Reading {self.get_resource_type()}: {self.get_resource_name()}")
        try:
            service_client = self.get_service_client(aws_client)
            get_crawler_response = service_client.get_crawler(Name=self.name)
            crawler = get_crawler_response.get("Crawler", {})
            self.creation_time = crawler.get("CreationTime", None)
            self.last_crawl = crawler.get("LastCrawl", None)
            logger.debug(f"GlueCrawler creation_time: {self.creation_time}")
            logger.debug(f"GlueCrawler last_crawl: {self.last_crawl}")
            # A creation_time is the marker that the crawler actually exists.
            if self.creation_time is not None:
                logger.debug(f"GlueCrawler found: {self.name}")
                self.active_resource = get_crawler_response
        except ClientError as ce:
            # EntityNotFoundException and friends: treat as "not found".
            logger.debug(f"ClientError: {ce}")
        except Exception as e:
            logger.error(f"Error reading {self.get_resource_type()}.")
            logger.error(e)
        return self.active_resource

    def _delete(self, aws_client: AwsApiClient) -> bool:
        """Deletes the GlueCrawler

        Args:
            aws_client: The AwsApiClient for the current cluster

        Returns:
            True if the crawler was deleted, False otherwise.
        """
        print_info(f"Deleting {self.get_resource_type()}: {self.get_resource_name()}")
        try:
            service_client = self.get_service_client(aws_client)
            # Clear the cached resource before the API call so a failed delete
            # does not leave a stale active_resource.
            self.active_resource = None
            service_client.delete_crawler(Name=self.name)
            print_info(f"GlueCrawler deleted: {self.name}")
            return True
        except Exception as e:
            logger.error(f"{self.get_resource_type()} could not be deleted.")
            logger.error("Please try again or delete resources manually.")
            logger.error(e)
        return False

    def start_crawler(self, aws_client: Optional[AwsApiClient] = None) -> bool:
        """Runs the GlueCrawler

        Args:
            aws_client: The AwsApiClient for the current cluster.
                When None, a default client is obtained via get_aws_client().

        Returns:
            True if the crawler was started (or was already running).
        """
        print_info(f"Starting {self.get_resource_type()}: {self.get_resource_name()}")
        try:
            client: AwsApiClient = aws_client or self.get_aws_client()
            service_client = self.get_service_client(client)
            try:
                start_crawler_response = service_client.start_crawler(Name=self.name)
            except service_client.exceptions.CrawlerRunningException:
                # An already-running crawler counts as success.
                # reference: https://github.com/boto/boto3/issues/1606
                print_info(f"{self.get_resource_type()}: {self.get_resource_name()} already running")
                return True
            if start_crawler_response is not None:
                print_info(f"{self.get_resource_type()}: {self.get_resource_name()} started")
                return True
        except Exception as e:
            logger.error("GlueCrawler could not be started")
            # logger.exception logs the message AND the traceback — no need to
            # double-log with logger.error(e).
            logger.exception(e)
        return False