from typing import Optional, Any, Dict, List

from phi.aws.api_client import AwsApiClient
from phi.aws.resource.base import AwsResource
from phi.aws.resource.iam.role import IamRole
from phi.aws.resource.s3.bucket import S3Bucket
from phi.cli.console import print_info
from phi.utils.log import logger


class GlueS3Target(AwsResource):
    # The directory path in the S3 bucket to target.
    dir: str = ""
    # The S3 bucket to target.
    bucket: S3Bucket
    # A list of glob patterns used to exclude paths from the crawl.
    # For more information, see https://docs.aws.amazon.com/glue/latest/dg/add-crawler.html
    exclusions: Optional[List[str]] = None
    # The name of a connection that allows a job or crawler to access data in Amazon S3 within an
    # Amazon Virtual Private Cloud (Amazon VPC) environment.
    connection_name: Optional[str] = None
    # The number of files in each leaf folder to crawl when crawling sample files in a dataset.
    # If not set, all files are crawled. A valid value is an integer between 1 and 249.
    sample_size: Optional[int] = None
    # A valid Amazon SQS ARN. For example: arn:aws:sqs:region:account:sqs
    event_queue_arn: Optional[str] = None
    # A valid Amazon dead-letter SQS ARN. For example: arn:aws:sqs:region:account:deadLetterQueue
    dlq_event_queue_arn: Optional[str] = None
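

# A minimal usage sketch (the bucket name, prefix, and glob pattern below are
# hypothetical). GlueCrawler.get_glue_crawler_targets() translates a target like
# this into the boto3 S3Targets shape:
#   {"Path": "s3://my-data-bucket/raw/", "Exclusions": ["**.tmp"]}
#
#   s3_target = GlueS3Target(
#       bucket=S3Bucket(name="my-data-bucket"),
#       dir="raw/",
#       exclusions=["**.tmp"],
#   )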


class GlueCrawler(AwsResource):
    """
    Reference:
    - https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue.html
    """

    resource_type: Optional[str] = "GlueCrawler"
    service_name: str = "glue"

    # Name of the crawler.
    name: str
    # The IAM role the crawler uses to access AWS resources.
    iam_role: IamRole
    # List of GlueS3Target objects to add to the targets dict.
    s3_targets: Optional[List[GlueS3Target]] = None
    # The Glue database where results are written,
    # such as: arn:aws:daylight:us-east-1::database/sometable/*
    database_name: Optional[str] = None
    # A description of the new crawler.
    description: Optional[str] = None
    # A collection of targets to crawl, keyed by target type (e.g. "S3Targets").
    targets: Optional[Dict[str, List[dict]]] = None
    # A cron expression used to specify the schedule.
    # For example, to run something every day at 12:15 UTC,
    # you would specify: cron(15 12 * * ? *)
    schedule: Optional[str] = None
    # A list of custom classifiers that the user has registered.
    # By default, all built-in classifiers are included in a crawl,
    # but these custom classifiers always override the default classifiers for a given classification.
    classifiers: Optional[List[str]] = None
    # The table prefix used for catalog tables that are created.
    table_prefix: Optional[str] = None
    # The policy for the crawler's update and deletion behavior.
    schema_change_policy: Optional[Dict[str, str]] = None
    # A policy that specifies whether to crawl the entire dataset again,
    # or to crawl only folders that were added since the last crawler run.
    recrawl_policy: Optional[Dict[str, str]] = None
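    # Illustrative shapes for the two policy dicts above, using the enum values
    # accepted by the boto3 create_crawler API (a sketch, not an exhaustive list):
    #
    #   schema_change_policy = {
    #       "UpdateBehavior": "UPDATE_IN_DATABASE",  # or "LOG"
    #       "DeleteBehavior": "DEPRECATE_IN_DATABASE",  # or "LOG" / "DELETE_FROM_DATABASE"
    #   }
    #   recrawl_policy = {
    #       "RecrawlBehavior": "CRAWL_EVERYTHING",  # or "CRAWL_NEW_FOLDERS_ONLY" / "CRAWL_EVENT_MODE"
    #   }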
    # Specifies data lineage configuration settings for the crawler.
    lineage_configuration: Optional[Dict[str, str]] = None
    # Specifies Lake Formation configuration settings for the crawler.
    lake_formation_configuration: Optional[Dict[str, str]] = None
    # Crawler configuration information. This versioned JSON string
    # allows users to specify aspects of a crawler's behavior.
    configuration: Optional[str] = None
    # The name of the SecurityConfiguration structure to be used by this crawler.
    crawler_security_configuration: Optional[str] = None
    # The tags to use with this crawler request.
    tags: Optional[Dict[str, str]] = None

    # Provided by the API on create
    creation_time: Optional[str] = None
    last_crawl: Optional[str] = None

    def get_glue_crawler_targets(self) -> Optional[Dict[str, List[dict]]]:
        # Start with user-provided targets
        crawler_targets: Optional[Dict[str, List[dict]]] = self.targets

        # Add GlueS3Targets to crawler_targets
        if self.s3_targets is not None:
            # Create S3Targets dicts using s3_targets
            new_s3_targets_list: List[dict] = []
            for s3_target in self.s3_targets:
                _new_s3_target_path = f"s3://{s3_target.bucket.name}/{s3_target.dir}"
                # Start with the only required argument
                _new_s3_target_dict: Dict[str, Any] = {"Path": _new_s3_target_path}
                # Add any optional arguments
                if s3_target.exclusions is not None:
                    _new_s3_target_dict["Exclusions"] = s3_target.exclusions
                if s3_target.connection_name is not None:
                    _new_s3_target_dict["ConnectionName"] = s3_target.connection_name
                if s3_target.sample_size is not None:
                    _new_s3_target_dict["SampleSize"] = s3_target.sample_size
                if s3_target.event_queue_arn is not None:
                    _new_s3_target_dict["EventQueueArn"] = s3_target.event_queue_arn
                if s3_target.dlq_event_queue_arn is not None:
                    _new_s3_target_dict["DlqEventQueueArn"] = s3_target.dlq_event_queue_arn
                new_s3_targets_list.append(_new_s3_target_dict)

            # Append the new S3Targets to any existing ones in crawler_targets
            if crawler_targets is None:
                crawler_targets = {}
            existing_s3_targets = crawler_targets.get("S3Targets", [])
            crawler_targets["S3Targets"] = existing_s3_targets + new_s3_targets_list

        # TODO: add more target types as needed
        logger.debug(f"GlueCrawler targets: {crawler_targets}")
        return crawler_targets
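
    # For illustration, a single hypothetical target
    #   GlueS3Target(bucket=S3Bucket(name="my-data-bucket"), dir="raw/")
    # with no user-provided self.targets would make this method return:
    #   {"S3Targets": [{"Path": "s3://my-data-bucket/raw/"}]}
    # which is the Targets shape passed to the boto3 create_crawler call in _create().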

    def _create(self, aws_client: AwsApiClient) -> bool:
        """Creates the GlueCrawler

        Args:
            aws_client: The AwsApiClient for the current cluster
        """
        from botocore.exceptions import ClientError

        print_info(f"Creating {self.get_resource_type()}: {self.get_resource_name()}")
        try:
            # Create a dict of args which are not null, otherwise AWS type validation fails
            not_null_args: Dict[str, Any] = {}
            if self.database_name:
                not_null_args["DatabaseName"] = self.database_name
            if self.description:
                not_null_args["Description"] = self.description
            if self.schedule:
                not_null_args["Schedule"] = self.schedule
            if self.classifiers:
                not_null_args["Classifiers"] = self.classifiers
            if self.table_prefix:
                not_null_args["TablePrefix"] = self.table_prefix
            if self.schema_change_policy:
                not_null_args["SchemaChangePolicy"] = self.schema_change_policy
            if self.recrawl_policy:
                not_null_args["RecrawlPolicy"] = self.recrawl_policy
            if self.lineage_configuration:
                not_null_args["LineageConfiguration"] = self.lineage_configuration
            if self.lake_formation_configuration:
                not_null_args["LakeFormationConfiguration"] = self.lake_formation_configuration
            if self.configuration:
                not_null_args["Configuration"] = self.configuration
            if self.crawler_security_configuration:
                not_null_args["CrawlerSecurityConfiguration"] = self.crawler_security_configuration
            if self.tags:
                not_null_args["Tags"] = self.tags

            targets = self.get_glue_crawler_targets()
            if targets:
                not_null_args["Targets"] = targets

            # Get the service_client and create the crawler
            service_client = self.get_service_client(aws_client)
            iam_role_arn = self.iam_role.get_arn(aws_client)
            if iam_role_arn is None:
                logger.error("IamRole ARN unavailable.")
                return False
            create_response = service_client.create_crawler(
                Name=self.name,
                Role=iam_role_arn,
                **not_null_args,
            )
            logger.debug(f"GlueCrawler: {create_response}")
            if create_response is not None:
                print_info(f"GlueCrawler created: {self.name}")
                self.active_resource = create_response
                return True
        except ClientError as ce:
            logger.debug(f"ClientError: {ce}")
        except Exception as e:
            logger.error(f"{self.get_resource_type()} could not be created.")
            logger.error(e)
        return False

    def _read(self, aws_client: AwsApiClient) -> Optional[Any]:
        """Returns the GlueCrawler

        Args:
            aws_client: The AwsApiClient for the current cluster
        """
        from botocore.exceptions import ClientError

        logger.debug(f"Reading {self.get_resource_type()}: {self.get_resource_name()}")
        try:
            service_client = self.get_service_client(aws_client)
            get_crawler_response = service_client.get_crawler(Name=self.name)

            self.creation_time = get_crawler_response.get("Crawler", {}).get("CreationTime", None)
            self.last_crawl = get_crawler_response.get("Crawler", {}).get("LastCrawl", None)
            logger.debug(f"GlueCrawler creation_time: {self.creation_time}")
            logger.debug(f"GlueCrawler last_crawl: {self.last_crawl}")

            if self.creation_time is not None:
                logger.debug(f"GlueCrawler found: {self.name}")
                self.active_resource = get_crawler_response
        except ClientError as ce:
            logger.debug(f"ClientError: {ce}")
        except Exception as e:
            logger.error(f"Error reading {self.get_resource_type()}.")
            logger.error(e)
        return self.active_resource

    def _delete(self, aws_client: AwsApiClient) -> bool:
        """Deletes the GlueCrawler

        Args:
            aws_client: The AwsApiClient for the current cluster
        """
        print_info(f"Deleting {self.get_resource_type()}: {self.get_resource_name()}")
        try:
            # Delete the GlueCrawler
            service_client = self.get_service_client(aws_client)
            self.active_resource = None
            delete_crawler_response = service_client.delete_crawler(Name=self.name)
            logger.debug(f"delete_crawler_response: {delete_crawler_response}")
            print_info(f"GlueCrawler deleted: {self.name}")
            return True
        except Exception as e:
            logger.error(f"{self.get_resource_type()} could not be deleted.")
            logger.error("Please try again or delete resources manually.")
            logger.error(e)
        return False

    def start_crawler(self, aws_client: Optional[AwsApiClient] = None) -> bool:
        """Runs the GlueCrawler

        Args:
            aws_client: The AwsApiClient for the current cluster
        """
        print_info(f"Starting {self.get_resource_type()}: {self.get_resource_name()}")
        try:
            # Get the service_client
            client: AwsApiClient = aws_client or self.get_aws_client()
            service_client = self.get_service_client(client)
            try:
                start_crawler_response = service_client.start_crawler(Name=self.name)
                logger.debug(f"start_crawler_response: {start_crawler_response}")
            except service_client.exceptions.CrawlerRunningException:
                # The crawler is already running, which we treat as success.
                # Reference: https://github.com/boto/boto3/issues/1606
                print_info(f"{self.get_resource_type()}: {self.get_resource_name()} already running")
                return True
            if start_crawler_response is not None:
                print_info(f"{self.get_resource_type()}: {self.get_resource_name()} started")
                return True
        except Exception as e:
            logger.error("GlueCrawler could not be started.")
            logger.exception(e)
        return False
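

# A hypothetical end-to-end usage sketch, assuming the AwsResource base class
# exposes a public create() that wraps _create() above (names and role/bucket
# details below are placeholders, not values from this module):
#
#   crawler = GlueCrawler(
#       name="my-crawler",
#       iam_role=IamRole(name="glue-crawler-role"),  # plus whatever role fields your setup requires
#       database_name="my_glue_database",
#       s3_targets=[GlueS3Target(bucket=S3Bucket(name="my-data-bucket"), dir="raw/")],
#   )
#   crawler.create(aws_client)
#   crawler.start_crawler(aws_client)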