from typing import Any, Dict, List, Optional

from phi.aws.api_client import AwsApiClient
from phi.aws.resource.base import AwsResource
from phi.aws.resource.iam.role import IamRole
from phi.aws.resource.s3.bucket import S3Bucket
from phi.cli.console import print_info
from phi.utils.log import logger

class GlueS3Target(AwsResource):
    # The directory path in the S3 bucket to target.
    dir: str = ""
    # The S3 bucket to target.
    bucket: S3Bucket
    # A list of glob patterns used to exclude files from the crawl.
    # For more information, see https://docs.aws.amazon.com/glue/latest/dg/add-crawler.html
    exclusions: Optional[List[str]] = None
    # The name of a connection that allows a job or crawler to access data in Amazon S3
    # within an Amazon Virtual Private Cloud (Amazon VPC) environment.
    connection_name: Optional[str] = None
    # The number of files in each leaf folder to crawl when crawling sample files in a dataset.
    # If not set, all files are crawled. A valid value is an integer between 1 and 249.
    sample_size: Optional[int] = None
    # A valid Amazon SQS ARN. For example, arn:aws:sqs:region:account:sqs
    event_queue_arn: Optional[str] = None
    # A valid Amazon dead-letter SQS ARN. For example, arn:aws:sqs:region:account:deadLetterQueue
    dlq_event_queue_arn: Optional[str] = None
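
# A minimal construction sketch for a GlueS3Target. The bucket name and directory
# below are hypothetical; GlueS3Target is a pydantic-style model, so fields are
# passed as keyword arguments:
#
#   data_bucket = S3Bucket(name="my-data-bucket")
#   target = GlueS3Target(
#       bucket=data_bucket,
#       dir="raw/events",
#       exclusions=["**/_SUCCESS"],
#   )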

class GlueCrawler(AwsResource):
    """
    Reference:
    - https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue.html
    """

    resource_type: Optional[str] = "GlueCrawler"
    service_name: str = "glue"

    # Name of the crawler.
    name: str
    # The IAM role for the crawler.
    iam_role: IamRole
    # List of GlueS3Target objects to add to the targets dict.
    s3_targets: Optional[List[GlueS3Target]] = None
    # The Glue database where results are written,
    # such as: arn:aws:daylight:us-east-1::database/sometable/*
    database_name: Optional[str] = None
    # A description of the new crawler.
    description: Optional[str] = None
    # A collection of targets to crawl.
    targets: Optional[Dict[str, List[dict]]] = None
    # A cron expression used to specify the schedule.
    # For example, to run something every day at 12:15 UTC,
    # you would specify: cron(15 12 * * ? *)
    schedule: Optional[str] = None
    # A list of custom classifiers that the user has registered.
    # By default, all built-in classifiers are included in a crawl,
    # but these custom classifiers always override the default classifiers for a given classification.
    classifiers: Optional[List[str]] = None
    # The table prefix used for catalog tables that are created.
    table_prefix: Optional[str] = None
    # The policy for the crawler's update and deletion behavior.
    schema_change_policy: Optional[Dict[str, str]] = None
    # A policy that specifies whether to crawl the entire dataset again,
    # or to crawl only folders that were added since the last crawler run.
    recrawl_policy: Optional[Dict[str, str]] = None
    # Specifies whether data lineage is enabled for the crawler.
    lineage_configuration: Optional[Dict[str, str]] = None
    # Specifies Lake Formation configuration settings for the crawler.
    lake_formation_configuration: Optional[Dict[str, str]] = None
    # Crawler configuration information. This versioned JSON string
    # allows users to specify aspects of a crawler's behavior.
    configuration: Optional[str] = None
    # The name of the SecurityConfiguration structure to be used by this crawler.
    crawler_security_configuration: Optional[str] = None
    # The tags to use with this crawler request.
    tags: Optional[Dict[str, str]] = None

    # Provided by the API on create.
    creation_time: Optional[str] = None
    last_crawl: Optional[str] = None

    def get_glue_crawler_targets(self) -> Optional[Dict[str, List[dict]]]:
        # Start with user-provided targets
        crawler_targets: Optional[Dict[str, List[dict]]] = self.targets

        # Add GlueS3Targets to crawler_targets
        if self.s3_targets is not None:
            # Create S3Target dicts from s3_targets
            new_s3_targets_list: List[dict] = []
            for s3_target in self.s3_targets:
                _new_s3_target_path = f"s3://{s3_target.bucket.name}/{s3_target.dir}"
                # Start with the only required argument
                _new_s3_target_dict: Dict[str, Any] = {"Path": _new_s3_target_path}
                # Add any optional arguments
                if s3_target.exclusions is not None:
                    _new_s3_target_dict["Exclusions"] = s3_target.exclusions
                if s3_target.connection_name is not None:
                    _new_s3_target_dict["ConnectionName"] = s3_target.connection_name
                if s3_target.sample_size is not None:
                    _new_s3_target_dict["SampleSize"] = s3_target.sample_size
                if s3_target.event_queue_arn is not None:
                    _new_s3_target_dict["EventQueueArn"] = s3_target.event_queue_arn
                if s3_target.dlq_event_queue_arn is not None:
                    _new_s3_target_dict["DlqEventQueueArn"] = s3_target.dlq_event_queue_arn
                new_s3_targets_list.append(_new_s3_target_dict)

            # Merge the new S3Targets into crawler_targets
            if crawler_targets is None:
                crawler_targets = {}
            existing_s3_targets = crawler_targets.get("S3Targets", [])
            crawler_targets["S3Targets"] = existing_s3_targets + new_s3_targets_list

        # TODO: add more targets as needed
        logger.debug(f"GlueCrawler targets: {crawler_targets}")
        return crawler_targets
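
    # For reference, the dict built above follows the boto3 create_crawler Targets
    # request shape. With the hypothetical bucket/dir from the GlueS3Target sketch,
    # a single S3 target would produce roughly:
    #   {"S3Targets": [{"Path": "s3://my-data-bucket/raw/events",
    #                   "Exclusions": ["**/_SUCCESS"]}]}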

    def _create(self, aws_client: AwsApiClient) -> bool:
        """Creates the GlueCrawler

        Args:
            aws_client: The AwsApiClient for the current cluster
        """
        from botocore.exceptions import ClientError

        print_info(f"Creating {self.get_resource_type()}: {self.get_resource_name()}")
        try:
            # Build a dict of non-null args, otherwise AWS type validation fails
            not_null_args: Dict[str, Any] = {}
            if self.database_name:
                not_null_args["DatabaseName"] = self.database_name
            if self.description:
                not_null_args["Description"] = self.description
            if self.schedule:
                not_null_args["Schedule"] = self.schedule
            if self.classifiers:
                not_null_args["Classifiers"] = self.classifiers
            if self.table_prefix:
                not_null_args["TablePrefix"] = self.table_prefix
            if self.schema_change_policy:
                not_null_args["SchemaChangePolicy"] = self.schema_change_policy
            if self.recrawl_policy:
                not_null_args["RecrawlPolicy"] = self.recrawl_policy
            if self.lineage_configuration:
                not_null_args["LineageConfiguration"] = self.lineage_configuration
            if self.lake_formation_configuration:
                not_null_args["LakeFormationConfiguration"] = self.lake_formation_configuration
            if self.configuration:
                not_null_args["Configuration"] = self.configuration
            if self.crawler_security_configuration:
                not_null_args["CrawlerSecurityConfiguration"] = self.crawler_security_configuration
            if self.tags:
                not_null_args["Tags"] = self.tags

            targets = self.get_glue_crawler_targets()
            if targets:
                not_null_args["Targets"] = targets

            # Get the service_client
            service_client = self.get_service_client(aws_client)

            # The crawler role is required, so fail early if its ARN is unavailable
            iam_role_arn = self.iam_role.get_arn(aws_client)
            if iam_role_arn is None:
                logger.error("IamRole ARN unavailable.")
                return False

            # Create the crawler
            create_response = service_client.create_crawler(
                Name=self.name,
                Role=iam_role_arn,
                **not_null_args,
            )
            logger.debug(f"GlueCrawler: {create_response}")
            if create_response is not None:
                print_info(f"GlueCrawler created: {self.name}")
                self.active_resource = create_response
                return True
        except ClientError as ce:
            logger.debug(f"ClientError: {ce}")
        except Exception as e:
            logger.error(f"{self.get_resource_type()} could not be created.")
            logger.error(e)
        return False
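
    # Note: per the boto3 docs, create_crawler returns an empty dict (plus
    # response metadata) on success, so the `is not None` check above treats
    # any non-exception response as success.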

    def _read(self, aws_client: AwsApiClient) -> Optional[Any]:
        """Returns the GlueCrawler

        Args:
            aws_client: The AwsApiClient for the current cluster
        """
        from botocore.exceptions import ClientError

        logger.debug(f"Reading {self.get_resource_type()}: {self.get_resource_name()}")
        try:
            service_client = self.get_service_client(aws_client)
            get_crawler_response = service_client.get_crawler(Name=self.name)
            self.creation_time = get_crawler_response.get("Crawler", {}).get("CreationTime", None)
            self.last_crawl = get_crawler_response.get("Crawler", {}).get("LastCrawl", None)
            logger.debug(f"GlueCrawler creation_time: {self.creation_time}")
            logger.debug(f"GlueCrawler last_crawl: {self.last_crawl}")
            if self.creation_time is not None:
                logger.debug(f"GlueCrawler found: {self.name}")
                self.active_resource = get_crawler_response
        except ClientError as ce:
            logger.debug(f"ClientError: {ce}")
        except Exception as e:
            logger.error(f"Error reading {self.get_resource_type()}.")
            logger.error(e)
        return self.active_resource
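
    # Note: per the boto3 docs, get_crawler returns {"Crawler": {...}} where
    # CreationTime is a datetime and LastCrawl is a dict; they are stored on the
    # loosely typed creation_time/last_crawl fields and used mainly as presence checks.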

    def _delete(self, aws_client: AwsApiClient) -> bool:
        """Deletes the GlueCrawler

        Args:
            aws_client: The AwsApiClient for the current cluster
        """
        print_info(f"Deleting {self.get_resource_type()}: {self.get_resource_name()}")
        try:
            service_client = self.get_service_client(aws_client)
            self.active_resource = None
            # Delete the crawler
            service_client.delete_crawler(Name=self.name)
            print_info(f"GlueCrawler deleted: {self.name}")
            return True
        except Exception as e:
            logger.error(f"{self.get_resource_type()} could not be deleted.")
            logger.error("Please try again or delete resources manually.")
            logger.error(e)
        return False

    def start_crawler(self, aws_client: Optional[AwsApiClient] = None) -> bool:
        """Runs the GlueCrawler

        Args:
            aws_client: The AwsApiClient for the current cluster
        """
        print_info(f"Starting {self.get_resource_type()}: {self.get_resource_name()}")
        try:
            # Get the service_client
            client: AwsApiClient = aws_client or self.get_aws_client()
            service_client = self.get_service_client(client)
            try:
                start_crawler_response = service_client.start_crawler(Name=self.name)
            except service_client.exceptions.CrawlerRunningException:
                # Starting an already-running crawler is not an error.
                # Reference: https://github.com/boto/boto3/issues/1606
                print_info(f"{self.get_resource_type()}: {self.get_resource_name()} already running")
                return True
            if start_crawler_response is not None:
                print_info(f"{self.get_resource_type()}: {self.get_resource_name()} started")
                return True
        except Exception as e:
            logger.error("GlueCrawler could not be started.")
            logger.exception(e)
        return False
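
# A minimal end-to-end sketch, assuming AWS credentials are configured and an
# existing IAM role that the crawler can assume (the names below are hypothetical):
#
#   crawler = GlueCrawler(
#       name="events-crawler",
#       iam_role=IamRole(name="glue-crawler-role"),
#       s3_targets=[GlueS3Target(bucket=S3Bucket(name="my-data-bucket"), dir="raw/events")],
#       database_name="events_db",
#       schedule="cron(15 12 * * ? *)",
#   )
#   crawler.start_crawler()  # resolves an AwsApiClient internally when none is passed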