from typing import Optional, Any, Dict, List from typing_extensions import Literal from phi.aws.api_client import AwsApiClient from phi.aws.resource.base import AwsResource from phi.cli.console import print_info from phi.utils.log import logger class EmrCluster(AwsResource): """ Reference: - https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/emr.html """ resource_type: Optional[str] = "EmrCluster" service_name: str = "emr" # Name of the cluster. name: str # The location in Amazon S3 to write the log files of the job flow. # If a value is not provided, logs are not created. log_uri: Optional[str] = None # The KMS key used for encrypting log files. If a value is not provided, the logs remain encrypted by AES-256. # This attribute is only available with Amazon EMR version 5.30.0 and later, excluding Amazon EMR 6.0.0. log_encryption_kms_key_id: Optional[str] = None # A JSON string for selecting additional features. additional_info: Optional[str] = None # The Amazon EMR release label, which determines the version of open-source application packages installed on the # cluster. Release labels are in the form emr-x.x.x, # where x.x.x is an Amazon EMR release version such as emr-5.14.0 . release_label: Optional[str] = None # A specification of the number and type of Amazon EC2 instances. instances: Optional[Dict[str, Any]] = None # A list of steps to run. steps: Optional[List[Dict[str, Any]]] = None # A list of bootstrap actions to run before Hadoop starts on the cluster nodes. bootstrap_actions: Optional[List[Dict[str, Any]]] = None # For Amazon EMR releases 3.x and 2.x. For Amazon EMR releases 4.x and later, use Applications. # A list of strings that indicates third-party software to use. supported_products: Optional[List[str]] new_supported_products: Optional[List[Dict[str, Any]]] = None # Applies to Amazon EMR releases 4.0 and later. # A case-insensitive list of applications for Amazon EMR to install and configure when launching the cluster. applications: Optional[List[Dict[str, Any]]] = None # For Amazon EMR releases 4.0 and later. The list of configurations supplied for the EMR cluster you are creating. configurations: Optional[List[Dict[str, Any]]] = None # Also called instance profile and EC2 role. An IAM role for an EMR cluster. # The EC2 instances of the cluster assume this role. The default role is EMR_EC2_DefaultRole. # In order to use the default role, you must have already created it using the CLI or console. job_flow_role: Optional[str] = None # he IAM role that Amazon EMR assumes in order to access Amazon Web Services resources on your behalf. service_role: Optional[str] = None # A list of tags to associate with a cluster and propagate to Amazon EC2 instances. tags: Optional[List[Dict[str, str]]] = None # The name of a security configuration to apply to the cluster. security_configuration: Optional[str] = None # An IAM role for automatic scaling policies. The default role is EMR_AutoScaling_DefaultRole. # The IAM role provides permissions that the automatic scaling feature requires to launch and terminate EC2 # instances in an instance group. auto_scaling_role: Optional[str] = None scale_down_behavior: Optional[Literal["TERMINATE_AT_INSTANCE_HOUR", "TERMINATE_AT_TASK_COMPLETION"]] = None custom_ami_id: Optional[str] = None # The size, in GiB, of the Amazon EBS root device volume of the Linux AMI that is used for each EC2 instance. ebs_root_volume_size: Optional[int] = None repo_upgrade_on_boot: Optional[Literal["SECURITY", "NONE"]] = None # Attributes for Kerberos configuration when Kerberos authentication is enabled using a security configuration. kerberos_attributes: Optional[Dict[str, str]] = None # Specifies the number of steps that can be executed concurrently. # The default value is 1 . The maximum value is 256 . step_concurrency_level: Optional[int] = None # The specified managed scaling policy for an Amazon EMR cluster. managed_scaling_policy: Optional[Dict[str, Any]] = None placement_group_configs: Optional[List[Dict[str, Any]]] = None # The auto-termination policy defines the amount of idle time in seconds after which a cluster terminates. auto_termination_policy: Optional[Dict[str, int]] = None # provided by api on create # A unique identifier for the job flow. job_flow_id: Optional[str] = None # The Amazon Resource Name (ARN) of the cluster. cluster_arn: Optional[str] = None # ClusterSummary returned on read cluster_summary: Optional[Dict] = None def _create(self, aws_client: AwsApiClient) -> bool: """Creates the EmrCluster Args: aws_client: The AwsApiClient for the current cluster """ print_info(f"Creating {self.get_resource_type()}: {self.get_resource_name()}") try: # create a dict of args which are not null, otherwise aws type validation fails not_null_args: Dict[str, Any] = {} if self.log_uri: not_null_args["LogUri"] = self.log_uri if self.log_encryption_kms_key_id: not_null_args["LogEncryptionKmsKeyId"] = self.log_encryption_kms_key_id if self.additional_info: not_null_args["AdditionalInfo"] = self.additional_info if self.release_label: not_null_args["ReleaseLabel"] = self.release_label if self.instances: not_null_args["Instances"] = self.instances if self.steps: not_null_args["Steps"] = self.steps if self.bootstrap_actions: not_null_args["BootstrapActions"] = self.bootstrap_actions if self.supported_products: not_null_args["SupportedProducts"] = self.supported_products if self.new_supported_products: not_null_args["NewSupportedProducts"] = self.new_supported_products if self.applications: not_null_args["Applications"] = self.applications if self.configurations: not_null_args["Configurations"] = self.configurations if self.job_flow_role: not_null_args["JobFlowRole"] = self.job_flow_role if self.service_role: not_null_args["ServiceRole"] = self.service_role if self.tags: not_null_args["Tags"] = self.tags if self.security_configuration: not_null_args["SecurityConfiguration"] = self.security_configuration if self.auto_scaling_role: not_null_args["AutoScalingRole"] = self.auto_scaling_role if self.scale_down_behavior: not_null_args["ScaleDownBehavior"] = self.scale_down_behavior if self.custom_ami_id: not_null_args["CustomAmiId"] = self.custom_ami_id if self.ebs_root_volume_size: not_null_args["EbsRootVolumeSize"] = self.ebs_root_volume_size if self.repo_upgrade_on_boot: not_null_args["RepoUpgradeOnBoot"] = self.repo_upgrade_on_boot if self.kerberos_attributes: not_null_args["KerberosAttributes"] = self.kerberos_attributes if self.step_concurrency_level: not_null_args["StepConcurrencyLevel"] = self.step_concurrency_level if self.managed_scaling_policy: not_null_args["ManagedScalingPolicy"] = self.managed_scaling_policy if self.placement_group_configs: not_null_args["PlacementGroupConfigs"] = self.placement_group_configs if self.auto_termination_policy: not_null_args["AutoTerminationPolicy"] = self.auto_termination_policy # Get the service_client service_client = self.get_service_client(aws_client) # Create EmrCluster create_response = service_client.run_job_flow( Name=self.name, **not_null_args, ) logger.debug(f"create_response type: {type(create_response)}") logger.debug(f"create_response: {create_response}") self.job_flow_id = create_response.get("JobFlowId", None) self.cluster_arn = create_response.get("ClusterArn", None) self.active_resource = create_response if self.active_resource is not None: print_info(f"{self.get_resource_type()}: {self.get_resource_name()} created") logger.debug(f"JobFlowId: {self.job_flow_id}") logger.debug(f"ClusterArn: {self.cluster_arn}") return True except Exception as e: logger.error(f"{self.get_resource_type()} could not be created.") logger.error(e) return False def post_create(self, aws_client: AwsApiClient) -> bool: ## Wait for Cluster to be created if self.wait_for_create: try: print_info("Waiting for EmrCluster to be active.") if self.job_flow_id is not None: waiter = self.get_service_client(aws_client).get_waiter("cluster_running") waiter.wait( ClusterId=self.job_flow_id, WaiterConfig={ "Delay": self.waiter_delay, "MaxAttempts": self.waiter_max_attempts, }, ) else: logger.warning("Skipping waiter, No ClusterId found") except Exception as e: logger.error("Waiter failed.") logger.error(e) return True def _read(self, aws_client: AwsApiClient) -> Optional[Any]: """Returns the EmrCluster Args: aws_client: The AwsApiClient for the current cluster """ from botocore.exceptions import ClientError logger.debug(f"Reading {self.get_resource_type()}: {self.get_resource_name()}") try: service_client = self.get_service_client(aws_client) list_response = service_client.list_clusters() # logger.debug(f"list_response type: {type(list_response)}") # logger.debug(f"list_response: {list_response}") cluster_summary_list = list_response.get("Clusters", None) if cluster_summary_list is not None and isinstance(cluster_summary_list, list): for _cluster_summary in cluster_summary_list: cluster_name = _cluster_summary.get("Name", None) if cluster_name == self.name: self.active_resource = _cluster_summary break if self.active_resource is None: logger.debug(f"No {self.get_resource_type()} found") return None # logger.debug(f"EmrCluster: {self.active_resource}") self.job_flow_id = self.active_resource.get("Id", None) self.cluster_arn = self.active_resource.get("ClusterArn", None) except ClientError as ce: logger.debug(f"ClientError: {ce}") except Exception as e: logger.error(f"Error reading {self.get_resource_type()}.") logger.error(e) return self.active_resource def _delete(self, aws_client: AwsApiClient) -> bool: """Deletes the EmrCluster Args: aws_client: The AwsApiClient for the current cluster """ print_info(f"Deleting {self.get_resource_type()}: {self.get_resource_name()}") try: # populate self.job_flow_id self._read(aws_client) service_client = self.get_service_client(aws_client) self.active_resource = None if self.job_flow_id: service_client.terminate_job_flows(JobFlowIds=[self.job_flow_id]) print_info(f"{self.get_resource_type()}: {self.get_resource_name()} deleted") else: logger.error("Could not find cluster id") return True except Exception as e: logger.error(f"{self.get_resource_type()} could not be deleted.") logger.error("Please try again or delete resources manually.") logger.error(e) return False