Spaces:

AmmarFahmy
/

AutoRAG_llama3_groq

Runtime error

AutoRAG_llama3_groq / phi /aws /resource /eks /node_group.py

AmmarFahmy

adding all files

105b369 about 1 year ago

23.5 kB

	from typing import Optional, Any, Dict, List, Union, cast
	from typing_extensions import Literal
	from textwrap import dedent

	from phi.aws.api_client import AwsApiClient
	from phi.aws.resource.base import AwsResource
	from phi.aws.resource.ec2.subnet import Subnet
	from phi.aws.resource.eks.cluster import EksCluster
	from phi.aws.resource.iam.role import IamRole
	from phi.cli.console import print_info
	from phi.utils.log import logger


	class EksNodeGroup(AwsResource):
	    """
	    An Amazon EKS managed node group is an Amazon EC2 Auto Scaling group and associated EC2
	    instances that are managed by Amazon Web Services for an Amazon EKS cluster.

	    An Auto Scaling group is a group of EC2 instances that are combined into one management unit.
	    When you set up an auto-scaling group, you specify a scaling policy and AWS will apply that policy to make sure
	    that a certain number of instances is automatically running in your group. If the number of instances drops below a
	    certain value, or if the load increases (depending on the policy),
	    then AWS will automatically spin up new instances for you.

	    https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/eks.html
	    https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/eks.html#EKS.Client.create_nodegroup
	    """

	    resource_type: Optional[str] = "EksNodeGroup"
	    service_name: str = "eks"

	    # Name for the node group
	    name: str
	    # The cluster to create the EksNodeGroup in
	    eks_cluster: EksCluster

	    # The IAM role to associate with your node group.
	    # The Amazon EKS worker node kubelet daemon makes calls to Amazon Web Services APIs on your behalf.
	    # Nodes receive permissions for these API calls through an IAM instance profile and associated policies.
	    # Before you can launch nodes and register them into a cluster,
	    # you must create an IAM role for those nodes to use when they are launched.

	    # ARN for the node group IAM role to use
	    node_role_arn: Optional[str] = None
	    # If node_role_arn is None, a default role is created if create_role is True
	    create_role: bool = True
	    # If node_role is None, a default node_role is created using node_role_name
	    node_role: Optional[IamRole] = None
	    # Name for the default node_role when role is None, use "name-iam-role" if not provided
	    node_role_name: Optional[str] = None
	    # Provide a list of policy ARNs to attach to the node group role
	    add_policy_arns: Optional[List[str]] = None

	    # The scaling configuration details for the Auto Scaling group
	    # Users can provide a dict for scaling config or use min/max/desired values below
	    scaling_config: Optional[Dict[str, Union[str, int]]] = None
	    # The minimum number of nodes that the managed node group can scale in to.
	    min_size: Optional[int] = None
	    # The maximum number of nodes that the managed node group can scale out to.
	    max_size: Optional[int] = None
	    # The current number of nodes that the managed node group should maintain.
	    # WARNING: If you use Cluster Autoscaler, you shouldn't change the desired_size value directly,
	    # as this can cause the Cluster Autoscaler to suddenly scale up or scale down.
	    # Whenever this parameter changes, the number of worker nodes in the node group is updated to
	    # the specified size. If this parameter is given a value that is smaller than the current number of
	    # running worker nodes, the necessary number of worker nodes are terminated to match the given value.
	    desired_size: Optional[int] = None
	    # The root device disk size (in GiB) for your node group instances.
	    # The default disk size is 20 GiB. If you specify launchTemplate,
	    # then don't specify diskSize, or the node group deployment will fail.
	    disk_size: Optional[int] = None
	    # The subnets to use for the Auto Scaling group that is created for your node group.
	    # If you specify launchTemplate, then don't specify SubnetId in your launch template,
	    # or the node group deployment will fail.
	    # For more information about using launch templates with Amazon EKS,
	    # see Launch template support in the Amazon EKS User Guide.
	    subnets: Optional[List[str]] = None
	    # Filter subnets using availability zones
	    subnet_az: Optional[Union[str, List[str]]] = None
	    # Specify the instance types for a node group.
	    # If you specify a GPU instance type, be sure to specify AL2_x86_64_GPU with the amiType parameter.
	    # If you specify launchTemplate, then you can specify zero or one instance type in your launch template
	    # or you can specify 0-20 instance types for instanceTypes.
	    # If however, you specify an instance type in your launch template and specify any instanceTypes,
	    # the node group deployment will fail. If you don't specify an instance type in a launch template
	    # or for instance_types, then t3.medium is used, by default. If you specify Spot for capacityType,
	    # then we recommend specifying multiple values for instanceTypes.
	    instance_types: Optional[List[str]] = None
	    # The AMI type for your node group. GPU instance types should use the AL2_x86_64_GPU AMI type.
	    # Non-GPU instances should use the AL2_x86_64 AMI type.
	    # Arm instances should use the AL2_ARM_64 AMI type.
	    # All types use the Amazon EKS optimized Amazon Linux 2 AMI.
	    # If you specify launchTemplate, and your launch template uses a custom AMI,
	    # then don't specify amiType, or the node group deployment will fail.
	    ami_type: Optional[
	        Literal[
	            "AL2_x86_64",
	            "AL2_x86_64_GPU",
	            "AL2_ARM_64",
	            "CUSTOM",
	            "BOTTLEROCKET_ARM_64",
	            "BOTTLEROCKET_x86_64",
	        ]
	    ] = None
	    # The remote access (SSH) configuration to use with your node group.
	    # If you specify launchTemplate, then don't specify remoteAccess, or the node group deployment will fail.
	    # Keys:
	    #   ec2SshKey (string) -- The Amazon EC2 SSH key that provides access for SSH communication with the nodes
	    #       in the managed node group. For more information, see Amazon EC2 key pairs and Linux instances in the
	    #       Amazon Elastic Compute Cloud User Guide for Linux Instances.
	    #   sourceSecurityGroups (list) -- The security groups that are allowed SSH access (port 22) to the nodes.
	    #       If you specify an Amazon EC2 SSH key but do not specify a source security group when you create
	    #       a managed node group, then port 22 on the nodes is opened to the internet (0.0.0.0/0).
	    #       For more information, see Security Groups for Your VPC in the Amazon Virtual Private Cloud User Guide.
	    remote_access: Optional[Dict[str, str]] = None
	    # The Kubernetes labels to be applied to the nodes in the node group when they are created.
	    labels: Optional[Dict[str, str]] = None
	    # The Kubernetes taints to be applied to the nodes in the node group.
	    taints: Optional[List[dict]] = None
	    # The metadata to apply to the node group to assist with categorization and organization.
	    # Each tag consists of a key and an optional value. You define both.
	    # Node group tags do not propagate to any other resources associated with the node group,
	    # such as the Amazon EC2 instances or subnets.
	    tags: Optional[Dict[str, str]] = None
	    # Unique, case-sensitive identifier that you provide to ensure the idempotency of the request.
	    # This field is autopopulated if not provided.
	    client_request_token: Optional[str] = None
	    # An object representing a node group's launch template specification.
	    # If specified, then do not specify instanceTypes, diskSize, or remoteAccess and make sure that the launch template
	    # meets the requirements in launchTemplateSpecification.
	    launch_template: Optional[Dict[str, str]] = None
	    # The node group update configuration.
	    update_config: Optional[Dict[str, int]] = None
	    # The capacity type for your node group.
	    capacity_type: Optional[Literal["ON_DEMAND", "SPOT"]] = None
	    # The Kubernetes version to use for your managed nodes.
	    # By default, the Kubernetes version of the cluster is used, and this is the only accepted specified value.
	    # If you specify launchTemplate, and your launch template uses a custom AMI,
	    # then don't specify version, or the node group deployment will fail.
	    version: Optional[str] = None
	    # The AMI version of the Amazon EKS optimized AMI to use with your node group.
	    # By default, the latest available AMI version for the node group's current Kubernetes version is used.
	    release_version: Optional[str] = None

	    # provided by api on create
	    created_at: Optional[str] = None
	    nodegroup_status: Optional[str] = None

	    # provided by api on update
	    update_id: Optional[str] = None
	    update_status: Optional[str] = None

	    # bump the wait time for Eks to 30 seconds
	    waiter_delay: int = 30

	    def _build_scaling_config(self) -> Optional[Dict[str, Union[str, int]]]:
	        """Returns the scalingConfig dict to send to the EKS API, or None.

	        An explicitly provided `scaling_config` is returned unchanged. Otherwise the
	        dict is assembled from `min_size`, `max_size` and `desired_size`. Returns None
	        when none of those are set, so the caller can omit the argument entirely.
	        """
	        if self.scaling_config is not None:
	            return self.scaling_config

	        scaling_config: Dict[str, Union[str, int]] = {}
	        if self.min_size is not None:
	            scaling_config["minSize"] = self.min_size
	            # use min_size as the default for maxSize/desiredSize in case they are not provided
	            scaling_config["maxSize"] = self.min_size
	            scaling_config["desiredSize"] = self.min_size
	        if self.max_size is not None:
	            scaling_config["maxSize"] = self.max_size
	        if self.desired_size is not None:
	            scaling_config["desiredSize"] = self.desired_size
	        # An empty dict means no size was provided
	        return scaling_config or None

	    def _create(self, aws_client: AwsApiClient) -> bool:
	        """Creates a NodeGroup for your Amazon EKS cluster.

	        Args:
	            aws_client: The AwsApiClient for the current cluster

	        Returns:
	            True if the node group was created or already exists, False otherwise.
	        """
	        print_info(f"Creating {self.get_resource_type()}: {self.get_resource_name()}")

	        # Step 1: Get NodeGroup IamRole
	        nodegroup_iam_role_arn = self.node_role_arn
	        if nodegroup_iam_role_arn is None and self.create_role:
	            # Create NodeGroup IamRole and get nodegroup_iam_role_arn
	            nodegroup_iam_role = self.get_nodegroup_iam_role()
	            try:
	                nodegroup_iam_role.create(aws_client)
	                nodegroup_iam_role_arn = nodegroup_iam_role.read(aws_client).arn
	                print_info(f"ARN for {nodegroup_iam_role.name}: {nodegroup_iam_role_arn}")
	            except Exception as e:
	                logger.error("NodeGroup IamRole creation failed, please fix and try again")
	                logger.error(e)
	                return False
	        if nodegroup_iam_role_arn is None:
	            logger.error("IamRole ARN not available, please fix and try again")
	            return False

	        # Step 2: Get the subnets
	        subnets: Optional[List[str]] = self.subnets
	        if subnets is None:
	            # Use subnets from EKSCluster if subnets not provided
	            subnets = self.eks_cluster.get_subnets(aws_client=aws_client)
	        if subnets is None:
	            # Without this guard the availability zone filter below would iterate over None
	            logger.error("Subnets not available, please fix and try again")
	            return False
	        # Filter subnets using availability zones
	        if self.subnet_az is not None:
	            azs_filter: List[str] = []
	            if isinstance(self.subnet_az, str):
	                azs_filter.append(self.subnet_az)
	            elif isinstance(self.subnet_az, list):
	                azs_filter.extend(self.subnet_az)

	            subnets = [
	                subnet_id
	                for subnet_id in subnets
	                if Subnet(name=subnet_id).get_availability_zone(aws_client=aws_client) in azs_filter
	            ]
	        logger.debug(f"Using subnets from EKSCluster: {subnets}")
	        # cast for type checker
	        subnets = cast(List[str], subnets)

	        # Step 3: Get the scaling_config
	        scaling_config = self._build_scaling_config()

	        # create a dict of args which are not null, otherwise aws type validation fails
	        not_null_args: Dict[str, Any] = {}
	        if scaling_config is not None:
	            not_null_args["scalingConfig"] = scaling_config
	        if self.disk_size is not None:
	            not_null_args["diskSize"] = self.disk_size
	        if self.instance_types is not None:
	            not_null_args["instanceTypes"] = self.instance_types
	        if self.ami_type is not None:
	            not_null_args["amiType"] = self.ami_type
	        if self.remote_access is not None:
	            not_null_args["remoteAccess"] = self.remote_access
	        if self.labels is not None:
	            not_null_args["labels"] = self.labels
	        if self.taints is not None:
	            not_null_args["taints"] = self.taints
	        if self.tags is not None:
	            not_null_args["tags"] = self.tags
	        if self.client_request_token is not None:
	            not_null_args["clientRequestToken"] = self.client_request_token
	        if self.launch_template is not None:
	            not_null_args["launchTemplate"] = self.launch_template
	        if self.update_config is not None:
	            not_null_args["updateConfig"] = self.update_config
	        if self.capacity_type is not None:
	            not_null_args["capacityType"] = self.capacity_type
	        if self.version is not None:
	            not_null_args["version"] = self.version
	        if self.release_version is not None:
	            # The create_nodegroup keyword is camelCase like the others: releaseVersion
	            not_null_args["releaseVersion"] = self.release_version

	        # Step 4: Create EksNodeGroup
	        service_client = self.get_service_client(aws_client)
	        try:
	            create_response = service_client.create_nodegroup(
	                clusterName=self.eks_cluster.name,
	                nodegroupName=self.name,
	                subnets=subnets,
	                nodeRole=nodegroup_iam_role_arn,
	                **not_null_args,
	            )
	            logger.debug(f"EksNodeGroup: {create_response}")
	            nodegroup_dict = create_response.get("nodegroup", {})

	            # Validate EksNodeGroup creation
	            self.created_at = nodegroup_dict.get("createdAt", None)
	            self.nodegroup_status = nodegroup_dict.get("status", None)
	            logger.debug(f"created_at: {self.created_at}")
	            logger.debug(f"nodegroup_status: {self.nodegroup_status}")
	            if self.created_at is not None:
	                print_info(f"EksNodeGroup created: {self.name}")
	                self.active_resource = create_response
	                return True
	        except service_client.exceptions.ResourceInUseException:
	            print_info(f"EksNodeGroup already exists: {self.name}")
	            return True
	        except Exception as e:
	            logger.error(f"{self.get_resource_type()} could not be created.")
	            logger.error(e)
	        # Reached when the API call failed or the response had no createdAt
	        return False

	    def post_create(self, aws_client: AwsApiClient) -> bool:
	        """Waits for the EksNodeGroup to become active if `wait_for_create` is set.

	        Waiter failures are logged but not propagated, so this always returns True.

	        Args:
	            aws_client: The AwsApiClient for the current cluster
	        """
	        # Wait for EksNodeGroup to be created
	        if self.wait_for_create:
	            try:
	                print_info(f"Waiting for {self.get_resource_type()} to be created.")
	                waiter = self.get_service_client(aws_client).get_waiter("nodegroup_active")
	                waiter.wait(
	                    clusterName=self.eks_cluster.name,
	                    nodegroupName=self.name,
	                    WaiterConfig={
	                        "Delay": self.waiter_delay,
	                        "MaxAttempts": self.waiter_max_attempts,
	                    },
	                )
	            except Exception as e:
	                logger.error("Waiter failed.")
	                logger.error(e)
	        return True

	    def _read(self, aws_client: AwsApiClient) -> Optional[Any]:
	        """Returns the EksNodeGroup

	        Calls describe_nodegroup and, when the response carries a createdAt value,
	        stores the response as `active_resource`.

	        Args:
	            aws_client: The AwsApiClient for the current cluster

	        Returns:
	            The current value of `active_resource`.
	        """
	        logger.debug(f"Reading {self.get_resource_type()}: {self.get_resource_name()}")

	        from botocore.exceptions import ClientError

	        service_client = self.get_service_client(aws_client)
	        try:
	            describe_response = service_client.describe_nodegroup(
	                clusterName=self.eks_cluster.name,
	                nodegroupName=self.name,
	            )
	            # logger.debug(f"describe_response: {describe_response}")
	            nodegroup_dict = describe_response.get("nodegroup", {})

	            self.created_at = nodegroup_dict.get("createdAt", None)
	            self.nodegroup_status = nodegroup_dict.get("status", None)
	            logger.debug(f"NodeGroup created_at: {self.created_at}")
	            logger.debug(f"NodeGroup status: {self.nodegroup_status}")
	            if self.created_at is not None:
	                logger.debug(f"EksNodeGroup found: {self.name}")
	                self.active_resource = describe_response
	        except ClientError as ce:
	            # ClientErrors are only logged at debug level
	            logger.debug(f"ClientError: {ce}")
	        except Exception as e:
	            logger.error(f"Error reading {self.get_resource_type()}.")
	            logger.error(e)
	        return self.active_resource

	    def _delete(self, aws_client: AwsApiClient) -> bool:
	        """Deletes the EksNodeGroup

	        Args:
	            aws_client: The AwsApiClient for the current cluster

	        Returns:
	            True if the delete_nodegroup call succeeded, False otherwise.
	        """
	        print_info(f"Deleting {self.get_resource_type()}: {self.get_resource_name()}")

	        # Step 1: Delete the IamRole
	        # NOTE(review): the role is deleted while the node group still exists and the
	        # nodes may still be using it. Consider deleting it after the node group is gone.
	        if self.node_role_arn is None and self.create_role:
	            nodegroup_iam_role = self.get_nodegroup_iam_role()
	            try:
	                nodegroup_iam_role.delete(aws_client)
	            except Exception as e:
	                # Best effort: a failed role deletion does not stop the node group deletion
	                logger.error("IamRole deletion failed, please try again or delete manually")
	                logger.error(e)

	        # Step 2: Delete the NodeGroup
	        service_client = self.get_service_client(aws_client)
	        self.active_resource = None
	        try:
	            delete_response = service_client.delete_nodegroup(
	                clusterName=self.eks_cluster.name,
	                nodegroupName=self.name,
	            )
	            logger.debug(f"EksNodeGroup: {delete_response}")
	            return True
	        except Exception as e:
	            logger.error(f"{self.get_resource_type()} could not be deleted.")
	            logger.error("Please try again or delete resources manually.")
	            logger.error(e)
	        return False

	    def post_delete(self, aws_client: AwsApiClient) -> bool:
	        """Waits for the EksNodeGroup to be deleted if `wait_for_delete` is set.

	        Waiter failures are logged but not propagated, so this always returns True.

	        Args:
	            aws_client: The AwsApiClient for the current cluster
	        """
	        # Wait for EksNodeGroup to be deleted
	        if self.wait_for_delete:
	            try:
	                print_info(f"Waiting for {self.get_resource_type()} to be deleted.")
	                waiter = self.get_service_client(aws_client).get_waiter("nodegroup_deleted")
	                waiter.wait(
	                    clusterName=self.eks_cluster.name,
	                    nodegroupName=self.name,
	                    WaiterConfig={
	                        "Delay": self.waiter_delay,
	                        "MaxAttempts": self.waiter_max_attempts,
	                    },
	                )
	            except Exception as e:
	                logger.error("Waiter failed.")
	                logger.error(e)
	        return True

	    def get_nodegroup_iam_role(self) -> IamRole:
	        """Returns the IamRole for the node group.

	        Returns `node_role` when it is provided. Otherwise builds a default IamRole
	        that EC2 can assume, with the default managed policies plus any `add_policy_arns`.
	        This method only builds the IamRole object, it does not create it on AWS.
	        """
	        if self.node_role is not None:
	            return self.node_role

	        # NOTE(review): AmazonS3FullAccess gives every node full access to all S3 buckets
	        # in the account. Kept for backward compatibility, consider a narrower policy.
	        policy_arns = [
	            "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy",
	            "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly",
	            "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy",
	            "arn:aws:iam::aws:policy/AmazonS3FullAccess",
	            "arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy",
	            "arn:aws:iam::aws:policy/service-role/AmazonEFSCSIDriverPolicy",
	        ]
	        if self.add_policy_arns is not None and isinstance(self.add_policy_arns, list):
	            policy_arns.extend(self.add_policy_arns)

	        return IamRole(
	            name=self.node_role_name or f"{self.name}-iam-role",
	            assume_role_policy_document=dedent(
	                """\
	                {
	                  "Version": "2012-10-17",
	                  "Statement": [
	                    {
	                      "Effect": "Allow",
	                      "Principal": {
	                        "Service": "ec2.amazonaws.com"
	                      },
	                      "Action": "sts:AssumeRole"
	                    }
	                  ]
	                }
	                """
	            ),
	            policy_arns=policy_arns,
	        )

	    def _update(self, aws_client: AwsApiClient) -> bool:
	        """Updates the EksNodeGroup scaling and update configuration.

	        Args:
	            aws_client: The AwsApiClient for the current cluster

	        Returns:
	            True if the API returned an update id, False otherwise.
	        """
	        print_info(f"Updating {self.get_resource_type()}: {self.get_resource_name()}")

	        scaling_config = self._build_scaling_config()

	        # TODO: Add logic to calculate updated_labels and updated_taints
	        updated_labels = None
	        updated_taints = None

	        # create a dict of args which are not null, otherwise aws type validation fails
	        not_null_args: Dict[str, Any] = {}
	        if scaling_config is not None:
	            not_null_args["scalingConfig"] = scaling_config
	        if updated_labels is not None:
	            not_null_args["labels"] = updated_labels
	        if updated_taints is not None:
	            not_null_args["taints"] = updated_taints
	        if self.update_config is not None:
	            not_null_args["updateConfig"] = self.update_config

	        # Update EksNodeGroup
	        service_client = self.get_service_client(aws_client)
	        try:
	            update_response = service_client.update_nodegroup_config(
	                clusterName=self.eks_cluster.name,
	                nodegroupName=self.name,
	                **not_null_args,
	            )
	            logger.debug(f"EksNodeGroup: {update_response}")
	            nodegroup_dict = update_response.get("update", {})

	            # Validate EksNodeGroup update
	            self.update_id = nodegroup_dict.get("id", None)
	            self.update_status = nodegroup_dict.get("status", None)
	            logger.debug(f"update_id: {self.update_id}")
	            logger.debug(f"update_status: {self.update_status}")
	            if self.update_id is not None:
	                print_info(f"EksNodeGroup updated: {self.name}")
	                return True
	        except Exception as e:
	            logger.error(f"{self.get_resource_type()} could not be updated.")
	            logger.error(e)
	        # Reached when the API call failed or the response had no update id
	        return False