File size: 23,482 Bytes
105b369
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
from typing import Optional, Any, Dict, List, Union, cast
from typing_extensions import Literal
from textwrap import dedent

from phi.aws.api_client import AwsApiClient
from phi.aws.resource.base import AwsResource
from phi.aws.resource.ec2.subnet import Subnet
from phi.aws.resource.eks.cluster import EksCluster
from phi.aws.resource.iam.role import IamRole
from phi.cli.console import print_info
from phi.utils.log import logger


class EksNodeGroup(AwsResource):
    """
    An Amazon EKS managed node group is an Amazon EC2 Auto Scaling group and associated EC2
    instances that are managed by Amazon Web Services for an Amazon EKS cluster.

    An Auto Scaling group is a group of EC2 instances that are combined into one management unit.
    When you set up an auto-scaling group, you specify a scaling policy and AWS will apply that policy to make sure
    that a certain number of instances is automatically running in your group. If the number of instances drops below a
    certain value, or if the load increases (depending on the policy),
    then AWS will automatically spin up new instances for you.

    https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/eks.html
    https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/eks.html#EKS.Client.create_nodegroup
    """

    resource_type: Optional[str] = "EksNodeGroup"
    service_name: str = "eks"

    # Name for the node group
    name: str
    # The cluster to create the EksNodeGroup in
    eks_cluster: EksCluster

    # The IAM role to associate with your node group.
    # The Amazon EKS worker node kubelet daemon makes calls to Amazon Web Services APIs on your behalf.
    # Nodes receive permissions for these API calls through an IAM instance profile and associated policies.
    # Before you can launch nodes and register them into a cluster,
    # you must create an IAM role for those nodes to use when they are launched.

    # ARN for the node group IAM role to use
    node_role_arn: Optional[str] = None
    # If node_role_arn is None, a default role is created if create_role is True
    create_role: bool = True
    # If node_role is None, a default node_role is created using node_role_name
    node_role: Optional[IamRole] = None
    # Name for the default node_role when role is None, use "name-iam-role" if not provided
    node_role_name: Optional[str] = None
    # Provide a list of policy ARNs to attach to the node group role
    add_policy_arns: Optional[List[str]] = None

    # The scaling configuration details for the Auto Scaling group
    # Users can provide a dict for scaling config or use min/max/desired values below
    scaling_config: Optional[Dict[str, Union[str, int]]] = None
    # The minimum number of nodes that the managed node group can scale in to.
    min_size: Optional[int] = None
    # The maximum number of nodes that the managed node group can scale out to.
    max_size: Optional[int] = None
    # The current number of nodes that the managed node group should maintain.
    # WARNING: If you use Cluster Autoscaler, you shouldn't change the desired_size value directly,
    # as this can cause the Cluster Autoscaler to suddenly scale up or scale down.
    # Whenever this parameter changes, the number of worker nodes in the node group is updated to
    # the specified size. If this parameter is given a value that is smaller than the current number of
    # running worker nodes, the necessary number of worker nodes are terminated to match the given value.
    desired_size: Optional[int] = None
    # The root device disk size (in GiB) for your node group instances.
    # The default disk size is 20 GiB. If you specify launchTemplate,
    # then don't specify diskSize, or the node group deployment will fail.
    disk_size: Optional[int] = None
    # The subnets to use for the Auto Scaling group that is created for your node group.
    # If you specify launchTemplate, then don't specify SubnetId in your launch template,
    # or the node group deployment will fail.
    # For more information about using launch templates with Amazon EKS,
    # see Launch template support in the Amazon EKS User Guide.
    subnets: Optional[List[str]] = None
    # Filter subnets using availability zones
    subnet_az: Optional[Union[str, List[str]]] = None
    # Specify the instance types for a node group.
    # If you specify a GPU instance type, be sure to specify AL2_x86_64_GPU with the amiType parameter.
    # If you specify launchTemplate , then you can specify zero or one instance type in your launch template
    # or you can specify 0-20 instance types for instanceTypes .
    # If however, you specify an instance type in your launch template and specify any instanceTypes ,
    # the node group deployment will fail. If you don't specify an instance type in a launch template
    # or for instance_types, then t3.medium is used, by default. If you specify Spot for capacityType,
    # then we recommend specifying multiple values for instanceTypes .
    instance_types: Optional[List[str]] = None
    # The AMI type for your node group. GPU instance types should use the AL2_x86_64_GPU AMI type.
    # Non-GPU instances should use the AL2_x86_64 AMI type.
    # Arm instances should use the AL2_ARM_64 AMI type.
    # All types use the Amazon EKS optimized Amazon Linux 2 AMI.
    # If you specify launchTemplate , and your launch template uses a custom AMI,
    # then don't specify amiType , or the node group deployment will fail.
    ami_type: Optional[
        Literal[
            "AL2_x86_64",
            "AL2_x86_64_GPU",
            "AL2_ARM_64",
            "CUSTOM",
            "BOTTLEROCKET_ARM_64",
            "BOTTLEROCKET_x86_64",
        ]
    ] = None
    # The remote access (SSH) configuration to use with your node group.
    # If you specify launchTemplate, then don't specify remoteAccess, or the node group deployment will fail.
    # Keys:
    #   ec2SshKey (string) -- The Amazon EC2 SSH key that provides access for SSH communication with the nodes
    #   in the managed node group. For more information, see Amazon EC2 key pairs and Linux instances in the
    #   Amazon Elastic Compute Cloud User Guide for Linux Instances .
    #   sourceSecurityGroups (list) -- The security groups that are allowed SSH access (port 22) to the nodes.
    #   If you specify an Amazon EC2 SSH key but do not specify a source security group when you create
    #   a managed node group, then port 22 on the nodes is opened to the internet (0.0.0.0/0).
    #   For more information, see Security Groups for Your VPC in the Amazon Virtual Private Cloud User Guide .
    remote_access: Optional[Dict[str, str]] = None
    # The Kubernetes labels to be applied to the nodes in the node group when they are created.
    labels: Optional[Dict[str, str]] = None
    # The Kubernetes taints to be applied to the nodes in the node group.
    taints: Optional[List[dict]] = None
    # The metadata to apply to the node group to assist with categorization and organization.
    # Each tag consists of a key and an optional value. You define both.
    # Node group tags do not propagate to any other resources associated with the node group,
    # such as the Amazon EC2 instances or subnets.
    tags: Optional[Dict[str, str]] = None
    # Unique, case-sensitive identifier that you provide to ensure the idempotency of the request.
    # This field is autopopulated if not provided.
    client_request_token: Optional[str] = None
    # An object representing a node group's launch template specification.
    # If specified, then do not specify instanceTypes, diskSize, or remoteAccess and make sure that the launch template
    # meets the requirements in launchTemplateSpecification .
    launch_template: Optional[Dict[str, str]] = None
    # The node group update configuration.
    update_config: Optional[Dict[str, int]] = None
    # The capacity type for your node group.
    capacity_type: Optional[Literal["ON_DEMAND", "SPOT"]] = None
    # The Kubernetes version to use for your managed nodes.
    # By default, the Kubernetes version of the cluster is used, and this is the only accepted specified value.
    # If you specify launchTemplate , and your launch template uses a custom AMI,
    # then don't specify version , or the node group deployment will fail.
    version: Optional[str] = None
    # The AMI version of the Amazon EKS optimized AMI to use with your node group.
    # By default, the latest available AMI version for the node group's current Kubernetes version is used.
    release_version: Optional[str] = None

    # provided by api on create
    created_at: Optional[str] = None
    nodegroup_status: Optional[str] = None

    # provided by api on update
    update_id: Optional[str] = None
    update_status: Optional[str] = None

    # bump the wait time for Eks to 30 seconds
    waiter_delay: int = 30

    def _create(self, aws_client: AwsApiClient) -> bool:
        """Creates a NodeGroup for your Amazon EKS cluster.

        Steps:
            1. Resolve the node IAM role ARN: use node_role_arn if provided, otherwise
               (when create_role is True) create the role returned by
               get_nodegroup_iam_role() and read its ARN.
            2. Resolve the subnets: use self.subnets if provided, otherwise the subnets
               of the EksCluster, optionally filtered by subnet_az.
            3. Resolve the scaling config: use scaling_config if provided, otherwise
               build it from min_size/max_size/desired_size.
            4. Call create_nodegroup, passing only the optional arguments that are set.

        Args:
            aws_client: The AwsApiClient for the current cluster

        Returns:
            True if the node group was created or already exists
            (ResourceInUseException), False otherwise.
        """
        print_info(f"Creating {self.get_resource_type()}: {self.get_resource_name()}")

        # Step 1: Get NodeGroup IamRole
        nodegroup_iam_role_arn = self.node_role_arn
        if nodegroup_iam_role_arn is None and self.create_role:
            # Create NodeGroup IamRole and get nodegroup_iam_role_arn
            nodegroup_iam_role = self.get_nodegroup_iam_role()
            try:
                nodegroup_iam_role.create(aws_client)
                nodegroup_iam_role_arn = nodegroup_iam_role.read(aws_client).arn
                print_info(f"ARN for {nodegroup_iam_role.name}: {nodegroup_iam_role_arn}")
            except Exception as e:
                logger.error("NodeGroup IamRole creation failed, please fix and try again")
                logger.error(e)
                return False
        if nodegroup_iam_role_arn is None:
            logger.error("IamRole ARN not available, please fix and try again")
            return False

        # Step 2: Get the subnets
        subnets: Optional[List[str]] = self.subnets
        if subnets is None:
            # Use subnets from EKSCluster if subnets not provided
            subnets = self.eks_cluster.get_subnets(aws_client=aws_client)
            # Filter subnets using availability zones.
            # Skip the filter when the cluster returned no subnets so that we do not
            # iterate over None; create_nodegroup below will report the problem.
            if self.subnet_az is not None and subnets is not None:
                azs_filter: List[str] = []
                if isinstance(self.subnet_az, str):
                    azs_filter.append(self.subnet_az)
                elif isinstance(self.subnet_az, list):
                    azs_filter.extend(self.subnet_az)

                subnets = [
                    subnet_id
                    for subnet_id in subnets
                    if Subnet(name=subnet_id).get_availability_zone(aws_client=aws_client) in azs_filter
                ]
            logger.debug(f"Using subnets from EKSCluster: {subnets}")
        # cast for type checker
        subnets = cast(List[str], subnets)

        # Step 3: Get the scaling_config
        scaling_config: Optional[Dict[str, Union[str, int]]] = self.scaling_config
        if scaling_config is None:
            # Build the scaling_config from the individual size fields
            built_config: Dict[str, Union[str, int]] = {}
            if self.min_size is not None:
                built_config["minSize"] = self.min_size
                # use min_size as the default for maxSize/desiredSize incase maxSize/desiredSize is not provided
                built_config["maxSize"] = self.min_size
                built_config["desiredSize"] = self.min_size
            if self.max_size is not None:
                built_config["maxSize"] = self.max_size
            if self.desired_size is not None:
                built_config["desiredSize"] = self.desired_size
            # Leave scaling_config as None when no size field was provided
            scaling_config = built_config or None

        # create a dict of args which are not null, otherwise aws type validation fails.
        # Keys are the boto3 create_nodegroup parameter names (camelCase).
        optional_args: Dict[str, Any] = {
            "scalingConfig": scaling_config,
            "diskSize": self.disk_size,
            "instanceTypes": self.instance_types,
            "amiType": self.ami_type,
            "remoteAccess": self.remote_access,
            "labels": self.labels,
            "taints": self.taints,
            "tags": self.tags,
            "clientRequestToken": self.client_request_token,
            "launchTemplate": self.launch_template,
            "updateConfig": self.update_config,
            "capacityType": self.capacity_type,
            "version": self.version,
            # boto3 expects "releaseVersion"; "release_version" fails parameter validation
            "releaseVersion": self.release_version,
        }
        not_null_args: Dict[str, Any] = {key: value for key, value in optional_args.items() if value is not None}

        # Step 4: Create EksNodeGroup
        service_client = self.get_service_client(aws_client)
        try:
            create_response = service_client.create_nodegroup(
                clusterName=self.eks_cluster.name,
                nodegroupName=self.name,
                subnets=subnets,
                nodeRole=nodegroup_iam_role_arn,
                **not_null_args,
            )
            logger.debug(f"EksNodeGroup: {create_response}")
            nodegroup_dict = create_response.get("nodegroup", {})

            # Validate EksNodeGroup creation: a createdAt value marks success
            self.created_at = nodegroup_dict.get("createdAt", None)
            self.nodegroup_status = nodegroup_dict.get("status", None)
            logger.debug(f"created_at: {self.created_at}")
            logger.debug(f"nodegroup_status: {self.nodegroup_status}")
            if self.created_at is not None:
                print_info(f"EksNodeGroup created: {self.name}")
                self.active_resource = create_response
                return True
        except service_client.exceptions.ResourceInUseException:
            # An existing node group with this name is treated as success
            print_info(f"EksNodeGroup already exists: {self.name}")
            return True
        except Exception as e:
            logger.error(f"{self.get_resource_type()} could not be created.")
            logger.error(e)
        return False

    def post_create(self, aws_client: AwsApiClient) -> bool:
        """Optionally blocks until the EksNodeGroup is active.

        When wait_for_create is set, runs the "nodegroup_active" waiter using
        waiter_delay and waiter_max_attempts. Waiter errors are logged, not raised.

        Args:
            aws_client: The AwsApiClient for the current cluster

        Returns:
            Always True, including when the waiter fails.
        """
        if not self.wait_for_create:
            return True

        try:
            print_info(f"Waiting for {self.get_resource_type()} to be created.")
            eks_client = self.get_service_client(aws_client)
            active_waiter = eks_client.get_waiter("nodegroup_active")
            active_waiter.wait(
                clusterName=self.eks_cluster.name,
                nodegroupName=self.name,
                WaiterConfig={
                    "Delay": self.waiter_delay,
                    "MaxAttempts": self.waiter_max_attempts,
                },
            )
        except Exception as e:
            # Best effort: a failed wait does not fail the create
            logger.error("Waiter failed.")
            logger.error(e)
        return True

    def _read(self, aws_client: AwsApiClient) -> Optional[Any]:
        """Describes the EksNodeGroup and records its state on this object.

        Updates created_at and nodegroup_status from the describe_nodegroup response.
        A ClientError is logged at debug level; any other error is logged as an error.

        Args:
            aws_client: The AwsApiClient for the current cluster

        Returns:
            self.active_resource: the describe_nodegroup response when it carries a
            createdAt value, otherwise whatever active_resource already held.
        """
        logger.debug(f"Reading {self.get_resource_type()}: {self.get_resource_name()}")

        from botocore.exceptions import ClientError

        eks_client = self.get_service_client(aws_client)
        try:
            response = eks_client.describe_nodegroup(
                clusterName=self.eks_cluster.name,
                nodegroupName=self.name,
            )
            nodegroup = response.get("nodegroup", {})

            self.created_at = nodegroup.get("createdAt")
            self.nodegroup_status = nodegroup.get("status")
            logger.debug(f"NodeGroup created_at: {self.created_at}")
            logger.debug(f"NodeGroup status: {self.nodegroup_status}")
            # Only treat the node group as found when the API reports a creation time
            if self.created_at is not None:
                logger.debug(f"EksNodeGroup found: {self.name}")
                self.active_resource = response
        except ClientError as ce:
            logger.debug(f"ClientError: {ce}")
        except Exception as e:
            logger.error(f"Error reading {self.get_resource_type()}.")
            logger.error(e)
        return self.active_resource

    def _delete(self, aws_client: AwsApiClient) -> bool:
        """Deletes the EksNodeGroup and, when this resource manages it, its IamRole.

        The IamRole is deleted first, and only when node_role_arn is None and
        create_role is True. A failed role deletion is logged and does not stop
        the node group deletion.

        Args:
            aws_client: The AwsApiClient for the current cluster

        Returns:
            True if the delete_nodegroup call succeeded, False otherwise.
        """
        print_info(f"Deleting {self.get_resource_type()}: {self.get_resource_name()}")

        # Step 1: Delete the IamRole (only the role this resource would have created)
        manages_role = self.node_role_arn is None and self.create_role
        if manages_role:
            iam_role = self.get_nodegroup_iam_role()
            try:
                iam_role.delete(aws_client)
            except Exception as e:
                logger.error("IamRole deletion failed, please try again or delete manually")
                logger.error(e)

        # Step 2: Delete the NodeGroup
        eks_client = self.get_service_client(aws_client)
        # Drop the cached resource before attempting the delete
        self.active_resource = None
        try:
            response = eks_client.delete_nodegroup(
                clusterName=self.eks_cluster.name,
                nodegroupName=self.name,
            )
            logger.debug(f"EksNodeGroup: {response}")
        except Exception as e:
            logger.error(f"{self.get_resource_type()} could not be deleted.")
            logger.error("Please try again or delete resources manually.")
            logger.error(e)
            return False
        else:
            return True

    def post_delete(self, aws_client: AwsApiClient) -> bool:
        """Optionally blocks until the EksNodeGroup is deleted.

        When wait_for_delete is set, runs the "nodegroup_deleted" waiter using
        waiter_delay and waiter_max_attempts. Waiter errors are logged, not raised.

        Args:
            aws_client: The AwsApiClient for the current cluster

        Returns:
            Always True, including when the waiter fails.
        """
        if not self.wait_for_delete:
            return True

        try:
            print_info(f"Waiting for {self.get_resource_type()} to be deleted.")
            eks_client = self.get_service_client(aws_client)
            deleted_waiter = eks_client.get_waiter("nodegroup_deleted")
            deleted_waiter.wait(
                clusterName=self.eks_cluster.name,
                nodegroupName=self.name,
                WaiterConfig={
                    "Delay": self.waiter_delay,
                    "MaxAttempts": self.waiter_max_attempts,
                },
            )
        except Exception as e:
            # Best effort: a failed wait does not fail the delete
            logger.error("Waiter failed.")
            logger.error(e)
        return True

    def get_nodegroup_iam_role(self) -> IamRole:
        """Returns the IamRole to use for the node group.

        If node_role is set it is returned as-is. Otherwise a default IamRole is
        built with an EC2 assume-role trust policy, the default managed policy ARNs
        listed below, and any ARNs from add_policy_arns.

        Returns:
            IamRole: the user-provided node_role or the default role definition.
        """
        if self.node_role is not None:
            return self.node_role

        # Default managed policies attached to the node group role.
        # NOTE(review): AmazonS3FullAccess is a broad grant for a default — confirm it is intended.
        policy_arns = [
            "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy",
            "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly",
            "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy",
            "arn:aws:iam::aws:policy/AmazonS3FullAccess",
            "arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy",
            "arn:aws:iam::aws:policy/service-role/AmazonEFSCSIDriverPolicy",
        ]
        # Append user-supplied policies; anything that is not a list is ignored
        if isinstance(self.add_policy_arns, list):
            policy_arns += self.add_policy_arns

        # Use "<name>-iam-role" when no explicit role name is given
        role_name = self.node_role_name or f"{self.name}-iam-role"

        # Trust policy that lets EC2 assume the role
        assume_role_policy = dedent(
            """\
            {
              "Version": "2012-10-17",
              "Statement": [
                {
                  "Effect": "Allow",
                  "Principal": {
                    "Service": "ec2.amazonaws.com"
                  },
                  "Action": "sts:AssumeRole"
                }
              ]
            }
            """
        )

        return IamRole(
            name=role_name,
            assume_role_policy_document=assume_role_policy,
            policy_arns=policy_arns,
        )

    def _update(self, aws_client: AwsApiClient) -> bool:
        """Updates the EksNodeGroup configuration.

        Sends the scaling config (scaling_config, or one built from
        min_size/max_size/desired_size) and update_config via
        update_nodegroup_config. Labels and taints are not updated yet.

        Args:
            aws_client: The AwsApiClient for the current cluster

        Returns:
            True if the response carries an update id, False otherwise.
        """
        print_info(f"Updating {self.get_resource_type()}: {self.get_resource_name()}")

        scaling_config: Optional[Dict[str, Union[str, int]]] = self.scaling_config
        if scaling_config is None:
            # Build the scaling_config from the individual size fields
            sizes: Dict[str, Union[str, int]] = {}
            if self.min_size is not None:
                # min_size also seeds maxSize/desiredSize in case they are not provided
                sizes.update(
                    minSize=self.min_size,
                    maxSize=self.min_size,
                    desiredSize=self.min_size,
                )
            if self.max_size is not None:
                sizes["maxSize"] = self.max_size
            if self.desired_size is not None:
                sizes["desiredSize"] = self.desired_size
            # Stay None when no size field was provided
            scaling_config = sizes or None

        # TODO: Add logic to calculate updated_labels and updated_taints
        updated_labels = None
        updated_taints = None

        # Only pass args which are not null, otherwise aws type validation fails
        candidate_args: Dict[str, Any] = {
            "scalingConfig": scaling_config,
            "labels": updated_labels,
            "taints": updated_taints,
            "updateConfig": self.update_config,
        }
        not_null_args: Dict[str, Any] = {
            arg_name: arg_value for arg_name, arg_value in candidate_args.items() if arg_value is not None
        }

        # Update EksNodeGroup
        eks_client = self.get_service_client(aws_client)
        try:
            response = eks_client.update_nodegroup_config(
                clusterName=self.eks_cluster.name,
                nodegroupName=self.name,
                **not_null_args,
            )
            logger.debug(f"EksNodeGroup: {response}")
            update_info = response.get("update", {})

            # Validate EksNodeGroup update: an update id marks success
            self.update_id = update_info.get("id")
            self.update_status = update_info.get("status")
            logger.debug(f"update_id: {self.update_id}")
            logger.debug(f"update_status: {self.update_status}")
            if self.update_id is not None:
                print_info(f"EksNodeGroup updated: {self.name}")
                return True
        except Exception as e:
            logger.error(f"{self.get_resource_type()} could not be updated.")
            logger.error(e)
        return False