chore: Update targeted ODCR to show p5 to reflect current usage (#…

…1946)
aws-ia · May 13, 2024 · f11b7ef · f11b7ef
1 parent 9a0ca42
commit f11b7ef
Show file tree

Hide file tree

Showing 12 changed files with 273 additions and 189 deletions.
diff --git a/patterns/ml-capacity-block/README.md b/patterns/ml-capacity-block/README.md
@@ -2,11 +2,11 @@
 
 This pattern demonstrates how to consume/utilize ML capacity block reservations (CBR) with Amazon EKS. The solution is comprised of primarily 2 components:
 
-!!! warning
-      The use of self-managed node group(s) are required at this time to support capacity block reservations within EKS. This pattern will be updated to demonstrate EKS managed node groups once support has been implemented by the EKS service.
-
 1. The self-managed node group that will utilize the CBR should have the subnets provided to it restricted to the availability zone where the CBR has been allocated. For example - if the CBR is allocated to `us-west-2b`, the node group should only have subnet IDs provided to it that reside in `us-west-2b`. If the subnets that reside in other AZs are provided, it's possible to encounter an error such as `InvalidParameterException: The following supplied instance types do not exist ...`. It is not guaranteed that this error will always be shown, and may appear random since the underlying autoscaling group(s) will provision nodes into different AZs at random. It will only occur when the underlying autoscaling group tries to provision instances into an AZ where capacity is not allocated and there is insufficient on-demand capacity for the desired instance type.
 
+    !!! warning
+        The use of self-managed node group(s) is required at this time to support capacity block reservations within EKS. This pattern will be updated to demonstrate EKS managed node groups once support has been implemented by the EKS service.
+
 2. The launch template utilized should specify the `instance_market_options` and `capacity_reservation_specification` arguments. This is how the CBR is utilized by the node group (i.e. - tells the autoscaling group to launch instances utilizing provided capacity reservation).
 
 <b>Links:</b>
@@ -16,7 +16,7 @@ This pattern demonstrates how to consume/utilize ML capacity block reservations
 
 ## Code
 
-```terraform hl_lines="53-93"
+```terraform hl_lines="5-11 54-56 84-92"
 {% include  "../../patterns/ml-capacity-block/eks.tf" %}
 ```
 

diff --git a/patterns/ml-capacity-block/eks.tf b/patterns/ml-capacity-block/eks.tf
@@ -6,7 +6,7 @@
 # on how to obtain a ML capacity block reservation. Once acquired, you can provide
 # the reservation ID through this input to deploy the pattern
 variable "capacity_reservation_id" {
-  description = "The ID of the ML capacity block reservation to use for the node group"
+  description = "The ID of the ML capacity block reservation for the node group"
   type        = string
 }
 
@@ -27,9 +27,10 @@ module "eks" {
   cluster_endpoint_public_access           = true
 
   cluster_addons = {
-    coredns    = {}
-    kube-proxy = {}
-    vpc-cni    = {}
+    coredns                = {}
+    eks-pod-identity-agent = {}
+    kube-proxy             = {}
+    vpc-cni                = {}
   }
 
   # Add security group rules on the node group security group to
@@ -53,7 +54,7 @@ module "eks" {
   # Note: ML capacity block reservations are only supported
   # on self-managed node groups at this time
   self_managed_node_groups = {
-    odcr = {
+    cbr = {
       # The EKS AL2 GPU AMI provides all of the necessary components
       # for accelerated workloads w/ EFA
       ami_type      = "AL2_x86_64_GPU"
@@ -94,51 +95,3 @@ module "eks" {
 
   tags = local.tags
 }
-
-################################################################################
-# Helm charts
-################################################################################
-
-resource "helm_release" "nvidia_device_plugin" {
-  name             = "nvidia-device-plugin"
-  repository       = "https://nvidia.github.io/k8s-device-plugin"
-  chart            = "nvidia-device-plugin"
-  version          = "0.14.5"
-  namespace        = "nvidia-device-plugin"
-  create_namespace = true
-  wait             = false
-
-  values = [
-    <<-EOT
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-            - matchExpressions:
-              - key: 'nvidia.com/gpu.present'
-                operator: In
-                values:
-                - 'true'
-    EOT
-  ]
-}
-
-resource "helm_release" "aws_efa_device_plugin" {
-  name       = "aws-efa-k8s-device-plugin"
-  repository = "https://aws.github.io/eks-charts"
-  chart      = "aws-efa-k8s-device-plugin"
-  version    = "v0.4.4"
-  namespace  = "kube-system"
-  wait       = false
-
-  values = [
-    <<-EOT
-      nodeSelector:
-        vpc.amazonaws.com/efa.present: 'true'
-      tolerations:
-        - key: nvidia.com/gpu
-          operator: Exists
-          effect: NoSchedule
-    EOT
-  ]
-}
diff --git a/patterns/ml-capacity-block/helm.tf b/patterns/ml-capacity-block/helm.tf
@@ -0,0 +1,47 @@
+################################################################################
+# Helm charts
+################################################################################
+
+resource "helm_release" "nvidia_device_plugin" {
+  name             = "nvidia-device-plugin"
+  repository       = "https://nvidia.github.io/k8s-device-plugin"
+  chart            = "nvidia-device-plugin"
+  version          = "0.14.5"
+  namespace        = "nvidia-device-plugin"
+  create_namespace = true
+  wait             = false
+
+  values = [
+    <<-EOT
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: 'nvidia.com/gpu.present'
+                operator: In
+                values:
+                - 'true'
+    EOT
+  ]
+}
+
+resource "helm_release" "aws_efa_device_plugin" {
+  name       = "aws-efa-k8s-device-plugin"
+  repository = "https://aws.github.io/eks-charts"
+  chart      = "aws-efa-k8s-device-plugin"
+  version    = "v0.4.4"
+  namespace  = "kube-system"
+  wait       = false
+
+  values = [
+    <<-EOT
+      nodeSelector:
+        vpc.amazonaws.com/efa.present: 'true'
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
+    EOT
+  ]
+}
diff --git a/patterns/ml-capacity-block/main.tf b/patterns/ml-capacity-block/main.tf
@@ -57,6 +57,15 @@ locals {
   }
 }
 
+################################################################################
+# Output
+################################################################################
+
+output "configure_kubectl" {
+  description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig"
+  value       = "aws eks --region ${local.region} update-kubeconfig --name ${module.eks.cluster_name}"
+}
+
 ################################################################################
 # Supporting Resources
 ################################################################################

diff --git a/patterns/nvidia-gpu-efa/README.md b/patterns/nvidia-gpu-efa/README.md
@@ -17,10 +17,14 @@ The following components are demonstrated in this pattern:
 
 ## Code
 
-```terraform hl_lines="23-25 31-68"
+```terraform hl_lines="24-26 32-67"
 {% include  "../../patterns/nvidia-gpu-efa/eks.tf" %}
 ```
 
+```terraform hl_lines="5-47"
+{% include  "../../patterns/nvidia-gpu-efa/helm.tf" %}
+```
+
 ## Deploy
 
 See [here](https://aws-ia.github.io/terraform-aws-eks-blueprints/getting-started/#prerequisites) for the prerequisites and steps to deploy this pattern.

diff --git a/patterns/nvidia-gpu-efa/eks.tf b/patterns/nvidia-gpu-efa/eks.tf
@@ -15,9 +15,10 @@ module "eks" {
   cluster_endpoint_public_access           = true
 
   cluster_addons = {
-    coredns    = {}
-    kube-proxy = {}
-    vpc-cni    = {}
+    coredns                = {}
+    eks-pod-identity-agent = {}
+    kube-proxy             = {}
+    vpc-cni                = {}
   }
 
   # Add security group rules on the node group security group to
@@ -35,8 +36,6 @@ module "eks" {
       instance_types = ["p5.48xlarge"]
 
       pre_bootstrap_user_data = <<-EOT
-        #!/usr/bin/env bash
-
         # Mount instance store volumes in RAID-0 for kubelet and containerd
         # https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
         /bin/setup-local-disks raid0
@@ -71,18 +70,6 @@ module "eks" {
     default = {
       instance_types = ["m5.large"]
 
-      # Default AMI has only 8GB of storage
-      block_device_mappings = {
-        xvda = {
-          device_name = "/dev/xvda"
-          ebs = {
-            volume_size           = 128
-            volume_type           = "gp3"
-            delete_on_termination = true
-          }
-        }
-      }
-
       min_size     = 1
       max_size     = 2
       desired_size = 2
@@ -91,51 +78,3 @@ module "eks" {
 
   tags = local.tags
 }
-
-################################################################################
-# Helm charts
-################################################################################
-
-resource "helm_release" "nvidia_device_plugin" {
-  name             = "nvidia-device-plugin"
-  repository       = "https://nvidia.github.io/k8s-device-plugin"
-  chart            = "nvidia-device-plugin"
-  version          = "0.14.5"
-  namespace        = "nvidia-device-plugin"
-  create_namespace = true
-  wait             = false
-
-  values = [
-    <<-EOT
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-            - matchExpressions:
-              - key: 'nvidia.com/gpu.present'
-                operator: In
-                values:
-                - 'true'
-    EOT
-  ]
-}
-
-resource "helm_release" "aws_efa_device_plugin" {
-  name       = "aws-efa-k8s-device-plugin"
-  repository = "https://aws.github.io/eks-charts"
-  chart      = "aws-efa-k8s-device-plugin"
-  version    = "v0.4.4"
-  namespace  = "kube-system"
-  wait       = false
-
-  values = [
-    <<-EOT
-      nodeSelector:
-        vpc.amazonaws.com/efa.present: 'true'
-      tolerations:
-        - key: nvidia.com/gpu
-          operator: Exists
-          effect: NoSchedule
-    EOT
-  ]
-}
diff --git a/patterns/nvidia-gpu-efa/helm.tf b/patterns/nvidia-gpu-efa/helm.tf
@@ -0,0 +1,47 @@
+################################################################################
+# Helm charts
+################################################################################
+
+resource "helm_release" "nvidia_device_plugin" {
+  name             = "nvidia-device-plugin"
+  repository       = "https://nvidia.github.io/k8s-device-plugin"
+  chart            = "nvidia-device-plugin"
+  version          = "0.14.5"
+  namespace        = "nvidia-device-plugin"
+  create_namespace = true
+  wait             = false
+
+  values = [
+    <<-EOT
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: 'nvidia.com/gpu.present'
+                operator: In
+                values:
+                - 'true'
+    EOT
+  ]
+}
+
+resource "helm_release" "aws_efa_device_plugin" {
+  name       = "aws-efa-k8s-device-plugin"
+  repository = "https://aws.github.io/eks-charts"
+  chart      = "aws-efa-k8s-device-plugin"
+  version    = "v0.4.4"
+  namespace  = "kube-system"
+  wait       = false
+
+  values = [
+    <<-EOT
+      nodeSelector:
+        vpc.amazonaws.com/efa.present: 'true'
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
+    EOT
+  ]
+}
diff --git a/patterns/nvidia-gpu-efa/main.tf b/patterns/nvidia-gpu-efa/main.tf
@@ -57,6 +57,15 @@ locals {
   }
 }
 
+################################################################################
+# Output
+################################################################################
+
+output "configure_kubectl" {
+  description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig"
+  value       = "aws eks --region ${local.region} update-kubeconfig --name ${module.eks.cluster_name}"
+}
+
 ################################################################################
 # Supporting Resources
 ################################################################################

diff --git a/patterns/targeted-odcr/README.md b/patterns/targeted-odcr/README.md
@@ -18,7 +18,7 @@ This pattern demonstrates how to consume/utilize on-demand capacity reservations
 
 ## Code
 
-```terraform hl_lines="34-51"
+```terraform hl_lines="5-8 81-88 108-131"
 {% include  "../../patterns/targeted-odcr/eks.tf" %}
 ```