NexusCS

eksctl

Kubernetes
Official CLI tool for Amazon EKS. Automates cluster creation, management, and operations using declarative YAML configs and CLI commands.
aws
eks
kubernetes
devops

Getting started

Introduction

eksctl is the official CLI for Amazon EKS, written in Go. It was originally created by Weaveworks and is now maintained by AWS under the eksctl-io GitHub organization. It simplifies creating and managing EKS clusters by automating complex operations across AWS services (CloudFormation, EC2, IAM, VPC).

Target users: DevOps engineers, platform teams, Kubernetes administrators.

Installation

macOS

# Homebrew
brew tap weaveworks/tap
brew install weaveworks/tap/eksctl

# Direct download
curl --silent --location "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp
sudo mv /tmp/eksctl /usr/local/bin

Linux

curl --silent --location "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp
sudo mv /tmp/eksctl /usr/local/bin

Windows

# Chocolatey
choco install eksctl

# Scoop
scoop install eksctl

Quick Example

# Create cluster with defaults
eksctl create cluster

# With custom name and region
eksctl create cluster \
  --name my-cluster \
  --region us-west-2 \
  --version 1.30

# From config file
eksctl create cluster -f cluster.yaml

Defaults: 1 managed nodegroup, 2 m5.large nodes. Kubeconfig is automatically updated after creation.

Core Commands

Cluster Management

# Create cluster
eksctl create cluster
eksctl create cluster -f cluster.yaml
eksctl create cluster \
  --name my-cluster \
  --region us-west-2 \
  --version 1.30
# List clusters
eksctl get cluster
eksctl get cluster --name my-cluster
eksctl get cluster -o yaml
# Delete cluster (always use --wait)
eksctl delete cluster -f cluster.yaml --wait
eksctl delete cluster --name my-cluster --wait

# Bypass PodDisruptionBudgets
eksctl delete cluster \
  -f cluster.yaml \
  --disable-nodegroup-eviction

Note: Always use the --wait flag so that deletion errors are reported. If the delete fails, you may need to delete the remaining CloudFormation stacks manually.

Cluster Upgrades

# Control plane upgrade (dry run)
eksctl upgrade cluster \
  --name my-cluster \
  --version 1.30

# Apply upgrade
eksctl upgrade cluster \
  --name my-cluster \
  --version 1.30 \
  --approve

# With config file
eksctl upgrade cluster \
  -f cluster.yaml \
  --approve

Version constraint: Can only upgrade one minor version at a time (same as EKS).

Node Group Operations

# Create node group
eksctl create nodegroup \
  --cluster my-cluster \
  --name my-nodes \
  --node-type t3.medium \
  --nodes 3 \
  --nodes-min 1 \
  --nodes-max 4
# Scale node group
eksctl scale nodegroup \
  --cluster my-cluster \
  --name my-nodes \
  --nodes 5
# List node groups
eksctl get nodegroup \
  --cluster my-cluster
# Delete node group
eksctl delete nodegroup \
  --cluster my-cluster \
  --name my-nodes
# Upgrade node group
eksctl upgrade nodegroup \
  --cluster my-cluster \
  --name my-nodes \
  --kubernetes-version 1.30

IAM Service Accounts (IRSA)

# Create IAM service account
eksctl create iamserviceaccount \
  --name SERVICE_ACCOUNT_NAME \
  --namespace NAMESPACE \
  --cluster CLUSTER_NAME \
  --role-name IAM_ROLE_NAME \
  --attach-policy-arn arn:aws:iam::aws:policy/POLICY_NAME \
  --approve
# Example: EBS CSI Driver
eksctl create iamserviceaccount \
  --name ebs-csi-controller-sa \
  --namespace kube-system \
  --cluster my-cluster \
  --role-name AmazonEKS_EBS_CSI_DriverRole \
  --attach-policy-arn arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy \
  --approve
# List IAM service accounts
eksctl get iamserviceaccount \
  --cluster my-cluster
# Delete IAM service account
eksctl delete iamserviceaccount \
  --name SERVICE_ACCOUNT_NAME \
  --namespace NAMESPACE \
  --cluster CLUSTER_NAME

Utils Commands

# Associate OIDC provider
eksctl utils associate-iam-oidc-provider \
  --cluster my-cluster \
  --approve
# Write kubeconfig
eksctl utils write-kubeconfig \
  --cluster my-cluster
# Update cluster logging
eksctl utils update-cluster-logging \
  --cluster my-cluster \
  --enable-types all \
  --approve
# View IAM identity mappings
eksctl get iamidentitymapping \
  --cluster my-cluster

Cluster Config File

Basic Structure

apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig

metadata:
  name: my-cluster
  region: us-west-2
  version: "1.30"

managedNodeGroups:
  - name: ng-1
    instanceType: t3.medium
    desiredCapacity: 2
    minSize: 1
    maxSize: 4

Cluster name validation:

  • Only alphanumeric and hyphens
  • Max 128 characters
  • Must start with letter

VPC Configuration

New VPC (default)

vpc:
  cidr: 10.0.0.0/16
  nat:
    gateway: HighlyAvailable
    # or Single

Existing VPC

vpc:
  id: vpc-xxxxx
  subnets:
    private:
      us-west-2a: { id: subnet-xxxxx }
      us-west-2b: { id: subnet-yyyyy }
    public:
      us-west-2a: { id: subnet-zzzzz }
      us-west-2b: { id: subnet-wwwww }

Private Networking

privateNetworking: true

Nodes will be placed in private subnets only.

Node Group Types

Managed Node Group

managedNodeGroups:
  - name: ng-managed
    instanceType: t3.medium
    desiredCapacity: 2
    minSize: 1
    maxSize: 4
    volumeSize: 80
    ssh:
      allow: true
      publicKeyPath: ~/.ssh/id_rsa.pub
    labels:
      role: worker
    tags:
      Environment: production

Self-Managed Node Group

nodeGroups:
  - name: ng-self-managed
    instanceType: t3.medium
    desiredCapacity: 2
    minSize: 1
    maxSize: 4

Default node type (v0.215.0+): Managed nodegroups. Use --managed=false for self-managed nodes.

Fargate Profile

fargateProfiles:
  - name: fp-default
    selectors:
      - namespace: default
      - namespace: kube-system

Spot Instances

Managed Node Group with Spot

managedNodeGroups:
  - name: ng-spot
    instanceTypes: ["t3.medium", "t3.large"]
    spot: true
    desiredCapacity: 2

Mixed Instances (On-Demand + Spot)

# instancesDistribution is only supported on self-managed nodeGroups
nodeGroups:
  - name: ng-mixed
    instancesDistribution:
      instanceTypes:
        - t3.medium
        - t3.large
        - t3a.medium
      onDemandBaseCapacity: 2
      onDemandPercentageAboveBaseCapacity: 0
      spotAllocationStrategy: capacity-optimized

Labels and Taints

managedNodeGroups:
  - name: ng-with-labels-taints
    instanceType: t3.medium
    desiredCapacity: 2
    labels:
      workload: batch
      team: data
    taints:
      - key: dedicated
        value: batch
        effect: NoSchedule

IAM Configuration

Node IAM Policies

managedNodeGroups:
  - name: ng-with-iam
    instanceType: t3.medium
    desiredCapacity: 2
    iam:
      attachPolicyARNs:
        - arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy
        - arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy
        - arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly
        - arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy

Required default policies when using custom attachPolicyARNs:

  • AmazonEKSWorkerNodePolicy
  • AmazonEKS_CNI_Policy
  • AmazonEC2ContainerRegistryReadOnly

Reuse Existing IAM Role

iam:
  instanceProfileARN: arn:aws:iam::ACCOUNT_ID:instance-profile/PROFILE_NAME
  instanceRoleARN: arn:aws:iam::ACCOUNT_ID:role/ROLE_NAME

IAM Service Accounts (IRSA)

iam:
  withOIDC: true
  serviceAccounts:
    - metadata:
        name: ebs-csi-controller-sa
        namespace: kube-system
      roleName: AmazonEKS_EBS_CSI_DriverRole
      attachPolicyARNs:
        - arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy

    - metadata:
        name: cluster-autoscaler
        namespace: kube-system
      wellKnownPolicies:
        autoScaler: true

Well-Known Addon Policies

Policy | Description
imageBuilder | Full ECR access for CI/CD
autoScaler | Cluster Autoscaler permissions
ebs | EBS CSI driver permissions
efs | EFS CSI driver permissions
fsx | FSx for Lustre CSI driver
certManager | Route 53 DNS01 challenge
awsLoadBalancerController | ALB/NLB controller
externalDNS | Route 53 external DNS
appMesh | AWS App Mesh permissions
xRay | AWS X-Ray permissions
cloudWatch | CloudWatch logging/metrics

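Use via wellKnownPolicies in service account config. Note that wellKnownPolicies accepts only imageBuilder, autoScaler, awsLoadBalancerController, externalDNS, certManager, ebsCSIController, and efsCSIController; the short keys listed above (ebs, efs, fsx, appMesh, xRay, cloudWatch) are nodegroup iam.withAddonPolicies settings:

wellKnownPolicies:
  autoScaler: true
  ebsCSIController: true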

Add-ons

Add-on Configuration

addons:
  - name: vpc-cni
    version: latest

  - name: coredns
    version: latest

  - name: kube-proxy
    version: latest

  - name: aws-ebs-csi-driver
    version: latest
    serviceAccountRoleARN: arn:aws:iam::ACCOUNT_ID:role/AmazonEKS_EBS_CSI_DriverRole

Add-on Management

# Update add-on
eksctl update addon \
  --name vpc-cni \
  --cluster my-cluster

# List add-ons
eksctl get addons --cluster my-cluster

Upgrade Workflow

3-Step Upgrade Process

  1. Control plane upgrade: eksctl upgrade cluster
  2. Node group upgrade: eksctl upgrade nodegroup
  3. Default add-ons upgrade: Automatic or manual via eksctl update addon

Control Plane

# Dry run (preview without --approve)
eksctl upgrade cluster \
  --name my-cluster \
  --version 1.30

# Apply upgrade
eksctl upgrade cluster \
  --name my-cluster \
  --version 1.30 \
  --approve

Node Groups

Rolling Update

eksctl upgrade nodegroup \
  --cluster my-cluster \
  --name my-nodes \
  --kubernetes-version 1.30

Blue-Green Strategy

# Create new node group
eksctl create nodegroup \
  --cluster my-cluster \
  --name my-nodes-130 \
  --version 1.30

# Drain and delete old node group
eksctl delete nodegroup \
  --cluster my-cluster \
  --name my-nodes \
  --drain

Add-ons

eksctl update addon \
  --name vpc-cni \
  --cluster my-cluster

Troubleshooting

Cluster Creation Failures

CloudFormation Stack Errors

# Check stack events
aws cloudformation describe-stack-events \
  --stack-name eksctl-CLUSTER_NAME-cluster

# Check stack resources
aws cloudformation describe-stack-resources \
  --stack-name eksctl-CLUSTER_NAME-cluster

IAM Permission Issues

  • Ensure AWS credentials have necessary permissions
  • Required: EKS, CloudFormation, EC2, IAM, VPC permissions
  • Check CloudTrail for denied API calls

VPC/Networking Issues

# Error: UnsupportedAvailabilityZoneException
# Solution: Explicitly specify zones
eksctl create cluster \
  --zones us-east-1a,us-east-1b

Common issues:

  • Insufficient IPs (minimum 8 per subnet recommended)
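  • Availability zones that cannot host the EKS control plane in regions like us-east-1 (raises UnsupportedAvailabilityZoneException; pass --zones explicitly)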

Node Group Scaling Issues

# Check node group status
eksctl get nodegroup --cluster my-cluster

# Check Auto Scaling Group
aws autoscaling describe-auto-scaling-groups \
  --query 'AutoScalingGroups[?Tags[?Key==`eks:cluster-name`&&Value==`my-cluster`]]'

Deletion Stuck/Failed

PodDisruptionBudgets Blocking

# Check PDBs
kubectl get pdb -A

# Delete cluster bypassing PDBs
eksctl delete cluster \
  -f cluster.yaml \
  --disable-nodegroup-eviction

CloudFormation Stack Stuck

  • Check stack events for specific errors
  • Delete resources manually if needed
  • Delete stack from AWS Console or CLI

IRSA Issues

Service Account Not Created

# Check if OIDC provider exists
aws iam list-open-id-connect-providers

# Associate OIDC provider
eksctl utils associate-iam-oidc-provider \
  --cluster my-cluster \
  --approve

# Recreate service account
eksctl create iamserviceaccount ...

Pod Cannot Assume Role

# Check service account annotations
kubectl describe sa SERVICE_ACCOUNT_NAME -n NAMESPACE
# Should see: eks.amazonaws.com/role-arn

# Check pod environment variables
kubectl exec POD_NAME -n NAMESPACE -- env | grep AWS
# Should see: AWS_ROLE_ARN and AWS_WEB_IDENTITY_TOKEN_FILE

Best Practices

Config File Management

  • Store cluster.yaml in version control
  • Use descriptive cluster and node group names
  • Document configuration choices
  • Parameterize with environment variables or templating

Cluster Creation

  • Always specify Kubernetes version explicitly
  • Use managed node groups for easier lifecycle management
  • Enable control plane logging at creation
  • Configure VPC with public and private subnets
  • Tag resources appropriately

Node Groups

  • Use multiple instance types for spot node groups
  • Set appropriate min/max/desired capacity
  • Use labels and taints for workload scheduling
  • Enable SSH access only when needed
  • Use private networking when possible

IAM

  • Use IRSA for Pod IAM permissions (not node IAM roles)
  • Follow least privilege principle
  • Use well-known addon policies when available
  • Document custom IAM policies

Operations

  • Use --wait flag with delete operations
  • Test configurations in non-production first
  • Review CloudFormation stacks periodically
  • Keep eksctl updated
  • Use --dry-run or omit --approve to preview changes

Security

  • Restrict API endpoint access
  • Enable secrets encryption with KMS
  • Use private subnets for worker nodes
  • Enable control plane logging
  • Regularly rotate credentials

Comparison

eksctl vs AWS Console vs Terraform

Feature | eksctl | AWS Console | Terraform
Speed | Fast automation | Manual clicks | Moderate
Reproducibility | High (YAML configs) | Low | High (HCL code)
Version Control | Yes | No | Yes
EKS-Specific | Yes (dedicated) | Yes | No (multi-service)
Learning Curve | Low | Low | Moderate
Multi-Cloud | No | No | Yes
CI/CD Integration | Easy | Difficult | Easy
Complex Infrastructure | Limited | Moderate | Excellent

When to Use

eksctl:

  • Quick EKS cluster creation
  • EKS-focused workflows
  • CI/CD automation
  • Learning EKS

AWS Console:

  • Visual interface preference
  • One-off operations
  • Beginners

Terraform:

  • Multi-service AWS infrastructure
  • Multi-cloud deployments
  • Complex networking requirements
  • Organizational IaC standards

Examples

Production-Ready Cluster

apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig

metadata:
  name: prod-cluster
  region: us-west-2
  version: "1.30"

vpc:
  cidr: 10.0.0.0/16
  nat:
    gateway: HighlyAvailable

iam:
  withOIDC: true

cloudWatch:
  clusterLogging:
    enableTypes: ["all"]

managedNodeGroups:
  - name: ng-on-demand
    instanceType: t3.large
    desiredCapacity: 3
    minSize: 2
    maxSize: 6
    privateNetworking: true
    volumeSize: 100
    labels:
      role: worker
      capacity-type: on-demand
    tags:
      Environment: production
      ManagedBy: eksctl

  - name: ng-spot
    instanceTypes:
      - t3.large
      - t3a.large
      - t3.xlarge
    spot: true
    desiredCapacity: 2
    minSize: 0
    maxSize: 10
    privateNetworking: true
    labels:
      role: worker
      capacity-type: spot
    taints:
      - key: spot
        value: "true"
        effect: NoSchedule

IRSA for AWS Load Balancer Controller

# Create IAM service account
eksctl create iamserviceaccount \
  --cluster prod-cluster \
  --namespace kube-system \
  --name aws-load-balancer-controller \
  --role-name AmazonEKSLoadBalancerControllerRole \
  --attach-policy-arn arn:aws:iam::ACCOUNT_ID:policy/AWSLoadBalancerControllerIAMPolicy \
  --approve

# Install controller via Helm
helm repo add eks https://aws.github.io/eks-charts
helm install aws-load-balancer-controller \
  eks/aws-load-balancer-controller \
  -n kube-system \
  --set clusterName=prod-cluster \
  --set serviceAccount.create=false \
  --set serviceAccount.name=aws-load-balancer-controller

Multi-Region DR Setup

# Primary region
eksctl create cluster -f cluster-us-west-2.yaml

# DR region
eksctl create cluster -f cluster-us-east-1.yaml

# Configure kubectl contexts
kubectl config use-context primary@prod-cluster.us-west-2
kubectl config use-context dr@prod-cluster.us-east-1

# Verify both clusters
eksctl get cluster --region us-west-2
eksctl get cluster --region us-east-1

IRSA with Well-Known Policies

apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig

metadata:
  name: my-cluster
  region: us-west-2

iam:
  withOIDC: true
  serviceAccounts:
    - metadata:
        name: cluster-autoscaler
        namespace: kube-system
      wellKnownPolicies:
        autoScaler: true

    - metadata:
        name: aws-load-balancer-controller
        namespace: kube-system
      wellKnownPolicies:
        awsLoadBalancerController: true

    - metadata:
        name: external-dns
        namespace: kube-system
      wellKnownPolicies:
        externalDNS: true

Spot Instance Cost Optimization

managedNodeGroups:
  # On-demand baseline for critical workloads
  - name: on-demand-baseline
    instanceType: t3.medium
    desiredCapacity: 2
    minSize: 2
    maxSize: 4
    labels:
      workload: critical

  # Spot instances for batch processing
  - name: spot-batch
    instanceTypes:
      - t3.medium
      - t3a.medium
      - t3.large
      - t3a.large
    spot: true
    desiredCapacity: 5
    minSize: 0
    maxSize: 20
    labels:
      workload: batch
    taints:
      - key: spot
        value: "true"
        effect: NoSchedule
    tags:
      k8s.io/cluster-autoscaler/node-template/label/workload: batch

Also see

eksctl Cheatsheet - NexusCS