본문 바로가기

study/AEWS 2기

AEWS 2기 5주차 두번째

6. Karpenter : K8S Native AutoScaler

소개 : 오픈소스 노드 수명 주기 관리 솔루션, 몇 초 만에 컴퓨팅 리소스 제공 - https://ec2spotworkshops.com/karpenter.html

 

Getting Started with Karpenter 실습 : Docs Intro : 복잡도를 줄이기 위해서 신규 EKS(myeks2) 환경에서 실습 진행 - TeamGithub Chart

# YAML 파일 다운로드
curl -O https://s3.ap-northeast-2.amazonaws.com/cloudformation.cloudneta.net/K8S/karpenter-preconfig.yaml

# CloudFormation 스택 배포
예시) aws cloudformation deploy --template-file karpenter-preconfig.yaml --stack-name myeks2 --parameter-overrides KeyName=kp-gasida SgIngressSshCidr=$(curl -s ipinfo.io/ip)/32  MyIamUserAccessKeyID=AKIA5... MyIamUserSecretAccessKey='CVNa2...' ClusterBaseName=myeks2 --region ap-northeast-2

# CloudFormation 스택 배포 완료 후 작업용 EC2 IP 출력
aws cloudformation describe-stacks --stack-name myeks2 --query 'Stacks[*].Outputs[0].OutputValue' --output text

# 작업용 EC2 SSH 접속
ssh -i ~/.ssh/kp-gasida.pem ec2-user@$(aws cloudformation describe-stacks --stack-name myeks2 --query 'Stacks[*].Outputs[0].OutputValue' --output text)

 

사전 확인 & eks-node-viewer 설치

# IP 주소 확인 : 172.30.0.0/16 VPC 대역에서 172.30.1.0/24 대역을 사용 중
ip -br -c addr

# EKS Node Viewer 설치 : 현재 ec2 spec에서는 설치에 다소 시간이 소요됨 = 2분 이상
wget https://go.dev/dl/go1.22.1.linux-amd64.tar.gz
tar -C /usr/local -xzf go1.22.1.linux-amd64.tar.gz
export PATH=$PATH:/usr/local/go/bin
go install github.com/awslabs/eks-node-viewer/cmd/eks-node-viewer@latest

# [터미널1] bin 확인
cd ~/go/bin && ./eks-node-viewer -h

# EKS 배포 완료 후 실행 하자
cd ~/go/bin && ./eks-node-viewer --resources cpu,memory

 

EKS 배포 - Link

# 변수 정보 확인
export | egrep 'ACCOUNT|AWS_' | egrep -v 'SECRET|KEY'

# 변수 설정
export KARPENTER_NAMESPACE="kube-system"
export K8S_VERSION="1.29"
export KARPENTER_VERSION="0.35.2"
export TEMPOUT=$(mktemp)
export ARM_AMI_ID="$(aws ssm get-parameter --name /aws/service/eks/optimized-ami/${K8S_VERSION}/amazon-linux-2-arm64/recommended/image_id --query Parameter.Value --output text)"
export AMD_AMI_ID="$(aws ssm get-parameter --name /aws/service/eks/optimized-ami/${K8S_VERSION}/amazon-linux-2/recommended/image_id --query Parameter.Value --output text)"
export GPU_AMI_ID="$(aws ssm get-parameter --name /aws/service/eks/optimized-ami/${K8S_VERSION}/amazon-linux-2-gpu/recommended/image_id --query Parameter.Value --output text)"
export AWS_PARTITION="aws"
export CLUSTER_NAME="${USER}-karpenter-demo"
echo "export CLUSTER_NAME=$CLUSTER_NAME" >> /etc/profile
echo $KARPENTER_VERSION $CLUSTER_NAME $AWS_DEFAULT_REGION $AWS_ACCOUNT_ID $TEMPOUT $ARM_AMI_ID $AMD_AMI_ID $GPU_AMI_ID

# CloudFormation 스택으로 IAM Policy, Role(KarpenterNodeRole-myeks2) 생성 : 3분 정도 소요
curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml  > "${TEMPOUT}" \
&& aws cloudformation deploy \
  --stack-name "Karpenter-${CLUSTER_NAME}" \
  --template-file "${TEMPOUT}" \
  --capabilities CAPABILITY_NAMED_IAM \
  --parameter-overrides "ClusterName=${CLUSTER_NAME}"

# 클러스터 생성 : myeks2 EKS 클러스터 생성 19분 정도 소요
eksctl create cluster -f - <<EOF
---
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig
metadata:
  name: ${CLUSTER_NAME}
  region: ${AWS_DEFAULT_REGION}
  version: "${K8S_VERSION}"
  tags:
    karpenter.sh/discovery: ${CLUSTER_NAME}

iam:
  withOIDC: true
  serviceAccounts:
  - metadata:
      name: karpenter
      namespace: "${KARPENTER_NAMESPACE}"
    roleName: ${CLUSTER_NAME}-karpenter
    attachPolicyARNs:
    - arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:policy/KarpenterControllerPolicy-${CLUSTER_NAME}
    roleOnly: true

iamIdentityMappings:
- arn: "arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/KarpenterNodeRole-${CLUSTER_NAME}"
  username: system:node:{{EC2PrivateDNSName}}
  groups:
  - system:bootstrappers
  - system:nodes

managedNodeGroups:
- instanceType: m5.large
  amiFamily: AmazonLinux2
  name: ${CLUSTER_NAME}-ng
  desiredCapacity: 2
  minSize: 1
  maxSize: 10
  iam:
    withAddonPolicies:
      externalDNS: true
EOF

# eks 배포 확인
eksctl get cluster
eksctl get nodegroup --cluster $CLUSTER_NAME
eksctl get iamidentitymapping --cluster $CLUSTER_NAME
eksctl get iamserviceaccount --cluster $CLUSTER_NAME
eksctl get addon --cluster $CLUSTER_NAME

# default 네임스페이스 적용
kubectl ns default


# 노드 정보 확인
kubectl get node --label-columns=node.kubernetes.io/instance-type,eks.amazonaws.com/capacityType,topology.kubernetes.io/zone

# ExternalDNS
MyDomain=<자신의 도메인>
echo "export MyDomain=<자신의 도메인>" >> /etc/profile
MyDomain=gasida.link
echo "export MyDomain=gasida.link" >> /etc/profile
MyDnzHostedZoneId=$(aws route53 list-hosted-zones-by-name --dns-name "${MyDomain}." --query "HostedZones[0].Id" --output text)
echo $MyDomain, $MyDnzHostedZoneId
curl -s -O https://raw.githubusercontent.com/gasida/PKOS/main/aews/externaldns.yaml
MyDomain=$MyDomain MyDnzHostedZoneId=$MyDnzHostedZoneId envsubst < externaldns.yaml | kubectl apply -f -

# kube-ops-view
helm repo add geek-cookbook https://geek-cookbook.github.io/charts/
helm install kube-ops-view geek-cookbook/kube-ops-view --version 1.2.2 --set env.TZ="Asia/Seoul" --namespace kube-system
kubectl patch svc -n kube-system kube-ops-view -p '{"spec":{"type":"LoadBalancer"}}'
kubectl annotate service kube-ops-view -n kube-system "external-dns.alpha.kubernetes.io/hostname=kubeopsview.$MyDomain"
echo -e "Kube Ops View URL = http://kubeopsview.$MyDomain:8080/#scale=1.5"


# [터미널1] eks-node-viewer
cd ~/go/bin && ./eks-node-viewer --resources cpu,memory

# k8s 확인
kubectl cluster-info
kubectl get node --label-columns=node.kubernetes.io/instance-type,eks.amazonaws.com/capacityType,topology.kubernetes.io/zone
kubectl get pod -n kube-system -owide
kubectl describe cm -n kube-system aws-auth

# Karpenter 설치를 위한 변수 설정 및 확인
export CLUSTER_ENDPOINT="$(aws eks describe-cluster --name "${CLUSTER_NAME}" --query "cluster.endpoint" --output text)"
export KARPENTER_IAM_ROLE_ARN="arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-karpenter"
echo "${CLUSTER_ENDPOINT} ${KARPENTER_IAM_ROLE_ARN}"

# EC2 Spot Fleet의 service-linked-role 생성 확인 : 만들어있는것을 확인하는 거라 아래 에러 출력이 정상!
# If the role has already been successfully created, you will see:
# An error occurred (InvalidInput) when calling the CreateServiceLinkedRole operation: Service role name AWSServiceRoleForEC2Spot has been taken in this account, please try a different suffix.
aws iam create-service-linked-role --aws-service-name spot.amazonaws.com || true

# docker logout : Logout of docker to perform an unauthenticated pull against the public ECR
docker logout public.ecr.aws

# helm registry logout
helm registry logout public.ecr.aws

# karpenter 설치
helm install karpenter oci://public.ecr.aws/karpenter/karpenter --version "${KARPENTER_VERSION}" --namespace "${KARPENTER_NAMESPACE}" --create-namespace \
  --set "serviceAccount.annotations.eks\.amazonaws\.com/role-arn=${KARPENTER_IAM_ROLE_ARN}" \
  --set "settings.clusterName=${CLUSTER_NAME}" \
  --set "settings.interruptionQueue=${CLUSTER_NAME}" \
  --set controller.resources.requests.cpu=1 \
  --set controller.resources.requests.memory=1Gi \
  --set controller.resources.limits.cpu=1 \
  --set controller.resources.limits.memory=1Gi \
  --wait
 
# 확인
kubectl get-all -n $KARPENTER_NAMESPACE
kubectl get all -n $KARPENTER_NAMESPACE
kubectl get crd | grep karpenter

# APi 변경
v1alpha5/Provisioner → v1beta1/NodePool
v1alpha1/AWSNodeTemplate → v1beta1/EC2NodeClass 
v1alpha5/Machine → v1beta1/NodeClaim

 

 

  • Create NodePool(구 Provisioner) - Link Workshop
    • 관리 리소스는 securityGroupSelector and subnetSelector로 찾음
    • consolidationPolicy : 미사용 노드 정리 정책, 데몬셋 제외
cat <<EOF | envsubst | kubectl apply -f -
apiVersion: karpenter.sh/v1beta1
kind: NodePool
metadata:
  name: default
spec:
  template:
    spec:
      requirements:
        - key: kubernetes.io/arch
          operator: In
          values: ["amd64"]
        - key: kubernetes.io/os
          operator: In
          values: ["linux"]
        - key: karpenter.sh/capacity-type
          operator: In
          values: ["spot"]
        - key: karpenter.k8s.aws/instance-category
          operator: In
          values: ["c", "m", "r"]
        - key: karpenter.k8s.aws/instance-generation
          operator: Gt
          values: ["2"]
      nodeClassRef:
        apiVersion: karpenter.k8s.aws/v1beta1
        kind: EC2NodeClass
        name: default
  limits:
    cpu: 1000
  disruption:
    consolidationPolicy: WhenUnderutilized
    expireAfter: 720h # 30 * 24h = 720h
---
apiVersion: karpenter.k8s.aws/v1beta1
kind: EC2NodeClass
metadata:
  name: default
spec:
  amiFamily: AL2 # Amazon Linux 2
  role: "KarpenterNodeRole-${CLUSTER_NAME}" # replace with your cluster name
  subnetSelectorTerms:
    - tags:
        karpenter.sh/discovery: "${CLUSTER_NAME}" # replace with your cluster name
  securityGroupSelectorTerms:
    - tags:
        karpenter.sh/discovery: "${CLUSTER_NAME}" # replace with your cluster name
  amiSelectorTerms:
    - id: "${ARM_AMI_ID}"
    - id: "${AMD_AMI_ID}"
#   - id: "${GPU_AMI_ID}" # <- GPU Optimized AMD AMI 
#   - name: "amazon-eks-node-${K8S_VERSION}-*" # <- automatically upgrade when a new AL2 EKS Optimized AMI is released. This is unsafe for production workloads. Validate AMIs in lower environments before deploying them to production.
EOF

# 확인
kubectl get nodepool,ec2nodeclass

 

 

Scale up deployment - Link Workshop

 

Automatic Node Provisioning | EKS Workshop

We'll start putting Karpenter to work by examining how it can dynamically provision appropriately sized EC2 instances depending on the needs of Pods that cannot be scheduled at any given time. This can reduce the amount of unused compute resources in an EK

www.eksworkshop.com

# pause 파드 1개에 CPU 1개 최소 보장 할당
cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
  name: inflate
spec:
  replicas: 0
  selector:
    matchLabels:
      app: inflate
  template:
    metadata:
      labels:
        app: inflate
    spec:
      terminationGracePeriodSeconds: 0
      containers:
        - name: inflate
          image: public.ecr.aws/eks-distro/kubernetes/pause:3.7
          resources:
            requests:
              cpu: 1
EOF

# Scale up
kubectl get pod
kubectl scale deployment inflate --replicas 5
kubectl logs -f -n "${KARPENTER_NAMESPACE}" -l app.kubernetes.io/name=karpenter -c controller
kubectl logs -f -n "${KARPENTER_NAMESPACE}" -l app.kubernetes.io/name=karpenter -c controller | jq '.'

 

노드가 2개->3개로 증가

 

Scale down deployment

# Now, delete the deployment. After a short amount of time, Karpenter should terminate the empty nodes due to consolidation.
kubectl delete deployment inflate && date
kubectl logs -f -n "${KARPENTER_NAMESPACE}" -l app.kubernetes.io/name=karpenter -c controller

 

노드가 감소

 

 

 

Disruption (구 Consolidation) : Expiration , Drift , Consolidation - Workshop Docs 링크 Work Spot-to-Spot

 

Applying Spot-to-Spot consolidation best practices with Karpenter | Amazon Web Services

This post is written by Robert Northard – AWS Container Specialist Solutions Architect, and Carlos Manzanedo Rueda – AWS WW SA Leader for Efficient Compute Karpenter is an open source node lifecycle management project built for Kubernetes. In this post

aws.amazon.com

 

  • Expiration 만료 : 기본 720시간(30일) 후 인스턴스를 자동으로 만료하여 강제로 노드를 최신 상태로 유지
  • Drift 드리프트 : 구성 변경 사항(NodePool, EC2NodeClass)를 감지하여 필요한 변경 사항을 적용
  • Consolidation 통합 : 비용 효율적인 컴퓨팅 최적화
  • 스팟 인스턴스 시작 시 Karpenter는 AWS EC2 Fleet Instance API를 호출하여 NodePool 구성 기반으로 선택한 인스턴스 유형을 전달.
  • AWS EC2 Fleet Instance API는 시작된 인스턴스 목록과 시작할 수 없는 인스턴스 목록을 즉시 반환하는 API로, 시작할 수 없을 경우 Karpenter는 대체 용량을 요청하거나 워크로드에 대한 soft 일정 제약 조건을 제거할 수 있음
  • Spot-to-Spot Consolidation 에는 주문형 통합과 다른 접근 방식이 필요했습니다. 온디맨드 통합의 경우 규모 조정 및 최저 가격이 주요 지표로 사용됩니다.
  • 스팟 간 통합이 이루어지려면 Karpenter에는 최소 15개의 인스턴스 유형이 포함된 다양한 인스턴스 구성(연습에 정의된 NodePool 예제 참조)이 필요합니다. 이러한 제약 조건이 없으면 Karpenter가 가용성이 낮고 중단 빈도가 높은 인스턴스를 선택할 위험이 있습니다.
# 기존 nodepool 삭제
kubectl delete nodepool,ec2nodeclass default

# v0.34.0 부터 featureGates 에 spotToSpotConsolidation 활성화로 사용 가능
helm upgrade karpenter -n kube-system oci://public.ecr.aws/karpenter/karpenter --reuse-values --set settings.featureGates.spotToSpotConsolidation=true

# Create a Karpenter NodePool and EC2NodeClass
cat <<EOF > nodepool.yaml
apiVersion: karpenter.sh/v1beta1
kind: NodePool
metadata:
  name: default
spec:
  template:
    metadata:
      labels:
        intent: apps
    spec:
      nodeClassRef:
        name: default
      requirements:
        - key: karpenter.sh/capacity-type
          operator: In
          values: ["spot"]
        - key: karpenter.k8s.aws/instance-category
          operator: In
          values: ["c","m","r"]
        - key: karpenter.k8s.aws/instance-size
          operator: NotIn
          values: ["nano","micro","small","medium"]
        - key: karpenter.k8s.aws/instance-hypervisor
          operator: In
          values: ["nitro"]
  limits:
    cpu: 100
    memory: 100Gi
  disruption:
    consolidationPolicy: WhenUnderutilized
---
apiVersion: karpenter.k8s.aws/v1beta1
kind: EC2NodeClass
metadata:
  name: default
spec:
  amiFamily: Bottlerocket
  subnetSelectorTerms:          
    - tags:
        karpenter.sh/discovery: "${CLUSTER_NAME}" # replace with your cluster name
  securityGroupSelectorTerms:
    - tags:
        karpenter.sh/discovery: "${CLUSTER_NAME}" # replace with your cluster name
  role: "KarpenterNodeRole-${CLUSTER_NAME}" # replace with your cluster name
  tags:
    Name: karpenter.sh/nodepool/default
    IntentLabel: "apps"
EOF
kubectl apply -f nodepool.yaml

# Deploy a sample workload
cat <<EOF > inflate.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: inflate
spec:
  replicas: 5
  selector:
    matchLabels:
      app: inflate
  template:
    metadata:
      labels:
        app: inflate
    spec:
      nodeSelector:
        intent: apps
      containers:
        - name: inflate
          image: public.ecr.aws/eks-distro/kubernetes/pause:3.2
          resources:
            requests:
              cpu: 1
              memory: 1.5Gi
EOF
kubectl apply -f inflate.yaml

#
watch -d "kubectl get nodes -L karpenter.sh/nodepool -L node.kubernetes.io/instance-type -L topology.kubernetes.io/zone -L karpenter.sh/capacity-type"
kubectl get nodes -L karpenter.sh/nodepool -L node.kubernetes.io/instance-type -L topology.kubernetes.io/zone -L karpenter.sh/capacity-type

# Scale in a sample workload to observe consolidation
# To invoke a Karpenter consolidation event scale, inflate the deployment to 1. Run the following command:
kubectl scale --replicas=1 deployment/inflate
kubectl -n kube-system logs -l app.kubernetes.io/name=karpenter --all-containers=true -f --tail=20
kubectl get nodes -L karpenter.sh/nodepool -L node.kubernetes.io/instance-type -L topology.kubernetes.io/zone -L karpenter.sh/capacity-type
kubectl get node --label-columns=eks.amazonaws.com/capacityType,karpenter.sh/capacity-type
kubectl get node --label-columns=node.kubernetes.io/instance-type,topology.kubernetes.io/zone

# Use kubectl get nodeclaims to list all objects of type NodeClaim and then describe the NodeClaim Kubernetes resource
# using kubectl get nodeclaim/<claim-name> -o yaml. 
# In the NodeClaim .spec.requirements, you can also see the 15 instance types passed to the Amazon EC2 Fleet API:
kubectl get nodeclaims 
NAME            TYPE        ZONE              NODE                                                READY   AGE
default-w52c4   c6g.large   ap-northeast-2d   ip-192-168-77-172.ap-northeast-2.compute.internal   True    3m8s

kubectl get nodeclaims -o yaml | kubectl neat | yh
....
  spec: 
    nodeClassRef: 
      name: default
    requirements: 
    - key: karpenter.sh/nodepool
      operator: In
      values: 
      - default
    - key: node.kubernetes.io/instance-type
      operator: In
      values: 
      - c6g.large
      - c6gd.large
      - c6gn.large
      - c6id.large
      - c6in.large
      - c7g.large
      - c7i.large
      - m6g.large
      - m6gd.large
      - m6i.large
      - m7g.large
      - m7i-flex.large
      - m7i.large
      - r6g.large
      - r7g.large

# 삭제
kubectl delete deployment inflate
kubectl delete nodepool,ec2nodeclass default

 

확인

 

 

레플리카스 조정 후 적절한 스펙으로 최적화

 

실습 리소스 삭제

# Karpenter IAM Role 생성한 CloudFormation 삭제
aws cloudformation delete-stack --stack-name "Karpenter-${CLUSTER_NAME}"

# EC2 Launch Template 삭제
aws ec2 describe-launch-templates --filters "Name=tag:karpenter.k8s.aws/cluster,Values=${CLUSTER_NAME}" |
    jq -r ".LaunchTemplates[].LaunchTemplateName" |
    xargs -I{} aws ec2 delete-launch-template --launch-template-name {}

# 클러스터 삭제
eksctl delete cluster --name "${CLUSTER_NAME}"

# 위 삭제 완료 후 아래 삭제
aws cloudformation delete-stack --stack-name myeks2

 

'study > AEWS 2기' 카테고리의 다른 글

AEWS 2기 6주차 두번째  (0) 2024.04.10
AEWS 2기 6주차 첫번째  (0) 2024.04.10
AEWS 2기 5주차 첫번째  (0) 2024.04.05
AEWS 2기 4주차 세번째  (0) 2024.03.30
AEWS 2기 4주차 두번째  (0) 2024.03.30