100 Nodes Scale Test #187

name: 100 Nodes Scale Test

on:
  schedule:
    - cron: '39 0 * * 1-5'

permissions:
  # To be able to access the repository with actions/checkout
  contents: read
  # To be able to request the JWT from GitHub's OIDC provider
  id-token: write

concurrency:
  # Structure:
  # - Workflow name
  # - Event type
  # - A unique identifier depending on event type:
  #   - schedule: SHA
  #   - workflow_dispatch: PR number
  #
  # This structure ensures a unique concurrency group name is generated for each
  # type of testing, such that re-runs will cancel the previous run.
  group: |
    ${{ github.workflow }}
    ${{ github.event_name }}
    ${{
      (github.event_name == 'schedule' && github.sha) ||
      (github.event_name == 'workflow_dispatch' && github.event.inputs.PR-number)
    }}
  cancel-in-progress: true
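  # Illustration (made-up commit SHA, not evaluated by Actions): a scheduled run
  # yields a group combining the workflow name, the event name, and the commit SHA,
  # roughly "100 Nodes Scale Test / schedule / abc1234", so re-running the workflow
  # for the same commit cancels the in-flight run.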

env:
  # renovate: datasource=golang-version depName=go
  go_version: 1.22.2
  # Adding k8s.local to the end makes kops happy, as kops
  # has stricter DNS naming requirements.
  test_name: scale-100
  cluster_base_name: ${{ github.run_id }}-${{ github.run_attempt }}.k8s.local
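  # Illustration (made-up run ID): run 9000000000, attempt 1 gives
  # cluster_base_name "9000000000-1.k8s.local"; the job below prefixes it with
  # test_name to form the kops cluster name "scale-100-9000000000-1.k8s.local".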

jobs:
  install-and-scaletest:
    runs-on: ubuntu-latest
    name: Install and Scale Test
    timeout-minutes: 150
    steps:
      - name: Checkout context ref (trusted)
        uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4.1.4
        with:
          ref: ${{ inputs.context-ref || github.sha }}
          persist-credentials: false
      - name: Set Environment Variables
        uses: ./.github/actions/set-env-variables
      - name: Get Cilium's default values
        id: default_vars
        uses: ./.github/actions/helm-default
        with:
          image-tag: ${{ github.sha }}
      - name: Set up job variables
        id: vars
        run: |
          SHA="${{ github.sha }}"
          # Setup Cilium install options
          CILIUM_INSTALL_DEFAULTS="${{ steps.default_vars.outputs.cilium_install_defaults }} \
            --set pprof.enabled=true \
            --wait=false"
          CLUSTER_NAME="${{ env.test_name }}-${{ env.cluster_base_name }}"
          echo SHA=${SHA} >> $GITHUB_OUTPUT
          echo cilium_install_defaults=${CILIUM_INSTALL_DEFAULTS} >> $GITHUB_OUTPUT
          echo CLUSTER_NAME=${CLUSTER_NAME} >> $GITHUB_OUTPUT
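          # Illustration (made-up values): $GITHUB_OUTPUT now contains lines such as
          #   SHA=0123abcd...
          #   cilium_install_defaults=<helm-default output> --set pprof.enabled=true --wait=false
          #   CLUSTER_NAME=scale-100-9000000000-1.k8s.local
          # which later steps consume through steps.vars.outputs.*.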
      - name: Wait for images to be available
        timeout-minutes: 30
        shell: bash
        run: |
          for image in cilium-ci operator-generic-ci hubble-relay-ci ; do
            until docker manifest inspect quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/$image:${{ steps.vars.outputs.SHA }} &> /dev/null; do sleep 45s; done
          done
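          # Each loop iteration polls quay.io every 45s until the manifest for this
          # commit's CI image exists; if any image never shows up, the step's
          # 30-minute timeout fails the job.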
      - name: Install Go
        uses: actions/setup-go@0c52d547c9bc32b1aa3301fd7a9cb496313a4491 # v5.0.0
        with:
          go-version: ${{ env.go_version }}
      - name: Install Cilium CLI
        uses: cilium/cilium-cli@bfa4f93a41da410a1b4c9373c1694ca6d5c35ade # v0.16.6
        with:
          repository: ${{ env.CILIUM_CLI_RELEASE_REPO }}
          release-version: ${{ env.CILIUM_CLI_VERSION }}
      - name: Install Kops
        uses: cilium/scale-tests-action/install-kops@238d773bd07754bfd693a6b22c94eddf3a12778d # main
      - name: Setup gcloud credentials
        uses: google-github-actions/auth@55bd3a7c6e2ae7cf1877fd1ccb9d54c0503c457c # v2.1.2
        with:
          workload_identity_provider: ${{ secrets.GCP_PERF_WORKLOAD_IDENTITY_PROVIDER }}
          service_account: ${{ secrets.GCP_PERF_SA }}
          create_credentials_file: true
          export_environment_variables: true
      - name: Setup gcloud CLI
        uses: google-github-actions/setup-gcloud@98ddc00a17442e89a24bbf282954a3b65ce6d200 # v2.1.0
        with:
          project_id: ${{ secrets.GCP_PERF_PROJECT_ID }}
          version: "405.0.0"
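      # Note: the keyless authentication above relies on the id-token: write
      # permission declared at the top of the workflow; the job's GitHub OIDC token
      # is exchanged for short-lived GCP credentials via workload identity federation.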
      - name: Clone ClusterLoader2
        uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4.1.4
        with:
          repository: kubernetes/perf-tests
          # Avoid using renovate to update this dependency because: (1)
          # perf-tests does not tag or release, so renovate will pull
          # all updates to the default branch and (2) continually
          # updating CL2 may impact the stability of the scale test
          # results.
          ref: 920c39ef245a81bd8fb39d7fecf39eb35820d9ef
          persist-credentials: false
          sparse-checkout: clusterloader2
          path: perf-tests
      - name: Display version info of installed tools
        run: |
          echo "--- go ---"
          go version
          echo "--- cilium-cli ---"
          cilium version --client
          echo "--- kops ---"
          ./kops version
          echo "--- gcloud ---"
          gcloud version
      - name: Deploy cluster
        id: deploy-cluster
        uses: cilium/scale-tests-action/create-cluster@238d773bd07754bfd693a6b22c94eddf3a12778d # main
        timeout-minutes: 30
        with:
          cluster_name: ${{ steps.vars.outputs.cluster_name }}
          control_plane_size: n1-standard-8
          control_plane_count: 1
          node_size: e2-medium
          node_count: 100
          kops_state: ${{ secrets.GCP_PERF_KOPS_STATE_STORE }}
          project_id: ${{ secrets.GCP_PERF_PROJECT_ID }}
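      # The cluster is sized for the test itself: one n1-standard-8 control-plane
      # node plus 100 e2-medium workers, matching the --nodes=100 flag passed to
      # ClusterLoader2 further below.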
      - name: Create Instance Group for resource heavy deployments
        uses: cilium/scale-tests-action/create-instance-group@238d773bd07754bfd693a6b22c94eddf3a12778d # main
        timeout-minutes: 30
        with:
          cluster_name: ${{ steps.vars.outputs.cluster_name }}
          node_size: e2-standard-8
          node_count: 1
          ig_name: heapster
          kops_state: ${{ secrets.GCP_PERF_KOPS_STATE_STORE }}
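      # A single, larger e2-standard-8 node is added so that resource-heavy test
      # components (presumably including the Prometheus server ClusterLoader2
      # deploys below) do not compete for resources with the 100 e2-medium nodes
      # under test.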
      - name: Setup firewall rules
        uses: cilium/scale-tests-action/setup-firewall@238d773bd07754bfd693a6b22c94eddf3a12778d # main
        with:
          cluster_name: ${{ steps.vars.outputs.cluster_name }}
      - name: Install Cilium
        run: |
          cilium install ${{ steps.vars.outputs.cilium_install_defaults }}
      - name: Wait for cluster to be ready
        uses: cilium/scale-tests-action/validate-cluster@238d773bd07754bfd693a6b22c94eddf3a12778d # main
        timeout-minutes: 20
        with:
          cluster_name: ${{ steps.vars.outputs.cluster_name }}
          kops_state: ${{ secrets.GCP_PERF_KOPS_STATE_STORE }}
      - name: Wait for Cilium status to be ready
        run: |
          cilium status --wait
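      # Because the install defaults include --wait=false, "cilium install" returns
      # once the manifests are applied; the two wait steps above are what actually
      # block until the cluster and the Cilium agents report ready.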
      - name: Run CL2
        id: run-cl2
        working-directory: ./perf-tests/clusterloader2
        shell: bash
        timeout-minutes: 30
        run: |
          mkdir ./report
          export CL2_PROMETHEUS_PVC_ENABLED=false
          export CL2_ENABLE_PVS=false
          export CL2_ENABLE_NETWORKPOLICIES=true
          export CL2_ALLOWED_SLOW_API_CALLS=1
          export CL2_SCHEDULER_THROUGHPUT_THRESHOLD=0
          # CL2 hardcodes module paths to live in ./testing/load, even
          # if the path given is relative.
          cp ../../.github/actions/cl2-modules/cilium-agent-pprofs.yaml ./testing/load/
          echo \
            '{"CL2_ADDITIONAL_MEASUREMENT_MODULES": ["./cilium-agent-pprofs.yaml"]}' \
            > modules.yaml
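          # modules.yaml is passed to CL2 via --testoverrides below; the module path
          # inside it is resolved relative to ./testing/load, which is why the pprof
          # module was copied there first.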
          # CL2 needs ssh access to control plane nodes
          gcloud compute config-ssh
          go run ./cmd/clusterloader.go \
            -v=2 \
            --testconfig=./testing/load/config.yaml \
            --provider=gce \
            --enable-prometheus-server \
            --tear-down-prometheus-server=false \
            --nodes=100 \
            --report-dir=./report \
            --experimental-prometheus-snapshot-to-report-dir=true \
            --kubeconfig=$HOME/.kube/config \
            --testoverrides=./testing/overrides/load_throughput.yaml \
            --testoverrides=./testing/experiments/use_simple_latency_query.yaml \
            --testoverrides=./testing/prometheus/not-scrape-kube-proxy.yaml \
            --testoverrides=./modules.yaml \
            2>&1 | tee cl2-output.txt
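          # On completion, ./report holds the CL2 measurement results plus a Prometheus
          # snapshot (--experimental-prometheus-snapshot-to-report-dir=true), and
          # cl2-output.txt captures the full console log; both are exported to the GS
          # bucket in the final step.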
      - name: Get sysdump
        if: ${{ always() && steps.run-cl2.outcome != 'skipped' && steps.run-cl2.outcome != 'cancelled' }}
        run: |
          cilium status
          cilium sysdump --output-filename cilium-sysdump-final
      - name: Cleanup cluster
        if: ${{ always() && steps.deploy-cluster.outcome != 'skipped' }}
        uses: cilium/scale-tests-action/cleanup-cluster@238d773bd07754bfd693a6b22c94eddf3a12778d # main
        with:
          cluster_name: ${{ steps.vars.outputs.cluster_name }}
          kops_state: ${{ secrets.GCP_PERF_KOPS_STATE_STORE }}
      - name: Export results and sysdump to GS bucket
        if: ${{ always() && steps.run-cl2.outcome != 'skipped' && steps.run-cl2.outcome != 'cancelled' }}
        uses: cilium/scale-tests-action/export-results@238d773bd07754bfd693a6b22c94eddf3a12778d # main
        with:
          test_name: ${{ env.test_name }}
          results_bucket: ${{ env.GCP_PERF_RESULTS_BUCKET }}
          artifacts: ./perf-tests/clusterloader2/report/*
          other_files: cilium-sysdump-final.zip ./perf-tests/clusterloader2/cl2-output.txt
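      # The always() conditions let sysdump collection, cluster cleanup and result
      # export run even when the scale test fails, while the outcome checks skip them
      # if the corresponding step never ran (for example, when cluster creation fails).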