Commit aa57e47

Merge pull request #88 from XenitAB/fix-broken-since-v130
fix: node-ttl broken since cluster-autoscaler v1.30+
2 parents dfd5ab8 + 08b85e5 commit aa57e47

File tree

6 files changed: +127 additions, -96 deletions

Makefile

Lines changed: 3 additions & 3 deletions

@@ -25,8 +25,8 @@ e2e: docker-build
 	# Create kind cluster and load images
 	kind create cluster --kubeconfig $$KIND_KUBECONFIG
 	kind load docker-image ${IMG}
-	docker pull quay.io/elmiko/kubemark:v1.25.3
-	kind load docker-image quay.io/elmiko/kubemark:v1.25.3
+	docker pull quay.io/cluster-api-provider-kubemark/kubemark:v1.31.0
+	kind load docker-image quay.io/cluster-api-provider-kubemark/kubemark:v1.31.0

 	# Start hollow node
 	kubectl --kubeconfig $$KIND_KUBECONFIG apply -f ./e2e/hollow-node.yaml
@@ -54,4 +54,4 @@ e2e: docker-build
 	go test ./e2e/e2e_test.go -cover -v -timeout 300s -run TestTTLEviction

 	# Delete cluster
-	kind delete cluster
+	#kind delete cluster

e2e/cluster-autoscaler.yaml

Lines changed: 1 addition & 1 deletion

@@ -310,7 +310,7 @@ spec:
       dnsPolicy: "ClusterFirst"
       containers:
         - name: kubemark-cluster-autoscaler
-          image: "ghcr.io/xenitab/cluster-autoscaler-kubemark:1.25.0"
+          image: "ghcr.io/xenitab/cluster-autoscaler-kubemark:1.31.2"
           command:
             - ./cluster-autoscaler
             - --cloud-provider=kubemark

e2e/hollow-node.yaml

Lines changed: 2 additions & 4 deletions

@@ -38,7 +38,7 @@ spec:
           path: /run/containerd
       containers:
       - name: hollow-kubelet
-        image: quay.io/elmiko/kubemark:v1.25.3
+        image: quay.io/cluster-api-provider-kubemark/kubemark:v1.31.0
         ports:
         - containerPort: 4194
         - containerPort: 10250
@@ -54,7 +54,6 @@ spec:
         - --morph=kubelet
         - --name=$(NODE_NAME)
        - --kubeconfig=/kubeconfig/kubelet.kubeconfig
-        - --alsologtostderr
         - --v=2
         - --node-labels=xkf.xenit.io/node-ttl=30s,autoscaling.k8s.io/nodegroup=asg1
         - --register-with-taints=kubemark:NoSchedule
@@ -73,7 +72,7 @@ spec:
         securityContext:
           privileged: true
       - name: hollow-proxy
-        image: quay.io/elmiko/kubemark:v1.25.3
+        image: quay.io/cluster-api-provider-kubemark/kubemark:v1.31.0
         env:
         - name: NODE_NAME
           valueFrom:
@@ -86,7 +85,6 @@ spec:
         - --name=$(NODE_NAME)
         - --use-real-proxier=false
         - --kubeconfig=/kubeconfig/kubeproxy.kubeconfig
-        - --alsologtostderr
         - --v=2
         volumeMounts:
         - name: kubeconfig-volume

internal/status/status.go

Lines changed: 70 additions & 50 deletions

@@ -6,6 +6,7 @@ import (
 	"regexp"
 	"strconv"
 	"strings"
+	"time"

 	yaml "github.com/goccy/go-yaml"
 	corev1 "k8s.io/api/core/v1"
@@ -17,11 +18,66 @@ const (
 	KubemarkNodePoolLabelKey = "autoscaling.k8s.io/nodegroup"
 )

+type ClusterWideType struct {
+	Health    HealthType     `yaml:"health"`
+	ScaleUp   *ScaleUpType   `yaml:"scaleUp"`
+	ScaleDown *ScaleDownType `yaml:"scaleDown"`
+}
+
+type HealthType struct {
+	Status              string          `yaml:"status"`
+	NodeCounts          *NodeCountsType `yaml:"nodeCounts,omitempty"`
+	CloudProviderTarget int             `yaml:"cloudProviderTarget"`
+	MinSize             int             `yaml:"minSize"`
+	MaxSize             int             `yaml:"maxSize"`
+	LastProbeTime       time.Time       `yaml:"lastProbeTime"`
+	LastTransitionTime  time.Time       `yaml:"lastTransitionTime"`
+}
+
+type NodeCountsType struct {
+	Registered       *RegisteredType `yaml:"registered"`
+	LongUnregistered int             `yaml:"longUnregistered"`
+	Unregistered     int             `yaml:"unregistered"`
+}
+
+type RegisteredType struct {
+	Total      int `yaml:"total"`
+	Ready      int `yaml:"ready"`
+	NotStarted int `yaml:"notStarted"`
+}
+
+type ScaleUpType struct {
+	Status             string    `yaml:"status"`
+	LastProbeTime      time.Time `yaml:"lastProbeTime"`
+	LastTransitionTime time.Time `yaml:"lastTransitionTime"`
+}
+
+type ScaleDownType struct {
+	Status             string    `yaml:"status"`
+	LastProbeTime      time.Time `yaml:"lastProbeTime"`
+	LastTransitionTime time.Time `yaml:"lastTransitionTime"`
+}
+
+type NodeGroupsType struct {
+	Name      string         `yaml:"name"`
+	Health    *HealthType    `yaml:"health,omitempty"`
+	ScaleUp   *ScaleUpType   `yaml:"scaleUp,omitempty"`
+	ScaleDown *ScaleDownType `yaml:"scaleDown"`
+}
+
+type ClusterAutoscalerStatusConfigMap struct {
+	Time             string            `yaml:"time"`
+	AutoscalerStatus string            `yaml:"autoscalerStatus"`
+	ClusterWide      ClusterWideType   `yaml:"clusterWide"`
+	NodeGroups       []*NodeGroupsType `yaml:"nodeGroups"`
+}
+
 func HasScaleDownCapacity(status string, node *corev1.Node) (bool, error) {
 	nodePoolName, err := getNodePoolName(node)
 	if err != nil {
 		return false, err
 	}
+
 	ready, min, err := getNodePoolReadyAndMinCount(node.Status.NodeInfo.KubeletVersion, status, nodePoolName)
 	if err != nil {
 		return false, err
@@ -78,10 +134,14 @@ func getNodePoolReadyAndMinCount(kubeletVersion, status, nodePoolName string) (i
 	// v1.3.X or later
 	health, err := getNodePoolHealth(status, nodePoolName)
 	if err != nil {
+		fmt.Printf("Error: %s", err.Error())
 		return 0, 0, err
 	}
-	ready, min := getReadyAndMinCount(health)
-	return ready, min, nil
+
+	if health.NodeCounts != nil && health.NodeCounts.Registered != nil {
+		return health.NodeCounts.Registered.Ready, health.MinSize, nil
+	}
+	return 0, 0, nil
 }

 func getNodePoolHealthPreV130(status string, nodePoolName string) (string, error) {
@@ -101,33 +161,22 @@ func getNodePoolHealthPreV130(status string, nodePoolName string) (string, error
 	return matches[1], nil
 }

-func getNodePoolHealth(status string, nodePoolName string) (interface{}, error) {
-	data := make(map[string]interface{})
+func getNodePoolHealth(s string, nodePoolName string) (*HealthType, error) {
+	status := ClusterAutoscalerStatusConfigMap{}

-	err := yaml.Unmarshal([]byte(status), &data)
+	err := yaml.Unmarshal([]byte(s), &status)
 	if err != nil {
 		log.Fatalf("error: %v", err)
-		return "", fmt.Errorf("could not unmarshal the cluster-autoscaler status")
+		return nil, fmt.Errorf("could not unmarshal the cluster-autoscaler status")
 	}

-	ng, ok := data["nodeGroups"].([]interface{})
-	if ok {
-		for _, myMap := range ng {
-			x, ok := myMap.(map[string]interface{})
-			if !ok {
-				break
-			}
-
-			if x["name"] == nodePoolName {
-				health := x["health"]
-				if health != nil {
-					return x["health"], nil
-				}
-			}
+	for _, ng := range status.NodeGroups {
+		if strings.EqualFold(ng.Name, nodePoolName) {
+			return ng.Health, nil
 		}
 	}

-	return "", fmt.Errorf("could not find status for node pool: %s", nodePoolName)
+	return nil, fmt.Errorf("could not find status for node pool: %s", nodePoolName)
 }

 func getReadyAndMinCountPreV130(health string) (int, int, error) {
@@ -148,32 +197,3 @@ func getReadyAndMinCountPreV130(health string) (int, int, error) {
 	}
 	return ready, min, nil
 }
-
-func getReadyAndMinCount(health interface{}) (int, int) {
-	healthmap, ok := health.(map[string]interface{})
-	if !ok {
-		return 0, 0
-	}
-
-	minSize, ok := healthmap["minSize"].(int)
-	if !ok {
-		return 0, 0
-	}
-
-	nodeCounts, ok := healthmap["nodeCounts"].(map[string]interface{})
-	if !ok {
-		return 0, 0
-	}
-
-	registerednodes, ok := nodeCounts["registered"].(map[string]interface{})
-	if !ok {
-		return 0, 0
-	}
-
-	ready, ok := registerednodes["ready"].(int)
-	if !ok {
-		return 0, 0
-	}
-
-	return ready, minSize
-}
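For reference, below is a minimal, self-contained Go sketch (not part of the commit) of the parsing approach the new code takes: since cluster-autoscaler v1.30 the status ConfigMap is plain YAML, so it can be unmarshalled into typed structs instead of being scraped with regular expressions. The trimmed-down identifiers here (statusConfigMap, nodeGroup, sample, and so on) are illustrative stand-ins rather than names from the repository; only the github.com/goccy/go-yaml dependency and the field/tag shapes mirror the diff above.

// sketch: parse an abbreviated v1.30+ cluster-autoscaler status into typed structs
package main

import (
	"fmt"
	"log"

	yaml "github.com/goccy/go-yaml"
)

type registered struct {
	Total int `yaml:"total"`
	Ready int `yaml:"ready"`
}

type nodeCounts struct {
	Registered *registered `yaml:"registered"`
}

type health struct {
	Status     string      `yaml:"status"`
	NodeCounts *nodeCounts `yaml:"nodeCounts,omitempty"`
	MinSize    int         `yaml:"minSize"`
	MaxSize    int         `yaml:"maxSize"`
}

type nodeGroup struct {
	Name   string  `yaml:"name"`
	Health *health `yaml:"health,omitempty"`
}

type statusConfigMap struct {
	AutoscalerStatus string       `yaml:"autoscalerStatus"`
	NodeGroups       []*nodeGroup `yaml:"nodeGroups"`
}

// Abbreviated, illustrative example of the YAML status published by
// cluster-autoscaler v1.30 and later.
const sample = `autoscalerStatus: Running
nodeGroups:
- name: asg1
  health:
    status: Healthy
    nodeCounts:
      registered:
        total: 5
        ready: 5
    minSize: 1
    maxSize: 10`

func main() {
	var s statusConfigMap
	if err := yaml.Unmarshal([]byte(sample), &s); err != nil {
		log.Fatalf("could not unmarshal the cluster-autoscaler status: %v", err)
	}
	for _, ng := range s.NodeGroups {
		// Same nil-guarded lookup as the new getNodePoolReadyAndMinCount path:
		// pull the ready count and minSize for each node group, which the
		// caller compares to decide whether scale-down capacity exists.
		if ng.Health == nil || ng.Health.NodeCounts == nil || ng.Health.NodeCounts.Registered == nil {
			continue
		}
		fmt.Printf("%s: ready=%d minSize=%d\n",
			ng.Name, ng.Health.NodeCounts.Registered.Ready, ng.Health.MinSize)
	}
}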

internal/status/status_test.go

Lines changed: 51 additions & 38 deletions

@@ -32,9 +32,9 @@ func TestGetNodePoolReadyAndMinCountExisting(t *testing.T) {
 			min:   35,
 		},
 	}
-	status := mockClusterAutoscalerStatusPreV130(t, nodePools)
+	status := mockClusterAutoscalerStatus(t, nodePools)
 	for _, nodePool := range nodePools {
-		ready, min, err := getNodePoolReadyAndMinCount("v1.25.3", status, nodePool.name)
+		ready, min, err := getNodePoolReadyAndMinCount("v1.31.2", status, nodePool.name)
 		require.NoError(t, err)
 		require.Equal(t, nodePool.ready, ready)
 		require.Equal(t, nodePool.min, min)
@@ -49,8 +49,8 @@ func TestGetNodePoolReadyAndMinCountNotFound(t *testing.T) {
 			min:   22,
 		},
 	}
-	status := mockClusterAutoscalerStatusPreV130(t, nodePools)
-	_, _, err := getNodePoolReadyAndMinCount("v1.25.3", status, "bar")
+	status := mockClusterAutoscalerStatus(t, nodePools)
+	_, _, err := getNodePoolReadyAndMinCount("v1.31.2", status, "bar")
 	require.EqualError(t, err, "could not find status for node pool: bar")
 }

@@ -80,13 +80,13 @@ func TestHasScaleDownCapacity(t *testing.T) {
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			for _, cp := range getNodePoolLabelKeys() {
-				node, nodePoolName := getNodePoolNameAndNode(t, cp, "foobar")
+				node, nodePoolName := getNodePoolNameAndNode(t, "v1.31.2", cp, "foobar")
 				nodePool := testNodePool{
 					name:  nodePoolName,
 					ready: tt.ready,
 					min:   tt.min,
 				}
-				status := mockClusterAutoscalerStatusPreV130(t, []testNodePool{nodePool})
+				status := mockClusterAutoscalerStatus(t, []testNodePool{nodePool})
 				ok, err := HasScaleDownCapacity(status, node)
 				require.NoError(t, err)
 				require.Equal(t, tt.isSafe, ok)
@@ -101,44 +101,57 @@ type testNodePool struct {
 	min   int
 }

-func mockClusterAutoscalerStatusPreV130(t *testing.T, nodePools []testNodePool) string {
+func mockClusterAutoscalerStatus(t *testing.T, nodePools []testNodePool) string {
 	t.Helper()

-	status := `Cluster-autoscaler status at 2022-08-11 12:35:11.797051423 +0000 UTC:
-Cluster-wide:
-  Health:      Healthy (ready=10 unready=0 notStarted=0 longNotStarted=0 registered=10 longUnregistered=0)
-               LastProbeTime:      2022-08-11 12:35:11.782449164 +0000 UTC m=+935528.106132475
-               LastTransitionTime: 2022-08-08 10:28:18.652598604 +0000 UTC m=+668714.976282015
-  ScaleUp:     NoActivity (ready=10 registered=10)
-               LastProbeTime:      2022-08-11 12:35:11.782449164 +0000 UTC m=+935528.106132475
-               LastTransitionTime: 2022-08-08 11:57:06.468308057 +0000 UTC m=+674042.791991368
-  ScaleDown:   NoCandidates (candidates=0)
-               LastProbeTime:      2022-08-11 12:35:11.782449164 +0000 UTC m=+935528.106132475
-               LastTransitionTime: 2022-08-08 12:03:59.241031335 +0000 UTC m=+674455.564714746
-
-NodeGroups:`
+	status := `time: 2025-04-22 14:29:08.360891242 +0000 UTC
+autoscalerStatus: Running
+clusterWide:
+  health:
+    status: Healthy
+    nodeCounts:
+      registered:
+        total: 5
+        ready: 5
+        notStarted: 0
+      longUnregistered: 0
+      unregistered: 0
+    lastProbeTime: "2025-04-22T14:29:08.360891242Z"
+    lastTransitionTime: "2025-04-17T23:46:40.655271485Z"
+  scaleUp:
+    status: NoActivity
+    lastProbeTime: "2025-04-22T14:29:08.360891242Z"
+    lastTransitionTime: "2025-04-22T00:37:48.447964164Z"
+  scaleDown:
+    status: NoCandidates
+    lastProbeTime: "2025-04-22T14:29:08.360891242Z"
+    lastTransitionTime: "2025-04-22T00:48:01.870055554Z"
+nodeGroups:`

 	//nolint:gocritic // ignore
 	for _, nodePool := range nodePools {
-		//nolint:lll // ignore
 		status = fmt.Sprintf(`%[1]s
-  Name:        %[2]s
-  Health:      Healthy (ready=%[3]d unready=0 notStarted=0 longNotStarted=0 registered=%[3]d longUnregistered=0 cloudProviderTarget=%[3]d (minSize=%[4]d, maxSize=0))
-               LastProbeTime:      2022-08-11 12:35:11.782449164 +0000 UTC m=+935528.106132475
-               LastTransitionTime: 2022-08-08 10:28:18.652598604 +0000 UTC m=+668714.976282015
-  ScaleUp:     NoActivity (ready=%[3]d cloudProviderTarget=%[3]d)
-               LastProbeTime:      2022-08-11 12:35:11.782449164 +0000 UTC m=+935528.106132475
-               LastTransitionTime: 2022-08-08 11:57:06.468308057 +0000 UTC m=+674042.791991368
-  ScaleDown:   NoCandidates (candidates=0)
-               LastProbeTime:      2022-08-11 12:35:11.782449164 +0000 UTC m=+935528.106132475
-               LastTransitionTime: 2022-08-08 12:03:59.241031335 +0000 UTC m=+674455.564714746
-
-`, status, nodePool.name, nodePool.ready, nodePool.min)
+- name: %[2]s
+  health:
+    status: Healthy
+    nodeCounts:
+      registered:
+        total: %[3]d
+        ready: %[3]d
+        notStarted: 0
+      longUnregistered: 0
+      unregistered: 0
+    cloudProviderTarget: %[3]d
+    minSize: %[4]d
+    maxSize: 10
+    lastProbeTime: "2025-04-22T14:29:08.360891242Z"
+    lastTransitionTime: "2025-04-17T23:46:40.655271485Z"`, status, nodePool.name, nodePool.ready, nodePool.min)
 	}
+
 	return status
 }

-func getNodePoolNameAndNode(t *testing.T, cp string, name string) (*corev1.Node, string) {
+func getNodePoolNameAndNode(t *testing.T, version string, cp string, name string) (*corev1.Node, string) {
 	t.Helper()

 	switch cp {
@@ -154,7 +167,7 @@ func getNodePoolNameAndNode(t *testing.T, cp string, name string) (*corev1.Node,
 			},
 			Status: corev1.NodeStatus{
 				NodeInfo: corev1.NodeSystemInfo{
-					KubeletVersion: "v1.25.3",
+					KubeletVersion: version,
 				},
 			},
 		}, nodePoolName
@@ -169,10 +182,10 @@ func getNodePoolNameAndNode(t *testing.T, cp string, name string) (*corev1.Node,
 			},
 			Status: corev1.NodeStatus{
 				NodeInfo: corev1.NodeSystemInfo{
-					KubeletVersion: "v1.25.3",
+					KubeletVersion: version,
 				},
 			},
-		}, fmt.Sprintf("eks-%s-c8c2d2a8-2d51-8764-1776-0b3f58267273", eksNodePoolName)
+		}, fmt.Sprintf("eks-%s-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", eksNodePoolName)
 	case KubemarkNodePoolLabelKey:
 		return &corev1.Node{
 			ObjectMeta: metav1.ObjectMeta{
@@ -183,7 +196,7 @@ func getNodePoolNameAndNode(t *testing.T, cp string, name string) (*corev1.Node,
 			},
 			Status: corev1.NodeStatus{
 				NodeInfo: corev1.NodeSystemInfo{
-					KubeletVersion: "v1.25.3",
+					KubeletVersion: version,
 				},
 			},
 		}, name

kubemark.log

Whitespace-only changes.
