From b6da5aac0510138b3d5b63799fd0c01d467e6096 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Vila=C3=A7a?= Date: Tue, 28 Nov 2023 13:20:37 +0000 Subject: [PATCH 1/4] Fix VirtualMachineCRCErrors not getting resolved MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The controller keeps reporting old metric values even after the VM is deleted. This commit updates the metric name and labels so that we can set up the metric value to 0, and no longer trigger the alert on VM deletion Signed-off-by: João Vilaça --- controllers/vm_controller.go | 6 ++- docs/metrics.md | 4 +- internal/operands/metrics/resources.go | 4 +- pkg/monitoring/metrics/rbd_metrics.go | 36 ++++++++++------- pkg/monitoring/metrics/rbd_metrics_test.go | 20 +++------- tests/monitoring_test.go | 45 ++++++++-------------- 6 files changed, 52 insertions(+), 63 deletions(-) diff --git a/controllers/vm_controller.go b/controllers/vm_controller.go index 0a2612301..4f078f8fb 100644 --- a/controllers/vm_controller.go +++ b/controllers/vm_controller.go @@ -58,7 +58,11 @@ func (r *VmReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Re vm := kubevirtv1.VirtualMachine{} if err := r.client.Get(ctx, req.NamespacedName, &vm); err != nil { if errors.IsNotFound(err) { - // VM was deleted, so we can ignore it + // VM was deleted + vm.Name = req.Name + vm.Namespace = req.Namespace + metrics.SetVmWithVolume(&vm, nil, nil) + return ctrl.Result{}, nil } r.log.Error(err, "Error getting VM", "vm", req.NamespacedName) diff --git a/docs/metrics.md b/docs/metrics.md index fe842da8d..df129aa42 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -19,7 +19,7 @@ The increase in the number of rejected template validators, over the last hour. The total number of rejected template validators. Type: Counter. ### kubevirt_ssp_template_validator_up The total number of running virt-template-validator pods. Type: Gauge. -### kubevirt_ssp_vm_rbd_volume -VM with RBD mounted volume. Type: Gauge. +### kubevirt_ssp_vm_rbd_block_volume_without_rxbounce +VM with RBD mounted Block volume (without rxbounce option set). Type: Gauge. ## Developing new metrics After developing new metrics or changing old ones, please run `make generate-doc` to regenerate this document. diff --git a/internal/operands/metrics/resources.go b/internal/operands/metrics/resources.go index ffb54befb..c54048a2d 100644 --- a/internal/operands/metrics/resources.go +++ b/internal/operands/metrics/resources.go @@ -166,7 +166,7 @@ func getAlertRules() ([]promv1.Rule, error) { }, { Alert: "VirtualMachineCRCErrors", - Expr: intstr.FromString("(count(kubevirt_ssp_vm_rbd_volume{volume_mode=\"Block\", rxbounce_enabled=\"false\"} > 0) or vector(0)) > 0"), + Expr: intstr.FromString("(count(kubevirt_ssp_vm_rbd_block_volume_without_rxbounce > 0) or vector(0)) > 0"), Annotations: map[string]string{ "description": "{{ $value }} Virtual Machines are in risk of causing CRC errors and major service outages", "summary": "When running VMs using ODF storage with 'rbd' mounter or 'rbd.csi.ceph.com provisioner', it will report bad crc/signature errors and cluster performance will be severely degraded if krbd:rxbounce is not set.", @@ -174,7 +174,7 @@ func getAlertRules() ([]promv1.Rule, error) { }, Labels: map[string]string{ severityAlertLabelKey: "warning", - healthImpactAlertLabelKey: "warning", + healthImpactAlertLabelKey: "none", partOfAlertLabelKey: partOfAlertLabelValue, componentAlertLabelKey: componentAlertLabelValue, }, diff --git a/pkg/monitoring/metrics/rbd_metrics.go b/pkg/monitoring/metrics/rbd_metrics.go index 3c30b711f..f11e7d67e 100644 --- a/pkg/monitoring/metrics/rbd_metrics.go +++ b/pkg/monitoring/metrics/rbd_metrics.go @@ -1,7 +1,6 @@ package metrics import ( - "strconv" "strings" "github.com/machadovilaca/operator-observability/pkg/operatormetrics" @@ -16,36 +15,45 @@ var ( vmRbdVolume = operatormetrics.NewGaugeVec( operatormetrics.MetricOpts{ - Name: metricPrefix + "vm_rbd_volume", - Help: "VM with RBD mounted volume", + Name: metricPrefix + "vm_rbd_block_volume_without_rxbounce", + Help: "VM with RBD mounted Block volume (without rxbounce option set)", ExtraFields: map[string]string{ "StabilityLevel": "ALPHA", }, }, - []string{"name", "namespace", "pv_name", "volume_mode", "rxbounce_enabled"}, + []string{"name", "namespace"}, ) ) func SetVmWithVolume(vm *kubevirtv1.VirtualMachine, pvc *k8sv1.PersistentVolumeClaim, pv *k8sv1.PersistentVolume) { - if pv.Spec.PersistentVolumeSource.CSI == nil || !strings.Contains(pv.Spec.PersistentVolumeSource.CSI.Driver, "rbd.csi.ceph.com") { + if pv == nil || pvc == nil { + vmRbdVolume.WithLabelValues(vm.Name, vm.Namespace).Set(0) + return + } + + // If the volume is not using the RBD CSI driver, or if it is not using the block mode, it is not impacted by https://bugzilla.redhat.com/2109455 + if !usesRbdCsiDriver(pv) || *pvc.Spec.VolumeMode != k8sv1.PersistentVolumeBlock { + vmRbdVolume.WithLabelValues(vm.Name, vm.Namespace).Set(0) return } mounter, ok := pv.Spec.PersistentVolumeSource.CSI.VolumeAttributes["mounter"] + // If mounter is not set, it is using the default mounter, which is "rbd" if ok && mounter != "rbd" { + vmRbdVolume.WithLabelValues(vm.Name, vm.Namespace).Set(0) return } rxbounce, ok := pv.Spec.PersistentVolumeSource.CSI.VolumeAttributes["mapOptions"] - if !ok { - rxbounce = "" + // If mapOptions is not set, or if it is set but does not contain "krbd:rxbounce", it might be impacted by https://bugzilla.redhat.com/2109455 + if !ok || !strings.Contains(rxbounce, "krbd:rxbounce") { + vmRbdVolume.WithLabelValues(vm.Name, vm.Namespace).Set(1) + return } - vmRbdVolume.WithLabelValues( - vm.Name, - vm.Namespace, - pv.Name, - string(*pvc.Spec.VolumeMode), - strconv.FormatBool(strings.Contains(rxbounce, "krbd:rxbounce")), - ).Set(1) + vmRbdVolume.WithLabelValues(vm.Name, vm.Namespace).Set(0) +} + +func usesRbdCsiDriver(pv *k8sv1.PersistentVolume) bool { + return pv.Spec.PersistentVolumeSource.CSI != nil && strings.Contains(pv.Spec.PersistentVolumeSource.CSI.Driver, "rbd.csi.ceph.com") } diff --git a/pkg/monitoring/metrics/rbd_metrics_test.go b/pkg/monitoring/metrics/rbd_metrics_test.go index 7d363290f..c670535c2 100644 --- a/pkg/monitoring/metrics/rbd_metrics_test.go +++ b/pkg/monitoring/metrics/rbd_metrics_test.go @@ -1,9 +1,6 @@ package metrics import ( - "strconv" - "strings" - . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" @@ -73,14 +70,7 @@ var _ = Describe("rbd_metrics", func() { SetVmWithVolume(vm, pvc, pv) dto := &ioprometheusclient.Metric{} - err := vmRbdVolume.WithLabelValues( - vm.Name, - vm.Namespace, - pv.Name, - string(*pvc.Spec.VolumeMode), - strconv.FormatBool(strings.Contains(mapOptions, "krbd:rxbounce")), - ).Write(dto) - + err := vmRbdVolume.WithLabelValues(vm.Name, vm.Namespace).Write(dto) Expect(err).ToNot(HaveOccurred()) if metricExists { @@ -89,10 +79,10 @@ var _ = Describe("rbd_metrics", func() { Expect(dto.GetGauge().GetValue()).To(Equal(float64(0))) } }, - Entry("rbd driver and default mounter", rbdDriver, "", "krbd:rxbounce", true), - Entry("rbd driver and rbd mounter", rbdDriver, "rbd", "krbd:rxbounce", true), - Entry("non-rbd driver", "random", "rbd", "krbd:rxbounce", false), - Entry("non-rbd mounter", rbdDriver, "random", "krbd:rxbounce", false), + Entry("rbd driver and default mounter", rbdDriver, "", "krbd:rxbounce", false), + Entry("rbd driver and rbd mounter", rbdDriver, "rbd", "krbd:rxbounce", false), + Entry("non-rbd driver", "random", "rbd", "", false), + Entry("non-rbd mounter", rbdDriver, "random", "", false), Entry("krbd:rxbounce not enabled", rbdDriver, "rbd", "random", true), ) }) diff --git a/tests/monitoring_test.go b/tests/monitoring_test.go index fd487c4e6..dfff2e650 100644 --- a/tests/monitoring_test.go +++ b/tests/monitoring_test.go @@ -19,6 +19,7 @@ import ( apps "k8s.io/api/apps/v1" authnv1 "k8s.io/api/authentication/v1" core "k8s.io/api/core/v1" + k8serrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" @@ -173,7 +174,7 @@ var _ = Describe("Prometheus Alerts", func() { AfterEach(func() { vmError := apiClient.Delete(ctx, vm) - Expect(vmError).ToNot(HaveOccurred()) + Expect(client.IgnoreNotFound(vmError)).ToNot(HaveOccurred()) if pvc != nil { pvcError := apiClient.Delete(ctx, pvc) @@ -269,22 +270,26 @@ var _ = Describe("Prometheus Alerts", func() { return vmName } - It("[test_id:TODO] Should not create kubevirt_ssp_vm_rbd_volume when not using DataVolume or PVC", func() { - vmName := createResources(false, true) - seriesShouldNotBeDetected(fmt.Sprintf("kubevirt_ssp_vm_rbd_volume{name='%s'}", vmName)) - alertShouldNotBeActive("VirtualMachineCRCErrors") - }) - It("[test_id:TODO] Should not fire VirtualMachineCRCErrors when rxbounce is enabled", func() { vmName := createResources(true, true) - waitForSeriesToBeDetected(fmt.Sprintf("kubevirt_ssp_vm_rbd_volume{name='%s', rxbounce_enabled='true'}", vmName)) + waitForSeriesToBeDetected(fmt.Sprintf("kubevirt_ssp_vm_rbd_block_volume_without_rxbounce{name='%s'} == 0", vmName)) alertShouldNotBeActive("VirtualMachineCRCErrors") }) It("[test_id:TODO] Should fire VirtualMachineCRCErrors when rxbounce is disabled", func() { vmName := createResources(true, false) - waitForSeriesToBeDetected(fmt.Sprintf("kubevirt_ssp_vm_rbd_volume{name='%s', rxbounce_enabled='false'}", vmName)) + waitForSeriesToBeDetected(fmt.Sprintf("kubevirt_ssp_vm_rbd_block_volume_without_rxbounce{name='%s'} == 1", vmName)) waitForAlertToActivate("VirtualMachineCRCErrors") + + err := apiClient.Delete(ctx, vm) + Expect(err).ToNot(HaveOccurred()) + + Eventually(func(g Gomega) error { + return apiClient.Get(ctx, types.NamespacedName{Name: vmName, Namespace: strategy.GetNamespace()}, vm) + }).Should(MatchError(k8serrors.IsNotFound, "IsNotFound")) + + waitForSeriesToBeDetected(fmt.Sprintf("kubevirt_ssp_vm_rbd_block_volume_without_rxbounce{name='%s'} == 0", vmName)) + alertShouldNotBeActive("VirtualMachineCRCErrors") }) }) }) @@ -317,7 +322,7 @@ func alertShouldNotBeActive(alertName string) { if err != nil { return err } - if alert == nil { + if alert == nil || alert.State == "inactive" { return nil } return fmt.Errorf("alert %s found", alertName) @@ -328,7 +333,7 @@ func alertShouldNotBeActive(alertName string) { if err != nil { return err } - if alert == nil { + if alert == nil || alert.State == "inactive" { return nil } return fmt.Errorf("alert %s found", alertName) @@ -343,24 +348,6 @@ func waitForSeriesToBeDetected(seriesName string) { }, env.Timeout(), 10*time.Second).Should(BeTrue()) } -func seriesShouldNotBeDetected(seriesName string) { - Eventually(func() bool { - results, _, err := getPrometheusClient().Query(context.TODO(), seriesName, time.Now()) - if err != nil { - return false - } - return results.String() == "" - }, env.Timeout(), 10*time.Second).Should(BeTrue()) - - Consistently(func() bool { - results, _, err := getPrometheusClient().Query(context.TODO(), seriesName, time.Now()) - if err != nil { - return false - } - return results.String() == "" - }, env.ShortTimeout(), 10*time.Second).Should(BeTrue()) -} - func getAlertByName(alerts promApiv1.AlertsResult, alertName string) *promApiv1.Alert { for _, alert := range alerts.Alerts { if string(alert.Labels["alertname"]) == alertName { From bbf2af1d3821c8ed2d2b16909e846965b4dfb921 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Vila=C3=A7a?= Date: Mon, 8 Jan 2024 14:30:53 +0000 Subject: [PATCH 2/4] Add offset to ginkgo e2e validations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: João Vilaça --- tests/monitoring_test.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/monitoring_test.go b/tests/monitoring_test.go index dfff2e650..75e546b64 100644 --- a/tests/monitoring_test.go +++ b/tests/monitoring_test.go @@ -304,7 +304,7 @@ func checkAlert(alertName string) (*promApiv1.Alert, error) { } func waitForAlertToActivate(alertName string) { - Eventually(func() error { + EventuallyWithOffset(1, func() error { alert, err := checkAlert(alertName) if err != nil { return err @@ -317,7 +317,7 @@ func waitForAlertToActivate(alertName string) { } func alertShouldNotBeActive(alertName string) { - Eventually(func() error { + EventuallyWithOffset(1, func() error { alert, err := checkAlert(alertName) if err != nil { return err @@ -328,7 +328,7 @@ func alertShouldNotBeActive(alertName string) { return fmt.Errorf("alert %s found", alertName) }, env.Timeout(), time.Second).ShouldNot(HaveOccurred()) - Consistently(func() error { + ConsistentlyWithOffset(1, func() error { alert, err := checkAlert(alertName) if err != nil { return err @@ -341,7 +341,7 @@ func alertShouldNotBeActive(alertName string) { } func waitForSeriesToBeDetected(seriesName string) { - Eventually(func() bool { + EventuallyWithOffset(1, func() bool { results, _, err := getPrometheusClient().Query(context.TODO(), seriesName, time.Now()) Expect(err).ShouldNot(HaveOccurred()) return results.String() != "" From 25e1aa944ff959eb343bb834e69ea11adce220dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Vila=C3=A7a?= Date: Mon, 4 Mar 2024 17:21:53 +0000 Subject: [PATCH 3/4] Update tests/monitoring_test.go MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: João Vilaça --- tests/monitoring_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/monitoring_test.go b/tests/monitoring_test.go index 75e546b64..d2cf95cce 100644 --- a/tests/monitoring_test.go +++ b/tests/monitoring_test.go @@ -26,6 +26,7 @@ import ( "k8s.io/apimachinery/pkg/util/rand" "k8s.io/utils/pointer" kubevirtv1 "kubevirt.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" ssp "kubevirt.io/ssp-operator/api/v1beta2" "kubevirt.io/ssp-operator/internal/operands/metrics" @@ -286,7 +287,7 @@ var _ = Describe("Prometheus Alerts", func() { Eventually(func(g Gomega) error { return apiClient.Get(ctx, types.NamespacedName{Name: vmName, Namespace: strategy.GetNamespace()}, vm) - }).Should(MatchError(k8serrors.IsNotFound, "IsNotFound")) + }).Should(MatchError(k8serrors.IsNotFound)) waitForSeriesToBeDetected(fmt.Sprintf("kubevirt_ssp_vm_rbd_block_volume_without_rxbounce{name='%s'} == 0", vmName)) alertShouldNotBeActive("VirtualMachineCRCErrors") From 04af2c8bb2deb205ba86c51b98fd54ef6d5a4c34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Vila=C3=A7a?= Date: Tue, 5 Mar 2024 12:44:55 +0000 Subject: [PATCH 4/4] Fix MatchError error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: João Vilaça --- tests/monitoring_test.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/monitoring_test.go b/tests/monitoring_test.go index d2cf95cce..ec1eec52d 100644 --- a/tests/monitoring_test.go +++ b/tests/monitoring_test.go @@ -19,7 +19,6 @@ import ( apps "k8s.io/api/apps/v1" authnv1 "k8s.io/api/authentication/v1" core "k8s.io/api/core/v1" - k8serrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" @@ -287,7 +286,7 @@ var _ = Describe("Prometheus Alerts", func() { Eventually(func(g Gomega) error { return apiClient.Get(ctx, types.NamespacedName{Name: vmName, Namespace: strategy.GetNamespace()}, vm) - }).Should(MatchError(k8serrors.IsNotFound)) + }).Should(MatchError(ContainSubstring(fmt.Sprintf("virtualmachines.kubevirt.io \"%s\" not found", vmName)))) waitForSeriesToBeDetected(fmt.Sprintf("kubevirt_ssp_vm_rbd_block_volume_without_rxbounce{name='%s'} == 0", vmName)) alertShouldNotBeActive("VirtualMachineCRCErrors")