diff --git a/controllers/vm_controller.go b/controllers/vm_controller.go index 0a2612301..4f078f8fb 100644 --- a/controllers/vm_controller.go +++ b/controllers/vm_controller.go @@ -58,7 +58,11 @@ func (r *VmReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Re vm := kubevirtv1.VirtualMachine{} if err := r.client.Get(ctx, req.NamespacedName, &vm); err != nil { if errors.IsNotFound(err) { - // VM was deleted, so we can ignore it + // VM was deleted + vm.Name = req.Name + vm.Namespace = req.Namespace + metrics.SetVmWithVolume(&vm, nil, nil) + return ctrl.Result{}, nil } r.log.Error(err, "Error getting VM", "vm", req.NamespacedName) diff --git a/docs/metrics.md b/docs/metrics.md index fe842da8d..df129aa42 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -19,7 +19,7 @@ The increase in the number of rejected template validators, over the last hour. The total number of rejected template validators. Type: Counter. ### kubevirt_ssp_template_validator_up The total number of running virt-template-validator pods. Type: Gauge. -### kubevirt_ssp_vm_rbd_volume -VM with RBD mounted volume. Type: Gauge. +### kubevirt_ssp_vm_rbd_block_volume_without_rxbounce +VM with RBD mounted Block volume (without rxbounce option set). Type: Gauge. ## Developing new metrics After developing new metrics or changing old ones, please run `make generate-doc` to regenerate this document. diff --git a/internal/operands/metrics/resources.go b/internal/operands/metrics/resources.go index ffb54befb..c54048a2d 100644 --- a/internal/operands/metrics/resources.go +++ b/internal/operands/metrics/resources.go @@ -166,7 +166,7 @@ func getAlertRules() ([]promv1.Rule, error) { }, { Alert: "VirtualMachineCRCErrors", - Expr: intstr.FromString("(count(kubevirt_ssp_vm_rbd_volume{volume_mode=\"Block\", rxbounce_enabled=\"false\"} > 0) or vector(0)) > 0"), + Expr: intstr.FromString("(count(kubevirt_ssp_vm_rbd_block_volume_without_rxbounce > 0) or vector(0)) > 0"), Annotations: map[string]string{ "description": "{{ $value }} Virtual Machines are in risk of causing CRC errors and major service outages", "summary": "When running VMs using ODF storage with 'rbd' mounter or 'rbd.csi.ceph.com provisioner', it will report bad crc/signature errors and cluster performance will be severely degraded if krbd:rxbounce is not set.", @@ -174,7 +174,7 @@ func getAlertRules() ([]promv1.Rule, error) { }, Labels: map[string]string{ severityAlertLabelKey: "warning", - healthImpactAlertLabelKey: "warning", + healthImpactAlertLabelKey: "none", partOfAlertLabelKey: partOfAlertLabelValue, componentAlertLabelKey: componentAlertLabelValue, }, diff --git a/pkg/monitoring/metrics/rbd_metrics.go b/pkg/monitoring/metrics/rbd_metrics.go index 3c30b711f..f11e7d67e 100644 --- a/pkg/monitoring/metrics/rbd_metrics.go +++ b/pkg/monitoring/metrics/rbd_metrics.go @@ -1,7 +1,6 @@ package metrics import ( - "strconv" "strings" "github.com/machadovilaca/operator-observability/pkg/operatormetrics" @@ -16,36 +15,45 @@ var ( vmRbdVolume = operatormetrics.NewGaugeVec( operatormetrics.MetricOpts{ - Name: metricPrefix + "vm_rbd_volume", - Help: "VM with RBD mounted volume", + Name: metricPrefix + "vm_rbd_block_volume_without_rxbounce", + Help: "VM with RBD mounted Block volume (without rxbounce option set)", ExtraFields: map[string]string{ "StabilityLevel": "ALPHA", }, }, - []string{"name", "namespace", "pv_name", "volume_mode", "rxbounce_enabled"}, + []string{"name", "namespace"}, ) ) func SetVmWithVolume(vm *kubevirtv1.VirtualMachine, pvc *k8sv1.PersistentVolumeClaim, pv *k8sv1.PersistentVolume) { - if pv.Spec.PersistentVolumeSource.CSI == nil || !strings.Contains(pv.Spec.PersistentVolumeSource.CSI.Driver, "rbd.csi.ceph.com") { + if pv == nil || pvc == nil { + vmRbdVolume.WithLabelValues(vm.Name, vm.Namespace).Set(0) + return + } + + // If the volume is not using the RBD CSI driver, or if it is not using the block mode, it is not impacted by https://bugzilla.redhat.com/2109455 + if !usesRbdCsiDriver(pv) || *pvc.Spec.VolumeMode != k8sv1.PersistentVolumeBlock { + vmRbdVolume.WithLabelValues(vm.Name, vm.Namespace).Set(0) return } mounter, ok := pv.Spec.PersistentVolumeSource.CSI.VolumeAttributes["mounter"] + // If mounter is not set, it is using the default mounter, which is "rbd" if ok && mounter != "rbd" { + vmRbdVolume.WithLabelValues(vm.Name, vm.Namespace).Set(0) return } rxbounce, ok := pv.Spec.PersistentVolumeSource.CSI.VolumeAttributes["mapOptions"] - if !ok { - rxbounce = "" + // If mapOptions is not set, or if it is set but does not contain "krbd:rxbounce", it might be impacted by https://bugzilla.redhat.com/2109455 + if !ok || !strings.Contains(rxbounce, "krbd:rxbounce") { + vmRbdVolume.WithLabelValues(vm.Name, vm.Namespace).Set(1) + return } - vmRbdVolume.WithLabelValues( - vm.Name, - vm.Namespace, - pv.Name, - string(*pvc.Spec.VolumeMode), - strconv.FormatBool(strings.Contains(rxbounce, "krbd:rxbounce")), - ).Set(1) + vmRbdVolume.WithLabelValues(vm.Name, vm.Namespace).Set(0) +} + +func usesRbdCsiDriver(pv *k8sv1.PersistentVolume) bool { + return pv.Spec.PersistentVolumeSource.CSI != nil && strings.Contains(pv.Spec.PersistentVolumeSource.CSI.Driver, "rbd.csi.ceph.com") } diff --git a/pkg/monitoring/metrics/rbd_metrics_test.go b/pkg/monitoring/metrics/rbd_metrics_test.go index 7d363290f..c670535c2 100644 --- a/pkg/monitoring/metrics/rbd_metrics_test.go +++ b/pkg/monitoring/metrics/rbd_metrics_test.go @@ -1,9 +1,6 @@ package metrics import ( - "strconv" - "strings" - . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" @@ -73,14 +70,7 @@ var _ = Describe("rbd_metrics", func() { SetVmWithVolume(vm, pvc, pv) dto := &ioprometheusclient.Metric{} - err := vmRbdVolume.WithLabelValues( - vm.Name, - vm.Namespace, - pv.Name, - string(*pvc.Spec.VolumeMode), - strconv.FormatBool(strings.Contains(mapOptions, "krbd:rxbounce")), - ).Write(dto) - + err := vmRbdVolume.WithLabelValues(vm.Name, vm.Namespace).Write(dto) Expect(err).ToNot(HaveOccurred()) if metricExists { @@ -89,10 +79,10 @@ var _ = Describe("rbd_metrics", func() { Expect(dto.GetGauge().GetValue()).To(Equal(float64(0))) } }, - Entry("rbd driver and default mounter", rbdDriver, "", "krbd:rxbounce", true), - Entry("rbd driver and rbd mounter", rbdDriver, "rbd", "krbd:rxbounce", true), - Entry("non-rbd driver", "random", "rbd", "krbd:rxbounce", false), - Entry("non-rbd mounter", rbdDriver, "random", "krbd:rxbounce", false), + Entry("rbd driver and default mounter", rbdDriver, "", "krbd:rxbounce", false), + Entry("rbd driver and rbd mounter", rbdDriver, "rbd", "krbd:rxbounce", false), + Entry("non-rbd driver", "random", "rbd", "", false), + Entry("non-rbd mounter", rbdDriver, "random", "", false), Entry("krbd:rxbounce not enabled", rbdDriver, "rbd", "random", true), ) }) diff --git a/tests/monitoring_test.go b/tests/monitoring_test.go index fd487c4e6..ec1eec52d 100644 --- a/tests/monitoring_test.go +++ b/tests/monitoring_test.go @@ -25,6 +25,7 @@ import ( "k8s.io/apimachinery/pkg/util/rand" "k8s.io/utils/pointer" kubevirtv1 "kubevirt.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" ssp "kubevirt.io/ssp-operator/api/v1beta2" "kubevirt.io/ssp-operator/internal/operands/metrics" @@ -173,7 +174,7 @@ var _ = Describe("Prometheus Alerts", func() { AfterEach(func() { vmError := apiClient.Delete(ctx, vm) - Expect(vmError).ToNot(HaveOccurred()) + Expect(client.IgnoreNotFound(vmError)).ToNot(HaveOccurred()) if pvc != nil { pvcError := apiClient.Delete(ctx, pvc) @@ -269,22 +270,26 @@ var _ = Describe("Prometheus Alerts", func() { return vmName } - It("[test_id:TODO] Should not create kubevirt_ssp_vm_rbd_volume when not using DataVolume or PVC", func() { - vmName := createResources(false, true) - seriesShouldNotBeDetected(fmt.Sprintf("kubevirt_ssp_vm_rbd_volume{name='%s'}", vmName)) - alertShouldNotBeActive("VirtualMachineCRCErrors") - }) - It("[test_id:TODO] Should not fire VirtualMachineCRCErrors when rxbounce is enabled", func() { vmName := createResources(true, true) - waitForSeriesToBeDetected(fmt.Sprintf("kubevirt_ssp_vm_rbd_volume{name='%s', rxbounce_enabled='true'}", vmName)) + waitForSeriesToBeDetected(fmt.Sprintf("kubevirt_ssp_vm_rbd_block_volume_without_rxbounce{name='%s'} == 0", vmName)) alertShouldNotBeActive("VirtualMachineCRCErrors") }) It("[test_id:TODO] Should fire VirtualMachineCRCErrors when rxbounce is disabled", func() { vmName := createResources(true, false) - waitForSeriesToBeDetected(fmt.Sprintf("kubevirt_ssp_vm_rbd_volume{name='%s', rxbounce_enabled='false'}", vmName)) + waitForSeriesToBeDetected(fmt.Sprintf("kubevirt_ssp_vm_rbd_block_volume_without_rxbounce{name='%s'} == 1", vmName)) waitForAlertToActivate("VirtualMachineCRCErrors") + + err := apiClient.Delete(ctx, vm) + Expect(err).ToNot(HaveOccurred()) + + Eventually(func(g Gomega) error { + return apiClient.Get(ctx, types.NamespacedName{Name: vmName, Namespace: strategy.GetNamespace()}, vm) + }).Should(MatchError(ContainSubstring(fmt.Sprintf("virtualmachines.kubevirt.io \"%s\" not found", vmName)))) + + waitForSeriesToBeDetected(fmt.Sprintf("kubevirt_ssp_vm_rbd_block_volume_without_rxbounce{name='%s'} == 0", vmName)) + alertShouldNotBeActive("VirtualMachineCRCErrors") }) }) }) @@ -299,7 +304,7 @@ func checkAlert(alertName string) (*promApiv1.Alert, error) { } func waitForAlertToActivate(alertName string) { - Eventually(func() error { + EventuallyWithOffset(1, func() error { alert, err := checkAlert(alertName) if err != nil { return err @@ -312,23 +317,23 @@ func waitForAlertToActivate(alertName string) { } func alertShouldNotBeActive(alertName string) { - Eventually(func() error { + EventuallyWithOffset(1, func() error { alert, err := checkAlert(alertName) if err != nil { return err } - if alert == nil { + if alert == nil || alert.State == "inactive" { return nil } return fmt.Errorf("alert %s found", alertName) }, env.Timeout(), time.Second).ShouldNot(HaveOccurred()) - Consistently(func() error { + ConsistentlyWithOffset(1, func() error { alert, err := checkAlert(alertName) if err != nil { return err } - if alert == nil { + if alert == nil || alert.State == "inactive" { return nil } return fmt.Errorf("alert %s found", alertName) @@ -336,31 +341,13 @@ func alertShouldNotBeActive(alertName string) { } func waitForSeriesToBeDetected(seriesName string) { - Eventually(func() bool { + EventuallyWithOffset(1, func() bool { results, _, err := getPrometheusClient().Query(context.TODO(), seriesName, time.Now()) Expect(err).ShouldNot(HaveOccurred()) return results.String() != "" }, env.Timeout(), 10*time.Second).Should(BeTrue()) } -func seriesShouldNotBeDetected(seriesName string) { - Eventually(func() bool { - results, _, err := getPrometheusClient().Query(context.TODO(), seriesName, time.Now()) - if err != nil { - return false - } - return results.String() == "" - }, env.Timeout(), 10*time.Second).Should(BeTrue()) - - Consistently(func() bool { - results, _, err := getPrometheusClient().Query(context.TODO(), seriesName, time.Now()) - if err != nil { - return false - } - return results.String() == "" - }, env.ShortTimeout(), 10*time.Second).Should(BeTrue()) -} - func getAlertByName(alerts promApiv1.AlertsResult, alertName string) *promApiv1.Alert { for _, alert := range alerts.Alerts { if string(alert.Labels["alertname"]) == alertName {