Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[release-v0.18] Fix VirtualMachineCRCErrors not stop firing #907

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 5 additions & 1 deletion controllers/vm_controller.go
Expand Up @@ -58,7 +58,11 @@ func (r *VmReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Re
vm := kubevirtv1.VirtualMachine{}
if err := r.client.Get(ctx, req.NamespacedName, &vm); err != nil {
if errors.IsNotFound(err) {
// VM was deleted, so we can ignore it
// VM was deleted
vm.Name = req.Name
vm.Namespace = req.Namespace
metrics.SetVmWithVolume(&vm, nil, nil)

return ctrl.Result{}, nil
}
r.log.Error(err, "Error getting VM", "vm", req.NamespacedName)
Expand Down
4 changes: 2 additions & 2 deletions docs/metrics.md
Expand Up @@ -19,7 +19,7 @@ The increase in the number of rejected template validators, over the last hour.
The total number of rejected template validators. Type: Counter.
### kubevirt_ssp_template_validator_up
The total number of running virt-template-validator pods. Type: Gauge.
### kubevirt_ssp_vm_rbd_volume
VM with RBD mounted volume. Type: Gauge.
### kubevirt_ssp_vm_rbd_block_volume_without_rxbounce
VM with RBD mounted Block volume (without rxbounce option set). Type: Gauge.
## Developing new metrics
After developing new metrics or changing old ones, please run `make generate-doc` to regenerate this document.
4 changes: 2 additions & 2 deletions internal/operands/metrics/resources.go
Expand Up @@ -166,15 +166,15 @@ func getAlertRules() ([]promv1.Rule, error) {
},
{
Alert: "VirtualMachineCRCErrors",
Expr: intstr.FromString("(count(kubevirt_ssp_vm_rbd_volume{volume_mode=\"Block\", rxbounce_enabled=\"false\"} > 0) or vector(0)) > 0"),
Expr: intstr.FromString("(count(kubevirt_ssp_vm_rbd_block_volume_without_rxbounce > 0) or vector(0)) > 0"),
Annotations: map[string]string{
"description": "{{ $value }} Virtual Machines are in risk of causing CRC errors and major service outages",
"summary": "When running VMs using ODF storage with 'rbd' mounter or 'rbd.csi.ceph.com provisioner', it will report bad crc/signature errors and cluster performance will be severely degraded if krbd:rxbounce is not set.",
"runbook_url": fmt.Sprintf(runbookURLTemplate, "VirtualMachineCRCErrors"),
},
Labels: map[string]string{
severityAlertLabelKey: "warning",
healthImpactAlertLabelKey: "warning",
healthImpactAlertLabelKey: "none",
partOfAlertLabelKey: partOfAlertLabelValue,
componentAlertLabelKey: componentAlertLabelValue,
},
Expand Down
36 changes: 22 additions & 14 deletions pkg/monitoring/metrics/rbd_metrics.go
@@ -1,7 +1,6 @@
package metrics

import (
"strconv"
"strings"

"github.com/machadovilaca/operator-observability/pkg/operatormetrics"
Expand All @@ -16,36 +15,45 @@ var (

vmRbdVolume = operatormetrics.NewGaugeVec(
operatormetrics.MetricOpts{
Name: metricPrefix + "vm_rbd_volume",
Help: "VM with RBD mounted volume",
Name: metricPrefix + "vm_rbd_block_volume_without_rxbounce",
Help: "VM with RBD mounted Block volume (without rxbounce option set)",
ExtraFields: map[string]string{
"StabilityLevel": "ALPHA",
},
},
[]string{"name", "namespace", "pv_name", "volume_mode", "rxbounce_enabled"},
[]string{"name", "namespace"},
)
)

func SetVmWithVolume(vm *kubevirtv1.VirtualMachine, pvc *k8sv1.PersistentVolumeClaim, pv *k8sv1.PersistentVolume) {
if pv.Spec.PersistentVolumeSource.CSI == nil || !strings.Contains(pv.Spec.PersistentVolumeSource.CSI.Driver, "rbd.csi.ceph.com") {
if pv == nil || pvc == nil {
vmRbdVolume.WithLabelValues(vm.Name, vm.Namespace).Set(0)
return
}

// If the volume is not using the RBD CSI driver, or if it is not using the block mode, it is not impacted by https://bugzilla.redhat.com/2109455
if !usesRbdCsiDriver(pv) || *pvc.Spec.VolumeMode != k8sv1.PersistentVolumeBlock {
vmRbdVolume.WithLabelValues(vm.Name, vm.Namespace).Set(0)
return
}

mounter, ok := pv.Spec.PersistentVolumeSource.CSI.VolumeAttributes["mounter"]
// If mounter is not set, it is using the default mounter, which is "rbd"
if ok && mounter != "rbd" {
vmRbdVolume.WithLabelValues(vm.Name, vm.Namespace).Set(0)
return
}

rxbounce, ok := pv.Spec.PersistentVolumeSource.CSI.VolumeAttributes["mapOptions"]
if !ok {
rxbounce = ""
// If mapOptions is not set, or if it is set but does not contain "krbd:rxbounce", it might be impacted by https://bugzilla.redhat.com/2109455
if !ok || !strings.Contains(rxbounce, "krbd:rxbounce") {
vmRbdVolume.WithLabelValues(vm.Name, vm.Namespace).Set(1)
return
}

vmRbdVolume.WithLabelValues(
vm.Name,
vm.Namespace,
pv.Name,
string(*pvc.Spec.VolumeMode),
strconv.FormatBool(strings.Contains(rxbounce, "krbd:rxbounce")),
).Set(1)
vmRbdVolume.WithLabelValues(vm.Name, vm.Namespace).Set(0)
}

func usesRbdCsiDriver(pv *k8sv1.PersistentVolume) bool {
return pv.Spec.PersistentVolumeSource.CSI != nil && strings.Contains(pv.Spec.PersistentVolumeSource.CSI.Driver, "rbd.csi.ceph.com")
}
20 changes: 5 additions & 15 deletions pkg/monitoring/metrics/rbd_metrics_test.go
@@ -1,9 +1,6 @@
package metrics

import (
"strconv"
"strings"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

Expand Down Expand Up @@ -73,14 +70,7 @@ var _ = Describe("rbd_metrics", func() {
SetVmWithVolume(vm, pvc, pv)

dto := &ioprometheusclient.Metric{}
err := vmRbdVolume.WithLabelValues(
vm.Name,
vm.Namespace,
pv.Name,
string(*pvc.Spec.VolumeMode),
strconv.FormatBool(strings.Contains(mapOptions, "krbd:rxbounce")),
).Write(dto)

err := vmRbdVolume.WithLabelValues(vm.Name, vm.Namespace).Write(dto)
Expect(err).ToNot(HaveOccurred())

if metricExists {
Expand All @@ -89,10 +79,10 @@ var _ = Describe("rbd_metrics", func() {
Expect(dto.GetGauge().GetValue()).To(Equal(float64(0)))
}
},
Entry("rbd driver and default mounter", rbdDriver, "", "krbd:rxbounce", true),
Entry("rbd driver and rbd mounter", rbdDriver, "rbd", "krbd:rxbounce", true),
Entry("non-rbd driver", "random", "rbd", "krbd:rxbounce", false),
Entry("non-rbd mounter", rbdDriver, "random", "krbd:rxbounce", false),
Entry("rbd driver and default mounter", rbdDriver, "", "krbd:rxbounce", false),
Entry("rbd driver and rbd mounter", rbdDriver, "rbd", "krbd:rxbounce", false),
Entry("non-rbd driver", "random", "rbd", "", false),
Entry("non-rbd mounter", rbdDriver, "random", "", false),
Entry("krbd:rxbounce not enabled", rbdDriver, "rbd", "random", true),
)
})
53 changes: 20 additions & 33 deletions tests/monitoring_test.go
Expand Up @@ -25,6 +25,7 @@ import (
"k8s.io/apimachinery/pkg/util/rand"
"k8s.io/utils/pointer"
kubevirtv1 "kubevirt.io/api/core/v1"
"sigs.k8s.io/controller-runtime/pkg/client"

ssp "kubevirt.io/ssp-operator/api/v1beta2"
"kubevirt.io/ssp-operator/internal/operands/metrics"
Expand Down Expand Up @@ -173,7 +174,7 @@ var _ = Describe("Prometheus Alerts", func() {

AfterEach(func() {
vmError := apiClient.Delete(ctx, vm)
Expect(vmError).ToNot(HaveOccurred())
Expect(client.IgnoreNotFound(vmError)).ToNot(HaveOccurred())

if pvc != nil {
pvcError := apiClient.Delete(ctx, pvc)
Expand Down Expand Up @@ -269,22 +270,26 @@ var _ = Describe("Prometheus Alerts", func() {
return vmName
}

It("[test_id:TODO] Should not create kubevirt_ssp_vm_rbd_volume when not using DataVolume or PVC", func() {
vmName := createResources(false, true)
seriesShouldNotBeDetected(fmt.Sprintf("kubevirt_ssp_vm_rbd_volume{name='%s'}", vmName))
alertShouldNotBeActive("VirtualMachineCRCErrors")
})

It("[test_id:TODO] Should not fire VirtualMachineCRCErrors when rxbounce is enabled", func() {
vmName := createResources(true, true)
waitForSeriesToBeDetected(fmt.Sprintf("kubevirt_ssp_vm_rbd_volume{name='%s', rxbounce_enabled='true'}", vmName))
waitForSeriesToBeDetected(fmt.Sprintf("kubevirt_ssp_vm_rbd_block_volume_without_rxbounce{name='%s'} == 0", vmName))
alertShouldNotBeActive("VirtualMachineCRCErrors")
})

It("[test_id:TODO] Should fire VirtualMachineCRCErrors when rxbounce is disabled", func() {
vmName := createResources(true, false)
waitForSeriesToBeDetected(fmt.Sprintf("kubevirt_ssp_vm_rbd_volume{name='%s', rxbounce_enabled='false'}", vmName))
waitForSeriesToBeDetected(fmt.Sprintf("kubevirt_ssp_vm_rbd_block_volume_without_rxbounce{name='%s'} == 1", vmName))
waitForAlertToActivate("VirtualMachineCRCErrors")

err := apiClient.Delete(ctx, vm)
Expect(err).ToNot(HaveOccurred())

Eventually(func(g Gomega) error {
return apiClient.Get(ctx, types.NamespacedName{Name: vmName, Namespace: strategy.GetNamespace()}, vm)
}).Should(MatchError(ContainSubstring(fmt.Sprintf("virtualmachines.kubevirt.io \"%s\" not found", vmName))))
akrejcir marked this conversation as resolved.
Show resolved Hide resolved

waitForSeriesToBeDetected(fmt.Sprintf("kubevirt_ssp_vm_rbd_block_volume_without_rxbounce{name='%s'} == 0", vmName))
alertShouldNotBeActive("VirtualMachineCRCErrors")
})
})
})
Expand All @@ -299,7 +304,7 @@ func checkAlert(alertName string) (*promApiv1.Alert, error) {
}

func waitForAlertToActivate(alertName string) {
Eventually(func() error {
EventuallyWithOffset(1, func() error {
alert, err := checkAlert(alertName)
if err != nil {
return err
Expand All @@ -312,55 +317,37 @@ func waitForAlertToActivate(alertName string) {
}

func alertShouldNotBeActive(alertName string) {
Eventually(func() error {
EventuallyWithOffset(1, func() error {
alert, err := checkAlert(alertName)
if err != nil {
return err
}
if alert == nil {
if alert == nil || alert.State == "inactive" {
return nil
}
return fmt.Errorf("alert %s found", alertName)
}, env.Timeout(), time.Second).ShouldNot(HaveOccurred())

Consistently(func() error {
ConsistentlyWithOffset(1, func() error {
alert, err := checkAlert(alertName)
if err != nil {
return err
}
if alert == nil {
if alert == nil || alert.State == "inactive" {
return nil
}
return fmt.Errorf("alert %s found", alertName)
}, env.ShortTimeout(), time.Second).ShouldNot(HaveOccurred())
}

func waitForSeriesToBeDetected(seriesName string) {
Eventually(func() bool {
EventuallyWithOffset(1, func() bool {
results, _, err := getPrometheusClient().Query(context.TODO(), seriesName, time.Now())
Expect(err).ShouldNot(HaveOccurred())
return results.String() != ""
}, env.Timeout(), 10*time.Second).Should(BeTrue())
}

func seriesShouldNotBeDetected(seriesName string) {
Eventually(func() bool {
results, _, err := getPrometheusClient().Query(context.TODO(), seriesName, time.Now())
if err != nil {
return false
}
return results.String() == ""
}, env.Timeout(), 10*time.Second).Should(BeTrue())

Consistently(func() bool {
results, _, err := getPrometheusClient().Query(context.TODO(), seriesName, time.Now())
if err != nil {
return false
}
return results.String() == ""
}, env.ShortTimeout(), 10*time.Second).Should(BeTrue())
}

func getAlertByName(alerts promApiv1.AlertsResult, alertName string) *promApiv1.Alert {
for _, alert := range alerts.Alerts {
if string(alert.Labels["alertname"]) == alertName {
Expand Down