From b6a6594512c63223f673fd36a753eee32b776030 Mon Sep 17 00:00:00 2001
From: Andrea Panattoni
Date: Wed, 4 Sep 2024 17:31:01 +0200
Subject: [PATCH] cnf-tests: Compare Multus and SR-IOV metrics

Statistics that relate to Multus interfaces can be collected by joining
network-metrics-daemon [1] and cAdvisor [2] (see [3]).
The same information, for kernel netdevice SR-IOV interfaces, can be
collected via the `sriov-network-metrics-exporter` [4], which leverages
the Physical Function to get statistics about the Virtual Functions.

The proposed test case verifies that both sources produce congruent
values. Only TX statistics are verified, as the RX ones might be spoiled
by noise traffic on the wire (e.g. other nodes sending DHCP broadcast
requests).

[1] https://github.com/openshift/network-metrics-daemon
[2] https://github.com/google/cadvisor/blob/master/docs/storage/prometheus.md
[3] https://docs.openshift.com/container-platform/4.16/networking/associating-secondary-interfaces-metrics-to-network-attachments.html#cnf-associating-secondary-interfaces-metrics-with-network-name_secondary-interfaces-metrics
[4] https://github.com/k8snetworkplumbingwg/sriov-network-metrics-exporter

Signed-off-by: Andrea Panattoni
---
 .../metrics/sriovnetworkmetricsexporter.go    | 268 ++++++++++++++++++
 .../testsuites/e2esuite/test_suite_test.go    |   1 +
 cnf-tests/testsuites/pkg/utils/reporter.go    |  11 +-
 3 files changed, 279 insertions(+), 1 deletion(-)
 create mode 100644 cnf-tests/testsuites/e2esuite/metrics/sriovnetworkmetricsexporter.go

diff --git a/cnf-tests/testsuites/e2esuite/metrics/sriovnetworkmetricsexporter.go b/cnf-tests/testsuites/e2esuite/metrics/sriovnetworkmetricsexporter.go
new file mode 100644
index 0000000000..0131dcadf8
--- /dev/null
+++ b/cnf-tests/testsuites/e2esuite/metrics/sriovnetworkmetricsexporter.go
@@ -0,0 +1,268 @@
+package metrics
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"net/url"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	sriovtestclient "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/client"
+	sriovcluster "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/cluster"
+	sriovnamespaces "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/namespaces"
+	sriovnetwork "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/network"
+
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/utils/ptr"
+
+	"github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/pkg/client"
+	"github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/pkg/discovery"
+	"github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/pkg/images"
+	"github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/pkg/namespaces"
+	"github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/pkg/networks"
+	"github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/pkg/pods"
+
+	"github.com/prometheus/common/model"
+)
+
+const testNamespace string = "test-sriov-metrics"
+
+var sriovclient *sriovtestclient.ClientSet
+
+func init() {
+	sriovclient = sriovtestclient.New("")
+}
+
+var _ = Describe("[sriov] SR-IOV Network Metrics Exporter", func() {
+
+	var sriovCapableNodes *sriovcluster.EnabledNodes
+
+	BeforeEach(func() {
+		if discovery.Enabled() {
+			Skip("Discovery mode not supported")
+		}
+
+		restoreFeatureGates := enableMetricsExporterFeatureGate()
+		DeferCleanup(restoreFeatureGates)
+
+		By("Adding monitoring label to " + namespaces.SRIOVOperator)
+		err := sriovnamespaces.AddLabel(sriovclient, context.Background(), namespaces.SRIOVOperator, "openshift.io/cluster-monitoring", "true")
+		Expect(err).ToNot(HaveOccurred())
+
+		By("Clean SRIOV policies and networks")
+		networks.CleanSriov(sriovclient)
+
+		By("Discover SRIOV devices")
+		sriovCapableNodes, err = sriovcluster.DiscoverSriov(sriovclient, namespaces.SRIOVOperator)
+		Expect(err).ToNot(HaveOccurred())
+
+		err = namespaces.Create(testNamespace, client.Client)
+		Expect(err).ToNot(HaveOccurred())
+		namespaces.CleanPods(testNamespace, client.Client)
+	})
+
+	It("should provide the same metrics as network-metrics-daemon", func() {
+		testNode, testDevice, err := sriovCapableNodes.FindOneSriovNodeAndDevice()
+		Expect(err).ToNot(HaveOccurred())
+		By("Using device " + testDevice.Name + " on node " + testNode)
+
+		sriovNetworkNodePolicy, err := sriovnetwork.CreateSriovPolicy(
+			sriovclient, "test-metrics-", namespaces.SRIOVOperator,
+			testDevice.Name, testNode, 8,
+			"testsriovmetricsresource", "netdevice",
+		)
+		Expect(err).ToNot(HaveOccurred())
+		DeferCleanup(sriovclient.Delete, context.Background(), sriovNetworkNodePolicy)
+
+		ipam := `{ "type": "host-local", "subnet": "192.0.2.0/24" }`
+		err = sriovnetwork.CreateSriovNetwork(sriovclient, testDevice, "test-metrics-network",
+			testNamespace, namespaces.SRIOVOperator, "testsriovmetricsresource", ipam)
+		Expect(err).ToNot(HaveOccurred())
+
+		clientPod, serverPod := makeClientAndServerNetcatPod()
+
+		// Do not verify pairs
+		//   "container_network_receive_packets_total": "sriov_vf_rx_packets",
+		//   "container_network_receive_bytes_total": "sriov_vf_rx_bytes",
+		// because there might be traffic on the wire that disturbs the counters.
+		// An example is DHCP traffic that other nodes are producing, e.g. (tcpdump):
+		//
+		//   13:28:00.442893 04:3f:72:fe:d1:d1 > ff:ff:ff:ff:ff:ff, ethertype IPv4 (0x0800), length 327: 0.0.0.0.68 > 255.255.255.255.67: BOOTP/DHCP, Request from 04:3f:72:fe:d1:d1, length 285
+		metricsToMatch := map[string]string{
+			"container_network_transmit_packets_total": "sriov_vf_tx_packets",
+			"container_network_transmit_bytes_total":   "sriov_vf_tx_bytes",
+		}
+		containerQuery := `%s + on(namespace,pod,interface) group_left(network_name) (pod_network_name_info{interface="net1",pod="%s"})`
+		sriovQuery := `%s * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice{pod="%s"}`
+
+		for containerMetricName, sriovMetricName := range metricsToMatch {
+			By(fmt.Sprintf("verifying metrics %s == %s", containerMetricName, sriovMetricName))
+			assertPromQLHasTheSameResult(
+				fmt.Sprintf(containerQuery, containerMetricName, serverPod.Name),
+				fmt.Sprintf(sriovQuery, sriovMetricName, serverPod.Name),
+			)
+
+			assertPromQLHasTheSameResult(
+				fmt.Sprintf(containerQuery, containerMetricName, clientPod.Name),
+				fmt.Sprintf(sriovQuery, sriovMetricName, clientPod.Name),
+			)
+		}
+	})
+})
+
+// makeClientAndServerNetcatPod creates and starts two privileged pods attached to
+// "test-metrics-network": a netcat TCP server (192.0.2.101, port 5000) and a
+// netcat client (192.0.2.102) that sends a payload to the server.
+// It returns the client pod first and the server pod second.
+func makeClientAndServerNetcatPod() (*corev1.Pod, *corev1.Pod) {
+	serverPod := pods.DefinePod(testNamespace)
+	serverPod.GenerateName = "testpod-nc-server-"
+	serverPod = pods.RedefinePodWithNetwork(serverPod, `[{"name": "test-metrics-network","ips":["192.0.2.101/24"]}]`)
+	serverPod.Spec.Containers = append(serverPod.Spec.Containers, corev1.Container{
+		Name:            "netcat-tcp-server",
+		Image:           images.For(images.TestUtils),
+		Command:         []string{"nc", "-vv", "--keep-open", "--listen", "5000"},
+		SecurityContext: &corev1.SecurityContext{Privileged: ptr.To(true)},
+	})
+	serverPod, err := pods.CreateAndStart(serverPod)
+	Expect(err).ToNot(HaveOccurred())
+
+	clientPod := pods.DefinePod(testNamespace)
+	clientPod.GenerateName = "testpod-nc-client-"
+	clientPod = pods.RedefinePodWithNetwork(clientPod, `[{"name": "test-metrics-network","ips":["192.0.2.102/24"]}]`)
+	clientPod.Spec.Containers = append(clientPod.Spec.Containers, corev1.Container{
+		Name:            "netcat-tcp-client",
+		Image:           images.For(images.TestUtils),
+		Command:         makeNetcatClientCommand("192.0.2.101 5000"),
+		SecurityContext: &corev1.SecurityContext{Privileged: ptr.To(true)},
+	})
+	clientPod, err = pods.CreateAndStart(clientPod)
+	Expect(err).ToNot(HaveOccurred())
+
+	return clientPod, serverPod
+}
+
+// makeNetcatClientCommand returns a bash command that waits 10 seconds, sends a payload
+// to targetIpAddress (formatted as "<ip> <port>") via netcat and then sleeps forever.
+func makeNetcatClientCommand(targetIpAddress string) []string {
+	// This command sends 1000 bytes (the number 1 zero-padded to 1000 digits) via netcat
+	script := fmt.Sprintf(
+		`
+	sleep 10;
+	printf %%01000d 1 | nc -w 1 %s;
+	sleep inf
+`, targetIpAddress)
+	return []string{"bash", "-xec", script}
+}
+
+// runPromQLQuery runs the given PromQL query by executing curl against
+// localhost:9090/api/v1/query inside the first pod labelled
+// app.kubernetes.io/component=prometheus. Any error fails the current spec.
+func runPromQLQuery(query string) model.Vector {
+	prometheusPods, err := client.Client.Pods("").List(context.Background(), metav1.ListOptions{
+		LabelSelector: "app.kubernetes.io/component=prometheus",
+	})
+	ExpectWithOffset(1, err).ToNot(HaveOccurred())
+	ExpectWithOffset(1, prometheusPods.Items).ToNot(HaveLen(0), "At least one Prometheus operator pod expected")
+
+	prometheusPod := prometheusPods.Items[0]
+
+	queryURL := fmt.Sprintf("localhost:9090/api/v1/query?%s", (url.Values{"query": []string{query}}).Encode())
+	command := []string{"curl", queryURL}
+	outputBuffer, err := pods.ExecCommand(client.Client, prometheusPod, command)
+	ExpectWithOffset(1, err).
+		ToNot(HaveOccurred(),
+			"promQL query failed: [%s/%s] command: [%v]\nstdout: %s", prometheusPod.Namespace, prometheusPod.Name, command, outputBuffer.String())
+
+	result := struct {
+		Status string `json:"status"`
+		Data   struct {
+			ResultType string       `json:"resultType"`
+			Result     model.Vector `json:"result"`
+		} `json:"data"`
+	}{}
+
+	err = json.Unmarshal(outputBuffer.Bytes(), &result)
+	ExpectWithOffset(1, err).ToNot(HaveOccurred(), "can't decode response for [%s]: %s", queryURL, outputBuffer.String())
+	ExpectWithOffset(1, result.Status).To(Equal("success"), "cURL for [%s] failed: %s", queryURL, outputBuffer.String())
+
+	return result.Data.Result
+}
+
+// enableMetricsExporterFeatureGate turns on the "metricsExporter" feature gate in the
+// "default" SriovOperatorConfig and returns a function that restores the feature
+// gates that were configured before the call. The returned function does nothing
+// if the gate was already enabled.
+func enableMetricsExporterFeatureGate() func() {
+
+	operatorConfig, err := sriovclient.SriovOperatorConfigs(namespaces.SRIOVOperator).Get(context.Background(), "default", metav1.GetOptions{})
+	Expect(err).ToNot(HaveOccurred())
+
+	// Save the current feature gates map to allow restoring it
+	oldFeatureGates := make(map[string]bool)
+	for k, v := range operatorConfig.Spec.FeatureGates {
+		oldFeatureGates[k] = v
+	}
+
+	if operatorConfig.Spec.FeatureGates == nil {
+		operatorConfig.Spec.FeatureGates = make(map[string]bool)
+	}
+
+	if operatorConfig.Spec.FeatureGates["metricsExporter"] {
+		// The feature is already enabled: nothing to do
+		return func() {}
+	}
+
+	By("Enabling metricsExporter feature gate")
+	operatorConfig.Spec.FeatureGates["metricsExporter"] = true
+
+	_, err = sriovclient.SriovOperatorConfigs(namespaces.SRIOVOperator).Update(context.Background(), operatorConfig, metav1.UpdateOptions{})
+	Expect(err).ToNot(HaveOccurred())
+
+	return func() {
+		By("Resetting feature gate to its previous value")
+		operatorConfig, err := sriovclient.SriovOperatorConfigs(namespaces.SRIOVOperator).Get(context.Background(), "default", metav1.GetOptions{})
+		Expect(err).ToNot(HaveOccurred())
+
+		operatorConfig.Spec.FeatureGates = oldFeatureGates
+		_, err = sriovclient.SriovOperatorConfigs(namespaces.SRIOVOperator).Update(context.Background(), operatorConfig, metav1.UpdateOptions{})
+		Expect(err).ToNot(HaveOccurred())
+	}
+}
+
+// assertPromQLHasTheSameResult evaluates both PromQL queries and checks if both return the same value.
+func assertPromQLHasTheSameResult(queryA, queryB string) {
+	failedValues := "A - B\n"
+
+	Eventually(func(g Gomega) {
+		samplesA := runPromQLQuery(queryA)
+		g.Expect(samplesA).To(HaveLen(1), "queryA[%s]", queryA)
+		valueA := float64(samplesA[0].Value)
+
+		samplesB := runPromQLQuery(queryB)
+		g.Expect(samplesB).To(HaveLen(1), "queryB[%s]", queryB)
+		valueB := float64(samplesB[0].Value)
+
+		failedValues += fmt.Sprintf("%s %f - %f\n", time.Now().Format(time.StampMilli), valueA, valueB)
+
+		g.Expect(valueA).To(
+			Equal(valueB),
+			"queries returned different values:\nqueryA[%s]=%f\nqueryB[%s]=%f",
+			queryA, valueA, queryB, valueB,
+		)
+	}).
+		WithPolling(1*time.Second).
+		WithTimeout(2*time.Minute).
+		WithOffset(1).
+		Should(Succeed(), func() string {
+			return fmt.Sprintf(`queries didn't return congruent values
+	queryA = [%s]
+	queryB = [%s],
+	recent values
+	%s`, queryA, queryB, failedValues)
+		})
+}
diff --git a/cnf-tests/testsuites/e2esuite/test_suite_test.go b/cnf-tests/testsuites/e2esuite/test_suite_test.go
index 7ba94fd2a9..4adac35621 100644
--- a/cnf-tests/testsuites/e2esuite/test_suite_test.go
+++ b/cnf-tests/testsuites/e2esuite/test_suite_test.go
@@ -24,6 +24,7 @@ import (
 	_ "github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/e2esuite/bond" // this is needed otherwise the bond test won't be executed
 	_ "github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/e2esuite/dpdk" // this is needed otherwise the dpdk test won't be executed
 	_ "github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/e2esuite/fec" // this is needed otherwise the fec test won't be executed
+	_ "github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/e2esuite/metrics" // this is needed otherwise the metrics test won't be executed
 	_ "github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/e2esuite/multinetworkpolicy" // this is needed otherwise the multinetworkpolicy test won't be executed'
 	_ "github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/e2esuite/ovs_qos" // this is needed otherwise the ovs_qos test won't be executed
 	_ "github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/e2esuite/s2i" // this is needed otherwise the dpdk test won't be executed
diff --git a/cnf-tests/testsuites/pkg/utils/reporter.go b/cnf-tests/testsuites/pkg/utils/reporter.go
index 8201aa374f..313b7c30b3 100644
--- a/cnf-tests/testsuites/pkg/utils/reporter.go
+++ b/cnf-tests/testsuites/pkg/utils/reporter.go
@@ -3,6 +3,7 @@ package utils
 import (
 	"errors"
 	"os"
+	"strings"
 
 	gkopv1alpha "github.com/gatekeeper/gatekeeper-operator/api/v1alpha1"
 	sriovv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
@@ -145,7 +146,15 @@ func NewReporter(reportPath string) (*k8sreporter.KubernetesReporter, error) {
 
 	namespaceToLog := func(ns string) bool {
 		_, found := namespacesToDump[ns]
-		return found
+		if found {
+			return true
+		}
+
+		if strings.HasPrefix(ns, "test-") {
+			return true
+		}
+
+		return false
 	}
 
 	err := os.Mkdir(reportPath, 0755)