Skip to content

Commit

Permalink
cnf-tests: Compare Multus and SR-IOV metrics
Browse files Browse the repository at this point in the history
Statistics that relate to Multus interfaces can be collected by
joining network-metrics-daemon [1] and cAdvisor [2] (see [3]).
The same information, for kernel netdevice SR-IOV interfaces, can be
collected via the `sriov-network-metrics-exporter` [4], which leverages
the Physical Function to get statistics about the Virtual Functions.

The proposed test case verifies that both sources produce congruent values.

Only TX statistics are verified, as receiving ones might be spoiled by
noise traffic on the wire (e.g. other nodes sending DHCP broadcast
requests).

[1] https://github.com/openshift/network-metrics-daemon
[2] https://github.com/google/cadvisor/blob/master/docs/storage/prometheus.md
[3] https://docs.openshift.com/container-platform/4.16/networking/associating-secondary-interfaces-metrics-to-network-attachments.html#cnf-associating-secondary-interfaces-metrics-with-network-name_secondary-interfaces-metrics
[4] https://github.com/k8snetworkplumbingwg/sriov-network-metrics-exporter

Signed-off-by: Andrea Panattoni <[email protected]>
  • Loading branch information
zeeke committed Sep 10, 2024
1 parent 00f4750 commit 663753e
Show file tree
Hide file tree
Showing 3 changed files with 279 additions and 1 deletion.
268 changes: 268 additions & 0 deletions cnf-tests/testsuites/e2esuite/metrics/sriovnetworkmetricsexporter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,268 @@
package metrics

import (
"context"
"encoding/json"
"fmt"
"net/url"
"time"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

sriovtestclient "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/client"
sriovcluster "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/cluster"
sriovnamespaces "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/namespaces"
sriovnetwork "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/network"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/ptr"

"github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/pkg/client"
"github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/pkg/discovery"
"github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/pkg/images"
"github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/pkg/namespaces"
"github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/pkg/networks"
"github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/pkg/pods"

"github.com/prometheus/common/model"
)

// testNamespace is the namespace this suite creates and in which it places
// its test pods and the SR-IOV network attachment.
const testNamespace string = "test-sriov-metrics"

// sriovclient is the SR-IOV test client shared by every helper in this
// package. It is assigned once, in init.
var sriovclient *sriovtestclient.ClientSet

// init builds the shared SR-IOV client when the package is loaded, passing an
// empty string to sriovtestclient.New.
// NOTE(review): presumably the empty argument selects the default kubeconfig
// lookup — confirm against sriovtestclient.New.
func init() {
sriovclient = sriovtestclient.New("")
}

// This suite checks that the TX counters exposed by the SR-IOV network
// metrics exporter (sriov_vf_tx_*) match the ones obtained by joining the
// container_network_transmit_* metrics with pod_network_name_info, for two
// pods that exchange traffic over an SR-IOV netdevice secondary interface.
var _ = Describe("[sriov] SR-IOV Network Metrics Exporter", func() {

	// sriovCapableNodes is re-discovered before every spec by BeforeEach.
	var sriovCapableNodes *sriovcluster.EnabledNodes

	BeforeEach(func() {
		if discovery.Enabled() {
			Skip("Discovery mode not supported")
		}

		// The returned function puts the operator feature gates back to the
		// values they had before the spec started.
		restoreFeatureGates := enableMetricsExporterFeatureGate()
		DeferCleanup(restoreFeatureGates)

		By("Adding monitoring label to " + namespaces.SRIOVOperator)
		err := sriovnamespaces.AddLabel(sriovclient, context.Background(), namespaces.SRIOVOperator, "openshift.io/cluster-monitoring", "true")
		Expect(err).ToNot(HaveOccurred())

		By("Clean SRIOV policies and networks")
		networks.CleanSriov(sriovclient)

		By("Discover SRIOV devices")
		sriovCapableNodes, err = sriovcluster.DiscoverSriov(sriovclient, namespaces.SRIOVOperator)
		Expect(err).ToNot(HaveOccurred())

		err = namespaces.Create(testNamespace, client.Client)
		Expect(err).ToNot(HaveOccurred())
		namespaces.CleanPods(testNamespace, client.Client)
	})

	It("should provide the same metrics as network-metrics-daemon", func() {
		testNode, testDevice, err := sriovCapableNodes.FindOneSriovNodeAndDevice()
		Expect(err).ToNot(HaveOccurred())
		By("Using device " + testDevice.Name + " on node " + testNode)

		sriovNetworkNodePolicy, err := sriovnetwork.CreateSriovPolicy(
			sriovclient, "test-metrics-", namespaces.SRIOVOperator,
			testDevice.Name, testNode, 8,
			"testsriovmetricsresource", "netdevice",
		)
		Expect(err).ToNot(HaveOccurred())
		DeferCleanup(sriovclient.Delete, context.Background(), sriovNetworkNodePolicy)

		ipam := `{ "type": "host-local", "subnet": "192.0.2.0/24" }`
		err = sriovnetwork.CreateSriovNetwork(sriovclient, testDevice, "test-metrics-network",
			testNamespace, namespaces.SRIOVOperator, "testsriovmetricsresource", ipam)
		Expect(err).ToNot(HaveOccurred())

		// makeClientAndServerNetcatPod returns the client first and the
		// server second; bind the results in that same order so each name
		// refers to the pod it describes.
		clientPod, serverPod := makeClientAndServerNetcatPod()

		// Do not verify pairs
		//   "container_network_receive_packets_total": "sriov_vf_rx_packets",
		//   "container_network_receive_bytes_total":   "sriov_vf_rx_bytes",
		// because there might be traffic on the wire that disturbs the counters.
		// An example is a DHCP traffic that other nodes are producing, e.g. (tcpdump):
		//
		// 13:28:00.442893 04:3f:72:fe:d1:d1 > ff:ff:ff:ff:ff:ff, ethertype IPv4 (0x0800), length 327: 0.0.0.0.68 > 255.255.255.255.67: BOOTP/DHCP, Request from 04:3f:72:fe:d1:d1, length 285
		metricsToMatch := map[string]string{
			"container_network_transmit_packets_total": "sriov_vf_tx_packets",
			"container_network_transmit_bytes_total":   "sriov_vf_tx_bytes",
		}

		// Both templates take the metric name and the pod name, in that order.
		// containerQuery restricts the container metric to the pod's "net1"
		// interface via pod_network_name_info; sriovQuery restricts the VF
		// metric to the pod via sriov_kubepoddevice, matching on pciAddr.
		containerQuery := `%s + on(namespace,pod,interface) group_left(network_name) (pod_network_name_info{interface="net1",pod="%s"})`
		sriovQuery := `%s * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice{pod="%s"}`

		for containerMetricName, sriovMetricName := range metricsToMatch {
			By(fmt.Sprintf("verifying metrics %s == %s", containerMetricName, sriovMetricName))
			assertPromQLHasTheSameResult(
				fmt.Sprintf(containerQuery, containerMetricName, serverPod.Name),
				fmt.Sprintf(sriovQuery, sriovMetricName, serverPod.Name),
			)

			assertPromQLHasTheSameResult(
				fmt.Sprintf(containerQuery, containerMetricName, clientPod.Name),
				fmt.Sprintf(sriovQuery, sriovMetricName, clientPod.Name),
			)
		}
	})
})

// makeClientAndServerNetcatPod creates and starts two pods attached to
// "test-metrics-network": a netcat server listening on port 5000 at
// 192.0.2.101, and a netcat client at 192.0.2.102 that runs the script built
// by makeNetcatClientCommand against the server address.
//
// The server is created and started before the client. The pods are returned
// in the order the function name states: client first, server second.
func makeClientAndServerNetcatPod() (*corev1.Pod, *corev1.Pod) {
	// Both containers differ only by name and command.
	netcatContainer := func(name string, command []string) corev1.Container {
		return corev1.Container{
			Name:            name,
			Image:           images.For(images.TestUtils),
			Command:         command,
			SecurityContext: &corev1.SecurityContext{Privileged: ptr.To(true)},
		}
	}

	server := pods.DefinePod(testNamespace)
	server.GenerateName = "testpod-nc-server-"
	server = pods.RedefinePodWithNetwork(server, `[{"name": "test-metrics-network","ips":["192.0.2.101/24"]}]`)
	server.Spec.Containers = append(
		server.Spec.Containers,
		netcatContainer("netcat-tcp-server", []string{"nc", "-vv", "--keep-open", "--listen", "5000"}),
	)
	server, err := pods.CreateAndStart(server)
	Expect(err).ToNot(HaveOccurred())

	netcatClient := pods.DefinePod(testNamespace)
	netcatClient.GenerateName = "testpod-nc-client-"
	netcatClient = pods.RedefinePodWithNetwork(netcatClient, `[{"name": "test-metrics-network","ips":["192.0.2.102/24"]}]`)
	netcatClient.Spec.Containers = append(
		netcatClient.Spec.Containers,
		netcatContainer("netcat-tcp-client", makeNetcatClientCommand("192.0.2.101 5000")),
	)
	netcatClient, err = pods.CreateAndStart(netcatClient)
	Expect(err).ToNot(HaveOccurred())

	return netcatClient, server
}

// makeNetcatClientCommand returns the container command for the netcat
// client pod: a bash script that waits 10 seconds, pipes the output of
// `printf %01000d 1` (the number 1 zero-padded to a width of 1000
// characters, i.e. 1000 bytes) into `nc -w 1 <target>`, and then sleeps
// forever.
//
// targetIpAddress is inserted verbatim as the arguments of nc, so it carries
// both the address and the port (e.g. "192.0.2.101 5000").
func makeNetcatClientCommand(targetIpAddress string) []string {
	// "%%" yields a literal "%" so that the width specifier reaches printf.
	const scriptFormat = `
sleep 10;
printf %%01000d 1 | nc -w 1 %s;
sleep inf
`
	shellScript := fmt.Sprintf(scriptFormat, targetIpAddress)

	return []string{"bash", "-xec", shellScript}
}

// runPromQLQuery evaluates query as a PromQL instant query and returns the
// resulting vector.
//
// The query is not sent from the test process: it is executed with curl
// inside the first pod, in any namespace, labelled
// "app.kubernetes.io/component=prometheus", against
// localhost:9090/api/v1/query.
//
// An error is returned when no such pod can be listed or found, when the
// command execution fails, when the response is not valid JSON for the
// expected shape, or when the response status is not "success".
func runPromQLQuery(query string) (model.Vector, error) {
	prometheusPods, err := client.Client.Pods("").List(context.Background(), metav1.ListOptions{
		LabelSelector: "app.kubernetes.io/component=prometheus",
	})
	if err != nil {
		return nil, fmt.Errorf("can't find a Prometheus pod: %w", err)
	}

	if len(prometheusPods.Items) == 0 {
		return nil, fmt.Errorf("no instance of Prometheus found")
	}

	prometheusPod := prometheusPods.Items[0]

	// Named queryURL so the local does not shadow the imported net/url package.
	queryURL := fmt.Sprintf("localhost:9090/api/v1/query?%s", url.Values{"query": []string{query}}.Encode())
	command := []string{"curl", queryURL}
	outputBuffer, err := pods.ExecCommand(client.Client, prometheusPod, command)
	if err != nil {
		return nil, fmt.Errorf("promQL query : [%s/%s] command: [%v]\nout: %s\n%w",
			prometheusPod.Namespace, prometheusPod.Name, command, outputBuffer.String(), err)
	}

	result := struct {
		Status string `json:"status"`
		Data   struct {
			ResultType string       `json:"resultType"`
			Result     model.Vector `json:"result"`
		} `json:"data"`
	}{}

	// The decoding error must be captured here: checking the err left over
	// from ExecCommand (always nil at this point) would silently accept a
	// malformed response.
	if err := json.Unmarshal(outputBuffer.Bytes(), &result); err != nil {
		return nil, fmt.Errorf("can't unmarshal PromQL result: query[%s] response[%s] error: %w", query, outputBuffer.String(), err)
	}
	if result.Status != "success" {
		return nil, fmt.Errorf("PromQL statement failed: query[%s] result[%v]", query, result)
	}

	return result.Data.Result, nil
}

// enableMetricsExporterFeatureGate turns on the "metricsExporter" feature
// gate in the "default" SriovOperatorConfig and returns a function that
// restores the feature gates that were set before the change.
//
// When the gate is already enabled nothing is updated and the returned
// function does nothing. Any API error fails the running spec through Expect.
func enableMetricsExporterFeatureGate() func() {
	operatorConfig, err := sriovclient.SriovOperatorConfigs(namespaces.SRIOVOperator).Get(context.Background(), "default", metav1.GetOptions{})
	Expect(err).ToNot(HaveOccurred())

	// Snapshot the current gates so they can be put back later. Ranging over
	// a nil map is a no-op, which leaves the snapshot empty.
	previousGates := make(map[string]bool)
	for gate, enabled := range operatorConfig.Spec.FeatureGates {
		previousGates[gate] = enabled
	}

	// Reading from a nil map yields false, so this check is safe before the
	// map is allocated below.
	if operatorConfig.Spec.FeatureGates["metricsExporter"] {
		// The feature is already enabled: nothing to do
		return func() {}
	}

	if operatorConfig.Spec.FeatureGates == nil {
		operatorConfig.Spec.FeatureGates = make(map[string]bool)
	}

	By("Enabling metricsExporter feature gate")
	operatorConfig.Spec.FeatureGates["metricsExporter"] = true

	_, err = sriovclient.SriovOperatorConfigs(namespaces.SRIOVOperator).Update(context.Background(), operatorConfig, metav1.UpdateOptions{})
	Expect(err).ToNot(HaveOccurred())

	restore := func() {
		By("Resetting feature gate to its previous value")

		// Fetch the object again rather than reusing the one updated above.
		currentConfig, getErr := sriovclient.SriovOperatorConfigs(namespaces.SRIOVOperator).Get(context.Background(), "default", metav1.GetOptions{})
		Expect(getErr).ToNot(HaveOccurred())

		currentConfig.Spec.FeatureGates = previousGates
		_, updateErr := sriovclient.SriovOperatorConfigs(namespaces.SRIOVOperator).Update(context.Background(), currentConfig, metav1.UpdateOptions{})
		Expect(updateErr).ToNot(HaveOccurred())
	}

	return restore
}

// assertPromQLHasTheSameResult evaluates both PromQL queries and checks if both return the same value.
func assertPromQLHasTheSameResult(queryA, queryB string) {
failedValues := "time A - B\n "

Eventually(func(g Gomega) {
samplesA, errA := runPromQLQuery(queryA)
samplesB, errB := runPromQLQuery(queryB)

failedValues += fmt.Sprintf("%s %v - %v\n", time.Now().Format(time.StampMilli), samplesA, samplesB)

g.Expect(errA).ToNot(HaveOccurred())
g.Expect(samplesA).To(HaveLen(1), "queryA[%s]", queryA)
valueA := float64(samplesA[0].Value)

g.Expect(errB).ToNot(HaveOccurred())
g.Expect(samplesB).To(HaveLen(1), "queryB[%s]", queryB)
valueB := float64(samplesB[0].Value)

g.Expect(valueA).To(
Equal(valueB),
"queries returned different values:\nqueryA[%s]=%f\nqueryB[%s]=%f",
queryA, valueA, queryB, valueB,
)
}).
WithPolling(1*time.Second).
WithTimeout(2*time.Minute).
WithOffset(1).
Should(Succeed(), func() string {
return fmt.Sprintf(`queries didn't return congruent values
queryA = [%s]
queryB = [%s],
recent values
%s`, queryA, queryB, failedValues)
})
}
1 change: 1 addition & 0 deletions cnf-tests/testsuites/e2esuite/test_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
_ "github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/e2esuite/bond" // this is needed otherwise the bond test won't be executed
_ "github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/e2esuite/dpdk" // this is needed otherwise the dpdk test won't be executed
_ "github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/e2esuite/fec" // this is needed otherwise the fec test won't be executed
_ "github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/e2esuite/metrics" // this is needed otherwise the metrics test won't be executed
_ "github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/e2esuite/multinetworkpolicy" // this is needed otherwise the multinetworkpolicy test won't be executed'
_ "github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/e2esuite/ovs_qos" // this is needed otherwise the ovs_qos test won't be executed
_ "github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/e2esuite/s2i" // this is needed otherwise the dpdk test won't be executed
Expand Down
11 changes: 10 additions & 1 deletion cnf-tests/testsuites/pkg/utils/reporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package utils
import (
"errors"
"os"
"strings"

gkopv1alpha "github.com/gatekeeper/gatekeeper-operator/api/v1alpha1"
sriovv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
Expand Down Expand Up @@ -145,7 +146,15 @@ func NewReporter(reportPath string) (*k8sreporter.KubernetesReporter, error) {

namespaceToLog := func(ns string) bool {
_, found := namespacesToDump[ns]
return found
if found {
return true
}

if strings.HasPrefix(ns, "test-") {
return true
}

return false
}

err := os.Mkdir(reportPath, 0755)
Expand Down

0 comments on commit 663753e

Please sign in to comment.