Skip to content

Commit

Permalink
Support sriov-network-metrics-exporter
Browse files Browse the repository at this point in the history
Deploy `sriov-network-metrics-exporter` DaemonSet and related
configuration. The feature is activated by the feature gate
`metricsExporter`.

Add deployment logic to the SriovOperatorConfig reconcile loop.

The operator's environment variable `SRIOV_NETWORK_METRICS_EXPORTER_IMAGE`
controls the exporter image to deploy. Update the helm charts with
`.Values.images.metricsExporter`, which has the same semantics.

Signed-off-by: Andrea Panattoni <[email protected]>
  • Loading branch information
zeeke committed Apr 8, 2024
1 parent abf7c98 commit 9e5ae54
Show file tree
Hide file tree
Showing 15 changed files with 429 additions and 12 deletions.
12 changes: 12 additions & 0 deletions bindata/manifests/metrics-exporter/metrics-config-map.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# ConfigMap consumed by the metrics exporter DaemonSet, which mounts it at
# /etc/sriov-network-metrics-exporter. It ships a single file, drivers.yaml.
# NOTE(review): the entries are presumably the NIC drivers (and minimum
# versions) the exporter accepts -- confirm against the exporter documentation.
apiVersion: v1
kind: ConfigMap
metadata:
  name: sriov-network-metrics-exporter-config
  namespace: {{.Namespace}}
data:
  drivers.yaml: |-
    drivers:
    - name: ice
      version: 1.0.0
    - name: mlx5_core
      version: 1.0.0
122 changes: 122 additions & 0 deletions bindata/manifests/metrics-exporter/metrics-daemonset.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
# DaemonSet running the SR-IOV network metrics exporter on worker nodes.
#
# Each pod has two containers sharing the host network:
#   * metrics-exporter: serves plain HTTP on 127.0.0.1:<MetricsExporterPort>.
#   * kube-rbac-proxy:  listens with TLS on <HOST_IP>:<MetricsExporterPort> and
#                       forwards to the exporter on the loopback address.
# The TLS key pair is read from the secret named by MetricsExporterSecretName.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  labels:
    app: sriov-network-metrics-exporter
  name: sriov-network-metrics-exporter
  namespace: {{.Namespace}}
spec:
  selector:
    matchLabels:
      app: sriov-network-metrics-exporter
  template:
    metadata:
      labels:
        app: sriov-network-metrics-exporter
    spec:
      hostNetwork: true
      # NOTE(review): metrics-rbac.yaml creates ServiceAccount
      # "metrics-exporter-sa" and binds the hostaccess SCC and the
      # tokenreviews/subjectaccessreviews ClusterRole to it, yet the pod runs
      # as "sriov-network-config-daemon". Confirm which account is intended.
      serviceAccountName: sriov-network-config-daemon
      {{- if .ImagePullSecrets }}
      imagePullSecrets:
      {{- range .ImagePullSecrets }}
      - name: {{ . }}
      {{- end }}
      {{- end }}
      containers:
      - args:
        # Loopback only: external access goes through kube-rbac-proxy below.
        - --web.listen-address=127.0.0.1:{{.MetricsExporterPort}}
        - --path.kubecgroup=/sys/fs/cgroup
        - --path.sysbuspci=/host/sys/bus/pci/devices/
        - --path.sysclassnet=/host/sys/class/net/
        - --path.cpucheckpoint=/host/cpu_manager_state
        - --path.kubeletsocket=/host/kubelet.sock
        - --collector.kubepoddevice=true
        - --collector.vfstatspriority=sysfs,netlink
        image: {{.Image}}
        imagePullPolicy: IfNotPresent
        name: metrics-exporter
        resources:
          requests:
            memory: 100Mi
            cpu: 100m
        securityContext:
          capabilities:
            drop:
            - ALL
          readOnlyRootFilesystem: true
          allowPrivilegeEscalation: false
        volumeMounts:
        - mountPath: /host/kubelet.sock
          name: kubeletsocket
        - mountPath: /host/sys/bus/pci/devices
          name: sysbuspcidevices
          readOnly: true
        - mountPath: /host/sys/devices
          name: sysdevices
          readOnly: true
        - mountPath: /host/sys/class/net
          name: sysclassnet
          readOnly: true
        - mountPath: /host/cpu_manager_state
          name: cpucheckpoint
          readOnly: true
        - name: sriov-network-metrics-exporter-config
          mountPath: /etc/sriov-network-metrics-exporter
      - name: kube-rbac-proxy
        image: '{{.KubeRbacProxyImage}}'
        imagePullPolicy: IfNotPresent
        args:
        - --logtostderr
        # Same port as the exporter, but bound to the node IP instead of loopback.
        - --secure-listen-address=[$(HOST_IP)]:{{.MetricsExporterPort}}
        - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256
        - --upstream=http://127.0.0.1:{{.MetricsExporterPort}}/
        - --tls-private-key-file=/etc/metrics/tls.key
        - --tls-cert-file=/etc/metrics/tls.crt
        ports:
        - containerPort: {{.MetricsExporterPort}}
          name: https-metrics
        env:
        - name: HOST_IP
          valueFrom:
            fieldRef:
              fieldPath: status.hostIP
        resources:
          requests:
            cpu: 10m
            memory: 20Mi
        volumeMounts:
        - name: metrics-certs
          mountPath: /etc/metrics
          readOnly: true
      nodeSelector:
        kubernetes.io/os: linux
        node-role.kubernetes.io/worker: ""
      restartPolicy: Always
      volumes:
      - hostPath:
          path: /var/lib/kubelet/pod-resources/kubelet.sock
          type: "Socket"
        name: kubeletsocket
      - hostPath:
          path: /var/lib/kubelet/cpu_manager_state
          type: "File"
        name: cpucheckpoint
      - hostPath:
          path: /sys/class/net
          type: "Directory"
        name: sysclassnet
      - hostPath:
          path: /sys/bus/pci/devices
          type: "Directory"
        name: sysbuspcidevices
      - hostPath:
          path: /sys/devices
          type: "Directory"
        name: sysdevices
      - name: sriov-network-metrics-exporter-config
        configMap:
          name: sriov-network-metrics-exporter-config
      - name: metrics-certs
        secret:
          defaultMode: 420
          secretName: {{ .MetricsExporterSecretName }}
66 changes: 66 additions & 0 deletions bindata/manifests/metrics-exporter/metrics-rbac.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
---
# Service account for the metrics exporter, plus the RBAC objects bound to it.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: metrics-exporter-sa
  namespace: {{.Namespace}}
---
# Namespaced role allowing use of the OpenShift "hostaccess" SCC.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: metrics-exporter-role
  namespace: {{.Namespace}}
rules:
- apiGroups:
  - security.openshift.io
  resourceNames:
  - hostaccess
  resources:
  - securitycontextconstraints
  verbs:
  - use
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: metrics-exporter-rb
  namespace: {{.Namespace}}
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: metrics-exporter-role
subjects:
- kind: ServiceAccount
  name: metrics-exporter-sa
  namespace: {{.Namespace}}
---
# Cluster-wide permission to create TokenReviews and SubjectAccessReviews.
# NOTE(review): presumably required by the kube-rbac-proxy sidecar to
# authenticate and authorize scrape requests -- confirm.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: sriov-metrics-kube-rbac-role
rules:
- apiGroups:
  - authentication.k8s.io
  resources:
  - tokenreviews
  verbs:
  - create
- apiGroups:
  - authorization.k8s.io
  resources:
  - subjectaccessreviews
  verbs:
  - create
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: sriov-metrics-kube-rbac-rolebinding
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: sriov-metrics-kube-rbac-role
subjects:
- kind: ServiceAccount
  name: metrics-exporter-sa
  namespace: {{.Namespace}}
18 changes: 18 additions & 0 deletions bindata/manifests/metrics-exporter/metrics-service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Service selecting the metrics exporter pods (app=sriov-network-metrics-exporter)
# and exposing them on MetricsExporterPort. The serving-cert annotation names
# the secret given by MetricsExporterSecretName, which is the same secret the
# DaemonSet mounts for kube-rbac-proxy.
apiVersion: v1
kind: Service
metadata:
  name: sriov-network-metrics-exporter-service
  namespace: {{.Namespace}}
  annotations:
    prometheus.io/target: "true"
    service.beta.openshift.io/serving-cert-secret-name: {{ .MetricsExporterSecretName }}
  labels:
    name: sriov-network-metrics-exporter-service
spec:
  selector:
    app: sriov-network-metrics-exporter
  ports:
  - protocol: TCP
    name: sriov-network-metrics
    port: {{ .MetricsExporterPort }}
    targetPort: {{ .MetricsExporterPort }}
84 changes: 74 additions & 10 deletions controllers/sriovoperatorconfig_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,10 @@ func (r *SriovOperatorConfigReconciler) Reconcile(ctx context.Context, req ctrl.
return reconcile.Result{}, err
}

if err = r.syncMetricsExporter(ctx, defaultConfig); err != nil {
return reconcile.Result{}, err
}

// For Openshift we need to create the systemd files using a machine config
if vars.ClusterType == consts.ClusterTypeOpenshift {
// TODO: add support for hypershift as today there is no MCO on hypershift clusters
Expand Down Expand Up @@ -196,27 +200,87 @@ func (r *SriovOperatorConfigReconciler) syncConfigDaemonSet(ctx context.Context,
}
// Sync DaemonSets
for _, obj := range objs {
if obj.GetKind() == "DaemonSet" && len(dc.Spec.ConfigDaemonNodeSelector) > 0 {
scheme := kscheme.Scheme
ds := &appsv1.DaemonSet{}
err = scheme.Convert(obj, ds, nil)
err = updateDaemonsetNodeSelector(obj, dc.Spec.ConfigDaemonNodeSelector)
if err != nil {
return err
}

err = r.syncK8sResource(ctx, dc, obj)
if err != nil {
logger.Error(err, "Couldn't sync SR-IoV daemons objects")
return err
}
}
return nil
}

// updateDaemonsetNodeSelector overrides the pod template node selector of a
// rendered DaemonSet with nodeSelector.
//
// obj is returned untouched when its kind is not "DaemonSet" or when
// nodeSelector is empty. Otherwise obj is converted to a typed DaemonSet, its
// Spec.Template.Spec.NodeSelector is replaced (not merged) with nodeSelector,
// and the result is converted back into obj in place.
//
// Conversion failures are returned wrapped with %w so callers can still reach
// the underlying cause through errors.Is / errors.As.
func updateDaemonsetNodeSelector(obj *uns.Unstructured, nodeSelector map[string]string) error {
	if obj.GetKind() != "DaemonSet" || len(nodeSelector) == 0 {
		return nil
	}

	scheme := kscheme.Scheme
	ds := &appsv1.DaemonSet{}
	if err := scheme.Convert(obj, ds, nil); err != nil {
		return fmt.Errorf("failed to convert Unstructured [%s] to DaemonSet: %w", obj.GetName(), err)
	}

	// Replaces whatever selector the manifest declared; nothing is merged.
	ds.Spec.Template.Spec.NodeSelector = nodeSelector

	if err := scheme.Convert(ds, obj, nil); err != nil {
		return fmt.Errorf("failed to convert DaemonSet [%s] to Unstructured: %w", obj.GetName(), err)
	}
	return nil
}

// syncMetricsExporter renders the manifests found under
// consts.MetricsExporterPath and either applies or removes them depending on
// the consts.MetricsExporterFeatureGate entry of dc.Spec.FeatureGates.
//
// Template data comes from vars.Namespace, GetImagePullSecrets() and the
// environment variables SRIOV_NETWORK_METRICS_EXPORTER_IMAGE,
// METRICS_EXPORTER_SECRET_NAME, METRICS_EXPORTER_PORT and KUBE_RBAC_PROXY_IMAGE.
//
// When the gate is present and true, every rendered object is passed through
// updateDaemonsetNodeSelector (with dc.Spec.ConfigDaemonNodeSelector) and then
// r.syncK8sResource. Otherwise every rendered object is handed to
// r.deleteK8sResource. The first error encountered is returned.
//
// NOTE(review): this text was captured from a rendered diff. Some lines below
// look like removed-side lines from the old syncConfigDaemonSet body that got
// interleaved here (they use `ds`, `scheme` and `obj` where none is in scope).
// Verify against the real file before relying on this listing.
func (r *SriovOperatorConfigReconciler) syncMetricsExporter(ctx context.Context, dc *sriovnetworkv1.SriovOperatorConfig) error {
logger := log.Log.WithName("syncMetricsExporter")
logger.V(1).Info("Start to sync metrics exporter")

// Values substituted into the manifest templates.
data := render.MakeRenderData()
data.Data["Image"] = os.Getenv("SRIOV_NETWORK_METRICS_EXPORTER_IMAGE")
data.Data["Namespace"] = vars.Namespace
data.Data["ImagePullSecrets"] = GetImagePullSecrets()
data.Data["MetricsExporterSecretName"] = os.Getenv("METRICS_EXPORTER_SECRET_NAME")
data.Data["MetricsExporterPort"] = os.Getenv("METRICS_EXPORTER_PORT")
data.Data["KubeRbacProxyImage"] = os.Getenv("KUBE_RBAC_PROXY_IMAGE")

// Manifests are rendered regardless of the gate: the delete path below needs
// the same object list.
objs, err := render.RenderDir(consts.MetricsExporterPath, &data)
if err != nil {
logger.Error(err, "Fail to render metrics exporter manifests")
return err
}

// A missing key is treated the same as an explicit false.
deployMetricsExporter, ok := dc.Spec.FeatureGates[consts.MetricsExporterFeatureGate]
if ok && deployMetricsExporter {
for _, obj := range objs {
err = updateDaemonsetNodeSelector(obj, dc.Spec.ConfigDaemonNodeSelector)
if err != nil {
logger.Error(err, "Fail to convert to DaemonSet")
return err
}
// NOTE(review): the next two lines look like diff residue (ds/scheme undefined here).
ds.Spec.Template.Spec.NodeSelector = dc.Spec.ConfigDaemonNodeSelector
err = scheme.Convert(ds, obj, nil)

err = r.syncK8sResource(ctx, dc, obj)
if err != nil {
// NOTE(review): two consecutive log calls; the first looks like diff residue.
logger.Error(err, "Fail to convert to Unstructured")
logger.Error(err, "Couldn't sync metrics exporter objects")
return err
}
}
// NOTE(review): obj is out of scope after the loop; likely diff residue.
err = r.syncK8sResource(ctx, dc, obj)
return nil
}

// Gate absent or false: remove everything that would have been deployed.
for _, obj := range objs {
err = r.deleteK8sResource(ctx, obj)
if err != nil {
logger.Error(err, "Couldn't sync SR-IoV daemons objects")
return err
}
}

return nil
}

Expand Down
30 changes: 30 additions & 0 deletions controllers/sriovoperatorconfig_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (

admv1 "k8s.io/api/admissionregistration/v1"
appsv1 "k8s.io/api/apps/v1"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/wait"
Expand Down Expand Up @@ -325,5 +326,34 @@ var _ = Describe("SriovOperatorConfig controller", Ordered, func() {
})
Expect(err).ToNot(HaveOccurred())
})
// Verifies that enabling the metricsExporter feature gate on the default
// SriovOperatorConfig makes the operator create the exporter DaemonSet,
// ConfigMap and Service, and that the DaemonSet is absent beforehand.
It("should deploy the metrics-exporter when the feature gate is enabled", func() {
	config := &sriovnetworkv1.SriovOperatorConfig{}
	Expect(k8sClient.Get(ctx, types.NamespacedName{Namespace: testNamespace, Name: "default"}, config)).NotTo(HaveOccurred())

	// Precondition: the DaemonSet must not exist yet. Look it up under the
	// same name the manifest uses and that is awaited below; checking any
	// other name would always report NotFound and prove nothing.
	daemonSet := &appsv1.DaemonSet{}
	err := k8sClient.Get(ctx, types.NamespacedName{Name: "sriov-network-metrics-exporter", Namespace: testNamespace}, daemonSet)
	Expect(err).To(HaveOccurred())
	Expect(errors.IsNotFound(err)).To(BeTrue())

	config.Spec.FeatureGates = map[string]bool{constants.MetricsExporterFeatureGate: true}
	err = k8sClient.Update(ctx, config)
	Expect(err).NotTo(HaveOccurred())

	// Switch the gate back off so later specs start from the default state.
	DeferCleanup(func() {
		config.Spec.FeatureGates = map[string]bool{}
		err = k8sClient.Update(ctx, config)
		Expect(err).NotTo(HaveOccurred())
	})

	err = util.WaitForNamespacedObject(&appsv1.DaemonSet{}, k8sClient, testNamespace, "sriov-network-metrics-exporter", util.RetryInterval, util.APITimeout)
	Expect(err).NotTo(HaveOccurred())

	err = util.WaitForNamespacedObject(&v1.ConfigMap{}, k8sClient, testNamespace, "sriov-network-metrics-exporter-config", util.RetryInterval, util.APITimeout)
	Expect(err).NotTo(HaveOccurred())

	err = util.WaitForNamespacedObject(&v1.Service{}, k8sClient, testNamespace, "sriov-network-metrics-exporter-service", util.RetryInterval, util.APITimeout)
	Expect(err).ToNot(HaveOccurred())
})

})
})
8 changes: 8 additions & 0 deletions controllers/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,14 @@ var _ = BeforeSuite(func() {
Expect(err).NotTo(HaveOccurred())
err = os.Setenv("OPERATOR_NAME", "sriov-network-operator")
Expect(err).NotTo(HaveOccurred())
err = os.Setenv("SRIOV_NETWORK_METRICS_EXPORTER_IMAGE", "mock-image")
Expect(err).NotTo(HaveOccurred())
err = os.Setenv("METRICS_EXPORTER_SECRET_NAME", "metrics-exporter-cert")
Expect(err).NotTo(HaveOccurred())
err = os.Setenv("METRICS_EXPORTER_PORT", "9110")
Expect(err).NotTo(HaveOccurred())
err = os.Setenv("KUBE_RBAC_PROXY_IMAGE", "mock-image")
Expect(err).NotTo(HaveOccurred())

By("bootstrapping test environment")
testEnv = &envtest.Environment{
Expand Down
Loading

0 comments on commit 9e5ae54

Please sign in to comment.