run-e2e-conformance-virtual-cluster.sh
#!/usr/bin/env bash
set -xeo pipefail
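
# This script provisions a kcli/libvirt virtual Kubernetes cluster whose worker VMs carry
# emulated igb NICs (so SR-IOV VFs can be exercised without real hardware), builds the
# sriov-network-operator images, pushes them to an in-cluster registry, deploys the operator
# and runs the e2e conformance suite. The knobs below (CLUSTER_VERSION, CLUSTER_NAME, API_IP,
# VIRTUAL_ROUTER_ID, NUM_OF_WORKERS) can all be overridden from the environment.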
cluster_version=${CLUSTER_VERSION:-1.29.3}
cluster_name=${CLUSTER_NAME:-virtual}
domain_name=$cluster_name.lab
api_ip=${API_IP:-192.168.124.250}
virtual_router_id=${VIRTUAL_ROUTER_ID:-250}
HOME="/root"
here="$(dirname "$(readlink --canonicalize "${BASH_SOURCE[0]}")")"
root="$(readlink --canonicalize "$here/..")"
NUM_OF_WORKERS=${NUM_OF_WORKERS:-2}
total_number_of_nodes=$((1 + NUM_OF_WORKERS))
## Global configuration
export NAMESPACE="sriov-network-operator"
export OPERATOR_NAMESPACE="sriov-network-operator"
export SKIP_VAR_SET=""
export OPERATOR_EXEC=kubectl
export CLUSTER_HAS_EMULATED_PF=TRUE
if [ "$NUM_OF_WORKERS" -lt 2 ]; then
echo "Min number of workers is 2"
exit 1
fi
export MULTUS_NAMESPACE="kube-system"
source $here/run-e2e-conformance-common
check_requirements() {
  for cmd in kcli virsh virt-edit podman make go; do
    if ! command -v "$cmd" &> /dev/null; then
      echo "$cmd is not available"
      exit 1
    fi
  done
  return 0
}
echo "## checking requirements"
check_requirements
echo "## delete existing cluster name $cluster_name"
kcli delete cluster $cluster_name -y
kcli delete network $cluster_name -y
function cleanup {
  kcli delete cluster $cluster_name -y
  kcli delete network $cluster_name -y
}
if [ -z "$SKIP_DELETE" ]; then
  trap cleanup EXIT
fi
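
# Create the libvirt networks: "k8s" carries the cluster's primary traffic, while a second,
# DHCP-less network named after the cluster is attached to the workers for SR-IOV test traffic.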
kcli create network -c 192.168.124.0/24 k8s
kcli create network -c 192.168.${virtual_router_id}.0/24 --nodhcp -i $cluster_name
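
# kcli plan: one control plane plus NUM_OF_WORKERS workers. Each worker gets two emulated igb
# NICs (one per NUMA node) with vfio enabled so VFs can be created and bound on them.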
cat <<EOF > ./${cluster_name}-plan.yaml
version: $cluster_version
ctlplane_memory: 4096
worker_memory: 4096
pool: default
disk_size: 50
network: k8s
api_ip: $api_ip
virtual_router_id: $virtual_router_id
domain: $domain_name
ctlplanes: 1
workers: $NUM_OF_WORKERS
ingress: false
machine: q35
engine: crio
sdn: flannel
autolabeller: false
vmrules:
  - $cluster_name-worker-.*:
      nets:
        - name: k8s
          type: igb
          vfio: true
          noconf: true
          numa: 0
        - name: $cluster_name
          type: igb
          vfio: true
          noconf: true
          numa: 1
      numcpus: 6
      numa:
        - id: 0
          vcpus: 0,2,4
          memory: 2048
        - id: 1
          vcpus: 1,3,5
          memory: 2048
EOF
kcli create cluster generic --paramfile ./${cluster_name}-plan.yaml $cluster_name
export KUBECONFIG=$HOME/.kcli/clusters/$cluster_name/auth/kubeconfig
export PATH=$PWD:$PATH
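
# Wait for every node (control plane + workers) to report Ready, up to MAX_ATTEMPTS * sleep_time seconds.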
ATTEMPTS=0
MAX_ATTEMPTS=72
ready=false
sleep_time=10
until $ready || [ $ATTEMPTS -eq $MAX_ATTEMPTS ]
do
  echo "waiting for cluster to be ready"
  if [ `kubectl get node | grep -w Ready | wc -l` == $total_number_of_nodes ]; then
    echo "cluster is ready"
    ready=true
  else
    echo "cluster is not ready yet"
    sleep $sleep_time
  fi
  ATTEMPTS=$((ATTEMPTS+1))
done
if ! $ready; then
  echo "Timed out waiting for cluster to be ready"
  kubectl get nodes
  exit 1
fi
function update_worker_labels() {
  echo "## label cluster workers as sriov capable"
  for ((num=0; num<NUM_OF_WORKERS; num++))
  do
    kubectl label node $cluster_name-worker-$num.$domain_name feature.node.kubernetes.io/network-sriov.capable=true --overwrite
  done
  echo "## label cluster worker as worker"
  for ((num=0; num<NUM_OF_WORKERS; num++))
  do
    kubectl label node $cluster_name-worker-$num.$domain_name node-role.kubernetes.io/worker= --overwrite
  done
}
update_worker_labels
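
# The control-plane node will host an insecure registry on port 5000; configure the local
# podman to trust it so the freshly built images can be pushed there.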
controller_ip=`kubectl get node -o wide | grep ctlp | awk '{print $6}'`
insecure_registry="[[registry]]
location = \"$controller_ip:5000\"
insecure = true
[aliases]
\"golang\" = \"docker.io/library/golang\"
"
cat << EOF > /etc/containers/registries.conf.d/003-${cluster_name}.conf
$insecure_registry
EOF
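
# Per-node tweaks: trust the internal registry, keep NetworkManager from configuring the igb VF
# interfaces, disable rx/tx checksum offload and GSO on eth1 (per the unit description, this
# works around a VF bug), and make sure br_netfilter is loaded on boot.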
function update_host() {
node_name=$1
kcli ssh $node_name << EOF
sudo su
echo '$insecure_registry' > /etc/containers/registries.conf.d/003-internal.conf
systemctl restart crio
echo '[connection]
id=multi
type=ethernet
[ethernet]
[match]
driver=igbvf;
[ipv4]
method=disabled
[ipv6]
addr-gen-mode=default
method=disabled
[proxy]' > /etc/NetworkManager/system-connections/multi.nmconnection
chmod 600 /etc/NetworkManager/system-connections/multi.nmconnection
echo '[Unit]
Description=disable checksum offload to avoid vf bug
After=network.target
[Service]
Type=oneshot
ExecStart=/usr/bin/bash -c "ethtool --offload eth1 rx off tx off && ethtool -K eth1 gso off"
StandardOutput=journal+console
StandardError=journal+console
[Install]
WantedBy=default.target' > /etc/systemd/system/disable-offload.service
systemctl daemon-reload
systemctl enable --now disable-offload
echo '[Unit]
Description=load br_netfilter
After=network.target
[Service]
Type=oneshot
ExecStart=/usr/bin/bash -c "modprobe br_netfilter"
StandardOutput=journal+console
StandardError=journal+console
[Install]
WantedBy=default.target' > /etc/systemd/system/load-br-netfilter.service
systemctl daemon-reload
systemctl enable --now load-br-netfilter
systemctl restart NetworkManager
EOF
}
update_host $cluster_name-ctlplane-0
for ((num=0; num<NUM_OF_WORKERS; num++))
do
  update_host $cluster_name-worker-$num
done
# remove the patch after multus bug is fixed
# https://github.com/k8snetworkplumbingwg/multus-cni/issues/1221
kubectl patch -n ${MULTUS_NAMESPACE} ds/kube-multus-ds --type=json -p='[{"op": "replace", "path": "/spec/template/spec/initContainers/0/command", "value":["cp", "-f","/usr/src/multus-cni/bin/multus-shim", "/host/opt/cni/bin/multus-shim"]}]'
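
# Deploy a plain Docker registry inside the cluster, backed by a local-path volume on the
# control-plane node, so the images built below can be pulled by every node.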
kubectl create namespace container-registry
echo "## deploy internal registry"
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolume
metadata:
  name: registry-pv
spec:
  capacity:
    storage: 60Gi
  volumeMode: Filesystem
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Delete
  storageClassName: registry-local-storage
  local:
    path: /mnt/
  nodeAffinity:
    required:
      nodeSelectorTerms:
        - matchExpressions:
            - key: kubernetes.io/hostname
              operator: In
              values:
                - ${cluster_name}-ctlplane-0.${domain_name}
EOF
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: registry-pv-claim
  namespace: container-registry
spec:
  accessModes:
    - ReadWriteOnce
  volumeMode: Filesystem
  resources:
    requests:
      storage: 60Gi
  storageClassName: registry-local-storage
EOF
cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
  name: registry
  namespace: container-registry
spec:
  replicas: 1
  selector:
    matchLabels:
      app: registry
  template:
    metadata:
      labels:
        app: registry
    spec:
      hostNetwork: true
      tolerations:
        - effect: NoSchedule
          key: node-role.kubernetes.io/control-plane
      containers:
        - image: quay.io/libpod/registry:2.8.2
          imagePullPolicy: Always
          name: registry
          volumeMounts:
            - name: data
              mountPath: /var/lib/registry
      volumes:
        - name: data
          persistentVolumeClaim:
            claimName: registry-pv-claim
      terminationGracePeriodSeconds: 10
EOF
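
# Build the operator, config-daemon and webhook images from this checkout and push them to the
# internal registry; the local copies are removed right after each push to save disk space.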
export SRIOV_NETWORK_OPERATOR_IMAGE="$controller_ip:5000/sriov-network-operator:latest"
export SRIOV_NETWORK_CONFIG_DAEMON_IMAGE="$controller_ip:5000/sriov-network-config-daemon:latest"
export SRIOV_NETWORK_WEBHOOK_IMAGE="$controller_ip:5000/sriov-network-operator-webhook:latest"
echo "## build operator image"
podman build -t "${SRIOV_NETWORK_OPERATOR_IMAGE}" -f "${root}/Dockerfile" "${root}"
echo "## build daemon image"
podman build -t "${SRIOV_NETWORK_CONFIG_DAEMON_IMAGE}" -f "${root}/Dockerfile.sriov-network-config-daemon" "${root}"
echo "## build webhook image"
podman build -t "${SRIOV_NETWORK_WEBHOOK_IMAGE}" -f "${root}/Dockerfile.webhook" "${root}"
podman push --tls-verify=false "${SRIOV_NETWORK_OPERATOR_IMAGE}"
podman rmi -fi ${SRIOV_NETWORK_OPERATOR_IMAGE}
podman push --tls-verify=false "${SRIOV_NETWORK_CONFIG_DAEMON_IMAGE}"
podman rmi -fi ${SRIOV_NETWORK_CONFIG_DAEMON_IMAGE}
podman push --tls-verify=false "${SRIOV_NETWORK_WEBHOOK_IMAGE}"
podman rmi -fi ${SRIOV_NETWORK_WEBHOOK_IMAGE}
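
# Optionally push locally pre-built images for the CNI, device plugin, resources injector and
# metrics exporter when the matching LOCAL_* variables are set (podman_tag_and_push is expected
# to come from the sourced run-e2e-conformance-common).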
if [[ -v LOCAL_SRIOV_CNI_IMAGE ]]; then
  export SRIOV_CNI_IMAGE="$controller_ip:5000/sriov-cni:latest"
  podman_tag_and_push ${LOCAL_SRIOV_CNI_IMAGE} ${SRIOV_CNI_IMAGE}
fi
if [[ -v LOCAL_SRIOV_DEVICE_PLUGIN_IMAGE ]]; then
  export SRIOV_DEVICE_PLUGIN_IMAGE="$controller_ip:5000/sriov-network-device-plugin:latest"
  podman_tag_and_push ${LOCAL_SRIOV_DEVICE_PLUGIN_IMAGE} ${SRIOV_DEVICE_PLUGIN_IMAGE}
fi
if [[ -v LOCAL_NETWORK_RESOURCES_INJECTOR_IMAGE ]]; then
  export NETWORK_RESOURCES_INJECTOR_IMAGE="$controller_ip:5000/network-resources-injector:latest"
  podman_tag_and_push ${LOCAL_NETWORK_RESOURCES_INJECTOR_IMAGE} ${NETWORK_RESOURCES_INJECTOR_IMAGE}
fi
if [[ -v LOCAL_SRIOV_NETWORK_METRICS_EXPORTER_IMAGE ]]; then
  export METRICS_EXPORTER_IMAGE="$controller_ip:5000/sriov-network-metrics-exporter:latest"
  podman_tag_and_push ${LOCAL_SRIOV_NETWORK_METRICS_EXPORTER_IMAGE} ${METRICS_EXPORTER_IMAGE}
fi
# remove the crio bridge and let flannel recreate it
kcli ssh $cluster_name-ctlplane-0 << EOF
sudo su
if [ $(ip a | grep 10.85.0 | wc -l) -eq 0 ]; then ip link del cni0; fi
EOF
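
# Recreate the multus and coredns pods now that the bridge has been removed, then wait for both to come back.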
kubectl -n ${MULTUS_NAMESPACE} get po | grep multus | awk '{print "kubectl -n kube-system delete po",$1}' | sh
kubectl -n kube-system get po | grep coredns | awk '{print "kubectl -n kube-system delete po",$1}' | sh
TIMEOUT=400
echo "## wait for coredns"
kubectl -n kube-system wait --for=condition=available deploy/coredns --timeout=${TIMEOUT}s
echo "## wait for multus"
kubectl -n ${MULTUS_NAMESPACE} wait --for=condition=ready -l name=multus pod --timeout=${TIMEOUT}s
echo "## deploy cert manager"
kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.12.0/cert-manager.yaml
echo "## wait for cert manager to be ready"
ATTEMPTS=0
MAX_ATTEMPTS=72
ready=false
sleep_time=5
until $ready || [ $ATTEMPTS -eq $MAX_ATTEMPTS ]
do
  echo "waiting for cert manager webhook to be ready"
  if [ `kubectl -n cert-manager get po | grep webhook | grep "1/1" | wc -l` == 1 ]; then
    echo "cert manager webhook is ready"
    ready=true
  else
    echo "cert manager webhook is not ready yet"
    sleep $sleep_time
  fi
  ATTEMPTS=$((ATTEMPTS+1))
done
# Deploy the sriov operator via helm
hack/deploy-operator-helm.sh
echo "## create certificates for webhook"
cat <<EOF | kubectl apply -f -
---
apiVersion: cert-manager.io/v1
kind: Issuer
metadata:
  name: selfsigned-issuer
  namespace: ${NAMESPACE}
spec:
  selfSigned: {}
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: network-resources-injector-cert
  namespace: ${NAMESPACE}
spec:
  commonName: network-resources-injector-service.svc
  dnsNames:
    - network-resources-injector-service.${NAMESPACE}.svc.cluster.local
    - network-resources-injector-service.${NAMESPACE}.svc
  issuerRef:
    kind: Issuer
    name: selfsigned-issuer
  secretName: network-resources-injector-cert
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: operator-webhook-cert
  namespace: ${NAMESPACE}
spec:
  commonName: operator-webhook-service.svc
  dnsNames:
    - operator-webhook-service.${NAMESPACE}.svc.cluster.local
    - operator-webhook-service.${NAMESPACE}.svc
  issuerRef:
    kind: Issuer
    name: selfsigned-issuer
  secretName: operator-webhook-cert
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: metrics-exporter-cert
  namespace: ${NAMESPACE}
spec:
  commonName: sriov-network-metrics-exporter-service.svc
  dnsNames:
    - sriov-network-metrics-exporter-service.${NAMESPACE}.svc.cluster.local
    - sriov-network-metrics-exporter-service.${NAMESPACE}.svc
  issuerRef:
    kind: Issuer
    name: selfsigned-issuer
  secretName: metrics-exporter-cert
EOF
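
# On any later failure, dump the operator and multus namespaces into the test report directory
# (only when TEST_REPORT_PATH is set).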
function cluster_info {
  if [[ -v TEST_REPORT_PATH ]]; then
    kubectl cluster-info dump --namespaces ${NAMESPACE},${MULTUS_NAMESPACE} --output-directory "${root}/${TEST_REPORT_PATH}/cluster-info"
  fi
}
trap cluster_info ERR
echo "## wait for sriov operator to be ready"
hack/deploy-wait.sh
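
# Finally run the conformance suite unless SKIP_TEST is set; JUnit output lands under
# TEST_REPORT_PATH when that variable is provided.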
if [ -z "$SKIP_TEST" ]; then
  echo "## run sriov e2e conformance tests"
  if [[ -v TEST_REPORT_PATH ]]; then
    export JUNIT_OUTPUT="${root}/${TEST_REPORT_PATH}/conformance-test-report"
  fi
  SUITE=./test/conformance hack/run-e2e-conformance.sh
fi