[RHOAIENG-15141] Fix failing E2E tests (#473)

There were two E2E tests in failure: --- FAIL: TestE2ENotebookController/update/thoth-minimal-oauth-notebook/Notebook_Statefulset_Validation_After_Update (60.28s) --- FAIL: TestE2ENotebookController/update/thoth-minimal-oauth-notebook/Verify_Notebook_Traffic_After_Update (10.75s) The reason was that after the culling test, the culling configuration is reverted, but the Notebook CR isn't updated and the annotation that was added by the notebook controller: `kubeflow-resource-stopped` stays in that CR and as such, the workbench instance isn't started back. This change deletes this annotation at the end of the culler test and the workbench is up and running and ready for the followup tests. Another issue there was that the update test updated the workbench to a non-existent image link, which resulted in a ImagePullBackOff Error in the end. This change updates this link to some existing image.
opendatahub-io · Nov 27, 2024 · fc5ffd4 · fc5ffd4
1 parent bb2dd26
commit fc5ffd4
Show file tree

Hide file tree

Showing 3 changed files with 48 additions and 24 deletions.
diff --git a/components/odh-notebook-controller/e2e/helper_test.go b/components/odh-notebook-controller/e2e/helper_test.go
@@ -3,7 +3,6 @@ package e2e
 import (
 	"crypto/tls"
 	"fmt"
-	netv1 "k8s.io/api/networking/v1"
 	"log"
 	"net/http"
 	"time"
@@ -12,9 +11,11 @@ import (
 	routev1 "github.com/openshift/api/route/v1"
 	appsv1 "k8s.io/api/apps/v1"
 	v1 "k8s.io/api/core/v1"
+	netv1 "k8s.io/api/networking/v1"
 	"k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/apimachinery/pkg/util/intstr"
 	"k8s.io/apimachinery/pkg/util/wait"
 	"sigs.k8s.io/controller-runtime/pkg/client"
@@ -130,18 +131,57 @@ func (tc *testContext) rolloutDeployment(depMeta metav1.ObjectMeta) error {
 	return nil
 }
 
-func (tc *testContext) revertCullingConfiguration(cmMeta metav1.ObjectMeta, depMeta metav1.ObjectMeta) {
+func (tc *testContext) waitForStatefulSet(nbMeta *metav1.ObjectMeta, availableReplicas int32, readyReplicas int32) error {
+	// Verify StatefulSet is running expected number of replicas
+	err := wait.Poll(tc.resourceRetryInterval, tc.resourceCreationTimeout, func() (done bool, err error) {
+		notebookStatefulSet, err1 := tc.kubeClient.AppsV1().StatefulSets(tc.testNamespace).Get(tc.ctx,
+			nbMeta.Name, metav1.GetOptions{})
+
+		if err1 != nil {
+			if errors.IsNotFound(err1) {
+				return false, nil
+			} else {
+				log.Printf("Failed to get %s statefulset", nbMeta.Name)
+				return false, err1
+			}
+		}
+		if notebookStatefulSet.Status.AvailableReplicas == availableReplicas &&
+			notebookStatefulSet.Status.ReadyReplicas == readyReplicas {
+			return true, nil
+		}
+		return false, nil
+	})
+	return err
+}
+
+func (tc *testContext) revertCullingConfiguration(cmMeta metav1.ObjectMeta, depMeta metav1.ObjectMeta, nbMeta *metav1.ObjectMeta) {
 	// Delete the culling configuration Configmap once the test is completed
 	err := tc.kubeClient.CoreV1().ConfigMaps(tc.testNamespace).Delete(tc.ctx,
 		cmMeta.Name, metav1.DeleteOptions{})
 	if err != nil {
-		log.Printf("error creating configmap notebook-controller-culler-config: %v ", err)
+		log.Printf("error deleting configmap notebook-controller-culler-config: %v ", err)
 	}
 	// Roll out the controller deployment
 	err = tc.rolloutDeployment(depMeta)
 	if err != nil {
 		log.Printf("error rolling out the deployment %v: %v ", depMeta.Name, err)
 	}
+
+	testNotebook := &nbv1.Notebook{
+		ObjectMeta: *nbMeta,
+	}
+	// The NBC added the annotation to stop the idle workbench: kubeflow-resource-stopped: '2024-11-26T17:20:42Z'
+	// To make the workbench running again, we need to also remove that annotation.
+	patch := client.RawPatch(types.JSONPatchType, []byte(`[{"op": "remove", "path": "/metadata/annotations/kubeflow-resource-stopped"}]`))
+
+	if err := tc.customClient.Patch(tc.ctx, testNotebook, patch); err != nil {
+		log.Printf("failed to patch Notebook CR removing the kubeflow-resource-stopped annotation: %v ", err)
+	}
+	// now we should wait for pod to start again
+	err = tc.waitForStatefulSet(nbMeta, 1, 1)
+	if err != nil {
+		log.Printf("notebook StatefulSet: %s isn't ready as expected: %s", nbMeta.Name, err)
+	}
 }
 
 func (tc *testContext) scaleDeployment(depMeta metav1.ObjectMeta, desiredReplicas int32) error {

diff --git a/components/odh-notebook-controller/e2e/notebook_creation_test.go b/components/odh-notebook-controller/e2e/notebook_creation_test.go
@@ -198,24 +198,7 @@ func (tc *testContext) ensureNetworkPolicyAllowingAccessToOnlyNotebookController
 
 func (tc *testContext) testNotebookValidation(nbMeta *metav1.ObjectMeta) error {
 	// Verify StatefulSet is running
-	err := wait.Poll(tc.resourceRetryInterval, tc.resourceCreationTimeout, func() (done bool, err error) {
-		notebookStatefulSet, err1 := tc.kubeClient.AppsV1().StatefulSets(tc.testNamespace).Get(tc.ctx,
-			nbMeta.Name, metav1.GetOptions{})
-
-		if err1 != nil {
-			if errors.IsNotFound(err1) {
-				return false, nil
-			} else {
-				log.Printf("Failed to get %s statefulset", nbMeta.Name)
-				return false, err1
-			}
-		}
-		if notebookStatefulSet.Status.AvailableReplicas == 1 &&
-			notebookStatefulSet.Status.ReadyReplicas == 1 {
-			return true, nil
-		}
-		return false, nil
-	})
+	err := tc.waitForStatefulSet(nbMeta, 1, 1)
 	if err != nil {
 		return fmt.Errorf("error validating notebook StatefulSet: %s", err)
 	}
@@ -283,7 +266,7 @@ func (tc *testContext) testNotebookCulling(nbMeta *metav1.ObjectMeta) error {
 		return fmt.Errorf("error getting deployment %v: %v", controllerDeployment.Name, err)
 	}
 
-	defer tc.revertCullingConfiguration(cullingConfigMap.ObjectMeta, controllerDeployment.ObjectMeta)
+	defer tc.revertCullingConfiguration(cullingConfigMap.ObjectMeta, controllerDeployment.ObjectMeta, nbMeta)
 
 	err = tc.rolloutDeployment(controllerDeployment.ObjectMeta)
 	if err != nil {

diff --git a/components/odh-notebook-controller/e2e/notebook_update_test.go b/components/odh-notebook-controller/e2e/notebook_update_test.go
@@ -65,7 +65,8 @@ func (tc *testContext) testNotebookUpdate(nbContext notebookContext) error {
 	}
 
 	// Example update: Change the Notebook image
-	updatedNotebook.Spec.Template.Spec.Containers[0].Image = "new-image:latest"
+	newImage := "quay.io/opendatahub/workbench-images:jupyter-minimal-ubi9-python-3.11-20241119-3ceb400"
+	updatedNotebook.Spec.Template.Spec.Containers[0].Image = newImage
 
 	err = tc.customClient.Update(tc.ctx, updatedNotebook)
 	if err != nil {
@@ -79,7 +80,7 @@ func (tc *testContext) testNotebookUpdate(nbContext notebookContext) error {
 		if err != nil {
 			return false, err
 		}
-		if note.Spec.Template.Spec.Containers[0].Image == "new-image:latest" {
+		if note.Spec.Template.Spec.Containers[0].Image == newImage {
 			return true, nil
 		}
 		return false, nil