Skip to content

Commit

Permalink
[RHOAIENG-15141] Fix failing E2E tests (#473)
Browse files Browse the repository at this point in the history
There were two E2E tests in failure:
            --- FAIL: TestE2ENotebookController/update/thoth-minimal-oauth-notebook/Notebook_Statefulset_Validation_After_Update (60.28s)
            --- FAIL: TestE2ENotebookController/update/thoth-minimal-oauth-notebook/Verify_Notebook_Traffic_After_Update (10.75s)

The reason was that after the culling test, the culling configuration is
reverted, but the Notebook CR isn't updated and the annotation that was
added by the notebook controller: `kubeflow-resource-stopped` stays in
that CR and as such, the workbench instance isn't started back. This
change deletes this annotation at the end of the culler test and the
workbench is up and running and ready for the followup tests.

Another issue there was that the update test updated the workbench to a
non-existent image link, which resulted in a ImagePullBackOff Error in
the end. This change updates this link to some existing image.
  • Loading branch information
jstourac authored Nov 27, 2024
1 parent bb2dd26 commit fc5ffd4
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 24 deletions.
46 changes: 43 additions & 3 deletions components/odh-notebook-controller/e2e/helper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package e2e
import (
"crypto/tls"
"fmt"
netv1 "k8s.io/api/networking/v1"
"log"
"net/http"
"time"
Expand All @@ -12,9 +11,11 @@ import (
routev1 "github.com/openshift/api/route/v1"
appsv1 "k8s.io/api/apps/v1"
v1 "k8s.io/api/core/v1"
netv1 "k8s.io/api/networking/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/apimachinery/pkg/util/wait"
"sigs.k8s.io/controller-runtime/pkg/client"
Expand Down Expand Up @@ -130,18 +131,57 @@ func (tc *testContext) rolloutDeployment(depMeta metav1.ObjectMeta) error {
return nil
}

func (tc *testContext) revertCullingConfiguration(cmMeta metav1.ObjectMeta, depMeta metav1.ObjectMeta) {
func (tc *testContext) waitForStatefulSet(nbMeta *metav1.ObjectMeta, availableReplicas int32, readyReplicas int32) error {
// Verify StatefulSet is running expected number of replicas
err := wait.Poll(tc.resourceRetryInterval, tc.resourceCreationTimeout, func() (done bool, err error) {

Check failure on line 136 in components/odh-notebook-controller/e2e/helper_test.go

View workflow job for this annotation

GitHub Actions / golangci-lint (components/odh-notebook-controller)

SA1019: wait.Poll is deprecated: This method does not return errors from context, use PollUntilContextTimeout. Note that the new method will no longer return ErrWaitTimeout and instead return errors defined by the context package. Will be removed in a future release. (staticcheck)
notebookStatefulSet, err1 := tc.kubeClient.AppsV1().StatefulSets(tc.testNamespace).Get(tc.ctx,
nbMeta.Name, metav1.GetOptions{})

if err1 != nil {
if errors.IsNotFound(err1) {
return false, nil
} else {
log.Printf("Failed to get %s statefulset", nbMeta.Name)
return false, err1
}
}
if notebookStatefulSet.Status.AvailableReplicas == availableReplicas &&
notebookStatefulSet.Status.ReadyReplicas == readyReplicas {
return true, nil
}
return false, nil
})
return err
}

func (tc *testContext) revertCullingConfiguration(cmMeta metav1.ObjectMeta, depMeta metav1.ObjectMeta, nbMeta *metav1.ObjectMeta) {
// Delete the culling configuration Configmap once the test is completed
err := tc.kubeClient.CoreV1().ConfigMaps(tc.testNamespace).Delete(tc.ctx,
cmMeta.Name, metav1.DeleteOptions{})
if err != nil {
log.Printf("error creating configmap notebook-controller-culler-config: %v ", err)
log.Printf("error deleting configmap notebook-controller-culler-config: %v ", err)
}
// Roll out the controller deployment
err = tc.rolloutDeployment(depMeta)
if err != nil {
log.Printf("error rolling out the deployment %v: %v ", depMeta.Name, err)
}

testNotebook := &nbv1.Notebook{
ObjectMeta: *nbMeta,
}
// The NBC added the annotation to stop the idle workbench: kubeflow-resource-stopped: '2024-11-26T17:20:42Z'
// To make the workbench running again, we need to also remove that annotation.
patch := client.RawPatch(types.JSONPatchType, []byte(`[{"op": "remove", "path": "/metadata/annotations/kubeflow-resource-stopped"}]`))

if err := tc.customClient.Patch(tc.ctx, testNotebook, patch); err != nil {
log.Printf("failed to patch Notebook CR removing the kubeflow-resource-stopped annotation: %v ", err)
}
// now we should wait for pod to start again
err = tc.waitForStatefulSet(nbMeta, 1, 1)
if err != nil {
log.Printf("notebook StatefulSet: %s isn't ready as expected: %s", nbMeta.Name, err)
}
}

func (tc *testContext) scaleDeployment(depMeta metav1.ObjectMeta, desiredReplicas int32) error {
Expand Down
21 changes: 2 additions & 19 deletions components/odh-notebook-controller/e2e/notebook_creation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -198,24 +198,7 @@ func (tc *testContext) ensureNetworkPolicyAllowingAccessToOnlyNotebookController

func (tc *testContext) testNotebookValidation(nbMeta *metav1.ObjectMeta) error {
// Verify StatefulSet is running
err := wait.Poll(tc.resourceRetryInterval, tc.resourceCreationTimeout, func() (done bool, err error) {
notebookStatefulSet, err1 := tc.kubeClient.AppsV1().StatefulSets(tc.testNamespace).Get(tc.ctx,
nbMeta.Name, metav1.GetOptions{})

if err1 != nil {
if errors.IsNotFound(err1) {
return false, nil
} else {
log.Printf("Failed to get %s statefulset", nbMeta.Name)
return false, err1
}
}
if notebookStatefulSet.Status.AvailableReplicas == 1 &&
notebookStatefulSet.Status.ReadyReplicas == 1 {
return true, nil
}
return false, nil
})
err := tc.waitForStatefulSet(nbMeta, 1, 1)
if err != nil {
return fmt.Errorf("error validating notebook StatefulSet: %s", err)
}
Expand Down Expand Up @@ -283,7 +266,7 @@ func (tc *testContext) testNotebookCulling(nbMeta *metav1.ObjectMeta) error {
return fmt.Errorf("error getting deployment %v: %v", controllerDeployment.Name, err)
}

defer tc.revertCullingConfiguration(cullingConfigMap.ObjectMeta, controllerDeployment.ObjectMeta)
defer tc.revertCullingConfiguration(cullingConfigMap.ObjectMeta, controllerDeployment.ObjectMeta, nbMeta)

err = tc.rolloutDeployment(controllerDeployment.ObjectMeta)
if err != nil {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ func (tc *testContext) testNotebookUpdate(nbContext notebookContext) error {
}

// Example update: Change the Notebook image
updatedNotebook.Spec.Template.Spec.Containers[0].Image = "new-image:latest"
newImage := "quay.io/opendatahub/workbench-images:jupyter-minimal-ubi9-python-3.11-20241119-3ceb400"
updatedNotebook.Spec.Template.Spec.Containers[0].Image = newImage

err = tc.customClient.Update(tc.ctx, updatedNotebook)
if err != nil {
Expand All @@ -79,7 +80,7 @@ func (tc *testContext) testNotebookUpdate(nbContext notebookContext) error {
if err != nil {
return false, err
}
if note.Spec.Template.Spec.Containers[0].Image == "new-image:latest" {
if note.Spec.Template.Spec.Containers[0].Image == newImage {
return true, nil
}
return false, nil
Expand Down

0 comments on commit fc5ffd4

Please sign in to comment.