Skip to content

Commit bf0cdb5

Browse files
committed
Fix promotion race condition
Signed-off-by: Jean-Rémy Bancel <jr.bancel@uipath.com>
1 parent f54df77 commit bf0cdb5

File tree

1 file changed

+107
-0
lines changed

1 file changed

+107
-0
lines changed

test/fixtures/when.go

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,19 +187,23 @@ func (w *When) PromoteRollout() *When {
187187
if w.rollout == nil {
188188
w.t.Fatal("Rollout not set")
189189
}
190+
w.waitForPauseConditionsSet()
190191
_, err := promote.PromoteRollout(w.rolloutClient.ArgoprojV1alpha1().Rollouts(w.namespace), w.rollout.GetName(), false, false, false)
191192
w.CheckError(err)
192193
w.log.Info("Promoted rollout")
194+
w.clearControllerPauseIfNeeded()
193195
return w
194196
}
195197

196198
func (w *When) PromoteRolloutFull() *When {
197199
if w.rollout == nil {
198200
w.t.Fatal("Rollout not set")
199201
}
202+
w.waitForPauseConditionsSet()
200203
_, err := promote.PromoteRollout(w.rolloutClient.ArgoprojV1alpha1().Rollouts(w.namespace), w.rollout.GetName(), false, false, true)
201204
w.CheckError(err)
202205
w.log.Info("Promoted rollout fully")
206+
w.clearControllerPauseIfNeeded()
203207
return w
204208
}
205209

@@ -729,6 +733,109 @@ func (w *When) StopLoad() *When {
729733
return w
730734
}
731735

736+
// waitForPauseConditionsSet waits for the controller to finish setting up the pause state.
737+
// This ensures pauseConditions is populated when controllerPause is true, which indicates
738+
// the controller has completed its reconciliation and is ready for a promote.
739+
func (w *When) waitForPauseConditionsSet() {
740+
rolloutIf := w.rolloutClient.ArgoprojV1alpha1().Rollouts(w.namespace)
741+
err := retryutil.OnError(retryutil.DefaultBackoff, func(err error) bool {
742+
return true
743+
}, func() error {
744+
ro, err := rolloutIf.Get(w.Context, w.rollout.GetName(), metav1.GetOptions{})
745+
if err != nil {
746+
return err
747+
}
748+
if ro.Status.ObservedGeneration != strconv.FormatInt(ro.Generation, 10) {
749+
return fmt.Errorf("waiting for observedGeneration (%s) to match generation (%d)", ro.Status.ObservedGeneration, ro.Generation)
750+
}
751+
if ro.Status.ControllerPause && len(ro.Status.PauseConditions) == 0 {
752+
return fmt.Errorf("waiting for pauseConditions to be set (controllerPause=true)")
753+
}
754+
return nil
755+
})
756+
w.CheckError(err)
757+
}
758+
759+
// clearControllerPauseIfNeeded checks if the controller has processed the promote.
760+
// Due to the controller's writeBackToInformer function, watch events may be missed
761+
// after a reconciliation. If the controller hasn't processed the promote (indicated
762+
// by controllerPause=true with empty pauseConditions), we force a reconciliation
763+
// by scaling the rollout, which triggers ReplicaSet changes.
764+
// Note: This workaround is only applied to canary rollouts. Bluegreen rollouts have
765+
// different pause/promote semantics where the scale workaround causes the controller
766+
// to re-add the pause when reconciling with stale informer cache.
767+
func (w *When) clearControllerPauseIfNeeded() {
768+
rolloutIf := w.rolloutClient.ArgoprojV1alpha1().Rollouts(w.namespace)
769+
770+
// Poll until the controller processes the promote (clears controllerPause).
771+
err := retryutil.OnError(retryutil.DefaultBackoff, func(err error) bool {
772+
return true
773+
}, func() error {
774+
ro, err := rolloutIf.Get(w.Context, w.rollout.GetName(), metav1.GetOptions{})
775+
if err != nil {
776+
return err
777+
}
778+
if len(ro.Status.PauseConditions) > 0 {
779+
return nil
780+
}
781+
if !ro.Status.ControllerPause {
782+
return nil
783+
}
784+
return fmt.Errorf("waiting for controller to process promote")
785+
})
786+
787+
ro, getErr := rolloutIf.Get(w.Context, w.rollout.GetName(), metav1.GetOptions{})
788+
w.CheckError(getErr)
789+
790+
if len(ro.Status.PauseConditions) > 0 || !ro.Status.ControllerPause {
791+
return
792+
}
793+
794+
// Force a reconciliation to make the controller process the promote
795+
if err != nil {
796+
w.log.Info("Forcing reconciliation (controller race condition workaround)")
797+
798+
if ro.Spec.Strategy.Canary != nil {
799+
currentReplicas := int32(1)
800+
if ro.Spec.Replicas != nil {
801+
currentReplicas = *ro.Spec.Replicas
802+
}
803+
scalePatch := []byte(fmt.Sprintf(`{"spec":{"replicas":%d}}`, currentReplicas+1))
804+
_, scaleErr := rolloutIf.Patch(w.Context, w.rollout.GetName(), types.MergePatchType, scalePatch, metav1.PatchOptions{})
805+
w.CheckError(scaleErr)
806+
807+
err = retryutil.OnError(retryutil.DefaultBackoff, func(err error) bool {
808+
return true
809+
}, func() error {
810+
ro, err := rolloutIf.Get(w.Context, w.rollout.GetName(), metav1.GetOptions{})
811+
if err != nil {
812+
return err
813+
}
814+
815+
observedGen, err := strconv.Atoi(ro.Status.ObservedGeneration)
816+
if err != nil {
817+
return err
818+
}
819+
820+
if int64(observedGen) >= ro.Generation {
821+
return nil
822+
}
823+
824+
return fmt.Errorf("waiting for controller to reconcile")
825+
})
826+
w.CheckError(err)
827+
828+
scalePatch = []byte(fmt.Sprintf(`{"spec":{"replicas":%d}}`, currentReplicas))
829+
_, scaleErr = rolloutIf.Patch(w.Context, w.rollout.GetName(), types.MergePatchType, scalePatch, metav1.PatchOptions{})
830+
w.CheckError(scaleErr)
831+
} else if ro.Spec.Strategy.BlueGreen != nil {
832+
annotationPatch := []byte(fmt.Sprintf(`{"metadata":{"annotations":{"e2e-reconcile-trigger":"%d"}}}`, time.Now().UnixNano()))
833+
_, patchErr := rolloutIf.Patch(w.Context, w.rollout.GetName(), types.MergePatchType, annotationPatch, metav1.PatchOptions{})
834+
w.CheckError(patchErr)
835+
}
836+
}
837+
}
838+
732839
func (w *When) Then() *Then {
733840
return &Then{
734841
Common: w.Common,

0 commit comments

Comments
 (0)