Skip to content

Commit 1d6f813

Browse files
RolloutPlugin - 11 - Analysis Fix
1 parent f393347 commit 1d6f813

File tree

14 files changed

+763
-741
lines changed

14 files changed

+763
-741
lines changed

cmd/combined-controller/main.go

Lines changed: 0 additions & 480 deletions
This file was deleted.

cmd/rollouts-controller/main.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -390,6 +390,7 @@ func newCommand() *cobra.Command {
390390
DynamicClientset: dynamicClient,
391391
PluginManager: pluginManager,
392392
AnalysisHelper: analysisHelper,
393+
InstanceID: instanceID,
393394
}).SetupWithManager(mgr); err != nil {
394395
log.Fatalf("Failed to setup RolloutPlugin controller: %s", err.Error())
395396
}

rolloutplugin/analysis.go

Lines changed: 61 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -194,15 +194,40 @@ func (r *RolloutPluginReconciler) reconcileStepBasedAnalysisRun(ctx context.Cont
194194
}
195195

196196
// Check if we need to create a new analysis run
197-
if currentAr == nil || analysisutil.IsTerminating(currentAr) {
197+
// Only create if:
198+
// 1. No current AR exists
199+
// 2. Rollout is aborted (allows retry after fixes)
200+
// 3. AR is inconclusive and rollout is paused for it
201+
if currentAr == nil {
198202
logger.Info("Creating step-based analysis run", "step", currentStepIndex)
199-
newAr, err := r.createAnalysisRun(ctx, rp, currentStep.Analysis, fmt.Sprintf("%s-step-%d", rp.Name, currentStepIndex))
203+
newAr, err := r.createAnalysisRun(ctx, rp, currentStep.Analysis, fmt.Sprintf("%s-step-%d", rp.Name, currentStepIndex), v1alpha1.RolloutTypeStepLabel)
200204
if err != nil {
201205
return nil, err
202206
}
203207
return newAr, nil
204208
}
205209

210+
// If rollout is aborted, allow creating a new analysis run (for retry scenarios)
211+
if rp.Status.Aborted {
212+
logger.Info("Rollout is aborted, creating new step-based analysis run", "step", currentStepIndex)
213+
newAr, err := r.createAnalysisRun(ctx, rp, currentStep.Analysis, fmt.Sprintf("%s-step-%d", rp.Name, currentStepIndex), v1alpha1.RolloutTypeStepLabel)
214+
if err != nil {
215+
return nil, err
216+
}
217+
return newAr, nil
218+
}
219+
220+
// If AR is inconclusive and rollout is paused, allow creating a new one
221+
if currentAr.Status.Phase == v1alpha1.AnalysisPhaseInconclusive && rp.Status.Paused {
222+
logger.Info("Analysis is inconclusive and rollout is paused, creating new step-based analysis run", "step", currentStepIndex)
223+
newAr, err := r.createAnalysisRun(ctx, rp, currentStep.Analysis, fmt.Sprintf("%s-step-%d", rp.Name, currentStepIndex), v1alpha1.RolloutTypeStepLabel)
224+
if err != nil {
225+
return nil, err
226+
}
227+
return newAr, nil
228+
}
229+
230+
// Otherwise, keep the existing AR (even if failed/error - controller needs to see it)
206231
return currentAr, nil
207232
}
208233

@@ -215,16 +240,41 @@ func (r *RolloutPluginReconciler) reconcileBackgroundAnalysisRun(ctx context.Con
215240
}
216241

217242
// Check if we need to create a new analysis run
218-
if currentAr == nil || analysisutil.IsTerminating(currentAr) {
243+
// Only create if:
244+
// 1. No current AR exists
245+
// 2. Rollout is aborted (allows retry after fixes)
246+
// 3. AR is inconclusive and rollout is paused for it
247+
if currentAr == nil {
219248
logger.Info("Creating background analysis run")
220249
// RolloutAnalysisBackground embeds RolloutAnalysis, so we can use it directly
221-
newAr, err := r.createAnalysisRun(ctx, rp, &rp.Spec.Strategy.Canary.Analysis.RolloutAnalysis, fmt.Sprintf("%s-background", rp.Name))
250+
newAr, err := r.createAnalysisRun(ctx, rp, &rp.Spec.Strategy.Canary.Analysis.RolloutAnalysis, fmt.Sprintf("%s-background", rp.Name), v1alpha1.RolloutTypeBackgroundRunLabel)
251+
if err != nil {
252+
return nil, err
253+
}
254+
return newAr, nil
255+
}
256+
257+
// If rollout is aborted, allow creating a new analysis run (for retry scenarios)
258+
if rp.Status.Aborted {
259+
logger.Info("Rollout is aborted, creating new background analysis run")
260+
newAr, err := r.createAnalysisRun(ctx, rp, &rp.Spec.Strategy.Canary.Analysis.RolloutAnalysis, fmt.Sprintf("%s-background", rp.Name), v1alpha1.RolloutTypeBackgroundRunLabel)
222261
if err != nil {
223262
return nil, err
224263
}
225264
return newAr, nil
226265
}
227266

267+
// If AR is inconclusive and rollout is paused, allow creating a new one
268+
if currentAr.Status.Phase == v1alpha1.AnalysisPhaseInconclusive && rp.Status.Paused {
269+
logger.Info("Analysis is inconclusive and rollout is paused, creating new background analysis run")
270+
newAr, err := r.createAnalysisRun(ctx, rp, &rp.Spec.Strategy.Canary.Analysis.RolloutAnalysis, fmt.Sprintf("%s-background", rp.Name), v1alpha1.RolloutTypeBackgroundRunLabel)
271+
if err != nil {
272+
return nil, err
273+
}
274+
return newAr, nil
275+
}
276+
277+
// Otherwise, keep the existing AR (even if failed/error - controller needs to see it)
228278
return currentAr, nil
229279
}
230280

@@ -246,7 +296,7 @@ func convertAnalysisRunArgsToArguments(args []v1alpha1.AnalysisRunArgument) []v1
246296
}
247297

248298
// createAnalysisRun creates a new AnalysisRun from the analysis spec
249-
func (r *RolloutPluginReconciler) createAnalysisRun(ctx context.Context, rp *v1alpha1.RolloutPlugin, analysisSpec *v1alpha1.RolloutAnalysis, namePrefix string) (*v1alpha1.AnalysisRun, error) {
299+
func (r *RolloutPluginReconciler) createAnalysisRun(ctx context.Context, rp *v1alpha1.RolloutPlugin, analysisSpec *v1alpha1.RolloutAnalysis, namePrefix string, rolloutType string) (*v1alpha1.AnalysisRun, error) {
250300
logger := log.FromContext(ctx)
251301

252302
// Get templates
@@ -277,10 +327,15 @@ func (r *RolloutPluginReconciler) createAnalysisRun(ctx context.Context, rp *v1a
277327

278328
// Build labels
279329
labels := map[string]string{
280-
v1alpha1.RolloutTypeLabel: "RolloutPlugin",
330+
v1alpha1.RolloutTypeLabel: rolloutType, // "Step" or "Background"
281331
"rollout-plugin-name": rp.Name,
282332
}
283333

334+
// Add instance ID label if set (required for controller filtering)
335+
if r.InstanceID != "" {
336+
labels[v1alpha1.LabelKeyControllerInstanceID] = r.InstanceID
337+
}
338+
284339
// Build annotations
285340
annotations := map[string]string{}
286341

rolloutplugin/controller.go

Lines changed: 63 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ type RolloutPluginReconciler struct {
3838
DynamicClientset dynamic.Interface
3939
PluginManager PluginManager
4040
AnalysisHelper AnalysisHelper
41+
InstanceID string
4142
}
4243

4344
// AnalysisHelper is an interface for managing AnalysisRuns
@@ -522,6 +523,41 @@ func (r *RolloutPluginReconciler) processCanaryRollout(ctx context.Context, roll
522523
currentStep := canary.Steps[currentStepIndex]
523524
logCtx.WithField("stepIndex", currentStepIndex).Info("Processing canary step")
524525

526+
// Check background analysis status (if running)
527+
// Background analysis runs throughout the rollout, so check it before processing each step
528+
if newStatus.Canary.CurrentBackgroundAnalysisRunStatus != nil {
529+
bgAnalysisStatus := newStatus.Canary.CurrentBackgroundAnalysisRunStatus.Status
530+
logCtx.WithFields(log.Fields{
531+
"status": bgAnalysisStatus,
532+
"analysisRun": newStatus.Canary.CurrentBackgroundAnalysisRunStatus.Name,
533+
}).Info("Background analysis status")
534+
535+
switch bgAnalysisStatus {
536+
case v1alpha1.AnalysisPhaseFailed, v1alpha1.AnalysisPhaseError:
537+
// Background analysis failed, abort the rollout
538+
logCtx.Error("Background analysis failed, aborting rollout")
539+
newStatus.Phase = "Failed"
540+
newStatus.Message = fmt.Sprintf("Background analysis failed: %s", newStatus.Canary.CurrentBackgroundAnalysisRunStatus.Name)
541+
newStatus.RolloutInProgress = false
542+
newStatus.Aborted = true
543+
newStatus.AbortedRevision = newStatus.UpdatedRevision
544+
if err := plugin.Abort(ctx, workloadRef); err != nil {
545+
logCtx.WithError(err).Error("Failed to abort rollout")
546+
return ctrl.Result{}, err
547+
}
548+
return ctrl.Result{}, nil
549+
550+
case v1alpha1.AnalysisPhaseInconclusive:
551+
// Background analysis is inconclusive, pause the rollout
552+
logCtx.Info("Background analysis is inconclusive, pausing rollout")
553+
newStatus.Paused = true
554+
newStatus.Message = "Paused: Background analysis inconclusive"
555+
return ctrl.Result{}, nil
556+
}
557+
// For Running, Pending, Successful, or unknown status, continue processing the step
558+
// Successful means the analysis completed successfully
559+
}
560+
525561
// Process setWeight step
526562
if currentStep.SetWeight != nil {
527563
weight := *currentStep.SetWeight
@@ -625,6 +661,8 @@ func (r *RolloutPluginReconciler) processCanaryRollout(ctx context.Context, roll
625661
newStatus.Phase = "Failed"
626662
newStatus.Message = fmt.Sprintf("Analysis failed: %s", newStatus.Canary.CurrentStepAnalysisRunStatus.Name)
627663
newStatus.RolloutInProgress = false
664+
newStatus.Aborted = true
665+
newStatus.AbortedRevision = newStatus.UpdatedRevision
628666
if err := plugin.Abort(ctx, workloadRef); err != nil {
629667
logCtx.WithError(err).Error("Failed to abort rollout")
630668
return ctrl.Result{}, err
@@ -865,9 +903,6 @@ func (r *RolloutPluginReconciler) SetupWithManager(mgr ctrl.Manager) error {
865903
DeleteFunc: func(e event.DeleteEvent) bool {
866904
return true
867905
},
868-
GenericFunc: func(e event.GenericEvent) bool {
869-
return true
870-
},
871906
}
872907

873908
// Create a predicate for RolloutPlugin that watches ALL updates (like Rollouts controller)
@@ -896,13 +931,36 @@ func (r *RolloutPluginReconciler) SetupWithManager(mgr ctrl.Manager) error {
896931
DeleteFunc: func(e event.DeleteEvent) bool {
897932
return true
898933
},
899-
GenericFunc: func(e event.GenericEvent) bool {
900-
return true
934+
}
935+
936+
// Predicate to filter AnalysisRun events - only trigger on phase changes
937+
analysisRunPredicate := predicate.Funcs{
938+
CreateFunc: func(e event.CreateEvent) bool {
939+
return true // Always trigger on creation
940+
},
941+
UpdateFunc: func(e event.UpdateEvent) bool {
942+
oldAR, ok1 := e.ObjectOld.(*v1alpha1.AnalysisRun)
943+
newAR, ok2 := e.ObjectNew.(*v1alpha1.AnalysisRun)
944+
if !ok1 || !ok2 {
945+
return false
946+
}
947+
948+
// Only trigger if phase changed (matches Rollouts controller behavior)
949+
if oldAR.Status.Phase != newAR.Status.Phase {
950+
return true
951+
}
952+
953+
// Skip other updates
954+
return false
955+
},
956+
DeleteFunc: func(e event.DeleteEvent) bool {
957+
return true // Always trigger on deletion
901958
},
902959
}
903960

904961
return ctrl.NewControllerManagedBy(mgr).
905962
For(&v1alpha1.RolloutPlugin{}).
963+
Owns(&v1alpha1.AnalysisRun{}, builder.WithPredicates(analysisRunPredicate)).
906964
Watches(
907965
&appsv1.StatefulSet{},
908966
handler.EnqueueRequestsFromMapFunc(r.findRolloutPluginsForWorkload),

rolloutplugin/plugins/statefulset/plugin.go

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ import (
2222

2323
const (
2424
// FieldManager is the field manager name used for Server-Side Apply
25-
FieldManager = "argo-rollouts"
25+
FieldManager = "argo-rollouts-statefulset-plugin"
2626
)
2727

2828
// Plugin implements the ResourcePlugin interface directly for StatefulSets.
@@ -423,20 +423,22 @@ func (p *Plugin) Abort(ctx context.Context, workloadRef v1alpha1.WorkloadRef) er
423423
return fmt.Errorf("failed to update StatefulSet during abort using SSA: %w", err)
424424
}
425425

426-
// STEP 2: Delete pods that were updated (ordinals < oldPartition)
426+
// STEP 2: Delete pods that were updated (ordinals >= oldPartition)
427427
// StatefulSet controller will recreate them using CurrentRevision (old version)
428-
// because partition=replicas means all pods should be on old version
429-
// We delete all pods with ordinal < oldPartition to ensure all potentially
430-
// updated pods are rolled back
428+
// because partition=replicas means all pods should be on old version.
429+
// With partition, pods with ordinal >= partition are on the NEW (updated) version.
430+
// Pods with ordinal < partition are on the OLD (current) version.
431+
// So we need to delete pods with ordinal >= oldPartition to roll them back.
432+
podsToDelete := replicas - oldPartition
431433
p.logCtx.WithFields(log.Fields{
432434
"oldPartition": oldPartition,
433-
"podsToDelete": oldPartition,
435+
"podsToDelete": podsToDelete,
434436
}).Info("Deleting updated pods to force rollback")
435437

436438
deletedCount := int32(0)
437439
failedDeletes := []string{}
438440

439-
for i := int32(0); i < oldPartition; i++ {
441+
for i := oldPartition; i < replicas; i++ {
440442
podName := fmt.Sprintf("%s-%d", sts.Name, i)
441443
pod := &corev1.Pod{
442444
ObjectMeta: metav1.ObjectMeta{
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
apiVersion: argoproj.io/v1alpha1
2+
kind: Rollout
3+
metadata:
4+
name: basic
5+
spec:
6+
strategy:
7+
canary:
8+
steps:
9+
- setWeight: 50
10+
- pause: {}
11+
selector:
12+
matchLabels:
13+
app: basic
14+
template:
15+
metadata:
16+
labels:
17+
app: basic
18+
spec:
19+
containers:
20+
- name: basic
21+
image: quay.io/nginx:1.19-alpine
22+
resources:
23+
requests:
24+
memory: 16Mi
25+
cpu: 1m
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
---
2+
# Background analysis with failing metrics
3+
# Used by: TestBackgroundAnalysisFailure
4+
apiVersion: argoproj.io/v1alpha1
5+
kind: AnalysisTemplate
6+
metadata:
7+
name: bg-analysis-fail
8+
spec:
9+
metrics:
10+
- name: fail-metric
11+
count: 1
12+
interval: 3s
13+
failureLimit: 0
14+
provider:
15+
job:
16+
spec:
17+
backoffLimit: 0
18+
template:
19+
spec:
20+
restartPolicy: Never
21+
containers:
22+
- name: fail
23+
image: quay.io/prometheus/busybox:latest
24+
command: ["/bin/sh", "-c", "echo 'Analysis failed' && exit 1"]
25+
---
26+
apiVersion: v1
27+
kind: Service
28+
metadata:
29+
name: bg-analysis-fail-svc
30+
spec:
31+
clusterIP: None
32+
selector:
33+
app: bg-analysis-fail
34+
ports:
35+
- port: 80
36+
name: web
37+
---
38+
apiVersion: apps/v1
39+
kind: StatefulSet
40+
metadata:
41+
name: bg-analysis-fail-sts
42+
spec:
43+
serviceName: bg-analysis-fail-svc
44+
replicas: 5
45+
selector:
46+
matchLabels:
47+
app: bg-analysis-fail
48+
template:
49+
metadata:
50+
labels:
51+
app: bg-analysis-fail
52+
spec:
53+
terminationGracePeriodSeconds: 1
54+
containers:
55+
- name: busybox
56+
image: quay.io/prometheus/busybox:latest
57+
command: ["sleep", "3600"]
58+
ports:
59+
- containerPort: 80
60+
updateStrategy:
61+
type: RollingUpdate
62+
---
63+
apiVersion: argoproj.io/v1alpha1
64+
kind: RolloutPlugin
65+
metadata:
66+
name: bg-analysis-fail-rp
67+
spec:
68+
workloadRef:
69+
apiVersion: apps/v1
70+
kind: StatefulSet
71+
name: bg-analysis-fail-sts
72+
plugin:
73+
name: statefulset
74+
strategy:
75+
canary:
76+
analysis:
77+
templates:
78+
- templateName: bg-analysis-fail
79+
steps:
80+
- setWeight: 60
81+
- pause: {duration: 5s}
82+
- setWeight: 80
83+
- pause: {duration: 5s}
84+
- setWeight: 100

0 commit comments

Comments
 (0)