Make jobs migrator idempotent and self-healing

maths22 · maths22 · commit d0a3ef1dc502 · 2026-01-20T09:57:36.000-06:00
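Idempotency makes it safe to re-run an interrupted migration when the
argument list doesn't change; self-healing covers the case where the
argument list is dynamically generated and may differ between runs, by
also picking up any shard whose previous_delayed_jobs_shard_id is still set.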

refs AE-3244
diff --git a/db/migrate/20260120092005_add_previous_jobs_shard.rb b/db/migrate/20260120092005_add_previous_jobs_shard.rb
@@ -0,0 +1,7 @@
+# frozen_string_literal: true
+
+class AddPreviousJobsShard < ActiveRecord::Migration[7.1]
+  def change
+    add_reference :switchman_shards, :previous_delayed_jobs_shard, foreign_key: { to_table: :switchman_shards }, index: false, if_not_exists: true
+  end
+end
diff --git a/lib/switchman_inst_jobs/jobs_migrator.rb b/lib/switchman_inst_jobs/jobs_migrator.rb
@@ -35,39 +35,58 @@ def transaction_on(shards, &)
       end
 
       def migrate_shards(shard_map)
-        source_shards = Set[]
+        effective_map = shard_map.dup
+        source_shards = Hash.new([].freeze)
         target_shards = Hash.new([].freeze)
-        shard_map.each do |(shard, target_shard)|
+        # Also pick up shards with an incomplete move so that they get cleaned up as well
+        ::Switchman::Shard.where.not(previous_delayed_jobs_shard_id: nil).each do |shard|
+          effective_map[shard.id] ||= shard.delayed_jobs_shard.id
+        end
+        effective_map.each do |(shard, target_shard)|
           shard = ::Switchman::Shard.find(shard) unless shard.is_a?(::Switchman::Shard)
-          source_shards << shard.delayed_jobs_shard.id
           target_shard = target_shard.try(:id) || target_shard
+          # if a move was interrupted, the new shard is already set as the delayed_jobs_shard
+          # but we still have the old shard stored in previous_delayed_jobs_shard and should
+          # act as if we are moving from there in the first place
+          if shard.previous_delayed_jobs_shard_id && shard.delayed_jobs_shard.id == target_shard
+            source_shards[shard.previous_delayed_jobs_shard_id] += [shard.id]
+          else
+            source_shards[shard.delayed_jobs_shard.id] += [shard.id]
+          end
           target_shards[target_shard] += [shard.id]
 
           @validation_callbacks&.each do |proc|
             proc.call(shard:, target_shard: ::Switchman::Shard.find(target_shard))
           end
         end
 
-        # Do the updates in batches and then just clear redis instead of clearing them one at a time
-        target_shards.each do |target_shard, shards|
-          updates = { delayed_jobs_shard_id: target_shard, block_stranded: true }
-          updates[:updated_at] = Time.zone.now if ::Switchman::Shard.column_names.include?("updated_at")
-          ::Switchman::Shard.where(id: shards).update_all(updates)
+        ::Switchman::Shard.transaction do
+          # Do the updates in batches and then just clear redis instead of clearing them one at a time
+          source_shards.each do |source_shard, shards|
+            updates = { previous_delayed_jobs_shard_id: source_shard }
+            ::Switchman::Shard.where(id: shards).update_all(updates)
+          end
+          target_shards.each do |target_shard, shards|
+            updates = { delayed_jobs_shard_id: target_shard, block_stranded: true }
+            updates[:updated_at] = Time.zone.now if ::Switchman::Shard.column_names.include?("updated_at")
+            ::Switchman::Shard.where(id: shards).update_all(updates)
+          end
         end
         clear_shard_cache(default: ::Switchman::Shard.exists?(id: target_shards.values.flatten, default: true))
 
         ::Switchman::Shard.clear_cache
         # rubocop:disable Style/CombinableLoops
         # We first migrate strands so that we can stop blocking strands before we migrate unstranded jobs
-        source_shards.each do |s|
+        source_shards.keys.each do |s|
           ::Switchman::Shard.lookup(s).activate(::Delayed::Backend::ActiveRecord::AbstractJob) { migrate_strands }
         end
 
-        source_shards.each do |s|
+        source_shards.keys.each do |s|
           ::Switchman::Shard.lookup(s).activate(::Delayed::Backend::ActiveRecord::AbstractJob) { migrate_everything }
         end
-        ensure_unblock_stranded_for(shard_map.map(&:first))
+        ensure_unblock_stranded_for(effective_map.map(&:first))
         # rubocop:enable Style/CombinableLoops
+        ::Switchman::Shard.where(id: effective_map.map(&:first)).update_all(previous_delayed_jobs_shard_id: nil)
       end
 
       # if :migrate_strands ran on any shards that fell into scenario 1, then
diff --git a/lib/switchman_inst_jobs/version.rb b/lib/switchman_inst_jobs/version.rb
@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 
 module SwitchmanInstJobs
-  VERSION = "4.3.0"
+  VERSION = "4.3.1"
 end
diff --git a/spec/lib/switchman_inst_jobs/jobs_migrator_spec.rb b/spec/lib/switchman_inst_jobs/jobs_migrator_spec.rb
@@ -529,4 +529,55 @@ def activate_target_shard
       described_class.clear_callbacks!
     end
   end
+
+  context "fault tolerance" do
+    before do
+      @times_called = 0
+      original_method = described_class.method(:migrate_everything)
+      allow(described_class).to receive(:migrate_everything).with(no_args) do
+        @times_called += 1
+        raise "test failure" if @times_called == 1
+        original_method.call
+      end
+    end
+
+    it "should do work when called twice in a row" do
+      activate_source_shard do
+        3.times { Kernel.delay.sleep(0.3) }
+      end
+      expect(Delayed::Job.count).to eq 3
+
+      shard1.delayed_jobs_shard_id = Switchman::Shard.default.id
+      shard1.save!
+      expect { described_class.migrate_shards({ shard1 => shard1 }) }.to raise_error("test failure")
+      expect(Delayed::Job.count).to eq 3
+      expect { described_class.migrate_shards({ shard1 => shard1 }) }.not_to raise_error
+      expect(Delayed::Job.count).to eq 0
+      
+      activate_target_shard do
+        expect(Delayed::Job.count).to eq 3
+      end
+    end
+
+    it "should cleanup partial migrations implicitly" do
+      activate_source_shard do
+        3.times { Kernel.delay.sleep(0.3) }
+      end
+      expect(Delayed::Job.count).to eq 3
+
+      shard1.delayed_jobs_shard_id = Switchman::Shard.default.id
+      shard1.save!
+      expect(::Switchman::Shard.where.not(previous_delayed_jobs_shard_id: nil).count).to eq 0
+      expect { described_class.migrate_shards({ shard1 => shard1 }) }.to raise_error("test failure")
+      expect(Delayed::Job.count).to eq 3
+      expect(::Switchman::Shard.where.not(previous_delayed_jobs_shard_id: nil).count).to eq 1
+      expect { described_class.migrate_shards({}) }.not_to raise_error
+      expect(Delayed::Job.count).to eq 0
+      
+      activate_target_shard do
+        expect(Delayed::Job.count).to eq 3
+      end
+      expect(::Switchman::Shard.where.not(previous_delayed_jobs_shard_id: nil).count).to eq 0
+    end
+  end
 end