Update YAML to include new location of losses that were removed in #15211 (#15384)

blisc · web-flow · commit f055242f1bb5 · 2026-02-11T11:59:51.000-05:00
Signed-off-by: Jason <jasoli@nvidia.com>
diff --git a/examples/audio/conf/beamforming.yaml b/examples/audio/conf/beamforming.yaml
@@ -57,13 +57,13 @@ model:
     num_features: 256 # Number of features at RNN input
     num_layers: 5 # Number of RNN layers
     bidirectional: true # Use bi-directional RNN
-    
+
   mask_processor:
     _target_: nemo.collections.audio.modules.masking.MaskBasedBeamformer # Mask-based multi-channel processing
     ref_channel: 0 # Reference channel for the output
 
   loss:
-    _target_: nemo.collections.audio.losses.SDRLoss
+    _target_: nemo.collections.audio.losses.audio.SDRLoss
     scale_invariant: true # Use scale-invariant SDR
 
   metrics:
@@ -74,7 +74,7 @@ model:
       sdr_ch0: # SDR on output channel 0
         _target_: torchmetrics.audio.SignalDistortionRatio
         channel: 0
-    
+
   optim:
     name: adamw
     lr: 1e-4
diff --git a/examples/audio/conf/beamforming_flex_channels.yaml b/examples/audio/conf/beamforming_flex_channels.yaml
@@ -45,7 +45,7 @@ model:
 
   decoder:
     _target_: nemo.collections.audio.modules.transforms.SpectrogramToAudio
-    fft_length: ${model.encoder.fft_length} 
+    fft_length: ${model.encoder.fft_length}
     hop_length: ${model.encoder.hop_length}
 
   mask_estimator:
@@ -64,7 +64,7 @@ model:
     mag_normalization: mean_var # normalization using mean and variance
     use_ipd: true # use inter-channel phase difference
     ipd_normalization: mean # mean normalization
-    
+
   mask_processor:
     # Mask-based multi-channel processor
     _target_: nemo.collections.audio.modules.masking.MaskBasedBeamformer
@@ -78,16 +78,16 @@ model:
     num_subbands: ${model.mask_estimator.num_subbands}
 
   loss:
-    _target_: nemo.collections.audio.losses.SDRLoss
+    _target_: nemo.collections.audio.losses.audio.SDRLoss
     convolution_invariant: true # convolution-invariant loss
-    sdr_max: 30 # soft threshold for SDR 
+    sdr_max: 30 # soft threshold for SDR
 
   metrics:
     val:
       sdr_0:
         _target_: torchmetrics.audio.SignalDistortionRatio
         channel: 0 # evaluate only on channel 0, if there are multiple outputs
-    
+
   optim:
     name: adamw
     lr: 1e-4
diff --git a/examples/audio/conf/flow_matching_generative.yaml b/examples/audio/conf/flow_matching_generative.yaml
@@ -29,12 +29,12 @@ model:
     shuffle: false
     num_workers: 4
     pin_memory: true
-  
+
   log_config:
     log_tensorboard: true
     log_wandb: false
     max_utts: 8
-    
+
   encoder:
     _target_: nemo.collections.audio.modules.transforms.AudioToSpectrogram
     fft_length: 510 # Number of subbands in the STFT = fft_length // 2 + 1 = 256
@@ -44,7 +44,7 @@ model:
 
   decoder:
     _target_: nemo.collections.audio.modules.transforms.SpectrogramToAudio
-    fft_length: ${model.encoder.fft_length} 
+    fft_length: ${model.encoder.fft_length}
     hop_length: ${model.encoder.hop_length}
     magnitude_power: ${model.encoder.magnitude_power}
     scale: ${model.encoder.scale}
@@ -68,9 +68,9 @@ model:
     time_min: 1e-8
     time_max: 1.0
     estimator_target: conditional_vector_field # or data
-    
+
   loss:
-    _target_: nemo.collections.audio.losses.MSELoss
+    _target_: nemo.collections.audio.losses.audio.MSELoss
     ndim: 4 # loss is calculated on the score in the encoded domain (batch, channel, dimension, time)
 
   metrics:
@@ -85,7 +85,7 @@ model:
         _target_: torchmetrics.audio.PerceptualEvaluationSpeechQuality
         fs: ${model.sample_rate}
         mode: wb
-    
+
   optim:
     name: adam
     lr: 1e-4
diff --git a/examples/audio/conf/flow_matching_generative_finetuning.yaml b/examples/audio/conf/flow_matching_generative_finetuning.yaml
@@ -32,12 +32,12 @@ model:
     shuffle: false
     num_workers: 4
     pin_memory: true
-  
+
   log_config:
     log_tensorboard: true
     log_wandb: false
     max_utts: 8
-    
+
   encoder:
     _target_: nemo.collections.audio.modules.transforms.AudioToSpectrogram
     fft_length: 510 # Number of subbands in the STFT = fft_length // 2 + 1 = 256
@@ -47,7 +47,7 @@ model:
 
   decoder:
     _target_: nemo.collections.audio.modules.transforms.SpectrogramToAudio
-    fft_length: ${model.encoder.fft_length} 
+    fft_length: ${model.encoder.fft_length}
     hop_length: ${model.encoder.hop_length}
     magnitude_power: ${model.encoder.magnitude_power}
     scale: ${model.encoder.scale}
@@ -71,9 +71,9 @@ model:
     time_min: 1e-8
     time_max: 1.0
     estimator_target: conditional_vector_field # or data
-    
+
   loss:
-    _target_: nemo.collections.audio.losses.MSELoss
+    _target_: nemo.collections.audio.losses.audio.MSELoss
     ndim: 4 # loss is calculated on the score in the encoded domain (batch, channel, dimension, time)
 
   metrics:
@@ -88,7 +88,7 @@ model:
         _target_: torchmetrics.audio.PerceptualEvaluationSpeechQuality
         fs: ${model.sample_rate}
         mode: wb
-    
+
   optim:
     name: adam
     lr: 1e-4
diff --git a/examples/audio/conf/flow_matching_generative_ssl_pretraining.yaml b/examples/audio/conf/flow_matching_generative_ssl_pretraining.yaml
@@ -29,12 +29,12 @@ model:
     shuffle: false
     num_workers: 4
     pin_memory: true
-  
+
   log_config:
     log_tensorboard: true
     log_wandb: false
     max_utts: 8
-    
+
   encoder:
     _target_: nemo.collections.audio.modules.transforms.AudioToSpectrogram
     fft_length: 510 # Number of subbands in the STFT = fft_length // 2 + 1 = 256
@@ -44,7 +44,7 @@ model:
 
   decoder:
     _target_: nemo.collections.audio.modules.transforms.SpectrogramToAudio
-    fft_length: ${model.encoder.fft_length} 
+    fft_length: ${model.encoder.fft_length}
     hop_length: ${model.encoder.hop_length}
     magnitude_power: ${model.encoder.magnitude_power}
     scale: ${model.encoder.scale}
@@ -68,14 +68,14 @@ model:
     time_min: 1e-8
     time_max: 1.0
     estimator_target: conditional_vector_field # or data
-    
+
   ssl_pretrain_masking:
     _target_: nemo.collections.audio.modules.ssl_pretrain_masking.SSLPretrainWithMaskedPatch
     patch_size: 10
     mask_fraction: 0.7
-    
+
   loss:
-    _target_: nemo.collections.audio.losses.MSELoss
+    _target_: nemo.collections.audio.losses.audio.MSELoss
     ndim: 4 # loss is calculated on the score in the encoded domain (batch, channel, dimension, time)
 
   metrics:
@@ -90,7 +90,7 @@ model:
         _target_: torchmetrics.audio.PerceptualEvaluationSpeechQuality
         fs: ${model.sample_rate}
         mode: wb
-    
+
   optim:
     name: adam
     lr: 5e-5
diff --git a/examples/audio/conf/masking.yaml b/examples/audio/conf/masking.yaml
@@ -55,13 +55,13 @@ model:
     num_features: 256 # Number of features at RNN input
     num_layers: 5 # Number of RNN layers
     bidirectional: true # Use bi-directional RNN
-    
+
   mask_processor:
     _target_: nemo.collections.audio.modules.masking.MaskReferenceChannel # Apply mask on the reference channel
     ref_channel: 0 # Reference channel for the output
 
   loss:
-    _target_: nemo.collections.audio.losses.SDRLoss
+    _target_: nemo.collections.audio.losses.audio.SDRLoss
     scale_invariant: true # Use scale-invariant SDR
 
   metrics:
@@ -72,7 +72,7 @@ model:
       sdr_ch0: # SDR on output channel 0
         _target_: torchmetrics.audio.SignalDistortionRatio
         channel: 0
-    
+
   optim:
     name: adamw
     lr: 1e-4
diff --git a/examples/audio/conf/masking_with_online_augmentation.yaml b/examples/audio/conf/masking_with_online_augmentation.yaml
@@ -51,13 +51,13 @@ model:
     num_features: 256 # Number of features at RNN input
     num_layers: 5 # Number of RNN layers
     bidirectional: true # Use bi-directional RNN
-    
+
   mask_processor:
     _target_: nemo.collections.audio.modules.masking.MaskReferenceChannel # Apply mask on the reference channel
     ref_channel: 0 # Reference channel for the output
 
   loss:
-    _target_: nemo.collections.audio.losses.SDRLoss
+    _target_: nemo.collections.audio.losses.audio.SDRLoss
     scale_invariant: true # Use scale-invariant SDR
 
   metrics:
@@ -68,7 +68,7 @@ model:
       sdr_ch0: # SDR on output channel 0
         _target_: torchmetrics.audio.SignalDistortionRatio
         channel: 0
-    
+
   optim:
     name: adamw
     lr: 1e-4
diff --git a/examples/audio/conf/predictive.yaml b/examples/audio/conf/predictive.yaml
@@ -37,7 +37,7 @@ model:
 
   decoder:
     _target_: nemo.collections.audio.modules.transforms.SpectrogramToAudio
-    fft_length: ${model.encoder.fft_length} 
+    fft_length: ${model.encoder.fft_length}
     hop_length: ${model.encoder.hop_length}
     magnitude_power: ${model.encoder.magnitude_power}
     scale: ${model.encoder.scale}
@@ -49,15 +49,15 @@ model:
     num_res_blocks: 3 # increased number of res blocks
     pad_time_to: 64 # pad to 64 frames for the time dimension
     pad_dimension_to: 0 # no padding in the frequency dimension
-    
+
   loss:
-    _target_: nemo.collections.audio.losses.MSELoss # computed in the time domain
+    _target_: nemo.collections.audio.losses.audio.MSELoss # computed in the time domain
 
   metrics:
     val:
       sisdr: # output SI-SDR
         _target_: torchmetrics.audio.ScaleInvariantSignalDistortionRatio
-    
+
   optim:
     name: adam
     lr: 1e-4
diff --git a/examples/audio/conf/predictive_conformer.yaml b/examples/audio/conf/predictive_conformer.yaml
@@ -36,7 +36,7 @@ model:
 
   decoder:
     _target_: nemo.collections.audio.modules.transforms.SpectrogramToAudio
-    fft_length: ${model.encoder.fft_length} 
+    fft_length: ${model.encoder.fft_length}
     hop_length: ${model.encoder.hop_length}
     magnitude_power: ${model.encoder.magnitude_power}
     scale: ${model.encoder.scale}
@@ -58,15 +58,15 @@ model:
     causal_downsampling: False
     att_context_size: [-1, -1]
     att_context_style: 'regular'
-    
+
   loss:
-    _target_: nemo.collections.audio.losses.MSELoss # computed in the time domain
+    _target_: nemo.collections.audio.losses.audio.MSELoss # computed in the time domain
 
   metrics:
     val:
       sisdr: # output SI-SDR
         _target_: torchmetrics.audio.ScaleInvariantSignalDistortionRatio
-    
+
   optim:
     name: adamw
     lr: 1e-3
diff --git a/examples/audio/conf/predictive_conformer_unet.yaml b/examples/audio/conf/predictive_conformer_unet.yaml
@@ -35,7 +35,7 @@ model:
 
   decoder:
     _target_: nemo.collections.audio.modules.transforms.SpectrogramToAudio
-    fft_length: ${model.encoder.fft_length} 
+    fft_length: ${model.encoder.fft_length}
     hop_length: ${model.encoder.hop_length}
     magnitude_power: ${model.encoder.magnitude_power}
     scale: ${model.encoder.scale}
@@ -57,9 +57,9 @@ model:
     causal_downsampling: False
     att_context_size: [-1, -1]
     att_context_style: 'regular'
-    
+
   loss:
-    _target_: nemo.collections.audio.losses.MSELoss # computed in the time domain
+    _target_: nemo.collections.audio.losses.audio.MSELoss # computed in the time domain
 
   metrics:
     val:
@@ -73,7 +73,7 @@ model:
         _target_: torchmetrics.audio.PerceptualEvaluationSpeechQuality
         fs: ${model.sample_rate}
         mode: wb
-    
+
   optim:
     name: adam
     lr: 1e-4
diff --git a/examples/audio/conf/schroedinger_bridge.yaml b/examples/audio/conf/schroedinger_bridge.yaml
@@ -39,7 +39,7 @@ model:
 
   decoder:
     _target_: nemo.collections.audio.modules.transforms.SpectrogramToAudio
-    fft_length: ${model.encoder.fft_length} 
+    fft_length: ${model.encoder.fft_length}
     hop_length: ${model.encoder.hop_length}
     magnitude_power: ${model.encoder.magnitude_power}
     scale: ${model.encoder.scale}
@@ -71,12 +71,12 @@ model:
 
   # Loss in the encoded domain
   loss_encoded:
-    _target_: nemo.collections.audio.losses.MSELoss
+    _target_: nemo.collections.audio.losses.audio.MSELoss
     ndim: 4 # loss is calculated on the score in the encoded domain (batch, channel, dimension, time)
 
   # Loss in the time domain
   loss_time:
-    _target_: nemo.collections.audio.losses.MAELoss
+    _target_: nemo.collections.audio.losses.audio.MAELoss
   loss_time_weight: 0.001
 
   metrics:
@@ -91,7 +91,7 @@ model:
         _target_: torchmetrics.audio.PerceptualEvaluationSpeechQuality
         fs: ${model.sample_rate}
         mode: wb
-    
+
   optim:
     name: adam
     lr: 1e-4
diff --git a/examples/audio/conf/score_based_generative.yaml b/examples/audio/conf/score_based_generative.yaml
diff --git a/examples/audio/conf/streaming_predictive_conformer.yaml b/examples/audio/conf/streaming_predictive_conformer.yaml
diff --git a/examples/audio/conf/streaming_predictive_conformer_unet.yaml b/examples/audio/conf/streaming_predictive_conformer_unet.yaml