Skip to content

Commit f055242

Browse files
authored
update yaml to include new location of losses that were removed in #15211 (#15384)
Signed-off-by: Jason <jasoli@nvidia.com>
1 parent 9dda6d2 commit f055242

14 files changed: 61 additions (+61) and 61 deletions (-61)

examples/audio/conf/beamforming.yaml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -57,13 +57,13 @@ model:
5757
num_features: 256 # Number of features at RNN input
5858
num_layers: 5 # Number of RNN layers
5959
bidirectional: true # Use bi-directional RNN
60-
60+
6161
mask_processor:
6262
_target_: nemo.collections.audio.modules.masking.MaskBasedBeamformer # Mask-based multi-channel processing
6363
ref_channel: 0 # Reference channel for the output
6464

6565
loss:
66-
_target_: nemo.collections.audio.losses.SDRLoss
66+
_target_: nemo.collections.audio.losses.audio.SDRLoss
6767
scale_invariant: true # Use scale-invariant SDR
6868

6969
metrics:
@@ -74,7 +74,7 @@ model:
7474
sdr_ch0: # SDR on output channel 0
7575
_target_: torchmetrics.audio.SignalDistortionRatio
7676
channel: 0
77-
77+
7878
optim:
7979
name: adamw
8080
lr: 1e-4

examples/audio/conf/beamforming_flex_channels.yaml

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,7 @@ model:
4545

4646
decoder:
4747
_target_: nemo.collections.audio.modules.transforms.SpectrogramToAudio
48-
fft_length: ${model.encoder.fft_length}
48+
fft_length: ${model.encoder.fft_length}
4949
hop_length: ${model.encoder.hop_length}
5050

5151
mask_estimator:
@@ -64,7 +64,7 @@ model:
6464
mag_normalization: mean_var # normalization using mean and variance
6565
use_ipd: true # use inter-channel phase difference
6666
ipd_normalization: mean # mean normalization
67-
67+
6868
mask_processor:
6969
# Mask-based multi-channel processor
7070
_target_: nemo.collections.audio.modules.masking.MaskBasedBeamformer
@@ -78,16 +78,16 @@ model:
7878
num_subbands: ${model.mask_estimator.num_subbands}
7979

8080
loss:
81-
_target_: nemo.collections.audio.losses.SDRLoss
81+
_target_: nemo.collections.audio.losses.audio.SDRLoss
8282
convolution_invariant: true # convolution-invariant loss
83-
sdr_max: 30 # soft threshold for SDR
83+
sdr_max: 30 # soft threshold for SDR
8484

8585
metrics:
8686
val:
8787
sdr_0:
8888
_target_: torchmetrics.audio.SignalDistortionRatio
8989
channel: 0 # evaluate only on channel 0, if there are multiple outputs
90-
90+
9191
optim:
9292
name: adamw
9393
lr: 1e-4

examples/audio/conf/flow_matching_generative.yaml

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -29,12 +29,12 @@ model:
2929
shuffle: false
3030
num_workers: 4
3131
pin_memory: true
32-
32+
3333
log_config:
3434
log_tensorboard: true
3535
log_wandb: false
3636
max_utts: 8
37-
37+
3838
encoder:
3939
_target_: nemo.collections.audio.modules.transforms.AudioToSpectrogram
4040
fft_length: 510 # Number of subbands in the STFT = fft_length // 2 + 1 = 256
@@ -44,7 +44,7 @@ model:
4444

4545
decoder:
4646
_target_: nemo.collections.audio.modules.transforms.SpectrogramToAudio
47-
fft_length: ${model.encoder.fft_length}
47+
fft_length: ${model.encoder.fft_length}
4848
hop_length: ${model.encoder.hop_length}
4949
magnitude_power: ${model.encoder.magnitude_power}
5050
scale: ${model.encoder.scale}
@@ -68,9 +68,9 @@ model:
6868
time_min: 1e-8
6969
time_max: 1.0
7070
estimator_target: conditional_vector_field # or data
71-
71+
7272
loss:
73-
_target_: nemo.collections.audio.losses.MSELoss
73+
_target_: nemo.collections.audio.losses.audio.MSELoss
7474
ndim: 4 # loss is calculated on the score in the encoded domain (batch, channel, dimension, time)
7575

7676
metrics:
@@ -85,7 +85,7 @@ model:
8585
_target_: torchmetrics.audio.PerceptualEvaluationSpeechQuality
8686
fs: ${model.sample_rate}
8787
mode: wb
88-
88+
8989
optim:
9090
name: adam
9191
lr: 1e-4

examples/audio/conf/flow_matching_generative_finetuning.yaml

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -32,12 +32,12 @@ model:
3232
shuffle: false
3333
num_workers: 4
3434
pin_memory: true
35-
35+
3636
log_config:
3737
log_tensorboard: true
3838
log_wandb: false
3939
max_utts: 8
40-
40+
4141
encoder:
4242
_target_: nemo.collections.audio.modules.transforms.AudioToSpectrogram
4343
fft_length: 510 # Number of subbands in the STFT = fft_length // 2 + 1 = 256
@@ -47,7 +47,7 @@ model:
4747

4848
decoder:
4949
_target_: nemo.collections.audio.modules.transforms.SpectrogramToAudio
50-
fft_length: ${model.encoder.fft_length}
50+
fft_length: ${model.encoder.fft_length}
5151
hop_length: ${model.encoder.hop_length}
5252
magnitude_power: ${model.encoder.magnitude_power}
5353
scale: ${model.encoder.scale}
@@ -71,9 +71,9 @@ model:
7171
time_min: 1e-8
7272
time_max: 1.0
7373
estimator_target: conditional_vector_field # or data
74-
74+
7575
loss:
76-
_target_: nemo.collections.audio.losses.MSELoss
76+
_target_: nemo.collections.audio.losses.audio.MSELoss
7777
ndim: 4 # loss is calculated on the score in the encoded domain (batch, channel, dimension, time)
7878

7979
metrics:
@@ -88,7 +88,7 @@ model:
8888
_target_: torchmetrics.audio.PerceptualEvaluationSpeechQuality
8989
fs: ${model.sample_rate}
9090
mode: wb
91-
91+
9292
optim:
9393
name: adam
9494
lr: 1e-4

examples/audio/conf/flow_matching_generative_ssl_pretraining.yaml

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -29,12 +29,12 @@ model:
2929
shuffle: false
3030
num_workers: 4
3131
pin_memory: true
32-
32+
3333
log_config:
3434
log_tensorboard: true
3535
log_wandb: false
3636
max_utts: 8
37-
37+
3838
encoder:
3939
_target_: nemo.collections.audio.modules.transforms.AudioToSpectrogram
4040
fft_length: 510 # Number of subbands in the STFT = fft_length // 2 + 1 = 256
@@ -44,7 +44,7 @@ model:
4444

4545
decoder:
4646
_target_: nemo.collections.audio.modules.transforms.SpectrogramToAudio
47-
fft_length: ${model.encoder.fft_length}
47+
fft_length: ${model.encoder.fft_length}
4848
hop_length: ${model.encoder.hop_length}
4949
magnitude_power: ${model.encoder.magnitude_power}
5050
scale: ${model.encoder.scale}
@@ -68,14 +68,14 @@ model:
6868
time_min: 1e-8
6969
time_max: 1.0
7070
estimator_target: conditional_vector_field # or data
71-
71+
7272
ssl_pretrain_masking:
7373
_target_: nemo.collections.audio.modules.ssl_pretrain_masking.SSLPretrainWithMaskedPatch
7474
patch_size: 10
7575
mask_fraction: 0.7
76-
76+
7777
loss:
78-
_target_: nemo.collections.audio.losses.MSELoss
78+
_target_: nemo.collections.audio.losses.audio.MSELoss
7979
ndim: 4 # loss is calculated on the score in the encoded domain (batch, channel, dimension, time)
8080

8181
metrics:
@@ -90,7 +90,7 @@ model:
9090
_target_: torchmetrics.audio.PerceptualEvaluationSpeechQuality
9191
fs: ${model.sample_rate}
9292
mode: wb
93-
93+
9494
optim:
9595
name: adam
9696
lr: 5e-5

examples/audio/conf/masking.yaml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -55,13 +55,13 @@ model:
5555
num_features: 256 # Number of features at RNN input
5656
num_layers: 5 # Number of RNN layers
5757
bidirectional: true # Use bi-directional RNN
58-
58+
5959
mask_processor:
6060
_target_: nemo.collections.audio.modules.masking.MaskReferenceChannel # Apply mask on the reference channel
6161
ref_channel: 0 # Reference channel for the output
6262

6363
loss:
64-
_target_: nemo.collections.audio.losses.SDRLoss
64+
_target_: nemo.collections.audio.losses.audio.SDRLoss
6565
scale_invariant: true # Use scale-invariant SDR
6666

6767
metrics:
@@ -72,7 +72,7 @@ model:
7272
sdr_ch0: # SDR on output channel 0
7373
_target_: torchmetrics.audio.SignalDistortionRatio
7474
channel: 0
75-
75+
7676
optim:
7777
name: adamw
7878
lr: 1e-4

examples/audio/conf/masking_with_online_augmentation.yaml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -51,13 +51,13 @@ model:
5151
num_features: 256 # Number of features at RNN input
5252
num_layers: 5 # Number of RNN layers
5353
bidirectional: true # Use bi-directional RNN
54-
54+
5555
mask_processor:
5656
_target_: nemo.collections.audio.modules.masking.MaskReferenceChannel # Apply mask on the reference channel
5757
ref_channel: 0 # Reference channel for the output
5858

5959
loss:
60-
_target_: nemo.collections.audio.losses.SDRLoss
60+
_target_: nemo.collections.audio.losses.audio.SDRLoss
6161
scale_invariant: true # Use scale-invariant SDR
6262

6363
metrics:
@@ -68,7 +68,7 @@ model:
6868
sdr_ch0: # SDR on output channel 0
6969
_target_: torchmetrics.audio.SignalDistortionRatio
7070
channel: 0
71-
71+
7272
optim:
7373
name: adamw
7474
lr: 1e-4

examples/audio/conf/predictive.yaml

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,7 @@ model:
3737

3838
decoder:
3939
_target_: nemo.collections.audio.modules.transforms.SpectrogramToAudio
40-
fft_length: ${model.encoder.fft_length}
40+
fft_length: ${model.encoder.fft_length}
4141
hop_length: ${model.encoder.hop_length}
4242
magnitude_power: ${model.encoder.magnitude_power}
4343
scale: ${model.encoder.scale}
@@ -49,15 +49,15 @@ model:
4949
num_res_blocks: 3 # increased number of res blocks
5050
pad_time_to: 64 # pad to 64 frames for the time dimension
5151
pad_dimension_to: 0 # no padding in the frequency dimension
52-
52+
5353
loss:
54-
_target_: nemo.collections.audio.losses.MSELoss # computed in the time domain
54+
_target_: nemo.collections.audio.losses.audio.MSELoss # computed in the time domain
5555

5656
metrics:
5757
val:
5858
sisdr: # output SI-SDR
5959
_target_: torchmetrics.audio.ScaleInvariantSignalDistortionRatio
60-
60+
6161
optim:
6262
name: adam
6363
lr: 1e-4

examples/audio/conf/predictive_conformer.yaml

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,7 @@ model:
3636

3737
decoder:
3838
_target_: nemo.collections.audio.modules.transforms.SpectrogramToAudio
39-
fft_length: ${model.encoder.fft_length}
39+
fft_length: ${model.encoder.fft_length}
4040
hop_length: ${model.encoder.hop_length}
4141
magnitude_power: ${model.encoder.magnitude_power}
4242
scale: ${model.encoder.scale}
@@ -58,15 +58,15 @@ model:
5858
causal_downsampling: False
5959
att_context_size: [-1, -1]
6060
att_context_style: 'regular'
61-
61+
6262
loss:
63-
_target_: nemo.collections.audio.losses.MSELoss # computed in the time domain
63+
_target_: nemo.collections.audio.losses.audio.MSELoss # computed in the time domain
6464

6565
metrics:
6666
val:
6767
sisdr: # output SI-SDR
6868
_target_: torchmetrics.audio.ScaleInvariantSignalDistortionRatio
69-
69+
7070
optim:
7171
name: adamw
7272
lr: 1e-3

examples/audio/conf/predictive_conformer_unet.yaml

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,7 @@ model:
3535

3636
decoder:
3737
_target_: nemo.collections.audio.modules.transforms.SpectrogramToAudio
38-
fft_length: ${model.encoder.fft_length}
38+
fft_length: ${model.encoder.fft_length}
3939
hop_length: ${model.encoder.hop_length}
4040
magnitude_power: ${model.encoder.magnitude_power}
4141
scale: ${model.encoder.scale}
@@ -57,9 +57,9 @@ model:
5757
causal_downsampling: False
5858
att_context_size: [-1, -1]
5959
att_context_style: 'regular'
60-
60+
6161
loss:
62-
_target_: nemo.collections.audio.losses.MSELoss # computed in the time domain
62+
_target_: nemo.collections.audio.losses.audio.MSELoss # computed in the time domain
6363

6464
metrics:
6565
val:
@@ -73,7 +73,7 @@ model:
7373
_target_: torchmetrics.audio.PerceptualEvaluationSpeechQuality
7474
fs: ${model.sample_rate}
7575
mode: wb
76-
76+
7777
optim:
7878
name: adam
7979
lr: 1e-4

0 commit comments

Comments
 (0)