# examples/audio/conf/predictive_conformer_unet.yaml
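# This config trains a predictive (regression-style) Conformer-UNet speech enhancement
# model: noisy audio is encoded into a spectrogram, the estimator predicts the enhanced
# spectrogram, and the decoder converts it back to a waveform.
#
# Example invocation (illustrative only; the training script name and the paths below are
# placeholders, not part of this config):
#   python audio_to_audio_train.py \
#     --config-path=<directory of this file> --config-name=predictive_conformer_unet \
#     model.train_ds.cuts_path=/path/to/train_cuts.jsonl.gz \
#     model.validation_ds.cuts_path=/path/to/val_cuts.jsonl.gz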
name: predictive_conformer_unet
model:
  type: predictive
  sample_rate: 16000
  skip_nan_grad: false
  num_outputs: 1

  # non-streaming config, use input normalization
  normalize_input: true # normalize the input signal to 0 dBFS

  train_ds:
    use_lhotse: true # enable Lhotse data loader
    cuts_path: ??? # path to Lhotse cuts manifest with speech signals for augmentation (including a custom "target_recording" field with the same signals)
    truncate_duration: 2.04 # number of STFT time frames = 1 + (truncate_duration * sample_rate) // encoder.hop_length = 256
    truncate_offset_type: random # if the file is longer than truncate_duration, use a random offset to select a subsegment
    batch_size: 32 # batch size may be increased based on the available memory
    shuffle: true
    num_workers: 8
    pin_memory: true
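  # Worked example for the truncation length above (using the values in this config):
  #   2.04 s * 16000 Hz = 32640 samples; 32640 // 128 (hop) = 255; + 1 = 256 STFT frames,
  #   which matches the 256 frequency subbands produced by the encoder (fft_length // 2 + 1).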

  validation_ds:
    use_lhotse: true # enable Lhotse data loader
    cuts_path: ??? # path to Lhotse cuts manifest with noisy speech signals (including a custom "target_recording" field with the clean signals)
    batch_size: 4 # batch size may be increased based on the available memory
    shuffle: false
    num_workers: 4
    pin_memory: true
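  # Note: unlike train_ds, no truncate_duration is set here, so validation presumably runs on
  # full-length recordings. Both manifests are expected to carry the custom "target_recording"
  # field holding the reference signal for each cut.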

  encoder:
    _target_: nemo.collections.audio.modules.transforms.AudioToSpectrogram
    fft_length: 510 # number of subbands in the STFT = fft_length // 2 + 1 = 256
    hop_length: 128
    magnitude_power: 0.5
    scale: 0.33
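  # A sketch of what the analysis front-end does with these values:
  #   - fft_length 510 -> 510 // 2 + 1 = 256 frequency subbands per frame
  #   - hop_length 128 at 16 kHz -> one STFT frame every 8 ms
  #   - magnitude_power 0.5 applies square-root compression to the spectrogram magnitudes,
  #     and scale 0.33 rescales them before the estimator (the decoder undoes both).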

  decoder:
    _target_: nemo.collections.audio.modules.transforms.SpectrogramToAudio
    fft_length: ${model.encoder.fft_length}
    hop_length: ${model.encoder.hop_length}
    magnitude_power: ${model.encoder.magnitude_power}
    scale: ${model.encoder.scale}
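  # The decoder mirrors the encoder via OmegaConf interpolation, so changing any STFT
  # parameter in one place keeps analysis and synthesis consistent; for example, an
  # illustrative override like model.encoder.fft_length=1022 on the command line would
  # propagate to the decoder automatically.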

  estimator:
    _target_: nemo.collections.audio.parts.submodules.conformer_unet.SpectrogramConformerUNet
    in_channels: 1 # single-channel noisy input
    out_channels: 1 # single-channel estimate
    feat_in: 256 # input feature dimension = number of subbands
    n_layers: 8 # number of layers in the model
    d_model: 512 # the hidden size of the model
    subsampling_factor: 1 # subsampling factor for the model
    self_attention_model: 'rel_pos'
    n_heads: 8 # number of heads for the model
    # streaming-related arguments
    # - this is a non-streaming config
    conv_context_size: null
    conv_norm_type: 'layer_norm'
    causal_downsampling: false
    att_context_size: [-1, -1]
    att_context_style: 'regular'
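  # A streaming/causal variant would typically switch the streaming-related arguments above
  # to causal settings. The values below are an illustrative sketch only (not part of this
  # non-streaming config) and would need tuning for a real low-latency setup:
  #   conv_context_size: causal
  #   causal_downsampling: true
  #   att_context_size: [102, 0]   # left-only attention context
  #   att_context_style: chunked_limited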

  loss:
    _target_: nemo.collections.audio.losses.MSELoss # computed in the time domain

  metrics:
    val:
      sisdr: # output SI-SDR
        _target_: torchmetrics.audio.ScaleInvariantSignalDistortionRatio
      estoi: # output ESTOI
        _target_: torchmetrics.audio.ShortTimeObjectiveIntelligibility
        fs: ${model.sample_rate}
        extended: true
      pesq: # output PESQ
        _target_: torchmetrics.audio.PerceptualEvaluationSpeechQuality
        fs: ${model.sample_rate}
        mode: wb
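  # These metrics are logged with a "val_" prefix (val_sisdr, val_estoi, val_pesq), which is
  # why exp_manager below monitors val_sisdr for checkpointing and early stopping. PESQ in
  # "wb" (wideband) mode matches the 16 kHz sample rate of this config.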

  optim:
    name: adam
    lr: 1e-4
    # optimizer arguments
    betas: [0.9, 0.98]
    weight_decay: 0.0
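  # No LR scheduler is configured, so the learning rate stays at 1e-4 throughout training.
  # NeMo-style optim sections usually accept a nested "sched" block if a schedule is desired;
  # the values below are an illustrative sketch, not part of this config:
  #   sched:
  #     name: CosineAnnealing
  #     warmup_steps: 1000
  #     min_lr: 1e-6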

trainer:
  devices: -1 # number of GPUs, -1 uses all available GPUs
  num_nodes: 1
  max_epochs: -1
  max_steps: -1 # computed at runtime if not set
  val_check_interval: 1.0 # as a float, the fraction of the training epoch after which validation runs; as an int, the number of training batches
  accelerator: auto
  strategy: ddp
  accumulate_grad_batches: 1
  gradient_clip_val: null
  precision: 32 # should be set to 16 for O1 and O2 to enable AMP
  log_every_n_steps: 100 # logging interval
  enable_progress_bar: true
  num_sanity_val_steps: 0 # number of validation steps to run as a sanity check before training starts; 0 disables it
  check_val_every_n_epoch: 1 # run validation every n epochs
  sync_batchnorm: true
  enable_checkpointing: false # provided by exp_manager
  logger: false # provided by exp_manager
  use_distributed_sampler: false # required for Lhotse
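# With max_epochs and max_steps both set to -1, training runs until stopped externally; in
# this config the early stopping callback in exp_manager (monitoring val_sisdr) is what
# normally ends the run. Mixed precision can typically be enabled by overriding
# trainer.precision (e.g. 16 or bf16-mixed, depending on the Lightning version); that is an
# illustrative override, not something set here.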

exp_manager:
  exp_dir: null
  name: ${name}

  # use exponential moving average for model parameters
  ema:
    enable: true
    decay: 0.999 # decay rate
    cpu_offload: false # offload EMA parameters to CPU to save GPU memory
    every_n_steps: 1 # how often to update EMA weights
    validate_original_weights: false # if true, validate with the original (non-EMA) weights instead of the EMA weights
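  # EMA keeps a shadow copy of the weights updated as ema = decay * ema + (1 - decay) * w
  # after each step; with decay 0.999 this averages over roughly the last 1 / (1 - 0.999)
  # = 1000 optimizer steps, which usually smooths the validation metrics.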

  # logging
  create_tensorboard_logger: true

  # checkpointing
  create_checkpoint_callback: true
  checkpoint_callback_params:
    # in case of multiple validation sets, the first one is used
    monitor: val_sisdr
    mode: max
    save_top_k: 5
    always_save_nemo: true # also save checkpoints as .nemo files, in addition to the PTL checkpoints

  # early stopping
  create_early_stopping_callback: true
  early_stopping_callback_params:
    monitor: val_sisdr
    mode: max
    min_delta: 0.0
    patience: 20 # number of validation checks (see check_val_every_n_epoch) without improvement before stopping
    verbose: true
    strict: false # should be false to avoid a runtime error where EarlyStopping reports the monitor as unavailable, which sometimes happens when training is resumed

  resume_from_checkpoint: null # path to a checkpoint file to continue training from; restores the whole state including the epoch, step, LR schedulers, apex, etc.
  # set both of the following to true to resume training from an existing checkpoint
  resume_if_exists: false
  resume_ignore_no_checkpoint: false

  # you may use this section to create a W&B logger
  create_wandb_logger: false
  wandb_logger_kwargs:
    name: null
    project: null
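  # To log to Weights & Biases, flip create_wandb_logger to true and fill in the kwargs; the
  # values below are a hypothetical example, not part of this config:
  #   create_wandb_logger: true
  #   wandb_logger_kwargs:
  #     name: predictive_conformer_unet_run1
  #     project: speech_enhancement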