# This config contains the default values for training a Tacotron2 model on the LJSpeech dataset.
# If you want to train the model on another dataset, adjust the config values accordingly.
# Most dataset-specific arguments are at the head of the config file; see below.

name: Tacotron2

train_dataset: ???
validation_datasets: ???
sup_data_path: null
sup_data_types: null

phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"

model:
  pitch_fmin: 65.40639132514966
  pitch_fmax: 2093.004522404789

  sample_rate: 22050
  n_mel_channels: 80
  n_window_size: 1024
  n_window_stride: 256
  n_fft: 1024
  lowfreq: 0
  highfreq: 8000
  window: hann
  pad_value: -11.52

  text_normalizer:
    _target_: nemo_text_processing.text_normalization.normalize.Normalizer
    lang: en
    input_case: cased

  text_normalizer_call_kwargs:
    verbose: false
    punct_pre_process: true
    punct_post_process: true

  text_tokenizer:
    _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer
    punct: true
    stresses: true
    chars: true
    apostrophe: true
    pad_with_space: true
    g2p:
      _target_: nemo.collections.tts.g2p.models.en_us_arpabet.EnglishG2p
      phoneme_dict: ${phoneme_dict_path}
      heteronyms: ${heteronyms_path}

  train_ds:
    dataset:
      _target_: "nemo.collections.tts.data.dataset.TTSDataset"
      manifest_filepath: ${train_dataset}
      sample_rate: ${model.sample_rate}
      sup_data_path: ${sup_data_path}
      sup_data_types: ${sup_data_types}
      n_fft: ${model.n_fft}
      win_length: ${model.n_window_size}
      hop_length: ${model.n_window_stride}
      window: ${model.window}
      n_mels: ${model.n_mel_channels}
      lowfreq: ${model.lowfreq}
      highfreq: ${model.highfreq}
      max_duration: null
      min_duration: 0.1
      ignore_file: null
      trim: false
      pitch_fmin: ${model.pitch_fmin}
      pitch_fmax: ${model.pitch_fmax}
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 48
      num_workers: 4
      pin_memory: true

  validation_ds:
    dataset:
      _target_: "nemo.collections.tts.data.dataset.TTSDataset"
      manifest_filepath: ${validation_datasets}
      sample_rate: ${model.sample_rate}
      sup_data_path: ${sup_data_path}
      sup_data_types: ${sup_data_types}
      n_fft: ${model.n_fft}
      win_length: ${model.n_window_size}
      hop_length: ${model.n_window_stride}
      window: ${model.window}
      n_mels: ${model.n_mel_channels}
      lowfreq: ${model.lowfreq}
      highfreq: ${model.highfreq}
      max_duration: null
      min_duration: 0.1
      ignore_file: null
      trim: false
      pitch_fmin: ${model.pitch_fmin}
      pitch_fmax: ${model.pitch_fmax}
    dataloader_params:
      drop_last: false
      shuffle: false
      batch_size: 24
      num_workers: 8
      pin_memory: true

  preprocessor:
    _target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures
    nfilt: ${model.n_mel_channels}
    highfreq: ${model.highfreq}
    log: true
    log_zero_guard_type: clamp
    log_zero_guard_value: 1e-05
    lowfreq: ${model.lowfreq}
    n_fft: ${model.n_fft}
    n_window_size: ${model.n_window_size}
    n_window_stride: ${model.n_window_stride}
    pad_to: 16
    pad_value: ${model.pad_value}
    sample_rate: ${model.sample_rate}
    window: ${model.window}
    normalize: null
    preemph: null
    dither: 0.0
    frame_splicing: 1
    stft_conv: false
    nb_augmentation_prob: 0
    mag_power: 1.0
    exact_pad: true
    use_grads: false

  encoder:
    _target_: nemo.collections.tts.modules.tacotron2.Encoder
    encoder_kernel_size: 5
    encoder_n_convolutions: 3
    encoder_embedding_dim: 512

  decoder:
    _target_: nemo.collections.tts.modules.tacotron2.Decoder
    decoder_rnn_dim: 1024
    encoder_embedding_dim: ${model.encoder.encoder_embedding_dim}
    gate_threshold: 0.5
    max_decoder_steps: 1000
    n_frames_per_step: 1  # currently only 1 is supported
    n_mel_channels: ${model.n_mel_channels}
    p_attention_dropout: 0.1
    p_decoder_dropout: 0.1
    prenet_dim: 256
    prenet_p_dropout: 0.5
    # Attention parameters
    attention_dim: 128
    attention_rnn_dim: 1024
    # AttentionLocation layer parameters
    attention_location_kernel_size: 31
    attention_location_n_filters: 32
    early_stopping: true

  postnet:
    _target_: nemo.collections.tts.modules.tacotron2.Postnet
    n_mel_channels: ${model.n_mel_channels}
    p_dropout: 0.5
    postnet_embedding_dim: 512
    postnet_kernel_size: 5
    postnet_n_convolutions: 5

  optim:
    name: adam
    lr: 1e-3
    weight_decay: 1e-6

    # scheduler setup
    sched:
      name: CosineAnnealing
      min_lr: 1e-5

trainer:
  devices: 1  # number of GPUs
  max_epochs: ???
  num_nodes: 1
  accelerator: gpu
  strategy: ddp
  accumulate_grad_batches: 1
  enable_checkpointing: false  # provided by exp_manager
  logger: false  # provided by exp_manager
  gradient_clip_val: 1.0
  log_every_n_steps: 60
  check_val_every_n_epoch: 2
  benchmark: false

exp_manager:
  exp_dir: null
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    monitor: val_loss
    mode: min
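
# Example usage (a sketch, not part of the config): assuming this file is saved as
# NeMo's examples/tts/conf/tacotron2.yaml and launched through the accompanying
# examples/tts/tacotron2.py Hydra script (paths may differ in your checkout), the
# required ??? fields above are supplied as command-line overrides, e.g.:
#
#   python examples/tts/tacotron2.py \
#     train_dataset=/path/to/train_manifest.json \
#     validation_datasets=/path/to/val_manifest.json \
#     trainer.max_epochs=1000
#
# Any other value in this file can be overridden the same way, e.g. trainer.devices=2
# or model.train_ds.dataloader_params.batch_size=32.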