From ceefc8ddaa3af6ed21e65b9769f8831af92176c6 Mon Sep 17 00:00:00 2001 From: Parcollet Titouan <parcollet.titouan@gmail.com> Date: Sun, 11 Feb 2024 21:52:39 +0000 Subject: [PATCH] Shorten the data augmentation in YAML + cleaning (code from Samsung AI Cambridge) (#2399) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * shorter augmentations in yaml * layout to 80 char * listed label replication * Refact CTC * Refact transducer * Refact seq2seq * call replicate label instead of duplication * refactor aishell * CommonLanguage * fix error + CV CTC * Giga OOF * Finishing OOF * final touch * fix tests * fix augment in some recipes --------- Co-authored-by: Titouan Parcollet/Embedded AI /SRUK/Engineer/Samsung Electronics <t.parcollet@sruk-ccn4.eu.corp.samsungelectronics.net> Co-authored-by: Mirco Ravanelli <mirco.ravanelli@gmail.com> --- .../ASR/CTC/hparams/train_with_wav2vec.yaml | 57 ++---- .../AISHELL-1/ASR/CTC/train_with_wav2vec.py | 2 + .../AISHELL-1/ASR/seq2seq/hparams/train.yaml | 59 +++--- recipes/AISHELL-1/ASR/seq2seq/train.py | 18 +- .../hparams/train_ASR_transformer.yaml | 67 ++----- .../train_ASR_transformer_with_wav2vect.yaml | 47 ++--- recipes/AISHELL-1/ASR/transformer/train.py | 34 ++-- .../ASR/transformer/train_with_wav2vect.py | 18 +- .../Tokenizer/hparams/tokenizer_bpe5000.yaml | 2 +- .../train_transformer_tokenizer_bpe5000.yaml | 2 +- .../hparams/sepformer-aishell1mix2-wham.yaml | 2 +- .../hparams/sepformer-aishell1mix2.yaml | 2 +- .../hparams/sepformer-aishell1mix3-wham.yaml | 2 +- .../hparams/sepformer-aishell1mix3.yaml | 2 +- .../separation/hparams/convtasnet-cross.yaml | 2 +- .../hparams/convtasnet-independent.yaml | 2 +- .../hparams/convtasnet-parallel-noise.yaml | 2 +- .../hparams/convtasnet-parallel-reverb.yaml | 2 +- .../hparams/convtasnet-parallel.yaml | 2 +- recipes/CVSS/S2ST/hparams/train_fr-en.yaml | 2 +- .../lang_id/hparams/train_ecapa_tdnn.yaml | 21 +-- .../CTC/hparams/train_ar_with_wav2vec.yaml | 46 ++--- .../CTC/hparams/train_de_with_wav2vec.yaml | 46 ++--- .../CTC/hparams/train_en_with_wav2vec.yaml | 46 ++--- .../CTC/hparams/train_es_with_wav2vec.yaml | 46 ++--- .../CTC/hparams/train_fr_with_wav2vec.yaml | 46 ++--- .../CTC/hparams/train_it_with_wav2vec.yaml | 47 ++--- .../CTC/hparams/train_pt_with_wav2vec.yaml | 46 ++--- .../CTC/hparams/train_rw_with_wav2vec.yaml | 46 ++--- .../CTC/hparams/train_zh-CN_with_wav2vec.yaml | 46 ++--- .../ASR/seq2seq/hparams/train_de.yaml | 49 ++--- .../ASR/seq2seq/hparams/train_en.yaml | 50 ++--- .../ASR/seq2seq/hparams/train_es.yaml | 50 ++--- .../ASR/seq2seq/hparams/train_fr.yaml | 50 ++--- .../ASR/seq2seq/hparams/train_it.yaml | 50 ++--- .../ASR/seq2seq/hparams/train_rw.yaml | 50 ++--- .../ASR/transducer/hparams/train_de.yaml | 48 ++--- .../ASR/transducer/hparams/train_fr.yaml | 48 ++--- .../ASR/transducer/hparams/train_it.yaml | 48 ++--- recipes/CommonVoice/ASR/transducer/train.py | 32 ++-- .../hparams/train_ar_hf_whisper.yaml | 44 ++--- .../ASR/transformer/hparams/train_de.yaml | 47 ++--- .../hparams/train_fa_hf_whisper.yaml | 45 ++--- .../ASR/transformer/hparams/train_fr.yaml | 47 ++--- .../hparams/train_fr_hf_whisper.yaml | 45 ++--- .../hparams/train_hi_hf_whisper.yaml | 45 ++--- .../ASR/transformer/hparams/train_it.yaml | 47 ++--- .../hparams/train_it_hf_whisper.yaml | 45 ++---
.../hparams/train_mn_hf_whisper.yaml | 45 ++--- .../hparams/train_sr_hf_whisper.yaml | 45 ++--- recipes/CommonVoice/ASR/transformer/train.py | 34 ++-- .../wav2vec2/hparams/wav2vec2_base.yaml | 10 +- .../hparams/sepformer-dns-16k.yaml | 2 +- .../CTC/hparams/train_amh_with_wav2vec.yaml | 43 ++--- .../CTC/hparams/train_dar_with_wav2vec.yaml | 44 ++--- .../CTC/hparams/train_fon_with_wav2vec.yaml | 44 ++--- .../CTC/hparams/train_multi_with_wav2vec.yaml | 44 ++--- .../CTC/hparams/train_sw_with_wav2vec.yaml | 44 ++--- .../CTC/hparams/train_wol_with_wav2vec.yaml | 44 ++--- .../hparams/cnn14_classifier.yaml | 2 +- .../hparams/conv2d_classifier.yaml | 2 +- .../ESC50/interpret/hparams/l2i_cnn14.yaml | 2 +- .../hparams/l2i_conv2dclassifier.yaml | 2 +- recipes/ESC50/interpret/hparams/nmf.yaml | 2 +- recipes/ESC50/interpret/hparams/piq.yaml | 2 +- .../ST/transformer/hparams/conformer.yaml | 2 +- .../ST/transformer/hparams/transformer.yaml | 2 +- .../Google-speech-commands/hparams/xvect.yaml | 2 +- .../hparams/xvect_leaf.yaml | 2 +- .../hparams/train_with_wav2vec2.yaml | 4 +- .../AST/transformer/hparams/train_samu.yaml | 2 +- .../hparams/train_samu_mbart_st.yaml | 2 +- .../hparams/train_samu_nllb_st.yaml | 2 +- .../hparams/train_w2v2_mbart_st.yaml | 2 +- .../hparams/train_w2v2_nllb_st.yaml | 2 +- .../transformer/hparams/train_w2v2_st.yaml | 2 +- .../transformer/hparams/conformer_medium.yaml | 4 +- .../KsponSpeech/LM/hparams/transformer.yaml | 2 +- .../hparams/5K_unigram_subword_bpe.yaml | 2 +- .../hparams/sepformer-libri2mix.yaml | 2 +- .../hparams/sepformer-libri3mix.yaml | 2 +- recipes/LibriParty/VAD/hparams/train.yaml | 4 +- .../train_hf_wavlm_average_downsampling.yaml | 122 ++++++------ .../train_hf_wavlm_conv_downsampling.yaml | 122 ++++++------ .../train_hf_wavlm_signal_downsampling.yaml | 119 ++++++------ .../ASR/CTC/hparams/train_hf_wav2vec.yaml | 125 ++++++------- .../train_hf_wav2vec_rnn_rescoring.yaml | 174 +++++++++--------- ...rain_hf_wav2vec_transformer_rescoring.yaml | 136 +++++++------- .../CTC/hparams/train_hf_whisper_encoder.yaml | 118 ++++++------ .../ASR/CTC/hparams/train_sb_wav2vec.yaml | 119 ++++++------ .../LibriSpeech/ASR/CTC/train_with_wav2vec.py | 11 +- .../LibriSpeech/ASR/CTC/train_with_whisper.py | 11 +- .../ASR/seq2seq/hparams/train_BPE_1000.yaml | 128 ++++++------- .../hparams/train_BPE_1000_sligru.yaml | 129 ++++++------- .../ASR/seq2seq/hparams/train_BPE_5000.yaml | 129 ++++++------- recipes/LibriSpeech/ASR/seq2seq/train.py | 14 +- .../hparams/conformer_transducer.yaml | 60 +++--- recipes/LibriSpeech/ASR/transducer/train.py | 29 +-- .../ASR/transformer/hparams/bayesspeech.yaml | 61 +++--- .../hparams/branchformer_large.yaml | 67 +++---- .../transformer/hparams/conformer_large.yaml | 64 +++---- .../transformer/hparams/conformer_small.yaml | 67 +++---- .../hparams/hyperbranchformer_13M.yaml | 64 +++---- .../hparams/hyperbranchformer_25M.yaml | 64 +++---- .../hparams/hyperconformer_22M.yaml | 65 +++---- .../hparams/hyperconformer_8M.yaml | 64 +++---- .../transformer/hparams/train_hf_whisper.yaml | 53 +++--- .../ASR/transformer/hparams/transformer.yaml | 64 +++---- recipes/LibriSpeech/ASR/transformer/train.py | 18 +- .../G2P/hparams/hparams_g2p_rnn.yaml | 2 +- .../G2P/hparams/hparams_g2p_transformer.yaml | 2 +- .../G2P/hparams/hparams_lm_rnn.yaml | 4 +- .../G2P/hparams/hparams_lm_transformer.yaml | 4 +- recipes/LibriSpeech/LM/hparams/RNNLM.yaml | 4 +- .../LibriSpeech/LM/hparams/transformer.yaml | 2 +- .../hparams/1K_unigram_subword_bpe.yaml | 2 +- 
.../hparams/5K_unigram_subword_bpe.yaml | 2 +- .../wav2vec2/hparams/wav2vec2_base.yaml | 4 +- .../ASR/CTC/hparams/train_hf_wav2vec.yaml | 4 +- .../CTC/hparams/train_hf_wav2vec_full.yaml | 4 +- .../CTC/hparams/train_hf_wav2vec_relax.yaml | 4 +- .../gpt/hparams/train_gpt.yaml | 2 +- .../llama2/hparams/train_llama2.yaml | 2 +- .../hparams/pool_sisnrestimator.yaml | 2 +- .../noise-robust/hparams/robust_asr_16k.yaml | 4 +- recipes/SLURP/NLU/hparams/train.yaml | 4 +- .../Tokenizer/hparams/tokenizer_bpe58.yaml | 2 +- recipes/SLURP/direct/hparams/train.yaml | 41 ++--- .../direct/hparams/train_with_wav2vec2.yaml | 42 ++--- .../ASR/CTC/hparams/train_with_wav2vec.yaml | 44 ++--- .../ASR/seq2seq/hparams/train_BPE_2000.yaml | 51 ++--- recipes/Switchboard/ASR/seq2seq/train.py | 14 +- .../ASR/transformer/hparams/transformer.yaml | 59 ++---- .../hparams/transformer_finetuned_LM.yaml | 53 ++---- recipes/Switchboard/ASR/transformer/train.py | 18 +- .../Switchboard/LM/hparams/transformer.yaml | 2 +- .../LM/hparams/transformer_finetune.yaml | 2 +- .../hparams/2K_unigram_subword_bpe.yaml | 2 +- recipes/TIMIT/ASR/CTC/hparams/train.yaml | 52 ++---- recipes/TIMIT/ASR/seq2seq/hparams/train.yaml | 45 ++--- .../seq2seq/hparams/train_with_wav2vec2.yaml | 44 ++--- .../TIMIT/ASR/transducer/hparams/train.yaml | 52 ++---- .../ASR/transducer/hparams/train_wav2vec.yaml | 45 ++--- recipes/TIMIT/Alignment/hparams/train.yaml | 4 +- .../hparams/branchformer_large.yaml | 4 +- .../Tokenizer/hparams/tedlium2_500_bpe.yaml | 2 +- .../hparams/train_ecapa_tdnn.yaml | 2 +- recipes/Voicebank/ASR/CTC/hparams/train.yaml | 44 ++--- .../ASR_enhance/hparams/enhance_mimic.yaml | 2 +- .../hparams/pretrain_perceptual.yaml | 2 +- .../MTL/ASR_enhance/hparams/robust_asr.yaml | 49 ++--- .../SpeakerRec/hparams/train_ecapa_tdnn.yaml | 40 ++-- .../hparams/train_ecapa_tdnn_mel_spec.yaml | 40 ++-- .../SpeakerRec/hparams/train_resnet.yaml | 42 ++--- .../SpeakerRec/hparams/train_x_vectors.yaml | 40 ++-- .../lang_id/hparams/train_ecapa.yaml | 13 +- .../hparams/cnntransformer-wham-DM.yaml | 2 +- .../hparams/cnntransformer-whamr-DM.yaml | 2 +- .../hparams/convtasnet-whamr-DM.yaml | 2 +- .../enhancement/hparams/dprnn-whamr-DM.yaml | 2 +- .../enhancement/hparams/sepformer-wham.yaml | 2 +- .../hparams/sepformer-whamr-16k-DM.yaml | 2 +- .../hparams/sepformer-whamr-16k.yaml | 2 +- .../hparams/sepformer-whamr-DM.yaml | 2 +- .../enhancement/hparams/sepformer-whamr.yaml | 2 +- .../separation/hparams/sepformer-wham.yaml | 2 +- .../separation/hparams/sepformer-whamr.yaml | 2 +- .../separation/hparams/convtasnet.yaml | 2 +- recipes/WSJ0Mix/separation/hparams/dprnn.yaml | 2 +- .../separation/hparams/resepformer.yaml | 2 +- .../hparams/sepformer-conformerintra.yaml | 2 +- .../hparams/sepformer-customdataset.yaml | 2 +- .../WSJ0Mix/separation/hparams/sepformer.yaml | 2 +- recipes/WSJ0Mix/separation/hparams/skim.yaml | 2 +- .../emotion_diarization/hparams/train.yaml | 2 +- .../Tokenizer/hparams/tokenizer_bpe51.yaml | 2 +- .../direct/hparams/train.yaml | 44 ++--- recipes/timers-and-such/LM/hparams/train.yaml | 2 +- .../Tokenizer/hparams/tokenizer_bpe51.yaml | 2 +- .../decoupled/hparams/train_LS_LM.yaml | 2 +- .../decoupled/hparams/train_TAS_LM.yaml | 2 +- .../timers-and-such/direct/hparams/train.yaml | 50 ++--- .../direct/hparams/train_with_wav2vec2.yaml | 42 ++--- .../multistage/hparams/train_LS_LM.yaml | 51 ++--- .../multistage/hparams/train_TAS_LM.yaml | 50 ++--- speechbrain/augment/augmenter.py | 39 +++- 186 files changed, 2124 insertions(+), 3354 deletions(-) diff 
--git a/recipes/AISHELL-1/ASR/CTC/hparams/train_with_wav2vec.yaml b/recipes/AISHELL-1/ASR/CTC/hparams/train_with_wav2vec.yaml index 9bd0b52d2..486685f25 100644 --- a/recipes/AISHELL-1/ASR/CTC/hparams/train_with_wav2vec.yaml +++ b/recipes/AISHELL-1/ASR/CTC/hparams/train_with_wav2vec.yaml @@ -28,7 +28,8 @@ test_data: !ref <output_folder>/test.csv wav2vec2_hub: TencentGameMate/chinese-wav2vec2-large wav2vec2_folder: !ref <save_folder>/wav2vec2_checkpoint -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 80 lr: 1.0 lr_wav2vec: 0.0001 @@ -76,7 +77,8 @@ tokenizer: !apply:transformers.BertTokenizer.from_pretrained # bert-base-chinese tokens length output_neurons: 21128 -# Decoding parameters +############################## Decoding ######################################## + # Be sure that the bos and eos index match with the BPEs ones # Decoding parameters test_searcher: !name:speechbrain.decoders.CTCBeamSearcher @@ -98,64 +100,37 @@ beta: 0.5 # which Chinese writing normally does not do. # If remove_spaces, spaces are removed # from the transcript before computing CER. -# (e.g., 祝 可爱 的 你 —> 祝可爱的你) remove_spaces: True split_tokens: !apply:operator.not_ [!ref <remove_spaces>] epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [90, 100, 110] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> # Time Drop -time_drop_length_low: 35 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 45 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 2 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 2 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "zeros" # Method of dropping chunks - time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 + drop_length_low: 35 + drop_length_high: 45 + drop_count_low: 2 + drop_count_high: 2 # Frequency Drop -freq_drop_length_low: 25 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 35 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "zeros" # Method of dropping chunks - freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 25 + drop_length_high: 35 + drop_count_low: 2 + drop_count_high: 2 dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: 
!new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -164,6 +139,8 @@ fea_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <freq_drop>, !ref <time_warp>] +############################## Models ########################################## + enc: !new:speechbrain.nnet.containers.Sequential input_shape: [null, null, !ref <wav2vec_output_dim>] linear1: !name:speechbrain.nnet.linear.Linear @@ -230,6 +207,8 @@ lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler annealing_factor: 0.9 patient: 0 +############################## Logging and Pretrainer ########################## + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref <save_folder> recoverables: diff --git a/recipes/AISHELL-1/ASR/CTC/train_with_wav2vec.py b/recipes/AISHELL-1/ASR/CTC/train_with_wav2vec.py index 227204f44..43783eed7 100644 --- a/recipes/AISHELL-1/ASR/CTC/train_with_wav2vec.py +++ b/recipes/AISHELL-1/ASR/CTC/train_with_wav2vec.py @@ -56,6 +56,8 @@ class ASR(sb.Brain): ids = batch.id tokens, tokens_lens = batch.tokens + # Labels must be extended if parallel augmentation or concatenated + # augmentation was performed on the input (increasing the time dimension) if stage == sb.Stage.TRAIN: if hasattr(self.hparams, "fea_augment"): tokens = self.hparams.fea_augment.replicate_labels(tokens) diff --git a/recipes/AISHELL-1/ASR/seq2seq/hparams/train.yaml b/recipes/AISHELL-1/ASR/seq2seq/hparams/train.yaml index 75b303f66..e6fda7de2 100644 --- a/recipes/AISHELL-1/ASR/seq2seq/hparams/train.yaml +++ b/recipes/AISHELL-1/ASR/seq2seq/hparams/train.yaml @@ -29,7 +29,8 @@ test_data: !ref <output_folder>/test.csv noise_annotation: !ref <save_folder>/noise.csv #The data manifest files are created by the data preparation script tokenizer_file: speechbrain/asr-transformer-aishell/tokenizer.ckpt -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 40 number_of_ctc_epochs: 10 batch_size: 16 @@ -71,7 +72,7 @@ test_dataloader_opts: batch_size: !ref <batch_size> num_workers: !ref <num_workers> -# Model parameters +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dropout: 0.15 cnn_blocks: 2 @@ -108,7 +109,6 @@ scorer_beam_scale: 0.5 # which Chinese writing normally does not do. # If remove_spaces, spaces are removed # from the transcript before computing CER. 
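The comment added in the train_with_wav2vec.py hunk above states why label replication is needed: when the Augmenter returns several versions of each utterance in one enlarged batch, the targets must be tiled so every augmented row still has a matching label. Below is a minimal sketch of that idea; the helper name, body, and the n_copies argument are illustrative assumptions, not the SpeechBrain implementation of Augmenter.replicate_labels.

import torch

# Illustrative stand-in for Augmenter.replicate_labels (assumed signature):
# if augmentation yields n_copies versions of every utterance stacked along
# the batch axis, the targets are tiled identically so each augmented row
# keeps a matching label for the loss.
def replicate_labels_sketch(labels: torch.Tensor, n_copies: int) -> torch.Tensor:
    return torch.cat([labels] * n_copies, dim=0)

tokens = torch.tensor([[5, 9, 2], [7, 3, 0]])    # (batch=2, seq_len=3)
print(replicate_labels_sketch(tokens, 2).shape)  # torch.Size([4, 3])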
-# (e.g., 祝 可爱 的 你 —> 祝可爱的你) remove_spaces: True split_tokens: !apply:operator.not_ [!ref <remove_spaces>] @@ -118,6 +118,8 @@ epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter normalize: !new:speechbrain.processing.features.InputNormalization norm_type: global +############################## Augmentations ################################### + compute_features: !new:speechbrain.lobes.features.Fbank sample_rate: !ref <sample_rate> n_fft: !ref <n_fft> @@ -132,57 +134,37 @@ prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_U # Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation - add_noise: !new:speechbrain.augment.time_domain.AddNoise csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> + snr_low: 0 + snr_high: 15 noise_sample_rate: !ref <sample_rate> clean_sample_rate: !ref <sample_rate> num_workers: !ref <num_workers> # Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. 
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 -# Augmenter: Combines previously defined augmentations to perform data augmentation # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 @@ -192,6 +174,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + enc: !new:speechbrain.lobes.models.CRDNN.CRDNN input_shape: [null, null, !ref <n_mels>] activation: !ref <activation> @@ -268,7 +252,8 @@ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer paths: tokenizer: !ref <tokenizer_file> -# Scorer +############################## Decoding ######################################## + ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer eos_index: !ref <eos_index> blank_index: !ref <blank_index> @@ -305,6 +290,8 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler annealing_factor: 0.8 patient: 0 +############################## Logging and Pretrainer ########################## + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref <save_folder> recoverables: diff --git a/recipes/AISHELL-1/ASR/seq2seq/train.py b/recipes/AISHELL-1/ASR/seq2seq/train.py index 69d2e75d6..bc2c49b88 100644 --- a/recipes/AISHELL-1/ASR/seq2seq/train.py +++ b/recipes/AISHELL-1/ASR/seq2seq/train.py @@ -29,10 +29,6 @@ class ASR(sb.Brain): # Forward pass feats = self.hparams.compute_features(wavs) - - if stage == sb.Stage.TRAIN and hasattr(self.hparams, "fea_augment"): - feats, fea_lens = self.hparams.fea_augment(feats, wav_lens) - feats = self.modules.normalize(feats, wav_lens) x = self.modules.enc(feats.detach()) e_in = self.modules.emb(tokens_bos) # y_in bos + tokens @@ -65,12 +61,16 @@ class ASR(sb.Brain): tokens_eos, tokens_eos_lens = batch.tokens_eos tokens, tokens_lens = batch.tokens + # Labels must be extended if parallel augmentation or concatenated + # augmentation was performed on the input (increasing the time dimension) if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): - tokens = self.hparams.wav_augment.replicate_labels(tokens) - tokens_lens = self.hparams.wav_augment.replicate_labels(tokens_lens) - tokens_eos = self.hparams.wav_augment.replicate_labels(tokens_eos) - tokens_eos_lens = self.hparams.wav_augment.replicate_labels( - tokens_eos_lens + ( + tokens, + tokens_lens, + tokens_eos, + tokens_eos_lens, + ) = self.hparams.wav_augment.replicate_multiple_labels( + tokens, tokens_lens, tokens_eos, tokens_eos_lens ) loss_seq = self.hparams.seq_cost( diff --git a/recipes/AISHELL-1/ASR/transformer/hparams/train_ASR_transformer.yaml b/recipes/AISHELL-1/ASR/transformer/hparams/train_ASR_transformer.yaml index b98e371b2..408c9e680 
100644 --- a/recipes/AISHELL-1/ASR/transformer/hparams/train_ASR_transformer.yaml +++ b/recipes/AISHELL-1/ASR/transformer/hparams/train_ASR_transformer.yaml @@ -30,7 +30,8 @@ test_data: !ref <save_folder>/test.csv noise_annotation: !ref <save_folder>/noise.csv #The data manifest files are created by the data preparation script tokenizer_file: speechbrain/asr-transformer-aishell/tokenizer.ckpt -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 50 batch_size: 8 ctc_weight: 0.3 @@ -77,7 +78,7 @@ test_dataloader_opts: batch_size: !ref <batch_size> num_workers: !ref <num_workers> -####################### Model parameters ########################### +####################### Model Parameters ####################################### # Transformer d_model: 256 nhead: 4 @@ -103,7 +104,7 @@ valid_beam_size: 10 test_beam_size: 10 ctc_weight_decode: 0.40 -############################## models ################################ +############################## Models ########################################## CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd input_shape: (8, 10, 80) @@ -157,7 +158,8 @@ SGD: !name:torch.optim.SGD momentum: 0.99 nesterov: True -# Scorer +############################## Decoding & optimiser ############################ + ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer eos_index: !ref <eos_index> blank_index: !ref <blank_index> @@ -227,7 +229,7 @@ compute_features: !new:speechbrain.lobes.features.Fbank n_fft: !ref <n_fft> n_mels: !ref <n_mels> -# ----- WAVEFORM AUGMENTATION ----- # +############################## Augmentation #################################### # Download and prepare the dataset of noisy sequences for augmentation prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL @@ -238,75 +240,43 @@ prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_U # Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation - add_noise: !new:speechbrain.augment.time_domain.AddNoise csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> + snr_low: 0 + snr_high: 15 noise_sample_rate: !ref <sample_rate> clean_sample_rate: !ref <sample_rate> num_workers: !ref <num_workers> # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 1 max_augmentations: 1 augment_prob: 1.0 augmentations: [ !ref <add_noise>] - - # ----- FEATURE AUGMENTATION ----- # -time_drop_length_low: 0 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 100 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 2 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 2 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "zeros" # Method of dropping chunks - # Time Drop time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 - -freq_drop_length_low: 30 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 40 # Max 
length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "zeros" # Method of dropping chunks + drop_length_low: 0 + drop_length_high: 100 + drop_count_low: 2 + drop_count_high: 2 # Frequency Drop freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 30 + drop_length_high: 40 + drop_count_low: 2 + drop_count_high: 2 dim: 2 -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - # Time warp time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 1 max_augmentations: 1 augment_start_index: !ref <batch_size> # This leaves the original inputs unchanged @@ -317,6 +287,8 @@ fea_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <freq_drop>, !ref <time_warp>] +############################## Logging and Pretrainer ########################## + train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref <train_log> @@ -324,7 +296,6 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger # which Chinese writing normally does not do. # If remove_spaces, spaces are removed # from the transcript before computing CER.
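The fea_augment block above keeps concat_original: True together with augment_start_index: !ref <batch_size>, leaving the original inputs unchanged. A toy sketch of the resulting batch layout (an illustration under those assumptions, not the Augmenter internals): the first batch_size rows are the untouched originals, and augmentation applies only from augment_start_index onwards.

import torch

# Toy illustration of the configuration above (not the Augmenter code):
# concatenate the original batch with a copy, then augment only the rows
# starting at augment_start_index, leaving the originals untouched.
def concat_original_sketch(feats, augment_fn, augment_start_index):
    out = torch.cat([feats, feats.clone()], dim=0)  # batch axis doubles
    out[augment_start_index:] = augment_fn(out[augment_start_index:])
    return out

feats = torch.randn(8, 200, 80)  # (batch, time, n_mels)
out = concat_original_sketch(feats, lambda x: x * 0.0, augment_start_index=8)
print(out.shape)  # torch.Size([16, 200, 80]) -> labels must be replicated too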
-# (e.g., 祝 可爱 的 你 —> 祝可爱的你) remove_spaces: True split_tokens: !apply:operator.not_ [!ref <remove_spaces>] diff --git a/recipes/AISHELL-1/ASR/transformer/hparams/train_ASR_transformer_with_wav2vect.yaml b/recipes/AISHELL-1/ASR/transformer/hparams/train_ASR_transformer_with_wav2vect.yaml index 13a7826ad..a196afc58 100644 --- a/recipes/AISHELL-1/ASR/transformer/hparams/train_ASR_transformer_with_wav2vect.yaml +++ b/recipes/AISHELL-1/ASR/transformer/hparams/train_ASR_transformer_with_wav2vect.yaml @@ -30,7 +30,8 @@ wav2vec2_hub: facebook/wav2vec2-large-100k-voxpopuli wav2vec2_folder: !ref <save_folder>/wav2vec2_checkpoint freeze_wav2vec: False -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 80 batch_size: 2 grad_accumulation_factor: 16 @@ -72,7 +73,7 @@ test_dataloader_opts: batch_size: !ref <batch_size> num_workers: !ref <num_workers> -####################### Model parameters ########################### +####################### Model Parameters ####################################### # Transformer d_model: 256 nhead: 4 @@ -98,7 +99,7 @@ valid_beam_size: 10 test_beam_size: 10 ctc_weight_decode: 0.40 -############################## models ################################ +############################## Models ########################################## wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 source: !ref <wav2vec2_hub> @@ -140,44 +141,27 @@ model: !new:torch.nn.ModuleList - [!ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>] # Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. 
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -186,6 +170,7 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Decoding & optimiser ############################ # define two optimizers here for two-stage training Adam: !name:torch.optim.Adam @@ -257,6 +242,7 @@ noam_annealing_wav2vect: !new:speechbrain.nnet.schedulers.NoamScheduler n_warmup_steps: 25000 model_size: !ref <d_model> +############################## Logging and Pretrainer ########################## checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref <save_folder> @@ -278,7 +264,6 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger # which Chinese writing normally does not do. # If remove_spaces, spaces are removed # from the transcript before computing CER. -# (e.g., 祝 可爱 的 你 —> 祝可爱的你) remove_spaces: True split_tokens: !apply:operator.not_ [!ref <remove_spaces>] diff --git a/recipes/AISHELL-1/ASR/transformer/train.py b/recipes/AISHELL-1/ASR/transformer/train.py index 977563ac8..63361bf0d 100644 --- a/recipes/AISHELL-1/ASR/transformer/train.py +++ b/recipes/AISHELL-1/ASR/transformer/train.py @@ -82,28 +82,26 @@ class ASR(sb.core.Brain): tokens, tokens_lens = batch.tokens if stage == sb.Stage.TRAIN: + # Labels must be extended if parallel augmentation or concatenated + # augmentation was performed on the input (increasing the time dimension) if hasattr(self.hparams, "wav_augment"): - tokens = self.hparams.wav_augment.replicate_labels(tokens) - tokens_lens = self.hparams.wav_augment.replicate_labels( - tokens_lens - ) - tokens_eos = self.hparams.wav_augment.replicate_labels( - tokens_eos - ) - tokens_eos_lens = self.hparams.wav_augment.replicate_labels( - tokens_eos_lens + ( + tokens, + tokens_lens, + tokens_eos, + tokens_eos_lens, + ) = self.hparams.wav_augment.replicate_multiple_labels( + tokens, tokens_lens, tokens_eos, tokens_eos_lens ) if hasattr(self.hparams, "fea_augment"): - tokens = self.hparams.fea_augment.replicate_labels(tokens) - tokens_lens = self.hparams.fea_augment.replicate_labels( - tokens_lens - ) - tokens_eos = self.hparams.wav_augment.replicate_labels( - tokens_eos - ) - tokens_eos_lens = self.hparams.wav_augment.replicate_labels( - tokens_eos_lens + ( + tokens, + tokens_lens, + tokens_eos, + tokens_eos_lens, + ) = self.hparams.fea_augment.replicate_multiple_labels( + tokens, tokens_lens, tokens_eos, tokens_eos_lens ) loss_seq = self.hparams.seq_cost( diff --git a/recipes/AISHELL-1/ASR/transformer/train_with_wav2vect.py b/recipes/AISHELL-1/ASR/transformer/train_with_wav2vect.py index 94196ea7b..53aa47375 100644 --- 
a/recipes/AISHELL-1/ASR/transformer/train_with_wav2vect.py +++ b/recipes/AISHELL-1/ASR/transformer/train_with_wav2vect.py @@ -74,16 +74,16 @@ class ASR(sb.core.Brain): tokens, tokens_lens = batch.tokens if stage == sb.Stage.TRAIN: + # Labels must be extended if parallel augmentation or concatenated + # augmentation was performed on the input (increasing the time dimension) if hasattr(self.hparams, "wav_augment"): - tokens = self.hparams.wav_augment.replicate_labels(tokens) - tokens_lens = self.hparams.wav_augment.replicate_labels( - tokens_lens - ) - tokens_eos = self.hparams.wav_augment.replicate_labels( - tokens_eos - ) - tokens_eos_lens = self.hparams.wav_augment.replicate_labels( - tokens_eos_lens + ( + tokens, + tokens_lens, + tokens_eos, + tokens_eos_lens, + ) = self.hparams.wav_augment.replicate_multiple_labels( + tokens, tokens_lens, tokens_eos, tokens_eos_lens ) loss_seq = self.hparams.seq_cost( diff --git a/recipes/AISHELL-1/Tokenizer/hparams/tokenizer_bpe5000.yaml b/recipes/AISHELL-1/Tokenizer/hparams/tokenizer_bpe5000.yaml index 886d22bda..d2cb23018 100644 --- a/recipes/AISHELL-1/Tokenizer/hparams/tokenizer_bpe5000.yaml +++ b/recipes/AISHELL-1/Tokenizer/hparams/tokenizer_bpe5000.yaml @@ -15,7 +15,7 @@ train_csv: !ref <output_folder>/train.csv valid_csv: !ref <output_folder>/dev.csv -# Training parameters +####################### Training Parameters #################################### token_type: unigram # ["unigram", "bpe", "char"] token_output: 5000 # index(blank/eos/bos/unk) = 0 character_coverage: 1.0 diff --git a/recipes/AISHELL-1/Tokenizer/hparams/train_transformer_tokenizer_bpe5000.yaml b/recipes/AISHELL-1/Tokenizer/hparams/train_transformer_tokenizer_bpe5000.yaml index bc286156b..973df9a11 100644 --- a/recipes/AISHELL-1/Tokenizer/hparams/train_transformer_tokenizer_bpe5000.yaml +++ b/recipes/AISHELL-1/Tokenizer/hparams/train_transformer_tokenizer_bpe5000.yaml @@ -15,7 +15,7 @@ train_csv: !ref <output_folder>/train.csv valid_csv: !ref <output_folder>/dev.csv -# Training parameters +####################### Training Parameters #################################### token_type: unigram # ["unigram", "bpe", "char"] token_output: 5000 # index(blank/eos/bos/unk) = 0 character_coverage: 1.0 diff --git a/recipes/Aishell1Mix/separation/hparams/sepformer-aishell1mix2-wham.yaml b/recipes/Aishell1Mix/separation/hparams/sepformer-aishell1mix2-wham.yaml index 24571404f..d3cb9493e 100644 --- a/recipes/Aishell1Mix/separation/hparams/sepformer-aishell1mix2-wham.yaml +++ b/recipes/Aishell1Mix/separation/hparams/sepformer-aishell1mix2-wham.yaml @@ -40,7 +40,7 @@ noprogressbar: False save_audio: False # Save estimated sources on disk sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/Aishell1Mix/separation/hparams/sepformer-aishell1mix2.yaml b/recipes/Aishell1Mix/separation/hparams/sepformer-aishell1mix2.yaml index d4689378a..168471dbb 100644 --- a/recipes/Aishell1Mix/separation/hparams/sepformer-aishell1mix2.yaml +++ b/recipes/Aishell1Mix/separation/hparams/sepformer-aishell1mix2.yaml @@ -40,7 +40,7 @@ noprogressbar: False save_audio: False # Save estimated sources on disk sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/Aishell1Mix/separation/hparams/sepformer-aishell1mix3-wham.yaml 
b/recipes/Aishell1Mix/separation/hparams/sepformer-aishell1mix3-wham.yaml index 65add025a..834857ed7 100644 --- a/recipes/Aishell1Mix/separation/hparams/sepformer-aishell1mix3-wham.yaml +++ b/recipes/Aishell1Mix/separation/hparams/sepformer-aishell1mix3-wham.yaml @@ -40,7 +40,7 @@ noprogressbar: False save_audio: False # Save estimated sources on disk sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/Aishell1Mix/separation/hparams/sepformer-aishell1mix3.yaml b/recipes/Aishell1Mix/separation/hparams/sepformer-aishell1mix3.yaml index 3b27b796b..d48fdecb2 100644 --- a/recipes/Aishell1Mix/separation/hparams/sepformer-aishell1mix3.yaml +++ b/recipes/Aishell1Mix/separation/hparams/sepformer-aishell1mix3.yaml @@ -40,7 +40,7 @@ noprogressbar: False save_audio: False # Save estimated sources on disk sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-cross.yaml b/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-cross.yaml index f609c746e..043845aeb 100644 --- a/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-cross.yaml +++ b/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-cross.yaml @@ -43,7 +43,7 @@ save_audio: True # Save estimated sources on disk n_audio_to_save: 10 sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-independent.yaml b/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-independent.yaml index c3483f5f2..164ccc45b 100644 --- a/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-independent.yaml +++ b/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-independent.yaml @@ -43,7 +43,7 @@ save_audio: True # Save estimated sources on disk n_audio_to_save: 10 sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-parallel-noise.yaml b/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-parallel-noise.yaml index 74941dfea..fef85267f 100644 --- a/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-parallel-noise.yaml +++ b/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-parallel-noise.yaml @@ -43,7 +43,7 @@ save_audio: True # Save estimated sources on disk n_audio_to_save: 10 sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-parallel-reverb.yaml b/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-parallel-reverb.yaml index 6b1518d39..4ec5054f9 100644 --- a/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-parallel-reverb.yaml +++ b/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-parallel-reverb.yaml @@ -43,7 +43,7 @@ save_audio: True # Save estimated sources on disk n_audio_to_save: 10 sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git 
a/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-parallel.yaml b/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-parallel.yaml index a5d615679..adb31ddc6 100644 --- a/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-parallel.yaml +++ b/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-parallel.yaml @@ -43,7 +43,7 @@ save_audio: True # Save estimated sources on disk n_audio_to_save: 10 sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/CVSS/S2ST/hparams/train_fr-en.yaml b/recipes/CVSS/S2ST/hparams/train_fr-en.yaml index 8ff7d59be..678dd1c17 100644 --- a/recipes/CVSS/S2ST/hparams/train_fr-en.yaml +++ b/recipes/CVSS/S2ST/hparams/train_fr-en.yaml @@ -59,7 +59,7 @@ wav2vec2_download_path: !ref <save_folder>/pretrained_models wav2vec2_frozen: False wav2vec2_freeze_steps: 10000 -# Training parameters +####################### Training Parameters #################################### lr: 0.0005 lr_wav2vec: 0.00001 loss_reduction: batchmean diff --git a/recipes/CommonLanguage/lang_id/hparams/train_ecapa_tdnn.yaml b/recipes/CommonLanguage/lang_id/hparams/train_ecapa_tdnn.yaml index 0c4d91aa9..d4722f45c 100644 --- a/recipes/CommonLanguage/lang_id/hparams/train_ecapa_tdnn.yaml +++ b/recipes/CommonLanguage/lang_id/hparams/train_ecapa_tdnn.yaml @@ -38,10 +38,10 @@ error_stats: !name:speechbrain.utils.metric_stats.MetricStats metric: !name:speechbrain.nnet.losses.classification_error reduction: batch +####################### Training Parameters #################################### + # Feature parameters btw: 40 - 80 n_mels: 80 - -# Training Parameters sample_rate: 16000 number_of_epochs: 30 batch_size: 4 @@ -64,6 +64,8 @@ test_dataloader_options: batch_size: !ref <batch_size> shuffle: True +############################## Augmentations ################################### + # Download and prepare the dataset of noisy sequences for augmentation prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL URL: !ref <NOISE_DATASET_URL> @@ -78,7 +80,6 @@ prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL ext: wav csv_file: !ref <rir_annotation> - # Add reverberation to input signal add_reverb: !new:speechbrain.augment.time_domain.AddReverb csv_file: !ref <rir_annotation> @@ -87,27 +88,21 @@ add_reverb: !new:speechbrain.augment.time_domain.AddReverb num_workers: !ref <num_workers> # Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation add_noise: !new:speechbrain.augment.time_domain.AddNoise csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> + snr_low: 0 + snr_high: 15 noise_sample_rate: !ref <sample_rate> clean_sample_rate: !ref <sample_rate> num_workers: !ref <num_workers> # Speed perturbation -speed_changes: [90, 100, 110] # List of speed changes for time-stretching speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 shuffle_augmentations: True min_augmentations: 1 max_augmentations: 3 @@ -125,6 +120,8 @@ mean_var_norm_input: !new:speechbrain.processing.features.InputNormalization norm_type: sentence std_norm: False 
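The shortened wav_augment block in the CommonLanguage hunk above still spells out the sampling policy: per batch, between min_augmentations and max_augmentations of the listed transforms are applied, in shuffled order when shuffle_augmentations: True. A rough sketch of that selection step follows; it is illustrative only, since the real logic lives inside speechbrain.augment.augmenter.Augmenter.

import random

# Illustrative selection step only; the actual behaviour is implemented in
# speechbrain.augment.augmenter.Augmenter.
def sample_pipeline_sketch(augmentations, min_n, max_n, shuffle):
    k = random.randint(min_n, min(max_n, len(augmentations)))
    return random.sample(augmentations, k) if shuffle else augmentations[:k]

chosen = sample_pipeline_sketch(
    ["add_reverb", "add_noise", "speed_perturb"], min_n=1, max_n=3, shuffle=True
)
print(chosen)  # e.g. ['speed_perturb', 'add_noise']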
+############################## Models ########################################## + # To design a custom model, either just edit the simple CustomModel # class that's listed here, or replace this `!new` call with a line # pointing to a different file you've defined. @@ -182,6 +179,8 @@ lr_annealing: !new:speechbrain.nnet.schedulers.LinearScheduler final_value: !ref <lr_final> epoch_count: !ref <number_of_epochs> +############################## Logging and Pretrainer ########################## + # This object is used for saving the state of training both so that it # can be resumed if it gets interrupted, and also so that the best checkpoint # can be later loaded for evaluation or inference. diff --git a/recipes/CommonVoice/ASR/CTC/hparams/train_ar_with_wav2vec.yaml b/recipes/CommonVoice/ASR/CTC/hparams/train_ar_with_wav2vec.yaml index caf7f2d3b..643df0994 100644 --- a/recipes/CommonVoice/ASR/CTC/hparams/train_ar_with_wav2vec.yaml +++ b/recipes/CommonVoice/ASR/CTC/hparams/train_ar_with_wav2vec.yaml @@ -32,7 +32,8 @@ skip_prep: False # Skip data preparation # longer sentences certainly correspond to "open microphones". avoid_if_longer_than: 10.0 -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 30 lr: 1.0 lr_wav2vec: 0.0001 @@ -58,7 +59,7 @@ test_dataloader_options: token_type: unigram # ["unigram", "bpe", "char"] character_coverage: 1.0 -# Model parameters +####################### Model Parameters ####################################### wav2vec_output_dim: 1024 dnn_neurons: 1024 freeze_wav2vec: False @@ -97,45 +98,30 @@ test_beam_search: epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. 
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -144,6 +130,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + enc: !new:speechbrain.nnet.containers.Sequential input_shape: [null, null, !ref <wav2vec_output_dim>] linear1: !name:speechbrain.nnet.linear.Linear diff --git a/recipes/CommonVoice/ASR/CTC/hparams/train_de_with_wav2vec.yaml b/recipes/CommonVoice/ASR/CTC/hparams/train_de_with_wav2vec.yaml index dc4ae34b2..adb8e5bb5 100644 --- a/recipes/CommonVoice/ASR/CTC/hparams/train_de_with_wav2vec.yaml +++ b/recipes/CommonVoice/ASR/CTC/hparams/train_de_with_wav2vec.yaml @@ -33,7 +33,8 @@ skip_prep: False # longer sentences certainly correspond to "open microphones". avoid_if_longer_than: 10.0 -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 45 lr: 1.0 lr_wav2vec: 0.0001 @@ -61,7 +62,7 @@ test_dataloader_options: token_type: char # ["unigram", "bpe", "char"] character_coverage: 1.0 -# Model parameters +####################### Model Parameters ####################################### # activation: !name:torch.nn.LeakyReLU dnn_neurons: 1024 wav2vec_output_dim: !ref <dnn_neurons> @@ -95,45 +96,30 @@ test_beam_search: epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. 
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -142,6 +128,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + enc: !new:speechbrain.nnet.containers.Sequential input_shape: [null, null, !ref <wav2vec_output_dim>] linear1: !name:speechbrain.nnet.linear.Linear diff --git a/recipes/CommonVoice/ASR/CTC/hparams/train_en_with_wav2vec.yaml b/recipes/CommonVoice/ASR/CTC/hparams/train_en_with_wav2vec.yaml index f3c68ee9b..d8aaea36e 100644 --- a/recipes/CommonVoice/ASR/CTC/hparams/train_en_with_wav2vec.yaml +++ b/recipes/CommonVoice/ASR/CTC/hparams/train_en_with_wav2vec.yaml @@ -32,7 +32,8 @@ skip_prep: False # Skip data preparation # longer sentences certainly correspond to "open microphones". avoid_if_longer_than: 10.0 -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 30 lr: 1.0 lr_wav2vec: 0.0001 @@ -58,7 +59,7 @@ test_dataloader_options: token_type: unigram # ["unigram", "bpe", "char"] character_coverage: 1.0 -# Model parameters +####################### Model Parameters ####################################### # activation: !name:torch.nn.LeakyReLU wav2vec_output_dim: 1024 dnn_neurons: 1024 @@ -95,45 +96,30 @@ test_beam_search: epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. 
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -142,6 +128,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + enc: !new:speechbrain.nnet.containers.Sequential input_shape: [null, null, !ref <wav2vec_output_dim>] linear1: !name:speechbrain.nnet.linear.Linear diff --git a/recipes/CommonVoice/ASR/CTC/hparams/train_es_with_wav2vec.yaml b/recipes/CommonVoice/ASR/CTC/hparams/train_es_with_wav2vec.yaml index 8e2056f83..e32a242d1 100644 --- a/recipes/CommonVoice/ASR/CTC/hparams/train_es_with_wav2vec.yaml +++ b/recipes/CommonVoice/ASR/CTC/hparams/train_es_with_wav2vec.yaml @@ -32,7 +32,8 @@ skip_prep: False # Skip data preparation # longer sentences certainly correspond to "open microphones". avoid_if_longer_than: 10.0 -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 30 lr: 1.0 lr_wav2vec: 0.0001 @@ -58,7 +59,7 @@ test_dataloader_options: token_type: unigram # ["unigram", "bpe", "char"] character_coverage: 1.0 -# Model parameters +####################### Model Parameters ####################################### wav2vec_output_dim: 1024 dnn_neurons: 1024 freeze_wav2vec: False @@ -96,45 +97,30 @@ test_beam_search: epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. 
-drop_chunk_count_low: 1 # Min number of audio chunks to drop
-drop_chunk_count_high: 5 # Max number of audio chunks to drop
-drop_chunk_length_low: 1000 # Min length of audio chunks to drop
-drop_chunk_length_high: 2000 # Max length of audio chunks to drop
-
 drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
-    drop_length_low: !ref <drop_chunk_length_low>
-    drop_length_high: !ref <drop_chunk_length_high>
-    drop_count_low: !ref <drop_chunk_count_low>
-    drop_count_high: !ref <drop_chunk_count_high>
+    drop_length_low: 1000
+    drop_length_high: 2000
+    drop_count_low: 1
+    drop_count_high: 5

 # Augmenter: Combines previously defined augmentations to perform data augmentation
 wav_augment: !new:speechbrain.augment.augmenter.Augmenter
-    parallel_augment: False
-    concat_original: False
-    repeat_augment: 1
-    shuffle_augmentations: False
     min_augmentations: 3
     max_augmentations: 3
     augment_prob: 1.0
@@ -143,6 +129,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter
         !ref <drop_freq>,
         !ref <drop_chunk>]

+############################## Models ##########################################
+
 enc: !new:speechbrain.nnet.containers.Sequential
     input_shape: [null, null, !ref <wav2vec_output_dim>]
     linear1: !name:speechbrain.nnet.linear.Linear
diff --git a/recipes/CommonVoice/ASR/CTC/hparams/train_fr_with_wav2vec.yaml b/recipes/CommonVoice/ASR/CTC/hparams/train_fr_with_wav2vec.yaml
index 44f1523f3..079cfe73f 100644
--- a/recipes/CommonVoice/ASR/CTC/hparams/train_fr_with_wav2vec.yaml
+++ b/recipes/CommonVoice/ASR/CTC/hparams/train_fr_with_wav2vec.yaml
@@ -32,7 +32,8 @@ skip_prep: False # Skip data preparation
 # longer sentences certainly correspond to "open microphones".
 avoid_if_longer_than: 10.0

-# Training parameters
+####################### Training Parameters ####################################
+
 number_of_epochs: 30
 lr: 1.0
 lr_wav2vec: 0.0001
@@ -58,7 +59,7 @@ test_dataloader_options:
 token_type: char # ["unigram", "bpe", "char"]
 character_coverage: 1.0

-# Model parameters
+####################### Model Parameters #######################################
 # activation: !name:torch.nn.LeakyReLU
 wav2vec_output_dim: 1024
 dnn_neurons: 1024
@@ -94,45 +95,30 @@ test_beam_search:
 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
     limit: !ref <number_of_epochs>

-# Speed perturbation
-speed_changes: [95, 100, 105] # List of speed changes for time-stretching
+############################## Augmentations ###################################

+# Speed perturbation
 speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
     orig_freq: !ref <sample_rate>
-    speeds: !ref <speed_changes>
+    speeds: [95, 100, 105]

 # Frequency drop: randomly drops a number of frequency bands to zero.
-drop_freq_low: 0 # Min frequency band dropout probability
-drop_freq_high: 1 # Max frequency band dropout probability
-drop_freq_count_low: 1 # Min number of frequency bands to drop
-drop_freq_count_high: 3 # Max number of frequency bands to drop
-drop_freq_width: 0.05 # Width of frequency bands to drop
-
 drop_freq: !new:speechbrain.augment.time_domain.DropFreq
-    drop_freq_low: !ref <drop_freq_low>
-    drop_freq_high: !ref <drop_freq_high>
-    drop_freq_count_low: !ref <drop_freq_count_low>
-    drop_freq_count_high: !ref <drop_freq_count_high>
-    drop_freq_width: !ref <drop_freq_width>
+    drop_freq_low: 0
+    drop_freq_high: 1
+    drop_freq_count_low: 1
+    drop_freq_count_high: 3
+    drop_freq_width: 0.05

 # Time drop: randomly drops a number of temporal chunks.
-drop_chunk_count_low: 1 # Min number of audio chunks to drop
-drop_chunk_count_high: 5 # Max number of audio chunks to drop
-drop_chunk_length_low: 1000 # Min length of audio chunks to drop
-drop_chunk_length_high: 2000 # Max length of audio chunks to drop
-
 drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
-    drop_length_low: !ref <drop_chunk_length_low>
-    drop_length_high: !ref <drop_chunk_length_high>
-    drop_count_low: !ref <drop_chunk_count_low>
-    drop_count_high: !ref <drop_chunk_count_high>
+    drop_length_low: 1000
+    drop_length_high: 2000
+    drop_count_low: 1
+    drop_count_high: 5

 # Augmenter: Combines previously defined augmentations to perform data augmentation
 wav_augment: !new:speechbrain.augment.augmenter.Augmenter
-    parallel_augment: False
-    concat_original: False
-    repeat_augment: 1
-    shuffle_augmentations: False
     min_augmentations: 3
     max_augmentations: 3
     augment_prob: 1.0
@@ -141,6 +127,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter
         !ref <drop_freq>,
         !ref <drop_chunk>]

+############################## Models ##########################################
+
 enc: !new:speechbrain.nnet.containers.Sequential
     input_shape: [null, null, !ref <wav2vec_output_dim>]
     linear1: !name:speechbrain.nnet.linear.Linear
diff --git a/recipes/CommonVoice/ASR/CTC/hparams/train_it_with_wav2vec.yaml b/recipes/CommonVoice/ASR/CTC/hparams/train_it_with_wav2vec.yaml
index 4f39ad2a0..033299752 100644
--- a/recipes/CommonVoice/ASR/CTC/hparams/train_it_with_wav2vec.yaml
+++ b/recipes/CommonVoice/ASR/CTC/hparams/train_it_with_wav2vec.yaml
@@ -33,7 +33,8 @@ skip_prep: False # Skip data preparation
 # longer sentences certainly correspond to "open microphones".
 avoid_if_longer_than: 8.0

-# Training parameters
+####################### Training Parameters ####################################
+
 number_of_epochs: 45
 lr: 1.0
 lr_wav2vec: 0.0001
@@ -59,7 +60,7 @@ test_dataloader_options:
 token_type: unigram # ["unigram", "bpe", "char"]
 character_coverage: 1.0

-# Model parameters
+####################### Model Parameters #######################################
 # activation: !name:torch.nn.LeakyReLU
 wav2vec_output_dim: 1024
 dnn_neurons: 1024
@@ -95,45 +96,30 @@ test_beam_search:
 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
     limit: !ref <number_of_epochs>

-# Speed perturbation
-speed_changes: [95, 100, 105] # List of speed changes for time-stretching
+############################## Augmentations ###################################

+# Speed perturbation
 speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
     orig_freq: !ref <sample_rate>
-    speeds: !ref <speed_changes>
+    speeds: [95, 100, 105]

 # Frequency drop: randomly drops a number of frequency bands to zero.
-drop_freq_low: 0 # Min frequency band dropout probability
-drop_freq_high: 1 # Max frequency band dropout probability
-drop_freq_count_low: 1 # Min number of frequency bands to drop
-drop_freq_count_high: 3 # Max number of frequency bands to drop
-drop_freq_width: 0.05 # Width of frequency bands to drop
-
 drop_freq: !new:speechbrain.augment.time_domain.DropFreq
-    drop_freq_low: !ref <drop_freq_low>
-    drop_freq_high: !ref <drop_freq_high>
-    drop_freq_count_low: !ref <drop_freq_count_low>
-    drop_freq_count_high: !ref <drop_freq_count_high>
-    drop_freq_width: !ref <drop_freq_width>
+    drop_freq_low: 0
+    drop_freq_high: 1
+    drop_freq_count_low: 1
+    drop_freq_count_high: 3
+    drop_freq_width: 0.05

 # Time drop: randomly drops a number of temporal chunks.
-drop_chunk_count_low: 1 # Min number of audio chunks to drop
-drop_chunk_count_high: 5 # Max number of audio chunks to drop
-drop_chunk_length_low: 1000 # Min length of audio chunks to drop
-drop_chunk_length_high: 2000 # Max length of audio chunks to drop
-
 drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
-    drop_length_low: !ref <drop_chunk_length_low>
-    drop_length_high: !ref <drop_chunk_length_high>
-    drop_count_low: !ref <drop_chunk_count_low>
-    drop_count_high: !ref <drop_chunk_count_high>
+    drop_length_low: 1000
+    drop_length_high: 2000
+    drop_count_low: 1
+    drop_count_high: 5

 # Augmenter: Combines previously defined augmentations to perform data augmentation
 wav_augment: !new:speechbrain.augment.augmenter.Augmenter
-    parallel_augment: False
-    concat_original: False
-    repeat_augment: 1
-    shuffle_augmentations: False
     min_augmentations: 3
     max_augmentations: 3
     augment_prob: 1.0
@@ -141,6 +127,9 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter
         !ref <speed_perturb>,
         !ref <drop_freq>,
         !ref <drop_chunk>]
+
+############################## Models ##########################################
+
 enc: !new:speechbrain.nnet.containers.Sequential
     input_shape: [null, null, !ref <wav2vec_output_dim>]
     linear1: !name:speechbrain.nnet.linear.Linear
diff --git a/recipes/CommonVoice/ASR/CTC/hparams/train_pt_with_wav2vec.yaml b/recipes/CommonVoice/ASR/CTC/hparams/train_pt_with_wav2vec.yaml
index da8a28de5..d4b703eb4 100644
--- a/recipes/CommonVoice/ASR/CTC/hparams/train_pt_with_wav2vec.yaml
+++ b/recipes/CommonVoice/ASR/CTC/hparams/train_pt_with_wav2vec.yaml
@@ -31,7 +31,8 @@ skip_prep: False # Skip data preparation
 # longer sentences certainly correspond to "open microphones".
 avoid_if_longer_than: 10.0

-# Training parameters
+####################### Training Parameters ####################################
+
 number_of_epochs: 30
 lr: 1.0
 lr_wav2vec: 0.0001
@@ -57,7 +58,7 @@ test_dataloader_options:
 token_type: unigram # ["unigram", "bpe", "char"]
 character_coverage: 1.0

-# Model parameters
+####################### Model Parameters #######################################
 wav2vec_output_dim: 1024
 dnn_neurons: 1024
 freeze_wav2vec: False
@@ -95,45 +96,30 @@ test_beam_search:
 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
     limit: !ref <number_of_epochs>

-# Speed perturbation
-speed_changes: [95, 100, 105] # List of speed changes for time-stretching
+############################## Augmentations ###################################

+# Speed perturbation
 speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
     orig_freq: !ref <sample_rate>
-    speeds: !ref <speed_changes>
+    speeds: [95, 100, 105]

 # Frequency drop: randomly drops a number of frequency bands to zero.
-drop_freq_low: 0 # Min frequency band dropout probability
-drop_freq_high: 1 # Max frequency band dropout probability
-drop_freq_count_low: 1 # Min number of frequency bands to drop
-drop_freq_count_high: 3 # Max number of frequency bands to drop
-drop_freq_width: 0.05 # Width of frequency bands to drop
-
 drop_freq: !new:speechbrain.augment.time_domain.DropFreq
-    drop_freq_low: !ref <drop_freq_low>
-    drop_freq_high: !ref <drop_freq_high>
-    drop_freq_count_low: !ref <drop_freq_count_low>
-    drop_freq_count_high: !ref <drop_freq_count_high>
-    drop_freq_width: !ref <drop_freq_width>
+    drop_freq_low: 0
+    drop_freq_high: 1
+    drop_freq_count_low: 1
+    drop_freq_count_high: 3
+    drop_freq_width: 0.05

 # Time drop: randomly drops a number of temporal chunks.
-drop_chunk_count_low: 1 # Min number of audio chunks to drop
-drop_chunk_count_high: 5 # Max number of audio chunks to drop
-drop_chunk_length_low: 1000 # Min length of audio chunks to drop
-drop_chunk_length_high: 2000 # Max length of audio chunks to drop
-
 drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
-    drop_length_low: !ref <drop_chunk_length_low>
-    drop_length_high: !ref <drop_chunk_length_high>
-    drop_count_low: !ref <drop_chunk_count_low>
-    drop_count_high: !ref <drop_chunk_count_high>
+    drop_length_low: 1000
+    drop_length_high: 2000
+    drop_count_low: 1
+    drop_count_high: 5

 # Augmenter: Combines previously defined augmentations to perform data augmentation
 wav_augment: !new:speechbrain.augment.augmenter.Augmenter
-    parallel_augment: False
-    concat_original: False
-    repeat_augment: 1
-    shuffle_augmentations: False
     min_augmentations: 3
     max_augmentations: 3
     augment_prob: 1.0
@@ -142,6 +128,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter
         !ref <drop_freq>,
         !ref <drop_chunk>]

+############################## Models ##########################################
+
 enc: !new:speechbrain.nnet.containers.Sequential
     input_shape: [null, null, !ref <wav2vec_output_dim>]
     linear1: !name:speechbrain.nnet.linear.Linear
diff --git a/recipes/CommonVoice/ASR/CTC/hparams/train_rw_with_wav2vec.yaml b/recipes/CommonVoice/ASR/CTC/hparams/train_rw_with_wav2vec.yaml
index f92d8ad13..ed15a8aad 100644
--- a/recipes/CommonVoice/ASR/CTC/hparams/train_rw_with_wav2vec.yaml
+++ b/recipes/CommonVoice/ASR/CTC/hparams/train_rw_with_wav2vec.yaml
@@ -32,7 +32,8 @@ skip_prep: False # Skip data preparation
 # longer sentences certainly correspond to "open microphones".
 avoid_if_longer_than: 10.0

-# Training parameters
+####################### Training Parameters ####################################
+
 number_of_epochs: 30
 lr: 1.0
 lr_wav2vec: 0.0001
@@ -59,7 +60,7 @@ test_dataloader_options:
 token_type: unigram # ["unigram", "bpe", "char"]
 character_coverage: 1.0

-# Model parameters
+####################### Model Parameters #######################################
 # activation: !name:torch.nn.LeakyReLU
 wav2vec_output_dim: 1024
 dnn_neurons: 1024
@@ -95,45 +96,30 @@ test_beam_search:
 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
     limit: !ref <number_of_epochs>

-# Speed perturbation
-speed_changes: [95, 100, 105] # List of speed changes for time-stretching
+############################## Augmentations ###################################

+# Speed perturbation
 speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
     orig_freq: !ref <sample_rate>
-    speeds: !ref <speed_changes>
+    speeds: [95, 100, 105]

 # Frequency drop: randomly drops a number of frequency bands to zero.
-drop_freq_low: 0 # Min frequency band dropout probability
-drop_freq_high: 1 # Max frequency band dropout probability
-drop_freq_count_low: 1 # Min number of frequency bands to drop
-drop_freq_count_high: 3 # Max number of frequency bands to drop
-drop_freq_width: 0.05 # Width of frequency bands to drop
-
 drop_freq: !new:speechbrain.augment.time_domain.DropFreq
-    drop_freq_low: !ref <drop_freq_low>
-    drop_freq_high: !ref <drop_freq_high>
-    drop_freq_count_low: !ref <drop_freq_count_low>
-    drop_freq_count_high: !ref <drop_freq_count_high>
-    drop_freq_width: !ref <drop_freq_width>
+    drop_freq_low: 0
+    drop_freq_high: 1
+    drop_freq_count_low: 1
+    drop_freq_count_high: 3
+    drop_freq_width: 0.05

 # Time drop: randomly drops a number of temporal chunks.
-drop_chunk_count_low: 1 # Min number of audio chunks to drop
-drop_chunk_count_high: 5 # Max number of audio chunks to drop
-drop_chunk_length_low: 1000 # Min length of audio chunks to drop
-drop_chunk_length_high: 2000 # Max length of audio chunks to drop
-
 drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
-    drop_length_low: !ref <drop_chunk_length_low>
-    drop_length_high: !ref <drop_chunk_length_high>
-    drop_count_low: !ref <drop_chunk_count_low>
-    drop_count_high: !ref <drop_chunk_count_high>
+    drop_length_low: 1000
+    drop_length_high: 2000
+    drop_count_low: 1
+    drop_count_high: 5

 # Augmenter: Combines previously defined augmentations to perform data augmentation
 wav_augment: !new:speechbrain.augment.augmenter.Augmenter
-    parallel_augment: False
-    concat_original: False
-    repeat_augment: 1
-    shuffle_augmentations: False
     min_augmentations: 3
     max_augmentations: 3
     augment_prob: 1.0
@@ -142,6 +128,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter
         !ref <drop_freq>,
         !ref <drop_chunk>]

+############################## Models ##########################################
+
 enc: !new:speechbrain.nnet.containers.Sequential
     input_shape: [null, null, !ref <wav2vec_output_dim>]
     linear1: !name:speechbrain.nnet.linear.Linear
diff --git a/recipes/CommonVoice/ASR/CTC/hparams/train_zh-CN_with_wav2vec.yaml b/recipes/CommonVoice/ASR/CTC/hparams/train_zh-CN_with_wav2vec.yaml
index 513d8d324..a1709931a 100644
--- a/recipes/CommonVoice/ASR/CTC/hparams/train_zh-CN_with_wav2vec.yaml
+++ b/recipes/CommonVoice/ASR/CTC/hparams/train_zh-CN_with_wav2vec.yaml
@@ -32,7 +32,8 @@ skip_prep: False # Skip data preparation
 # longer sentences certainly correspond to "open microphones".
 avoid_if_longer_than: 10.0

-# Training parameters
+####################### Training Parameters ####################################
+
 number_of_epochs: 30
 lr: 1.0
 lr_wav2vec: 0.0001
@@ -59,7 +60,7 @@ test_dataloader_options:
 token_type: unigram # ["unigram", "bpe", "char"]
 character_coverage: 1.0

-# Model parameters
+####################### Model Parameters #######################################
 wav2vec_output_dim: 1024
 dnn_neurons: 1024
 freeze_wav2vec: False
@@ -97,45 +98,30 @@ test_beam_search:
 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
     limit: !ref <number_of_epochs>

-# Speed perturbation
-speed_changes: [95, 100, 105] # List of speed changes for time-stretching
+############################## Augmentations ###################################

+# Speed perturbation
 speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
     orig_freq: !ref <sample_rate>
-    speeds: !ref <speed_changes>
+    speeds: [95, 100, 105]

 # Frequency drop: randomly drops a number of frequency bands to zero.
-drop_freq_low: 0 # Min frequency band dropout probability
-drop_freq_high: 1 # Max frequency band dropout probability
-drop_freq_count_low: 1 # Min number of frequency bands to drop
-drop_freq_count_high: 3 # Max number of frequency bands to drop
-drop_freq_width: 0.05 # Width of frequency bands to drop
-
 drop_freq: !new:speechbrain.augment.time_domain.DropFreq
-    drop_freq_low: !ref <drop_freq_low>
-    drop_freq_high: !ref <drop_freq_high>
-    drop_freq_count_low: !ref <drop_freq_count_low>
-    drop_freq_count_high: !ref <drop_freq_count_high>
-    drop_freq_width: !ref <drop_freq_width>
+    drop_freq_low: 0
+    drop_freq_high: 1
+    drop_freq_count_low: 1
+    drop_freq_count_high: 3
+    drop_freq_width: 0.05

 # Time drop: randomly drops a number of temporal chunks.
-drop_chunk_count_low: 1 # Min number of audio chunks to drop
-drop_chunk_count_high: 5 # Max number of audio chunks to drop
-drop_chunk_length_low: 1000 # Min length of audio chunks to drop
-drop_chunk_length_high: 2000 # Max length of audio chunks to drop
-
 drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
-    drop_length_low: !ref <drop_chunk_length_low>
-    drop_length_high: !ref <drop_chunk_length_high>
-    drop_count_low: !ref <drop_chunk_count_low>
-    drop_count_high: !ref <drop_chunk_count_high>
+    drop_length_low: 1000
+    drop_length_high: 2000
+    drop_count_low: 1
+    drop_count_high: 5

 # Augmenter: Combines previously defined augmentations to perform data augmentation
 wav_augment: !new:speechbrain.augment.augmenter.Augmenter
-    parallel_augment: False
-    concat_original: False
-    repeat_augment: 1
-    shuffle_augmentations: False
     min_augmentations: 3
     max_augmentations: 3
     augment_prob: 1.0
@@ -144,6 +130,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter
         !ref <drop_freq>,
         !ref <drop_chunk>]

+############################## Models ##########################################
+
 enc: !new:speechbrain.nnet.containers.Sequential
     input_shape: [null, null, !ref <wav2vec_output_dim>]
     linear1: !name:speechbrain.nnet.linear.Linear
diff --git a/recipes/CommonVoice/ASR/seq2seq/hparams/train_de.yaml b/recipes/CommonVoice/ASR/seq2seq/hparams/train_de.yaml
index b66374147..cb6f2b3be 100644
--- a/recipes/CommonVoice/ASR/seq2seq/hparams/train_de.yaml
+++ b/recipes/CommonVoice/ASR/seq2seq/hparams/train_de.yaml
@@ -30,7 +30,8 @@ skip_prep: False # Skip data preparation
 # longer sentences certainly correspond to "open microphones".
 avoid_if_longer_than: 10.0

-# Training parameters
+####################### Training Parameters ####################################
+
 number_of_epochs: 25
 number_of_ctc_epochs: 20
 lr: 1.0
@@ -62,7 +63,7 @@ sample_rate: 16000
 n_fft: 400
 n_mels: 80

-# Model parameters
+####################### Model Parameters #######################################
 activation: !name:torch.nn.LeakyReLU
 dropout: 0.15
 cnn_blocks: 3
@@ -104,51 +105,27 @@ temperature: 1.50
 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
     limit: !ref <number_of_epochs>

+############################## Augmentations ###################################
 # Time Drop
-time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram
-time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram
-time_drop_count_low: 5 # Min number of chunks to drop in time in the spectrogram
-time_drop_count_high: 5 # Max number of chunks to drop in time in the spectrogram
-time_drop_replace: "zeros" # Method of dropping chunks
-
 time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop
-    drop_length_low: !ref <time_drop_length_low>
-    drop_length_high: !ref <time_drop_length_high>
-    drop_count_low: !ref <time_drop_count_low>
-    drop_count_high: !ref <time_drop_count_high>
-    replace: !ref <time_drop_replace>
-    dim: 1
+    drop_length_low: 15
+    drop_length_high: 25
+    drop_count_low: 5
+    drop_count_high: 5

 # Frequency Drop
-freq_drop_length_low: 25 # Min length for chunks to drop in frequency in the spectrogram
-freq_drop_length_high: 35 # Max length for chunks to drop in frequency in the spectrogram
-freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram
-freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram
-freq_drop_replace: "zeros" # Method of dropping chunks
-
 freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop
-    drop_length_low: !ref <freq_drop_length_low>
-    drop_length_high: !ref <freq_drop_length_high>
-    drop_count_low: !ref <freq_drop_count_low>
-    drop_count_high: !ref <freq_drop_count_high>
-    replace: !ref <freq_drop_replace>
+    drop_length_low: 25
+    drop_length_high: 35
+    drop_count_low: 2
+    drop_count_high: 2
     dim: 2

 # Time warp
-time_warp_window: 5 # Length of time warping window
-time_warp_mode: "bicubic" # Time warping method
-
 time_warp: !new:speechbrain.augment.freq_domain.Warping
-    warp_window: !ref <time_warp_window>
-    warp_mode: !ref <time_warp_mode>
-    dim: 1

 fea_augment: !new:speechbrain.augment.augmenter.Augmenter
-    parallel_augment: False
-    concat_original: False
-    repeat_augment: 1
-    shuffle_augmentations: False
     min_augmentations: 3
     max_augmentations: 3
     augment_prob: 1.0
@@ -165,6 +142,8 @@ compute_features: !new:speechbrain.lobes.features.Fbank
     n_fft: !ref <n_fft>
     n_mels: !ref <n_mels>

+############################## Models ##########################################
+
 enc: !new:speechbrain.lobes.models.CRDNN.CRDNN
     input_shape: [null, null, !ref <n_mels>]
     activation: !ref <activation>
diff --git a/recipes/CommonVoice/ASR/seq2seq/hparams/train_en.yaml b/recipes/CommonVoice/ASR/seq2seq/hparams/train_en.yaml
index c74d25c66..49f9a0d2b 100644
--- a/recipes/CommonVoice/ASR/seq2seq/hparams/train_en.yaml
+++ b/recipes/CommonVoice/ASR/seq2seq/hparams/train_en.yaml
@@ -29,7 +29,8 @@ skip_prep: False # Skip data preparation
 # longer sentences certainly correspond to "open microphones".
 avoid_if_longer_than: 10.0

-# Training parameters
+####################### Training Parameters ####################################
+
 number_of_epochs: 25
 number_of_ctc_epochs: 10
 lr: 1.0
@@ -60,7 +61,7 @@ sample_rate: 16000
 n_fft: 400
 n_mels: 80

-# Model parameters
+####################### Model Parameters #######################################
 activation: !name:torch.nn.LeakyReLU
 dropout: 0.15
 cnn_blocks: 3
@@ -102,50 +103,27 @@ temperature: 1.50
 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
     limit: !ref <number_of_epochs>

-# Time Drop
-time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram
-time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram
-time_drop_count_low: 5 # Min number of chunks to drop in time in the spectrogram
-time_drop_count_high: 5 # Max number of chunks to drop in time in the spectrogram
-time_drop_replace: "zeros" # Method of dropping chunks
+############################## Augmentations ###################################

+# Time Drop
 time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop
-    drop_length_low: !ref <time_drop_length_low>
-    drop_length_high: !ref <time_drop_length_high>
-    drop_count_low: !ref <time_drop_count_low>
-    drop_count_high: !ref <time_drop_count_high>
-    replace: !ref <time_drop_replace>
-    dim: 1
+    drop_length_low: 15
+    drop_length_high: 25
+    drop_count_low: 5
+    drop_count_high: 5

 # Frequency Drop
-freq_drop_length_low: 25 # Min length for chunks to drop in frequency in the spectrogram
-freq_drop_length_high: 35 # Max length for chunks to drop in frequency in the spectrogram
-freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram
-freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram
-freq_drop_replace: "zeros" # Method of dropping chunks
-
 freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop
-    drop_length_low: !ref <freq_drop_length_low>
-    drop_length_high: !ref <freq_drop_length_high>
-    drop_count_low: !ref <freq_drop_count_low>
-    drop_count_high: !ref <freq_drop_count_high>
-    replace: !ref <freq_drop_replace>
+    drop_length_low: 25
+    drop_length_high: 35
+    drop_count_low: 2
+    drop_count_high: 2
     dim: 2

 # Time warp
-time_warp_window: 5 # Length of time warping window
-time_warp_mode: "bicubic" # Time warping method
-
 time_warp: !new:speechbrain.augment.freq_domain.Warping
-    warp_window: !ref <time_warp_window>
-    warp_mode: !ref <time_warp_mode>
-    dim: 1

 fea_augment: !new:speechbrain.augment.augmenter.Augmenter
-    parallel_augment: False
-    concat_original: False
-    repeat_augment: 1
-    shuffle_augmentations: False
     min_augmentations: 3
     max_augmentations: 3
     augment_prob: 1.0
@@ -162,6 +140,8 @@ compute_features: !new:speechbrain.lobes.features.Fbank
     n_fft: !ref <n_fft>
     n_mels: !ref <n_mels>

+############################## Models ##########################################
+
 enc: !new:speechbrain.lobes.models.CRDNN.CRDNN
     input_shape: [null, null, !ref <n_mels>]
     activation: !ref <activation>
diff --git a/recipes/CommonVoice/ASR/seq2seq/hparams/train_es.yaml b/recipes/CommonVoice/ASR/seq2seq/hparams/train_es.yaml
index b56a75b69..b94373e9b 100644
--- a/recipes/CommonVoice/ASR/seq2seq/hparams/train_es.yaml
+++ b/recipes/CommonVoice/ASR/seq2seq/hparams/train_es.yaml
@@ -30,7 +30,8 @@ skip_prep: False # Skip data preparation
 # longer sentences certainly correspond to "open microphones".
 avoid_if_longer_than: 10.0

-# Training parameters
+####################### Training Parameters ####################################
+
 number_of_epochs: 25
 number_of_ctc_epochs: 20
 lr: 1.0
@@ -61,7 +62,7 @@ sample_rate: 16000
 n_fft: 400
 n_mels: 80

-# Model parameters
+####################### Model Parameters #######################################
 activation: !name:torch.nn.LeakyReLU
 dropout: 0.15
 cnn_blocks: 3
@@ -103,50 +104,27 @@ temperature: 1.50
 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
     limit: !ref <number_of_epochs>

-# Time Drop
-time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram
-time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram
-time_drop_count_low: 5 # Min number of chunks to drop in time in the spectrogram
-time_drop_count_high: 5 # Max number of chunks to drop in time in the spectrogram
-time_drop_replace: "zeros" # Method of dropping chunks
+############################## Augmentations ###################################

+# Time Drop
 time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop
-    drop_length_low: !ref <time_drop_length_low>
-    drop_length_high: !ref <time_drop_length_high>
-    drop_count_low: !ref <time_drop_count_low>
-    drop_count_high: !ref <time_drop_count_high>
-    replace: !ref <time_drop_replace>
-    dim: 1
+    drop_length_low: 15
+    drop_length_high: 25
+    drop_count_low: 5
+    drop_count_high: 5

 # Frequency Drop
-freq_drop_length_low: 25 # Min length for chunks to drop in frequency in the spectrogram
-freq_drop_length_high: 35 # Max length for chunks to drop in frequency in the spectrogram
-freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram
-freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram
-freq_drop_replace: "zeros" # Method of dropping chunks
-
 freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop
-    drop_length_low: !ref <freq_drop_length_low>
-    drop_length_high: !ref <freq_drop_length_high>
-    drop_count_low: !ref <freq_drop_count_low>
-    drop_count_high: !ref <freq_drop_count_high>
-    replace: !ref <freq_drop_replace>
+    drop_length_low: 25
+    drop_length_high: 35
+    drop_count_low: 2
+    drop_count_high: 2
     dim: 2

 # Time warp
-time_warp_window: 5 # Length of time warping window
-time_warp_mode: "bicubic" # Time warping method
-
 time_warp: !new:speechbrain.augment.freq_domain.Warping
-    warp_window: !ref <time_warp_window>
-    warp_mode: !ref <time_warp_mode>
-    dim: 1

 fea_augment: !new:speechbrain.augment.augmenter.Augmenter
-    parallel_augment: False
-    concat_original: False
-    repeat_augment: 1
-    shuffle_augmentations: False
     min_augmentations: 3
     max_augmentations: 3
     augment_prob: 1.0
@@ -163,6 +141,8 @@ compute_features: !new:speechbrain.lobes.features.Fbank
     n_fft: !ref <n_fft>
     n_mels: !ref <n_mels>

+############################## Models ##########################################
+
 enc: !new:speechbrain.lobes.models.CRDNN.CRDNN
     input_shape: [null, null, !ref <n_mels>]
     activation: !ref <activation>
diff --git a/recipes/CommonVoice/ASR/seq2seq/hparams/train_fr.yaml b/recipes/CommonVoice/ASR/seq2seq/hparams/train_fr.yaml
index 1c2a85ec8..cc9b0aa99 100644
--- a/recipes/CommonVoice/ASR/seq2seq/hparams/train_fr.yaml
+++ b/recipes/CommonVoice/ASR/seq2seq/hparams/train_fr.yaml
@@ -29,7 +29,8 @@ skip_prep: False # Skip data preparation
 # longer sentences certainly correspond to "open microphones".
 avoid_if_longer_than: 10.0

-# Training parameters
+####################### Training Parameters ####################################
+
 number_of_epochs: 25
 number_of_ctc_epochs: 20
 lr: 1.0
@@ -60,7 +61,7 @@ sample_rate: 16000
 n_fft: 400
 n_mels: 80

-# Model parameters
+####################### Model Parameters #######################################
 activation: !name:torch.nn.LeakyReLU
 dropout: 0.15
 cnn_blocks: 3
@@ -102,50 +103,27 @@ temperature: 1.50
 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
     limit: !ref <number_of_epochs>

-# Time Drop
-time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram
-time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram
-time_drop_count_low: 5 # Min number of chunks to drop in time in the spectrogram
-time_drop_count_high: 5 # Max number of chunks to drop in time in the spectrogram
-time_drop_replace: "zeros" # Method of dropping chunks
+############################## Augmentations ###################################

+# Time Drop
 time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop
-    drop_length_low: !ref <time_drop_length_low>
-    drop_length_high: !ref <time_drop_length_high>
-    drop_count_low: !ref <time_drop_count_low>
-    drop_count_high: !ref <time_drop_count_high>
-    replace: !ref <time_drop_replace>
-    dim: 1
+    drop_length_low: 15
+    drop_length_high: 25
+    drop_count_low: 5
+    drop_count_high: 5

 # Frequency Drop
-freq_drop_length_low: 25 # Min length for chunks to drop in frequency in the spectrogram
-freq_drop_length_high: 35 # Max length for chunks to drop in frequency in the spectrogram
-freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram
-freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram
-freq_drop_replace: "zeros" # Method of dropping chunks
-
 freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop
-    drop_length_low: !ref <freq_drop_length_low>
-    drop_length_high: !ref <freq_drop_length_high>
-    drop_count_low: !ref <freq_drop_count_low>
-    drop_count_high: !ref <freq_drop_count_high>
-    replace: !ref <freq_drop_replace>
+    drop_length_low: 25
+    drop_length_high: 35
+    drop_count_low: 2
+    drop_count_high: 2
     dim: 2

 # Time warp
-time_warp_window: 5 # Length of time warping window
-time_warp_mode: "bicubic" # Time warping method
-
 time_warp: !new:speechbrain.augment.freq_domain.Warping
-    warp_window: !ref <time_warp_window>
-    warp_mode: !ref <time_warp_mode>
-    dim: 1

 fea_augment: !new:speechbrain.augment.augmenter.Augmenter
-    parallel_augment: False
-    concat_original: False
-    repeat_augment: 1
-    shuffle_augmentations: False
     min_augmentations: 3
     max_augmentations: 3
     augment_prob: 1.0
@@ -162,6 +140,8 @@ compute_features: !new:speechbrain.lobes.features.Fbank
     n_fft: !ref <n_fft>
     n_mels: !ref <n_mels>

+############################## Models ##########################################
+
 enc: !new:speechbrain.lobes.models.CRDNN.CRDNN
     input_shape: [null, null, !ref <n_mels>]
     activation: !ref <activation>
diff --git a/recipes/CommonVoice/ASR/seq2seq/hparams/train_it.yaml b/recipes/CommonVoice/ASR/seq2seq/hparams/train_it.yaml
index bf9211caf..2c0355ae5 100644
--- a/recipes/CommonVoice/ASR/seq2seq/hparams/train_it.yaml
+++ b/recipes/CommonVoice/ASR/seq2seq/hparams/train_it.yaml
@@ -29,7 +29,8 @@ skip_prep: False # Skip data preparation
 # longer sentences certainly correspond to "open microphones".
 avoid_if_longer_than: 8.0

-# Training parameters
+####################### Training Parameters ####################################
+
 number_of_epochs: 50
 number_of_ctc_epochs: 40
 lr: 1.0
@@ -59,7 +60,7 @@ sample_rate: 16000
 n_fft: 400
 n_mels: 80

-# Model parameters
+####################### Model Parameters #######################################
 activation: !name:torch.nn.LeakyReLU
 dropout: 0.15
 cnn_blocks: 3
@@ -101,50 +102,27 @@ temperature: 1.50
 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
     limit: !ref <number_of_epochs>

-# Time Drop
-time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram
-time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram
-time_drop_count_low: 5 # Min number of chunks to drop in time in the spectrogram
-time_drop_count_high: 5 # Max number of chunks to drop in time in the spectrogram
-time_drop_replace: "zeros" # Method of dropping chunks
+############################## Augmentations ###################################

+# Time Drop
 time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop
-    drop_length_low: !ref <time_drop_length_low>
-    drop_length_high: !ref <time_drop_length_high>
-    drop_count_low: !ref <time_drop_count_low>
-    drop_count_high: !ref <time_drop_count_high>
-    replace: !ref <time_drop_replace>
-    dim: 1
+    drop_length_low: 15
+    drop_length_high: 25
+    drop_count_low: 5
+    drop_count_high: 5

 # Frequency Drop
-freq_drop_length_low: 25 # Min length for chunks to drop in frequency in the spectrogram
-freq_drop_length_high: 35 # Max length for chunks to drop in frequency in the spectrogram
-freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram
-freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram
-freq_drop_replace: "zeros" # Method of dropping chunks
-
 freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop
-    drop_length_low: !ref <freq_drop_length_low>
-    drop_length_high: !ref <freq_drop_length_high>
-    drop_count_low: !ref <freq_drop_count_low>
-    drop_count_high: !ref <freq_drop_count_high>
-    replace: !ref <freq_drop_replace>
+    drop_length_low: 25
+    drop_length_high: 35
+    drop_count_low: 2
+    drop_count_high: 2
     dim: 2

 # Time warp
-time_warp_window: 5 # Length of time warping window
-time_warp_mode: "bicubic" # Time warping method
-
 time_warp: !new:speechbrain.augment.freq_domain.Warping
-    warp_window: !ref <time_warp_window>
-    warp_mode: !ref <time_warp_mode>
-    dim: 1

 fea_augment: !new:speechbrain.augment.augmenter.Augmenter
-    parallel_augment: False
-    concat_original: False
-    repeat_augment: 1
-    shuffle_augmentations: False
     min_augmentations: 3
     max_augmentations: 3
     augment_prob: 1.0
@@ -161,6 +139,8 @@ compute_features: !new:speechbrain.lobes.features.Fbank
     n_fft: !ref <n_fft>
     n_mels: !ref <n_mels>

+############################## Models ##########################################
+
 enc: !new:speechbrain.lobes.models.CRDNN.CRDNN
     input_shape: [null, null, !ref <n_mels>]
     activation: !ref <activation>
diff --git a/recipes/CommonVoice/ASR/seq2seq/hparams/train_rw.yaml b/recipes/CommonVoice/ASR/seq2seq/hparams/train_rw.yaml
index dd665ab24..8bc89c1c4 100644
--- a/recipes/CommonVoice/ASR/seq2seq/hparams/train_rw.yaml
+++ b/recipes/CommonVoice/ASR/seq2seq/hparams/train_rw.yaml
@@ -29,7 +29,8 @@ skip_prep: False # Skip data preparation
 # longer sentences certainly correspond to "open microphones".
 avoid_if_longer_than: 8.0

-# Training parameters
+####################### Training Parameters ####################################
+
 number_of_epochs: 25
 number_of_ctc_epochs: 20
 lr: 1.0
@@ -59,7 +60,7 @@ sample_rate: 16000
 n_fft: 400
 n_mels: 80

-# Model parameters
+####################### Model Parameters #######################################
 activation: !name:torch.nn.LeakyReLU
 dropout: 0.15
 cnn_blocks: 3
@@ -102,50 +103,27 @@ epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
     limit: !ref <number_of_epochs>

-# Time Drop
-time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram
-time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram
-time_drop_count_low: 5 # Min number of chunks to drop in time in the spectrogram
-time_drop_count_high: 5 # Max number of chunks to drop in time in the spectrogram
-time_drop_replace: "zeros" # Method of dropping chunks
+############################## Augmentations ###################################

+# Time Drop
 time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop
-    drop_length_low: !ref <time_drop_length_low>
-    drop_length_high: !ref <time_drop_length_high>
-    drop_count_low: !ref <time_drop_count_low>
-    drop_count_high: !ref <time_drop_count_high>
-    replace: !ref <time_drop_replace>
-    dim: 1
+    drop_length_low: 15
+    drop_length_high: 25
+    drop_count_low: 5
+    drop_count_high: 5

 # Frequency Drop
-freq_drop_length_low: 25 # Min length for chunks to drop in frequency in the spectrogram
-freq_drop_length_high: 35 # Max length for chunks to drop in frequency in the spectrogram
-freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram
-freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram
-freq_drop_replace: "zeros" # Method of dropping chunks
-
 freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop
-    drop_length_low: !ref <freq_drop_length_low>
-    drop_length_high: !ref <freq_drop_length_high>
-    drop_count_low: !ref <freq_drop_count_low>
-    drop_count_high: !ref <freq_drop_count_high>
-    replace: !ref <freq_drop_replace>
+    drop_length_low: 25
+    drop_length_high: 35
+    drop_count_low: 2
+    drop_count_high: 2
     dim: 2

 # Time warp
-time_warp_window: 5 # Length of time warping window
-time_warp_mode: "bicubic" # Time warping method
-
 time_warp: !new:speechbrain.augment.freq_domain.Warping
-    warp_window: !ref <time_warp_window>
-    warp_mode: !ref <time_warp_mode>
-    dim: 1
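The seq2seq recipes shorten the feature-domain pipeline the same way: SpectrogramDrop with the (assumed) default dim=1 masks time frames, a second instance with dim: 2 masks frequency bands, and Warping now relies entirely on the defaults that were previously spelled out (warp_window=5, warp_mode="bicubic", dim=1). A hedged Python sketch of the equivalent fea_augment, applied to dummy filterbank features; shapes are illustrative:

import torch
from speechbrain.augment.freq_domain import SpectrogramDrop, Warping
from speechbrain.augment.augmenter import Augmenter

# dim=1 (assumed default) drops chunks of time frames;
# dim=2 drops bands along the mel-frequency axis.
time_drop = SpectrogramDrop(
    drop_length_low=15, drop_length_high=25, drop_count_low=5, drop_count_high=5
)
freq_drop = SpectrogramDrop(
    drop_length_low=25, drop_length_high=35, drop_count_low=2, drop_count_high=2, dim=2
)
time_warp = Warping()  # warp_window=5, warp_mode="bicubic", dim=1 assumed as defaults

fea_augment = Augmenter(
    min_augmentations=3,
    max_augmentations=3,
    augment_prob=1.0,
    augmentations=[time_drop, freq_drop, time_warp],
)

feats = torch.randn(4, 200, 80)  # (batch, frames, n_mels) dummy fbanks
lens = torch.ones(4)
feats_aug, lens_aug = fea_augment(feats, lens)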

 fea_augment: !new:speechbrain.augment.augmenter.Augmenter
-    parallel_augment: False
-    concat_original: False
-    repeat_augment: 1
-    shuffle_augmentations: False
     min_augmentations: 3
     max_augmentations: 3
     augment_prob: 1.0
@@ -162,6 +140,8 @@ compute_features: !new:speechbrain.lobes.features.Fbank
     n_fft: !ref <n_fft>
     n_mels: !ref <n_mels>

+############################## Models ##########################################
+
 enc: !new:speechbrain.lobes.models.CRDNN.CRDNN
     input_shape: [null, null, !ref <n_mels>]
     activation: !ref <activation>
diff --git a/recipes/CommonVoice/ASR/transducer/hparams/train_de.yaml b/recipes/CommonVoice/ASR/transducer/hparams/train_de.yaml
index 8faaa805f..9bbab1669 100644
--- a/recipes/CommonVoice/ASR/transducer/hparams/train_de.yaml
+++ b/recipes/CommonVoice/ASR/transducer/hparams/train_de.yaml
@@ -32,7 +32,7 @@ skip_prep: False # Skip data preparation
 # longer sentences certainly correspond to "open microphones".
 avoid_if_longer_than: 10.0

-# Training parameters
+####################### Training Parameters ####################################
 number_of_epochs: 30
 batch_size: 6
 batch_size_valid: 1
@@ -71,7 +71,7 @@ valid_dataloader_opts:
 test_dataloader_opts:
     batch_size: !ref <batch_size_valid>

-# Model parameters
+####################### Model Parameters #######################################
 activation: !name:torch.nn.LeakyReLU
 dropout: 0.15
 cnn_blocks: 3
@@ -109,50 +109,28 @@ compute_features: !new:speechbrain.lobes.features.Fbank
     n_fft: !ref <n_fft>
     n_mels: !ref <n_mels>

-# Time Drop
-time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram
-time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram
-time_drop_count_low: 5 # Min number of chunks to drop in time in the spectrogram
-time_drop_count_high: 5 # Max number of chunks to drop in time in the spectrogram
-time_drop_replace: "zeros" # Method of dropping chunks
+############################## Augmentations ###################################

+# Time Drop
 time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop
-    drop_length_low: !ref <time_drop_length_low>
-    drop_length_high: !ref <time_drop_length_high>
-    drop_count_low: !ref <time_drop_count_low>
-    drop_count_high: !ref <time_drop_count_high>
-    replace: !ref <time_drop_replace>
-    dim: 1
+    drop_length_low: 15
+    drop_length_high: 25
+    drop_count_low: 5
+    drop_count_high: 5

 # Frequency Drop
-freq_drop_length_low: 25 # Min length for chunks to drop in frequency in the spectrogram
-freq_drop_length_high: 35 # Max length for chunks to drop in frequency in the spectrogram
-freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram
-freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram
-freq_drop_replace: "zeros" # Method of dropping chunks
-
 freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop
-    drop_length_low: !ref <freq_drop_length_low>
-    drop_length_high: !ref <freq_drop_length_high>
-    drop_count_low: !ref <freq_drop_count_low>
-    drop_count_high: !ref <freq_drop_count_high>
-    replace: !ref <freq_drop_replace>
+    drop_length_low: 25
+    drop_length_high: 35
+    drop_count_low: 3
+    drop_count_high: 3
     dim: 2

 # Time warp
-time_warp_window: 5 # Length of time warping window
-time_warp_mode: "bicubic" # Time warping method
 time_warp: !new:speechbrain.augment.freq_domain.Warping
-    warp_window: !ref <time_warp_window>
-    warp_mode: !ref <time_warp_mode>
-    dim: 1

 fea_augment: !new:speechbrain.augment.augmenter.Augmenter
-    parallel_augment: False
-    concat_original: False
-    repeat_augment: 1
-    shuffle_augmentations: False
     min_augmentations: 3
     max_augmentations: 3
     augment_prob: 1.0
@@ -161,6 +139,8 @@ fea_augment: !new:speechbrain.augment.augmenter.Augmenter
         !ref <freq_drop>,
         !ref <time_warp>]

+############################## Models ##########################################
+
 enc: !new:speechbrain.lobes.models.CRDNN.CRDNN
     input_shape: [null, null, !ref <n_mels>]
     activation: !ref <activation>
diff --git a/recipes/CommonVoice/ASR/transducer/hparams/train_fr.yaml b/recipes/CommonVoice/ASR/transducer/hparams/train_fr.yaml
index 6c3f0bc7d..c96a09394 100644
--- a/recipes/CommonVoice/ASR/transducer/hparams/train_fr.yaml
+++ b/recipes/CommonVoice/ASR/transducer/hparams/train_fr.yaml
@@ -32,7 +32,7 @@ skip_prep: False # Skip data preparation
 # longer sentences certainly correspond to "open microphones".
 avoid_if_longer_than: 10.0

-# Training parameters
+####################### Training Parameters ####################################
 number_of_epochs: 30
 batch_size: 6
 batch_size_valid: 1
@@ -71,7 +71,7 @@ valid_dataloader_opts:
 test_dataloader_opts:
     batch_size: !ref <batch_size_valid>

-# Model parameters
+####################### Model Parameters #######################################
 activation: !name:torch.nn.LeakyReLU
 dropout: 0.15
 cnn_blocks: 3
@@ -109,50 +109,28 @@ compute_features: !new:speechbrain.lobes.features.Fbank
     n_fft: !ref <n_fft>
     n_mels: !ref <n_mels>

-# Time Drop
-time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram
-time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram
-time_drop_count_low: 5 # Min number of chunks to drop in time in the spectrogram
-time_drop_count_high: 5 # Max number of chunks to drop in time in the spectrogram
-time_drop_replace: "zeros" # Method of dropping chunks
+############################## Augmentations ###################################

+# Time Drop
 time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop
-    drop_length_low: !ref <time_drop_length_low>
-    drop_length_high: !ref <time_drop_length_high>
-    drop_count_low: !ref <time_drop_count_low>
-    drop_count_high: !ref <time_drop_count_high>
-    replace: !ref <time_drop_replace>
-    dim: 1
+    drop_length_low: 15
+    drop_length_high: 25
+    drop_count_low: 5
+    drop_count_high: 5

 # Frequency Drop
-freq_drop_length_low: 25 # Min length for chunks to drop in frequency in the spectrogram
-freq_drop_length_high: 35 # Max length for chunks to drop in frequency in the spectrogram
-freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram
-freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram
-freq_drop_replace: "zeros" # Method of dropping chunks
-
 freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop
-    drop_length_low: !ref <freq_drop_length_low>
-    drop_length_high: !ref <freq_drop_length_high>
-    drop_count_low: !ref <freq_drop_count_low>
-    drop_count_high: !ref <freq_drop_count_high>
-    replace: !ref <freq_drop_replace>
+    drop_length_low: 25
+    drop_length_high: 35
+    drop_count_low: 3
+    drop_count_high: 3
     dim: 2

 # Time warp
-time_warp_window: 5 # Length of time warping window
-time_warp_mode: "bicubic" # Time warping method
 time_warp: !new:speechbrain.augment.freq_domain.Warping
-    warp_window: !ref <time_warp_window>
-    warp_mode: !ref <time_warp_mode>
-    dim: 1

 fea_augment: !new:speechbrain.augment.augmenter.Augmenter
-    parallel_augment: False
-    concat_original: False
-    repeat_augment: 1
-    shuffle_augmentations: False
     min_augmentations: 3
     max_augmentations: 3
     augment_prob: 1.0
@@ -161,6 +139,8 @@ fea_augment: !new:speechbrain.augment.augmenter.Augmenter
         !ref <freq_drop>,
         !ref <time_warp>]

+############################## Models ##########################################
+
 enc: !new:speechbrain.lobes.models.CRDNN.CRDNN
     input_shape: [null, null, !ref <n_mels>]
     activation: !ref <activation>
diff --git a/recipes/CommonVoice/ASR/transducer/hparams/train_it.yaml b/recipes/CommonVoice/ASR/transducer/hparams/train_it.yaml
index a645f9815..cf366205e 100644
--- a/recipes/CommonVoice/ASR/transducer/hparams/train_it.yaml
+++ b/recipes/CommonVoice/ASR/transducer/hparams/train_it.yaml
@@ -32,7 +32,7 @@ skip_prep: False # Skip data preparation
 # longer sentences certainly correspond to "open microphones".
 avoid_if_longer_than: 10.0

-# Training parameters
+####################### Training Parameters ####################################
 number_of_epochs: 30
 batch_size: 6
 batch_size_valid: 1
@@ -71,7 +71,7 @@ valid_dataloader_opts:
 test_dataloader_opts:
     batch_size: !ref <batch_size_valid>

-# Model parameters
+####################### Model Parameters #######################################
 activation: !name:torch.nn.LeakyReLU
 dropout: 0.15
 cnn_blocks: 3
@@ -109,50 +109,28 @@ compute_features: !new:speechbrain.lobes.features.Fbank
     n_fft: !ref <n_fft>
     n_mels: !ref <n_mels>

-# Time Drop
-time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram
-time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram
-time_drop_count_low: 5 # Min number of chunks to drop in time in the spectrogram
-time_drop_count_high: 5 # Max number of chunks to drop in time in the spectrogram
-time_drop_replace: "zeros" # Method of dropping chunks
+############################## Augmentations ###################################

+# Time Drop
 time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop
-    drop_length_low: !ref <time_drop_length_low>
-    drop_length_high: !ref <time_drop_length_high>
-    drop_count_low: !ref <time_drop_count_low>
-    drop_count_high: !ref <time_drop_count_high>
-    replace: !ref <time_drop_replace>
-    dim: 1
+    drop_length_low: 15
+    drop_length_high: 25
+    drop_count_low: 5
+    drop_count_high: 5

 # Frequency Drop
-freq_drop_length_low: 25 # Min length for chunks to drop in frequency in the spectrogram
-freq_drop_length_high: 35 # Max length for chunks to drop in frequency in the spectrogram
-freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram
-freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram
-freq_drop_replace: "zeros" # Method of dropping chunks
-
 freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop
-    drop_length_low: !ref <freq_drop_length_low>
-    drop_length_high: !ref <freq_drop_length_high>
-    drop_count_low: !ref <freq_drop_count_low>
-    drop_count_high: !ref <freq_drop_count_high>
-    replace: !ref <freq_drop_replace>
+    drop_length_low: 25
+    drop_length_high: 35
+    drop_count_low: 3
+    drop_count_high: 3
     dim: 2

 # Time warp
-time_warp_window: 5 # Length of time warping window
-time_warp_mode: "bicubic" # Time warping method
 time_warp: !new:speechbrain.augment.freq_domain.Warping
-    warp_window: !ref <time_warp_window>
-    warp_mode: !ref <time_warp_mode>
-    dim: 1

 fea_augment: !new:speechbrain.augment.augmenter.Augmenter
-    parallel_augment: False
-    concat_original: False
-    repeat_augment: 1
-    shuffle_augmentations: False
     min_augmentations: 3
     max_augmentations: 3
     augment_prob: 1.0
@@ -161,6 +139,8 @@ fea_augment: !new:speechbrain.augment.augmenter.Augmenter
         !ref <freq_drop>,
         !ref <time_warp>]

+############################## Models ##########################################
+
 enc: !new:speechbrain.lobes.models.CRDNN.CRDNN
     input_shape: [null, null, !ref <n_mels>]
     activation: !ref <activation>
diff --git a/recipes/CommonVoice/ASR/transducer/train.py b/recipes/CommonVoice/ASR/transducer/train.py
index 1782408b8..0304aabc8 100644
--- a/recipes/CommonVoice/ASR/transducer/train.py
+++ b/recipes/CommonVoice/ASR/transducer/train.py
@@ -134,26 +134,22 @@ class ASR(sb.Brain):

         if stage == sb.Stage.TRAIN:
             if hasattr(self.hparams, "wav_augment"):
-                tokens = self.hparams.wav_augment.replicate_labels(tokens)
-                token_lens = self.hparams.wav_augment.replicate_labels(
-                    token_lens
-                )
-                tokens_eos = self.hparams.wav_augment.replicate_labels(
-                    tokens_eos
-                )
-                token_eos_lens = self.hparams.wav_augment.replicate_labels(
-                    token_eos_lens
+                (
+                    tokens,
+                    token_lens,
+                    tokens_eos,
+                    token_eos_lens,
+                ) = self.hparams.wav_augment.replicate_multiple_labels(
+                    tokens, token_lens, tokens_eos, token_eos_lens
                 )
             if hasattr(self.hparams, "fea_augment"):
-                tokens = self.hparams.fea_augment.replicate_labels(tokens)
-                token_lens = self.hparams.fea_augment.replicate_labels(
-                    token_lens
-                )
-                tokens_eos = self.hparams.fea_augment.replicate_labels(
-                    tokens_eos
-                )
-                token_eos_lens = self.hparams.fea_augment.replicate_labels(
-                    token_eos_lens
+                (
+                    tokens,
+                    token_lens,
+                    tokens_eos,
+                    token_eos_lens,
+                ) = self.hparams.fea_augment.replicate_multiple_labels(
+                    tokens, token_lens, tokens_eos, token_eos_lens
                 )

         if stage == sb.Stage.TRAIN:
diff --git a/recipes/CommonVoice/ASR/transformer/hparams/train_ar_hf_whisper.yaml b/recipes/CommonVoice/ASR/transformer/hparams/train_ar_hf_whisper.yaml
index 2b358e6d4..d33c50c2b 100644
--- a/recipes/CommonVoice/ASR/transformer/hparams/train_ar_hf_whisper.yaml
+++ b/recipes/CommonVoice/ASR/transformer/hparams/train_ar_hf_whisper.yaml
@@ -37,7 +37,7 @@ avoid_if_longer_than: 10.0

 ckpt_interval_minutes: 30 # save checkpoint every N min

-# Training parameters
+####################### Training Parameters ####################################
 number_of_epochs: 1
 lr_whisper: 0.00003
 sorting: ascending
@@ -63,7 +63,7 @@ min_decode_ratio: 0.0
 max_decode_ratio: 1.0
 test_beam_size: 8

-# Model parameters
+####################### Model Parameters #######################################
 freeze_whisper: False
 freeze_encoder: True

@@ -82,45 +82,30 @@ test_loader_kwargs:
 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
     limit: !ref <number_of_epochs>

-# Speed perturbation
-speed_changes: [95, 100, 105] # List of speed changes for time-stretching
+############################## Augmentations ###################################

+# Speed perturbation
 speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
     orig_freq: !ref <sample_rate>
-    speeds: !ref <speed_changes>
+    speeds: [95, 100, 105]

 # Frequency drop: randomly drops a number of frequency bands to zero.
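The train.py hunk above is the "listed label replication" mentioned in the patch title: four separate replicate_labels calls collapse into a single replicate_multiple_labels call that expands all label tensors at once, keeping them aligned with the enlarged augmented batch. A rough sketch of the calling pattern with dummy tensors; concat_original is enabled here only so the replication is visible, and the two-argument return shape is an assumption:

import torch
from speechbrain.augment.augmenter import Augmenter
from speechbrain.augment.time_domain import DropChunk

augment = Augmenter(
    concat_original=True,  # output batch = original + augmented copies
    min_augmentations=1,
    max_augmentations=1,
    augment_prob=1.0,
    augmentations=[DropChunk()],
)

wavs, lens = torch.randn(2, 16000), torch.ones(2)
wavs_aug, lens_aug = augment(wavs, lens)  # batch grows from 2 to 4

tokens = torch.randint(0, 30, (2, 12))
token_lens = torch.ones(2)

# Old style: one replicate_labels call per tensor.
# tokens = augment.replicate_labels(tokens)
# token_lens = augment.replicate_labels(token_lens)
# New style: one call, assumed to return one replicated tensor per input.
tokens, token_lens = augment.replicate_multiple_labels(tokens, token_lens)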
-drop_freq_low: 0 # Min frequency band dropout probability
-drop_freq_high: 1 # Max frequency band dropout probability
-drop_freq_count_low: 1 # Min number of frequency bands to drop
-drop_freq_count_high: 3 # Max number of frequency bands to drop
-drop_freq_width: 0.05 # Width of frequency bands to drop
-
 drop_freq: !new:speechbrain.augment.time_domain.DropFreq
-    drop_freq_low: !ref <drop_freq_low>
-    drop_freq_high: !ref <drop_freq_high>
-    drop_freq_count_low: !ref <drop_freq_count_low>
-    drop_freq_count_high: !ref <drop_freq_count_high>
-    drop_freq_width: !ref <drop_freq_width>
+    drop_freq_low: 0
+    drop_freq_high: 1
+    drop_freq_count_low: 1
+    drop_freq_count_high: 3
+    drop_freq_width: 0.05

 # Time drop: randomly drops a number of temporal chunks.
-drop_chunk_count_low: 1 # Min number of audio chunks to drop
-drop_chunk_count_high: 5 # Max number of audio chunks to drop
-drop_chunk_length_low: 1000 # Min length of audio chunks to drop
-drop_chunk_length_high: 2000 # Max length of audio chunks to drop
-
 drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
-    drop_length_low: !ref <drop_chunk_length_low>
-    drop_length_high: !ref <drop_chunk_length_high>
-    drop_count_low: !ref <drop_chunk_count_low>
-    drop_count_high: !ref <drop_chunk_count_high>
+    drop_length_low: 1000
+    drop_length_high: 2000
+    drop_count_low: 1
+    drop_count_high: 5

 # Augmenter: Combines previously defined augmentations to perform data augmentation
 wav_augment: !new:speechbrain.augment.augmenter.Augmenter
-    parallel_augment: False
-    concat_original: False
-    repeat_augment: 1
-    shuffle_augmentations: False
     min_augmentations: 3
     max_augmentations: 3
     augment_prob: 1.0
@@ -129,6 +114,7 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter
         !ref <drop_freq>,
         !ref <drop_chunk>]

+############################## Models ##########################################

 whisper: !new:speechbrain.lobes.models.huggingface_transformers.whisper.Whisper
     source: !ref <whisper_hub>
diff --git a/recipes/CommonVoice/ASR/transformer/hparams/train_de.yaml b/recipes/CommonVoice/ASR/transformer/hparams/train_de.yaml
index dbeb56be5..c5533e9bb 100644
--- a/recipes/CommonVoice/ASR/transformer/hparams/train_de.yaml
+++ b/recipes/CommonVoice/ASR/transformer/hparams/train_de.yaml
@@ -33,7 +33,7 @@ avoid_if_longer_than: 10.0

 ckpt_interval_minutes: 15 # save checkpoint every N min

-# Training parameters
+####################### Training Parameters ####################################
 number_of_epochs: 50
 batch_size: 32 # This works with a 32GB GPU ! (bs * nb_gpu * accum) > 128 !
 ctc_weight: 0.3
@@ -70,7 +70,7 @@ test_dataloader_opts:
     batch_size: !ref <batch_size>
     num_workers: 6

-####################### Model parameters ###########################
+####################### Model Parameters ###########################
 # Transformer
 d_model: 768
 nhead: 8
@@ -213,50 +213,27 @@ normalize: !new:speechbrain.processing.features.InputNormalization
     norm_type: global
     update_until_epoch: 3

-# Time Drop
-time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram
-time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram
-time_drop_count_low: 5 # Min number of chunks to drop in time in the spectrogram
-time_drop_count_high: 5 # Max number of chunks to drop in time in the spectrogram
-time_drop_replace: "zeros" # Method of dropping chunks
+############################## Augmentations ###################################

+# Time Drop
 time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop
-    drop_length_low: !ref <time_drop_length_low>
-    drop_length_high: !ref <time_drop_length_high>
-    drop_count_low: !ref <time_drop_count_low>
-    drop_count_high: !ref <time_drop_count_high>
-    replace: !ref <time_drop_replace>
-    dim: 1
+    drop_length_low: 15
+    drop_length_high: 25
+    drop_count_low: 5
+    drop_count_high: 5

 # Frequency Drop
-freq_drop_length_low: 25 # Min length for chunks to drop in frequency in the spectrogram
-freq_drop_length_high: 35 # Max length for chunks to drop in frequency in the spectrogram
-freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram
-freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram
-freq_drop_replace: "zeros" # Method of dropping chunks
-
 freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop
-    drop_length_low: !ref <freq_drop_length_low>
-    drop_length_high: !ref <freq_drop_length_high>
-    drop_count_low: !ref <freq_drop_count_low>
-    drop_count_high: !ref <freq_drop_count_high>
-    replace: !ref <freq_drop_replace>
+    drop_length_low: 25
+    drop_length_high: 35
+    drop_count_low: 2
+    drop_count_high: 2
     dim: 2

 # Time warp
-time_warp_window: 5 # Length of time warping window
-time_warp_mode: "bicubic" # Time warping method
-
 time_warp: !new:speechbrain.augment.freq_domain.Warping
-    warp_window: !ref <time_warp_window>
-    warp_mode: !ref <time_warp_mode>
-    dim: 1

 fea_augment: !new:speechbrain.augment.augmenter.Augmenter
-    parallel_augment: False
-    concat_original: False
-    repeat_augment: 1
-    shuffle_augmentations: False
     min_augmentations: 3
     max_augmentations: 3
     augment_prob: 1.0
diff --git a/recipes/CommonVoice/ASR/transformer/hparams/train_fa_hf_whisper.yaml b/recipes/CommonVoice/ASR/transformer/hparams/train_fa_hf_whisper.yaml
index bc0181044..bb23c98a6 100644
--- a/recipes/CommonVoice/ASR/transformer/hparams/train_fa_hf_whisper.yaml
+++ b/recipes/CommonVoice/ASR/transformer/hparams/train_fa_hf_whisper.yaml
@@ -37,7 +37,7 @@ avoid_if_longer_than: 10.0

 ckpt_interval_minutes: 30 # save checkpoint every N min

-# Training parameters
+####################### Training Parameters ####################################
 number_of_epochs: 1
 lr_whisper: 0.00003
 sorting: ascending
@@ -63,7 +63,7 @@ min_decode_ratio: 0.0
 max_decode_ratio: 1.0
 test_beam_size: 8

-# Model parameters
+####################### Model Parameters #######################################
 freeze_whisper: False
 freeze_encoder: True

@@ -82,45 +82,30 @@ test_loader_kwargs:
 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
     limit: !ref <number_of_epochs>

-# Speed perturbation
-speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -129,6 +114,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + whisper: !new:speechbrain.lobes.models.huggingface_transformers.whisper.Whisper source: !ref <whisper_hub> diff --git a/recipes/CommonVoice/ASR/transformer/hparams/train_fr.yaml b/recipes/CommonVoice/ASR/transformer/hparams/train_fr.yaml index 120305e73..e62d9c390 100644 --- a/recipes/CommonVoice/ASR/transformer/hparams/train_fr.yaml +++ b/recipes/CommonVoice/ASR/transformer/hparams/train_fr.yaml @@ -33,7 +33,7 @@ avoid_if_longer_than: 10.0 ckpt_interval_minutes: 15 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 50 batch_size: 32 # This works with a 32GB GPU ! (bs * nb_gpu * accum) > 128 ! 
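The feature-domain pipeline (fea_augment, refactored above for train_de.yaml and below for the other transformer recipes) is shortened the same way: SpectrogramDrop and Warping keep only non-default arguments, on the assumption that the removed explicit values (replace: zeros, dim: 1 for the time drop, warp_window: 5, warp_mode: bicubic) are the library defaults, as the shortened YAML implies. A hedged sketch of the equivalent Python:

    import torch
    from speechbrain.augment.freq_domain import SpectrogramDrop, Warping
    from speechbrain.augment.augmenter import Augmenter

    # Drop 5 chunks of 15-25 frames along time (dim=1 of [batch, time, freq]).
    time_drop = SpectrogramDrop(
        drop_length_low=15, drop_length_high=25,
        drop_count_low=5, drop_count_high=5, dim=1,
    )
    # Drop 2 bands of 25-35 bins along frequency (dim=2).
    freq_drop = SpectrogramDrop(
        drop_length_low=25, drop_length_high=35,
        drop_count_low=2, drop_count_high=2, dim=2,
    )
    # Time warping; window 5 and "bicubic" were the removed explicit values.
    time_warp = Warping()

    fea_augment = Augmenter(
        min_augmentations=3, max_augmentations=3, augment_prob=1.0,
        augmentations=[time_drop, freq_drop, time_warp],
    )

    feats = torch.randn(8, 200, 80)  # [batch, frames, n_mels], fake features
    lens = torch.ones(8)
    aug_feats, aug_lens = fea_augment(feats, lens)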
ctc_weight: 0.3 @@ -70,7 +70,7 @@ test_dataloader_opts: batch_size: !ref <batch_size> num_workers: 6 -####################### Model parameters ########################### +####################### Model Parameters ########################### # Transformer d_model: 768 nhead: 8 @@ -213,50 +213,27 @@ normalize: !new:speechbrain.processing.features.InputNormalization norm_type: global update_until_epoch: 3 -# Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 5 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 5 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "zeros" # Method of dropping chunks +############################## Augmentations ################################### +# Time Drop time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 5 + drop_count_high: 5 # Frequency Drop -freq_drop_length_low: 25 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 35 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "zeros" # Method of dropping chunks - freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 25 + drop_length_high: 35 + drop_count_low: 2 + drop_count_high: 2 dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 diff --git a/recipes/CommonVoice/ASR/transformer/hparams/train_fr_hf_whisper.yaml b/recipes/CommonVoice/ASR/transformer/hparams/train_fr_hf_whisper.yaml index da5cbd28f..62363bdad 100644 --- a/recipes/CommonVoice/ASR/transformer/hparams/train_fr_hf_whisper.yaml +++ b/recipes/CommonVoice/ASR/transformer/hparams/train_fr_hf_whisper.yaml @@ -37,7 +37,7 @@ avoid_if_longer_than: 10.0 ckpt_interval_minutes: 30 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 1 lr_whisper: 0.00003 sorting: ascending @@ -63,7 +63,7 @@ min_decode_ratio: 0.0 max_decode_ratio: 1.0 test_beam_size: 8 -# Model parameters +####################### Model Parameters ####################################### freeze_whisper: False freeze_encoder: True @@ -82,45 +82,30 @@ test_loader_kwargs: epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation 
-speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -129,6 +114,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + whisper: !new:speechbrain.lobes.models.huggingface_transformers.whisper.Whisper source: !ref <whisper_hub> diff --git a/recipes/CommonVoice/ASR/transformer/hparams/train_hi_hf_whisper.yaml b/recipes/CommonVoice/ASR/transformer/hparams/train_hi_hf_whisper.yaml index 8b130a83d..e21852639 100644 --- a/recipes/CommonVoice/ASR/transformer/hparams/train_hi_hf_whisper.yaml +++ b/recipes/CommonVoice/ASR/transformer/hparams/train_hi_hf_whisper.yaml @@ -37,7 +37,7 @@ avoid_if_longer_than: 10.0 ckpt_interval_minutes: 30 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 1 lr_whisper: 0.00003 sorting: ascending @@ -63,7 +63,7 @@ min_decode_ratio: 0.0 max_decode_ratio: 1.0 test_beam_size: 8 -# Model parameters +####################### Model Parameters ####################################### freeze_whisper: False freeze_encoder: True @@ -82,45 +82,30 @@ test_loader_kwargs: epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: 
!new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -129,6 +114,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + whisper: !new:speechbrain.lobes.models.huggingface_transformers.whisper.Whisper source: !ref <whisper_hub> diff --git a/recipes/CommonVoice/ASR/transformer/hparams/train_it.yaml b/recipes/CommonVoice/ASR/transformer/hparams/train_it.yaml index d937359b5..d95fbaffa 100644 --- a/recipes/CommonVoice/ASR/transformer/hparams/train_it.yaml +++ b/recipes/CommonVoice/ASR/transformer/hparams/train_it.yaml @@ -33,7 +33,7 @@ avoid_if_longer_than: 10.0 ckpt_interval_minutes: 15 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 50 batch_size: 32 # This works with a 32GB GPU ! (bs * nb_gpu * accum) > 128 ! 
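In the *_hf_whisper.yaml files, the new Models header precedes the Whisper lobe instantiation. A sketch of the same construction in Python; the hub id and cache path are hypothetical, and the freeze/freeze_encoder keywords are assumed to mirror the recipes' freeze_whisper and freeze_encoder parameters:

    from speechbrain.lobes.models.huggingface_transformers.whisper import Whisper

    # Hypothetical hub id and cache path; the recipes supply these via !ref.
    whisper = Whisper(
        source="openai/whisper-large-v2",
        save_path="save/whisper_checkpoint",
        freeze=False,         # fine-tune the model (freeze_whisper: False)
        freeze_encoder=True,  # but keep the encoder frozen (freeze_encoder: True)
    )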
ctc_weight: 0.3 @@ -70,7 +70,7 @@ test_dataloader_opts: batch_size: !ref <batch_size> num_workers: 6 -####################### Model parameters ########################### +####################### Model Parameters ########################### # Transformer d_model: 768 nhead: 8 @@ -213,50 +213,27 @@ normalize: !new:speechbrain.processing.features.InputNormalization norm_type: global update_until_epoch: 3 -# Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 5 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 5 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "zeros" # Method of dropping chunks +############################## Augmentations ################################### +# Time Drop time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 5 + drop_count_high: 5 # Frequency Drop -freq_drop_length_low: 25 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 35 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "zeros" # Method of dropping chunks - freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 25 + drop_length_high: 35 + drop_count_low: 2 + drop_count_high: 2 dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 diff --git a/recipes/CommonVoice/ASR/transformer/hparams/train_it_hf_whisper.yaml b/recipes/CommonVoice/ASR/transformer/hparams/train_it_hf_whisper.yaml index 5670f4fe8..e1fc08263 100644 --- a/recipes/CommonVoice/ASR/transformer/hparams/train_it_hf_whisper.yaml +++ b/recipes/CommonVoice/ASR/transformer/hparams/train_it_hf_whisper.yaml @@ -37,7 +37,7 @@ avoid_if_longer_than: 10.0 ckpt_interval_minutes: 30 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 1 lr_whisper: 0.00003 sorting: ascending @@ -63,7 +63,7 @@ min_decode_ratio: 0.0 max_decode_ratio: 1.0 test_beam_size: 8 -# Model parameters +####################### Model Parameters ####################################### freeze_whisper: False freeze_encoder: True @@ -82,45 +82,30 @@ test_loader_kwargs: epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation 
-speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -129,6 +114,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + whisper: !new:speechbrain.lobes.models.huggingface_transformers.whisper.Whisper source: !ref <whisper_hub> diff --git a/recipes/CommonVoice/ASR/transformer/hparams/train_mn_hf_whisper.yaml b/recipes/CommonVoice/ASR/transformer/hparams/train_mn_hf_whisper.yaml index 9ffdc95fd..fe4fd6f17 100644 --- a/recipes/CommonVoice/ASR/transformer/hparams/train_mn_hf_whisper.yaml +++ b/recipes/CommonVoice/ASR/transformer/hparams/train_mn_hf_whisper.yaml @@ -37,7 +37,7 @@ avoid_if_longer_than: 10.0 ckpt_interval_minutes: 30 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 1 lr_whisper: 0.00003 sorting: ascending @@ -63,7 +63,7 @@ min_decode_ratio: 0.0 max_decode_ratio: 1.0 test_beam_size: 8 -# Model parameters +####################### Model Parameters ####################################### freeze_whisper: False freeze_encoder: True @@ -82,45 +82,30 @@ test_loader_kwargs: epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: 
!new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -129,6 +114,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + whisper: !new:speechbrain.lobes.models.huggingface_transformers.whisper.Whisper source: !ref <whisper_hub> diff --git a/recipes/CommonVoice/ASR/transformer/hparams/train_sr_hf_whisper.yaml b/recipes/CommonVoice/ASR/transformer/hparams/train_sr_hf_whisper.yaml index 4f257094d..d7390d9a5 100644 --- a/recipes/CommonVoice/ASR/transformer/hparams/train_sr_hf_whisper.yaml +++ b/recipes/CommonVoice/ASR/transformer/hparams/train_sr_hf_whisper.yaml @@ -37,7 +37,7 @@ avoid_if_longer_than: 10.0 ckpt_interval_minutes: 30 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 1 lr_whisper: 0.00003 sorting: ascending @@ -63,7 +63,7 @@ min_decode_ratio: 0.0 max_decode_ratio: 1.0 test_beam_size: 8 -# Model parameters +####################### Model Parameters ####################################### freeze_whisper: False freeze_encoder: True @@ -83,45 +83,30 @@ test_loader_kwargs: epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to 
zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -130,6 +115,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + whisper: !new:speechbrain.lobes.models.huggingface_transformers.whisper.Whisper source: !ref <whisper_hub> diff --git a/recipes/CommonVoice/ASR/transformer/train.py b/recipes/CommonVoice/ASR/transformer/train.py index 0aee6735e..89847d352 100644 --- a/recipes/CommonVoice/ASR/transformer/train.py +++ b/recipes/CommonVoice/ASR/transformer/train.py @@ -107,27 +107,25 @@ class ASR(sb.core.Brain): # Augment Labels if stage == sb.Stage.TRAIN: + # Labels must be extended if parallel augmentation or concatenated + # augmentation was performed on the input (increasing the batch dimension) if hasattr(self.hparams, "wav_augment"): - tokens = self.hparams.wav_augment.replicate_labels(tokens) - tokens_lens = self.hparams.wav_augment.replicate_labels( - tokens_lens - ) - tokens_eos = self.hparams.wav_augment.replicate_labels( - tokens_eos - ) - tokens_eos_lens = self.hparams.wav_augment.replicate_labels( - tokens_eos_lens + ( + tokens, + tokens_lens, + tokens_eos, + tokens_eos_lens, + ) = self.hparams.wav_augment.replicate_multiple_labels( + tokens, tokens_lens, tokens_eos, tokens_eos_lens ) if hasattr(self.hparams, "fea_augment"): - tokens = self.hparams.fea_augment.replicate_labels(tokens) - tokens_lens = self.hparams.fea_augment.replicate_labels( - tokens_lens - ) - tokens_eos = self.hparams.fea_augment.replicate_labels( - tokens_eos - ) - tokens_eos_lens = self.hparams.fea_augment.replicate_labels( - tokens_eos_lens + ( + tokens, + tokens_lens, + tokens_eos, + tokens_eos_lens, + ) = self.hparams.fea_augment.replicate_multiple_labels( + tokens, tokens_lens, tokens_eos, tokens_eos_lens ) loss_seq = self.hparams.seq_cost( diff --git
a/recipes/CommonVoice/self-supervised-learning/wav2vec2/hparams/wav2vec2_base.yaml b/recipes/CommonVoice/self-supervised-learning/wav2vec2/hparams/wav2vec2_base.yaml index 5ead6dbc0..e7ceed4f5 100644 --- a/recipes/CommonVoice/self-supervised-learning/wav2vec2/hparams/wav2vec2_base.yaml +++ b/recipes/CommonVoice/self-supervised-learning/wav2vec2/hparams/wav2vec2_base.yaml @@ -27,11 +27,11 @@ skip_prep: False # We remove utterances longer than 10s in the train/dev/test sets as -# longer sentences certainly correspond to "open microphones". +# longer sentences certainly correspond to open microphones. avoid_if_longer_than: 10.0 avoid_if_shorter_than: 1.0 -# Training parameters +####################### Training Parameters #################################### # Parameters correspond to the ones reported in the official wav2vec2 # paper (for the masking). mask_length: 10 @@ -52,8 +52,8 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # IMPORTANT: To train a w2v2 model, we recommend having an effective batch_size # higher than 100 (batch_size * nb_gpu * grad_accumulation_factor) # Examples are: -# 32 Tesla V100 32GB — 12 * 32 * 1 -# 4 Tesla V100 32GB — 12 * 4 * {6-8} +# 32 Tesla V100 32GB = 12 * 32 * 1 +# 4 Tesla V100 32GB = 12 * 4 * (6-8) batch_size: 12 test_batch_size: 8 grad_accumulation_factor: 8 @@ -104,7 +104,7 @@ modules: wav2vec2: !ref <wav2vec2> opt_class: !name:torch.optim.AdamW - lr: 0 # Will be changed by the scheduler, but we start at 0! + lr: 0 # Will be changed by the scheduler, but we start at 0 betas: (0.9, 0.98) eps: 0.000000001 weight_decay: !ref <weight_decay> diff --git a/recipes/DNS/enhancement/hparams/sepformer-dns-16k.yaml b/recipes/DNS/enhancement/hparams/sepformer-dns-16k.yaml index d43a2f0a9..87a07c97a 100644 --- a/recipes/DNS/enhancement/hparams/sepformer-dns-16k.yaml +++ b/recipes/DNS/enhancement/hparams/sepformer-dns-16k.yaml @@ -39,7 +39,7 @@ sample_rate: 16000 audio_length: 4 # seconds n_audio_to_save: 20 -# Training parameters +####################### Training Parameters #################################### N_epochs: 100 batch_size: 4 batch_size_test: 1 diff --git a/recipes/DVoice/ASR/CTC/hparams/train_amh_with_wav2vec.yaml b/recipes/DVoice/ASR/CTC/hparams/train_amh_with_wav2vec.yaml index 551ba2c19..e9e1f4310 100644 --- a/recipes/DVoice/ASR/CTC/hparams/train_amh_with_wav2vec.yaml +++ b/recipes/DVoice/ASR/CTC/hparams/train_amh_with_wav2vec.yaml @@ -32,7 +32,7 @@ skip_prep: False # Skip data preparation # longer sentences certainly correspond to "open microphones".
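The train.py hunk above collapses four replicate_labels calls into a single replicate_multiple_labels call. Replication is needed whenever augmentation grows the batch, e.g. with concat_original: True as in the DVoice recipes below, which stacks the clean batch on top of the augmented one. A sketch under the assumption that replicate_multiple_labels accepts any number of label tensors and returns them replicated in the same order, as the call site suggests:

    import torch
    from speechbrain.augment.time_domain import DropChunk
    from speechbrain.augment.augmenter import Augmenter

    wav_augment = Augmenter(
        concat_original=True,  # output batch = [original; augmented]
        min_augmentations=1, max_augmentations=1, augment_prob=1.0,
        augmentations=[DropChunk(drop_length_low=1000, drop_length_high=2000,
                                 drop_count_low=1, drop_count_high=5)],
    )

    wavs, lens = torch.randn(2, 16000), torch.ones(2)
    tokens = torch.randint(0, 30, (2, 12))  # fake token ids
    tokens_lens = torch.ones(2)

    aug_wavs, aug_lens = wav_augment(wavs, lens)  # batch should grow 2 -> 4
    tokens, tokens_lens = wav_augment.replicate_multiple_labels(
        tokens, tokens_lens
    )
    assert tokens.shape[0] == aug_wavs.shape[0]   # labels now match the batch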
avoid_if_longer_than: 15.0 -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 30 lr: 1.0 lr_wav2vec: 0.0001 @@ -58,7 +58,7 @@ test_dataloader_options: token_type: char # ["unigram", "bpe", "char"] character_coverage: 1.0 -# Model parameters +####################### Model Parameters ####################################### wav2vec_output_dim: 1024 dnn_neurons: 1024 freeze_wav2vec: False @@ -78,45 +78,31 @@ eos_index: 2 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 @@ -125,6 +111,7 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## enc: !new:speechbrain.nnet.containers.Sequential input_shape: [null, null, !ref <wav2vec_output_dim>] diff --git a/recipes/DVoice/ASR/CTC/hparams/train_dar_with_wav2vec.yaml b/recipes/DVoice/ASR/CTC/hparams/train_dar_with_wav2vec.yaml index 0b3647705..d1e2c6684 100644 --- a/recipes/DVoice/ASR/CTC/hparams/train_dar_with_wav2vec.yaml +++ b/recipes/DVoice/ASR/CTC/hparams/train_dar_with_wav2vec.yaml @@ -32,7 +32,7 @@ skip_prep: False # Skip data preparation # longer sentences certainly correspond to "open microphones". 
avoid_if_longer_than: 15.0 -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 30 lr: 1.0 lr_wav2vec: 0.0001 @@ -58,7 +58,7 @@ test_dataloader_options: token_type: char # ["unigram", "bpe", "char"] character_coverage: 1.0 -# Model parameters +####################### Model Parameters ####################################### wav2vec_output_dim: 1024 dnn_neurons: 1024 freeze_wav2vec: False @@ -78,45 +78,31 @@ eos_index: 2 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 @@ -125,6 +111,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + enc: !new:speechbrain.nnet.containers.Sequential input_shape: [null, null, !ref <wav2vec_output_dim>] diff --git a/recipes/DVoice/ASR/CTC/hparams/train_fon_with_wav2vec.yaml b/recipes/DVoice/ASR/CTC/hparams/train_fon_with_wav2vec.yaml index 946ca0b6f..fca0230de 100644 --- a/recipes/DVoice/ASR/CTC/hparams/train_fon_with_wav2vec.yaml +++ b/recipes/DVoice/ASR/CTC/hparams/train_fon_with_wav2vec.yaml @@ -32,7 +32,7 @@ skip_prep: False # Skip data preparation # longer sentences certainly correspond to "open microphones". 
avoid_if_longer_than: 15.0 -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 30 lr: 1.0 lr_wav2vec: 0.0001 @@ -58,7 +58,7 @@ test_dataloader_options: token_type: char # ["unigram", "bpe", "char"] character_coverage: 1.0 -# Model parameters +####################### Model Parameters ####################################### wav2vec_output_dim: 1024 dnn_neurons: 1024 freeze_wav2vec: False @@ -78,45 +78,31 @@ eos_index: 2 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 @@ -125,6 +111,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + enc: !new:speechbrain.nnet.containers.Sequential input_shape: [null, null, !ref <wav2vec_output_dim>] diff --git a/recipes/DVoice/ASR/CTC/hparams/train_multi_with_wav2vec.yaml b/recipes/DVoice/ASR/CTC/hparams/train_multi_with_wav2vec.yaml index 14aef36c9..89fedade8 100644 --- a/recipes/DVoice/ASR/CTC/hparams/train_multi_with_wav2vec.yaml +++ b/recipes/DVoice/ASR/CTC/hparams/train_multi_with_wav2vec.yaml @@ -31,7 +31,7 @@ skip_prep: False # Skip data preparation # longer sentences certainly correspond to "open microphones". 
avoid_if_longer_than: 15.0 -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 30 lr: 1.0 lr_wav2vec: 0.0001 @@ -57,7 +57,7 @@ test_dataloader_options: token_type: char # ["unigram", "bpe", "char"] character_coverage: 1.0 -# Model parameters +####################### Model Parameters ####################################### wav2vec_output_dim: 1024 dnn_neurons: 1024 freeze_wav2vec: False @@ -77,45 +77,31 @@ eos_index: 2 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 @@ -124,6 +110,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + enc: !new:speechbrain.nnet.containers.Sequential input_shape: [null, null, !ref <wav2vec_output_dim>] diff --git a/recipes/DVoice/ASR/CTC/hparams/train_sw_with_wav2vec.yaml b/recipes/DVoice/ASR/CTC/hparams/train_sw_with_wav2vec.yaml index f00e330a4..0194fd877 100644 --- a/recipes/DVoice/ASR/CTC/hparams/train_sw_with_wav2vec.yaml +++ b/recipes/DVoice/ASR/CTC/hparams/train_sw_with_wav2vec.yaml @@ -32,7 +32,7 @@ skip_prep: False # Skip data preparation # longer sentences certainly correspond to "open microphones". 
avoid_if_longer_than: 15.0 -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 30 lr: 1.0 lr_wav2vec: 0.0001 @@ -58,7 +58,7 @@ test_dataloader_options: token_type: char # ["unigram", "bpe", "char"] character_coverage: 1.0 -# Model parameters +####################### Model Parameters ####################################### wav2vec_output_dim: 1024 dnn_neurons: 1024 freeze_wav2vec: False @@ -78,45 +78,31 @@ eos_index: 2 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 @@ -125,6 +111,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + enc: !new:speechbrain.nnet.containers.Sequential input_shape: [null, null, !ref <wav2vec_output_dim>] diff --git a/recipes/DVoice/ASR/CTC/hparams/train_wol_with_wav2vec.yaml b/recipes/DVoice/ASR/CTC/hparams/train_wol_with_wav2vec.yaml index b1188dcb9..8470ce3a1 100644 --- a/recipes/DVoice/ASR/CTC/hparams/train_wol_with_wav2vec.yaml +++ b/recipes/DVoice/ASR/CTC/hparams/train_wol_with_wav2vec.yaml @@ -32,7 +32,7 @@ skip_prep: False # Skip data preparation # longer sentences certainly correspond to "open microphones". 
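Each DVoice recipe's new Models block declares the DNN head as a shape-inferring Sequential on top of the 1024-dimensional wav2vec 2.0 output (the enc entries above and below). A minimal sketch, assuming Sequential.append accepts a layer class plus keyword arguments, as elsewhere in SpeechBrain:

    import torch
    from speechbrain.nnet.containers import Sequential
    from speechbrain.nnet.linear import Linear

    # input_shape is [batch, time, features]; None plays the role of the
    # YAML's null wildcard, so shapes are inferred at the first forward pass.
    enc = Sequential(input_shape=[None, None, 1024])
    enc.append(Linear, n_neurons=1024, layer_name="linear1")

    feats = torch.randn(2, 50, 1024)  # fake wav2vec 2.0 outputs
    out = enc(feats)                  # -> [2, 50, 1024]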
avoid_if_longer_than: 15.0 -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 30 lr: 1.0 lr_wav2vec: 0.0001 @@ -58,7 +58,7 @@ test_dataloader_options: token_type: char # ["unigram", "bpe", "char"] character_coverage: 1.0 -# Model parameters +####################### Model Parameters ####################################### wav2vec_output_dim: 1024 dnn_neurons: 1024 freeze_wav2vec: False @@ -78,45 +78,31 @@ eos_index: 2 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 @@ -125,6 +111,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + enc: !new:speechbrain.nnet.containers.Sequential input_shape: [null, null, !ref <wav2vec_output_dim>] linear1: !name:speechbrain.nnet.linear.Linear diff --git a/recipes/ESC50/classification/hparams/cnn14_classifier.yaml b/recipes/ESC50/classification/hparams/cnn14_classifier.yaml index e8034bfdd..bc0a83bbd 100644 --- a/recipes/ESC50/classification/hparams/cnn14_classifier.yaml +++ b/recipes/ESC50/classification/hparams/cnn14_classifier.yaml @@ -41,7 +41,7 @@ skip_manifest_creation: False ckpt_interval_minutes: 15 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 200 batch_size: 
32 lr: 0.0002 diff --git a/recipes/ESC50/classification/hparams/conv2d_classifier.yaml b/recipes/ESC50/classification/hparams/conv2d_classifier.yaml index 2b0a49bcd..284d5681f 100644 --- a/recipes/ESC50/classification/hparams/conv2d_classifier.yaml +++ b/recipes/ESC50/classification/hparams/conv2d_classifier.yaml @@ -41,7 +41,7 @@ skip_manifest_creation: False ckpt_interval_minutes: 15 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 200 batch_size: 32 lr: 0.00002 diff --git a/recipes/ESC50/interpret/hparams/l2i_cnn14.yaml b/recipes/ESC50/interpret/hparams/l2i_cnn14.yaml index 6f57a843b..00acd1ff3 100644 --- a/recipes/ESC50/interpret/hparams/l2i_cnn14.yaml +++ b/recipes/ESC50/interpret/hparams/l2i_cnn14.yaml @@ -39,7 +39,7 @@ skip_manifest_creation: False ckpt_interval_minutes: 15 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 200 batch_size: 2 lr: 0.0001 diff --git a/recipes/ESC50/interpret/hparams/l2i_conv2dclassifier.yaml b/recipes/ESC50/interpret/hparams/l2i_conv2dclassifier.yaml index 7292f89af..4f6cb9b90 100644 --- a/recipes/ESC50/interpret/hparams/l2i_conv2dclassifier.yaml +++ b/recipes/ESC50/interpret/hparams/l2i_conv2dclassifier.yaml @@ -39,7 +39,7 @@ skip_manifest_creation: False ckpt_interval_minutes: 15 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 200 batch_size: 16 lr: 0.0002 diff --git a/recipes/ESC50/interpret/hparams/nmf.yaml b/recipes/ESC50/interpret/hparams/nmf.yaml index 7b6c9905d..e4da313ba 100644 --- a/recipes/ESC50/interpret/hparams/nmf.yaml +++ b/recipes/ESC50/interpret/hparams/nmf.yaml @@ -40,7 +40,7 @@ skip_manifest_creation: False ckpt_interval_minutes: 15 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 100 batch_size: 2 lr: 0.0002 diff --git a/recipes/ESC50/interpret/hparams/piq.yaml b/recipes/ESC50/interpret/hparams/piq.yaml index c45f50a20..68f8c06de 100644 --- a/recipes/ESC50/interpret/hparams/piq.yaml +++ b/recipes/ESC50/interpret/hparams/piq.yaml @@ -42,7 +42,7 @@ skip_manifest_creation: False ckpt_interval_minutes: 15 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 200 batch_size: 16 lr: 0.0002 diff --git a/recipes/Fisher-Callhome-Spanish/ST/transformer/hparams/conformer.yaml b/recipes/Fisher-Callhome-Spanish/ST/transformer/hparams/conformer.yaml index ec8653ade..49a7321f7 100644 --- a/recipes/Fisher-Callhome-Spanish/ST/transformer/hparams/conformer.yaml +++ b/recipes/Fisher-Callhome-Spanish/ST/transformer/hparams/conformer.yaml @@ -81,7 +81,7 @@ test_dataloader_opts: batch_size: !ref <batch_size> num_workers: !ref <num_workers> -####################### Model parameters ########################### +####################### Model Parameters ########################### # Transformer d_model: 256 nhead: 4 diff --git a/recipes/Fisher-Callhome-Spanish/ST/transformer/hparams/transformer.yaml b/recipes/Fisher-Callhome-Spanish/ST/transformer/hparams/transformer.yaml index 59c3782e1..4310e2d6b 100644 --- a/recipes/Fisher-Callhome-Spanish/ST/transformer/hparams/transformer.yaml +++ 
b/recipes/Fisher-Callhome-Spanish/ST/transformer/hparams/transformer.yaml @@ -91,7 +91,7 @@ test_dataloader_opts: batch_size: !ref <batch_size> num_workers: !ref <num_workers> -####################### Model parameters ########################### +####################### Model Parameters ########################### # Transformer d_model: 256 nhead: 4 diff --git a/recipes/Google-speech-commands/hparams/xvect.yaml b/recipes/Google-speech-commands/hparams/xvect.yaml index 8eb842ba9..417cecfdf 100644 --- a/recipes/Google-speech-commands/hparams/xvect.yaml +++ b/recipes/Google-speech-commands/hparams/xvect.yaml @@ -40,7 +40,7 @@ percentage_silence: 10 # Set this to 0 for the V2 35 task skip_prep: False ckpt_interval_minutes: 15 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 100 batch_size: 32 lr: 0.001 diff --git a/recipes/Google-speech-commands/hparams/xvect_leaf.yaml b/recipes/Google-speech-commands/hparams/xvect_leaf.yaml index e06101850..f2897af22 100644 --- a/recipes/Google-speech-commands/hparams/xvect_leaf.yaml +++ b/recipes/Google-speech-commands/hparams/xvect_leaf.yaml @@ -42,7 +42,7 @@ percentage_silence: 10 # Set this to 0 for the V2 35 task skip_prep: False ckpt_interval_minutes: 15 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 100 batch_size: 32 lr: 0.001 diff --git a/recipes/IEMOCAP/emotion_recognition/hparams/train_with_wav2vec2.yaml b/recipes/IEMOCAP/emotion_recognition/hparams/train_with_wav2vec2.yaml index ae3452882..d1b63d7bf 100644 --- a/recipes/IEMOCAP/emotion_recognition/hparams/train_with_wav2vec2.yaml +++ b/recipes/IEMOCAP/emotion_recognition/hparams/train_with_wav2vec2.yaml @@ -38,7 +38,7 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger ckpt_interval_minutes: 15 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 30 batch_size: 4 lr: 0.0001 @@ -50,7 +50,7 @@ freeze_wav2vec2: False # We see an improvement of 2% with freezing CNNs freeze_wav2vec2_conv: True -# Model parameters +####################### Model Parameters ####################################### encoder_dim: 768 # Number of emotions diff --git a/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_samu.yaml b/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_samu.yaml index 05bca5c1e..3901391a5 100644 --- a/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_samu.yaml +++ b/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_samu.yaml @@ -30,7 +30,7 @@ wav2vec2_hub: LIA-AvignonUniversity/IWSLT2022-tamasheq-only # wav2vec 2.0 specific parameters wav2vec2_frozen: False -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 100 lr: 0.001 lr_wav2vec: 0.00001 diff --git a/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_samu_mbart_st.yaml b/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_samu_mbart_st.yaml index a3a2f1c99..6887c3a40 100644 --- a/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_samu_mbart_st.yaml +++ b/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_samu_mbart_st.yaml @@ -36,7 +36,7 @@ wav2vec2_folder: !ref <save_folder>/wav2vec2_checkpoint # wav2vec 2.0 specific parameters wav2vec2_frozen: False -# Training parameters +####################### 
Training Parameters #################################### number_of_epochs: 500 lr: 0.001 lr_wav2vec: 0.0001 diff --git a/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_samu_nllb_st.yaml b/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_samu_nllb_st.yaml index 11ebc937a..b86cef685 100644 --- a/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_samu_nllb_st.yaml +++ b/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_samu_nllb_st.yaml @@ -36,7 +36,7 @@ wav2vec2_folder: !ref <save_folder>/wav2vec2_checkpoint # wav2vec 2.0 specific parameters wav2vec2_frozen: False -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 500 lr: 0.001 lr_wav2vec: 0.0001 diff --git a/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_w2v2_mbart_st.yaml b/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_w2v2_mbart_st.yaml index 68f74d9b4..77b7c8cd6 100644 --- a/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_w2v2_mbart_st.yaml +++ b/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_w2v2_mbart_st.yaml @@ -36,7 +36,7 @@ wav2vec2_folder: !ref <save_folder>/wav2vec2_checkpoint # wav2vec 2.0 specific parameters wav2vec2_frozen: False -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 500 lr: 0.001 lr_wav2vec: 0.0001 diff --git a/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_w2v2_nllb_st.yaml b/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_w2v2_nllb_st.yaml index b62e366f1..d384bf3a8 100644 --- a/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_w2v2_nllb_st.yaml +++ b/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_w2v2_nllb_st.yaml @@ -36,7 +36,7 @@ wav2vec2_folder: !ref <save_folder>/wav2vec2_checkpoint # wav2vec 2.0 specific parameters wav2vec2_frozen: False -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 500 lr: 0.001 lr_wav2vec: 0.0001 diff --git a/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_w2v2_st.yaml b/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_w2v2_st.yaml index 6bfb9db12..beafeba86 100644 --- a/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_w2v2_st.yaml +++ b/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_w2v2_st.yaml @@ -36,7 +36,7 @@ wav2vec2_folder: !ref <save_folder>/wav2vec2_checkpoint wav2vec2_frozen: False keep_n_layers: 6 # keep first N layers from the Transformer Encoder stack inside the wav2vec 2.0 model -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 100 lr: 0.001 lr_wav2vec: 0.00001 diff --git a/recipes/KsponSpeech/ASR/transformer/hparams/conformer_medium.yaml b/recipes/KsponSpeech/ASR/transformer/hparams/conformer_medium.yaml index aee256973..3c0d43e2a 100644 --- a/recipes/KsponSpeech/ASR/transformer/hparams/conformer_medium.yaml +++ b/recipes/KsponSpeech/ASR/transformer/hparams/conformer_medium.yaml @@ -34,7 +34,7 @@ test_csv: ckpt_interval_minutes: 30 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### # To make Transformers converge, the global batch size should be large enough. # The global batch size is computed as batch_size * n_gpus * grad_accumulation_factor. # Empirically, we found that this value should be >= 128.
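That ">= 128" rule is easy to check before launching a run. A minimal sketch, with hyperparameter names mirroring the YAML and 128 as the empirical threshold quoted above:

    def global_batch_size(batch_size: int, n_gpus: int,
                          grad_accumulation_factor: int) -> int:
        # Effective number of utterances per optimizer step.
        return batch_size * n_gpus * grad_accumulation_factor

    # e.g. 16 utterances/GPU on 4 GPUs with 2-step accumulation -> 128
    assert global_batch_size(16, 4, 2) >= 128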
@@ -78,7 +78,7 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -####################### Model parameters ########################### +####################### Model Parameters ########################### # Transformer d_model: 256 nhead: 4 diff --git a/recipes/KsponSpeech/LM/hparams/transformer.yaml b/recipes/KsponSpeech/LM/hparams/transformer.yaml index cd9685e28..5b64cc196 100644 --- a/recipes/KsponSpeech/LM/hparams/transformer.yaml +++ b/recipes/KsponSpeech/LM/hparams/transformer.yaml @@ -24,7 +24,7 @@ test_csv: # Tokenizer model tokenizer_file: ddwkim/asr-conformer-transformerlm-ksponspeech/tokenizer.ckpt -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 30 batch_size: 256 lr: 0.1 diff --git a/recipes/KsponSpeech/Tokenizer/hparams/5K_unigram_subword_bpe.yaml b/recipes/KsponSpeech/Tokenizer/hparams/5K_unigram_subword_bpe.yaml index dd7cd4906..04ef0ebfd 100644 --- a/recipes/KsponSpeech/Tokenizer/hparams/5K_unigram_subword_bpe.yaml +++ b/recipes/KsponSpeech/Tokenizer/hparams/5K_unigram_subword_bpe.yaml @@ -16,7 +16,7 @@ skip_prep: False train_csv: !ref <output_folder>/train.csv valid_csv: !ref <output_folder>/dev.csv -# Training parameters +####################### Training Parameters #################################### token_type: unigram # ["unigram", "bpe", "char"] token_output: 5000 # index(blank/eos/bos/unk) = 0 character_coverage: 1.0 diff --git a/recipes/LibriMix/separation/hparams/sepformer-libri2mix.yaml b/recipes/LibriMix/separation/hparams/sepformer-libri2mix.yaml index 8fb195c03..ffa5a1ef2 100644 --- a/recipes/LibriMix/separation/hparams/sepformer-libri2mix.yaml +++ b/recipes/LibriMix/separation/hparams/sepformer-libri2mix.yaml @@ -37,7 +37,7 @@ noprogressbar: False save_audio: False # Save estimated sources on disk sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/LibriMix/separation/hparams/sepformer-libri3mix.yaml b/recipes/LibriMix/separation/hparams/sepformer-libri3mix.yaml index cf68e9a81..abc9c76c7 100644 --- a/recipes/LibriMix/separation/hparams/sepformer-libri3mix.yaml +++ b/recipes/LibriMix/separation/hparams/sepformer-libri3mix.yaml @@ -37,7 +37,7 @@ noprogressbar: False save_audio: False # Save estimated sources on disk sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/LibriParty/VAD/hparams/train.yaml b/recipes/LibriParty/VAD/hparams/train.yaml index e07258c44..be9191685 100644 --- a/recipes/LibriParty/VAD/hparams/train.yaml +++ b/recipes/LibriParty/VAD/hparams/train.yaml @@ -41,7 +41,7 @@ speech_csv: !ref <save_folder>/speech.csv multilang_speech_csv: !ref <save_folder>/multilang_speech.csv skip_prep: False # Skip data preparation -# Training parameters +####################### Training Parameters #################################### N_epochs: 100 lr: 1.0 lr_final: 0.1 @@ -65,7 +65,7 @@ test_dataloader_opts: n_fft: 400 n_mels: 40 -# Model parameters +####################### Model Parameters ####################################### # activation: !name:torch.nn.LeakyReLU # dropout: 0.15 # cnn_blocks: 2 diff --git a/recipes/LibriSpeech/ASR/CTC/hparams/downsampled/train_hf_wavlm_average_downsampling.yaml b/recipes/LibriSpeech/ASR/CTC/hparams/downsampled/train_hf_wavlm_average_downsampling.yaml index b609bb766..fdbd7e86d 
100644 --- a/recipes/LibriSpeech/ASR/CTC/hparams/downsampled/train_hf_wavlm_average_downsampling.yaml +++ b/recipes/LibriSpeech/ASR/CTC/hparams/downsampled/train_hf_wavlm_average_downsampling.yaml @@ -33,7 +33,8 @@ test_csv: - !ref <output_folder>/test-clean.csv - !ref <output_folder>/test-other.csv -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 1 lr: 0.9 lr_wav2vec: 0.0001 @@ -62,7 +63,8 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: !ref <test_batch_size> -# Model parameters +####################### Model Parameters ####################################### + activation: !name:torch.nn.LeakyReLU dnn_layers: 2 dnn_neurons: 1024 @@ -73,76 +75,12 @@ ctc_neurons: 29 output_neurons: 29 # Characters size, index(blank/eos/bos) = 0 blank_index: 0 -# Decoding parameters -test_beam_search: - beam_size: 200 - topk: 1 - blank_index: !ref <blank_index> - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -10.0 - token_prune_min_logp: -5 - prune_history: True - alpha: 0.5 - beta: 1.5 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - # # Functions and classes # epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - -speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb - orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> - -# Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - -drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> - -# Time drop: randomly drops a number of temporal chunks. 
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - -drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> - -# Augmenter: Combines previously defined augmentations to perform data augmentation -wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: True - repeat_augment: 1 - shuffle_augmentations: False - min_augmentations: 4 - max_augmentations: 4 - augment_prob: 1.0 - augmentations: [ - !ref <speed_perturb>, - !ref <drop_freq>, - !ref <drop_chunk>] - - enc: !new:speechbrain.lobes.models.VanillaNN.VanillaNN input_shape: [null, null, 1024] activation: !ref <activation> @@ -211,8 +149,60 @@ lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler annealing_factor: 0.9 patient: 0 +############################## Decoding ######################################## + +test_beam_search: + beam_size: 200 + topk: 1 + blank_index: !ref <blank_index> + space_token: ' ' # make sure this is the same as the one used in the tokenizer + beam_prune_logp: -10.0 + token_prune_min_logp: -5 + prune_history: True + alpha: 0.5 + beta: 1.5 + # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM + # It can either be a .bin or .arpa ; note: .arpa is much slower at loading + # If you don't want to use an LM, comment it out or set it to null + kenlm_model_path: null + label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder +############################## Augmentations ################################### + +# Speed perturbation +speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb + orig_freq: !ref <sample_rate> + speeds: [95, 100, 105] + +# Frequency drop: randomly drops a number of frequency bands to zero. +drop_freq: !new:speechbrain.augment.time_domain.DropFreq + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 + +# Time drop: randomly drops a number of temporal chunks. 
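The drop lengths in the drop_chunk block below are expressed in samples, not seconds. At the 16 kHz sample rate assumed by these LibriSpeech recipes, that works out to:

    # DropChunk lengths are in samples; convert to milliseconds at 16 kHz.
    sample_rate = 16000
    for n_samples in (1000, 2000):
        print(f"{n_samples} samples = {1000 * n_samples / sample_rate:.1f} ms")
    # 1000 samples = 62.5 ms, 2000 samples = 125.0 ms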
+drop_chunk: !new:speechbrain.augment.time_domain.DropChunk + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 + +# Augmenter: Combines previously defined augmentations to perform data augmentation +wav_augment: !new:speechbrain.augment.augmenter.Augmenter + concat_original: True + min_augmentations: 4 + max_augmentations: 4 + augment_prob: 1.0 + augmentations: [ + !ref <speed_perturb>, + !ref <drop_freq>, + !ref <drop_chunk>] + +############################## Logging and Pretrainer ########################## + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref <save_folder> recoverables: diff --git a/recipes/LibriSpeech/ASR/CTC/hparams/downsampled/train_hf_wavlm_conv_downsampling.yaml b/recipes/LibriSpeech/ASR/CTC/hparams/downsampled/train_hf_wavlm_conv_downsampling.yaml index f92f4f8fc..1b84596dc 100644 --- a/recipes/LibriSpeech/ASR/CTC/hparams/downsampled/train_hf_wavlm_conv_downsampling.yaml +++ b/recipes/LibriSpeech/ASR/CTC/hparams/downsampled/train_hf_wavlm_conv_downsampling.yaml @@ -34,7 +34,8 @@ test_csv: - !ref <output_folder>/test-clean.csv - !ref <output_folder>/test-other.csv -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 1 lr: 0.9 lr_wav2vec: 0.0001 @@ -63,7 +64,8 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: !ref <test_batch_size> -# Model parameters +####################### Model Parameters ####################################### + activation: !name:torch.nn.LeakyReLU dnn_layers: 2 dnn_neurons: 1024 @@ -74,76 +76,12 @@ ctc_neurons: 29 output_neurons: 29 # Characters size, index(blank/eos/bos) = 0 blank_index: 0 -# Decoding parameters -test_beam_search: - beam_size: 200 - topk: 1 - blank_index: !ref <blank_index> - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -10.0 - token_prune_min_logp: -5 - prune_history: True - alpha: 0.5 - beta: 1.5 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - - # # Functions and classes # epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - -speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb - orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> - -# Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - -drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> - -# Time drop: randomly drops a number of temporal chunks. 
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - -drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> - -# Augmenter: Combines previously defined augmentations to perform data augmentation -wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: True - repeat_augment: 1 - shuffle_augmentations: False - min_augmentations: 4 - max_augmentations: 4 - augment_prob: 1.0 - augmentations: [ - !ref <speed_perturb>, - !ref <drop_freq>, - !ref <drop_chunk>] - enc: !new:speechbrain.lobes.models.VanillaNN.VanillaNN input_shape: [null, null, 1024] @@ -214,8 +152,60 @@ lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler annealing_factor: 0.9 patient: 0 +############################## Decoding ######################################## + +test_beam_search: + beam_size: 200 + topk: 1 + blank_index: !ref <blank_index> + space_token: ' ' # make sure this is the same as the one used in the tokenizer + beam_prune_logp: -10.0 + token_prune_min_logp: -5 + prune_history: True + alpha: 0.5 + beta: 1.5 + # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM + # It can either be a .bin or .arpa ; note: .arpa is much slower at loading + # If you don't want to use an LM, comment it out or set it to null + kenlm_model_path: null + label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder +############################## Augmentations ################################### + +# Speed perturbation +speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb + orig_freq: !ref <sample_rate> + speeds: [95, 100, 105] + +# Frequency drop: randomly drops a number of frequency bands to zero. +drop_freq: !new:speechbrain.augment.time_domain.DropFreq + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 + +# Time drop: randomly drops a number of temporal chunks. 
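The test_beam_search block added above is consumed at evaluation time. A minimal sketch of how such a configuration could drive a KenLM-assisted CTC beam search; CTCBeamSearcher and its exact argument names are assumptions to verify against the installed SpeechBrain version:

    from speechbrain.decoders.ctc import CTCBeamSearcher

    searcher = CTCBeamSearcher(
        blank_index=0,
        vocab_list=[" ", "a", "b", "c"],  # placeholder: taken from the label encoder
        beam_size=200,
        topk=1,
        beam_prune_logp=-10.0,
        token_prune_min_logp=-5.0,
        prune_history=True,
        alpha=0.5,              # LM weight (active only when a KenLM model is set)
        beta=1.5,               # word-insertion bonus
        kenlm_model_path=None,  # point to a .arpa/.bin file to enable the LM
    )
    # hyps = searcher(log_probs, wav_lens)  # log_probs: [batch, time, vocab]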
+drop_chunk: !new:speechbrain.augment.time_domain.DropChunk + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 + +# Augmenter: Combines previously defined augmentations to perform data augmentation +wav_augment: !new:speechbrain.augment.augmenter.Augmenter + concat_original: True + min_augmentations: 4 + max_augmentations: 4 + augment_prob: 1.0 + augmentations: [ + !ref <speed_perturb>, + !ref <drop_freq>, + !ref <drop_chunk>] + +############################## Logging and Pretrainer ########################## + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref <save_folder> recoverables: diff --git a/recipes/LibriSpeech/ASR/CTC/hparams/downsampled/train_hf_wavlm_signal_downsampling.yaml b/recipes/LibriSpeech/ASR/CTC/hparams/downsampled/train_hf_wavlm_signal_downsampling.yaml index 6c0e7207d..d0daf5b77 100644 --- a/recipes/LibriSpeech/ASR/CTC/hparams/downsampled/train_hf_wavlm_signal_downsampling.yaml +++ b/recipes/LibriSpeech/ASR/CTC/hparams/downsampled/train_hf_wavlm_signal_downsampling.yaml @@ -33,7 +33,8 @@ test_csv: - !ref <output_folder>/test-clean.csv - !ref <output_folder>/test-other.csv -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 1 lr: 0.9 lr_wav2vec: 0.0001 @@ -61,7 +62,8 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: !ref <test_batch_size> -# Model parameters +####################### Model Parameters ####################################### + activation: !name:torch.nn.LeakyReLU dnn_layers: 2 dnn_neurons: 1024 @@ -72,75 +74,12 @@ ctc_neurons: 58 # Twice bigger than the number of characters for upsampling output_neurons: 29 # Characters size, index(blank/eos/bos) = 0 blank_index: 0 -# Decoding parameters -test_beam_search: - beam_size: 200 - topk: 1 - blank_index: !ref <blank_index> - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -10.0 - token_prune_min_logp: -5 - prune_history: True - alpha: 0.5 - beta: 1.5 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - # # Functions and classes # epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - -speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb - orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> - -# Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - -drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> - -# Time drop: randomly drops a number of temporal chunks. 
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - -drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> - -# Augmenter: Combines previously defined augmentations to perform data augmentation -wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: True - repeat_augment: 1 - shuffle_augmentations: False - min_augmentations: 4 - max_augmentations: 4 - augment_prob: 1.0 - augmentations: [ - !ref <speed_perturb>, - !ref <drop_freq>, - !ref <drop_chunk>] - enc: !new:speechbrain.lobes.models.VanillaNN.VanillaNN input_shape: [null, null, 1024] @@ -210,8 +149,58 @@ lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler annealing_factor: 0.9 patient: 0 +############################## Decoding ######################################## + +test_beam_search: + beam_size: 200 + topk: 1 + blank_index: !ref <blank_index> + space_token: ' ' # make sure this is the same as the one used in the tokenizer + beam_prune_logp: -10.0 + token_prune_min_logp: -5 + prune_history: True + alpha: 0.5 + beta: 1.5 + # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM + # It can either be a .bin or .arpa ; note: .arpa is much slower at loading + # If you don't want to use an LM, comment it out or set it to null + kenlm_model_path: null + label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder +############################## Augmentations ################################### + +# Speed perturbation +speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb + orig_freq: !ref <sample_rate> + speeds: [95, 100, 105] + +# Frequency drop: randomly drops a number of frequency bands to zero. +drop_freq: !new:speechbrain.augment.time_domain.DropFreq + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 + +# Time drop: randomly drops a number of temporal chunks. 
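The NewBobScheduler appearing in the hunk context above lowers the learning rate whenever validation improvement stalls. A simplified sketch of the new-bob rule, illustrative only (the 0.0025 improvement threshold is an assumed default, not taken from this recipe):

    def new_bob_step(lr, prev_loss, loss, annealing_factor=0.9,
                     improvement_threshold=0.0025, patience_left=0):
        # Anneal the LR when the relative improvement is too small,
        # after the allowed number of non-improving epochs is used up.
        improvement = (prev_loss - loss) / prev_loss
        if improvement < improvement_threshold:
            if patience_left > 0:
                return lr, patience_left - 1
            return lr * annealing_factor, 0
        return lr, patience_left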
+drop_chunk: !new:speechbrain.augment.time_domain.DropChunk + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 + +# Augmenter: Combines previously defined augmentations to perform data augmentation +wav_augment: !new:speechbrain.augment.augmenter.Augmenter + concat_original: True + min_augmentations: 4 + max_augmentations: 4 + augment_prob: 1.0 + augmentations: [ + !ref <speed_perturb>, + !ref <drop_freq>, + !ref <drop_chunk>] + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref <save_folder> recoverables: diff --git a/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec.yaml b/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec.yaml index 2d91909f2..1d860a29f 100644 --- a/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec.yaml +++ b/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec.yaml @@ -32,7 +32,8 @@ test_csv: - !ref <output_folder>/test-clean.csv - !ref <output_folder>/test-other.csv -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 1 lr: 0.9 lr_wav2vec: 0.0001 @@ -56,7 +57,7 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: !ref <test_batch_size> -# Model parameters +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dnn_layers: 2 dnn_neurons: 1024 @@ -66,75 +67,14 @@ freeze_wav2vec: True output_neurons: 29 # BPE size, index(blank/eos/bos) = 0 blank_index: 0 -# Decoding parameters -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref <blank_index> - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - # # Functions and classes # -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - -speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb - orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> - -# Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - -drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> - -# Time drop: randomly drops a number of temporal chunks. 
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - -drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> - -# Augmenter: Combines previously defined augmentations to perform data augmentation -wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: True - repeat_augment: 1 - shuffle_augmentations: False - min_augmentations: 4 - max_augmentations: 4 - augment_prob: 1.0 - augmentations: [ - !ref <speed_perturb>, - !ref <drop_freq>, - !ref <drop_chunk>] +label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref <number_of_epochs> enc: !new:speechbrain.lobes.models.VanillaNN.VanillaNN input_shape: [null, null, 1024] @@ -198,7 +138,58 @@ lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler annealing_factor: 0.9 patient: 0 -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder +############################## Augmentations ################################### + +# Speed perturbation +speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb + orig_freq: !ref <sample_rate> + speeds: [95, 100, 105] + +# Frequency drop: randomly drops a number of frequency bands to zero. +drop_freq: !new:speechbrain.augment.time_domain.DropFreq + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 + +# Time drop: randomly drops a number of temporal chunks. 
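The augmentations above are combined by the wav_augment block that follows. In plain Python the same pipeline looks roughly like this; the constructor arguments mirror the YAML, while the (waveforms, relative-lengths) call convention is an assumption:

    import torch
    from speechbrain.augment.augmenter import Augmenter
    from speechbrain.augment.time_domain import SpeedPerturb, DropFreq, DropChunk

    wav_augment = Augmenter(
        concat_original=True,  # clean batch is appended -> labels must be replicated
        min_augmentations=4,
        max_augmentations=4,
        augment_prob=1.0,
        augmentations=[
            SpeedPerturb(orig_freq=16000, speeds=[95, 100, 105]),
            DropFreq(drop_freq_low=0, drop_freq_high=1,
                     drop_freq_count_low=1, drop_freq_count_high=3,
                     drop_freq_width=0.05),
            DropChunk(drop_length_low=1000, drop_length_high=2000,
                      drop_count_low=1, drop_count_high=5),
        ],
    )
    wavs, lens = torch.randn(4, 16000), torch.ones(4)  # dummy [batch, time] audio
    aug_wavs, aug_lens = wav_augment(wavs, lens)       # assumed call signature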
+drop_chunk: !new:speechbrain.augment.time_domain.DropChunk + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 + +# Augmenter: Combines previously defined augmentations to perform data augmentation +wav_augment: !new:speechbrain.augment.augmenter.Augmenter + concat_original: True + min_augmentations: 4 + max_augmentations: 4 + augment_prob: 1.0 + augmentations: [ + !ref <speed_perturb>, + !ref <drop_freq>, + !ref <drop_chunk>] + +############################## Decoding ######################################## + +# Decoding parameters +test_beam_search: + beam_size: 143 + topk: 1 + blank_index: !ref <blank_index> + space_token: ' ' # make sure this is the same as the one used in the tokenizer + beam_prune_logp: -12.0 + token_prune_min_logp: -1.2 + prune_history: True + alpha: 0.8 + beta: 1.2 + # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM + # It can either be a .bin or .arpa ; note: .arpa is much slower at loading + # If you don't want to use an LM, comment it out or set it to null + kenlm_model_path: null + +############################## Logging and Pretrainer ########################## checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref <save_folder> diff --git a/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec_rnn_rescoring.yaml b/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec_rnn_rescoring.yaml index 01a31cdd8..c946b0243 100644 --- a/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec_rnn_rescoring.yaml +++ b/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec_rnn_rescoring.yaml @@ -32,7 +32,8 @@ test_csv: - !ref <output_folder>/test-clean.csv - !ref <output_folder>/test-other.csv -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 1 lr: 0.9 lr_wav2vec: 0.0001 @@ -56,7 +57,8 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: !ref <test_batch_size> -# Model parameters +####################### Model Parameters ####################################### + activation: !name:torch.nn.LeakyReLU dnn_layers: 2 dnn_neurons: 1024 @@ -68,49 +70,6 @@ output_neurons: 29 # BPE size, index(blank/eos/bos) = 0 pretrained_lm_tokenizer_path: speechbrain/asr-crdnn-rnnlm-librispeech -# This is the RNNLM that is used according to the Huggingface repository -# NB: It has to match the pre-trained RNNLM!! -lm_model: !new:speechbrain.lobes.models.RNNLM.RNNLM - output_neurons: 1000 - embedding_dim: 128 - activation: !name:torch.nn.LeakyReLU - dropout: 0.0 - rnn_layers: 2 - rnn_neurons: 2048 - dnn_blocks: 1 - dnn_neurons: 512 - return_hidden: True # For inference - -tokenizer: !new:sentencepiece.SentencePieceProcessor - -# Decoding parameters -lm_weight: 0.5 -blank_index: 0 -# topk is the number of hypotheses that will be rescored in the rescorer -# lowering this value might decrease the wer, but will increase speed. 
- -test_beam_search: - beam_size: 20 - topk: 20 - blank_index: !ref <blank_index> - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -12.0 - prune_history: False - alpha: 0.8 - beta: 1.2 - -rnnlm: !new:speechbrain.decoders.scorer.RNNLMRescorer - language_model: !ref <lm_model> - tokenizer: !ref <tokenizer> - bos_index: 0 - eos_index: 0 - pad_index: 0 - -rescorer: !new:speechbrain.decoders.scorer.RescorerBuilder - rescorers: [!ref <rnnlm>] - weights: - rnnlm: !ref <lm_weight> # # Functions and classes @@ -118,53 +77,6 @@ rescorer: !new:speechbrain.decoders.scorer.RescorerBuilder epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - -speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb - orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> - -# Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - -drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> - -# Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - -drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> - -# Augmenter: Combines previously defined augmentations to perform data augmentation -wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: True - repeat_augment: 1 - shuffle_augmentations: False - min_augmentations: 4 - max_augmentations: 4 - augment_prob: 1.0 - augmentations: [ - !ref <speed_perturb>, - !ref <drop_freq>, - !ref <drop_chunk>] - enc: !new:speechbrain.lobes.models.VanillaNN.VanillaNN input_shape: [null, null, 1024] @@ -230,6 +142,84 @@ lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder +# This is the RNNLM that is used according to the Huggingface repository +# NB: It has to match the pre-trained RNNLM!! +lm_model: !new:speechbrain.lobes.models.RNNLM.RNNLM + output_neurons: 1000 + embedding_dim: 128 + activation: !name:torch.nn.LeakyReLU + dropout: 0.0 + rnn_layers: 2 + rnn_neurons: 2048 + dnn_blocks: 1 + dnn_neurons: 512 + return_hidden: True # For inference + + +tokenizer: !new:sentencepiece.SentencePieceProcessor + +############################## Decoding ######################################## + +# topk is the number of hypotheses that will be rescored in the rescorer +# lowering this value might decrease the wer, but will increase speed. 
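In other words, the final ranking interpolates the beam-search score with the neural LM score, weighted by lm_weight. A toy illustration of that re-ranking (conceptual only, not the RescorerBuilder API):

    def rescore(hypotheses, lm_weight=0.5):
        # hypotheses: list of (text, beam_search_logp, lm_logp) triples.
        return sorted(hypotheses,
                      key=lambda h: h[1] + lm_weight * h[2],
                      reverse=True)

    # The LM can overturn the acoustic ranking:
    best = rescore([("a b sea", -4.0, -14.3), ("a b c", -4.2, -9.1)])[0]
    # -> ("a b c", ...), since -4.2 + 0.5 * -9.1 > -4.0 + 0.5 * -14.3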
+test_beam_search: + beam_size: 20 + topk: 20 + blank_index: !ref <blank_index> + space_token: ' ' # make sure this is the same as the one used in the tokenizer + beam_prune_logp: -12.0 + token_prune_min_logp: -12.0 + prune_history: False + alpha: 0.8 + beta: 1.2 + +rnnlm: !new:speechbrain.decoders.scorer.RNNLMRescorer + language_model: !ref <lm_model> + tokenizer: !ref <tokenizer> + bos_index: 0 + eos_index: 0 + pad_index: 0 + +rescorer: !new:speechbrain.decoders.scorer.RescorerBuilder + rescorers: [!ref <rnnlm>] + weights: + rnnlm: !ref <lm_weight> + +############################## Augmentations ################################### + +# Speed perturbation +speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb + orig_freq: !ref <sample_rate> + speeds: [95, 100, 105] + +# Frequency drop: randomly drops a number of frequency bands to zero. +drop_freq: !new:speechbrain.augment.time_domain.DropFreq + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 + +# Time drop: randomly drops a number of temporal chunks. +drop_chunk: !new:speechbrain.augment.time_domain.DropChunk + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 + +# Augmenter: Combines previously defined augmentations to perform data augmentation +wav_augment: !new:speechbrain.augment.augmenter.Augmenter + concat_original: True + min_augmentations: 4 + max_augmentations: 4 + augment_prob: 1.0 + augmentations: [ + !ref <speed_perturb>, + !ref <drop_freq>, + !ref <drop_chunk>] + +############################## Logging and Pretrainer ########################## + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref <save_folder> recoverables: diff --git a/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec_transformer_rescoring.yaml b/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec_transformer_rescoring.yaml index 724c3bf1e..d806b20cf 100644 --- a/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec_transformer_rescoring.yaml +++ b/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec_transformer_rescoring.yaml @@ -32,7 +32,8 @@ test_csv: - !ref <output_folder>/test-clean.csv - !ref <output_folder>/test-other.csv -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 1 lr: 0.9 lr_wav2vec: 0.0001 @@ -56,7 +57,7 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: !ref <test_batch_size> -# Model parameters +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dnn_layers: 2 dnn_neurons: 1024 @@ -88,30 +89,6 @@ tokenizer: !new:sentencepiece.SentencePieceProcessor # Decoding parameters lm_weight: 0.5 blank_index: 0 -# topk is the number of hypotheses that will be rescored in the rescorer -# lowering this value might decrease the wer, but will increase speed. 
-test_beam_search: - beam_size: 20 - topk: 20 - blank_index: !ref <blank_index> - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -12.0 - prune_history: False - alpha: 0.8 - beta: 1.2 - -transformerlm: !new:speechbrain.decoders.scorer.TransformerLMRescorer - language_model: !ref <lm_model> - tokenizer: !ref <tokenizer> - pad_index: 0 - bos_index: 1 - eos_index: 2 - -rescorer: !new:speechbrain.decoders.scorer.RescorerBuilder - rescorers: [!ref <transformerlm>] - weights: - transformerlm: !ref <lm_weight> # # Functions and classes @@ -119,53 +96,6 @@ rescorer: !new:speechbrain.decoders.scorer.RescorerBuilder epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - -speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb - orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> - -# Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - -drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> - -# Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - -drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> - -# Augmenter: Combines previously defined augmentations to perform data augmentation -wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: True - repeat_augment: 1 - shuffle_augmentations: False - min_augmentations: 4 - max_augmentations: 4 - augment_prob: 1.0 - augmentations: [ - !ref <speed_perturb>, - !ref <drop_freq>, - !ref <drop_chunk>] - enc: !new:speechbrain.lobes.models.VanillaNN.VanillaNN input_shape: [null, null, 1024] @@ -229,8 +159,68 @@ lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler annealing_factor: 0.9 patient: 0 +############################## Augmentations ################################### + +# Speed perturbation +speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb + orig_freq: !ref <sample_rate> + speeds: [95, 100, 105] + +# Frequency drop: randomly drops a number of frequency bands to zero. +drop_freq: !new:speechbrain.augment.time_domain.DropFreq + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 + +# Time drop: randomly drops a number of temporal chunks. 
+drop_chunk: !new:speechbrain.augment.time_domain.DropChunk + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 + +# Augmenter: Combines previously defined augmentations to perform data augmentation +wav_augment: !new:speechbrain.augment.augmenter.Augmenter + concat_original: True + min_augmentations: 4 + max_augmentations: 4 + augment_prob: 1.0 + augmentations: [ + !ref <speed_perturb>, + !ref <drop_freq>, + !ref <drop_chunk>] + label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder +############################## Decoding ######################################## + +# topk is the number of hypotheses that will be rescored in the rescorer +# lowering this value might decrease the wer, but will increase speed. +test_beam_search: + beam_size: 20 + topk: 20 + blank_index: !ref <blank_index> + space_token: ' ' # make sure this is the same as the one used in the tokenizer + beam_prune_logp: -12.0 + token_prune_min_logp: -12.0 + prune_history: False + alpha: 0.8 + beta: 1.2 + +transformerlm: !new:speechbrain.decoders.scorer.TransformerLMRescorer + language_model: !ref <lm_model> + tokenizer: !ref <tokenizer> + pad_index: 0 + bos_index: 1 + eos_index: 2 + +rescorer: !new:speechbrain.decoders.scorer.RescorerBuilder + rescorers: [!ref <transformerlm>] + weights: + transformerlm: !ref <lm_weight> + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref <save_folder> recoverables: diff --git a/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_whisper_encoder.yaml b/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_whisper_encoder.yaml index 735b29db9..ba20bf2ac 100644 --- a/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_whisper_encoder.yaml +++ b/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_whisper_encoder.yaml @@ -31,7 +31,8 @@ test_csv: - !ref <output_folder>/test-clean.csv - !ref <output_folder>/test-other.csv -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 15 warmup_steps: 1000 # We freeze whisper for 1000 steps to let the CTC adapt lr: 0.0008 @@ -61,7 +62,7 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: !ref <test_batch_size> -# Model parameters +####################### Model Parameters ####################################### dnn_neurons: 1024 freeze_whisper: False whisper_output_dim: 512 @@ -71,74 +72,12 @@ whisper_output_dim: 512 output_neurons: 29 # BPE size, index(blank/eos/bos) = 0 blank_index: 0 -# Decoding parameters -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref <blank_index> - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null # # Functions and classes # epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - -speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb - orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> - -# Frequency drop: randomly drops a number of frequency bands to zero. 
-drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - -drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> - -# Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - -drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> - -# Augmenter: Combines previously defined augmentations to perform data augmentation -wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: True - repeat_augment: 1 - shuffle_augmentations: False - min_augmentations: 4 - max_augmentations: 4 - augment_prob: 1.0 - augmentations: [ - !ref <speed_perturb>, - !ref <drop_freq>, - !ref <drop_chunk>] - enc: !new:speechbrain.nnet.containers.Sequential input_shape: [null, null, !ref <whisper_output_dim>] @@ -204,6 +143,57 @@ lr_annealing_whisper: !new:speechbrain.nnet.schedulers.NewBobScheduler annealing_factor: 0.75 patient: 0 +############################## Decoding ######################################## + +test_beam_search: + beam_size: 143 + topk: 1 + blank_index: !ref <blank_index> + space_token: ' ' # make sure this is the same as the one used in the tokenizer + beam_prune_logp: -12.0 + token_prune_min_logp: -1.2 + prune_history: True + alpha: 0.8 + beta: 1.2 + # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM + # It can either be a .bin or .arpa ; note: .arpa is much slower at loading + # If you don't want to use an LM, comment it out or set it to null + kenlm_model_path: null + +############################## Augmentations ################################### + +# Speed perturbation +speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb + orig_freq: !ref <sample_rate> + speeds: [95, 100, 105] + +# Frequency drop: randomly drops a number of frequency bands to zero. +drop_freq: !new:speechbrain.augment.time_domain.DropFreq + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 + +# Time drop: randomly drops a number of temporal chunks. 
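The warmup_steps value introduced above keeps Whisper frozen for the first 1000 steps so that the randomly initialised CTC head can adapt first. In the training loop, this kind of gate typically looks like the following (a hypothetical helper, not the recipe's actual code):

    def set_encoder_trainable(encoder, step: int, warmup_steps: int = 1000):
        # Unfreeze the pretrained encoder only once the warm-up is over.
        trainable = step >= warmup_steps
        for p in encoder.parameters():
            p.requires_grad = trainable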
+drop_chunk: !new:speechbrain.augment.time_domain.DropChunk + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 + +# Augmenter: Combines previously defined augmentations to perform data augmentation +wav_augment: !new:speechbrain.augment.augmenter.Augmenter + concat_original: True + min_augmentations: 4 + max_augmentations: 4 + augment_prob: 1.0 + augmentations: [ + !ref <speed_perturb>, + !ref <drop_freq>, + !ref <drop_chunk>] + +############################## Logging and Pretrainer ########################## checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref <save_folder> diff --git a/recipes/LibriSpeech/ASR/CTC/hparams/train_sb_wav2vec.yaml b/recipes/LibriSpeech/ASR/CTC/hparams/train_sb_wav2vec.yaml index cf9cf7ec8..1b281b35c 100644 --- a/recipes/LibriSpeech/ASR/CTC/hparams/train_sb_wav2vec.yaml +++ b/recipes/LibriSpeech/ASR/CTC/hparams/train_sb_wav2vec.yaml @@ -33,7 +33,7 @@ test_csv: - !ref <output_folder>/test-clean.csv - !ref <output_folder>/test-other.csv -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 30 lr: 0.0003 lr_wav2vec: 0.00005 @@ -58,7 +58,7 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: !ref <test_batch_size> -# Model parameters +####################### Model Parameters ####################################### dnn_activation: !new:torch.nn.LeakyReLU dnn_neurons: 1280 dnn_dropout: 0.15 @@ -68,75 +68,12 @@ freeze_wav2vec: False output_neurons: 29 # BPE size, index(blank/eos/bos) = 0 blank_index: 0 -# Decoding parameters -test_beam_search: - beam_size: 200 - topk: 1 - blank_index: !ref <blank_index> - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -10.0 - token_prune_min_logp: -5.0 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - # # Functions and classes # epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - -speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb - orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> - -# Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - -drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> - -# Time drop: randomly drops a number of temporal chunks. 
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - -drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> - -# Augmenter: Combines previously defined augmentations to perform data augmentation -wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: True - repeat_augment: 1 - shuffle_augmentations: False - min_augmentations: 4 - max_augmentations: 4 - augment_prob: 1.0 - augmentations: [ - !ref <speed_perturb>, - !ref <drop_freq>, - !ref <drop_chunk>] - enc: !new:speechbrain.nnet.containers.Sequential input_shape: [null, null, !ref <wav2vec_output_dim>] @@ -227,6 +164,58 @@ lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler annealing_factor: 0.7 patient: 0 +############################## Decoding ######################################## + +test_beam_search: + beam_size: 200 + topk: 1 + blank_index: !ref <blank_index> + space_token: ' ' # make sure this is the same as the one used in the tokenizer + beam_prune_logp: -10.0 + token_prune_min_logp: -5.0 + prune_history: True + alpha: 0.8 + beta: 1.2 + # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM + # It can either be a .bin or .arpa ; note: .arpa is much slower at loading + # If you don't want to use an LM, comment it out or set it to null + kenlm_model_path: null + +############################## Augmentations ################################### + +# Speed perturbation +speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb + orig_freq: !ref <sample_rate> + speeds: [95, 100, 105] + +# Frequency drop: randomly drops a number of frequency bands to zero. +drop_freq: !new:speechbrain.augment.time_domain.DropFreq + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 + +# Time drop: randomly drops a number of temporal chunks. 
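Because concat_original: True enlarges the waveform batch, the CTC targets must be enlarged to match; the training-script changes further below switch to replicate_multiple_labels for exactly this reason. Conceptually, and assuming plain batch-dimension replication, the operation amounts to:

    import torch

    def replicate_labels(tokens, tokens_lens, n_copies=2):
        # Repeat label tensors along the batch dimension so they line up
        # with the augmented-plus-original waveform batch.
        return (torch.cat([tokens] * n_copies, dim=0),
                torch.cat([tokens_lens] * n_copies, dim=0))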
+drop_chunk: !new:speechbrain.augment.time_domain.DropChunk + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 + +# Augmenter: Combines previously defined augmentations to perform data augmentation +wav_augment: !new:speechbrain.augment.augmenter.Augmenter + concat_original: True + min_augmentations: 4 + max_augmentations: 4 + augment_prob: 1.0 + augmentations: [ + !ref <speed_perturb>, + !ref <drop_freq>, + !ref <drop_chunk>] + +############################## Logging and Pretrainer ########################## + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref <save_folder> recoverables: diff --git a/recipes/LibriSpeech/ASR/CTC/train_with_wav2vec.py b/recipes/LibriSpeech/ASR/CTC/train_with_wav2vec.py index f2b6373b2..1f4ccdd2c 100644 --- a/recipes/LibriSpeech/ASR/CTC/train_with_wav2vec.py +++ b/recipes/LibriSpeech/ASR/CTC/train_with_wav2vec.py @@ -101,10 +101,15 @@ class ASR(sb.Brain): ids = batch.id tokens, tokens_lens = batch.tokens - # Label Augmentation + # Labels must be extended if parallel augmentation or concatenated + # augmentation was performed on the input (increasing the time dimension) if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): - tokens = self.hparams.wav_augment.replicate_labels(tokens) - tokens_lens = self.hparams.wav_augment.replicate_labels(tokens_lens) + ( + tokens, + tokens_lens, + ) = self.hparams.wav_augment.replicate_multiple_labels( + tokens, tokens_lens + ) loss_ctc = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) loss = loss_ctc diff --git a/recipes/LibriSpeech/ASR/CTC/train_with_whisper.py b/recipes/LibriSpeech/ASR/CTC/train_with_whisper.py index e3db36334..d575265e8 100644 --- a/recipes/LibriSpeech/ASR/CTC/train_with_whisper.py +++ b/recipes/LibriSpeech/ASR/CTC/train_with_whisper.py @@ -72,10 +72,15 @@ class ASR(sb.Brain): ids = batch.id tokens, tokens_lens = batch.tokens - # Label Augmentation + # Labels must be extended if parallel augmentation or concatenated + # augmentation was performed on the input (increasing the time dimension) if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): - tokens = self.hparams.wav_augment.replicate_labels(tokens) - tokens_lens = self.hparams.wav_augment.replicate_labels(tokens_lens) + ( + tokens, + tokens_lens, + ) = self.hparams.wav_augment.replicate_multiple_labels( + tokens, tokens_lens + ) loss_ctc = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) loss = loss_ctc diff --git a/recipes/LibriSpeech/ASR/seq2seq/hparams/train_BPE_1000.yaml b/recipes/LibriSpeech/ASR/seq2seq/hparams/train_BPE_1000.yaml index e38b545fb..3d0aaa200 100644 --- a/recipes/LibriSpeech/ASR/seq2seq/hparams/train_BPE_1000.yaml +++ b/recipes/LibriSpeech/ASR/seq2seq/hparams/train_BPE_1000.yaml @@ -44,7 +44,8 @@ data_folder_noise: !ref <data_folder>/noise # The noisy sequencies for data augm NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1 noise_annotation: !ref <save_folder>/noise.csv #The data manifest files are created by the data preparation script -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 15 number_of_ctc_epochs: 5 batch_size: 8 @@ -89,7 +90,8 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -# Model parameters +####################### Model Parameters ####################################### + activation: !name:torch.nn.LeakyReLU dropout: 0.15 
cnn_blocks: 2 @@ -128,74 +130,6 @@ coverage_penalty: 1.5 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Download and prepare the dataset of noisy sequences for augmentation -prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL - URL: !ref <NOISE_DATASET_URL> - dest_folder: !ref <data_folder_noise> - ext: wav - csv_file: !ref <noise_annotation> - - -# Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation - -add_noise: !new:speechbrain.augment.time_domain.AddNoise - csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> - noise_sample_rate: !ref <sample_rate> - clean_sample_rate: !ref <sample_rate> - num_workers: !ref <num_workers> - -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - -speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb - orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> - -# Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - -drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> - -# Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - -drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> - -# Augmenter: Combines previously defined augmentations to perform data augmentation -wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: True - repeat_augment: 1 - shuffle_augmentations: False - min_augmentations: 4 - max_augmentations: 4 - augment_prob: 1.0 - augmentations: [ - !ref <add_noise>, - !ref <speed_perturb>, - !ref <drop_freq>, - !ref <drop_chunk>] - normalize: !new:speechbrain.processing.features.InputNormalization norm_type: global @@ -288,7 +222,8 @@ modules: model: !new:torch.nn.ModuleList - [!ref <enc>, !ref <emb>, !ref <dec>, !ref <ctc_lin>, !ref <seq_lin>] -# Scorer +############################## Decoding & optimiser ############################ + coverage_scorer: !new:speechbrain.decoders.scorer.CoverageScorer vocab_size: !ref <output_neurons> @@ -339,6 +274,57 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler annealing_factor: 0.8 patient: 0 +############################## Augmentations ################################### + +prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL + URL: !ref <NOISE_DATASET_URL> + dest_folder: !ref <data_folder_noise> + ext: wav + csv_file: !ref <noise_annotation> + +# Add noise to input signal +add_noise: 
!new:speechbrain.augment.time_domain.AddNoise + csv_file: !ref <noise_annotation> + snr_low: 0 + snr_high: 15 + noise_sample_rate: !ref <sample_rate> + clean_sample_rate: !ref <sample_rate> + num_workers: !ref <num_workers> + +# Speed perturbation +speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb + orig_freq: !ref <sample_rate> + speeds: [95, 100, 105] + +# Frequency drop: randomly drops a number of frequency bands to zero. +drop_freq: !new:speechbrain.augment.time_domain.DropFreq + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 + +# Time drop: randomly drops a number of temporal chunks. +drop_chunk: !new:speechbrain.augment.time_domain.DropChunk + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 + +# Augmenter: Combines previously defined augmentations to perform data augmentation +wav_augment: !new:speechbrain.augment.augmenter.Augmenter + concat_original: True + min_augmentations: 4 + max_augmentations: 4 + augment_prob: 1.0 + augmentations: [ + !ref <add_noise>, + !ref <speed_perturb>, + !ref <drop_freq>, + !ref <drop_chunk>] + +############################## Logging and Pretrainer ########################## + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref <save_folder> recoverables: diff --git a/recipes/LibriSpeech/ASR/seq2seq/hparams/train_BPE_1000_sligru.yaml b/recipes/LibriSpeech/ASR/seq2seq/hparams/train_BPE_1000_sligru.yaml index 164f1ffe7..355c49d36 100644 --- a/recipes/LibriSpeech/ASR/seq2seq/hparams/train_BPE_1000_sligru.yaml +++ b/recipes/LibriSpeech/ASR/seq2seq/hparams/train_BPE_1000_sligru.yaml @@ -44,7 +44,8 @@ data_folder_noise: !ref <data_folder>/noise # The noisy sequencies for data augm NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1 noise_annotation: !ref <save_folder>/noise.csv #The data manifest files are created by the data preparation script -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 15 number_of_ctc_epochs: 15 batch_size: 24 @@ -89,7 +90,8 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -# Model parameters +####################### Model Parameters ####################################### + activation: !name:torch.nn.LeakyReLU dropout: 0.15 cnn_blocks: 2 @@ -126,75 +128,6 @@ temperature_lm: 1.25 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Download and prepare the dataset of noisy sequences for augmentation -prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL - URL: !ref <NOISE_DATASET_URL> - dest_folder: !ref <data_folder_noise> - ext: wav - csv_file: !ref <noise_annotation> - - -# Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation - -add_noise: !new:speechbrain.augment.time_domain.AddNoise - csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> - noise_sample_rate: !ref <sample_rate> - clean_sample_rate: !ref <sample_rate> - num_workers: !ref <num_workers> - -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - -speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb - orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> - -# Frequency drop: randomly drops a number of frequency bands to zero. 
-drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - -drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> - -# Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - -drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> - -# Augmenter: Combines previously defined augmentations to perform data augmentation -wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: True - repeat_augment: 1 - shuffle_augmentations: False - min_augmentations: 4 - max_augmentations: 4 - augment_prob: 1.0 - augmentations: [ - !ref <add_noise>, - !ref <speed_perturb>, - !ref <drop_freq>, - !ref <drop_chunk>] - - normalize: !new:speechbrain.processing.features.InputNormalization norm_type: global @@ -286,7 +219,8 @@ modules: model: !new:torch.nn.ModuleList - [!ref <enc>, !ref <emb>, !ref <dec>, !ref <ctc_lin>, !ref <seq_lin>] -# Scorer +############################## Decoding & optimiser ############################ + coverage_scorer: !new:speechbrain.decoders.scorer.CoverageScorer vocab_size: !ref <output_neurons> @@ -337,6 +271,57 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler annealing_factor: 0.8 patient: 0 +############################## Augmentations ################################### + +prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL + URL: !ref <NOISE_DATASET_URL> + dest_folder: !ref <data_folder_noise> + ext: wav + csv_file: !ref <noise_annotation> + +# Add noise to input signal +add_noise: !new:speechbrain.augment.time_domain.AddNoise + csv_file: !ref <noise_annotation> + snr_low: 0 + snr_high: 15 + noise_sample_rate: !ref <sample_rate> + clean_sample_rate: !ref <sample_rate> + num_workers: !ref <num_workers> + +# Speed perturbation +speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb + orig_freq: !ref <sample_rate> + speeds: [95, 100, 105] + +# Frequency drop: randomly drops a number of frequency bands to zero. +drop_freq: !new:speechbrain.augment.time_domain.DropFreq + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 + +# Time drop: randomly drops a number of temporal chunks. 
+drop_chunk: !new:speechbrain.augment.time_domain.DropChunk + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 + +# Augmenter: Combines previously defined augmentations to perform data augmentation +wav_augment: !new:speechbrain.augment.augmenter.Augmenter + concat_original: True + min_augmentations: 4 + max_augmentations: 4 + augment_prob: 1.0 + augmentations: [ + !ref <add_noise>, + !ref <speed_perturb>, + !ref <drop_freq>, + !ref <drop_chunk>] + +############################## Logging and Pretrainer ########################## + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref <save_folder> recoverables: diff --git a/recipes/LibriSpeech/ASR/seq2seq/hparams/train_BPE_5000.yaml b/recipes/LibriSpeech/ASR/seq2seq/hparams/train_BPE_5000.yaml index cc0647562..3046dfea8 100644 --- a/recipes/LibriSpeech/ASR/seq2seq/hparams/train_BPE_5000.yaml +++ b/recipes/LibriSpeech/ASR/seq2seq/hparams/train_BPE_5000.yaml @@ -45,7 +45,8 @@ data_folder_noise: !ref <data_folder>/noise # The noisy sequencies for data augm NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1 noise_annotation: !ref <save_folder>/noise.csv #The data manifest files are created by the data preparation script -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 25 number_of_ctc_epochs: 25 batch_size: 8 @@ -90,7 +91,8 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -# Model parameters +####################### Model Parameters ####################################### + activation: !name:torch.nn.LeakyReLU dropout: 0.15 cnn_blocks: 2 @@ -132,75 +134,6 @@ coverage_penalty: 1.5 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Download and prepare the dataset of noisy sequences for augmentation -prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL - URL: !ref <NOISE_DATASET_URL> - dest_folder: !ref <data_folder_noise> - ext: wav - csv_file: !ref <noise_annotation> - - -# Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation - -add_noise: !new:speechbrain.augment.time_domain.AddNoise - csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> - noise_sample_rate: !ref <sample_rate> - clean_sample_rate: !ref <sample_rate> - num_workers: !ref <num_workers> - -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - -speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb - orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> - -# Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - -drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> - -# Time drop: randomly drops a number of temporal chunks. 
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - -drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> - -# Augmenter: Combines previously defined augmentations to perform data augmentation -wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: True - repeat_augment: 1 - shuffle_augmentations: False - min_augmentations: 4 - max_augmentations: 4 - augment_prob: 1.0 - augmentations: [ - !ref <add_noise>, - !ref <speed_perturb>, - !ref <drop_freq>, - !ref <drop_chunk>] - - normalize: !new:speechbrain.processing.features.InputNormalization norm_type: global @@ -294,7 +227,8 @@ modules: model: !new:torch.nn.ModuleList - [!ref <enc>, !ref <emb>, !ref <dec>, !ref <ctc_lin>, !ref <seq_lin>] -# Scorer +############################## Decoding & optimiser ############################ + ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer eos_index: !ref <eos_index> blank_index: !ref <blank_index> @@ -360,6 +294,57 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler annealing_factor: 0.8 patient: 0 +############################## Augmentations ################################### + +prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL + URL: !ref <NOISE_DATASET_URL> + dest_folder: !ref <data_folder_noise> + ext: wav + csv_file: !ref <noise_annotation> + +# Add noise to input signal +add_noise: !new:speechbrain.augment.time_domain.AddNoise + csv_file: !ref <noise_annotation> + snr_low: 0 + snr_high: 15 + noise_sample_rate: !ref <sample_rate> + clean_sample_rate: !ref <sample_rate> + num_workers: !ref <num_workers> + +# Speed perturbation +speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb + orig_freq: !ref <sample_rate> + speeds: [95, 100, 105] + +# Frequency drop: randomly drops a number of frequency bands to zero. +drop_freq: !new:speechbrain.augment.time_domain.DropFreq + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 + +# Time drop: randomly drops a number of temporal chunks. 
+drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
+    drop_length_low: 1000
+    drop_length_high: 2000
+    drop_count_low: 1
+    drop_count_high: 5
+
+# Augmenter: Combines previously defined augmentations to perform data augmentation
+wav_augment: !new:speechbrain.augment.augmenter.Augmenter
+    concat_original: True
+    min_augmentations: 4
+    max_augmentations: 4
+    augment_prob: 1.0
+    augmentations: [
+        !ref <add_noise>,
+        !ref <speed_perturb>,
+        !ref <drop_freq>,
+        !ref <drop_chunk>]
+
+############################## Logging and Pretrainer ##########################
+
 checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
     checkpoints_dir: !ref <save_folder>
     recoverables:
diff --git a/recipes/LibriSpeech/ASR/seq2seq/train.py b/recipes/LibriSpeech/ASR/seq2seq/train.py
index b3adaa67a..7f5351008 100644
--- a/recipes/LibriSpeech/ASR/seq2seq/train.py
+++ b/recipes/LibriSpeech/ASR/seq2seq/train.py
@@ -97,12 +97,16 @@ class ASR(sb.Brain):
         tokens_eos, tokens_eos_lens = batch.tokens_eos
         tokens, tokens_lens = batch.tokens
 
+        # Labels must be extended if parallel augmentation or concatenated
+        # augmentation was performed on the input (increasing the batch dimension)
         if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"):
-            tokens = self.hparams.wav_augment.replicate_labels(tokens)
-            tokens_lens = self.hparams.wav_augment.replicate_labels(tokens_lens)
-            tokens_eos = self.hparams.wav_augment.replicate_labels(tokens_eos)
-            tokens_eos_lens = self.hparams.wav_augment.replicate_labels(
-                tokens_eos_lens
+            (
+                tokens,
+                tokens_lens,
+                tokens_eos,
+                tokens_eos_lens,
+            ) = self.hparams.wav_augment.replicate_multiple_labels(
+                tokens, tokens_lens, tokens_eos, tokens_eos_lens
             )
 
         loss_seq = self.hparams.seq_cost(
diff --git a/recipes/LibriSpeech/ASR/transducer/hparams/conformer_transducer.yaml b/recipes/LibriSpeech/ASR/transducer/hparams/conformer_transducer.yaml
index c7ad99c63..e9757e208 100644
--- a/recipes/LibriSpeech/ASR/transducer/hparams/conformer_transducer.yaml
+++ b/recipes/LibriSpeech/ASR/transducer/hparams/conformer_transducer.yaml
@@ -40,7 +40,8 @@ test_csv:
 skip_prep: False
 ckpt_interval_minutes: 5 # save checkpoint every N min
 
-# Training parameters
+####################### Training Parameters ####################################
+
 # To make Transformers converge, the global bath size should be large enough.
 # The global batch size is computed as batch_size * n_gpus * grad_accumulation_factor.
 # Empirically, we found that this value should be >= 128.
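The global batch size comment above (repeated across the Transformer recipes touched by this patch) encodes a simple product. A quick arithmetic check with hypothetical values; the recipe's actual batch_size and grad_accumulation_factor are defined elsewhere in the YAML and may differ:

# Hypothetical values for illustration only; the real ones live in the YAML.
batch_size = 16
n_gpus = 4
grad_accumulation_factor = 2

global_batch_size = batch_size * n_gpus * grad_accumulation_factor
assert global_batch_size >= 128  # the empirical threshold quoted above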
@@ -120,7 +121,8 @@ dynamic_batch_sampler: batch_ordering: random max_batch_ex: 256 -# Model parameters +####################### Model Parameters ####################################### + # Transformer d_model: 512 joint_dim: 640 @@ -164,18 +166,15 @@ compute_features: !new:speechbrain.lobes.features.Fbank n_mels: !ref <n_mels> win_length: !ref <win_length> +############################## Augmentations ################################### + # Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 1 max_augmentations: 1 augment_prob: 1.0 @@ -183,43 +182,24 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter # Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 5 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 5 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "zeros" # Method of dropping chunks - time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 5 + drop_count_high: 5 + replace: "zeros" # Frequency Drop -freq_drop_length_low: 25 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 35 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "zeros" # Method of dropping chunks - freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 25 + drop_length_high: 35 + drop_count_low: 2 + drop_count_high: 2 + replace: "zeros" dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter parallel_augment: False @@ -234,6 +214,8 @@ fea_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <freq_drop>, !ref <time_warp>] +############################## Models ########################################## + CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd input_shape: (8, 10, 80) num_blocks: 2 @@ -355,6 +337,8 @@ modules: model: !new:torch.nn.ModuleList - [!ref <CNN>, !ref <enc>, !ref <emb>, !ref <dec>, !ref <proj_enc>, !ref <proj_dec>, !ref <proj_ctc>, !ref <transducer_lin>] 
+############################## Decoding & optimiser ############################
+
 # Tokenizer initialization
 tokenizer: !new:sentencepiece.SentencePieceProcessor
 
@@ -388,6 +372,8 @@ noam_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler
     lr_initial: !ref <lr>
     n_warmup_steps: !ref <warmup_steps>
 
+############################## Logging and Pretrainer ##########################
+
 checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
     checkpoints_dir: !ref <save_folder>
     recoverables:
diff --git a/recipes/LibriSpeech/ASR/transducer/train.py b/recipes/LibriSpeech/ASR/transducer/train.py
index 497912c83..84d7e05ff 100644
--- a/recipes/LibriSpeech/ASR/transducer/train.py
+++ b/recipes/LibriSpeech/ASR/transducer/train.py
@@ -155,27 +155,16 @@ class ASR(sb.Brain):
         logits_transducer, wav_lens, predicted_tokens = predictions
 
         if stage == sb.Stage.TRAIN:
-            if hasattr(self.hparams, "wav_augment"):
-                tokens = self.hparams.wav_augment.replicate_labels(tokens)
-                token_lens = self.hparams.wav_augment.replicate_labels(
-                    token_lens
-                )
-                tokens_eos = self.hparams.wav_augment.replicate_labels(
-                    tokens_eos
-                )
-                token_eos_lens = self.hparams.wav_augment.replicate_labels(
-                    token_eos_lens
-                )
+            # Labels must be extended if parallel augmentation or concatenated
+            # augmentation was performed on the input (increasing the batch dimension)
             if hasattr(self.hparams, "fea_augment"):
-                tokens = self.hparams.fea_augment.replicate_labels(tokens)
-                token_lens = self.hparams.fea_augment.replicate_labels(
-                    token_lens
-                )
-                tokens_eos = self.hparams.fea_augment.replicate_labels(
-                    tokens_eos
-                )
-                token_eos_lens = self.hparams.fea_augment.replicate_labels(
-                    token_eos_lens
+                (
+                    tokens,
+                    token_lens,
+                    tokens_eos,
+                    token_eos_lens,
+                ) = self.hparams.fea_augment.replicate_multiple_labels(
+                    tokens, token_lens, tokens_eos, token_eos_lens
                 )
 
         if stage == sb.Stage.TRAIN:
diff --git a/recipes/LibriSpeech/ASR/transformer/hparams/bayesspeech.yaml b/recipes/LibriSpeech/ASR/transformer/hparams/bayesspeech.yaml
index 7772517e9..2eee3646e 100644
--- a/recipes/LibriSpeech/ASR/transformer/hparams/bayesspeech.yaml
+++ b/recipes/LibriSpeech/ASR/transformer/hparams/bayesspeech.yaml
@@ -42,7 +42,8 @@ test_csv:
 
 ckpt_interval_minutes: 30 # save checkpoint every N min
 
-# Training parameters
+####################### Training Parameters ####################################
+
 # To make Transformers converge, the global bath size should be large enough.
 # The global batch size is computed as batch_size * n_gpus * grad_accumulation_factor.
 # Empirically, we found that this value should be >= 128.
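The transducer/train.py hunk above replaces a chain of per-tensor replicate_labels calls with a single replicate_multiple_labels call. A minimal sketch of the underlying idea, assuming augmentation enlarged the input batch n_copies times; the helper below is illustrative, not SpeechBrain's actual implementation:

import torch

def replicate_multiple_labels_sketch(n_copies, *label_tensors):
    # Tile each label tensor along the batch axis so it lines up with an
    # input batch that concat/parallel augmentation enlarged n_copies times.
    return tuple(
        t.repeat(n_copies, *([1] * (t.dim() - 1))) for t in label_tensors
    )

tokens = torch.randint(0, 100, (8, 20))  # (batch, seq_len)
token_lens = torch.ones(8)
tokens, token_lens = replicate_multiple_labels_sketch(2, tokens, token_lens)
assert tokens.shape[0] == 16 and token_lens.shape[0] == 16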
@@ -116,7 +117,7 @@ test_dataloader_opts: padding_kwargs: value: !ref <pad_index> -####################### Model parameters ########################### +####################### Model Parameters ####################################### # Transformer d_model: 512 nhead: 4 @@ -148,7 +149,7 @@ test_beam_size: 66 lm_weight: 0.60 ctc_weight_decode: 0.40 -############################## models ################################ +############################## Models ########################################## CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd input_shape: (8, 10, 80) @@ -216,7 +217,8 @@ Adam: !name:torch.optim.Adam eps: 0.000000001 -# Scorer +############################## Decoding & optimiser ############################ + ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer eos_index: !ref <eos_index> blank_index: !ref <blank_index> @@ -290,57 +292,34 @@ normalize: !new:speechbrain.processing.features.InputNormalization norm_type: global update_until_epoch: 4 -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 4 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 4 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "mean" # Method of dropping chunks - time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" # Frequency Drop -freq_drop_length_low: 10 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 20 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 4 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 4 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "mean" # Method of dropping chunks - freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 10 + drop_length_high: 20 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -354,6 +333,8 @@ compute_features: !new:speechbrain.lobes.features.Fbank n_fft: !ref <n_fft> 
n_mels: !ref <n_mels> +############################## Logging and Pretrainer ########################## + train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref <train_log> diff --git a/recipes/LibriSpeech/ASR/transformer/hparams/branchformer_large.yaml b/recipes/LibriSpeech/ASR/transformer/hparams/branchformer_large.yaml index c3f66ec9f..02fc2eac4 100644 --- a/recipes/LibriSpeech/ASR/transformer/hparams/branchformer_large.yaml +++ b/recipes/LibriSpeech/ASR/transformer/hparams/branchformer_large.yaml @@ -41,9 +41,11 @@ test_csv: - !ref <output_folder>/test-clean.csv - !ref <output_folder>/test-other.csv -# Training parameters +####################### Training Parameters #################################### + # To make Transformers converge, the global bath size should be large enough. -# The global batch size is computed as batch_size * n_gpus * grad_accumulation_factor. +# The global batch size is computed as batch_size * n_gpus * +# grad_accumulation_factor. # Empirically, we found that this value should be >= 128. # Please, set your parameters accordingly. number_of_epochs: 120 @@ -103,7 +105,8 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -####################### Model parameters ########################### +####################### Model Parameters ####################################### + # Transformer d_model: 512 nhead: 8 @@ -131,7 +134,7 @@ test_beam_size: 66 lm_weight: 0.60 ctc_weight_decode: 0.40 -############################## models ################################ +############################## Models ########################################## CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd input_shape: (8, 10, 80) @@ -204,7 +207,8 @@ Adam: !name:torch.optim.AdamW eps: 0.000000001 weight_decay: !ref <weight_decay> -# Scorer +####################### Decoding & optimiser ################################### + ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer eos_index: !ref <eos_index> blank_index: !ref <blank_index> @@ -275,57 +279,34 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +####################### Augmentations ########################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 4 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 4 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "mean" # Method of dropping chunks - time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 - -# Frequency Drop -freq_drop_length_low: 10 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 20 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 4 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 4 # 
Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "mean" # Method of dropping chunks + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" +# Freq Drop freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 10 + drop_length_high: 20 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -340,6 +321,8 @@ compute_features: !new:speechbrain.lobes.features.Fbank win_length: !ref <win_length> n_mels: !ref <n_mels> +############################## Logging and Pretrainer ########################## + train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref <train_log> diff --git a/recipes/LibriSpeech/ASR/transformer/hparams/conformer_large.yaml b/recipes/LibriSpeech/ASR/transformer/hparams/conformer_large.yaml index 5d252a4b5..7cdd4c06f 100644 --- a/recipes/LibriSpeech/ASR/transformer/hparams/conformer_large.yaml +++ b/recipes/LibriSpeech/ASR/transformer/hparams/conformer_large.yaml @@ -41,7 +41,8 @@ test_csv: - !ref <output_folder>/test-clean.csv - !ref <output_folder>/test-other.csv -# Training parameters +####################### Training Parameters #################################### + # To make Transformers converge, the global bath size should be large enough. # The global batch size is computed as batch_size * n_gpus * grad_accumulation_factor. # Empirically, we found that this value should be >= 128. 
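In the SpectrogramDrop entries above, dim selects which axis of the (batch, time, n_mels) feature tensor is masked: time_drop relies on the class default for the time axis, while freq_drop sets dim: 2 for the mel axis. A rough sketch of the masking behaviour, assuming dropped chunks are filled with the spectrogram mean when replace is "mean" (illustrative only, not SpeechBrain's implementation):

import torch

def spectrogram_drop_sketch(feats, length, count, dim, replace="mean"):
    # feats: (batch, time, n_mels). Overwrite `count` random chunks of
    # size `length` along `dim` (1 = time, 2 = frequency) with a fill value.
    fill = feats.mean() if replace == "mean" else 0.0
    size = feats.shape[dim]
    for _ in range(count):
        start = torch.randint(0, max(size - length, 1), (1,)).item()
        idx = [slice(None)] * feats.dim()
        idx[dim] = slice(start, start + length)
        feats[tuple(idx)] = fill
    return feats

feats = torch.randn(4, 100, 80)
feats = spectrogram_drop_sketch(feats, length=20, count=4, dim=1)  # time drop
feats = spectrogram_drop_sketch(feats, length=15, count=4, dim=2)  # freq drop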
@@ -102,7 +103,8 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -####################### Model parameters ########################### +####################### Model Parameters ####################################### + # Transformer d_model: 512 nhead: 8 @@ -129,7 +131,7 @@ test_beam_size: 66 lm_weight: 0.60 ctc_weight_decode: 0.40 -############################## models ################################ +############################## Models ########################################## CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd input_shape: (8, 10, 80) @@ -200,7 +202,8 @@ Adam: !name:torch.optim.AdamW model: !new:torch.nn.ModuleList - [!ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>] -# Scorer +####################### Decoding & optimiser ########################### + ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer eos_index: !ref <eos_index> blank_index: !ref <blank_index> @@ -271,57 +274,34 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 4 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 4 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "mean" # Method of dropping chunks - time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 - -# Frequency Drop -freq_drop_length_low: 10 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 20 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 4 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 4 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "mean" # Method of dropping chunks + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" +# Freq Drop freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 10 + drop_length_high: 20 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 
max_augmentations: 3 augment_prob: 1.0 @@ -336,6 +316,8 @@ compute_features: !new:speechbrain.lobes.features.Fbank n_mels: !ref <n_mels> win_length: !ref <win_length> +############################## Logging and Pretrainer ########################## + train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref <train_log> diff --git a/recipes/LibriSpeech/ASR/transformer/hparams/conformer_small.yaml b/recipes/LibriSpeech/ASR/transformer/hparams/conformer_small.yaml index eddc96780..a24e6649a 100644 --- a/recipes/LibriSpeech/ASR/transformer/hparams/conformer_small.yaml +++ b/recipes/LibriSpeech/ASR/transformer/hparams/conformer_small.yaml @@ -40,7 +40,8 @@ test_csv: - !ref <output_folder>/test-clean.csv - !ref <output_folder>/test-other.csv -# Training parameters +####################### Training Parameters #################################### + # To make Transformers converge, the global bath size should be large enough. # The global batch size is computed as batch_size * n_gpus * grad_accumulation_factor. # Empirically, we found that this value should be >= 128. @@ -102,7 +103,8 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -####################### Model parameters ########################### +####################### Model Parameters ####################################### + # Transformer d_model: 144 nhead: 4 @@ -129,7 +131,7 @@ test_beam_size: 66 lm_weight: 0.60 ctc_weight_decode: 0.40 -############################## models ################################ +############################## Models ########################################## CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd input_shape: (8, 10, 80) @@ -200,12 +202,8 @@ Adam: !name:torch.optim.Adam betas: (0.9, 0.98) eps: 0.000000001 -#SGD: !name:torch.optim.SGD -# lr: !ref <lr_sgd> -# momentum: 0.99 -# nesterov: True +############################## Decoding & optimiser ############################ -# Scorer ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer eos_index: !ref <eos_index> blank_index: !ref <blank_index> @@ -277,57 +275,34 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 4 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 4 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "mean" # Method of dropping chunks - time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 - -# Frequency Drop -freq_drop_length_low: 10 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 20 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 4 # Min number of chunks to drop in 
frequency in the spectrogram -freq_drop_count_high: 4 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "mean" # Method of dropping chunks + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" +# Freq Drop freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 10 + drop_length_high: 20 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -341,6 +316,8 @@ compute_features: !new:speechbrain.lobes.features.Fbank n_fft: !ref <n_fft> n_mels: !ref <n_mels> +############################## Logging and Pretrainer ########################## + train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref <train_log> diff --git a/recipes/LibriSpeech/ASR/transformer/hparams/hyperbranchformer_13M.yaml b/recipes/LibriSpeech/ASR/transformer/hparams/hyperbranchformer_13M.yaml index 7b2912ec3..4b6ca718f 100644 --- a/recipes/LibriSpeech/ASR/transformer/hparams/hyperbranchformer_13M.yaml +++ b/recipes/LibriSpeech/ASR/transformer/hparams/hyperbranchformer_13M.yaml @@ -40,7 +40,8 @@ test_csv: - !ref <output_folder>/test-clean.csv - !ref <output_folder>/test-other.csv -# Training parameters +############################## Training Parameters ############################# + # To make Transformers converge, the global bath size should be large enough. # The global batch size is computed as batch_size * n_gpus * grad_accumulation_factor. # Empirically, we found that this value should be >= 128. 
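The fea_augment blocks above set min_augmentations and max_augmentations to 3 with exactly three transforms listed, so every batch should receive time_drop, freq_drop, and time_warp in order. A sketch of the selection policy this implies, assuming shuffle_augmentations stays at its removed default of False:

import random

def pick_augmentations(augmentations, min_n, max_n, shuffle=False):
    # Draw N in [min_n, max_n] and apply the first N transforms in the
    # listed order; with min_n == max_n == len(augmentations), all of
    # them are always selected.
    n = random.randint(min_n, max_n)
    order = list(augmentations)
    if shuffle:
        random.shuffle(order)
    return order[:n]

chosen = pick_augmentations(["time_drop", "freq_drop", "time_warp"], 3, 3)
assert chosen == ["time_drop", "freq_drop", "time_warp"]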
@@ -99,7 +100,8 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -####################### Model parameters ########################### +####################### Model Parameters ####################################### + # Transformer d_model: 144 nhead: 8 @@ -133,7 +135,7 @@ test_beam_size: 66 lm_weight: 0.60 ctc_weight_decode: 0.40 -############################## models ################################ +############################## Models ########################################## CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd input_shape: (8, 10, 80) @@ -206,7 +208,8 @@ Adam: !name:torch.optim.Adam betas: (0.9, 0.98) eps: 0.000000001 -# Scorer +############################## Decoding & optimiser ############################ + ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer eos_index: !ref <eos_index> blank_index: !ref <blank_index> @@ -276,57 +279,34 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 4 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 4 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "mean" # Method of dropping chunks - time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 - -# Frequency Drop -freq_drop_length_low: 10 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 20 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 4 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 4 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "mean" # Method of dropping chunks + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" +# Freq Drop freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 10 + drop_length_high: 20 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -340,6 +320,8 @@ 
compute_features: !new:speechbrain.lobes.features.Fbank n_fft: !ref <n_fft> n_mels: !ref <n_mels> +############################## Logging and Pretrainer ########################## + train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref <train_log> diff --git a/recipes/LibriSpeech/ASR/transformer/hparams/hyperbranchformer_25M.yaml b/recipes/LibriSpeech/ASR/transformer/hparams/hyperbranchformer_25M.yaml index 71d97cdb9..2e0242e31 100644 --- a/recipes/LibriSpeech/ASR/transformer/hparams/hyperbranchformer_25M.yaml +++ b/recipes/LibriSpeech/ASR/transformer/hparams/hyperbranchformer_25M.yaml @@ -40,7 +40,8 @@ test_csv: - !ref <output_folder>/test-clean.csv - !ref <output_folder>/test-other.csv -# Training parameters +############################## Training Parameters ############################# + # To make Transformers converge, the global bath size should be large enough. # The global batch size is computed as batch_size * n_gpus * grad_accumulation_factor. # Empirically, we found that this value should be >= 128. @@ -100,7 +101,7 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -####################### Model parameters ########################### +####################### Model Parameters ####################################### # Transformer d_model: 256 nhead: 8 @@ -134,7 +135,7 @@ test_beam_size: 66 lm_weight: 0.60 ctc_weight_decode: 0.40 -############################## models ################################ +############################## Models ########################################## CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd input_shape: (8, 10, 80) @@ -207,7 +208,8 @@ Adam: !name:torch.optim.Adam betas: (0.9, 0.98) eps: 0.000000001 -# Scorer +############################## Decoding & optimiser ############################ + ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer eos_index: !ref <eos_index> blank_index: !ref <blank_index> @@ -277,57 +279,34 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 4 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 4 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "mean" # Method of dropping chunks - time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 - -# Frequency Drop -freq_drop_length_low: 10 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 20 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 4 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 4 # Max number of chunks to drop in frequency in the spectrogram 
-freq_drop_replace: "mean" # Method of dropping chunks + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" +# Freq Drop freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 10 + drop_length_high: 20 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -335,11 +314,14 @@ fea_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <time_drop>, !ref <freq_drop>, !ref <time_warp>] + compute_features: !new:speechbrain.lobes.features.Fbank sample_rate: !ref <sample_rate> n_fft: !ref <n_fft> n_mels: !ref <n_mels> +############################## Logging and Pretrainer ########################## + train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref <train_log> diff --git a/recipes/LibriSpeech/ASR/transformer/hparams/hyperconformer_22M.yaml b/recipes/LibriSpeech/ASR/transformer/hparams/hyperconformer_22M.yaml index fdf65fde3..6e165ed5c 100644 --- a/recipes/LibriSpeech/ASR/transformer/hparams/hyperconformer_22M.yaml +++ b/recipes/LibriSpeech/ASR/transformer/hparams/hyperconformer_22M.yaml @@ -40,7 +40,8 @@ test_csv: - !ref <output_folder>/test-clean.csv - !ref <output_folder>/test-other.csv -# Training parameters +####################### Training Parameters #################################### + # To make Transformers converge, the global bath size should be large enough. # The global batch size is computed as batch_size * n_gpus * grad_accumulation_factor. # Empirically, we found that this value should be >= 128. 
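The compute_features entries above build SpeechBrain's Fbank lobe from the recipe's sample_rate, n_fft, and n_mels references. A minimal usage sketch with values typical for this recipe family (16 kHz, 80 mel bins are assumptions here; the concrete numbers are resolved from each YAML via !ref):

import torch
from speechbrain.lobes.features import Fbank

# Assumed typical values; the YAML resolves the real ones via !ref.
fbank = Fbank(sample_rate=16000, n_fft=400, n_mels=80)

wavs = torch.randn(4, 16000)  # a batch of four 1-second waveforms
feats = fbank(wavs)           # -> roughly (4, ~100, 80): batch, frames, mels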
@@ -87,6 +88,7 @@ dynamic_batch_sampler_valid: batch_ordering: !ref <batch_ordering> max_batch_ex: !ref <max_batch_ex> + # Dataloader options train_dataloader_opts: batch_size: !ref <batch_size> @@ -99,7 +101,8 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -####################### Model parameters ########################### +####################### Model Parameters ####################################### + # Transformer d_model: 256 nhead: 8 @@ -132,7 +135,7 @@ test_beam_size: 66 lm_weight: 0.60 ctc_weight_decode: 0.40 -############################## models ################################ +############################## Models ########################################## CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd input_shape: (8, 10, 80) @@ -203,7 +206,8 @@ Adam: !name:torch.optim.Adam betas: (0.9, 0.98) eps: 0.000000001 -# Scorer +####################### Decoding & optimiser ################################### + ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer eos_index: !ref <eos_index> blank_index: !ref <blank_index> @@ -273,57 +277,34 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 4 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 4 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "mean" # Method of dropping chunks - time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 - -# Frequency Drop -freq_drop_length_low: 10 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 20 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 4 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 4 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "mean" # Method of dropping chunks + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" +# Freq Drop freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 10 + drop_length_high: 20 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: 
!new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -337,6 +318,8 @@ compute_features: !new:speechbrain.lobes.features.Fbank n_fft: !ref <n_fft> n_mels: !ref <n_mels> +############################## Logging and Pretrainer ########################## + train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref <train_log> diff --git a/recipes/LibriSpeech/ASR/transformer/hparams/hyperconformer_8M.yaml b/recipes/LibriSpeech/ASR/transformer/hparams/hyperconformer_8M.yaml index 47053b5ff..fe3bd599c 100644 --- a/recipes/LibriSpeech/ASR/transformer/hparams/hyperconformer_8M.yaml +++ b/recipes/LibriSpeech/ASR/transformer/hparams/hyperconformer_8M.yaml @@ -40,7 +40,8 @@ test_csv: - !ref <output_folder>/test-clean.csv - !ref <output_folder>/test-other.csv -# Training parameters +####################### Training Parameters #################################### + # To make Transformers converge, the global bath size should be large enough. # The global batch size is computed as batch_size * n_gpus * grad_accumulation_factor. # Empirically, we found that this value should be >= 128. @@ -100,7 +101,8 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -####################### Model parameters ########################### +####################### Model Parameters ####################################### + # Transformer d_model: 144 nhead: 8 @@ -133,7 +135,7 @@ test_beam_size: 66 lm_weight: 0.60 ctc_weight_decode: 0.40 -############################## models ################################ +############################## Models ########################################## CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd input_shape: (8, 10, 80) @@ -204,7 +206,8 @@ Adam: !name:torch.optim.Adam betas: (0.9, 0.98) eps: 0.000000001 -# Scorer +####################### Decoding & optimiser ########################### + ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer eos_index: !ref <eos_index> blank_index: !ref <blank_index> @@ -274,57 +277,34 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 4 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 4 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "mean" # Method of dropping chunks - time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 - -# Frequency Drop -freq_drop_length_low: 10 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 20 # Max length for chunks to drop in frequency in 
the spectrogram -freq_drop_count_low: 4 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 4 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "mean" # Method of dropping chunks + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" +# Freq Drop freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 10 + drop_length_high: 20 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -338,6 +318,8 @@ compute_features: !new:speechbrain.lobes.features.Fbank n_fft: !ref <n_fft> n_mels: !ref <n_mels> +############################## Logging and Pretrainer ########################## + train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref <train_log> diff --git a/recipes/LibriSpeech/ASR/transformer/hparams/train_hf_whisper.yaml b/recipes/LibriSpeech/ASR/transformer/hparams/train_hf_whisper.yaml index 4805d7c6c..4891ca617 100644 --- a/recipes/LibriSpeech/ASR/transformer/hparams/train_hf_whisper.yaml +++ b/recipes/LibriSpeech/ASR/transformer/hparams/train_hf_whisper.yaml @@ -35,7 +35,8 @@ test_csv: ckpt_interval_minutes: 30 # save checkpoint every N min -# Training parameters +############################## Training Parameters ############################# + number_of_epochs: 1 lr_whisper: 0.00003 sorting: ascending @@ -61,7 +62,7 @@ min_decode_ratio: 0.0 max_decode_ratio: 1.0 test_beam_size: 8 -# Model parameters +####################### Model Parameters ####################################### freeze_whisper: False @@ -74,52 +75,34 @@ valid_loader_kwargs: test_loader_kwargs: batch_size: !ref <test_batch_size> - -# -# Functions and classes -# epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. 
-drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 # Min frequency band dropout probability + drop_freq_high: 1 # Max frequency band dropout probability + drop_freq_count_low: 1 # Min number of frequency bands to drop + drop_freq_count_high: 3 # Max number of frequency bands to drop + drop_freq_width: 0.05 # Width of frequency bands to drop # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -128,6 +111,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + whisper: !new:speechbrain.lobes.models.huggingface_transformers.whisper.Whisper source: !ref <whisper_hub> freeze: !ref <freeze_whisper> @@ -142,6 +127,8 @@ nll_loss: !name:speechbrain.nnet.losses.nll_loss modules: whisper: !ref <whisper> +############################## Decoding & optimiser ############################ + whisper_opt_class: !name:torch.optim.AdamW lr: !ref <lr_whisper> weight_decay: 0.01 @@ -167,6 +154,8 @@ lr_annealing_whisper: !new:speechbrain.nnet.schedulers.NewBobScheduler annealing_factor: 0.9 patient: 0 +############################## Logging and Pretrainer ########################## + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref <save_folder> recoverables: diff --git a/recipes/LibriSpeech/ASR/transformer/hparams/transformer.yaml b/recipes/LibriSpeech/ASR/transformer/hparams/transformer.yaml index 36626f0d9..173453e9d 100644 --- a/recipes/LibriSpeech/ASR/transformer/hparams/transformer.yaml +++ b/recipes/LibriSpeech/ASR/transformer/hparams/transformer.yaml @@ -42,7 +42,8 @@ test_csv: ckpt_interval_minutes: 30 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### + # To make Transformers converge, the global batch size should be large enough. # The global batch size is computed as batch_size * n_gpus * grad_accumulation_factor. # Empirically, we found that this value should be >= 128.
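As a worked example of the batch-size comment above (the numbers are illustrative, not taken from any recipe here): with batch_size: 16, 4 GPUs, and grad_accumulation_factor: 2, the global batch size is 16 * 4 * 2 = 128, which meets the >= 128 guideline; a single-GPU run at the same batch_size would need grad_accumulation_factor: 8 to match it.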
@@ -116,7 +117,8 @@ test_dataloader_opts: padding_kwargs: value: !ref <pad_index> -####################### Model parameters ########################### +####################### Model Parameters ####################################### + # Transformer d_model: 512 nhead: 4 @@ -142,7 +144,7 @@ test_beam_size: 66 lm_weight: 0.60 ctc_weight_decode: 0.40 -############################## models ################################ +############################## Models ########################################## CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd input_shape: (8, 10, 80) @@ -210,7 +212,8 @@ Adam: !name:torch.optim.Adam eps: 0.000000001 -# Scorer +####################### Decoding & optimiser ################################### + ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer eos_index: !ref <eos_index> blank_index: !ref <blank_index> @@ -284,57 +287,34 @@ normalize: !new:speechbrain.processing.features.InputNormalization norm_type: global update_until_epoch: 4 -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 4 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 4 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "mean" # Method of dropping chunks - time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 - -# Frequency Drop -freq_drop_length_low: 10 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 20 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 4 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 4 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "mean" # Method of dropping chunks + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" +# Freq Drop freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 10 + drop_length_high: 20 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -348,6 +328,8 @@ compute_features: !new:speechbrain.lobes.features.Fbank n_fft: 
!ref <n_fft> n_mels: !ref <n_mels> +############################## Logging and Pretrainer ########################## + train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref <train_log> diff --git a/recipes/LibriSpeech/ASR/transformer/train.py b/recipes/LibriSpeech/ASR/transformer/train.py index b69763e26..292d7cc42 100644 --- a/recipes/LibriSpeech/ASR/transformer/train.py +++ b/recipes/LibriSpeech/ASR/transformer/train.py @@ -114,16 +114,16 @@ class ASR(sb.core.Brain): tokens, tokens_lens = batch.tokens if stage == sb.Stage.TRAIN: + # Labels must be extended if parallel augmentation or concatenated + # augmentation was performed on the input (increasing the time dimension) if hasattr(self.hparams, "fea_augment"): - tokens = self.hparams.fea_augment.replicate_labels(tokens) - tokens_lens = self.hparams.fea_augment.replicate_labels( - tokens_lens - ) - tokens_eos = self.hparams.fea_augment.replicate_labels( - tokens_eos - ) - tokens_eos_lens = self.hparams.fea_augment.replicate_labels( - tokens_eos_lens + ( + tokens, + tokens_lens, + tokens_eos, + tokens_eos_lens, + ) = self.hparams.fea_augment.replicate_multiple_labels( + tokens, tokens_lens, tokens_eos, tokens_eos_lens ) loss_seq = self.hparams.seq_cost( diff --git a/recipes/LibriSpeech/G2P/hparams/hparams_g2p_rnn.yaml b/recipes/LibriSpeech/G2P/hparams/hparams_g2p_rnn.yaml index a565905c8..f487ffbe1 100644 --- a/recipes/LibriSpeech/G2P/hparams/hparams_g2p_rnn.yaml +++ b/recipes/LibriSpeech/G2P/hparams/hparams_g2p_rnn.yaml @@ -95,7 +95,7 @@ homograph_loss_weight: 2.0 lr: 0.002 save_for_pretrained: True -# Model parameters +####################### Model Parameters ####################################### output_neurons: !apply:speechbrain.utils.hparams.choice value: !ref <phn_tokenize> choices: diff --git a/recipes/LibriSpeech/G2P/hparams/hparams_g2p_transformer.yaml b/recipes/LibriSpeech/G2P/hparams/hparams_g2p_transformer.yaml index c75cd97bf..e1c0f44c7 100644 --- a/recipes/LibriSpeech/G2P/hparams/hparams_g2p_transformer.yaml +++ b/recipes/LibriSpeech/G2P/hparams/hparams_g2p_transformer.yaml @@ -95,7 +95,7 @@ lr_dont_halve_until_epoch: 1 lr_patience: 1 save_for_pretrained: True -# Model parameters +####################### Model Parameters ####################################### output_neurons: !apply:speechbrain.utils.hparams.choice value: !ref <phn_tokenize> choices: diff --git a/recipes/LibriSpeech/G2P/hparams/hparams_lm_rnn.yaml b/recipes/LibriSpeech/G2P/hparams/hparams_lm_rnn.yaml index dcb768259..7e1b7bc4a 100644 --- a/recipes/LibriSpeech/G2P/hparams/hparams_lm_rnn.yaml +++ b/recipes/LibriSpeech/G2P/hparams/hparams_lm_rnn.yaml @@ -50,7 +50,7 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger tokenizer_file: <output_folder>/save/phoneme_tokenizer.model -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 20 batch_size: 80 lr: 0.001 @@ -68,7 +68,7 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -# Model parameters +####################### Model Parameters ####################################### model_dim: !apply:speechbrain.utils.hparams.choice value: !ref <phn_tokenize> choices: diff --git a/recipes/LibriSpeech/G2P/hparams/hparams_lm_transformer.yaml b/recipes/LibriSpeech/G2P/hparams/hparams_lm_transformer.yaml index 2a9a434d6..5e319e3d8 100644 --- a/recipes/LibriSpeech/G2P/hparams/hparams_lm_transformer.yaml +++ b/recipes/LibriSpeech/G2P/hparams/hparams_lm_transformer.yaml @@ -39,7 +39,7 @@ train_logger: 
!new:speechbrain.utils.train_logger.FileTrainLogger # Tokenizer model (you must use the same tokenizer for LM and ASR training) tokenizer_file: <output_folder>/save/phoneme_tokenizer.model -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 20 batch_size: 80 lr: 0.001 @@ -57,7 +57,7 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -# Model parameters +####################### Model Parameters ####################################### emb_dim: 256 # dimension of the embeddings transformer_num_heads: 4 diff --git a/recipes/LibriSpeech/LM/hparams/RNNLM.yaml b/recipes/LibriSpeech/LM/hparams/RNNLM.yaml index b061b4fcc..0896de960 100644 --- a/recipes/LibriSpeech/LM/hparams/RNNLM.yaml +++ b/recipes/LibriSpeech/LM/hparams/RNNLM.yaml @@ -29,7 +29,7 @@ test_transcripts_pattern: "test*/**/*.trans.txt" # Tokenizer model tokenizer_file: https://www.dropbox.com/s/o7gnouwdoqchotj/1000_unigram.model?dl=1 -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 20 batch_size: 80 lr: 0.001 @@ -47,7 +47,7 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -# Model parameters +####################### Model Parameters ####################################### emb_size: 128 activation: !name:torch.nn.LeakyReLU dropout: 0.0 diff --git a/recipes/LibriSpeech/LM/hparams/transformer.yaml b/recipes/LibriSpeech/LM/hparams/transformer.yaml index c79ef5769..50123a4c3 100644 --- a/recipes/LibriSpeech/LM/hparams/transformer.yaml +++ b/recipes/LibriSpeech/LM/hparams/transformer.yaml @@ -29,7 +29,7 @@ test_transcripts_pattern: "test*/**/*.trans.txt" # Tokenizer model tokenizer_file: speechbrain/asr-transformer-transformerlm-librispeech/tokenizer.ckpt -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 20 batch_size: 16 lr: 10 diff --git a/recipes/LibriSpeech/Tokenizer/hparams/1K_unigram_subword_bpe.yaml b/recipes/LibriSpeech/Tokenizer/hparams/1K_unigram_subword_bpe.yaml index b5a9fa60e..9dda21f82 100644 --- a/recipes/LibriSpeech/Tokenizer/hparams/1K_unigram_subword_bpe.yaml +++ b/recipes/LibriSpeech/Tokenizer/hparams/1K_unigram_subword_bpe.yaml @@ -16,7 +16,7 @@ skip_prep: False train_csv: !ref <output_folder>/train.csv valid_csv: !ref <output_folder>/dev-clean.csv -# Training parameters +####################### Training Parameters #################################### token_type: unigram # ["unigram", "bpe", "char"] token_output: 1000 # index(blank/eos/bos/unk) = 0 character_coverage: 1.0 diff --git a/recipes/LibriSpeech/Tokenizer/hparams/5K_unigram_subword_bpe.yaml b/recipes/LibriSpeech/Tokenizer/hparams/5K_unigram_subword_bpe.yaml index c312ce5bb..1f328c6f1 100644 --- a/recipes/LibriSpeech/Tokenizer/hparams/5K_unigram_subword_bpe.yaml +++ b/recipes/LibriSpeech/Tokenizer/hparams/5K_unigram_subword_bpe.yaml @@ -16,7 +16,7 @@ skip_prep: False train_csv: !ref <output_folder>/train.csv valid_csv: !ref <output_folder>/dev-clean.csv -# Training parameters +####################### Training Parameters #################################### token_type: unigram # ["unigram", "bpe", "char"] token_output: 5000 # index(blank/eos/bos/unk) = 0 character_coverage: 1.0 diff --git a/recipes/LibriSpeech/self-supervised-learning/wav2vec2/hparams/wav2vec2_base.yaml b/recipes/LibriSpeech/self-supervised-learning/wav2vec2/hparams/wav2vec2_base.yaml index 4806f3e06..13ce0d220 100644 --- 
a/recipes/LibriSpeech/self-supervised-learning/wav2vec2/hparams/wav2vec2_base.yaml +++ b/recipes/LibriSpeech/self-supervised-learning/wav2vec2/hparams/wav2vec2_base.yaml @@ -48,7 +48,7 @@ test_dataloader_options: batch_size: 8 # DynamicBatching not used at testing time num_workers: 4 -# Training parameters +####################### Training Parameters #################################### lr: 0.0005 warmup: 30000 # This is equivalent to optimizer_step_limit - warmup @@ -63,7 +63,7 @@ mask_prob: 0.65 mask_length: 10 num_negatives: 100 -# Model parameters +####################### Model Parameters ####################################### embedding_dim: 768 extractor_dim: 512 final_dim: 256 diff --git a/recipes/MEDIA/ASR/CTC/hparams/train_hf_wav2vec.yaml b/recipes/MEDIA/ASR/CTC/hparams/train_hf_wav2vec.yaml index 924579a9c..70ef38de7 100644 --- a/recipes/MEDIA/ASR/CTC/hparams/train_hf_wav2vec.yaml +++ b/recipes/MEDIA/ASR/CTC/hparams/train_hf_wav2vec.yaml @@ -55,7 +55,7 @@ test_dataloader_options: sample_rate: 16000 feats_dim: 1024 -# Training parameters: +####################### Training Parameters #################################### number_of_epochs: 30 lr: 1 lr_wav2vec: 0.0001 @@ -67,7 +67,7 @@ patient: 0 patient_wav2vec: 0 sorting: ascending -# Model parameters: +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dnn_blocks: 3 dnn_neurons: 512 diff --git a/recipes/MEDIA/SLU/CTC/hparams/train_hf_wav2vec_full.yaml b/recipes/MEDIA/SLU/CTC/hparams/train_hf_wav2vec_full.yaml index 7be6f6b94..4f9bad2e7 100644 --- a/recipes/MEDIA/SLU/CTC/hparams/train_hf_wav2vec_full.yaml +++ b/recipes/MEDIA/SLU/CTC/hparams/train_hf_wav2vec_full.yaml @@ -57,7 +57,7 @@ test_dataloader_options: sample_rate: 16000 feats_dim: 1024 -# Training parameters: +####################### Training Parameters #################################### number_of_epochs: 30 lr: 1 lr_wav2vec: 0.0001 @@ -69,7 +69,7 @@ patient: 0 patient_wav2vec: 0 sorting: ascending -# Model parameters: +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dnn_blocks: 3 dnn_neurons: 512 diff --git a/recipes/MEDIA/SLU/CTC/hparams/train_hf_wav2vec_relax.yaml b/recipes/MEDIA/SLU/CTC/hparams/train_hf_wav2vec_relax.yaml index d631a6da8..8631e6e88 100644 --- a/recipes/MEDIA/SLU/CTC/hparams/train_hf_wav2vec_relax.yaml +++ b/recipes/MEDIA/SLU/CTC/hparams/train_hf_wav2vec_relax.yaml @@ -57,7 +57,7 @@ test_dataloader_options: sample_rate: 16000 feats_dim: 1024 -# Training parameters: +####################### Training Parameters #################################### number_of_epochs: 30 lr: 1 lr_wav2vec: 0.0001 @@ -69,7 +69,7 @@ patient: 0 patient_wav2vec: 0 sorting: ascending -# Model parameters: +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dnn_blocks: 3 dnn_neurons: 512 diff --git a/recipes/MultiWOZ/response_generation/gpt/hparams/train_gpt.yaml b/recipes/MultiWOZ/response_generation/gpt/hparams/train_gpt.yaml index 05b16bea3..5bb3b8ed8 100644 --- a/recipes/MultiWOZ/response_generation/gpt/hparams/train_gpt.yaml +++ b/recipes/MultiWOZ/response_generation/gpt/hparams/train_gpt.yaml @@ -58,7 +58,7 @@ max_history: 5 ignore_index: -100 label_smoothing: 0 -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 4 batch_size: 8 test_batch_size: 4 diff --git
a/recipes/MultiWOZ/response_generation/llama2/hparams/train_llama2.yaml b/recipes/MultiWOZ/response_generation/llama2/hparams/train_llama2.yaml index f7fd2b087..507115e83 100644 --- a/recipes/MultiWOZ/response_generation/llama2/hparams/train_llama2.yaml +++ b/recipes/MultiWOZ/response_generation/llama2/hparams/train_llama2.yaml @@ -40,7 +40,7 @@ max_history: 2 ignore_index: -100 label_smoothing: 0 -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 4 batch_size: 1 test_batch_size: 1 diff --git a/recipes/REAL-M/sisnr-estimation/hparams/pool_sisnrestimator.yaml b/recipes/REAL-M/sisnr-estimation/hparams/pool_sisnrestimator.yaml index 3f1da0919..c23c11c53 100644 --- a/recipes/REAL-M/sisnr-estimation/hparams/pool_sisnrestimator.yaml +++ b/recipes/REAL-M/sisnr-estimation/hparams/pool_sisnrestimator.yaml @@ -67,7 +67,7 @@ num_spks: 2 noprogressbar: False sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.0001 diff --git a/recipes/RescueSpeech/ASR/noise-robust/hparams/robust_asr_16k.yaml b/recipes/RescueSpeech/ASR/noise-robust/hparams/robust_asr_16k.yaml index 302dabe57..10e8e58e4 100644 --- a/recipes/RescueSpeech/ASR/noise-robust/hparams/robust_asr_16k.yaml +++ b/recipes/RescueSpeech/ASR/noise-robust/hparams/robust_asr_16k.yaml @@ -41,7 +41,7 @@ skip_prep: False # longer sentences certainly correspond to "open microphones". avoid_if_longer_than: 10.0 -## Model parameters- Enhance model +## Model Parameters - Enhance model dereverberate: False save_audio: True sample_rate: 16000 @@ -54,7 +54,7 @@ use_rand_shift: False min_shift: -8000 max_shift: 8000 -## Training parameters- ASR +####################### Training Parameters - ASR ############################ number_of_epochs: 10 lr_whisper: 0.00003 sorting: ascending diff --git a/recipes/SLURP/NLU/hparams/train.yaml b/recipes/SLURP/NLU/hparams/train.yaml index e2201d96b..7d88d62a9 100644 --- a/recipes/SLURP/NLU/hparams/train.yaml +++ b/recipes/SLURP/NLU/hparams/train.yaml @@ -28,14 +28,14 @@ asr_tokenizer_file: https://www.dropbox.com/s/o7gnouwdoqchotj/1000_unigram.model slu_tokenizer_file: https://www.dropbox.com/s/tmwq12r5vgcsif9/58_unigram.model?dl=1 skip_prep: False -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 20 batch_size: 16 lr: 0.0003 # token_type: unigram # ["unigram", "bpe", "char"] sorting: random -# Model parameters +####################### Model Parameters ####################################### # sample_rate: 1600 emb_size: 128 dec_neurons: 512 diff --git a/recipes/SLURP/Tokenizer/hparams/tokenizer_bpe58.yaml b/recipes/SLURP/Tokenizer/hparams/tokenizer_bpe58.yaml index 51f805b07..bf935024a 100644 --- a/recipes/SLURP/Tokenizer/hparams/tokenizer_bpe58.yaml +++ b/recipes/SLURP/Tokenizer/hparams/tokenizer_bpe58.yaml @@ -14,7 +14,7 @@ train_csv: !ref <output_folder>/train-type=direct.csv valid_csv: !ref <output_folder>/devel-type=direct.csv skip_prep: False -# Training parameters +####################### Training Parameters #################################### token_type: unigram # ["unigram", "bpe", "char"] token_output: 58 # index(blank/eos/bos/unk) = 0 character_coverage: 1.0 diff --git a/recipes/SLURP/direct/hparams/train.yaml b/recipes/SLURP/direct/hparams/train.yaml index 5a42c738c..038d2e59e 100644 --- a/recipes/SLURP/direct/hparams/train.yaml +++
b/recipes/SLURP/direct/hparams/train.yaml @@ -34,7 +34,7 @@ rir_annotation: !ref <save_folder>/rir.csv tokenizer_file: https://www.dropbox.com/s/tmwq12r5vgcsif9/58_unigram.model?dl=1 skip_prep: False -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 20 batch_size: 16 lr: 0.0003 @@ -42,7 +42,7 @@ lr: 0.0003 sorting: random ckpt_interval_minutes: 15 # save checkpoint every N min -# Model parameters +####################### Model Parameters ####################################### sample_rate: 16000 emb_size: 128 dec_neurons: 512 @@ -100,41 +100,31 @@ add_noise: !new:speechbrain.augment.time_domain.AddNoise clean_sample_rate: !ref <sample_rate> num_workers: !ref <num_workers> +############################## Augmentations ################################### + # Speed perturbation -speed_changes: [90, 95, 105, 110] # List of speed changes for time-stretching speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [90, 95, 105, 110] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. 
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 3 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 3 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 shuffle_augmentations: True min_augmentations: 1 max_augmentations: 4 @@ -146,7 +136,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] -# Models +############################## Models ########################################## + asr_model_source: speechbrain/asr-crdnn-rnnlm-librispeech slu_enc: !new:speechbrain.nnet.containers.Sequential diff --git a/recipes/SLURP/direct/hparams/train_with_wav2vec2.yaml b/recipes/SLURP/direct/hparams/train_with_wav2vec2.yaml index b383da5cb..84222db5f 100644 --- a/recipes/SLURP/direct/hparams/train_with_wav2vec2.yaml +++ b/recipes/SLURP/direct/hparams/train_with_wav2vec2.yaml @@ -32,7 +32,7 @@ skip_prep: False # URL for the wav2vec2 model, you can change to benchmark different models wav2vec2_hub: "facebook/hubert-base-ls960" -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 35 batch_size: 6 lr: 0.0003 @@ -47,7 +47,7 @@ freeze_wav2vec2: False # set to true to freeze the CONV part of the wav2vec2 model freeze_wav2vec2_conv: True -# Model parameters +####################### Model Parameters ####################################### sample_rate: 16000 emb_size: 128 dec_neurons: 512 @@ -96,45 +96,31 @@ seq_lin: !new:speechbrain.nnet.linear.Linear input_size: !ref <dec_neurons> n_neurons: !ref <output_neurons> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks.
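The Augmenter entries in this patch keep only the options that differ from the defaults (parallel_augment: False and repeat_augment: 1 are defaults being deleted above). A sketch of the equivalent object in Python, with toy tensors and an assumed 16 kHz rate (the SLURP recipe also lists add_noise in its pipeline; it is omitted here for brevity):

    import torch
    from speechbrain.augment.augmenter import Augmenter
    from speechbrain.augment.time_domain import SpeedPerturb, DropFreq, DropChunk

    augment = Augmenter(
        concat_original=True,        # keep the clean batch and append augmented copies
        shuffle_augmentations=True,  # apply the selected augmentations in random order
        min_augmentations=1,         # sample between 1 and 4 of the listed augmentations
        max_augmentations=4,
        augment_prob=1.0,            # always augment during training
        augmentations=[
            SpeedPerturb(orig_freq=16000, speeds=[90, 95, 105, 110]),
            DropFreq(),
            DropChunk(),
        ],
    )

    wavs, lens = torch.randn(8, 16000), torch.ones(8)
    wavs_aug, lens_aug = augment(wavs, lens)
    # With concat_original=True the batch grows (here 8 -> 16), so any labels
    # must be replicated to match; see replicate_multiple_labels further down.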
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 diff --git a/recipes/Switchboard/ASR/CTC/hparams/train_with_wav2vec.yaml b/recipes/Switchboard/ASR/CTC/hparams/train_with_wav2vec.yaml index 2933e0fd4..7741680bd 100644 --- a/recipes/Switchboard/ASR/CTC/hparams/train_with_wav2vec.yaml +++ b/recipes/Switchboard/ASR/CTC/hparams/train_with_wav2vec.yaml @@ -49,7 +49,7 @@ test_csv: - !ref <output_folder>/test_callhome.csv - !ref <output_folder>/test.csv -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 30 lr: 1.0 lr_wav2vec: 0.0001 @@ -74,7 +74,7 @@ test_dataloader_options: token_type: unigram # ["unigram", "bpe", "char"] character_coverage: 1.0 -# Model parameters +####################### Model Parameters ####################################### wav2vec_output_dim: 1024 dnn_neurons: 1024 freeze_wav2vec: False @@ -109,45 +109,31 @@ kenlm_model_path: null epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. 
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -156,6 +142,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + enc: !new:speechbrain.nnet.containers.Sequential input_shape: [null, null, !ref <wav2vec_output_dim>] linear1: !name:speechbrain.nnet.linear.Linear diff --git a/recipes/Switchboard/ASR/seq2seq/hparams/train_BPE_2000.yaml b/recipes/Switchboard/ASR/seq2seq/hparams/train_BPE_2000.yaml index d20001f1c..743467bcf 100644 --- a/recipes/Switchboard/ASR/seq2seq/hparams/train_BPE_2000.yaml +++ b/recipes/Switchboard/ASR/seq2seq/hparams/train_BPE_2000.yaml @@ -57,7 +57,7 @@ test_csv: - !ref <save_folder>/test_callhome.csv - !ref <save_folder>/test.csv -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 20 number_of_ctc_epochs: 5 batch_size: 10 @@ -103,7 +103,7 @@ test_dataloader_opts: num_workers: !ref <num_workers> batch_size: !ref <batch_size> -# Model parameters +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dropout: 0.15 cnn_blocks: 2 @@ -144,57 +144,40 @@ prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_U ext: wav csv_file: !ref <noise_annotation> -# Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation +############################## Augmentations ################################### +# Add noise to input signal add_noise: !new:speechbrain.augment.time_domain.AddNoise csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> + snr_low: 0 + snr_high: 15 noise_sample_rate: !ref <sample_rate> clean_sample_rate: !ref <sample_rate> num_workers: !ref <num_workers> # Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. 
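The AddNoise block above mixes noise files listed in a CSV manifest into the clean batch at a random SNR. A minimal sketch of the same object in Python (the manifest path is hypothetical; in the recipe it is produced by prepare_noise_data, and the 16 kHz rates are illustrative):

    import torch
    from speechbrain.augment.time_domain import AddNoise

    add_noise = AddNoise(
        csv_file="noise.csv",     # hypothetical manifest with one row per noise file
        snr_low=0,                # mix at a random SNR drawn from [0, 15] dB
        snr_high=15,
        noise_sample_rate=16000,  # noise is resampled if it differs from the clean rate
        clean_sample_rate=16000,
    )

    wavs, lens = torch.randn(4, 16000), torch.ones(4)
    noisy = add_noise(wavs, lens)  # same shape as wavs, with noise added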
-drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 @@ -215,6 +198,8 @@ compute_features: !new:speechbrain.lobes.features.Fbank n_fft: !ref <n_fft> n_mels: !ref <n_mels> +############################## Models ########################################## + enc: !new:speechbrain.lobes.models.CRDNN.CRDNN input_shape: [null, null, !ref <n_mels>] activation: !ref <activation> diff --git a/recipes/Switchboard/ASR/seq2seq/train.py b/recipes/Switchboard/ASR/seq2seq/train.py index 57ce5c966..d0cd3ce91 100644 --- a/recipes/Switchboard/ASR/seq2seq/train.py +++ b/recipes/Switchboard/ASR/seq2seq/train.py @@ -127,12 +127,16 @@ class ASR(sb.Brain): tokens_eos, tokens_eos_lens = batch.tokens_eos tokens, tokens_lens = batch.tokens + # Labels must be extended if parallel augmentation or concatenated + # augmentation was performed on the input (increasing the time dimension) if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): - tokens = self.hparams.wav_augment.replicate_labels(tokens) - tokens_lens = self.hparams.wav_augment.replicate_labels(tokens_lens) - tokens_eos = self.hparams.wav_augment.replicate_labels(tokens_eos) - tokens_eos_lens = self.hparams.wav_augment.replicate_labels( - tokens_eos_lens + ( + tokens, + tokens_lens, + tokens_eos, + tokens_eos_lens, + ) = self.hparams.wav_augment.replicate_multiple_labels( + tokens, tokens_lens, tokens_eos, tokens_eos_lens ) loss_seq = self.hparams.seq_cost( diff --git a/recipes/Switchboard/ASR/transformer/hparams/transformer.yaml b/recipes/Switchboard/ASR/transformer/hparams/transformer.yaml index bd84a8a19..674c03719 100644 --- a/recipes/Switchboard/ASR/transformer/hparams/transformer.yaml +++ b/recipes/Switchboard/ASR/transformer/hparams/transformer.yaml @@ -51,7 +51,7 @@ test_csv: ckpt_interval_minutes: 30 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### # To make Transformers 
converge, the global batch size should be large enough. # The global batch size is computed as: # batch_size * n_gpus * grad_accumulation_factor. @@ -96,7 +96,7 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -####################### Model parameters ########################### +####################### Model Parameters ########################### # Transformer transformer_input_size: 1280 d_model: 256 @@ -271,50 +271,32 @@ normalize: !new:speechbrain.processing.features.InputNormalization norm_type: global update_until_epoch: 4 -# Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 5 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 5 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "zeros" # Method of dropping chunks +############################## Augmentations ################################### +# Speed perturbation +speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb + orig_freq: !ref <sample_rate> + speeds: [95, 100, 105] + +# Time Drop time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 5 + drop_count_high: 5 # Frequency Drop -freq_drop_length_low: 25 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 35 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "zeros" # Method of dropping chunks - freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 25 + drop_length_high: 35 + drop_count_low: 2 + drop_count_high: 2 dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -323,14 +305,7 @@ fea_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <freq_drop>, !ref <time_warp>] - -# Speed perturbation do_speed_perturb: True -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - -speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb - orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> compute_features: !new:speechbrain.lobes.features.Fbank sample_rate: !ref <sample_rate> diff --git a/recipes/Switchboard/ASR/transformer/hparams/transformer_finetuned_LM.yaml b/recipes/Switchboard/ASR/transformer/hparams/transformer_finetuned_LM.yaml index 45a765cd1..8dd221ca4 100644 --- 
a/recipes/Switchboard/ASR/transformer/hparams/transformer_finetuned_LM.yaml +++ b/recipes/Switchboard/ASR/transformer/hparams/transformer_finetuned_LM.yaml @@ -51,7 +51,7 @@ test_csv: ckpt_interval_minutes: 30 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### # To make Transformers converge, the global batch size should be large enough. # The global batch size is computed as: # batch_size * n_gpus * grad_accumulation_factor. @@ -96,7 +96,7 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -####################### Model parameters ########################### +####################### Model Parameters ########################### # Transformer d_model: 512 nhead: 4 @@ -126,7 +126,7 @@ lm_weight: 0.60 ctc_weight_decode: 0.40 temperature: 1.15 temperature_lm: 1.15 -############################## models ################################ +############################## Models ################################ CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd input_shape: (8, 10, 80) @@ -258,57 +258,32 @@ normalize: !new:speechbrain.processing.features.InputNormalization norm_type: global update_until_epoch: 4 -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 5 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 5 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "zeros" # Method of dropping chunks - time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 5 + drop_count_high: 5 # Frequency Drop -freq_drop_length_low: 25 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 35 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "zeros" # Method of dropping chunks - freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 25 + drop_length_high: 35 + drop_count_low: 2 + drop_count_high: 2 dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - 
repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 diff --git a/recipes/Switchboard/ASR/transformer/train.py b/recipes/Switchboard/ASR/transformer/train.py index 5fb6ebc47..dcc527952 100644 --- a/recipes/Switchboard/ASR/transformer/train.py +++ b/recipes/Switchboard/ASR/transformer/train.py @@ -133,16 +133,16 @@ class ASR(sb.core.Brain): tokens, tokens_lens = batch.tokens if stage == sb.Stage.TRAIN: + # Labels must be extended if parallel augmentation or concatenated + # augmentation was performed on the input (increasing the time dimension) if hasattr(self.hparams, "fea_augment"): - tokens = self.hparams.fea_augment.replicate_labels(tokens) - tokens_lens = self.hparams.fea_augment.replicate_labels( - tokens_lens - ) - tokens_eos = self.hparams.fea_augment.replicate_labels( - tokens_eos - ) - tokens_eos_lens = self.hparams.fea_augment.replicate_labels( - tokens_eos_lens + ( + tokens, + tokens_lens, + tokens_eos, + tokens_eos_lens, + ) = self.hparams.fea_augment.replicate_multiple_labels( + tokens, tokens_lens, tokens_eos, tokens_eos_lens ) loss_seq = self.hparams.seq_cost( diff --git a/recipes/Switchboard/LM/hparams/transformer.yaml b/recipes/Switchboard/LM/hparams/transformer.yaml index 2f27463af..b501faf55 100644 --- a/recipes/Switchboard/LM/hparams/transformer.yaml +++ b/recipes/Switchboard/LM/hparams/transformer.yaml @@ -36,7 +36,7 @@ test_csv: !ref <save_folder>/test.csv # (e.g. /path/to/2000_unigram.model) tokenizer_file: !PLACEHOLDER -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 100 batch_size: 164 lr: 1 diff --git a/recipes/Switchboard/LM/hparams/transformer_finetune.yaml b/recipes/Switchboard/LM/hparams/transformer_finetune.yaml index f5657c76c..5b0860e41 100644 --- a/recipes/Switchboard/LM/hparams/transformer_finetune.yaml +++ b/recipes/Switchboard/LM/hparams/transformer_finetune.yaml @@ -39,7 +39,7 @@ test_csv: !ref <save_folder>/test.csv # instead. E.g. if you want to use your own LM / tokenizer.
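The train.py hunks above collapse four replicate_labels calls into a single replicate_multiple_labels call; functionally, each label tensor is repeated along the batch dimension so it stays aligned with an input batch that the augmenter enlarged. A toy sketch of the behaviour (assuming an augmenter configured to concatenate the original batch; shapes are illustrative):

    import torch
    from speechbrain.augment.augmenter import Augmenter
    from speechbrain.augment.time_domain import DropFreq

    augment = Augmenter(concat_original=True, min_augmentations=1,
                        max_augmentations=1, augment_prob=1.0,
                        augmentations=[DropFreq()])

    wavs, lens = torch.randn(4, 16000), torch.ones(4)
    wavs_aug, lens_aug = augment(wavs, lens)  # batch dim: 4 -> 8

    tokens = torch.randint(0, 30, (4, 12))
    tokens_lens = torch.ones(4)
    # One call replicates all label tensors consistently with the last forward:
    tokens, tokens_lens = augment.replicate_multiple_labels(tokens, tokens_lens)
    # tokens.shape[0] and tokens_lens.shape[0] are now 8, matching wavs_aug.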
pretrained_lm_tokenizer_path: speechbrain/asr-transformer-transformerlm-librispeech -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 5 batch_size: 128 lr: 2 diff --git a/recipes/Switchboard/Tokenizer/hparams/2K_unigram_subword_bpe.yaml b/recipes/Switchboard/Tokenizer/hparams/2K_unigram_subword_bpe.yaml index e6c546bdf..d07d83e70 100644 --- a/recipes/Switchboard/Tokenizer/hparams/2K_unigram_subword_bpe.yaml +++ b/recipes/Switchboard/Tokenizer/hparams/2K_unigram_subword_bpe.yaml @@ -20,7 +20,7 @@ train_csv: !ref <output_folder>/train_lm.csv valid_csv: !ref <output_folder>/dev.csv skip_prep: False -# Training parameters +####################### Training Parameters #################################### token_type: unigram # ["unigram", "bpe", "char"] token_output: 2000 # index(blank/eos/bos/unk) = 0 character_coverage: 1.0 diff --git a/recipes/TIMIT/ASR/CTC/hparams/train.yaml b/recipes/TIMIT/ASR/CTC/hparams/train.yaml index dce350b7e..145fa1a3e 100644 --- a/recipes/TIMIT/ASR/CTC/hparams/train.yaml +++ b/recipes/TIMIT/ASR/CTC/hparams/train.yaml @@ -25,7 +25,7 @@ data_folder_noise: !ref <data_folder>/noise # The noisy sequences for data augm NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1 noise_annotation: !ref <save_folder>/noise.csv # The data manifest files are created by the data preparation script -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 50 batch_size: 8 lr: 1.0 @@ -36,7 +36,7 @@ sample_rate: 16000 n_fft: 400 n_mels: 40 -# Model parameters +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dropout: 0.15 cnn_blocks: 2 @@ -69,6 +69,8 @@ test_dataloader_opts: normalize: !new:speechbrain.processing.features.InputNormalization norm_type: global +############################## Augmentations ################################### + # Download and prepare the dataset of noisy sequences for augmentation prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL URL: !ref <NOISE_DATASET_URL> @@ -76,58 +78,38 @@ prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_U ext: wav csv_file: !ref <noise_annotation> - # Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation - add_noise: !new:speechbrain.augment.time_domain.AddNoise csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> + snr_low: 0 + snr_high: 15 noise_sample_rate: !ref <sample_rate> clean_sample_rate: !ref <sample_rate> num_workers: !ref <num_workers> # Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero.
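SpeedPerturb resamples the signal so that the listed speeds act as percentages of the original length: 95 keeps roughly 95% of the samples, 105 roughly 105%, and 100 is a no-op; one speed is sampled per call. A small sketch (the 16 kHz rate is assumed, as in the TIMIT recipe above):

    import torch
    from speechbrain.augment.time_domain import SpeedPerturb

    perturb = SpeedPerturb(orig_freq=16000, speeds=[95, 100, 105])

    wav = torch.randn(1, 16000)  # 1 s of audio at 16 kHz
    out = perturb(wav)           # ~15200, 16000, or ~16800 samples,
                                 # depending on the randomly sampled speed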
-drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 @@ -137,6 +119,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> diff --git a/recipes/TIMIT/ASR/seq2seq/hparams/train.yaml b/recipes/TIMIT/ASR/seq2seq/hparams/train.yaml index cc3276506..d61179fa9 100644 --- a/recipes/TIMIT/ASR/seq2seq/hparams/train.yaml +++ b/recipes/TIMIT/ASR/seq2seq/hparams/train.yaml @@ -22,7 +22,7 @@ test_annotation: !ref <save_folder>/test.json skip_prep: False # Skip data preparation uppercase: False # Must be True when the TIMIT dataset is in the upper-case version -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 50 batch_size: 8 # Used if dynamic_batching is False lr: 0.0003 @@ -34,7 +34,7 @@ sample_rate: 16000 n_fft: 400 n_mels: 40 -# Model parameters +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dropout: 0.15 cnn_blocks: 2 @@ -88,45 +88,30 @@ dynamic_batch_sampler: shuffle: !ref <shuffle> batch_ordering: !ref <batch_ordering> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. 
-drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -135,6 +120,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + normalize: !new:speechbrain.processing.features.InputNormalization norm_type: global diff --git a/recipes/TIMIT/ASR/seq2seq/hparams/train_with_wav2vec2.yaml b/recipes/TIMIT/ASR/seq2seq/hparams/train_with_wav2vec2.yaml index 9756e9356..705f79e9a 100644 --- a/recipes/TIMIT/ASR/seq2seq/hparams/train_with_wav2vec2.yaml +++ b/recipes/TIMIT/ASR/seq2seq/hparams/train_with_wav2vec2.yaml @@ -23,7 +23,7 @@ test_annotation: !ref <save_folder>/test.json skip_prep: False # Skip data preparation uppercase: False # Must be True when the TIMIT dataset is in the upper-case version -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 20 batch_size: 8 lr: 0.0003 @@ -33,7 +33,7 @@ sorting: ascending precision: fp32 # bf16, fp16 or fp32 sample_rate: 16000 -# Model parameters +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dnn_layers: 2 dnn_neurons: 1024 @@ -66,45 +66,30 @@ test_dataloader_opts: batch_size: !ref <batch_size> num_workers: !ref <batch_size> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. 
-drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -113,6 +98,7 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> diff --git a/recipes/TIMIT/ASR/transducer/hparams/train.yaml b/recipes/TIMIT/ASR/transducer/hparams/train.yaml index 5b8e53809..204297dc6 100644 --- a/recipes/TIMIT/ASR/transducer/hparams/train.yaml +++ b/recipes/TIMIT/ASR/transducer/hparams/train.yaml @@ -28,7 +28,7 @@ data_folder_noise: !ref <data_folder>/noise # The noisy sequences for data augm NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1 noise_annotation: !ref <save_folder>/noise.csv # The data manifest files are created by the data preparation script -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 50 batch_size: 8 lr: 1.0 @@ -40,7 +40,7 @@ n_fft: 400 n_mels: 40 -# Model parameters +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dropout: 0.15 cnn_blocks: 2 @@ -92,6 +92,8 @@ compute_features: !new:speechbrain.lobes.features.Fbank normalize: !new:speechbrain.processing.features.InputNormalization norm_type: global +############################## Augmentations ################################### + # Download and prepare the dataset of noisy sequences for augmentation prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL URL: !ref <NOISE_DATASET_URL> @@ -99,58 +101,38 @@ prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_U ext: wav csv_file: !ref <noise_annotation> - # Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15
# Max SNR for noise augmentation - add_noise: !new:speechbrain.augment.time_domain.AddNoise csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> + snr_low: 0 + snr_high: 15 noise_sample_rate: !ref <sample_rate> clean_sample_rate: !ref <sample_rate> num_workers: !ref <num_workers> # Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 @@ -160,6 +142,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + enc: !new:speechbrain.lobes.models.CRDNN.CRDNN input_shape: [null, null, !ref <n_mels>] diff --git a/recipes/TIMIT/ASR/transducer/hparams/train_wav2vec.yaml b/recipes/TIMIT/ASR/transducer/hparams/train_wav2vec.yaml index 90f899d26..9ead09f56 100644 --- a/recipes/TIMIT/ASR/transducer/hparams/train_wav2vec.yaml +++ b/recipes/TIMIT/ASR/transducer/hparams/train_wav2vec.yaml @@ -28,7 +28,7 @@ test_annotation: !ref <save_folder>/test.json skip_prep: False # Skip data preparation uppercase: False # Must be True when the TIMIT dataset is in the upper-case version -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 20 batch_size: 8 lr: 0.0003 @@ -41,7 +41,7 @@ sample_rate: 16000 # n_fft: 400 # n_mels: 40 -# Model parameters +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU # dropout: 0.15 dnn_blocks: 1 @@ -74,45 +74,30 @@ test_dataloader_opts: epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref 
<number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -121,6 +106,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 source: !ref <wav2vec2_hub> output_norm: True diff --git a/recipes/TIMIT/Alignment/hparams/train.yaml b/recipes/TIMIT/Alignment/hparams/train.yaml index 7a2a581d9..aaf06b7ff 100644 --- a/recipes/TIMIT/Alignment/hparams/train.yaml +++ b/recipes/TIMIT/Alignment/hparams/train.yaml @@ -20,7 +20,7 @@ valid_annotation: !ref <data_folder>/dev.json test_annotation: !ref <data_folder>/test.json skip_prep: False # Skip data prep -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 10 batch_size: 256 lr: 0.0003 @@ -40,7 +40,7 @@ phn_set: 60 # {60, 48, 39} output_neurons: 183 blank_index: 182 -# Model parameters +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dnn_blocks: 1 dnn_neurons: 2000 diff --git a/recipes/Tedlium2/ASR/transformer/hparams/branchformer_large.yaml b/recipes/Tedlium2/ASR/transformer/hparams/branchformer_large.yaml index 6ffad0b00..2f9f924c7 100644 --- a/recipes/Tedlium2/ASR/transformer/hparams/branchformer_large.yaml +++ b/recipes/Tedlium2/ASR/transformer/hparams/branchformer_large.yaml @@ -34,7 +34,7 @@ 
valid_csv: !ref <output_folder>/dev/dev.csv test_csv: - !ref <output_folder>/test/test.csv -# Training parameters +####################### Training Parameters #################################### # To make Transformers converge, the global batch size should be large enough. # The global batch size is computed as batch_size * n_gpus * grad_accumulation_factor. # Empirically, we found that this value should be >= 128. @@ -97,7 +97,7 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -####################### Model parameters ########################### +####################### Model Parameters ########################### # Transformer d_model: 512 nhead: 8 diff --git a/recipes/Tedlium2/Tokenizer/hparams/tedlium2_500_bpe.yaml b/recipes/Tedlium2/Tokenizer/hparams/tedlium2_500_bpe.yaml index a97290f13..03c91b126 100644 --- a/recipes/Tedlium2/Tokenizer/hparams/tedlium2_500_bpe.yaml +++ b/recipes/Tedlium2/Tokenizer/hparams/tedlium2_500_bpe.yaml @@ -14,7 +14,7 @@ skip_prep: False train_csv: !ref <output_folder>/train/train.csv valid_csv: !ref <output_folder>/dev/dev.csv -# Training parameters +####################### Training Parameters #################################### token_type: bpe # ["unigram", "bpe", "char"] token_output: 500 # index(blank/eos/bos/unk) = 0 character_coverage: 1.0 diff --git a/recipes/UrbanSound8k/SoundClassification/hparams/train_ecapa_tdnn.yaml b/recipes/UrbanSound8k/SoundClassification/hparams/train_ecapa_tdnn.yaml index 1d9613a16..3ecb1119b 100644 --- a/recipes/UrbanSound8k/SoundClassification/hparams/train_ecapa_tdnn.yaml +++ b/recipes/UrbanSound8k/SoundClassification/hparams/train_ecapa_tdnn.yaml @@ -48,7 +48,7 @@ skip_manifest_creation: False ckpt_interval_minutes: 15 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 25 batch_size: 32 lr: 0.001 diff --git a/recipes/Voicebank/ASR/CTC/hparams/train.yaml b/recipes/Voicebank/ASR/CTC/hparams/train.yaml index 65b833a0b..a49bae5fa 100644 --- a/recipes/Voicebank/ASR/CTC/hparams/train.yaml +++ b/recipes/Voicebank/ASR/CTC/hparams/train.yaml @@ -20,7 +20,7 @@ valid_annotation: !ref <output_folder>/valid.json test_annotation: !ref <output_folder>/test.json skip_prep: False # Skip data preparation -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 50 batch_size: 8 sorting: ascending @@ -37,7 +37,7 @@ sample_rate: 16000 n_fft: 400 n_mels: 40 -# Model parameters +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dropout: 0.15 cnn_blocks: 2 @@ -61,45 +61,31 @@ compute_features: !new:speechbrain.lobes.features.Fbank n_fft: !ref <n_fft> n_mels: !ref <n_mels> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero.
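Aside, before the inlined DropFreq parameters that follow: the pattern applied throughout this patch replaces five top-level keys plus their !ref indirections with literal values inside the !new: block. A minimal Python sketch of the resulting object with the same values; the constructor arguments are taken verbatim from these hunks, while the plain drop_freq(signal) call is an assumption about the forward signature.

import torch
from speechbrain.augment.time_domain import DropFreq

# Same values the YAML now inlines; comments mirror the removed YAML comments.
drop_freq = DropFreq(
    drop_freq_low=0,         # min frequency band dropout probability
    drop_freq_high=1,        # max frequency band dropout probability
    drop_freq_count_low=1,   # min number of frequency bands to drop
    drop_freq_count_high=3,  # max number of frequency bands to drop
    drop_freq_width=0.05,    # width of frequency bands to drop
)

signal = torch.rand(4, 16000)  # [batch, time], one second at 16 kHz
augmented = drop_freq(signal)  # same shape, with random bands filtered out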
-drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -108,6 +94,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + model: !new:speechbrain.lobes.models.CRDNN.CRDNN input_shape: [null, null, !ref <n_mels>] activation: !ref <activation> diff --git a/recipes/Voicebank/MTL/ASR_enhance/hparams/enhance_mimic.yaml b/recipes/Voicebank/MTL/ASR_enhance/hparams/enhance_mimic.yaml index 2a96e25db..c3391498a 100644 --- a/recipes/Voicebank/MTL/ASR_enhance/hparams/enhance_mimic.yaml +++ b/recipes/Voicebank/MTL/ASR_enhance/hparams/enhance_mimic.yaml @@ -18,7 +18,7 @@ valid_annotation: !ref <data_folder>/valid.json test_annotation: !ref <data_folder>/test.json skip_prep: False -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 50 batch_size: 8 lr: 0.0001 diff --git a/recipes/Voicebank/MTL/ASR_enhance/hparams/pretrain_perceptual.yaml b/recipes/Voicebank/MTL/ASR_enhance/hparams/pretrain_perceptual.yaml index a86fbc4cc..d384d026a 100644 --- a/recipes/Voicebank/MTL/ASR_enhance/hparams/pretrain_perceptual.yaml +++ b/recipes/Voicebank/MTL/ASR_enhance/hparams/pretrain_perceptual.yaml @@ -18,7 +18,7 @@ valid_annotation: !ref <data_folder>/valid.json test_annotation: !ref <data_folder>/test.json skip_prep: False -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 20 ctc_epochs: 4 batch_size: 8 diff --git a/recipes/Voicebank/MTL/ASR_enhance/hparams/robust_asr.yaml b/recipes/Voicebank/MTL/ASR_enhance/hparams/robust_asr.yaml index 1bf087d85..1835342c3 100644 --- a/recipes/Voicebank/MTL/ASR_enhance/hparams/robust_asr.yaml +++ b/recipes/Voicebank/MTL/ASR_enhance/hparams/robust_asr.yaml @@ -24,7 +24,7 @@ test_annotation: !ref <data_folder>/test.json noise_annotation: !ref 
<save_folder>/noise.csv #The data manifest files are created by the data preparation script skip_prep: False -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 30 ctc_epochs: 0 batch_size: 8 @@ -141,6 +141,8 @@ compute_stft: !new:speechbrain.processing.features.STFT spectral_magnitude: !name:speechbrain.processing.features.spectral_magnitude power: 0.5 +############################## Augmentations ################################### + # Download and prepare the dataset of noisy sequences for augmentation prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL URL: !ref <NOISE_DATASET_URL> @@ -148,58 +150,38 @@ prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_U ext: wav csv_file: !ref <noise_annotation> - # Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation - add_noise: !new:speechbrain.augment.time_domain.AddNoise csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> + snr_low: 0 + snr_high: 15 noise_sample_rate: !ref <sample_rate> clean_sample_rate: !ref <sample_rate> num_workers: !ref <num_workers> # Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. 
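Before the inlined DropChunk parameters below, the matching sketch for the time-drop side. Again hedged: the (waveforms, lengths) call signature is assumed from how SpeechBrain's time-domain augmentations are usually invoked, with relative lengths in [0, 1].

import torch
from speechbrain.augment.time_domain import DropChunk

drop_chunk = DropChunk(
    drop_length_low=1000,   # min length of audio chunks to drop, in samples
    drop_length_high=2000,  # max length of audio chunks to drop
    drop_count_low=1,       # min number of audio chunks to drop
    drop_count_high=5,      # max number of audio chunks to drop
)

wavs = torch.rand(4, 16000)
lens = torch.ones(4)              # every utterance spans the full tensor
dropped = drop_chunk(wavs, lens)  # random 1000-2000 sample spans set to zero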
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 @@ -209,7 +191,6 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] - fbank: !new:speechbrain.lobes.features.Fbank n_mels: !ref <n_mels> sample_rate: !ref <sample_rate> diff --git a/recipes/VoxCeleb/SpeakerRec/hparams/train_ecapa_tdnn.yaml b/recipes/VoxCeleb/SpeakerRec/hparams/train_ecapa_tdnn.yaml index d7d2be081..c93ba21ec 100644 --- a/recipes/VoxCeleb/SpeakerRec/hparams/train_ecapa_tdnn.yaml +++ b/recipes/VoxCeleb/SpeakerRec/hparams/train_ecapa_tdnn.yaml @@ -85,6 +85,7 @@ classifier: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> +############################## Augmentations ################################### # Download and prepare the dataset of noisy sequences for augmentation prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL @@ -95,18 +96,14 @@ prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_U # Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation - add_noise: !new:speechbrain.augment.time_domain.AddNoise csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> + snr_low: 0 + snr_high: 15 noise_sample_rate: !ref <sample_rate> clean_sample_rate: !ref <sample_rate> num_workers: !ref <num_workers> - # Download and prepare the dataset of room impulse responses for augmentation prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL URL: !ref <RIR_DATASET_URL> @@ -122,37 +119,24 @@ add_reverb: !new:speechbrain.augment.time_domain.AddReverb num_workers: !ref <num_workers> # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. 
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter parallel_augment: True concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 diff --git a/recipes/VoxCeleb/SpeakerRec/hparams/train_ecapa_tdnn_mel_spec.yaml b/recipes/VoxCeleb/SpeakerRec/hparams/train_ecapa_tdnn_mel_spec.yaml index 6cb17c6bc..becd8e4d4 100644 --- a/recipes/VoxCeleb/SpeakerRec/hparams/train_ecapa_tdnn_mel_spec.yaml +++ b/recipes/VoxCeleb/SpeakerRec/hparams/train_ecapa_tdnn_mel_spec.yaml @@ -104,6 +104,7 @@ classifier: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> +############################## Augmentations ################################### # Download and prepare the dataset of noisy sequences for augmentation prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL @@ -114,18 +115,14 @@ prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_U # Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation - add_noise: !new:speechbrain.augment.time_domain.AddNoise csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> + snr_low: 0 + snr_high: 15 noise_sample_rate: !ref <sample_rate> clean_sample_rate: !ref <sample_rate> num_workers: !ref <num_workers> - # Download and prepare the dataset of room impulse responses for augmentation prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL URL: !ref <RIR_DATASET_URL> @@ -141,37 +138,24 @@ add_reverb: !new:speechbrain.augment.time_domain.AddReverb num_workers: !ref <num_workers> # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. 
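The wav_augment hunks above also delete every argument whose value equals the Augmenter default (parallel_augment: False, repeat_augment: 1, shuffle_augmentations: False), keeping only overrides such as concat_original and, in these VoxCeleb recipes, parallel_augment: True. A sketch of what those overrides mean for batch size; the 3x growth is an inference from the replicate_multiple_labels docstring added at the end of this patch, not a documented guarantee.

import torch
from speechbrain.augment.augmenter import Augmenter
from speechbrain.augment.time_domain import DropChunk, DropFreq

wav_augment = Augmenter(
    parallel_augment=True,  # each selected augmentation yields its own copy
    concat_original=True,   # the clean batch is kept as well
    min_augmentations=2,
    max_augmentations=2,
    augment_prob=1.0,
    augmentations=[DropFreq(), DropChunk()],
)

wavs, lens = torch.rand(4, 16000), torch.ones(4)
aug_wavs, aug_lens = wav_augment(wavs, lens)
# Expected: 4 clean + 2 parallel augmentations x 4 = 12 examples.
print(aug_wavs.shape[0])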
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter parallel_augment: True concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 diff --git a/recipes/VoxCeleb/SpeakerRec/hparams/train_resnet.yaml b/recipes/VoxCeleb/SpeakerRec/hparams/train_resnet.yaml index 617457f1a..a20786574 100644 --- a/recipes/VoxCeleb/SpeakerRec/hparams/train_resnet.yaml +++ b/recipes/VoxCeleb/SpeakerRec/hparams/train_resnet.yaml @@ -55,7 +55,7 @@ right_frames: 0 deltas: False # Number of speakers -# 1211 for vox1, 5994 for vox2, 7205 for vox1+vox2 +# 1211 for vox1, 5994 for vox2, 7205 for vox1+vox2 out_n_neurons: 7205 num_workers: 4 @@ -85,6 +85,7 @@ classifier: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> +############################## Augmentations ################################### # Download and prepare the dataset of noisy sequences for augmentation prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL @@ -95,18 +96,14 @@ prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_U # Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation - add_noise: !new:speechbrain.augment.time_domain.AddNoise csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> + snr_low: 0 + snr_high: 15 noise_sample_rate: !ref <sample_rate> clean_sample_rate: !ref <sample_rate> num_workers: !ref <num_workers> - # Download and prepare the dataset of room impulse responses for augmentation prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL URL: !ref <RIR_DATASET_URL> @@ -122,37 +119,24 @@ add_reverb: !new:speechbrain.augment.time_domain.AddReverb num_workers: !ref <num_workers> # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. 
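These VoxCeleb hunks pair add_noise with an add_reverb stage fed by a RIR manifest that prepare_dataset_from_URL writes. A hedged sketch: the manifest path is hypothetical, the sample-rate keyword names follow the pattern of AddNoise rather than anything shown in this patch, and the single-argument call is assumed; check the class docstring before relying on them.

import torch
from speechbrain.augment.time_domain import AddReverb

add_reverb = AddReverb(
    csv_file="results/save/rir.csv",  # hypothetical prepared RIR manifest
    reverb_sample_rate=16000,         # assumed keyword, mirroring AddNoise
    clean_sample_rate=16000,          # assumed keyword, mirroring AddNoise
)

wavs = torch.rand(2, 16000)
reverbed = add_reverb(wavs)  # each utterance convolved with a random RIR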
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter parallel_augment: True concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 diff --git a/recipes/VoxCeleb/SpeakerRec/hparams/train_x_vectors.yaml b/recipes/VoxCeleb/SpeakerRec/hparams/train_x_vectors.yaml index 8a70462c5..ab628c681 100644 --- a/recipes/VoxCeleb/SpeakerRec/hparams/train_x_vectors.yaml +++ b/recipes/VoxCeleb/SpeakerRec/hparams/train_x_vectors.yaml @@ -88,6 +88,7 @@ classifier: !new:speechbrain.lobes.models.Xvector.Classifier epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> +############################## Augmentations ################################### # Download and prepare the dataset of noisy sequences for augmentation prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL @@ -98,18 +99,14 @@ prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_U # Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation - add_noise: !new:speechbrain.augment.time_domain.AddNoise csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> + snr_low: 0 + snr_high: 15 noise_sample_rate: !ref <sample_rate> clean_sample_rate: !ref <sample_rate> num_workers: !ref <num_workers> - # Download and prepare the dataset of room impulse responses for augmentation prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL URL: !ref <RIR_DATASET_URL> @@ -125,37 +122,24 @@ add_reverb: !new:speechbrain.augment.time_domain.AddReverb num_workers: !ref <num_workers> # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. 
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter parallel_augment: True concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 diff --git a/recipes/VoxLingua107/lang_id/hparams/train_ecapa.yaml b/recipes/VoxLingua107/lang_id/hparams/train_ecapa.yaml index a2d872486..db29a301b 100644 --- a/recipes/VoxLingua107/lang_id/hparams/train_ecapa.yaml +++ b/recipes/VoxLingua107/lang_id/hparams/train_ecapa.yaml @@ -58,6 +58,8 @@ val_dataloader_options: num_workers: 1 batch_size: !ref <batch_size_val> +############################## Augmentations ################################### + # Download and prepare the dataset of noisy sequences for augmentation prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL URL: !ref <NOISE_DATASET_URL> @@ -72,7 +74,6 @@ prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL ext: wav csv_file: !ref <rir_annotation> - # Add reverberation to input signal add_reverb: !new:speechbrain.augment.time_domain.AddReverb csv_file: !ref <rir_annotation> @@ -81,27 +82,21 @@ add_reverb: !new:speechbrain.augment.time_domain.AddReverb num_workers: !ref <num_workers> # Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation add_noise: !new:speechbrain.augment.time_domain.AddNoise csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> + snr_low: 0 + snr_high: 15 noise_sample_rate: !ref <sample_rate> clean_sample_rate: !ref <sample_rate> num_workers: !ref <num_workers> # Speed perturbation -speed_changes: [90, 100, 110] # List of speed changes for time-stretching speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 shuffle_augmentations: True min_augmentations: 1 max_augmentations: 3 diff --git a/recipes/WHAMandWHAMR/enhancement/hparams/cnntransformer-wham-DM.yaml b/recipes/WHAMandWHAMR/enhancement/hparams/cnntransformer-wham-DM.yaml index a38bb8c88..843c9fb09 100644 --- a/recipes/WHAMandWHAMR/enhancement/hparams/cnntransformer-wham-DM.yaml +++ b/recipes/WHAMandWHAMR/enhancement/hparams/cnntransformer-wham-DM.yaml @@ -45,7 +45,7 @@ save_audio: True # Save estimated sources on disk sample_rate: 8000 n_audio_to_save: 20 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 8 lr: 0.0001 diff --git a/recipes/WHAMandWHAMR/enhancement/hparams/cnntransformer-whamr-DM.yaml 
b/recipes/WHAMandWHAMR/enhancement/hparams/cnntransformer-whamr-DM.yaml index af3fcb0d7..b55f05a11 100644 --- a/recipes/WHAMandWHAMR/enhancement/hparams/cnntransformer-whamr-DM.yaml +++ b/recipes/WHAMandWHAMR/enhancement/hparams/cnntransformer-whamr-DM.yaml @@ -45,7 +45,7 @@ save_audio: True # Save estimated sources on disk sample_rate: 8000 n_audio_to_save: 20 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 8 lr: 0.0001 diff --git a/recipes/WHAMandWHAMR/enhancement/hparams/convtasnet-whamr-DM.yaml b/recipes/WHAMandWHAMR/enhancement/hparams/convtasnet-whamr-DM.yaml index 7c2990442..545428d5a 100644 --- a/recipes/WHAMandWHAMR/enhancement/hparams/convtasnet-whamr-DM.yaml +++ b/recipes/WHAMandWHAMR/enhancement/hparams/convtasnet-whamr-DM.yaml @@ -44,7 +44,7 @@ save_audio: True # Save estimated sources on disk sample_rate: 8000 n_audio_to_save: 20 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 10 lr: 0.00015 diff --git a/recipes/WHAMandWHAMR/enhancement/hparams/dprnn-whamr-DM.yaml b/recipes/WHAMandWHAMR/enhancement/hparams/dprnn-whamr-DM.yaml index f3158a625..d974b03c1 100644 --- a/recipes/WHAMandWHAMR/enhancement/hparams/dprnn-whamr-DM.yaml +++ b/recipes/WHAMandWHAMR/enhancement/hparams/dprnn-whamr-DM.yaml @@ -44,7 +44,7 @@ save_audio: True # Save estimated sources on disk sample_rate: 8000 n_audio_to_save: 20 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-wham.yaml b/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-wham.yaml index 75c90a0f1..df1935306 100644 --- a/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-wham.yaml +++ b/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-wham.yaml @@ -44,7 +44,7 @@ noprogressbar: False save_audio: False # Save estimated sources on disk sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-whamr-16k-DM.yaml b/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-whamr-16k-DM.yaml index 625801e51..dc2783491 100644 --- a/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-whamr-16k-DM.yaml +++ b/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-whamr-16k-DM.yaml @@ -46,7 +46,7 @@ save_audio: True # Save estimated sources on disk sample_rate: 16000 n_audio_to_save: 20 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-whamr-16k.yaml b/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-whamr-16k.yaml index d42060642..d11332c7e 100644 --- a/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-whamr-16k.yaml +++ b/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-whamr-16k.yaml @@ -44,7 +44,7 @@ save_audio: True # Save estimated sources on disk sample_rate: 16000 n_audio_to_save: 20 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-whamr-DM.yaml b/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-whamr-DM.yaml index 2acd34370..3721698bc 100644 --- 
a/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-whamr-DM.yaml +++ b/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-whamr-DM.yaml @@ -44,7 +44,7 @@ save_audio: True # Save estimated sources on disk sample_rate: 8000 n_audio_to_save: 20 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-whamr.yaml b/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-whamr.yaml index 536a46f49..14a38a06b 100644 --- a/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-whamr.yaml +++ b/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-whamr.yaml @@ -44,7 +44,7 @@ save_audio: True # Save estimated sources on disk sample_rate: 8000 n_audio_to_save: 20 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/WHAMandWHAMR/separation/hparams/sepformer-wham.yaml b/recipes/WHAMandWHAMR/separation/hparams/sepformer-wham.yaml index 1661d68f2..db920e7fa 100644 --- a/recipes/WHAMandWHAMR/separation/hparams/sepformer-wham.yaml +++ b/recipes/WHAMandWHAMR/separation/hparams/sepformer-wham.yaml @@ -42,7 +42,7 @@ noprogressbar: False save_audio: False # Save estimated sources on disk sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/WHAMandWHAMR/separation/hparams/sepformer-whamr.yaml b/recipes/WHAMandWHAMR/separation/hparams/sepformer-whamr.yaml index d4bc250d0..8538529a6 100644 --- a/recipes/WHAMandWHAMR/separation/hparams/sepformer-whamr.yaml +++ b/recipes/WHAMandWHAMR/separation/hparams/sepformer-whamr.yaml @@ -40,7 +40,7 @@ num_spks: 2 # set to 3 for wsj0-3mix save_audio: False # Save estimated sources on disk sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/WSJ0Mix/separation/hparams/convtasnet.yaml b/recipes/WSJ0Mix/separation/hparams/convtasnet.yaml index 038330504..0305c6236 100644 --- a/recipes/WSJ0Mix/separation/hparams/convtasnet.yaml +++ b/recipes/WSJ0Mix/separation/hparams/convtasnet.yaml @@ -36,7 +36,7 @@ noprogressbar: False save_audio: False # Save estimated sources on disk sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/WSJ0Mix/separation/hparams/dprnn.yaml b/recipes/WSJ0Mix/separation/hparams/dprnn.yaml index a78a78266..df1952d8c 100644 --- a/recipes/WSJ0Mix/separation/hparams/dprnn.yaml +++ b/recipes/WSJ0Mix/separation/hparams/dprnn.yaml @@ -36,7 +36,7 @@ noprogressbar: False save_audio: False # Save estimated sources on disk sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/WSJ0Mix/separation/hparams/resepformer.yaml b/recipes/WSJ0Mix/separation/hparams/resepformer.yaml index 2b2711f52..406f2aa76 100644 --- a/recipes/WSJ0Mix/separation/hparams/resepformer.yaml +++ b/recipes/WSJ0Mix/separation/hparams/resepformer.yaml @@ -37,7 +37,7 @@ num_spks: 2 # set to 3 for wsj0-3mix save_audio: False # Save estimated sources on disk sample_rate: 8000 -# Training parameters +####################### Training Parameters 
#################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/WSJ0Mix/separation/hparams/sepformer-conformerintra.yaml b/recipes/WSJ0Mix/separation/hparams/sepformer-conformerintra.yaml index 15550c9a2..2cf2b7ac5 100644 --- a/recipes/WSJ0Mix/separation/hparams/sepformer-conformerintra.yaml +++ b/recipes/WSJ0Mix/separation/hparams/sepformer-conformerintra.yaml @@ -37,7 +37,7 @@ num_spks: 2 # set to 3 for wsj0-3mix save_audio: False # Save estimated sources on disk sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/WSJ0Mix/separation/hparams/sepformer-customdataset.yaml b/recipes/WSJ0Mix/separation/hparams/sepformer-customdataset.yaml index 82a2d3009..c896f2dfd 100644 --- a/recipes/WSJ0Mix/separation/hparams/sepformer-customdataset.yaml +++ b/recipes/WSJ0Mix/separation/hparams/sepformer-customdataset.yaml @@ -39,7 +39,7 @@ noprogressbar: False save_audio: True # Save estimated sources on disk sample_rate: 16000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/WSJ0Mix/separation/hparams/sepformer.yaml b/recipes/WSJ0Mix/separation/hparams/sepformer.yaml index 4787fb3aa..77319604d 100644 --- a/recipes/WSJ0Mix/separation/hparams/sepformer.yaml +++ b/recipes/WSJ0Mix/separation/hparams/sepformer.yaml @@ -40,7 +40,7 @@ save_audio: True # Save estimated sources on disk n_audio_to_save: 20 sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/WSJ0Mix/separation/hparams/skim.yaml b/recipes/WSJ0Mix/separation/hparams/skim.yaml index 53b312efd..606c7060a 100644 --- a/recipes/WSJ0Mix/separation/hparams/skim.yaml +++ b/recipes/WSJ0Mix/separation/hparams/skim.yaml @@ -37,7 +37,7 @@ num_spks: 2 # set to 3 for wsj0-3mix save_audio: False # Save estimated sources on disk sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/ZaionEmotionDataset/emotion_diarization/hparams/train.yaml b/recipes/ZaionEmotionDataset/emotion_diarization/hparams/train.yaml index 644ca89a2..0d9601d20 100644 --- a/recipes/ZaionEmotionDataset/emotion_diarization/hparams/train.yaml +++ b/recipes/ZaionEmotionDataset/emotion_diarization/hparams/train.yaml @@ -30,7 +30,7 @@ train_annotation: !ref <output_folder>/train.json valid_annotation: !ref <output_folder>/valid.json test_annotation: !ref <output_folder>/test.json -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 15 lr: 0.0001 lr_wav2vec: 0.00001 diff --git a/recipes/fluent-speech-commands/Tokenizer/hparams/tokenizer_bpe51.yaml b/recipes/fluent-speech-commands/Tokenizer/hparams/tokenizer_bpe51.yaml index db7c1ddb7..eff38c7bf 100644 --- a/recipes/fluent-speech-commands/Tokenizer/hparams/tokenizer_bpe51.yaml +++ b/recipes/fluent-speech-commands/Tokenizer/hparams/tokenizer_bpe51.yaml @@ -13,7 +13,7 @@ train_csv: !ref <output_folder>/train.csv valid_csv: !ref <output_folder>/valid.csv skip_prep: False -# Training parameters +####################### Training Parameters #################################### token_type: unigram # ["unigram", "bpe", "char"] token_output: 51 # 
index(blank/eos/bos/unk) = 0 character_coverage: 1.0 diff --git a/recipes/fluent-speech-commands/direct/hparams/train.yaml b/recipes/fluent-speech-commands/direct/hparams/train.yaml index a7c072343..428faf144 100644 --- a/recipes/fluent-speech-commands/direct/hparams/train.yaml +++ b/recipes/fluent-speech-commands/direct/hparams/train.yaml @@ -32,14 +32,14 @@ rir_annotation: !ref <save_folder>/rir.csv tokenizer_file: https://www.dropbox.com/s/hvf2huofnq0sjbn/51_unigram.model?dl=1 skip_prep: False -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 6 batch_size: 16 lr: 0.0003 # token_type: unigram # ["unigram", "bpe", "char"] sorting: random -# Model parameters +####################### Model Parameters ####################################### sample_rate: 16000 emb_size: 128 dec_neurons: 512 @@ -65,6 +65,8 @@ dataloader_opts: epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> +############################## Augmentations ################################### + # Download and prepare the dataset of noisy sequences for augmentation prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL URL: !ref <NOISE_DATASET_URL> @@ -87,45 +89,32 @@ add_reverb: !new:speechbrain.augment.time_domain.AddReverb num_workers: !ref <num_workers> # Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation add_noise: !new:speechbrain.augment.time_domain.AddNoise csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> + snr_low: 0 + snr_high: 15 noise_sample_rate: !ref <sample_rate> clean_sample_rate: !ref <sample_rate> num_workers: !ref <num_workers> # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks.
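A sketch of the AddNoise stage these recipes configure, with the 0-15 dB SNR range used throughout the patch. The manifest path is hypothetical (it is normally written by prepare_noise_data, as shown above), and the (waveforms, lengths) call is assumed from how AddNoise is used elsewhere in SpeechBrain.

import torch
from speechbrain.augment.time_domain import AddNoise

add_noise = AddNoise(
    csv_file="results/save/noise.csv",  # hypothetical prepared noise manifest
    snr_low=0,               # min SNR for noise augmentation
    snr_high=15,             # max SNR for noise augmentation
    noise_sample_rate=16000,
    clean_sample_rate=16000,
)

wavs, lens = torch.rand(2, 16000), torch.ones(2)
noisy = add_noise(wavs, lens)  # mixes random manifest entries at 0-15 dB SNR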
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 3 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 3 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 shuffle_augmentations: True min_augmentations: 1 max_augmentations: 4 @@ -136,7 +125,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] -# Models +############################## Models ########################################## + asr_model_source: speechbrain/asr-crdnn-rnnlm-librispeech slu_enc: !new:speechbrain.nnet.containers.Sequential diff --git a/recipes/timers-and-such/LM/hparams/train.yaml b/recipes/timers-and-such/LM/hparams/train.yaml index 485dd5426..f3ba652ed 100644 --- a/recipes/timers-and-such/LM/hparams/train.yaml +++ b/recipes/timers-and-such/LM/hparams/train.yaml @@ -23,7 +23,7 @@ csv_test_synth: !ref <output_folder>/test-synth-type=decoupled.csv csv_test_real: !ref <output_folder>/test-real-type=decoupled.csv skip_prep: False -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 10 batch_size: 128 lr: 0.0003 diff --git a/recipes/timers-and-such/Tokenizer/hparams/tokenizer_bpe51.yaml b/recipes/timers-and-such/Tokenizer/hparams/tokenizer_bpe51.yaml index 7554a0342..2a9f39161 100644 --- a/recipes/timers-and-such/Tokenizer/hparams/tokenizer_bpe51.yaml +++ b/recipes/timers-and-such/Tokenizer/hparams/tokenizer_bpe51.yaml @@ -15,7 +15,7 @@ train_csv: !ref <output_folder>/train-type=direct.csv valid_csv: !ref <output_folder>/dev-real-type=direct.csv -# Training parameters +####################### Training Parameters #################################### token_type: unigram # ["unigram", "bpe", "char"] token_output: 51 # index(blank/eos/bos/unk) = 0 character_coverage: 1.0 diff --git a/recipes/timers-and-such/decoupled/hparams/train_LS_LM.yaml b/recipes/timers-and-such/decoupled/hparams/train_LS_LM.yaml index 212752668..1ee56d561 100644 --- a/recipes/timers-and-such/decoupled/hparams/train_LS_LM.yaml +++ b/recipes/timers-and-such/decoupled/hparams/train_LS_LM.yaml @@ -34,7 +34,7 @@ skip_prep: False ckpt_interval_minutes: 15 # save checkpoint every N min test_on_all_real: False -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 1 batch_size: 16 lr: 0.0003 diff --git a/recipes/timers-and-such/decoupled/hparams/train_TAS_LM.yaml b/recipes/timers-and-such/decoupled/hparams/train_TAS_LM.yaml index fcf6393e6..5f0d93d09 100644 --- a/recipes/timers-and-such/decoupled/hparams/train_TAS_LM.yaml +++ b/recipes/timers-and-such/decoupled/hparams/train_TAS_LM.yaml @@ -34,7 +34,7 @@ skip_prep: False ckpt_interval_minutes: 15 # save checkpoint every N min test_on_all_real: False -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 1 batch_size: 16 lr:
0.0003 diff --git a/recipes/timers-and-such/direct/hparams/train.yaml b/recipes/timers-and-such/direct/hparams/train.yaml index 4fb574fc3..01909eb5b 100644 --- a/recipes/timers-and-such/direct/hparams/train.yaml +++ b/recipes/timers-and-such/direct/hparams/train.yaml @@ -38,14 +38,14 @@ data_folder_noise: !ref <data_folder>/noise # The noisy sequencies for data augm NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1 noise_annotation: !ref <save_folder>/noise.csv #The data manifest files are created by the data preparation script -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 1 batch_size: 16 lr: 0.0003 # token_type: unigram # ["unigram", "bpe", "char"] sorting: random -# Model parameters +####################### Model Parameters ####################################### sample_rate: 16000 emb_size: 128 dec_neurons: 512 @@ -71,6 +71,7 @@ dataloader_opts: epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> +############################## Augmentations ################################### # Download and prepare the dataset of noisy sequences for augmentation prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL @@ -80,56 +81,37 @@ prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_U csv_file: !ref <noise_annotation> # Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation - add_noise: !new:speechbrain.augment.time_domain.AddNoise csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> + snr_low: 0 + snr_high: 15 noise_sample_rate: !ref <sample_rate> clean_sample_rate: !ref <sample_rate> num_workers: !ref <num_workers> # Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. 
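For speed_perturb, the refactor inlines the speed list directly; the values are percentages of the original rate. A sketch of the effect on signal length, where the single-argument call is an assumption about the forward signature:

import torch
from speechbrain.augment.time_domain import SpeedPerturb

speed_perturb = SpeedPerturb(
    orig_freq=16000,
    speeds=[95, 100, 105],  # resample to 95%, 100%, or 105% speed
)

wav = torch.rand(1, 16000)
perturbed = speed_perturb(wav)
# A randomly chosen speed changes the length: roughly 16842, 16000, or
# 15238 samples for 95, 100, and 105 respectively.
print(perturbed.shape)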
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 @@ -139,8 +121,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## -# Models asr_model_source: speechbrain/asr-crdnn-rnnlm-librispeech slu_enc: !new:speechbrain.nnet.containers.Sequential diff --git a/recipes/timers-and-such/direct/hparams/train_with_wav2vec2.yaml b/recipes/timers-and-such/direct/hparams/train_with_wav2vec2.yaml index b9b451e91..b9ad3cfc2 100644 --- a/recipes/timers-and-such/direct/hparams/train_with_wav2vec2.yaml +++ b/recipes/timers-and-such/direct/hparams/train_with_wav2vec2.yaml @@ -37,7 +37,7 @@ ckpt_interval_minutes: 15 # save checkpoint every N min test_on_all_real: False -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 50 batch_size: 8 lr: 0.0004 @@ -49,7 +49,7 @@ freeze_wav2vec: False # token_type: unigram # ["unigram", "bpe", "char"] sorting: ascending -# Model parameters +####################### Model Parameters ####################################### sample_rate: 16000 emb_size: 128 dec_neurons: 512 @@ -171,45 +171,31 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer lr_annealing_wav2vec2: !ref <lr_annealing_wav2vec2> counter: !ref <epoch_counter> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. 
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 diff --git a/recipes/timers-and-such/multistage/hparams/train_LS_LM.yaml b/recipes/timers-and-such/multistage/hparams/train_LS_LM.yaml index 3d1c4e156..b804df9f9 100644 --- a/recipes/timers-and-such/multistage/hparams/train_LS_LM.yaml +++ b/recipes/timers-and-such/multistage/hparams/train_LS_LM.yaml @@ -39,7 +39,7 @@ NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.z noise_annotation: !ref <save_folder>/noise.csv #The data manifest files are created by the data preparation script -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 1 batch_size: 16 lr: 0.0003 @@ -73,6 +73,8 @@ dataloader_opts: epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> +############################## Augmentations ################################### + # Download and prepare the dataset of noisy sequences for augmentation prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL URL: !ref <NOISE_DATASET_URL> @@ -81,57 +83,37 @@ prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_U csv_file: !ref <noise_annotation> # Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation - add_noise: !new:speechbrain.augment.time_domain.AddNoise csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> + snr_low: 0 + snr_high: 15 noise_sample_rate: !ref <sample_rate> clean_sample_rate: !ref <sample_rate> num_workers: !ref <num_workers> # Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. 
diff --git a/recipes/timers-and-such/multistage/hparams/train_LS_LM.yaml b/recipes/timers-and-such/multistage/hparams/train_LS_LM.yaml
index 3d1c4e156..b804df9f9 100644
--- a/recipes/timers-and-such/multistage/hparams/train_LS_LM.yaml
+++ b/recipes/timers-and-such/multistage/hparams/train_LS_LM.yaml
@@ -39,7 +39,7 @@ NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.z
 noise_annotation: !ref <save_folder>/noise.csv #The data manifest files are created by the data preparation script
 
-# Training parameters
+####################### Training Parameters ####################################
 number_of_epochs: 1
 batch_size: 16
 lr: 0.0003
@@ -73,6 +73,8 @@ dataloader_opts:
 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
     limit: !ref <number_of_epochs>
 
+############################## Augmentations ###################################
+
 # Download and prepare the dataset of noisy sequences for augmentation
 prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
     URL: !ref <NOISE_DATASET_URL>
@@ -81,57 +83,37 @@ prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_U
     csv_file: !ref <noise_annotation>
 
 # Add noise to input signal
-snr_low: 0  # Min SNR for noise augmentation
-snr_high: 15  # Max SNR for noise augmentation
-
 add_noise: !new:speechbrain.augment.time_domain.AddNoise
     csv_file: !ref <noise_annotation>
-    snr_low: !ref <snr_low>
-    snr_high: !ref <snr_high>
+    snr_low: 0
+    snr_high: 15
     noise_sample_rate: !ref <sample_rate>
     clean_sample_rate: !ref <sample_rate>
     num_workers: !ref <num_workers>
 
 # Speed perturbation
-speed_changes: [95, 100, 105]  # List of speed changes for time-stretching
-
 speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
     orig_freq: !ref <sample_rate>
-    speeds: !ref <speed_changes>
+    speeds: [95, 100, 105]
 
 # Frequency drop: randomly drops a number of frequency bands to zero.
-drop_freq_low: 0  # Min frequency band dropout probability
-drop_freq_high: 1  # Max frequency band dropout probability
-drop_freq_count_low: 1  # Min number of frequency bands to drop
-drop_freq_count_high: 3  # Max number of frequency bands to drop
-drop_freq_width: 0.05  # Width of frequency bands to drop
-
 drop_freq: !new:speechbrain.augment.time_domain.DropFreq
-    drop_freq_low: !ref <drop_freq_low>
-    drop_freq_high: !ref <drop_freq_high>
-    drop_freq_count_low: !ref <drop_freq_count_low>
-    drop_freq_count_high: !ref <drop_freq_count_high>
-    drop_freq_width: !ref <drop_freq_width>
+    drop_freq_low: 0
+    drop_freq_high: 1
+    drop_freq_count_low: 1
+    drop_freq_count_high: 3
+    drop_freq_width: 0.05
 
 # Time drop: randomly drops a number of temporal chunks.
-drop_chunk_count_low: 1  # Min number of audio chunks to drop
-drop_chunk_count_high: 5  # Max number of audio chunks to drop
-drop_chunk_length_low: 1000  # Min length of audio chunks to drop
-drop_chunk_length_high: 2000  # Max length of audio chunks to drop
-
 drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
-    drop_length_low: !ref <drop_chunk_length_low>
-    drop_length_high: !ref <drop_chunk_length_high>
-    drop_count_low: !ref <drop_chunk_count_low>
-    drop_count_high: !ref <drop_chunk_count_high>
+    drop_length_low: 1000
+    drop_length_high: 2000
+    drop_count_low: 1
+    drop_count_high: 5
 
-# Augmenter: Combines previously defined augmentations to perform data augmentation
 # Augmenter: Combines previously defined augmentations to perform data augmentation
 wav_augment: !new:speechbrain.augment.augmenter.Augmenter
-    parallel_augment: False
     concat_original: True
-    repeat_augment: 1
-    shuffle_augmentations: False
     min_augmentations: 4
    max_augmentations: 4
     augment_prob: 1.0
@@ -141,7 +123,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter
         !ref <drop_freq>,
         !ref <drop_chunk>]
 
-# Models
+############################## Models ##########################################
+
 asr_model: !apply:speechbrain.inference.ASR.EncoderDecoderASR.from_hparams
     source: speechbrain/asr-crdnn-rnnlm-librispeech
     run_opts: {"device":"cuda:0"}
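The two multistage LM recipes also carry a noise branch: prepare_noise_data downloads and indexes the noise clips, and add_noise mixes them into the batch at an SNR drawn uniformly between snr_low and snr_high dB. A minimal sketch of the add_noise object in isolation ("noise.csv" is a placeholder path; the recipes generate the manifest with prepare_dataset_from_URL):

# Minimal sketch: mixing noise from a CSV manifest into a waveform batch.
# "noise.csv" is a placeholder; the recipes create it at preparation time.
import torch
from speechbrain.augment.time_domain import AddNoise

add_noise = AddNoise(
    csv_file="noise.csv",  # placeholder manifest of noise clips
    snr_low=0,             # hardest case: noise as loud as the speech
    snr_high=15,           # easiest case: speech 15 dB above the noise
    noise_sample_rate=16000,
    clean_sample_rate=16000,
)

wavs = torch.randn(4, 16000)
lens = torch.ones(4)
noisy = add_noise(wavs, lens)  # same shape; SNR sampled per utterance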
diff --git a/recipes/timers-and-such/multistage/hparams/train_TAS_LM.yaml b/recipes/timers-and-such/multistage/hparams/train_TAS_LM.yaml
index d21f309c5..56eb59d20 100644
--- a/recipes/timers-and-such/multistage/hparams/train_TAS_LM.yaml
+++ b/recipes/timers-and-such/multistage/hparams/train_TAS_LM.yaml
@@ -39,7 +39,7 @@ NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.z
 noise_annotation: !ref <save_folder>/noise.csv #The data manifest files are created by the data preparation script
 
-# Training parameters
+####################### Training Parameters ####################################
 number_of_epochs: 1
 batch_size: 16
 lr: 0.0003
@@ -73,6 +73,8 @@ dataloader_opts:
 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
     limit: !ref <number_of_epochs>
 
+############################## Augmentations ###################################
+
 # Download and prepare the dataset of noisy sequences for augmentation
 prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
     URL: !ref <NOISE_DATASET_URL>
@@ -81,57 +83,37 @@ prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_U
     csv_file: !ref <noise_annotation>
 
 # Add noise to input signal
-snr_low: 0  # Min SNR for noise augmentation
-snr_high: 15  # Max SNR for noise augmentation
-
 add_noise: !new:speechbrain.augment.time_domain.AddNoise
     csv_file: !ref <noise_annotation>
-    snr_low: !ref <snr_low>
-    snr_high: !ref <snr_high>
+    snr_low: 0
+    snr_high: 15
     noise_sample_rate: !ref <sample_rate>
     clean_sample_rate: !ref <sample_rate>
     num_workers: !ref <num_workers>
 
 # Speed perturbation
-speed_changes: [95, 100, 105]  # List of speed changes for time-stretching
-
 speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
     orig_freq: !ref <sample_rate>
-    speeds: !ref <speed_changes>
+    speeds: [95, 100, 105]
 
 # Frequency drop: randomly drops a number of frequency bands to zero.
-drop_freq_low: 0  # Min frequency band dropout probability
-drop_freq_high: 1  # Max frequency band dropout probability
-drop_freq_count_low: 1  # Min number of frequency bands to drop
-drop_freq_count_high: 3  # Max number of frequency bands to drop
-drop_freq_width: 0.05  # Width of frequency bands to drop
-
 drop_freq: !new:speechbrain.augment.time_domain.DropFreq
-    drop_freq_low: !ref <drop_freq_low>
-    drop_freq_high: !ref <drop_freq_high>
-    drop_freq_count_low: !ref <drop_freq_count_low>
-    drop_freq_count_high: !ref <drop_freq_count_high>
-    drop_freq_width: !ref <drop_freq_width>
+    drop_freq_low: 0
+    drop_freq_high: 1
+    drop_freq_count_low: 1
+    drop_freq_count_high: 3
+    drop_freq_width: 0.05
 
 # Time drop: randomly drops a number of temporal chunks.
-drop_chunk_count_low: 1  # Min number of audio chunks to drop
-drop_chunk_count_high: 5  # Max number of audio chunks to drop
-drop_chunk_length_low: 1000  # Min length of audio chunks to drop
-drop_chunk_length_high: 2000  # Max length of audio chunks to drop
-
 drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
-    drop_length_low: !ref <drop_chunk_length_low>
-    drop_length_high: !ref <drop_chunk_length_high>
-    drop_count_low: !ref <drop_chunk_count_low>
-    drop_count_high: !ref <drop_chunk_count_high>
+    drop_length_low: 1000
+    drop_length_high: 2000
+    drop_count_low: 1
+    drop_count_high: 5
 
-# Augmenter: Combines previously defined augmentations to perform data augmentation
 # Augmenter: Combines previously defined augmentations to perform data augmentation
 wav_augment: !new:speechbrain.augment.augmenter.Augmenter
-    parallel_augment: False
     concat_original: True
-    repeat_augment: 1
-    shuffle_augmentations: False
     min_augmentations: 4
     max_augmentations: 4
     augment_prob: 1.0
@@ -141,6 +123,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter
         !ref <drop_freq>,
         !ref <drop_chunk>]
 
+############################## Models ##########################################
+
 # Models
 asr_model: !apply:speechbrain.inference.ASR.EncoderDecoderASR.from_hparams
     source: speechbrain/asr-crdnn-rnnlm-librispeech
+ """ + + # Determine whether to apply data augmentation + if not self.do_augment: + return args + + list_of_augmented_labels = [] + + for labels in args: + list_of_augmented_labels.append(self.replicate_labels(labels)) + + return list_of_augmented_labels + def replicate_labels(self, labels): """ Replicates the labels along the batch axis a number of times that - corresponds to the number of augmentations. + corresponds to the number of augmentations. Indeed parallel and + concatenation augmentations alter the time dimension. Arguments --------- labels : torch.Tensor - Input label tensor to be replicated. + Input label tensors to be replicated. Returns ------- augmented_labels: torch.Tensor - Labels corresponding to the augmented input. + Labels corresponding to the augmented input. Returns as many Tensor + as given in input. """ # Determine whether to apply data augmentation @@ -477,6 +509,7 @@ class Augmenter(torch.nn.Module): ) augmented_labels = torch.cat(augmented_labels, dim=0) + return augmented_labels def check_min_max_augmentations(self): -- GitLab