diff --git a/recipes/AISHELL-1/ASR/CTC/hparams/train_with_wav2vec.yaml b/recipes/AISHELL-1/ASR/CTC/hparams/train_with_wav2vec.yaml index 9bd0b52d22511aac7235cf05f165a3b25b8a6124..486685f258aba538b0262cdd5339b23c9dc266a2 100644 --- a/recipes/AISHELL-1/ASR/CTC/hparams/train_with_wav2vec.yaml +++ b/recipes/AISHELL-1/ASR/CTC/hparams/train_with_wav2vec.yaml @@ -28,7 +28,8 @@ test_data: !ref <output_folder>/test.csv wav2vec2_hub: TencentGameMate/chinese-wav2vec2-large wav2vec2_folder: !ref <save_folder>/wav2vec2_checkpoint -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 80 lr: 1.0 lr_wav2vec: 0.0001 @@ -76,7 +77,8 @@ tokenizer: !apply:transformers.BertTokenizer.from_pretrained # bert-base-chinese tokens length output_neurons: 21128 -# Decoding parameters +############################## Decoding ######################################## + # Be sure that the bos and eos index match with the BPEs ones # Decoding parameters test_searcher: !name:speechbrain.decoders.CTCBeamSearcher @@ -98,64 +100,37 @@ beta: 0.5 # which Chinese writing normally does not do. # If remove_spaces, spaces are removed # from the transcript before computing CER. -# (e.g., 祝 可爱 的 你 —> 祝可爱的你) remove_spaces: True split_tokens: !apply:operator.not_ [!ref <remove_spaces>] epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [90, 100, 110] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> # Time Drop -time_drop_length_low: 35 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 45 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 2 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 2 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "zeros" # Method of dropping chunks - time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 + drop_length_low: 35 + drop_length_high: 45 + drop_count_low: 2 + drop_count_high: 2 # Frequency Drop -freq_drop_length_low: 25 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 35 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "zeros" # Method of dropping chunks - freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 25 + drop_length_high: 35 + drop_count_low: 2 + drop_count_high: 2 dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - 
warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -164,6 +139,8 @@ fea_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <freq_drop>, !ref <time_warp>] +############################## Models ########################################## + enc: !new:speechbrain.nnet.containers.Sequential input_shape: [null, null, !ref <wav2vec_output_dim>] linear1: !name:speechbrain.nnet.linear.Linear @@ -230,6 +207,8 @@ lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler annealing_factor: 0.9 patient: 0 +############################## Logging and Pretrainer ########################## + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref <save_folder> recoverables: diff --git a/recipes/AISHELL-1/ASR/CTC/train_with_wav2vec.py b/recipes/AISHELL-1/ASR/CTC/train_with_wav2vec.py index 227204f44c3dc7f7f9d630610509d264665ce4cb..43783eed7f49f54a3f41e8e6adffdb2381eee50e 100644 --- a/recipes/AISHELL-1/ASR/CTC/train_with_wav2vec.py +++ b/recipes/AISHELL-1/ASR/CTC/train_with_wav2vec.py @@ -56,6 +56,8 @@ class ASR(sb.Brain): ids = batch.id tokens, tokens_lens = batch.tokens + # Labels must be extended if parallel augmentation or concatenated + # augmentation was performed on the input (increasing the batch dimension) if stage == sb.Stage.TRAIN: if hasattr(self.hparams, "fea_augment"): tokens = self.hparams.fea_augment.replicate_labels(tokens) diff --git a/recipes/AISHELL-1/ASR/seq2seq/hparams/train.yaml b/recipes/AISHELL-1/ASR/seq2seq/hparams/train.yaml index 75b303f6691ed73a720606b0eab472724c996315..e6fda7de26ed417a2cae1ddf6389aacb717d420d 100644 --- a/recipes/AISHELL-1/ASR/seq2seq/hparams/train.yaml +++ b/recipes/AISHELL-1/ASR/seq2seq/hparams/train.yaml @@ -29,7 +29,8 @@ test_data: !ref <output_folder>/test.csv noise_annotation: !ref <save_folder>/noise.csv #The data manifest files are created by the data preparation script tokenizer_file: speechbrain/asr-transformer-aishell/tokenizer.ckpt -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 40 number_of_ctc_epochs: 10 batch_size: 16 @@ -71,7 +72,7 @@ test_dataloader_opts: batch_size: !ref <batch_size> num_workers: !ref <num_workers> -# Model parameters +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dropout: 0.15 cnn_blocks: 2 @@ -108,7 +109,6 @@ scorer_beam_scale: 0.5 # which Chinese writing normally does not do. # If remove_spaces, spaces are removed # from the transcript before computing CER.
-# (e.g., 祝 可爱 的 你 —> 祝可爱的你) remove_spaces: True split_tokens: !apply:operator.not_ [!ref <remove_spaces>] @@ -118,6 +118,8 @@ epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter normalize: !new:speechbrain.processing.features.InputNormalization norm_type: global +############################## Augmentations ################################### + compute_features: !new:speechbrain.lobes.features.Fbank sample_rate: !ref <sample_rate> n_fft: !ref <n_fft> @@ -132,57 +134,37 @@ prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_U # Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation - add_noise: !new:speechbrain.augment.time_domain.AddNoise csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> + snr_low: 0 + snr_high: 15 noise_sample_rate: !ref <sample_rate> clean_sample_rate: !ref <sample_rate> num_workers: !ref <num_workers> # Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. 
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 -# Augmenter: Combines previously defined augmentations to perform data augmentation # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 @@ -192,6 +174,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + enc: !new:speechbrain.lobes.models.CRDNN.CRDNN input_shape: [null, null, !ref <n_mels>] activation: !ref <activation> @@ -268,7 +252,8 @@ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer paths: tokenizer: !ref <tokenizer_file> -# Scorer +############################## Decoding ######################################## + ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer eos_index: !ref <eos_index> blank_index: !ref <blank_index> @@ -305,6 +290,8 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler annealing_factor: 0.8 patient: 0 +############################## Logging and Pretrainer ########################## + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref <save_folder> recoverables: diff --git a/recipes/AISHELL-1/ASR/seq2seq/train.py b/recipes/AISHELL-1/ASR/seq2seq/train.py index 69d2e75d6b2bd88008671f0ce4e5ed7eb237dce0..bc2c49b888ce0cb997d03a87da151a0cc524d35d 100644 --- a/recipes/AISHELL-1/ASR/seq2seq/train.py +++ b/recipes/AISHELL-1/ASR/seq2seq/train.py @@ -29,10 +29,6 @@ class ASR(sb.Brain): # Forward pass feats = self.hparams.compute_features(wavs) - - if stage == sb.Stage.TRAIN and hasattr(self.hparams, "fea_augment"): - feats, fea_lens = self.hparams.fea_augment(feats, wav_lens) - feats = self.modules.normalize(feats, wav_lens) x = self.modules.enc(feats.detach()) e_in = self.modules.emb(tokens_bos) # y_in bos + tokens @@ -65,12 +61,16 @@ class ASR(sb.Brain): tokens_eos, tokens_eos_lens = batch.tokens_eos tokens, tokens_lens = batch.tokens + # Labels must be extended if parallel augmentation or concatenated + # augmentation was performed on the input (increasing the batch dimension) if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): - tokens = self.hparams.wav_augment.replicate_labels(tokens) - tokens_lens = self.hparams.wav_augment.replicate_labels(tokens_lens) - tokens_eos = self.hparams.wav_augment.replicate_labels(tokens_eos) - tokens_eos_lens = self.hparams.wav_augment.replicate_labels( - tokens_eos_lens + ( + tokens, + tokens_lens, + tokens_eos, + tokens_eos_lens, + ) = self.hparams.wav_augment.replicate_multiple_labels( + tokens, tokens_lens, tokens_eos, tokens_eos_lens ) loss_seq = self.hparams.seq_cost(
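Note: `replicate_multiple_labels` replaces the four separate `replicate_labels` calls above. A minimal sketch of the bookkeeping this helper must perform, assuming the Augmenter grows the batch dimension when `concat_original` or `parallel_augment` is active (the helper below is illustrative, not the library implementation):

```python
import torch

def replicate_labels_sketch(labels: torch.Tensor, n_copies: int) -> torch.Tensor:
    # Repeat along the batch dimension so labels stay aligned with a batch
    # that augmentation has grown to n_copies times its original size.
    return torch.cat([labels] * n_copies, dim=0)

tokens = torch.tensor([[5, 9, 2], [7, 1, 0]])    # (batch=2, seq_len=3)
tokens_big = replicate_labels_sketch(tokens, 2)  # (batch=4, seq_len=3)
assert tokens_big.shape == (4, 3)
```

Replicating all four tensors in one call keeps them in sync; it also removes the copy-paste slip fixed further down in transformer/train.py, where the `fea_augment` branch mistakenly called `wav_augment.replicate_labels`.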
diff --git a/recipes/AISHELL-1/ASR/transformer/hparams/train_ASR_transformer.yaml b/recipes/AISHELL-1/ASR/transformer/hparams/train_ASR_transformer.yaml index b98e371b2d32c131949f8255b3a42042c5479877..408c9e68008f74950c1916b53b6b9c29da7efdec 100644 --- a/recipes/AISHELL-1/ASR/transformer/hparams/train_ASR_transformer.yaml +++ b/recipes/AISHELL-1/ASR/transformer/hparams/train_ASR_transformer.yaml @@ -30,7 +30,8 @@ test_data: !ref <save_folder>/test.csv noise_annotation: !ref <save_folder>/noise.csv #The data manifest files are created by the data preparation script tokenizer_file: speechbrain/asr-transformer-aishell/tokenizer.ckpt -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 50 batch_size: 8 ctc_weight: 0.3 @@ -77,7 +78,7 @@ test_dataloader_opts: batch_size: !ref <batch_size> num_workers: !ref <num_workers> -####################### Model parameters ########################### +####################### Model Parameters ####################################### # Transformer d_model: 256 nhead: 4 @@ -103,7 +104,7 @@ valid_beam_size: 10 test_beam_size: 10 ctc_weight_decode: 0.40 -############################## models ################################ +############################## Models ########################################## CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd input_shape: (8, 10, 80) @@ -157,7 +158,8 @@ SGD: !name:torch.optim.SGD momentum: 0.99 nesterov: True -# Scorer +############################## Decoding & Optimizer ############################ + ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer eos_index: !ref <eos_index> blank_index: !ref <blank_index> @@ -227,7 +229,7 @@ compute_features: !new:speechbrain.lobes.features.Fbank n_fft: !ref <n_fft> n_mels: !ref <n_mels> -# ----- WAVEFORM AUGMENTATION ----- # +############################## Augmentation #################################### # Download and prepare the dataset of noisy sequences for augmentation prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL @@ -238,75 +240,43 @@ prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_U # Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation - add_noise: !new:speechbrain.augment.time_domain.AddNoise csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> + snr_low: 0 + snr_high: 15 noise_sample_rate: !ref <sample_rate> clean_sample_rate: !ref <sample_rate> num_workers: !ref <num_workers> # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 1 max_augmentations: 1 augment_prob: 1.0 augmentations: [ !ref <add_noise>]
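The feature-domain pipeline that follows (time drop, frequency drop, time warp) is a SpecAugment-style recipe. As a minimal sketch of how the trimmed hparams instantiate in Python, with constructor values taken from the YAML (the omitted `replace: zeros` and `dim: 1` are assumed to be the `SpectrogramDrop` defaults, and the `(features, lengths)` call convention mirrors the `fea_augment(feats, wav_lens)` usage in the recipes):

```python
import torch
from speechbrain.augment.augmenter import Augmenter
from speechbrain.augment.freq_domain import SpectrogramDrop

# Values match the trimmed YAML below; omitted args fall back to defaults.
time_drop = SpectrogramDrop(
    drop_length_low=0, drop_length_high=100, drop_count_low=2, drop_count_high=2
)
freq_drop = SpectrogramDrop(
    drop_length_low=30, drop_length_high=40, drop_count_low=2, drop_count_high=2, dim=2
)

feats = torch.rand(8, 200, 80)  # dummy fbank batch: (batch, time, n_mels)
lens = torch.ones(8)            # relative lengths
aug = Augmenter(min_augmentations=1, max_augmentations=1,
                augmentations=[time_drop, freq_drop])
feats_aug, lens_aug = aug(feats, lens)
```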
- - # ----- FEATURE AUGMENTATION ----- # -time_drop_length_low: 0 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 100 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 2 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 2 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "zeros" # Method of dropping chunks - # Time Drop time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 - -freq_drop_length_low: 30 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 40 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "zeros" # Method of dropping chunks + drop_length_low: 0 + drop_length_high: 100 + drop_count_low: 2 + drop_count_high: 2 # Frequency Drop freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 30 + drop_length_high: 40 + drop_count_low: 2 + drop_count_high: 2 dim: 2 -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - # Time warp time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 1 max_augmentations: 1 augment_start_index: !ref <batch_size> # This leaves the original inputs unchanged @@ -317,6 +287,8 @@ fea_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <freq_drop>, !ref <time_warp>] +############################## Logging and Pretrainer ########################## + train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref <train_log> @@ -324,7 +296,6 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger # which Chinese writing normally does not do. # If remove_spaces, spaces are removed # from the transcript before computing CER.
-# (e.g., 祝 可爱 的 你 —> 祝可爱的你) remove_spaces: True split_tokens: !apply:operator.not_ [!ref <remove_spaces>] diff --git a/recipes/AISHELL-1/ASR/transformer/hparams/train_ASR_transformer_with_wav2vect.yaml b/recipes/AISHELL-1/ASR/transformer/hparams/train_ASR_transformer_with_wav2vect.yaml index 13a7826ade7debc99f9c99b3d69e7eed526152b1..a196afc5842ef5235d0f6e9c5179984319c327bf 100644 --- a/recipes/AISHELL-1/ASR/transformer/hparams/train_ASR_transformer_with_wav2vect.yaml +++ b/recipes/AISHELL-1/ASR/transformer/hparams/train_ASR_transformer_with_wav2vect.yaml @@ -30,7 +30,8 @@ wav2vec2_hub: facebook/wav2vec2-large-100k-voxpopuli wav2vec2_folder: !ref <save_folder>/wav2vec2_checkpoint freeze_wav2vec: False -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 80 batch_size: 2 grad_accumulation_factor: 16 @@ -72,7 +73,7 @@ test_dataloader_opts: batch_size: !ref <batch_size> num_workers: !ref <num_workers> -####################### Model parameters ########################### +####################### Model Parameters ####################################### # Transformer d_model: 256 nhead: 4 @@ -98,7 +99,7 @@ valid_beam_size: 10 test_beam_size: 10 ctc_weight_decode: 0.40 -############################## models ################################ +############################## Models ########################################## wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 source: !ref <wav2vec2_hub> @@ -140,44 +141,27 @@ model: !new:torch.nn.ModuleList - [!ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>] # Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. 
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -186,6 +170,7 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Decoding & Optimizer ############################ # define two optimizers here for two-stage training Adam: !name:torch.optim.Adam @@ -257,6 +242,7 @@ noam_annealing_wav2vect: !new:speechbrain.nnet.schedulers.NoamScheduler n_warmup_steps: 25000 model_size: !ref <d_model> +############################## Logging and Pretrainer ########################## checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref <save_folder> @@ -278,7 +264,6 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger # which Chinese writing normally does not do. # If remove_spaces, spaces are removed # from the transcript before computing CER. -# (e.g., 祝 可爱 的 你 —> 祝可爱的你) remove_spaces: True split_tokens: !apply:operator.not_ [!ref <remove_spaces>] diff --git a/recipes/AISHELL-1/ASR/transformer/train.py b/recipes/AISHELL-1/ASR/transformer/train.py index 977563ac8ec772c69faa801e8d6bb8fb92726cec..63361bf0dd9d597664ce175238529ad10f6442be 100644 --- a/recipes/AISHELL-1/ASR/transformer/train.py +++ b/recipes/AISHELL-1/ASR/transformer/train.py @@ -82,28 +82,26 @@ class ASR(sb.core.Brain): tokens, tokens_lens = batch.tokens if stage == sb.Stage.TRAIN: + # Labels must be extended if parallel augmentation or concatenated + # augmentation was performed on the input (increasing the batch dimension) if hasattr(self.hparams, "wav_augment"): - tokens = self.hparams.wav_augment.replicate_labels(tokens) - tokens_lens = self.hparams.wav_augment.replicate_labels( - tokens_lens - ) - tokens_eos = self.hparams.wav_augment.replicate_labels( - tokens_eos - ) - tokens_eos_lens = self.hparams.wav_augment.replicate_labels( - tokens_eos_lens + ( + tokens, + tokens_lens, + tokens_eos, + tokens_eos_lens, + ) = self.hparams.wav_augment.replicate_multiple_labels( + tokens, tokens_lens, tokens_eos, tokens_eos_lens ) if hasattr(self.hparams, "fea_augment"): - tokens = self.hparams.fea_augment.replicate_labels(tokens) - tokens_lens = self.hparams.fea_augment.replicate_labels( - tokens_lens - ) - tokens_eos = self.hparams.wav_augment.replicate_labels( - tokens_eos - ) - tokens_eos_lens = self.hparams.wav_augment.replicate_labels( - tokens_eos_lens + ( + tokens, + tokens_lens, + tokens_eos, + tokens_eos_lens, + ) = self.hparams.fea_augment.replicate_multiple_labels( + tokens, tokens_lens, tokens_eos, tokens_eos_lens ) loss_seq = self.hparams.seq_cost(
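Across every recipe in this patch, Augmenter keys are deleted only when they restate defaults (`parallel_augment: False`, `concat_original: False`, `repeat_augment: 1`, `shuffle_augmentations: False`). A quick sanity check of that premise, assuming the flags are exposed as attributes of the same name (an assumption, not a documented guarantee):

```python
from speechbrain.augment.augmenter import Augmenter
from speechbrain.augment.time_domain import DropFreq, DropChunk

# Old-style YAML spelled the flags out explicitly ...
explicit = Augmenter(
    parallel_augment=False, concat_original=False, repeat_augment=1,
    shuffle_augmentations=False, min_augmentations=3, max_augmentations=3,
    augment_prob=1.0, augmentations=[DropFreq(), DropChunk()],
)
# ... while the trimmed YAML relies on the constructor defaults instead.
trimmed = Augmenter(
    min_augmentations=3, max_augmentations=3,
    augment_prob=1.0, augmentations=[DropFreq(), DropChunk()],
)

for flag in ("parallel_augment", "concat_original",
             "repeat_augment", "shuffle_augmentations"):
    assert getattr(explicit, flag) == getattr(trimmed, flag)
```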
diff --git a/recipes/AISHELL-1/ASR/transformer/train_with_wav2vect.py b/recipes/AISHELL-1/ASR/transformer/train_with_wav2vect.py index 94196ea7b846d773c2a800a06bb60b377c886261..53aa47375146d3707224211740ad31ccbdc00c66 100644 --- a/recipes/AISHELL-1/ASR/transformer/train_with_wav2vect.py +++ b/recipes/AISHELL-1/ASR/transformer/train_with_wav2vect.py @@ -74,16 +74,16 @@ class ASR(sb.core.Brain): tokens, tokens_lens = batch.tokens if stage == sb.Stage.TRAIN: + # Labels must be extended if parallel augmentation or concatenated + # augmentation was performed on the input (increasing the batch dimension) if hasattr(self.hparams, "wav_augment"): - tokens = self.hparams.wav_augment.replicate_labels(tokens) - tokens_lens = self.hparams.wav_augment.replicate_labels( - tokens_lens - ) - tokens_eos = self.hparams.wav_augment.replicate_labels( - tokens_eos - ) - tokens_eos_lens = self.hparams.wav_augment.replicate_labels( - tokens_eos_lens + ( + tokens, + tokens_lens, + tokens_eos, + tokens_eos_lens, + ) = self.hparams.wav_augment.replicate_multiple_labels( + tokens, tokens_lens, tokens_eos, tokens_eos_lens ) loss_seq = self.hparams.seq_cost( diff --git a/recipes/AISHELL-1/Tokenizer/hparams/tokenizer_bpe5000.yaml b/recipes/AISHELL-1/Tokenizer/hparams/tokenizer_bpe5000.yaml index 886d22bda076a76cfe78ffad4a35071784f6f215..d2cb230189911a83d54c5b6caedf7fffa334c57b 100644 --- a/recipes/AISHELL-1/Tokenizer/hparams/tokenizer_bpe5000.yaml +++ b/recipes/AISHELL-1/Tokenizer/hparams/tokenizer_bpe5000.yaml @@ -15,7 +15,7 @@ train_csv: !ref <output_folder>/train.csv valid_csv: !ref <output_folder>/dev.csv -# Training parameters +####################### Training Parameters #################################### token_type: unigram # ["unigram", "bpe", "char"] token_output: 5000 # index(blank/eos/bos/unk) = 0 character_coverage: 1.0 diff --git a/recipes/AISHELL-1/Tokenizer/hparams/train_transformer_tokenizer_bpe5000.yaml b/recipes/AISHELL-1/Tokenizer/hparams/train_transformer_tokenizer_bpe5000.yaml index bc286156b73559a3c414efacf79a5b56fc54b657..973df9a1194b9828e6002062fdefbb871da0fc77 100644 --- a/recipes/AISHELL-1/Tokenizer/hparams/train_transformer_tokenizer_bpe5000.yaml +++ b/recipes/AISHELL-1/Tokenizer/hparams/train_transformer_tokenizer_bpe5000.yaml @@ -15,7 +15,7 @@ train_csv: !ref <output_folder>/train.csv valid_csv: !ref <output_folder>/dev.csv -# Training parameters +####################### Training Parameters #################################### token_type: unigram # ["unigram", "bpe", "char"] token_output: 5000 # index(blank/eos/bos/unk) = 0 character_coverage: 1.0 diff --git a/recipes/Aishell1Mix/separation/hparams/sepformer-aishell1mix2-wham.yaml b/recipes/Aishell1Mix/separation/hparams/sepformer-aishell1mix2-wham.yaml index 24571404fe0f1303075c768d3691e412d51d01b5..d3cb9493e4ce2a4a9a83ef00f7379fe6c58c2df4 100644 --- a/recipes/Aishell1Mix/separation/hparams/sepformer-aishell1mix2-wham.yaml +++ b/recipes/Aishell1Mix/separation/hparams/sepformer-aishell1mix2-wham.yaml @@ -40,7 +40,7 @@ noprogressbar: False save_audio: False # Save estimated sources on disk sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/Aishell1Mix/separation/hparams/sepformer-aishell1mix2.yaml b/recipes/Aishell1Mix/separation/hparams/sepformer-aishell1mix2.yaml index d4689378aedbf08a33c3bc2eb81caf2e88d14d04..168471dbb1d2f12207ec51e1f7a2eb0e89afe096 100644 --- a/recipes/Aishell1Mix/separation/hparams/sepformer-aishell1mix2.yaml +++ b/recipes/Aishell1Mix/separation/hparams/sepformer-aishell1mix2.yaml @@ -40,7 +40,7 @@ noprogressbar:
False save_audio: False # Save estimated sources on disk sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/Aishell1Mix/separation/hparams/sepformer-aishell1mix3-wham.yaml b/recipes/Aishell1Mix/separation/hparams/sepformer-aishell1mix3-wham.yaml index 65add025a2eb96b84240089ac2c6daf061f36b80..834857ed77f61a2a2da292de742fae950c2bd05e 100644 --- a/recipes/Aishell1Mix/separation/hparams/sepformer-aishell1mix3-wham.yaml +++ b/recipes/Aishell1Mix/separation/hparams/sepformer-aishell1mix3-wham.yaml @@ -40,7 +40,7 @@ noprogressbar: False save_audio: False # Save estimated sources on disk sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/Aishell1Mix/separation/hparams/sepformer-aishell1mix3.yaml b/recipes/Aishell1Mix/separation/hparams/sepformer-aishell1mix3.yaml index 3b27b796b6d2303b62520d8c6b3ad83cf91b66fd..d48fdecb214c6acef700f1186817142a90af2f0d 100644 --- a/recipes/Aishell1Mix/separation/hparams/sepformer-aishell1mix3.yaml +++ b/recipes/Aishell1Mix/separation/hparams/sepformer-aishell1mix3.yaml @@ -40,7 +40,7 @@ noprogressbar: False save_audio: False # Save estimated sources on disk sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-cross.yaml b/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-cross.yaml index f609c746e9dc7d957d38173807f793dc3d8c78bf..043845aebd245c6ef40c8a41b9635cac018f02cd 100644 --- a/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-cross.yaml +++ b/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-cross.yaml @@ -43,7 +43,7 @@ save_audio: True # Save estimated sources on disk n_audio_to_save: 10 sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-independent.yaml b/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-independent.yaml index c3483f5f281682ac99ca5c179543e969b1769862..164ccc45b17fd617f2c6a6342bbf23d26001f68f 100644 --- a/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-independent.yaml +++ b/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-independent.yaml @@ -43,7 +43,7 @@ save_audio: True # Save estimated sources on disk n_audio_to_save: 10 sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-parallel-noise.yaml b/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-parallel-noise.yaml index 74941dfea9a48ed348af82a713bfc8d2206ed243..fef85267f644aad3f01047cc64100e7a55ba8beb 100644 --- a/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-parallel-noise.yaml +++ b/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-parallel-noise.yaml @@ -43,7 +43,7 @@ save_audio: True # Save estimated sources on disk n_audio_to_save: 10 sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git 
a/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-parallel-reverb.yaml b/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-parallel-reverb.yaml index 6b1518d39a9f968c120fa088de0589e4bd2b2f23..4ec5054f982631250b29b6c03147ced3fbddb586 100644 --- a/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-parallel-reverb.yaml +++ b/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-parallel-reverb.yaml @@ -43,7 +43,7 @@ save_audio: True # Save estimated sources on disk n_audio_to_save: 10 sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-parallel.yaml b/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-parallel.yaml index a5d6156790a661ec91b37e766d9bcea292801500..adb31ddc68fd3284774c58e4e60b3a28b65457c5 100644 --- a/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-parallel.yaml +++ b/recipes/BinauralWSJ0Mix/separation/hparams/convtasnet-parallel.yaml @@ -43,7 +43,7 @@ save_audio: True # Save estimated sources on disk n_audio_to_save: 10 sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/CVSS/S2ST/hparams/train_fr-en.yaml b/recipes/CVSS/S2ST/hparams/train_fr-en.yaml index 8ff7d59be93f292618273f757f31c6f51fbefe57..678dd1c178017fd8f71e7fbbbfccdf048c4cfdd9 100644 --- a/recipes/CVSS/S2ST/hparams/train_fr-en.yaml +++ b/recipes/CVSS/S2ST/hparams/train_fr-en.yaml @@ -59,7 +59,7 @@ wav2vec2_download_path: !ref <save_folder>/pretrained_models wav2vec2_frozen: False wav2vec2_freeze_steps: 10000 -# Training parameters +####################### Training Parameters #################################### lr: 0.0005 lr_wav2vec: 0.00001 loss_reduction: batchmean diff --git a/recipes/CommonLanguage/lang_id/hparams/train_ecapa_tdnn.yaml b/recipes/CommonLanguage/lang_id/hparams/train_ecapa_tdnn.yaml index 0c4d91aa9c08caff1a76db8288dac522fa68d054..d4722f45ca06cc58946ff78f0aed3d3185b8c15c 100644 --- a/recipes/CommonLanguage/lang_id/hparams/train_ecapa_tdnn.yaml +++ b/recipes/CommonLanguage/lang_id/hparams/train_ecapa_tdnn.yaml @@ -38,10 +38,10 @@ error_stats: !name:speechbrain.utils.metric_stats.MetricStats metric: !name:speechbrain.nnet.losses.classification_error reduction: batch +####################### Training Parameters #################################### + # Feature parameters btw: 40 - 80 n_mels: 80 - -# Training Parameters sample_rate: 16000 number_of_epochs: 30 batch_size: 4 @@ -64,6 +64,8 @@ test_dataloader_options: batch_size: !ref <batch_size> shuffle: True +############################## Augmentations ################################### + # Download and prepare the dataset of noisy sequences for augmentation prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL URL: !ref <NOISE_DATASET_URL> @@ -78,7 +80,6 @@ prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL ext: wav csv_file: !ref <rir_annotation> - # Add reverberation to input signal add_reverb: !new:speechbrain.augment.time_domain.AddReverb csv_file: !ref <rir_annotation> @@ -87,27 +88,21 @@ add_reverb: !new:speechbrain.augment.time_domain.AddReverb num_workers: !ref <num_workers> # Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation add_noise: !new:speechbrain.augment.time_domain.AddNoise csv_file: 
!ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> + snr_low: 0 + snr_high: 15 noise_sample_rate: !ref <sample_rate> clean_sample_rate: !ref <sample_rate> num_workers: !ref <num_workers> # Speed perturbation -speed_changes: [90, 100, 110] # List of speed changes for time-stretching speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 shuffle_augmentations: True min_augmentations: 1 max_augmentations: 3 @@ -125,6 +120,8 @@ mean_var_norm_input: !new:speechbrain.processing.features.InputNormalization norm_type: sentence std_norm: False +############################## Models ########################################## + # To design a custom model, either just edit the simple CustomModel # class that's listed here, or replace this `!new` call with a line # pointing to a different file you've defined. @@ -182,6 +179,8 @@ lr_annealing: !new:speechbrain.nnet.schedulers.LinearScheduler final_value: !ref <lr_final> epoch_count: !ref <number_of_epochs> +############################## Logging and Pretrainer ########################## + # This object is used for saving the state of training both so that it # can be resumed if it gets interrupted, and also so that the best checkpoint # can be later loaded for evaluation or inference. diff --git a/recipes/CommonVoice/ASR/CTC/hparams/train_ar_with_wav2vec.yaml b/recipes/CommonVoice/ASR/CTC/hparams/train_ar_with_wav2vec.yaml index caf7f2d3b8f5ceeeda1c00e273b4332910fdebcb..643df09944098da647e4067ff7d6546b433ee036 100644 --- a/recipes/CommonVoice/ASR/CTC/hparams/train_ar_with_wav2vec.yaml +++ b/recipes/CommonVoice/ASR/CTC/hparams/train_ar_with_wav2vec.yaml @@ -32,7 +32,8 @@ skip_prep: False # Skip data preparation # longer sentences certainly correspond to "open microphones". avoid_if_longer_than: 10.0 -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 30 lr: 1.0 lr_wav2vec: 0.0001 @@ -58,7 +59,7 @@ test_dataloader_options: token_type: unigram # ["unigram", "bpe", "char"] character_coverage: 1.0 -# Model parameters +####################### Model Parameters ####################################### wav2vec_output_dim: 1024 dnn_neurons: 1024 freeze_wav2vec: False @@ -97,45 +98,30 @@ test_beam_search: epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. 
-drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -144,6 +130,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + enc: !new:speechbrain.nnet.containers.Sequential input_shape: [null, null, !ref <wav2vec_output_dim>] linear1: !name:speechbrain.nnet.linear.Linear diff --git a/recipes/CommonVoice/ASR/CTC/hparams/train_de_with_wav2vec.yaml b/recipes/CommonVoice/ASR/CTC/hparams/train_de_with_wav2vec.yaml index dc4ae34b2e2c00b01262bfa053eb9f25f7a76d9d..adb8e5bb52626992dfd6899871a8f74a30bc62dd 100644 --- a/recipes/CommonVoice/ASR/CTC/hparams/train_de_with_wav2vec.yaml +++ b/recipes/CommonVoice/ASR/CTC/hparams/train_de_with_wav2vec.yaml @@ -33,7 +33,8 @@ skip_prep: False # longer sentences certainly correspond to "open microphones". avoid_if_longer_than: 10.0 -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 45 lr: 1.0 lr_wav2vec: 0.0001 @@ -61,7 +62,7 @@ test_dataloader_options: token_type: char # ["unigram", "bpe", "char"] character_coverage: 1.0 -# Model parameters +####################### Model Parameters ####################################### # activation: !name:torch.nn.LeakyReLU dnn_neurons: 1024 wav2vec_output_dim: !ref <dnn_neurons> @@ -95,45 +96,30 @@ test_beam_search: epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. 
-drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -142,6 +128,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + enc: !new:speechbrain.nnet.containers.Sequential input_shape: [null, null, !ref <wav2vec_output_dim>] linear1: !name:speechbrain.nnet.linear.Linear diff --git a/recipes/CommonVoice/ASR/CTC/hparams/train_en_with_wav2vec.yaml b/recipes/CommonVoice/ASR/CTC/hparams/train_en_with_wav2vec.yaml index f3c68ee9b9bdfc354640ba9ca82d8948231fd6b1..d8aaea36e46f52efaf947e3cf10a14b339c8cb0f 100644 --- a/recipes/CommonVoice/ASR/CTC/hparams/train_en_with_wav2vec.yaml +++ b/recipes/CommonVoice/ASR/CTC/hparams/train_en_with_wav2vec.yaml @@ -32,7 +32,8 @@ skip_prep: False # Skip data preparation # longer sentences certainly correspond to "open microphones". avoid_if_longer_than: 10.0 -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 30 lr: 1.0 lr_wav2vec: 0.0001 @@ -58,7 +59,7 @@ test_dataloader_options: token_type: unigram # ["unigram", "bpe", "char"] character_coverage: 1.0 -# Model parameters +####################### Model Parameters ####################################### # activation: !name:torch.nn.LeakyReLU wav2vec_output_dim: 1024 dnn_neurons: 1024 @@ -95,45 +96,30 @@ test_beam_search: epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. 
-drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -142,6 +128,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + enc: !new:speechbrain.nnet.containers.Sequential input_shape: [null, null, !ref <wav2vec_output_dim>] linear1: !name:speechbrain.nnet.linear.Linear diff --git a/recipes/CommonVoice/ASR/CTC/hparams/train_es_with_wav2vec.yaml b/recipes/CommonVoice/ASR/CTC/hparams/train_es_with_wav2vec.yaml index 8e2056f83735607bf555f5d1be0dfaa63fdb7888..e32a242d1a4f6d09b5d01b0b614e94df08298167 100644 --- a/recipes/CommonVoice/ASR/CTC/hparams/train_es_with_wav2vec.yaml +++ b/recipes/CommonVoice/ASR/CTC/hparams/train_es_with_wav2vec.yaml @@ -32,7 +32,8 @@ skip_prep: False # Skip data preparation # longer sentences certainly correspond to "open microphones". avoid_if_longer_than: 10.0 -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 30 lr: 1.0 lr_wav2vec: 0.0001 @@ -58,7 +59,7 @@ test_dataloader_options: token_type: unigram # ["unigram", "bpe", "char"] character_coverage: 1.0 -# Model parameters +####################### Model Parameters ####################################### wav2vec_output_dim: 1024 dnn_neurons: 1024 freeze_wav2vec: False @@ -96,45 +97,30 @@ test_beam_search: epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. 
-drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -143,6 +129,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + enc: !new:speechbrain.nnet.containers.Sequential input_shape: [null, null, !ref <wav2vec_output_dim>] linear1: !name:speechbrain.nnet.linear.Linear diff --git a/recipes/CommonVoice/ASR/CTC/hparams/train_fr_with_wav2vec.yaml b/recipes/CommonVoice/ASR/CTC/hparams/train_fr_with_wav2vec.yaml index 44f1523f3a666643677ccf5f9808cecab39bcd72..079cfe73fce4ce69541d86017ac971f2daa4f2fc 100644 --- a/recipes/CommonVoice/ASR/CTC/hparams/train_fr_with_wav2vec.yaml +++ b/recipes/CommonVoice/ASR/CTC/hparams/train_fr_with_wav2vec.yaml @@ -32,7 +32,8 @@ skip_prep: False # Skip data preparation # longer sentences certainly correspond to "open microphones". avoid_if_longer_than: 10.0 -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 30 lr: 1.0 lr_wav2vec: 0.0001 @@ -58,7 +59,7 @@ test_dataloader_options: token_type: char # ["unigram", "bpe", "char"] character_coverage: 1.0 -# Model parameters +####################### Model Parameters ####################################### # activation: !name:torch.nn.LeakyReLU wav2vec_output_dim: 1024 dnn_neurons: 1024 @@ -94,45 +95,30 @@ test_beam_search: epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. 
-drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -141,6 +127,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + enc: !new:speechbrain.nnet.containers.Sequential input_shape: [null, null, !ref <wav2vec_output_dim>] linear1: !name:speechbrain.nnet.linear.Linear diff --git a/recipes/CommonVoice/ASR/CTC/hparams/train_it_with_wav2vec.yaml b/recipes/CommonVoice/ASR/CTC/hparams/train_it_with_wav2vec.yaml index 4f39ad2a06a9d7ea3106cd4e0b6565379e8526c3..0332997523960e7d5ff2cb002335f464bb04a70b 100644 --- a/recipes/CommonVoice/ASR/CTC/hparams/train_it_with_wav2vec.yaml +++ b/recipes/CommonVoice/ASR/CTC/hparams/train_it_with_wav2vec.yaml @@ -33,7 +33,8 @@ skip_prep: False # Skip data preparation # longer sentences certainly correspond to "open microphones". avoid_if_longer_than: 8.0 -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 45 lr: 1.0 lr_wav2vec: 0.0001 @@ -59,7 +60,7 @@ test_dataloader_options: token_type: unigram # ["unigram", "bpe", "char"] character_coverage: 1.0 -# Model parameters +####################### Model Parameters ####################################### # activation: !name:torch.nn.LeakyReLU wav2vec_output_dim: 1024 dnn_neurons: 1024 @@ -95,45 +96,30 @@ test_beam_search: epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. 
-drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -141,6 +127,9 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <speed_perturb>, !ref <drop_freq>, !ref <drop_chunk>] + +############################## Models ########################################## + enc: !new:speechbrain.nnet.containers.Sequential input_shape: [null, null, !ref <wav2vec_output_dim>] linear1: !name:speechbrain.nnet.linear.Linear diff --git a/recipes/CommonVoice/ASR/CTC/hparams/train_pt_with_wav2vec.yaml b/recipes/CommonVoice/ASR/CTC/hparams/train_pt_with_wav2vec.yaml index da8a28de58c8c9f97ea2e2d10fd33928f13c9297..d4b703eb456fc66104146de3e1b45e986d6eb9b0 100644 --- a/recipes/CommonVoice/ASR/CTC/hparams/train_pt_with_wav2vec.yaml +++ b/recipes/CommonVoice/ASR/CTC/hparams/train_pt_with_wav2vec.yaml @@ -31,7 +31,8 @@ skip_prep: False # Skip data preparation # longer sentences certainly correspond to "open microphones". avoid_if_longer_than: 10.0 -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 30 lr: 1.0 lr_wav2vec: 0.0001 @@ -57,7 +58,7 @@ test_dataloader_options: token_type: unigram # ["unigram", "bpe", "char"] character_coverage: 1.0 -# Model parameters +####################### Model Parameters ####################################### wav2vec_output_dim: 1024 dnn_neurons: 1024 freeze_wav2vec: False @@ -95,45 +96,30 @@ test_beam_search: epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. 
-drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -142,6 +128,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + enc: !new:speechbrain.nnet.containers.Sequential input_shape: [null, null, !ref <wav2vec_output_dim>] linear1: !name:speechbrain.nnet.linear.Linear diff --git a/recipes/CommonVoice/ASR/CTC/hparams/train_rw_with_wav2vec.yaml b/recipes/CommonVoice/ASR/CTC/hparams/train_rw_with_wav2vec.yaml index f92d8ad137f5d2423032adfd5e8997f9ad006cf5..ed15a8aadada5e157843d953264dc518abed1658 100644 --- a/recipes/CommonVoice/ASR/CTC/hparams/train_rw_with_wav2vec.yaml +++ b/recipes/CommonVoice/ASR/CTC/hparams/train_rw_with_wav2vec.yaml @@ -32,7 +32,8 @@ skip_prep: False # Skip data preparation # longer sentences certainly correspond to "open microphones". avoid_if_longer_than: 10.0 -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 30 lr: 1.0 lr_wav2vec: 0.0001 @@ -59,7 +60,7 @@ test_dataloader_options: token_type: unigram # ["unigram", "bpe", "char"] character_coverage: 1.0 -# Model parameters +####################### Model Parameters ####################################### # activation: !name:torch.nn.LeakyReLU wav2vec_output_dim: 1024 dnn_neurons: 1024 @@ -95,45 +96,30 @@ test_beam_search: epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. 
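For scale: DropChunk lengths are in samples, so at the recipes' 16 kHz sample rate the inline drop_length_low/high of 1000-2000 corresponds to 62.5-125 ms of zeroed audio per chunk, with 1 to 5 chunks per utterance. A small check (dummy all-ones signal; assumes DropChunk's default noise_factor of 0, so dropped chunks are exact zeros):

import torch
from speechbrain.augment.time_domain import DropChunk

drop_chunk = DropChunk(
    drop_length_low=1000, drop_length_high=2000,
    drop_count_low=1, drop_count_high=5,
)
wavs = torch.ones(1, 16000)              # 1 s of dummy "audio" at 16 kHz
out = drop_chunk(wavs, torch.ones(1))    # DropChunk needs relative lengths
zeroed = int((out == 0).sum())
print(f"zeroed {zeroed} samples ~= {1000 * zeroed / 16000:.0f} ms")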
-drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -142,6 +128,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + enc: !new:speechbrain.nnet.containers.Sequential input_shape: [null, null, !ref <wav2vec_output_dim>] linear1: !name:speechbrain.nnet.linear.Linear diff --git a/recipes/CommonVoice/ASR/CTC/hparams/train_zh-CN_with_wav2vec.yaml b/recipes/CommonVoice/ASR/CTC/hparams/train_zh-CN_with_wav2vec.yaml index 513d8d3246ab1aa7ae0454b65b58362d11c5b8a4..a1709931a46d9f6f2c7146e901749b2ac25b2ba1 100644 --- a/recipes/CommonVoice/ASR/CTC/hparams/train_zh-CN_with_wav2vec.yaml +++ b/recipes/CommonVoice/ASR/CTC/hparams/train_zh-CN_with_wav2vec.yaml @@ -32,7 +32,8 @@ skip_prep: False # Skip data preparation # longer sentences certainly correspond to "open microphones". avoid_if_longer_than: 10.0 -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 30 lr: 1.0 lr_wav2vec: 0.0001 @@ -59,7 +60,7 @@ test_dataloader_options: token_type: unigram # ["unigram", "bpe", "char"] character_coverage: 1.0 -# Model parameters +####################### Model Parameters ####################################### wav2vec_output_dim: 1024 dnn_neurons: 1024 freeze_wav2vec: False @@ -97,45 +98,30 @@ test_beam_search: epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. 
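One clarification the deleted inline comments had wrong: drop_freq_low and drop_freq_high are not probabilities. In DropFreq they bound the band-center frequencies that can be dropped, expressed as a fraction of the Nyquist frequency (sample_rate / 2), so the 0-1 range above means "anywhere in the spectrum". A short sketch on a pure tone:

import torch
from speechbrain.augment.time_domain import DropFreq

# Band centers are drawn from [drop_freq_low, drop_freq_high], as a
# fraction of Nyquist; drop_freq_width is the notch width on that scale.
drop_freq = DropFreq(
    drop_freq_low=0, drop_freq_high=1,
    drop_freq_count_low=1, drop_freq_count_high=3,
    drop_freq_width=0.05,
)

t = torch.arange(16000) / 16000
sine = torch.sin(2 * torch.pi * 440 * t).unsqueeze(0)  # [1, time]
print(drop_freq(sine).shape)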
-drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -144,6 +130,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + enc: !new:speechbrain.nnet.containers.Sequential input_shape: [null, null, !ref <wav2vec_output_dim>] linear1: !name:speechbrain.nnet.linear.Linear diff --git a/recipes/CommonVoice/ASR/seq2seq/hparams/train_de.yaml b/recipes/CommonVoice/ASR/seq2seq/hparams/train_de.yaml index b66374147565ae932bd2a9674416a0de2809159a..cb6f2b3be095a6db5d948d76cc5beca09691d96f 100644 --- a/recipes/CommonVoice/ASR/seq2seq/hparams/train_de.yaml +++ b/recipes/CommonVoice/ASR/seq2seq/hparams/train_de.yaml @@ -30,7 +30,8 @@ skip_prep: False # Skip data preparation # longer sentences certainly correspond to "open microphones". 
avoid_if_longer_than: 10.0 -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 25 number_of_ctc_epochs: 20 lr: 1.0 @@ -62,7 +63,7 @@ sample_rate: 16000 n_fft: 400 n_mels: 80 -# Model parameters +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dropout: 0.15 cnn_blocks: 3 @@ -104,51 +105,27 @@ temperature: 1.50 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> +############################## Augmentations ################################### # Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 5 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 5 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "zeros" # Method of dropping chunks - time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 5 + drop_count_high: 5 # Frequency Drop -freq_drop_length_low: 25 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 35 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "zeros" # Method of dropping chunks - freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 25 + drop_length_high: 35 + drop_count_low: 2 + drop_count_high: 2 dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -165,6 +142,8 @@ compute_features: !new:speechbrain.lobes.features.Fbank n_fft: !ref <n_fft> n_mels: !ref <n_mels> +############################## Models ########################################## + enc: !new:speechbrain.lobes.models.CRDNN.CRDNN input_shape: [null, null, !ref <n_mels>] activation: !ref <activation> diff --git a/recipes/CommonVoice/ASR/seq2seq/hparams/train_en.yaml b/recipes/CommonVoice/ASR/seq2seq/hparams/train_en.yaml index c74d25c663d4b3a8cc06b1f323a3901f4f215970..49f9a0d2b76464c2dbc840a3a414fff0c8fd946e 100644 --- a/recipes/CommonVoice/ASR/seq2seq/hparams/train_en.yaml +++ b/recipes/CommonVoice/ASR/seq2seq/hparams/train_en.yaml @@ -29,7 +29,8 @@ skip_prep: False # Skip data preparation # longer sentences certainly correspond to "open microphones". 
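The seq2seq (and transformer) recipes apply the same policy in the feature domain instead: SpectrogramDrop with dim=1 masks time frames, with dim=2 masks mel channels, and Warping warps the time axis, i.e. a SpecAugment-style pipeline over Fbank features. These hunks also drop replace: "zeros", dim: 1, warp_window: 5 and warp_mode: "bicubic" outright, relying on the constructor defaults. A sketch of the resulting pipeline (dummy shapes; same API assumptions as above):

import torch
from speechbrain.augment.augmenter import Augmenter
from speechbrain.augment.freq_domain import SpectrogramDrop, Warping
from speechbrain.lobes.features import Fbank

compute_features = Fbank(sample_rate=16000, n_fft=400, n_mels=80)

time_drop = SpectrogramDrop(drop_length_low=15, drop_length_high=25,
                            drop_count_low=5, drop_count_high=5, dim=1)
freq_drop = SpectrogramDrop(drop_length_low=25, drop_length_high=35,
                            drop_count_low=2, drop_count_high=2, dim=2)
time_warp = Warping()  # warp_window=5, warp_mode="bicubic" by default

fea_augment = Augmenter(min_augmentations=3, max_augmentations=3,
                        augment_prob=1.0,
                        augmentations=[time_drop, freq_drop, time_warp])

wavs = torch.randn(4, 32000)            # [batch, time], 2 s at 16 kHz
feats = compute_features(wavs)          # [batch, frames, n_mels]
aug_feats, feat_lens = fea_augment(feats, torch.ones(4))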
avoid_if_longer_than: 10.0 -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 25 number_of_ctc_epochs: 10 lr: 1.0 @@ -60,7 +61,7 @@ sample_rate: 16000 n_fft: 400 n_mels: 80 -# Model parameters +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dropout: 0.15 cnn_blocks: 3 @@ -102,50 +103,27 @@ temperature: 1.50 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 5 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 5 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "zeros" # Method of dropping chunks +############################## Augmentations ################################### + # Time Drop time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 5 + drop_count_high: 5 # Frequency Drop -freq_drop_length_low: 25 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 35 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "zeros" # Method of dropping chunks - freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 25 + drop_length_high: 35 + drop_count_low: 2 + drop_count_high: 2 dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -162,6 +140,8 @@ compute_features: !new:speechbrain.lobes.features.Fbank n_fft: !ref <n_fft> n_mels: !ref <n_mels> +############################## Models ########################################## + enc: !new:speechbrain.lobes.models.CRDNN.CRDNN input_shape: [null, null, !ref <n_mels>] activation: !ref <activation> diff --git a/recipes/CommonVoice/ASR/seq2seq/hparams/train_es.yaml b/recipes/CommonVoice/ASR/seq2seq/hparams/train_es.yaml index b56a75b697ac15ec55431ae278316d78982dad3b..b94373e9bd0028e1668b37da3736259516fe14e1 100644 --- a/recipes/CommonVoice/ASR/seq2seq/hparams/train_es.yaml +++ b/recipes/CommonVoice/ASR/seq2seq/hparams/train_es.yaml @@ -30,7 +30,8 @@ skip_prep: False # Skip data preparation # longer sentences certainly correspond to "open microphones". 
avoid_if_longer_than: 10.0 -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 25 number_of_ctc_epochs: 20 lr: 1.0 @@ -61,7 +62,7 @@ sample_rate: 16000 n_fft: 400 n_mels: 80 -# Model parameters +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dropout: 0.15 cnn_blocks: 3 @@ -103,50 +104,27 @@ temperature: 1.50 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 5 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 5 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "zeros" # Method of dropping chunks +############################## Augmentations ################################### + # Time Drop time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 5 + drop_count_high: 5 # Frequency Drop -freq_drop_length_low: 25 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 35 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "zeros" # Method of dropping chunks - freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 25 + drop_length_high: 35 + drop_count_low: 2 + drop_count_high: 2 dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -163,6 +141,8 @@ compute_features: !new:speechbrain.lobes.features.Fbank n_fft: !ref <n_fft> n_mels: !ref <n_mels> +############################## Models ########################################## + enc: !new:speechbrain.lobes.models.CRDNN.CRDNN input_shape: [null, null, !ref <n_mels>] activation: !ref <activation> diff --git a/recipes/CommonVoice/ASR/seq2seq/hparams/train_fr.yaml b/recipes/CommonVoice/ASR/seq2seq/hparams/train_fr.yaml index 1c2a85ec8e08d1c7c3ac3d7ff8baf5930dbbd0ca..cc9b0aa995ece9f1e011d0671e59b7fa63780389 100644 --- a/recipes/CommonVoice/ASR/seq2seq/hparams/train_fr.yaml +++ b/recipes/CommonVoice/ASR/seq2seq/hparams/train_fr.yaml @@ -29,7 +29,8 @@ skip_prep: False # Skip data preparation # longer sentences certainly correspond to "open microphones". 
avoid_if_longer_than: 10.0 -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 25 number_of_ctc_epochs: 20 lr: 1.0 @@ -60,7 +61,7 @@ sample_rate: 16000 n_fft: 400 n_mels: 80 -# Model parameters +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dropout: 0.15 cnn_blocks: 3 @@ -102,50 +103,27 @@ temperature: 1.50 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 5 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 5 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "zeros" # Method of dropping chunks +############################## Augmentations ################################### + # Time Drop time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 5 + drop_count_high: 5 # Frequency Drop -freq_drop_length_low: 25 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 35 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "zeros" # Method of dropping chunks - freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 25 + drop_length_high: 35 + drop_count_low: 2 + drop_count_high: 2 dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -162,6 +140,8 @@ compute_features: !new:speechbrain.lobes.features.Fbank n_fft: !ref <n_fft> n_mels: !ref <n_mels> +############################## Models ########################################## + enc: !new:speechbrain.lobes.models.CRDNN.CRDNN input_shape: [null, null, !ref <n_mels>] activation: !ref <activation> diff --git a/recipes/CommonVoice/ASR/seq2seq/hparams/train_it.yaml b/recipes/CommonVoice/ASR/seq2seq/hparams/train_it.yaml index bf9211caf5c728f89148c07ff2710c8120d8f823..2c0355ae55c9fa226a43f7081309c95fb3559814 100644 --- a/recipes/CommonVoice/ASR/seq2seq/hparams/train_it.yaml +++ b/recipes/CommonVoice/ASR/seq2seq/hparams/train_it.yaml @@ -29,7 +29,8 @@ skip_prep: False # Skip data preparation # longer sentences certainly correspond to "open microphones". 
avoid_if_longer_than: 8.0 -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 50 number_of_ctc_epochs: 40 lr: 1.0 @@ -59,7 +60,7 @@ sample_rate: 16000 n_fft: 400 n_mels: 80 -# Model parameters +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dropout: 0.15 cnn_blocks: 3 @@ -101,50 +102,27 @@ temperature: 1.50 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 5 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 5 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "zeros" # Method of dropping chunks +############################## Augmentations ################################### + # Time Drop time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 5 + drop_count_high: 5 # Frequency Drop -freq_drop_length_low: 25 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 35 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "zeros" # Method of dropping chunks - freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 25 + drop_length_high: 35 + drop_count_low: 2 + drop_count_high: 2 dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -161,6 +139,8 @@ compute_features: !new:speechbrain.lobes.features.Fbank n_fft: !ref <n_fft> n_mels: !ref <n_mels> +############################## Models ########################################## + enc: !new:speechbrain.lobes.models.CRDNN.CRDNN input_shape: [null, null, !ref <n_mels>] activation: !ref <activation> diff --git a/recipes/CommonVoice/ASR/seq2seq/hparams/train_rw.yaml b/recipes/CommonVoice/ASR/seq2seq/hparams/train_rw.yaml index dd665ab24a6fa38c61b7fcc5bbcd43e521609865..8bc89c1c4bbced9dc62234b0dfbe0507ffd50374 100644 --- a/recipes/CommonVoice/ASR/seq2seq/hparams/train_rw.yaml +++ b/recipes/CommonVoice/ASR/seq2seq/hparams/train_rw.yaml @@ -29,7 +29,8 @@ skip_prep: False # Skip data preparation # longer sentences certainly correspond to "open microphones". 
avoid_if_longer_than: 8.0 -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 25 number_of_ctc_epochs: 20 lr: 1.0 @@ -59,7 +60,7 @@ sample_rate: 16000 n_fft: 400 n_mels: 80 -# Model parameters +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dropout: 0.15 cnn_blocks: 3 @@ -102,50 +103,27 @@ epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 5 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 5 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "zeros" # Method of dropping chunks +############################## Augmentations ################################### + # Time Drop time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 5 + drop_count_high: 5 # Frequency Drop -freq_drop_length_low: 25 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 35 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "zeros" # Method of dropping chunks - freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 25 + drop_length_high: 35 + drop_count_low: 2 + drop_count_high: 2 dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -162,6 +140,8 @@ compute_features: !new:speechbrain.lobes.features.Fbank n_fft: !ref <n_fft> n_mels: !ref <n_mels> +############################## Models ########################################## + enc: !new:speechbrain.lobes.models.CRDNN.CRDNN input_shape: [null, null, !ref <n_mels>] activation: !ref <activation> diff --git a/recipes/CommonVoice/ASR/transducer/hparams/train_de.yaml b/recipes/CommonVoice/ASR/transducer/hparams/train_de.yaml index 8faaa805f53ce65bdd026115294a5cede6727d3c..9bbab166930f6e31a086be82c595a307961db551 100644 --- a/recipes/CommonVoice/ASR/transducer/hparams/train_de.yaml +++ b/recipes/CommonVoice/ASR/transducer/hparams/train_de.yaml @@ -32,7 +32,7 @@ skip_prep: False # Skip data preparation # longer sentences certainly correspond to "open microphones". 
avoid_if_longer_than: 10.0 -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 30 batch_size: 6 batch_size_valid: 1 @@ -71,7 +71,7 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: !ref <batch_size_valid> -# Model parameters +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dropout: 0.15 cnn_blocks: 3 @@ -109,50 +109,28 @@ compute_features: !new:speechbrain.lobes.features.Fbank n_fft: !ref <n_fft> n_mels: !ref <n_mels> -# Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 5 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 5 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "zeros" # Method of dropping chunks +############################## Augmentations ################################### +# Time Drop time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 5 + drop_count_high: 5 # Frequency Drop -freq_drop_length_low: 25 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 35 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "zeros" # Method of dropping chunks - freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 25 + drop_length_high: 35 + drop_count_low: 3 + drop_count_high: 3 dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -161,6 +139,8 @@ fea_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <freq_drop>, !ref <time_warp>] +############################## Models ########################################## + enc: !new:speechbrain.lobes.models.CRDNN.CRDNN input_shape: [null, null, !ref <n_mels>] activation: !ref <activation> diff --git a/recipes/CommonVoice/ASR/transducer/hparams/train_fr.yaml b/recipes/CommonVoice/ASR/transducer/hparams/train_fr.yaml index 6c3f0bc7d181b47f0462747281874f56795d4152..c96a0939466a88d7e364efe5d69226f020d702e2 100644 --- a/recipes/CommonVoice/ASR/transducer/hparams/train_fr.yaml +++ b/recipes/CommonVoice/ASR/transducer/hparams/train_fr.yaml @@ -32,7 +32,7 @@ skip_prep: False # Skip data preparation # longer sentences certainly correspond to "open microphones". 
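Since the transducer hunks likewise remove replace, dim, warp_window and warp_mode without re-specifying them, the refactor is only behavior-preserving if the defaults match the deleted values. A quick check, hedged because the attribute names are assumed to mirror the constructor arguments:

from speechbrain.augment.freq_domain import SpectrogramDrop, Warping

drop = SpectrogramDrop()
warp = Warping()
print(drop.replace, drop.dim)                       # expected: zeros 1
print(warp.warp_window, warp.warp_mode, warp.dim)   # expected: 5 bicubic 1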
avoid_if_longer_than: 10.0 -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 30 batch_size: 6 batch_size_valid: 1 @@ -71,7 +71,7 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: !ref <batch_size_valid> -# Model parameters +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dropout: 0.15 cnn_blocks: 3 @@ -109,50 +109,28 @@ compute_features: !new:speechbrain.lobes.features.Fbank n_fft: !ref <n_fft> n_mels: !ref <n_mels> -# Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 5 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 5 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "zeros" # Method of dropping chunks +############################## Augmentations ################################### +# Time Drop time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 5 + drop_count_high: 5 # Frequency Drop -freq_drop_length_low: 25 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 35 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "zeros" # Method of dropping chunks - freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 25 + drop_length_high: 35 + drop_count_low: 3 + drop_count_high: 3 dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -161,6 +139,8 @@ fea_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <freq_drop>, !ref <time_warp>] +############################## Models ########################################## + enc: !new:speechbrain.lobes.models.CRDNN.CRDNN input_shape: [null, null, !ref <n_mels>] activation: !ref <activation> diff --git a/recipes/CommonVoice/ASR/transducer/hparams/train_it.yaml b/recipes/CommonVoice/ASR/transducer/hparams/train_it.yaml index a645f981546215a71cea77bceca53503137a1c01..cf366205efffe515ddc91d4a31bb48d75f505bc3 100644 --- a/recipes/CommonVoice/ASR/transducer/hparams/train_it.yaml +++ b/recipes/CommonVoice/ASR/transducer/hparams/train_it.yaml @@ -32,7 +32,7 @@ skip_prep: False # Skip data preparation # longer sentences certainly correspond to "open microphones". 
avoid_if_longer_than: 10.0 -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 30 batch_size: 6 batch_size_valid: 1 @@ -71,7 +71,7 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: !ref <batch_size_valid> -# Model parameters +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dropout: 0.15 cnn_blocks: 3 @@ -109,50 +109,28 @@ compute_features: !new:speechbrain.lobes.features.Fbank n_fft: !ref <n_fft> n_mels: !ref <n_mels> -# Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 5 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 5 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "zeros" # Method of dropping chunks +############################## Augmentations ################################### +# Time Drop time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 5 + drop_count_high: 5 # Frequency Drop -freq_drop_length_low: 25 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 35 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "zeros" # Method of dropping chunks - freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 25 + drop_length_high: 35 + drop_count_low: 3 + drop_count_high: 3 dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -161,6 +139,8 @@ fea_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <freq_drop>, !ref <time_warp>] +############################## Models ########################################## + enc: !new:speechbrain.lobes.models.CRDNN.CRDNN input_shape: [null, null, !ref <n_mels>] activation: !ref <activation> diff --git a/recipes/CommonVoice/ASR/transducer/train.py b/recipes/CommonVoice/ASR/transducer/train.py index 1782408b80c8d1305a99b7cb02c0de7f9df02d51..0304aabc810bfba7df541982bbc5d62c8c3f83b2 100644 --- a/recipes/CommonVoice/ASR/transducer/train.py +++ b/recipes/CommonVoice/ASR/transducer/train.py @@ -134,26 +134,22 @@ class ASR(sb.Brain): if stage == sb.Stage.TRAIN: if hasattr(self.hparams, "wav_augment"): - tokens = self.hparams.wav_augment.replicate_labels(tokens) - token_lens = 
self.hparams.wav_augment.replicate_labels( - token_lens - ) - tokens_eos = self.hparams.wav_augment.replicate_labels( - tokens_eos - ) - token_eos_lens = self.hparams.wav_augment.replicate_labels( - token_eos_lens + ( + tokens, + token_lens, + tokens_eos, + token_eos_lens, + ) = self.hparams.wav_augment.replicate_multiple_labels( + tokens, token_lens, tokens_eos, token_eos_lens ) if hasattr(self.hparams, "fea_augment"): - tokens = self.hparams.fea_augment.replicate_labels(tokens) - token_lens = self.hparams.fea_augment.replicate_labels( - token_lens - ) - tokens_eos = self.hparams.fea_augment.replicate_labels( - tokens_eos - ) - token_eos_lens = self.hparams.fea_augment.replicate_labels( - token_eos_lens + ( + tokens, + token_lens, + tokens_eos, + token_eos_lens, + ) = self.hparams.fea_augment.replicate_multiple_labels( + tokens, token_lens, tokens_eos, token_eos_lens ) if stage == sb.Stage.TRAIN: diff --git a/recipes/CommonVoice/ASR/transformer/hparams/train_ar_hf_whisper.yaml b/recipes/CommonVoice/ASR/transformer/hparams/train_ar_hf_whisper.yaml index 2b358e6d401b3b2acfde98dcccc28f77849b9ce4..d33c50c2bacb21e3bea276f428933d63f92cdb22 100644 --- a/recipes/CommonVoice/ASR/transformer/hparams/train_ar_hf_whisper.yaml +++ b/recipes/CommonVoice/ASR/transformer/hparams/train_ar_hf_whisper.yaml @@ -37,7 +37,7 @@ avoid_if_longer_than: 10.0 ckpt_interval_minutes: 30 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 1 lr_whisper: 0.00003 sorting: ascending @@ -63,7 +63,7 @@ min_decode_ratio: 0.0 max_decode_ratio: 1.0 test_beam_size: 8 -# Model parameters +####################### Model Parameters ####################################### freeze_whisper: False freeze_encoder: True @@ -82,45 +82,30 @@ test_loader_kwargs: epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. 
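On the train.py hunk above: label replication is needed whenever the Augmenter enlarges the batch (concat_original: True, repeat_augment > 1, or parallel_augment: True), since targets must be copied to stay aligned with the augmented waveforms; the change collapses the four repeated replicate_labels calls into a single replicate_multiple_labels call per augmenter. A sketch of the updated usage, replicating after running the augmenter as the recipe does (shapes illustrative):

import torch
from speechbrain.augment.augmenter import Augmenter
from speechbrain.augment.time_domain import DropFreq

# concat_original=True returns [originals; augmented], doubling the batch.
augmenter = Augmenter(concat_original=True,
                      min_augmentations=1, max_augmentations=1,
                      augmentations=[DropFreq()])

wavs, lens = torch.randn(4, 16000), torch.ones(4)
tokens = torch.randint(0, 100, (4, 12))
token_lens = torch.ones(4)

aug_wavs, aug_lens = augmenter(wavs, lengths=lens)   # batch: 4 -> 8
tokens, token_lens = augmenter.replicate_multiple_labels(tokens, token_lens)
print(aug_wavs.shape[0], tokens.shape[0])            # 8 8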
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -129,6 +114,7 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## whisper: !new:speechbrain.lobes.models.huggingface_transformers.whisper.Whisper source: !ref <whisper_hub> diff --git a/recipes/CommonVoice/ASR/transformer/hparams/train_de.yaml b/recipes/CommonVoice/ASR/transformer/hparams/train_de.yaml index dbeb56be51634d702208568dad9dd8a7cdd285e0..c5533e9bb30af916ba32d6554e7dd84ebf90f864 100644 --- a/recipes/CommonVoice/ASR/transformer/hparams/train_de.yaml +++ b/recipes/CommonVoice/ASR/transformer/hparams/train_de.yaml @@ -33,7 +33,7 @@ avoid_if_longer_than: 10.0 ckpt_interval_minutes: 15 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 50 batch_size: 32 # This works with a 32GB GPU ! (bs * nb_gpu * accum) > 128 ! 
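By contrast, the Whisper fine-tuning recipes keep the Augmenter defaults (concat_original: False, repeat_augment: 1, parallel_augment: False), so wav_augment leaves the batch size unchanged and no label replication is required; with min_augmentations = max_augmentations = 3, augment_prob = 1.0 and shuffle_augmentations left False, all three augmentations run in list order on every training batch. A quick shape check under the same assumptions:

import torch
from speechbrain.augment.augmenter import Augmenter
from speechbrain.augment.time_domain import DropChunk, DropFreq, SpeedPerturb

wav_augment = Augmenter(
    min_augmentations=3, max_augmentations=3, augment_prob=1.0,
    augmentations=[SpeedPerturb(orig_freq=16000, speeds=[95, 100, 105]),
                   DropFreq(), DropChunk()])

wavs, lens = torch.randn(8, 16000), torch.ones(8)
aug_wavs, aug_lens = wav_augment(wavs, lengths=lens)
print(aug_wavs.shape[0] == wavs.shape[0])   # True: batch size preserved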
ctc_weight: 0.3 @@ -70,7 +70,7 @@ test_dataloader_opts: batch_size: !ref <batch_size> num_workers: 6 -####################### Model parameters ########################### +####################### Model Parameters ########################### # Transformer d_model: 768 nhead: 8 @@ -213,50 +213,27 @@ normalize: !new:speechbrain.processing.features.InputNormalization norm_type: global update_until_epoch: 3 -# Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 5 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 5 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "zeros" # Method of dropping chunks +############################## Augmentations ################################### +# Time Drop time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 5 + drop_count_high: 5 # Frequency Drop -freq_drop_length_low: 25 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 35 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "zeros" # Method of dropping chunks - freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 25 + drop_length_high: 35 + drop_count_low: 2 + drop_count_high: 2 dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 diff --git a/recipes/CommonVoice/ASR/transformer/hparams/train_fa_hf_whisper.yaml b/recipes/CommonVoice/ASR/transformer/hparams/train_fa_hf_whisper.yaml index bc01810449814b8f7ff293e4315aa4557c46f31f..bb23c98a6ede2b4aeedbba67a8fb4ba61071cf53 100644 --- a/recipes/CommonVoice/ASR/transformer/hparams/train_fa_hf_whisper.yaml +++ b/recipes/CommonVoice/ASR/transformer/hparams/train_fa_hf_whisper.yaml @@ -37,7 +37,7 @@ avoid_if_longer_than: 10.0 ckpt_interval_minutes: 30 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 1 lr_whisper: 0.00003 sorting: ascending @@ -63,7 +63,7 @@ min_decode_ratio: 0.0 max_decode_ratio: 1.0 test_beam_size: 8 -# Model parameters +####################### Model Parameters ####################################### freeze_whisper: False freeze_encoder: True @@ -82,45 +82,30 @@ test_loader_kwargs: epoch_counter: 
!new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -129,6 +114,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + whisper: !new:speechbrain.lobes.models.huggingface_transformers.whisper.Whisper source: !ref <whisper_hub> diff --git a/recipes/CommonVoice/ASR/transformer/hparams/train_fr.yaml b/recipes/CommonVoice/ASR/transformer/hparams/train_fr.yaml index 120305e73eb0a295fd1824600e6cc5d8c40ccc29..e62d9c390a91a49253f93e6c6e1587ff18f67639 100644 --- a/recipes/CommonVoice/ASR/transformer/hparams/train_fr.yaml +++ b/recipes/CommonVoice/ASR/transformer/hparams/train_fr.yaml @@ -33,7 +33,7 @@ avoid_if_longer_than: 10.0 ckpt_interval_minutes: 15 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 50 batch_size: 32 # This works with a 32GB GPU ! (bs * nb_gpu * accum) > 128 ! 
ctc_weight: 0.3 @@ -70,7 +70,7 @@ test_dataloader_opts: batch_size: !ref <batch_size> num_workers: 6 -####################### Model parameters ########################### +####################### Model Parameters ########################### # Transformer d_model: 768 nhead: 8 @@ -213,50 +213,27 @@ normalize: !new:speechbrain.processing.features.InputNormalization norm_type: global update_until_epoch: 3 -# Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 5 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 5 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "zeros" # Method of dropping chunks +############################## Augmentations ################################### +# Time Drop time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 5 + drop_count_high: 5 # Frequency Drop -freq_drop_length_low: 25 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 35 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "zeros" # Method of dropping chunks - freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 25 + drop_length_high: 35 + drop_count_low: 2 + drop_count_high: 2 dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 diff --git a/recipes/CommonVoice/ASR/transformer/hparams/train_fr_hf_whisper.yaml b/recipes/CommonVoice/ASR/transformer/hparams/train_fr_hf_whisper.yaml index da5cbd28fbf685b9d37f982901bae38e16d20f0b..62363bdada17730280ee59bc48ad90b5ed17f3a7 100644 --- a/recipes/CommonVoice/ASR/transformer/hparams/train_fr_hf_whisper.yaml +++ b/recipes/CommonVoice/ASR/transformer/hparams/train_fr_hf_whisper.yaml @@ -37,7 +37,7 @@ avoid_if_longer_than: 10.0 ckpt_interval_minutes: 30 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 1 lr_whisper: 0.00003 sorting: ascending @@ -63,7 +63,7 @@ min_decode_ratio: 0.0 max_decode_ratio: 1.0 test_beam_size: 8 -# Model parameters +####################### Model Parameters ####################################### freeze_whisper: False freeze_encoder: True @@ -82,45 +82,30 @@ test_loader_kwargs: epoch_counter: 
!new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -129,6 +114,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + whisper: !new:speechbrain.lobes.models.huggingface_transformers.whisper.Whisper source: !ref <whisper_hub> diff --git a/recipes/CommonVoice/ASR/transformer/hparams/train_hi_hf_whisper.yaml b/recipes/CommonVoice/ASR/transformer/hparams/train_hi_hf_whisper.yaml index 8b130a83d4fae7e448408b06497ce9fdd075d84b..e21852639b9b75437dc27e1dd9e10a963de7f0b9 100644 --- a/recipes/CommonVoice/ASR/transformer/hparams/train_hi_hf_whisper.yaml +++ b/recipes/CommonVoice/ASR/transformer/hparams/train_hi_hf_whisper.yaml @@ -37,7 +37,7 @@ avoid_if_longer_than: 10.0 ckpt_interval_minutes: 30 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 1 lr_whisper: 0.00003 sorting: ascending @@ -63,7 +63,7 @@ min_decode_ratio: 0.0 max_decode_ratio: 1.0 test_beam_size: 8 -# Model parameters +####################### Model Parameters ####################################### freeze_whisper: False freeze_encoder: True @@ -82,45 +82,30 @@ test_loader_kwargs: epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of 
speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -129,6 +114,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + whisper: !new:speechbrain.lobes.models.huggingface_transformers.whisper.Whisper source: !ref <whisper_hub> diff --git a/recipes/CommonVoice/ASR/transformer/hparams/train_it.yaml b/recipes/CommonVoice/ASR/transformer/hparams/train_it.yaml index d937359b54b33c6c8ed4f77a20ea45f54d59d70a..d95fbaffae6d07fda47183b3a97d192b487bf649 100644 --- a/recipes/CommonVoice/ASR/transformer/hparams/train_it.yaml +++ b/recipes/CommonVoice/ASR/transformer/hparams/train_it.yaml @@ -33,7 +33,7 @@ avoid_if_longer_than: 10.0 ckpt_interval_minutes: 15 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 50 batch_size: 32 # This works with a 32GB GPU ! (bs * nb_gpu * accum) > 128 ! 
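The batch_size comment above encodes the rule of thumb used across these Transformer recipes: the global batch is batch_size * nb_gpu * grad_accumulation_factor and should exceed 128. A minimal sketch of that arithmetic in Python (the function name and GPU counts are illustrative, not taken from the recipes):

def effective_batch_size(batch_size: int, n_gpus: int, grad_accumulation: int) -> int:
    # Global batch the optimizer effectively sees per update step.
    return batch_size * n_gpus * grad_accumulation

assert effective_batch_size(32, 4, 1) == 128  # batch_size 32 on 4 GPUs, no accumulation
assert effective_batch_size(32, 1, 4) == 128  # same global batch on 1 GPU via accumulation

Raising grad_accumulation_factor is the usual way to reach the target on a single GPU without increasing memory use.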
ctc_weight: 0.3 @@ -70,7 +70,7 @@ test_dataloader_opts: batch_size: !ref <batch_size> num_workers: 6 -####################### Model parameters ########################### +####################### Model Parameters ########################### # Transformer d_model: 768 nhead: 8 @@ -213,50 +213,27 @@ normalize: !new:speechbrain.processing.features.InputNormalization norm_type: global update_until_epoch: 3 -# Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 5 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 5 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "zeros" # Method of dropping chunks +############################## Augmentations ################################### +# Time Drop time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 5 + drop_count_high: 5 # Frequency Drop -freq_drop_length_low: 25 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 35 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "zeros" # Method of dropping chunks - freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 25 + drop_length_high: 35 + drop_count_low: 2 + drop_count_high: 2 dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 diff --git a/recipes/CommonVoice/ASR/transformer/hparams/train_it_hf_whisper.yaml b/recipes/CommonVoice/ASR/transformer/hparams/train_it_hf_whisper.yaml index 5670f4fe813f5915174f7e7593bd951453952eab..e1fc08263096249cc2a2396263ed07029769c8e4 100644 --- a/recipes/CommonVoice/ASR/transformer/hparams/train_it_hf_whisper.yaml +++ b/recipes/CommonVoice/ASR/transformer/hparams/train_it_hf_whisper.yaml @@ -37,7 +37,7 @@ avoid_if_longer_than: 10.0 ckpt_interval_minutes: 30 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 1 lr_whisper: 0.00003 sorting: ascending @@ -63,7 +63,7 @@ min_decode_ratio: 0.0 max_decode_ratio: 1.0 test_beam_size: 8 -# Model parameters +####################### Model Parameters ####################################### freeze_whisper: False freeze_encoder: True @@ -82,45 +82,30 @@ test_loader_kwargs: epoch_counter: 
!new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -129,6 +114,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + whisper: !new:speechbrain.lobes.models.huggingface_transformers.whisper.Whisper source: !ref <whisper_hub> diff --git a/recipes/CommonVoice/ASR/transformer/hparams/train_mn_hf_whisper.yaml b/recipes/CommonVoice/ASR/transformer/hparams/train_mn_hf_whisper.yaml index 9ffdc95fd6a7cfe58245e1fb3179a7d21d92a7fe..fe4fd6f17314fd33df1cce88f14f5ae5330c6fb8 100644 --- a/recipes/CommonVoice/ASR/transformer/hparams/train_mn_hf_whisper.yaml +++ b/recipes/CommonVoice/ASR/transformer/hparams/train_mn_hf_whisper.yaml @@ -37,7 +37,7 @@ avoid_if_longer_than: 10.0 ckpt_interval_minutes: 30 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 1 lr_whisper: 0.00003 sorting: ascending @@ -63,7 +63,7 @@ min_decode_ratio: 0.0 max_decode_ratio: 1.0 test_beam_size: 8 -# Model parameters +####################### Model Parameters ####################################### freeze_whisper: False freeze_encoder: True @@ -82,45 +82,30 @@ test_loader_kwargs: epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of 
speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -129,6 +114,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + whisper: !new:speechbrain.lobes.models.huggingface_transformers.whisper.Whisper source: !ref <whisper_hub> diff --git a/recipes/CommonVoice/ASR/transformer/hparams/train_sr_hf_whisper.yaml b/recipes/CommonVoice/ASR/transformer/hparams/train_sr_hf_whisper.yaml index 4f257094d23ed50c6a12720d88fb41987beeb6df..d7390d9a59374591e54f9e69cd40dbf08b0f7ef7 100644 --- a/recipes/CommonVoice/ASR/transformer/hparams/train_sr_hf_whisper.yaml +++ b/recipes/CommonVoice/ASR/transformer/hparams/train_sr_hf_whisper.yaml @@ -37,7 +37,7 @@ avoid_if_longer_than: 10.0 ckpt_interval_minutes: 30 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 1 lr_whisper: 0.00003 sorting: ascending @@ -63,7 +63,7 @@ min_decode_ratio: 0.0 max_decode_ratio: 1.0 test_beam_size: 8 -# Model parameters +####################### Model Parameters ####################################### freeze_whisper: False freeze_encoder: True @@ -83,45 +83,30 @@ test_loader_kwargs: epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation 
speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -130,6 +115,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + whisper: !new:speechbrain.lobes.models.huggingface_transformers.whisper.Whisper source: !ref <whisper_hub> diff --git a/recipes/CommonVoice/ASR/transformer/train.py b/recipes/CommonVoice/ASR/transformer/train.py index 0aee6735efe0f21aaf46bcc2a248477e254233f2..89847d352e28d65dd2b23717b2dc134f89ed65dc 100644 --- a/recipes/CommonVoice/ASR/transformer/train.py +++ b/recipes/CommonVoice/ASR/transformer/train.py @@ -107,27 +107,25 @@ class ASR(sb.core.Brain): # Augment Labels if stage == sb.Stage.TRAIN: + # Labels must be extended if parallel augmentation or concatenated + # augmentation was performed on the input (increasing the time dimension) if hasattr(self.hparams, "wav_augment"): - tokens = self.hparams.wav_augment.replicate_labels(tokens) - tokens_lens = self.hparams.wav_augment.replicate_labels( - tokens_lens - ) - tokens_eos = self.hparams.wav_augment.replicate_labels( - tokens_eos - ) - tokens_eos_lens = self.hparams.wav_augment.replicate_labels( - tokens_eos_lens + ( + tokens, + tokens_lens, + tokens_eos, + tokens_eos_lens, + ) = self.hparams.wav_augment.replicate_multiple_labels( + tokens, tokens_lens, tokens_eos, tokens_eos_lens ) if hasattr(self.hparams, "fea_augment"): - tokens = self.hparams.fea_augment.replicate_labels(tokens) - tokens_lens = self.hparams.fea_augment.replicate_labels( - tokens_lens - ) - tokens_eos = self.hparams.fea_augment.replicate_labels( - tokens_eos - ) - tokens_eos_lens = 
self.hparams.fea_augment.replicate_labels( - tokens_eos_lens + ( + tokens, + tokens_lens, + tokens_eos, + tokens_eos_lens, + ) = self.hparams.fea_augment.replicate_multiple_labels( + tokens, tokens_lens, tokens_eos, tokens_eos_lens ) loss_seq = self.hparams.seq_cost( diff --git a/recipes/CommonVoice/self-supervised-learning/wav2vec2/hparams/wav2vec2_base.yaml b/recipes/CommonVoice/self-supervised-learning/wav2vec2/hparams/wav2vec2_base.yaml index 5ead6dbc0616594e2c7e6d9225d51895795398d6..e7ceed4f5cf9c0b0c47bc1ed433c9cb8dfe90282 100644 --- a/recipes/CommonVoice/self-supervised-learning/wav2vec2/hparams/wav2vec2_base.yaml +++ b/recipes/CommonVoice/self-supervised-learning/wav2vec2/hparams/wav2vec2_base.yaml @@ -27,11 +27,11 @@ skip_prep: False # We remove utterances longer than 10s in the train/dev/test sets as -# longer sentences certainly correspond to "open microphones". +# longer sentences certainly correspond to open microphones. avoid_if_longer_than: 10.0 avoid_if_shorter_than: 1.0 -# Training parameters +####################### Training Parameters #################################### # Parameters correspond to the ones reported in the official wav2vec2 # paper (for the masking). mask_length: 10 @@ -52,8 +52,8 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # IMPORTANT: To train the w2v2 model, we recommend having the effective batch_size # higher than 100 (batch_size * nb_gpu * grad_accumulation_factor) # Examples are: -# 32 Tesla V100 32GB — 12 * 32 * 1 -# 4 Tesla V100 32GB — 12 * 4 * {6-8} +# 32 Tesla V100 32GB = 12 * 32 * 1 +# 4 Tesla V100 32GB = 12 * 4 * (6-8) batch_size: 12 test_batch_size: 8 grad_accumulation_factor: 8 @@ -104,7 +104,7 @@ modules: wav2vec2: !ref <wav2vec2> opt_class: !name:torch.optim.AdamW - lr: 0 # Will be changed by the scheduler, but we start at 0! + lr: 0 # Will be changed by the scheduler, but we start at 0 betas: (0.9, 0.98) eps: 0.000000001 weight_decay: !ref <weight_decay> diff --git a/recipes/DNS/enhancement/hparams/sepformer-dns-16k.yaml b/recipes/DNS/enhancement/hparams/sepformer-dns-16k.yaml index d43a2f0a951de5a2be2a1c6e0a509cd57ee4fc10..87a07c97aba76e5bf0d0e3f500ee261f1b935cec 100644 --- a/recipes/DNS/enhancement/hparams/sepformer-dns-16k.yaml +++ b/recipes/DNS/enhancement/hparams/sepformer-dns-16k.yaml @@ -39,7 +39,7 @@ sample_rate: 16000 audio_length: 4 # seconds n_audio_to_save: 20 -# Training parameters +####################### Training Parameters #################################### N_epochs: 100 batch_size: 4 batch_size_test: 1 diff --git a/recipes/DVoice/ASR/CTC/hparams/train_amh_with_wav2vec.yaml b/recipes/DVoice/ASR/CTC/hparams/train_amh_with_wav2vec.yaml index 551ba2c19e1a64e8b0ab5467ec938851b0da86be..e9e1f43100f823790eec7e5d2649688dbb5d5574 100644 --- a/recipes/DVoice/ASR/CTC/hparams/train_amh_with_wav2vec.yaml +++ b/recipes/DVoice/ASR/CTC/hparams/train_amh_with_wav2vec.yaml @@ -32,7 +32,7 @@ skip_prep: False # Skip data preparation # longer sentences certainly correspond to "open microphones".
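The train.py hunk earlier in this diff collapses four separate replicate_labels calls into a single replicate_multiple_labels call on the augmenter. A minimal sketch of what such label replication amounts to, assuming each label tensor is tiled along the batch dimension once per augmented copy (an illustration of the idea, not the SpeechBrain implementation):

import torch

def replicate_multiple_labels(n_copies, *label_tensors):
    # Tile every label tensor along the batch dimension so labels stay
    # aligned with a batch that grew n_copies times during augmentation.
    return tuple(torch.cat([t] * n_copies, dim=0) for t in label_tensors)

tokens = torch.zeros(2, 10, dtype=torch.long)  # two label sequences
tokens_lens = torch.ones(2)                    # relative lengths
tokens, tokens_lens = replicate_multiple_labels(2, tokens, tokens_lens)
assert tokens.shape[0] == 4 and tokens_lens.shape[0] == 4

Handling all four tensors in one call keeps them in sync and removes the repetition the old code had.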
avoid_if_longer_than: 15.0 -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 30 lr: 1.0 lr_wav2vec: 0.0001 @@ -58,7 +58,7 @@ test_dataloader_options: token_type: char # ["unigram", "bpe", "char"] character_coverage: 1.0 -# Model parameters +####################### Model Parameters ####################################### wav2vec_output_dim: 1024 dnn_neurons: 1024 freeze_wav2vec: False @@ -78,45 +78,31 @@ eos_index: 2 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 @@ -125,6 +111,7 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## enc: !new:speechbrain.nnet.containers.Sequential input_shape: [null, null, !ref <wav2vec_output_dim>] diff --git a/recipes/DVoice/ASR/CTC/hparams/train_dar_with_wav2vec.yaml b/recipes/DVoice/ASR/CTC/hparams/train_dar_with_wav2vec.yaml index 0b3647705fde6caf51a350bb86313a24ba657b7a..d1e2c66842028ae18a908eb06ba949e019977d66 100644 --- a/recipes/DVoice/ASR/CTC/hparams/train_dar_with_wav2vec.yaml +++ b/recipes/DVoice/ASR/CTC/hparams/train_dar_with_wav2vec.yaml @@ -32,7 +32,7 @@ skip_prep: False # Skip data preparation # longer sentences certainly correspond to "open microphones". 
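The speed_perturb entries above now inline speeds: [95, 100, 105], resampling factors expressed in percent. Speed perturbation of this kind is commonly realized by resampling; a rough sketch with torchaudio (the resampling mechanism is an assumption here, not a quote of SpeechBrain's code):

import torch
import torchaudio.functional as AF

def speed_perturb(wav: torch.Tensor, orig_freq: int = 16000, speed: int = 95) -> torch.Tensor:
    # Resample so the output keeps speed/100 of the samples; read back at
    # orig_freq this rescales the duration, changing the apparent speaking
    # rate while the transcript stays valid.
    return AF.resample(wav, orig_freq=orig_freq, new_freq=orig_freq * speed // 100)

perturbed = speed_perturb(torch.randn(1, 16000), speed=95)  # ~15200 samples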
avoid_if_longer_than: 15.0 -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 30 lr: 1.0 lr_wav2vec: 0.0001 @@ -58,7 +58,7 @@ test_dataloader_options: token_type: char # ["unigram", "bpe", "char"] character_coverage: 1.0 -# Model parameters +####################### Model Parameters ####################################### wav2vec_output_dim: 1024 dnn_neurons: 1024 freeze_wav2vec: False @@ -78,45 +78,31 @@ eos_index: 2 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 @@ -125,6 +111,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + enc: !new:speechbrain.nnet.containers.Sequential input_shape: [null, null, !ref <wav2vec_output_dim>] diff --git a/recipes/DVoice/ASR/CTC/hparams/train_fon_with_wav2vec.yaml b/recipes/DVoice/ASR/CTC/hparams/train_fon_with_wav2vec.yaml index 946ca0b6ff62e1c9ab049a2e7f59a09f3e1ab491..fca0230de227bd68b964653f04808e43d0e484c9 100644 --- a/recipes/DVoice/ASR/CTC/hparams/train_fon_with_wav2vec.yaml +++ b/recipes/DVoice/ASR/CTC/hparams/train_fon_with_wav2vec.yaml @@ -32,7 +32,7 @@ skip_prep: False # Skip data preparation # longer sentences certainly correspond to "open microphones". 
avoid_if_longer_than: 15.0 -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 30 lr: 1.0 lr_wav2vec: 0.0001 @@ -58,7 +58,7 @@ test_dataloader_options: token_type: char # ["unigram", "bpe", "char"] character_coverage: 1.0 -# Model parameters +####################### Model Parameters ####################################### wav2vec_output_dim: 1024 dnn_neurons: 1024 freeze_wav2vec: False @@ -78,45 +78,31 @@ eos_index: 2 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 @@ -125,6 +111,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + enc: !new:speechbrain.nnet.containers.Sequential input_shape: [null, null, !ref <wav2vec_output_dim>] diff --git a/recipes/DVoice/ASR/CTC/hparams/train_multi_with_wav2vec.yaml b/recipes/DVoice/ASR/CTC/hparams/train_multi_with_wav2vec.yaml index 14aef36c9cd16da21ae4a997ae8e9e47a84186ef..89fedade8f51ce16f72897b6d763993ed0f36d04 100644 --- a/recipes/DVoice/ASR/CTC/hparams/train_multi_with_wav2vec.yaml +++ b/recipes/DVoice/ASR/CTC/hparams/train_multi_with_wav2vec.yaml @@ -31,7 +31,7 @@ skip_prep: False # Skip data preparation # longer sentences certainly correspond to "open microphones". 
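The drop_freq blocks above likewise carry their ranges inline: one to three bands of width 0.05, positioned anywhere in the normalized 0-1 frequency range. A toy approximation that zeroes bands in the rfft spectrum (DropFreq itself filters the waveform, so treat this only as a sketch of the effect):

import torch

def drop_freq_bands(wav, low=0.0, high=1.0, count_low=1, count_high=3, width=0.05):
    # Zero a few narrow spectral bands; band centers are drawn uniformly
    # from the normalized [low, high] frequency range.
    spec = torch.fft.rfft(wav)
    n_bins = spec.shape[-1]
    for _ in range(int(torch.randint(count_low, count_high + 1, (1,)))):
        center = low + (high - low) * torch.rand(1).item()
        lo = int(max(center - width / 2, 0.0) * n_bins)
        hi = int(min(center + width / 2, 1.0) * n_bins)
        spec[..., lo:hi] = 0
    return torch.fft.irfft(spec, n=wav.shape[-1])

augmented = drop_freq_bands(torch.randn(1, 16000))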
avoid_if_longer_than: 15.0 -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 30 lr: 1.0 lr_wav2vec: 0.0001 @@ -57,7 +57,7 @@ test_dataloader_options: token_type: char # ["unigram", "bpe", "char"] character_coverage: 1.0 -# Model parameters +####################### Model Parameters ####################################### wav2vec_output_dim: 1024 dnn_neurons: 1024 freeze_wav2vec: False @@ -77,45 +77,31 @@ eos_index: 2 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 @@ -124,6 +110,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + enc: !new:speechbrain.nnet.containers.Sequential input_shape: [null, null, !ref <wav2vec_output_dim>] diff --git a/recipes/DVoice/ASR/CTC/hparams/train_sw_with_wav2vec.yaml b/recipes/DVoice/ASR/CTC/hparams/train_sw_with_wav2vec.yaml index f00e330a4484731402641ba5ff4882bf2ab2b83f..0194fd8776e231be71a999e9f87d5c5b675cdf69 100644 --- a/recipes/DVoice/ASR/CTC/hparams/train_sw_with_wav2vec.yaml +++ b/recipes/DVoice/ASR/CTC/hparams/train_sw_with_wav2vec.yaml @@ -32,7 +32,7 @@ skip_prep: False # Skip data preparation # longer sentences certainly correspond to "open microphones". 
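Similarly, drop_chunk zeroes one to five spans of 1000 to 2000 samples directly on the waveform. A toy re-implementation of that effect (illustrative only, not the DropChunk source):

import torch

def drop_chunks(wav, count_low=1, count_high=5, length_low=1000, length_high=2000):
    # Silence a random number of random-length sample spans.
    wav = wav.clone()
    for _ in range(int(torch.randint(count_low, count_high + 1, (1,)))):
        length = int(torch.randint(length_low, length_high + 1, (1,)))
        start = int(torch.randint(0, max(wav.shape[-1] - length, 1), (1,)))
        wav[..., start:start + length] = 0.0
    return wav

augmented = drop_chunks(torch.randn(1, 16000))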
avoid_if_longer_than: 15.0 -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 30 lr: 1.0 lr_wav2vec: 0.0001 @@ -58,7 +58,7 @@ test_dataloader_options: token_type: char # ["unigram", "bpe", "char"] character_coverage: 1.0 -# Model parameters +####################### Model Parameters ####################################### wav2vec_output_dim: 1024 dnn_neurons: 1024 freeze_wav2vec: False @@ -78,45 +78,31 @@ eos_index: 2 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 @@ -125,6 +111,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + enc: !new:speechbrain.nnet.containers.Sequential input_shape: [null, null, !ref <wav2vec_output_dim>] diff --git a/recipes/DVoice/ASR/CTC/hparams/train_wol_with_wav2vec.yaml b/recipes/DVoice/ASR/CTC/hparams/train_wol_with_wav2vec.yaml index b1188dcb9905f182d580f7654cef50e5a3ae05f7..8470ce3a1c81b603cd54093f763c8323a28032cc 100644 --- a/recipes/DVoice/ASR/CTC/hparams/train_wol_with_wav2vec.yaml +++ b/recipes/DVoice/ASR/CTC/hparams/train_wol_with_wav2vec.yaml @@ -32,7 +32,7 @@ skip_prep: False # Skip data preparation # longer sentences certainly correspond to "open microphones". 
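Across these DVoice recipes the wav_augment block keeps concat_original: True and fixed bounds (min_augmentations = max_augmentations = 4), dropping only the options that matched the Augmenter defaults. A toy sketch of the control flow those knobs imply (an approximation, not the SpeechBrain Augmenter):

import random
import torch

def augment(batch, augmentations, min_aug=4, max_aug=4,
            augment_prob=1.0, concat_original=True):
    # With probability 1 - augment_prob the batch passes through untouched.
    if random.random() > augment_prob:
        return batch
    # Sample how many pipelines to chain, clamped to what is available
    # (these recipes request 4 but list three augmentations).
    n_max = min(max_aug, len(augmentations))
    out = batch
    for aug in random.sample(augmentations, random.randint(min(min_aug, n_max), n_max)):
        out = aug(out)
    # concat_original stacks the clean batch on top of the augmented one.
    return torch.cat([batch, out], dim=0) if concat_original else out

doubled = augment(torch.randn(2, 16000),
                  [lambda x: x * 0.9, lambda x: x.flip(-1), lambda x: x + 0.01])

Concatenating the clean batch doubles the batch dimension, which is exactly why the label tensors are replicated in the train.py change shown earlier.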
avoid_if_longer_than: 15.0 -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 30 lr: 1.0 lr_wav2vec: 0.0001 @@ -58,7 +58,7 @@ test_dataloader_options: token_type: char # ["unigram", "bpe", "char"] character_coverage: 1.0 -# Model parameters +####################### Model Parameters ####################################### wav2vec_output_dim: 1024 dnn_neurons: 1024 freeze_wav2vec: False @@ -78,45 +78,31 @@ eos_index: 2 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 @@ -125,6 +111,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + enc: !new:speechbrain.nnet.containers.Sequential input_shape: [null, null, !ref <wav2vec_output_dim>] linear1: !name:speechbrain.nnet.linear.Linear diff --git a/recipes/ESC50/classification/hparams/cnn14_classifier.yaml b/recipes/ESC50/classification/hparams/cnn14_classifier.yaml index e8034bfdd169bd85bf10eda4504bd5f46a9a8656..bc0a83bbd7415221e3f4c5610363cb0fdfc0ffde 100644 --- a/recipes/ESC50/classification/hparams/cnn14_classifier.yaml +++ b/recipes/ESC50/classification/hparams/cnn14_classifier.yaml @@ -41,7 +41,7 @@ skip_manifest_creation: False ckpt_interval_minutes: 15 # save checkpoint every N min -# Training parameters +####################### Training Parameters 
#################################### number_of_epochs: 200 batch_size: 32 lr: 0.0002 diff --git a/recipes/ESC50/classification/hparams/conv2d_classifier.yaml b/recipes/ESC50/classification/hparams/conv2d_classifier.yaml index 2b0a49bcd36172932655a544e2c436d528392988..284d5681fc5799e1d84bd7ce3c865e60e5b92e46 100644 --- a/recipes/ESC50/classification/hparams/conv2d_classifier.yaml +++ b/recipes/ESC50/classification/hparams/conv2d_classifier.yaml @@ -41,7 +41,7 @@ skip_manifest_creation: False ckpt_interval_minutes: 15 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 200 batch_size: 32 lr: 0.00002 diff --git a/recipes/ESC50/interpret/hparams/l2i_cnn14.yaml b/recipes/ESC50/interpret/hparams/l2i_cnn14.yaml index 6f57a843b87ddf401379f3fa37bcab852ed7287f..00acd1ff372cb8b3a97e13a11f531181dfcb3502 100644 --- a/recipes/ESC50/interpret/hparams/l2i_cnn14.yaml +++ b/recipes/ESC50/interpret/hparams/l2i_cnn14.yaml @@ -39,7 +39,7 @@ skip_manifest_creation: False ckpt_interval_minutes: 15 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 200 batch_size: 2 lr: 0.0001 diff --git a/recipes/ESC50/interpret/hparams/l2i_conv2dclassifier.yaml b/recipes/ESC50/interpret/hparams/l2i_conv2dclassifier.yaml index 7292f89affe52105159e68260d673a972580fe39..4f6cb9b909871f749b60b8be054b9ed68b64c6e9 100644 --- a/recipes/ESC50/interpret/hparams/l2i_conv2dclassifier.yaml +++ b/recipes/ESC50/interpret/hparams/l2i_conv2dclassifier.yaml @@ -39,7 +39,7 @@ skip_manifest_creation: False ckpt_interval_minutes: 15 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 200 batch_size: 16 lr: 0.0002 diff --git a/recipes/ESC50/interpret/hparams/nmf.yaml b/recipes/ESC50/interpret/hparams/nmf.yaml index 7b6c9905da9e1a356adca52933b05aead8986004..e4da313ba1a107174e8e584541fce92d2e811ba0 100644 --- a/recipes/ESC50/interpret/hparams/nmf.yaml +++ b/recipes/ESC50/interpret/hparams/nmf.yaml @@ -40,7 +40,7 @@ skip_manifest_creation: False ckpt_interval_minutes: 15 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 100 batch_size: 2 lr: 0.0002 diff --git a/recipes/ESC50/interpret/hparams/piq.yaml b/recipes/ESC50/interpret/hparams/piq.yaml index c45f50a20c4a903e9177392213bb03a651c23827..68f8c06deb3f2ab2c13c19b93d31f119b650416a 100644 --- a/recipes/ESC50/interpret/hparams/piq.yaml +++ b/recipes/ESC50/interpret/hparams/piq.yaml @@ -42,7 +42,7 @@ skip_manifest_creation: False ckpt_interval_minutes: 15 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 200 batch_size: 16 lr: 0.0002 diff --git a/recipes/Fisher-Callhome-Spanish/ST/transformer/hparams/conformer.yaml b/recipes/Fisher-Callhome-Spanish/ST/transformer/hparams/conformer.yaml index ec8653adeb3e1cf5354cc4d9c0f09ee773dbcf6e..49a7321f79c6268d9058ab16b2a920543e707a9f 100644 --- a/recipes/Fisher-Callhome-Spanish/ST/transformer/hparams/conformer.yaml +++ b/recipes/Fisher-Callhome-Spanish/ST/transformer/hparams/conformer.yaml @@ -81,7 +81,7 @@ test_dataloader_opts: batch_size: !ref <batch_size> num_workers: !ref <num_workers> -####################### Model parameters ########################### 
+####################### Model Parameters ########################### # Transformer d_model: 256 nhead: 4 diff --git a/recipes/Fisher-Callhome-Spanish/ST/transformer/hparams/transformer.yaml b/recipes/Fisher-Callhome-Spanish/ST/transformer/hparams/transformer.yaml index 59c3782e1282ace82e5b20b6b3f209481c2849e2..4310e2d6b5cc19f3fd8766f062b41a367c27cd1a 100644 --- a/recipes/Fisher-Callhome-Spanish/ST/transformer/hparams/transformer.yaml +++ b/recipes/Fisher-Callhome-Spanish/ST/transformer/hparams/transformer.yaml @@ -91,7 +91,7 @@ test_dataloader_opts: batch_size: !ref <batch_size> num_workers: !ref <num_workers> -####################### Model parameters ########################### +####################### Model Parameters ########################### # Transformer d_model: 256 nhead: 4 diff --git a/recipes/Google-speech-commands/hparams/xvect.yaml b/recipes/Google-speech-commands/hparams/xvect.yaml index 8eb842ba9accf3211240bb8fb40c97cc61507190..417cecfdf3b8bd2be10529bcdd4c2515d930cb5b 100644 --- a/recipes/Google-speech-commands/hparams/xvect.yaml +++ b/recipes/Google-speech-commands/hparams/xvect.yaml @@ -40,7 +40,7 @@ percentage_silence: 10 # Set this to 0 for the V2 35 task skip_prep: False ckpt_interval_minutes: 15 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 100 batch_size: 32 lr: 0.001 diff --git a/recipes/Google-speech-commands/hparams/xvect_leaf.yaml b/recipes/Google-speech-commands/hparams/xvect_leaf.yaml index e06101850b4a213b43f3dd8764083720ec78c211..f2897af22c1252385255b99cf4dfa5e4fa0a03e3 100644 --- a/recipes/Google-speech-commands/hparams/xvect_leaf.yaml +++ b/recipes/Google-speech-commands/hparams/xvect_leaf.yaml @@ -42,7 +42,7 @@ percentage_silence: 10 # Set this to 0 for the V2 35 task skip_prep: False ckpt_interval_minutes: 15 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 100 batch_size: 32 lr: 0.001 diff --git a/recipes/IEMOCAP/emotion_recognition/hparams/train_with_wav2vec2.yaml b/recipes/IEMOCAP/emotion_recognition/hparams/train_with_wav2vec2.yaml index ae34528828cb70df824c2a815ad8a73a501ac01b..d1b63d7bf65549ce4d301b89d86aa2c3686ff3da 100644 --- a/recipes/IEMOCAP/emotion_recognition/hparams/train_with_wav2vec2.yaml +++ b/recipes/IEMOCAP/emotion_recognition/hparams/train_with_wav2vec2.yaml @@ -38,7 +38,7 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger ckpt_interval_minutes: 15 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 30 batch_size: 4 lr: 0.0001 @@ -50,7 +50,7 @@ freeze_wav2vec2: False # We see an improvement of 2% with freezing CNNs freeze_wav2vec2_conv: True -# Model parameters +####################### Model Parameters ####################################### encoder_dim: 768 # Number of emotions diff --git a/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_samu.yaml b/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_samu.yaml index 05bca5c1ea37f37f166626920433ad10c26ddb9e..3901391a5480ca4f6ad9ddba1a951ef442262402 100644 --- a/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_samu.yaml +++ b/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_samu.yaml @@ -30,7 +30,7 @@ wav2vec2_hub: LIA-AvignonUniversity/IWSLT2022-tamasheq-only # wav2vec 2.0 specific parameters wav2vec2_frozen: False -# Training 
parameters +####################### Training Parameters #################################### number_of_epochs: 100 lr: 0.001 lr_wav2vec: 0.00001 diff --git a/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_samu_mbart_st.yaml b/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_samu_mbart_st.yaml index a3a2f1c994107b1c993c6b5f391b516fc58324e1..6887c3a4084cd6f4d8eff06de83bcf620d2aa939 100644 --- a/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_samu_mbart_st.yaml +++ b/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_samu_mbart_st.yaml @@ -36,7 +36,7 @@ wav2vec2_folder: !ref <save_folder>/wav2vec2_checkpoint # wav2vec 2.0 specific parameters wav2vec2_frozen: False -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 500 lr: 0.001 lr_wav2vec: 0.0001 diff --git a/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_samu_nllb_st.yaml b/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_samu_nllb_st.yaml index 11ebc937a7a1edf93d70ee894b2b8c191c57a11b..b86cef685336bf528a8e1bd65941870283121c8e 100644 --- a/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_samu_nllb_st.yaml +++ b/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_samu_nllb_st.yaml @@ -36,7 +36,7 @@ wav2vec2_folder: !ref <save_folder>/wav2vec2_checkpoint # wav2vec 2.0 specific parameters wav2vec2_frozen: False -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 500 lr: 0.001 lr_wav2vec: 0.0001 diff --git a/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_w2v2_mbart_st.yaml b/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_w2v2_mbart_st.yaml index 68f74d9b4a04e82d927e4a3c5f5749362bb09803..77b7c8cd6ceb36b004fb81e798a65a08dce43025 100644 --- a/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_w2v2_mbart_st.yaml +++ b/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_w2v2_mbart_st.yaml @@ -36,7 +36,7 @@ wav2vec2_folder: !ref <save_folder>/wav2vec2_checkpoint # wav2vec 2.0 specific parameters wav2vec2_frozen: False -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 500 lr: 0.001 lr_wav2vec: 0.0001 diff --git a/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_w2v2_nllb_st.yaml b/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_w2v2_nllb_st.yaml index b62e366f1e05b77ea79a5f25cadc933371f7be31..d384bf3a86cbbe391c645bc4eabd49d1d74dc1cd 100644 --- a/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_w2v2_nllb_st.yaml +++ b/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_w2v2_nllb_st.yaml @@ -36,7 +36,7 @@ wav2vec2_folder: !ref <save_folder>/wav2vec2_checkpoint # wav2vec 2.0 specific parameters wav2vec2_frozen: False -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 500 lr: 0.001 lr_wav2vec: 0.0001 diff --git a/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_w2v2_st.yaml b/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_w2v2_st.yaml index 6bfb9db12708e14546670e9dc87c5d7dd487c773..beafeba864cf0113598622bfe70b390bafc6f18d 100644 --- a/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_w2v2_st.yaml +++ b/recipes/IWSLT22_lowresource/AST/transformer/hparams/train_w2v2_st.yaml @@ -36,7 +36,7 @@ wav2vec2_folder: !ref <save_folder>/wav2vec2_checkpoint wav2vec2_frozen: False keep_n_layers: 6 # keep first N layers 
from the Transformer Encoder stack inside the wav2vec 2.0 model -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 100 lr: 0.001 lr_wav2vec: 0.00001 diff --git a/recipes/KsponSpeech/ASR/transformer/hparams/conformer_medium.yaml b/recipes/KsponSpeech/ASR/transformer/hparams/conformer_medium.yaml index aee256973f98a3d35022ce9ac87470c0412cb281..3c0d43e2ad36b0ee39ca69b1c4b967d530e4fb18 100644 --- a/recipes/KsponSpeech/ASR/transformer/hparams/conformer_medium.yaml +++ b/recipes/KsponSpeech/ASR/transformer/hparams/conformer_medium.yaml @@ -34,7 +34,7 @@ test_csv: ckpt_interval_minutes: 30 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### # To make Transformers converge, the global batch size should be large enough. # The global batch size is computed as batch_size * n_gpus * grad_accumulation_factor. # Empirically, we found that this value should be >= 128. @@ -78,7 +78,7 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -####################### Model parameters ########################### +####################### Model Parameters ########################### # Transformer d_model: 256 nhead: 4 diff --git a/recipes/KsponSpeech/LM/hparams/transformer.yaml b/recipes/KsponSpeech/LM/hparams/transformer.yaml index cd9685e28a2bc016c73a12835491c3f89509ec63..5b64cc196c4d87d6453f152c3b537be168ab1e6f 100644 --- a/recipes/KsponSpeech/LM/hparams/transformer.yaml +++ b/recipes/KsponSpeech/LM/hparams/transformer.yaml @@ -24,7 +24,7 @@ test_csv: # Tokenizer model tokenizer_file: ddwkim/asr-conformer-transformerlm-ksponspeech/tokenizer.ckpt -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 30 batch_size: 256 lr: 0.1 diff --git a/recipes/KsponSpeech/Tokenizer/hparams/5K_unigram_subword_bpe.yaml b/recipes/KsponSpeech/Tokenizer/hparams/5K_unigram_subword_bpe.yaml index dd7cd490646b4b900fe74bc61867ee3cd7135d3a..04ef0ebfd9cf041c20668e89801433d6d5518387 100644 --- a/recipes/KsponSpeech/Tokenizer/hparams/5K_unigram_subword_bpe.yaml +++ b/recipes/KsponSpeech/Tokenizer/hparams/5K_unigram_subword_bpe.yaml @@ -16,7 +16,7 @@ skip_prep: False train_csv: !ref <output_folder>/train.csv valid_csv: !ref <output_folder>/dev.csv -# Training parameters +####################### Training Parameters #################################### token_type: unigram # ["unigram", "bpe", "char"] token_output: 5000 # index(blank/eos/bos/unk) = 0 character_coverage: 1.0 diff --git a/recipes/LibriMix/separation/hparams/sepformer-libri2mix.yaml b/recipes/LibriMix/separation/hparams/sepformer-libri2mix.yaml index 8fb195c034aa3adc6e82951782d9509f53c589ee..ffa5a1ef2edde5bb435f812248d6b78ee278155a 100644 --- a/recipes/LibriMix/separation/hparams/sepformer-libri2mix.yaml +++ b/recipes/LibriMix/separation/hparams/sepformer-libri2mix.yaml @@ -37,7 +37,7 @@ noprogressbar: False save_audio: False # Save estimated sources on disk sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/LibriMix/separation/hparams/sepformer-libri3mix.yaml b/recipes/LibriMix/separation/hparams/sepformer-libri3mix.yaml index cf68e9a8110596af9e6bf2d6b206ba0d975d600f..abc9c76c7e07df2847eea7e551cd7270fa14ea00 100644 --- a/recipes/LibriMix/separation/hparams/sepformer-libri3mix.yaml +++
b/recipes/LibriMix/separation/hparams/sepformer-libri3mix.yaml @@ -37,7 +37,7 @@ noprogressbar: False save_audio: False # Save estimated sources on disk sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/LibriParty/VAD/hparams/train.yaml b/recipes/LibriParty/VAD/hparams/train.yaml index e07258c44b4b3a8886e9157c6033ac32e5b7277c..be91916855eb846b941aecf5d2b648ce26acbf2a 100644 --- a/recipes/LibriParty/VAD/hparams/train.yaml +++ b/recipes/LibriParty/VAD/hparams/train.yaml @@ -41,7 +41,7 @@ speech_csv: !ref <save_folder>/speech.csv multilang_speech_csv: !ref <save_folder>/multilang_speech.csv skip_prep: False # Skip data preparation -# Training parameters +####################### Training Parameters #################################### N_epochs: 100 lr: 1.0 lr_final: 0.1 @@ -65,7 +65,7 @@ test_dataloader_opts: n_fft: 400 n_mels: 40 -# Model parameters +####################### Model Parameters ####################################### # activation: !name:torch.nn.LeakyReLU # dropout: 0.15 # cnn_blocks: 2 diff --git a/recipes/LibriSpeech/ASR/CTC/hparams/downsampled/train_hf_wavlm_average_downsampling.yaml b/recipes/LibriSpeech/ASR/CTC/hparams/downsampled/train_hf_wavlm_average_downsampling.yaml index b609bb766291635878dae6b4311712c88302243b..fdbd7e86d8364dc02fecae7127a94927800c8b13 100644 --- a/recipes/LibriSpeech/ASR/CTC/hparams/downsampled/train_hf_wavlm_average_downsampling.yaml +++ b/recipes/LibriSpeech/ASR/CTC/hparams/downsampled/train_hf_wavlm_average_downsampling.yaml @@ -33,7 +33,8 @@ test_csv: - !ref <output_folder>/test-clean.csv - !ref <output_folder>/test-other.csv -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 1 lr: 0.9 lr_wav2vec: 0.0001 @@ -62,7 +63,8 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: !ref <test_batch_size> -# Model parameters +####################### Model Parameters ####################################### + activation: !name:torch.nn.LeakyReLU dnn_layers: 2 dnn_neurons: 1024 @@ -73,76 +75,12 @@ ctc_neurons: 29 output_neurons: 29 # Characters size, index(blank/eos/bos) = 0 blank_index: 0 -# Decoding parameters -test_beam_search: - beam_size: 200 - topk: 1 - blank_index: !ref <blank_index> - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -10.0 - token_prune_min_logp: -5 - prune_history: True - alpha: 0.5 - beta: 1.5 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - # # Functions and classes # epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - -speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb - orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> - -# Frequency drop: randomly drops a number of frequency bands to zero. 
-drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - -drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> - -# Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - -drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> - -# Augmenter: Combines previously defined augmentations to perform data augmentation -wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: True - repeat_augment: 1 - shuffle_augmentations: False - min_augmentations: 4 - max_augmentations: 4 - augment_prob: 1.0 - augmentations: [ - !ref <speed_perturb>, - !ref <drop_freq>, - !ref <drop_chunk>] - - enc: !new:speechbrain.lobes.models.VanillaNN.VanillaNN input_shape: [null, null, 1024] activation: !ref <activation> @@ -211,8 +149,60 @@ lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler annealing_factor: 0.9 patient: 0 +############################## Decoding ######################################## + +test_beam_search: + beam_size: 200 + topk: 1 + blank_index: !ref <blank_index> + space_token: ' ' # make sure this is the same as the one used in the tokenizer + beam_prune_logp: -10.0 + token_prune_min_logp: -5 + prune_history: True + alpha: 0.5 + beta: 1.5 + # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM + # It can either be a .bin or .arpa ; note: .arpa is much slower at loading + # If you don't want to use an LM, comment it out or set it to null + kenlm_model_path: null + label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder +############################## Augmentations ################################### + +# Speed perturbation +speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb + orig_freq: !ref <sample_rate> + speeds: [95, 100, 105] + +# Frequency drop: randomly drops a number of frequency bands to zero. +drop_freq: !new:speechbrain.augment.time_domain.DropFreq + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 + +# Time drop: randomly drops a number of temporal chunks. 
+drop_chunk: !new:speechbrain.augment.time_domain.DropChunk + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 + +# Augmenter: Combines previously defined augmentations to perform data augmentation +wav_augment: !new:speechbrain.augment.augmenter.Augmenter + concat_original: True + min_augmentations: 4 + max_augmentations: 4 + augment_prob: 1.0 + augmentations: [ + !ref <speed_perturb>, + !ref <drop_freq>, + !ref <drop_chunk>] + +############################## Logging and Pretrainer ########################## + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref <save_folder> recoverables: diff --git a/recipes/LibriSpeech/ASR/CTC/hparams/downsampled/train_hf_wavlm_conv_downsampling.yaml b/recipes/LibriSpeech/ASR/CTC/hparams/downsampled/train_hf_wavlm_conv_downsampling.yaml index f92f4f8fca27d3ef438377575b6c5881c0d80ade..1b84596dcd99a075af3d2266b44d04395031d619 100644 --- a/recipes/LibriSpeech/ASR/CTC/hparams/downsampled/train_hf_wavlm_conv_downsampling.yaml +++ b/recipes/LibriSpeech/ASR/CTC/hparams/downsampled/train_hf_wavlm_conv_downsampling.yaml @@ -34,7 +34,8 @@ test_csv: - !ref <output_folder>/test-clean.csv - !ref <output_folder>/test-other.csv -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 1 lr: 0.9 lr_wav2vec: 0.0001 @@ -63,7 +64,8 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: !ref <test_batch_size> -# Model parameters +####################### Model Parameters ####################################### + activation: !name:torch.nn.LeakyReLU dnn_layers: 2 dnn_neurons: 1024 @@ -74,76 +76,12 @@ ctc_neurons: 29 output_neurons: 29 # Characters size, index(blank/eos/bos) = 0 blank_index: 0 -# Decoding parameters -test_beam_search: - beam_size: 200 - topk: 1 - blank_index: !ref <blank_index> - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -10.0 - token_prune_min_logp: -5 - prune_history: True - alpha: 0.5 - beta: 1.5 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - - # # Functions and classes # epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - -speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb - orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> - -# Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - -drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> - -# Time drop: randomly drops a number of temporal chunks. 
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - -drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> - -# Augmenter: Combines previously defined augmentations to perform data augmentation -wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: True - repeat_augment: 1 - shuffle_augmentations: False - min_augmentations: 4 - max_augmentations: 4 - augment_prob: 1.0 - augmentations: [ - !ref <speed_perturb>, - !ref <drop_freq>, - !ref <drop_chunk>] - enc: !new:speechbrain.lobes.models.VanillaNN.VanillaNN input_shape: [null, null, 1024] @@ -214,8 +152,60 @@ lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler annealing_factor: 0.9 patient: 0 +############################## Decoding ######################################## + +test_beam_search: + beam_size: 200 + topk: 1 + blank_index: !ref <blank_index> + space_token: ' ' # make sure this is the same as the one used in the tokenizer + beam_prune_logp: -10.0 + token_prune_min_logp: -5 + prune_history: True + alpha: 0.5 + beta: 1.5 + # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM + # It can either be a .bin or .arpa ; note: .arpa is much slower at loading + # If you don't want to use an LM, comment it out or set it to null + kenlm_model_path: null + label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder +############################## Augmentations ################################### + +# Speed perturbation +speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb + orig_freq: !ref <sample_rate> + speeds: [95, 100, 105] + +# Frequency drop: randomly drops a number of frequency bands to zero. +drop_freq: !new:speechbrain.augment.time_domain.DropFreq + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 + +# Time drop: randomly drops a number of temporal chunks. 
+drop_chunk: !new:speechbrain.augment.time_domain.DropChunk + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 + +# Augmenter: Combines previously defined augmentations to perform data augmentation +wav_augment: !new:speechbrain.augment.augmenter.Augmenter + concat_original: True + min_augmentations: 4 + max_augmentations: 4 + augment_prob: 1.0 + augmentations: [ + !ref <speed_perturb>, + !ref <drop_freq>, + !ref <drop_chunk>] + +############################## Logging and Pretrainer ########################## + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref <save_folder> recoverables: diff --git a/recipes/LibriSpeech/ASR/CTC/hparams/downsampled/train_hf_wavlm_signal_downsampling.yaml b/recipes/LibriSpeech/ASR/CTC/hparams/downsampled/train_hf_wavlm_signal_downsampling.yaml index 6c0e7207df65fafc4630b400a98800fa9989e245..d0daf5b77759c5b87410893abd71e77ea00725c2 100644 --- a/recipes/LibriSpeech/ASR/CTC/hparams/downsampled/train_hf_wavlm_signal_downsampling.yaml +++ b/recipes/LibriSpeech/ASR/CTC/hparams/downsampled/train_hf_wavlm_signal_downsampling.yaml @@ -33,7 +33,8 @@ test_csv: - !ref <output_folder>/test-clean.csv - !ref <output_folder>/test-other.csv -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 1 lr: 0.9 lr_wav2vec: 0.0001 @@ -61,7 +62,8 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: !ref <test_batch_size> -# Model parameters +####################### Model Parameters ####################################### + activation: !name:torch.nn.LeakyReLU dnn_layers: 2 dnn_neurons: 1024 @@ -72,75 +74,12 @@ ctc_neurons: 58 # Twice the number of characters, for upsampling output_neurons: 29 # Characters size, index(blank/eos/bos) = 0 blank_index: 0 -# Decoding parameters -test_beam_search: - beam_size: 200 - topk: 1 - blank_index: !ref <blank_index> - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -10.0 - token_prune_min_logp: -5 - prune_history: True - alpha: 0.5 - beta: 1.5 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - # # Functions and classes # epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - -speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb - orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> - -# Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - -drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> - -# Time drop: randomly drops a number of temporal chunks. 
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - -drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> - -# Augmenter: Combines previously defined augmentations to perform data augmentation -wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: True - repeat_augment: 1 - shuffle_augmentations: False - min_augmentations: 4 - max_augmentations: 4 - augment_prob: 1.0 - augmentations: [ - !ref <speed_perturb>, - !ref <drop_freq>, - !ref <drop_chunk>] - enc: !new:speechbrain.lobes.models.VanillaNN.VanillaNN input_shape: [null, null, 1024] @@ -210,8 +149,58 @@ lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler annealing_factor: 0.9 patient: 0 +############################## Decoding ######################################## + +test_beam_search: + beam_size: 200 + topk: 1 + blank_index: !ref <blank_index> + space_token: ' ' # make sure this is the same as the one used in the tokenizer + beam_prune_logp: -10.0 + token_prune_min_logp: -5 + prune_history: True + alpha: 0.5 + beta: 1.5 + # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM + # It can either be a .bin or .arpa ; note: .arpa is much slower at loading + # If you don't want to use an LM, comment it out or set it to null + kenlm_model_path: null + label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder +############################## Augmentations ################################### + +# Speed perturbation +speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb + orig_freq: !ref <sample_rate> + speeds: [95, 100, 105] + +# Frequency drop: randomly drops a number of frequency bands to zero. +drop_freq: !new:speechbrain.augment.time_domain.DropFreq + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 + +# Time drop: randomly drops a number of temporal chunks. 
+drop_chunk: !new:speechbrain.augment.time_domain.DropChunk + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 + +# Augmenter: Combines previously defined augmentations to perform data augmentation +wav_augment: !new:speechbrain.augment.augmenter.Augmenter + concat_original: True + min_augmentations: 4 + max_augmentations: 4 + augment_prob: 1.0 + augmentations: [ + !ref <speed_perturb>, + !ref <drop_freq>, + !ref <drop_chunk>] + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref <save_folder> recoverables: diff --git a/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec.yaml b/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec.yaml index 2d91909f229b7f816f058bc5f8e608d2e4041c5f..1d860a29f1cf0955b277e934748cde06db9d516f 100644 --- a/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec.yaml +++ b/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec.yaml @@ -32,7 +32,8 @@ test_csv: - !ref <output_folder>/test-clean.csv - !ref <output_folder>/test-other.csv -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 1 lr: 0.9 lr_wav2vec: 0.0001 @@ -56,7 +57,7 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: !ref <test_batch_size> -# Model parameters +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dnn_layers: 2 dnn_neurons: 1024 @@ -66,75 +67,14 @@ freeze_wav2vec: True output_neurons: 29 # BPE size, index(blank/eos/bos) = 0 blank_index: 0 -# Decoding parameters -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref <blank_index> - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - # # Functions and classes # -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - -speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb - orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> - -# Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - -drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> - -# Time drop: randomly drops a number of temporal chunks. 
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - -drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> - -# Augmenter: Combines previously defined augmentations to perform data augmentation -wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: True - repeat_augment: 1 - shuffle_augmentations: False - min_augmentations: 4 - max_augmentations: 4 - augment_prob: 1.0 - augmentations: [ - !ref <speed_perturb>, - !ref <drop_freq>, - !ref <drop_chunk>] +label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref <number_of_epochs> enc: !new:speechbrain.lobes.models.VanillaNN.VanillaNN input_shape: [null, null, 1024] @@ -198,7 +138,58 @@ lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler annealing_factor: 0.9 patient: 0 -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder +############################## Augmentations ################################### + +# Speed perturbation +speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb + orig_freq: !ref <sample_rate> + speeds: [95, 100, 105] + +# Frequency drop: randomly drops a number of frequency bands to zero. +drop_freq: !new:speechbrain.augment.time_domain.DropFreq + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 + +# Time drop: randomly drops a number of temporal chunks. 
+drop_chunk: !new:speechbrain.augment.time_domain.DropChunk + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 + +# Augmenter: Combines previously defined augmentations to perform data augmentation +wav_augment: !new:speechbrain.augment.augmenter.Augmenter + concat_original: True + min_augmentations: 4 + max_augmentations: 4 + augment_prob: 1.0 + augmentations: [ + !ref <speed_perturb>, + !ref <drop_freq>, + !ref <drop_chunk>] + +############################## Decoding ######################################## + +# Decoding parameters +test_beam_search: + beam_size: 143 + topk: 1 + blank_index: !ref <blank_index> + space_token: ' ' # make sure this is the same as the one used in the tokenizer + beam_prune_logp: -12.0 + token_prune_min_logp: -1.2 + prune_history: True + alpha: 0.8 + beta: 1.2 + # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM + # It can either be a .bin or .arpa ; note: .arpa is much slower at loading + # If you don't want to use an LM, comment it out or set it to null + kenlm_model_path: null + +############################## Logging and Pretrainer ########################## checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref <save_folder> diff --git a/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec_rnn_rescoring.yaml b/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec_rnn_rescoring.yaml index 01a31cdd86b0e8da7db2bc59c8802955b5161612..c946b024314e5c26813bf5d4253f1bf86e9ce870 100644 --- a/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec_rnn_rescoring.yaml +++ b/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec_rnn_rescoring.yaml @@ -32,7 +32,8 @@ test_csv: - !ref <output_folder>/test-clean.csv - !ref <output_folder>/test-other.csv -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 1 lr: 0.9 lr_wav2vec: 0.0001 @@ -56,7 +57,8 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: !ref <test_batch_size> -# Model parameters +####################### Model Parameters ####################################### + activation: !name:torch.nn.LeakyReLU dnn_layers: 2 dnn_neurons: 1024 @@ -68,49 +70,6 @@ output_neurons: 29 # BPE size, index(blank/eos/bos) = 0 pretrained_lm_tokenizer_path: speechbrain/asr-crdnn-rnnlm-librispeech -# This is the RNNLM that is used according to the Huggingface repository -# NB: It has to match the pre-trained RNNLM!! -lm_model: !new:speechbrain.lobes.models.RNNLM.RNNLM - output_neurons: 1000 - embedding_dim: 128 - activation: !name:torch.nn.LeakyReLU - dropout: 0.0 - rnn_layers: 2 - rnn_neurons: 2048 - dnn_blocks: 1 - dnn_neurons: 512 - return_hidden: True # For inference - -tokenizer: !new:sentencepiece.SentencePieceProcessor - -# Decoding parameters -lm_weight: 0.5 -blank_index: 0 -# topk is the number of hypotheses that will be rescored in the rescorer -# lowering this value might decrease the wer, but will increase speed. 
- -test_beam_search: - beam_size: 20 - topk: 20 - blank_index: !ref <blank_index> - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -12.0 - prune_history: False - alpha: 0.8 - beta: 1.2 - -rnnlm: !new:speechbrain.decoders.scorer.RNNLMRescorer - language_model: !ref <lm_model> - tokenizer: !ref <tokenizer> - bos_index: 0 - eos_index: 0 - pad_index: 0 - -rescorer: !new:speechbrain.decoders.scorer.RescorerBuilder - rescorers: [!ref <rnnlm>] - weights: - rnnlm: !ref <lm_weight> # # Functions and classes @@ -118,53 +77,6 @@ rescorer: !new:speechbrain.decoders.scorer.RescorerBuilder epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - -speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb - orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> - -# Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - -drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> - -# Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - -drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> - -# Augmenter: Combines previously defined augmentations to perform data augmentation -wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: True - repeat_augment: 1 - shuffle_augmentations: False - min_augmentations: 4 - max_augmentations: 4 - augment_prob: 1.0 - augmentations: [ - !ref <speed_perturb>, - !ref <drop_freq>, - !ref <drop_chunk>] - enc: !new:speechbrain.lobes.models.VanillaNN.VanillaNN input_shape: [null, null, 1024] @@ -230,6 +142,84 @@ lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder +# This is the RNNLM that is used according to the Huggingface repository +# NB: It has to match the pre-trained RNNLM!! +lm_model: !new:speechbrain.lobes.models.RNNLM.RNNLM + output_neurons: 1000 + embedding_dim: 128 + activation: !name:torch.nn.LeakyReLU + dropout: 0.0 + rnn_layers: 2 + rnn_neurons: 2048 + dnn_blocks: 1 + dnn_neurons: 512 + return_hidden: True # For inference + + +tokenizer: !new:sentencepiece.SentencePieceProcessor + +############################## Decoding ######################################## + +# topk is the number of hypotheses that will be rescored in the rescorer +# lowering this value speeds up decoding, but it may increase the WER. 
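The trade-off named in that comment is easiest to see at the call site: the beam searcher emits `topk` candidate transcripts and the rescorer re-ranks them, so `topk` bounds both the number of LM forward passes and the chance of recovering from a beam-search error. A rough, self-contained sketch; the vocabulary and log-probabilities are toys, only the `CTCBeamSearcher` class and its kwargs mirror the mapping these recipes configure, and the exact call convention may differ across SpeechBrain versions.

```python
# Rough sketch of the decode-then-rescore flow. The vocabulary and scores are
# random placeholders; the kwargs mirror the test_beam_search mapping above.
import torch
from speechbrain.decoders import CTCBeamSearcher

test_beam_search = {
    "beam_size": 20,
    "topk": 20,          # candidates later handed to the RNNLM rescorer
    "blank_index": 0,
    "space_token": " ",
    "beam_prune_logp": -12.0,
    "token_prune_min_logp": -12.0,
    "prune_history": False,
    "alpha": 0.8,
    "beta": 1.2,
}
vocab_list = ["-", " "] + list("abcdefghijklmnopqrstuvwxyz'")  # toy vocab, blank first

searcher = CTCBeamSearcher(vocab_list=vocab_list, **test_beam_search)
log_probs = torch.randn(1, 60, len(vocab_list)).log_softmax(dim=-1)
wav_lens = torch.ones(1)  # relative lengths
hyps = searcher(log_probs, wav_lens)  # topk=20 hypotheses per utterance
# The RescorerBuilder then re-ranks these 20 candidates with the LM; a smaller
# topk means fewer LM passes but fewer chances to overturn a search error.
```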
+test_beam_search: + beam_size: 20 + topk: 20 + blank_index: !ref <blank_index> + space_token: ' ' # make sure this is the same as the one used in the tokenizer + beam_prune_logp: -12.0 + token_prune_min_logp: -12.0 + prune_history: False + alpha: 0.8 + beta: 1.2 + +rnnlm: !new:speechbrain.decoders.scorer.RNNLMRescorer + language_model: !ref <lm_model> + tokenizer: !ref <tokenizer> + bos_index: 0 + eos_index: 0 + pad_index: 0 + +rescorer: !new:speechbrain.decoders.scorer.RescorerBuilder + rescorers: [!ref <rnnlm>] + weights: + rnnlm: !ref <lm_weight> + +############################## Augmentations ################################### + +# Speed perturbation +speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb + orig_freq: !ref <sample_rate> + speeds: [95, 100, 105] + +# Frequency drop: randomly drops a number of frequency bands to zero. +drop_freq: !new:speechbrain.augment.time_domain.DropFreq + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 + +# Time drop: randomly drops a number of temporal chunks. +drop_chunk: !new:speechbrain.augment.time_domain.DropChunk + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 + +# Augmenter: Combines previously defined augmentations to perform data augmentation +wav_augment: !new:speechbrain.augment.augmenter.Augmenter + concat_original: True + min_augmentations: 4 + max_augmentations: 4 + augment_prob: 1.0 + augmentations: [ + !ref <speed_perturb>, + !ref <drop_freq>, + !ref <drop_chunk>] + +############################## Logging and Pretrainer ########################## + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref <save_folder> recoverables: diff --git a/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec_transformer_rescoring.yaml b/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec_transformer_rescoring.yaml index 724c3bf1eafe4d7ade5cc231c878afcb94db3162..d806b20cfd5ebf408d0f8b60e83909521563288a 100644 --- a/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec_transformer_rescoring.yaml +++ b/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_wav2vec_transformer_rescoring.yaml @@ -32,7 +32,8 @@ test_csv: - !ref <output_folder>/test-clean.csv - !ref <output_folder>/test-other.csv -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 1 lr: 0.9 lr_wav2vec: 0.0001 @@ -56,7 +57,7 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: !ref <test_batch_size> -# Model parameters +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dnn_layers: 2 dnn_neurons: 1024 @@ -88,30 +89,6 @@ tokenizer: !new:sentencepiece.SentencePieceProcessor # Decoding parameters lm_weight: 0.5 blank_index: 0 -# topk is the number of hypotheses that will be rescored in the rescorer -# lowering this value might decrease the wer, but will increase speed. 
-test_beam_search: - beam_size: 20 - topk: 20 - blank_index: !ref <blank_index> - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -12.0 - prune_history: False - alpha: 0.8 - beta: 1.2 - -transformerlm: !new:speechbrain.decoders.scorer.TransformerLMRescorer - language_model: !ref <lm_model> - tokenizer: !ref <tokenizer> - pad_index: 0 - bos_index: 1 - eos_index: 2 - -rescorer: !new:speechbrain.decoders.scorer.RescorerBuilder - rescorers: [!ref <transformerlm>] - weights: - transformerlm: !ref <lm_weight> # # Functions and classes @@ -119,53 +96,6 @@ rescorer: !new:speechbrain.decoders.scorer.RescorerBuilder epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - -speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb - orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> - -# Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - -drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> - -# Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - -drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> - -# Augmenter: Combines previously defined augmentations to perform data augmentation -wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: True - repeat_augment: 1 - shuffle_augmentations: False - min_augmentations: 4 - max_augmentations: 4 - augment_prob: 1.0 - augmentations: [ - !ref <speed_perturb>, - !ref <drop_freq>, - !ref <drop_chunk>] - enc: !new:speechbrain.lobes.models.VanillaNN.VanillaNN input_shape: [null, null, 1024] @@ -229,8 +159,68 @@ lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler annealing_factor: 0.9 patient: 0 +############################## Augmentations ################################### + +# Speed perturbation +speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb + orig_freq: !ref <sample_rate> + speeds: [95, 100, 105] + +# Frequency drop: randomly drops a number of frequency bands to zero. +drop_freq: !new:speechbrain.augment.time_domain.DropFreq + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 + +# Time drop: randomly drops a number of temporal chunks. 
+drop_chunk: !new:speechbrain.augment.time_domain.DropChunk + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 + +# Augmenter: Combines previously defined augmentations to perform data augmentation +wav_augment: !new:speechbrain.augment.augmenter.Augmenter + concat_original: True + min_augmentations: 4 + max_augmentations: 4 + augment_prob: 1.0 + augmentations: [ + !ref <speed_perturb>, + !ref <drop_freq>, + !ref <drop_chunk>] + label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder +############################## Decoding ######################################## + +# topk is the number of hypotheses that will be rescored in the rescorer +# lowering this value speeds up decoding, but it may increase the WER. +test_beam_search: + beam_size: 20 + topk: 20 + blank_index: !ref <blank_index> + space_token: ' ' # make sure this is the same as the one used in the tokenizer + beam_prune_logp: -12.0 + token_prune_min_logp: -12.0 + prune_history: False + alpha: 0.8 + beta: 1.2 + +transformerlm: !new:speechbrain.decoders.scorer.TransformerLMRescorer + language_model: !ref <lm_model> + tokenizer: !ref <tokenizer> + pad_index: 0 + bos_index: 1 + eos_index: 2 + +rescorer: !new:speechbrain.decoders.scorer.RescorerBuilder + rescorers: [!ref <transformerlm>] + weights: + transformerlm: !ref <lm_weight> + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref <save_folder> recoverables: diff --git a/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_whisper_encoder.yaml b/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_whisper_encoder.yaml index 735b29db919073fd3062ce5a184c42c2e23a27cf..ba20bf2acfc55dc14ff95b92d070f69261321761 100644 --- a/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_whisper_encoder.yaml +++ b/recipes/LibriSpeech/ASR/CTC/hparams/train_hf_whisper_encoder.yaml @@ -31,7 +31,8 @@ test_csv: - !ref <output_folder>/test-clean.csv - !ref <output_folder>/test-other.csv -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 15 warmup_steps: 1000 # We freeze whisper for 1000 steps to let the CTC adapt lr: 0.0008 @@ -61,7 +62,7 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: !ref <test_batch_size> -# Model parameters +####################### Model Parameters ####################################### dnn_neurons: 1024 freeze_whisper: False whisper_output_dim: 512 @@ -71,74 +72,12 @@ whisper_output_dim: 512 output_neurons: 29 # BPE size, index(blank/eos/bos) = 0 blank_index: 0 -# Decoding parameters -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref <blank_index> - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null # # Functions and classes # epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - -speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb - orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> - -# Frequency drop: randomly drops a number of frequency bands to zero. 
-drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - -drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> - -# Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - -drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> - -# Augmenter: Combines previously defined augmentations to perform data augmentation -wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: True - repeat_augment: 1 - shuffle_augmentations: False - min_augmentations: 4 - max_augmentations: 4 - augment_prob: 1.0 - augmentations: [ - !ref <speed_perturb>, - !ref <drop_freq>, - !ref <drop_chunk>] - enc: !new:speechbrain.nnet.containers.Sequential input_shape: [null, null, !ref <whisper_output_dim>] @@ -204,6 +143,57 @@ lr_annealing_whisper: !new:speechbrain.nnet.schedulers.NewBobScheduler annealing_factor: 0.75 patient: 0 +############################## Decoding ######################################## + +test_beam_search: + beam_size: 143 + topk: 1 + blank_index: !ref <blank_index> + space_token: ' ' # make sure this is the same as the one used in the tokenizer + beam_prune_logp: -12.0 + token_prune_min_logp: -1.2 + prune_history: True + alpha: 0.8 + beta: 1.2 + # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM + # It can either be a .bin or .arpa ; note: .arpa is much slower at loading + # If you don't want to use an LM, comment it out or set it to null + kenlm_model_path: null + +############################## Augmentations ################################### + +# Speed perturbation +speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb + orig_freq: !ref <sample_rate> + speeds: [95, 100, 105] + +# Frequency drop: randomly drops a number of frequency bands to zero. +drop_freq: !new:speechbrain.augment.time_domain.DropFreq + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 + +# Time drop: randomly drops a number of temporal chunks. 
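The `drop_chunk` block this comment introduces continues right after this aside. Two units are worth pinning down: `drop_length_low/high` count raw samples, so 1000 to 2000 samples is roughly 62 to 125 ms at 16 kHz, and `drop_freq_low/high` appear to bound the normalized (0 to 1) band location rather than a probability, despite what the removed inline comments above say. A toy sanity check under those assumptions:

```python
# Toy check of the two waveform-level drops used across these recipes. The
# parameter values mirror the configs; the test signal is random noise, and
# the forward signatures assumed here (DropFreq on waveforms alone, DropChunk
# with relative lengths) follow speechbrain.augment.time_domain.
import torch
from speechbrain.augment.time_domain import DropChunk, DropFreq

drop_freq = DropFreq(
    drop_freq_low=0, drop_freq_high=1,
    drop_freq_count_low=1, drop_freq_count_high=3,
    drop_freq_width=0.05,
)
drop_chunk = DropChunk(
    drop_length_low=1000, drop_length_high=2000,  # samples: ~62-125 ms @ 16 kHz
    drop_count_low=1, drop_count_high=5,
)

wavs = torch.randn(4, 16000)  # four 1 s utterances at 16 kHz
lens = torch.ones(4)          # relative lengths, as SpeechBrain batches use
out = drop_chunk(drop_freq(wavs), lens)
assert out.shape == wavs.shape  # same shape; several spans are now zeroed
```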
+drop_chunk: !new:speechbrain.augment.time_domain.DropChunk + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 + +# Augmenter: Combines previously defined augmentations to perform data augmentation +wav_augment: !new:speechbrain.augment.augmenter.Augmenter + concat_original: True + min_augmentations: 4 + max_augmentations: 4 + augment_prob: 1.0 + augmentations: [ + !ref <speed_perturb>, + !ref <drop_freq>, + !ref <drop_chunk>] + +############################## Logging and Pretrainer ########################## checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref <save_folder> diff --git a/recipes/LibriSpeech/ASR/CTC/hparams/train_sb_wav2vec.yaml b/recipes/LibriSpeech/ASR/CTC/hparams/train_sb_wav2vec.yaml index cf9cf7ec89110d79e0e7223e44d8ffbbfac75081..1b281b35c5afc35265592e26d4f3ff04d03a8d5b 100644 --- a/recipes/LibriSpeech/ASR/CTC/hparams/train_sb_wav2vec.yaml +++ b/recipes/LibriSpeech/ASR/CTC/hparams/train_sb_wav2vec.yaml @@ -33,7 +33,7 @@ test_csv: - !ref <output_folder>/test-clean.csv - !ref <output_folder>/test-other.csv -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 30 lr: 0.0003 lr_wav2vec: 0.00005 @@ -58,7 +58,7 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: !ref <test_batch_size> -# Model parameters +####################### Model Parameters ####################################### dnn_activation: !new:torch.nn.LeakyReLU dnn_neurons: 1280 dnn_dropout: 0.15 @@ -68,75 +68,12 @@ freeze_wav2vec: False output_neurons: 29 # BPE size, index(blank/eos/bos) = 0 blank_index: 0 -# Decoding parameters -test_beam_search: - beam_size: 200 - topk: 1 - blank_index: !ref <blank_index> - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -10.0 - token_prune_min_logp: -5.0 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - # # Functions and classes # epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - -speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb - orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> - -# Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - -drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> - -# Time drop: randomly drops a number of temporal chunks. 
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - -drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> - -# Augmenter: Combines previously defined augmentations to perform data augmentation -wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: True - repeat_augment: 1 - shuffle_augmentations: False - min_augmentations: 4 - max_augmentations: 4 - augment_prob: 1.0 - augmentations: [ - !ref <speed_perturb>, - !ref <drop_freq>, - !ref <drop_chunk>] - enc: !new:speechbrain.nnet.containers.Sequential input_shape: [null, null, !ref <wav2vec_output_dim>] @@ -227,6 +164,58 @@ lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler annealing_factor: 0.7 patient: 0 +############################## Decoding ######################################## + +test_beam_search: + beam_size: 200 + topk: 1 + blank_index: !ref <blank_index> + space_token: ' ' # make sure this is the same as the one used in the tokenizer + beam_prune_logp: -10.0 + token_prune_min_logp: -5.0 + prune_history: True + alpha: 0.8 + beta: 1.2 + # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM + # It can either be a .bin or .arpa ; note: .arpa is much slower at loading + # If you don't want to use an LM, comment it out or set it to null + kenlm_model_path: null + +############################## Augmentations ################################### + +# Speed perturbation +speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb + orig_freq: !ref <sample_rate> + speeds: [95, 100, 105] + +# Frequency drop: randomly drops a number of frequency bands to zero. +drop_freq: !new:speechbrain.augment.time_domain.DropFreq + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 + +# Time drop: randomly drops a number of temporal chunks. 
+drop_chunk: !new:speechbrain.augment.time_domain.DropChunk + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 + +# Augmenter: Combines previously defined augmentations to perform data augmentation +wav_augment: !new:speechbrain.augment.augmenter.Augmenter + concat_original: True + min_augmentations: 4 + max_augmentations: 4 + augment_prob: 1.0 + augmentations: [ + !ref <speed_perturb>, + !ref <drop_freq>, + !ref <drop_chunk>] + +############################## Logging and Pretrainer ########################## + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref <save_folder> recoverables: diff --git a/recipes/LibriSpeech/ASR/CTC/train_with_wav2vec.py b/recipes/LibriSpeech/ASR/CTC/train_with_wav2vec.py index f2b6373b2ae5d476dc1c372c0513564b94c2191c..1f4ccdd2c6bdcdf62563989fb2c4d2e5de916fd6 100644 --- a/recipes/LibriSpeech/ASR/CTC/train_with_wav2vec.py +++ b/recipes/LibriSpeech/ASR/CTC/train_with_wav2vec.py @@ -101,10 +101,15 @@ class ASR(sb.Brain): ids = batch.id tokens, tokens_lens = batch.tokens - # Label Augmentation + # Labels must be extended if parallel augmentation or concatenated + # augmentation was performed on the input (increasing the batch dimension) if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): - tokens = self.hparams.wav_augment.replicate_labels(tokens) - tokens_lens = self.hparams.wav_augment.replicate_labels(tokens_lens) + ( + tokens, + tokens_lens, + ) = self.hparams.wav_augment.replicate_multiple_labels( + tokens, tokens_lens + ) loss_ctc = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) loss = loss_ctc diff --git a/recipes/LibriSpeech/ASR/CTC/train_with_whisper.py b/recipes/LibriSpeech/ASR/CTC/train_with_whisper.py index e3db36334d2ec34cab95d858ea427cd00993b9e1..d575265e86f0749f1cec8f30e26241b08c07d281 100644 --- a/recipes/LibriSpeech/ASR/CTC/train_with_whisper.py +++ b/recipes/LibriSpeech/ASR/CTC/train_with_whisper.py @@ -72,10 +72,15 @@ class ASR(sb.Brain): ids = batch.id tokens, tokens_lens = batch.tokens - # Label Augmentation + # Labels must be extended if parallel augmentation or concatenated + # augmentation was performed on the input (increasing the batch dimension) if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): - tokens = self.hparams.wav_augment.replicate_labels(tokens) - tokens_lens = self.hparams.wav_augment.replicate_labels(tokens_lens) + ( + tokens, + tokens_lens, + ) = self.hparams.wav_augment.replicate_multiple_labels( + tokens, tokens_lens + ) loss_ctc = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) loss = loss_ctc diff --git a/recipes/LibriSpeech/ASR/seq2seq/hparams/train_BPE_1000.yaml b/recipes/LibriSpeech/ASR/seq2seq/hparams/train_BPE_1000.yaml index e38b545fb3901046869bfaf96f7e3cb2a96422f4..3d0aaa200486f70767bec1c3e20c9058a0c978f1 100644 --- a/recipes/LibriSpeech/ASR/seq2seq/hparams/train_BPE_1000.yaml +++ b/recipes/LibriSpeech/ASR/seq2seq/hparams/train_BPE_1000.yaml @@ -44,7 +44,8 @@ data_folder_noise: !ref <data_folder>/noise # The noisy sequences for data augmentation NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1 noise_annotation: !ref <save_folder>/noise.csv # The data manifest files are created by the data preparation script -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 15 number_of_ctc_epochs: 5 batch_size: 8 @@ -89,7 +90,8 @@ valid_dataloader_opts: 
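The `replicate_multiple_labels` change in the two training scripts above is the counterpart of `concat_original: True` in these YAML files: the Augmenter returns the clean batch followed by its augmented copies, so B waveforms come back as 2B rows (the batch axis grows, not the time axis), and the label tensors must be replicated to stay aligned. A minimal sketch, shortened to a single augmentation and dummy CTC targets:

```python
# Minimal sketch of why compute_objectives must replicate labels when
# concat_original=True. One augmentation is used for brevity; the random
# token tensor stands in for real CTC targets.
import torch
from speechbrain.augment.augmenter import Augmenter
from speechbrain.augment.time_domain import DropChunk

wav_augment = Augmenter(
    concat_original=True,        # output = clean batch + augmented batch
    min_augmentations=1,
    max_augmentations=1,
    augment_prob=1.0,
    augmentations=[DropChunk(drop_length_low=1000, drop_length_high=2000)],
)

wavs, lens = torch.randn(4, 16000), torch.ones(4)
tokens = torch.randint(1, 29, (4, 12))
tokens_lens = torch.ones(4)

aug_wavs, aug_lens = wav_augment(wavs, lens)   # batch grows from 4 to 8
tokens, tokens_lens = wav_augment.replicate_multiple_labels(tokens, tokens_lens)
assert aug_wavs.shape[0] == tokens.shape[0] == 8  # rows stay aligned
```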
test_dataloader_opts: batch_size: 1 -# Model parameters +####################### Model Parameters ####################################### + activation: !name:torch.nn.LeakyReLU dropout: 0.15 cnn_blocks: 2 @@ -128,74 +130,6 @@ coverage_penalty: 1.5 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Download and prepare the dataset of noisy sequences for augmentation -prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL - URL: !ref <NOISE_DATASET_URL> - dest_folder: !ref <data_folder_noise> - ext: wav - csv_file: !ref <noise_annotation> - - -# Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation - -add_noise: !new:speechbrain.augment.time_domain.AddNoise - csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> - noise_sample_rate: !ref <sample_rate> - clean_sample_rate: !ref <sample_rate> - num_workers: !ref <num_workers> - -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - -speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb - orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> - -# Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - -drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> - -# Time drop: randomly drops a number of temporal chunks. 
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - -drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> - -# Augmenter: Combines previously defined augmentations to perform data augmentation -wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: True - repeat_augment: 1 - shuffle_augmentations: False - min_augmentations: 4 - max_augmentations: 4 - augment_prob: 1.0 - augmentations: [ - !ref <add_noise>, - !ref <speed_perturb>, - !ref <drop_freq>, - !ref <drop_chunk>] - normalize: !new:speechbrain.processing.features.InputNormalization norm_type: global @@ -288,7 +222,8 @@ modules: model: !new:torch.nn.ModuleList - [!ref <enc>, !ref <emb>, !ref <dec>, !ref <ctc_lin>, !ref <seq_lin>] -# Scorer +############################## Decoding & optimiser ############################ + coverage_scorer: !new:speechbrain.decoders.scorer.CoverageScorer vocab_size: !ref <output_neurons> @@ -339,6 +274,57 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler annealing_factor: 0.8 patient: 0 +############################## Augmentations ################################### + +prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL + URL: !ref <NOISE_DATASET_URL> + dest_folder: !ref <data_folder_noise> + ext: wav + csv_file: !ref <noise_annotation> + +# Add noise to input signal +add_noise: !new:speechbrain.augment.time_domain.AddNoise + csv_file: !ref <noise_annotation> + snr_low: 0 + snr_high: 15 + noise_sample_rate: !ref <sample_rate> + clean_sample_rate: !ref <sample_rate> + num_workers: !ref <num_workers> + +# Speed perturbation +speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb + orig_freq: !ref <sample_rate> + speeds: [95, 100, 105] + +# Frequency drop: randomly drops a number of frequency bands to zero. +drop_freq: !new:speechbrain.augment.time_domain.DropFreq + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 + +# Time drop: randomly drops a number of temporal chunks. 
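The `add_noise` stage configured above has one extra moving part compared to the other augmentations: a one-off download that materializes the noise manifest consumed at training time. End to end, and assuming the URL is reachable, the pipeline looks roughly like this; the local paths below are placeholders for the `!ref` keys in the recipe.

```python
# Sketch of the noise-augmentation path: fetch the noise archive once, write
# its CSV manifest, then mix noise into a batch at a random SNR in [0, 15] dB.
# Local paths are placeholders; the URL is the one used by these recipes.
import torch
from speechbrain.augment.preparation import prepare_dataset_from_URL
from speechbrain.augment.time_domain import AddNoise

NOISE_DATASET_URL = "https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1"
prepare_dataset_from_URL(
    URL=NOISE_DATASET_URL,
    dest_folder="./noise",       # placeholder for <data_folder_noise>
    ext="wav",
    csv_file="./noise.csv",      # placeholder for <noise_annotation>
)

add_noise = AddNoise(
    csv_file="./noise.csv",
    snr_low=0, snr_high=15,
    noise_sample_rate=16000, clean_sample_rate=16000,
)
wavs, lens = torch.randn(2, 32000), torch.ones(2)
noisy = add_noise(wavs, lens)    # same shape, noise mixed in at a random SNR
```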
+drop_chunk: !new:speechbrain.augment.time_domain.DropChunk + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 + +# Augmenter: Combines previously defined augmentations to perform data augmentation +wav_augment: !new:speechbrain.augment.augmenter.Augmenter + concat_original: True + min_augmentations: 4 + max_augmentations: 4 + augment_prob: 1.0 + augmentations: [ + !ref <add_noise>, + !ref <speed_perturb>, + !ref <drop_freq>, + !ref <drop_chunk>] + +############################## Logging and Pretrainer ########################## + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref <save_folder> recoverables: diff --git a/recipes/LibriSpeech/ASR/seq2seq/hparams/train_BPE_1000_sligru.yaml b/recipes/LibriSpeech/ASR/seq2seq/hparams/train_BPE_1000_sligru.yaml index 164f1ffe7431875b3c4ae5485b21be4c643beb34..355c49d36be26aeed18eac40ef91ea9249d13905 100644 --- a/recipes/LibriSpeech/ASR/seq2seq/hparams/train_BPE_1000_sligru.yaml +++ b/recipes/LibriSpeech/ASR/seq2seq/hparams/train_BPE_1000_sligru.yaml @@ -44,7 +44,8 @@ data_folder_noise: !ref <data_folder>/noise # The noisy sequences for data augmentation NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1 noise_annotation: !ref <save_folder>/noise.csv # The data manifest files are created by the data preparation script -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 15 number_of_ctc_epochs: 15 batch_size: 24 @@ -89,7 +90,8 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -# Model parameters +####################### Model Parameters ####################################### + activation: !name:torch.nn.LeakyReLU dropout: 0.15 cnn_blocks: 2 @@ -126,75 +128,6 @@ temperature_lm: 1.25 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Download and prepare the dataset of noisy sequences for augmentation -prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL - URL: !ref <NOISE_DATASET_URL> - dest_folder: !ref <data_folder_noise> - ext: wav - csv_file: !ref <noise_annotation> - - -# Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation - -add_noise: !new:speechbrain.augment.time_domain.AddNoise - csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> - noise_sample_rate: !ref <sample_rate> - clean_sample_rate: !ref <sample_rate> - num_workers: !ref <num_workers> - -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - -speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb - orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> - -# Frequency drop: randomly drops a number of frequency bands to zero. 
-drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - -drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> - -# Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - -drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> - -# Augmenter: Combines previously defined augmentations to perform data augmentation -wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: True - repeat_augment: 1 - shuffle_augmentations: False - min_augmentations: 4 - max_augmentations: 4 - augment_prob: 1.0 - augmentations: [ - !ref <add_noise>, - !ref <speed_perturb>, - !ref <drop_freq>, - !ref <drop_chunk>] - - normalize: !new:speechbrain.processing.features.InputNormalization norm_type: global @@ -286,7 +219,8 @@ modules: model: !new:torch.nn.ModuleList - [!ref <enc>, !ref <emb>, !ref <dec>, !ref <ctc_lin>, !ref <seq_lin>] -# Scorer +############################## Decoding & optimiser ############################ + coverage_scorer: !new:speechbrain.decoders.scorer.CoverageScorer vocab_size: !ref <output_neurons> @@ -337,6 +271,57 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler annealing_factor: 0.8 patient: 0 +############################## Augmentations ################################### + +prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL + URL: !ref <NOISE_DATASET_URL> + dest_folder: !ref <data_folder_noise> + ext: wav + csv_file: !ref <noise_annotation> + +# Add noise to input signal +add_noise: !new:speechbrain.augment.time_domain.AddNoise + csv_file: !ref <noise_annotation> + snr_low: 0 + snr_high: 15 + noise_sample_rate: !ref <sample_rate> + clean_sample_rate: !ref <sample_rate> + num_workers: !ref <num_workers> + +# Speed perturbation +speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb + orig_freq: !ref <sample_rate> + speeds: [95, 100, 105] + +# Frequency drop: randomly drops a number of frequency bands to zero. +drop_freq: !new:speechbrain.augment.time_domain.DropFreq + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 + +# Time drop: randomly drops a number of temporal chunks. 
+drop_chunk: !new:speechbrain.augment.time_domain.DropChunk + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 + +# Augmenter: Combines previously defined augmentations to perform data augmentation +wav_augment: !new:speechbrain.augment.augmenter.Augmenter + concat_original: True + min_augmentations: 4 + max_augmentations: 4 + augment_prob: 1.0 + augmentations: [ + !ref <add_noise>, + !ref <speed_perturb>, + !ref <drop_freq>, + !ref <drop_chunk>] + +############################## Logging and Pretrainer ########################## + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref <save_folder> recoverables: diff --git a/recipes/LibriSpeech/ASR/seq2seq/hparams/train_BPE_5000.yaml b/recipes/LibriSpeech/ASR/seq2seq/hparams/train_BPE_5000.yaml index cc0647562067be9cd70dfb69a03d40f2c68287ab..3046dfea80643d7c90025e21e33a830f6b615613 100644 --- a/recipes/LibriSpeech/ASR/seq2seq/hparams/train_BPE_5000.yaml +++ b/recipes/LibriSpeech/ASR/seq2seq/hparams/train_BPE_5000.yaml @@ -45,7 +45,8 @@ data_folder_noise: !ref <data_folder>/noise # The noisy sequencies for data augm NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1 noise_annotation: !ref <save_folder>/noise.csv #The data manifest files are created by the data preparation script -# Training parameters +####################### Training Parameters #################################### + number_of_epochs: 25 number_of_ctc_epochs: 25 batch_size: 8 @@ -90,7 +91,8 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -# Model parameters +####################### Model Parameters ####################################### + activation: !name:torch.nn.LeakyReLU dropout: 0.15 cnn_blocks: 2 @@ -132,75 +134,6 @@ coverage_penalty: 1.5 epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Download and prepare the dataset of noisy sequences for augmentation -prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL - URL: !ref <NOISE_DATASET_URL> - dest_folder: !ref <data_folder_noise> - ext: wav - csv_file: !ref <noise_annotation> - - -# Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation - -add_noise: !new:speechbrain.augment.time_domain.AddNoise - csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> - noise_sample_rate: !ref <sample_rate> - clean_sample_rate: !ref <sample_rate> - num_workers: !ref <num_workers> - -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - -speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb - orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> - -# Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - -drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> - -# Time drop: randomly drops a number of temporal chunks. 
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - -drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> - -# Augmenter: Combines previously defined augmentations to perform data augmentation -wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: True - repeat_augment: 1 - shuffle_augmentations: False - min_augmentations: 4 - max_augmentations: 4 - augment_prob: 1.0 - augmentations: [ - !ref <add_noise>, - !ref <speed_perturb>, - !ref <drop_freq>, - !ref <drop_chunk>] - - normalize: !new:speechbrain.processing.features.InputNormalization norm_type: global @@ -294,7 +227,8 @@ modules: model: !new:torch.nn.ModuleList - [!ref <enc>, !ref <emb>, !ref <dec>, !ref <ctc_lin>, !ref <seq_lin>] -# Scorer +############################## Decoding & optimiser ############################ + ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer eos_index: !ref <eos_index> blank_index: !ref <blank_index> @@ -360,6 +294,57 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler annealing_factor: 0.8 patient: 0 +############################## Augmentations ################################### + +prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL + URL: !ref <NOISE_DATASET_URL> + dest_folder: !ref <data_folder_noise> + ext: wav + csv_file: !ref <noise_annotation> + +# Add noise to input signal +add_noise: !new:speechbrain.augment.time_domain.AddNoise + csv_file: !ref <noise_annotation> + snr_low: 0 + snr_high: 15 + noise_sample_rate: !ref <sample_rate> + clean_sample_rate: !ref <sample_rate> + num_workers: !ref <num_workers> + +# Speed perturbation +speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb + orig_freq: !ref <sample_rate> + speeds: [95, 100, 105] + +# Frequency drop: randomly drops a number of frequency bands to zero. +drop_freq: !new:speechbrain.augment.time_domain.DropFreq + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 + +# Time drop: randomly drops a number of temporal chunks. 
+drop_chunk: !new:speechbrain.augment.time_domain.DropChunk + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 + +# Augmenter: Combines previously defined augmentations to perform data augmentation +wav_augment: !new:speechbrain.augment.augmenter.Augmenter + concat_original: True + min_augmentations: 4 + max_augmentations: 4 + augment_prob: 1.0 + augmentations: [ + !ref <add_noise>, + !ref <speed_perturb>, + !ref <drop_freq>, + !ref <drop_chunk>] + +############################## Logging and Pretrainer ########################## + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref <save_folder> recoverables: diff --git a/recipes/LibriSpeech/ASR/seq2seq/train.py b/recipes/LibriSpeech/ASR/seq2seq/train.py index b3adaa67ab1d799211ebda84c01653a2a59d825f..7f535100876f9b05a935b30af641bc66f47dc168 100644 --- a/recipes/LibriSpeech/ASR/seq2seq/train.py +++ b/recipes/LibriSpeech/ASR/seq2seq/train.py @@ -97,12 +97,16 @@ class ASR(sb.Brain): tokens_eos, tokens_eos_lens = batch.tokens_eos tokens, tokens_lens = batch.tokens + # Labels must be extended if parallel augmentation or concatenated + # augmentation was performed on the input (increasing the batch dimension) if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): - tokens = self.hparams.wav_augment.replicate_labels(tokens) - tokens_lens = self.hparams.wav_augment.replicate_labels(tokens_lens) - tokens_eos = self.hparams.wav_augment.replicate_labels(tokens_eos) - tokens_eos_lens = self.hparams.wav_augment.replicate_labels( - tokens_eos_lens + ( + tokens, + tokens_lens, + tokens_eos, + tokens_eos_lens, + ) = self.hparams.wav_augment.replicate_multiple_labels( + tokens, tokens_lens, tokens_eos, tokens_eos_lens ) loss_seq = self.hparams.seq_cost( diff --git a/recipes/LibriSpeech/ASR/transducer/hparams/conformer_transducer.yaml b/recipes/LibriSpeech/ASR/transducer/hparams/conformer_transducer.yaml index c7ad99c6386a4626a4c5a29e75920818d197ebf0..e9757e2080cc29edbd96c459f2b5552d0d17655c 100644 --- a/recipes/LibriSpeech/ASR/transducer/hparams/conformer_transducer.yaml +++ b/recipes/LibriSpeech/ASR/transducer/hparams/conformer_transducer.yaml @@ -40,7 +40,8 @@ test_csv: skip_prep: False ckpt_interval_minutes: 5 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### + # To make Transformers converge, the global bath size should be large enough. # The global batch size is computed as batch_size * n_gpus * grad_accumulation_factor. # Empirically, we found that this value should be >= 128.
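The recipe comments above encode a simple constraint; a quick sanity check with illustrative numbers (hypothetical values, not the defaults of any of these recipes):

# Hypothetical configuration: any combination whose product reaches 128
# satisfies the empirical global-batch-size threshold quoted above.
batch_size = 16
n_gpus = 4
grad_accumulation_factor = 2
global_batch_size = batch_size * n_gpus * grad_accumulation_factor
assert global_batch_size >= 128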
@@ -120,7 +121,8 @@ dynamic_batch_sampler: batch_ordering: random max_batch_ex: 256 -# Model parameters +####################### Model Parameters ####################################### + # Transformer d_model: 512 joint_dim: 640 @@ -164,18 +166,15 @@ compute_features: !new:speechbrain.lobes.features.Fbank n_mels: !ref <n_mels> win_length: !ref <win_length> +############################## Augmentations ################################### + # Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 1 max_augmentations: 1 augment_prob: 1.0 @@ -183,43 +182,24 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter # Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 5 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 5 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "zeros" # Method of dropping chunks - time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 5 + drop_count_high: 5 + replace: "zeros" # Frequency Drop -freq_drop_length_low: 25 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 35 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "zeros" # Method of dropping chunks - freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 25 + drop_length_high: 35 + drop_count_low: 2 + drop_count_high: 2 + replace: "zeros" dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter parallel_augment: False @@ -234,6 +214,8 @@ fea_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <freq_drop>, !ref <time_warp>] +############################## Models ########################################## + CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd input_shape: (8, 10, 80) num_blocks: 2 @@ -355,6 +337,8 @@ modules: model: !new:torch.nn.ModuleList - [!ref <CNN>, !ref <enc>, !ref <emb>, !ref <dec>, !ref <proj_enc>, !ref <proj_dec>, !ref <proj_ctc>, !ref <transducer_lin>] 
+############################## Decoding & optimiser ############################ + # Tokenizer initialization tokenizer: !new:sentencepiece.SentencePieceProcessor @@ -388,6 +372,8 @@ noam_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref <lr> n_warmup_steps: !ref <warmup_steps> +############################## Logging and Pretrainer ########################## + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref <save_folder> recoverables: diff --git a/recipes/LibriSpeech/ASR/transducer/train.py b/recipes/LibriSpeech/ASR/transducer/train.py index 497912c83b97e10cfea19ec78974195e065123a2..84d7e05ffa3701c172be2c79765a59bade434d30 100644 --- a/recipes/LibriSpeech/ASR/transducer/train.py +++ b/recipes/LibriSpeech/ASR/transducer/train.py @@ -155,27 +155,16 @@ class ASR(sb.Brain): logits_transducer, wav_lens, predicted_tokens = predictions if stage == sb.Stage.TRAIN: - if hasattr(self.hparams, "wav_augment"): - tokens = self.hparams.wav_augment.replicate_labels(tokens) - token_lens = self.hparams.wav_augment.replicate_labels( - token_lens - ) - tokens_eos = self.hparams.wav_augment.replicate_labels( - tokens_eos - ) - token_eos_lens = self.hparams.wav_augment.replicate_labels( - token_eos_lens - ) + # Labels must be extended if parallel augmentation or concatenated + # augmentation was performed on the input (increasing the batch dimension) if hasattr(self.hparams, "fea_augment"): - tokens = self.hparams.fea_augment.replicate_labels(tokens) - token_lens = self.hparams.fea_augment.replicate_labels( - token_lens - ) - tokens_eos = self.hparams.fea_augment.replicate_labels( - tokens_eos - ) - token_eos_lens = self.hparams.fea_augment.replicate_labels( - token_eos_lens + ( + tokens, + token_lens, + tokens_eos, + token_eos_lens, + ) = self.hparams.fea_augment.replicate_multiple_labels( + tokens, token_lens, tokens_eos, token_eos_lens ) if stage == sb.Stage.TRAIN: diff --git a/recipes/LibriSpeech/ASR/transformer/hparams/bayesspeech.yaml b/recipes/LibriSpeech/ASR/transformer/hparams/bayesspeech.yaml index 7772517e905a8482527334a7f923dbefa6cc85b0..2eee3646ecc87e806ae067ba4417178a17dbb021 100644 --- a/recipes/LibriSpeech/ASR/transformer/hparams/bayesspeech.yaml +++ b/recipes/LibriSpeech/ASR/transformer/hparams/bayesspeech.yaml @@ -42,7 +42,8 @@ test_csv: ckpt_interval_minutes: 30 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### + # To make Transformers converge, the global bath size should be large enough. # The global batch size is computed as batch_size * n_gpus * grad_accumulation_factor. # Empirically, we found that this value should be >= 128.
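The replicate_multiple_labels call introduced in the train.py hunks above replaces four separate replicate_labels calls. A minimal sketch of the assumed semantics (not the SpeechBrain implementation; replicate_multiple_labels_sketch and its factor argument are hypothetical) shows why every target tensor must be tiled along the batch axis when augmentation enlarges the input batch:

import torch

def replicate_multiple_labels_sketch(factor, *labels):
    # Tile each label tensor along the batch dimension by `factor`, the
    # same factor by which concat_original/parallel_augment grew the batch.
    return tuple(
        lab.repeat(factor, *([1] * (lab.dim() - 1))) for lab in labels
    )

tokens = torch.zeros(8, 20, dtype=torch.long)  # (batch, length)
tokens_lens = torch.ones(8)
tokens, tokens_lens = replicate_multiple_labels_sketch(2, tokens, tokens_lens)
assert tokens.shape == (16, 20) and tokens_lens.shape == (16,)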
@@ -116,7 +117,7 @@ test_dataloader_opts: padding_kwargs: value: !ref <pad_index> -####################### Model parameters ########################### +####################### Model Parameters ####################################### # Transformer d_model: 512 nhead: 4 @@ -148,7 +149,7 @@ test_beam_size: 66 lm_weight: 0.60 ctc_weight_decode: 0.40 -############################## models ################################ +############################## Models ########################################## CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd input_shape: (8, 10, 80) @@ -216,7 +217,8 @@ Adam: !name:torch.optim.Adam eps: 0.000000001 -# Scorer +############################## Decoding & optimiser ############################ + ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer eos_index: !ref <eos_index> blank_index: !ref <blank_index> @@ -290,57 +292,34 @@ normalize: !new:speechbrain.processing.features.InputNormalization norm_type: global update_until_epoch: 4 -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 4 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 4 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "mean" # Method of dropping chunks - time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" # Frequency Drop -freq_drop_length_low: 10 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 20 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 4 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 4 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "mean" # Method of dropping chunks - freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 10 + drop_length_high: 20 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -354,6 +333,8 @@ compute_features: !new:speechbrain.lobes.features.Fbank n_fft: !ref <n_fft> 
n_mels: !ref <n_mels> +############################## Logging and Pretrainer ########################## + train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref <train_log> diff --git a/recipes/LibriSpeech/ASR/transformer/hparams/branchformer_large.yaml b/recipes/LibriSpeech/ASR/transformer/hparams/branchformer_large.yaml index c3f66ec9f3faa95c53320d5dec12e3ceba1d9a61..02fc2eac46b99dfa8801fb986f0fbf0a507ffde9 100644 --- a/recipes/LibriSpeech/ASR/transformer/hparams/branchformer_large.yaml +++ b/recipes/LibriSpeech/ASR/transformer/hparams/branchformer_large.yaml @@ -41,9 +41,11 @@ test_csv: - !ref <output_folder>/test-clean.csv - !ref <output_folder>/test-other.csv -# Training parameters +####################### Training Parameters #################################### + # To make Transformers converge, the global bath size should be large enough. -# The global batch size is computed as batch_size * n_gpus * grad_accumulation_factor. +# The global batch size is computed as batch_size * n_gpus * +# grad_accumulation_factor. # Empirically, we found that this value should be >= 128. # Please, set your parameters accordingly. number_of_epochs: 120 @@ -103,7 +105,8 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -####################### Model parameters ########################### +####################### Model Parameters ####################################### + # Transformer d_model: 512 nhead: 8 @@ -131,7 +134,7 @@ test_beam_size: 66 lm_weight: 0.60 ctc_weight_decode: 0.40 -############################## models ################################ +############################## Models ########################################## CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd input_shape: (8, 10, 80) @@ -204,7 +207,8 @@ Adam: !name:torch.optim.AdamW eps: 0.000000001 weight_decay: !ref <weight_decay> -# Scorer +####################### Decoding & optimiser ################################### + ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer eos_index: !ref <eos_index> blank_index: !ref <blank_index> @@ -275,57 +279,34 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +####################### Augmentations ########################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 4 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 4 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "mean" # Method of dropping chunks - time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 - -# Frequency Drop -freq_drop_length_low: 10 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 20 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 4 # Min number of chunks to drop 
in frequency in the spectrogram -freq_drop_count_high: 4 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "mean" # Method of dropping chunks + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" +# Freq Drop freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 10 + drop_length_high: 20 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -340,6 +321,8 @@ compute_features: !new:speechbrain.lobes.features.Fbank win_length: !ref <win_length> n_mels: !ref <n_mels> +############################## Logging and Pretrainer ########################## + train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref <train_log> diff --git a/recipes/LibriSpeech/ASR/transformer/hparams/conformer_large.yaml b/recipes/LibriSpeech/ASR/transformer/hparams/conformer_large.yaml index 5d252a4b59f3d6603bd6b4a7ef95cd79df19d21a..7cdd4c06f91cac155d273ba9a9f9edd7dcc52885 100644 --- a/recipes/LibriSpeech/ASR/transformer/hparams/conformer_large.yaml +++ b/recipes/LibriSpeech/ASR/transformer/hparams/conformer_large.yaml @@ -41,7 +41,8 @@ test_csv: - !ref <output_folder>/test-clean.csv - !ref <output_folder>/test-other.csv -# Training parameters +####################### Training Parameters #################################### + # To make Transformers converge, the global bath size should be large enough. # The global batch size is computed as batch_size * n_gpus * grad_accumulation_factor. # Empirically, we found that this value should be >= 128. 
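A rough usage sketch of the inlined fea_augment pipeline above (the classes and argument names are the ones referenced in the YAML; the shapes, values, and the call convention feats, lens = fea_augment(feats, lens) are assumptions based on how these recipes use it):

import torch
from speechbrain.augment.augmenter import Augmenter
from speechbrain.augment.freq_domain import SpectrogramDrop, Warping

time_drop = SpectrogramDrop(drop_length_low=15, drop_length_high=25,
                            drop_count_low=4, drop_count_high=4,
                            replace="mean", dim=1)
freq_drop = SpectrogramDrop(drop_length_low=10, drop_length_high=20,
                            drop_count_low=4, drop_count_high=4,
                            replace="mean", dim=2)
time_warp = Warping()  # relies on defaults matching the values removed above

fea_augment = Augmenter(min_augmentations=3, max_augmentations=3,
                        augment_prob=1.0,
                        augmentations=[time_drop, freq_drop, time_warp])

feats = torch.randn(8, 200, 80)  # (batch, time, n_mels), hypothetical batch
lens = torch.ones(8)
feats, lens = fea_augment(feats, lens)  # all three transforms, every batch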
@@ -102,7 +103,8 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -####################### Model parameters ########################### +####################### Model Parameters ####################################### + # Transformer d_model: 512 nhead: 8 @@ -129,7 +131,7 @@ test_beam_size: 66 lm_weight: 0.60 ctc_weight_decode: 0.40 -############################## models ################################ +############################## Models ########################################## CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd input_shape: (8, 10, 80) @@ -200,7 +202,8 @@ Adam: !name:torch.optim.AdamW model: !new:torch.nn.ModuleList - [!ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>] -# Scorer +####################### Decoding & optimiser ########################### + ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer eos_index: !ref <eos_index> blank_index: !ref <blank_index> @@ -271,57 +274,34 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 4 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 4 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "mean" # Method of dropping chunks - time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 - -# Frequency Drop -freq_drop_length_low: 10 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 20 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 4 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 4 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "mean" # Method of dropping chunks + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" +# Freq Drop freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 10 + drop_length_high: 20 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 
max_augmentations: 3 augment_prob: 1.0 @@ -336,6 +316,8 @@ compute_features: !new:speechbrain.lobes.features.Fbank n_mels: !ref <n_mels> win_length: !ref <win_length> +############################## Logging and Pretrainer ########################## + train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref <train_log> diff --git a/recipes/LibriSpeech/ASR/transformer/hparams/conformer_small.yaml b/recipes/LibriSpeech/ASR/transformer/hparams/conformer_small.yaml index eddc967800ccabb834d3c3751feba187b0262ad0..a24e6649a28488cf2d3cdf4a29242717134d6ae8 100644 --- a/recipes/LibriSpeech/ASR/transformer/hparams/conformer_small.yaml +++ b/recipes/LibriSpeech/ASR/transformer/hparams/conformer_small.yaml @@ -40,7 +40,8 @@ test_csv: - !ref <output_folder>/test-clean.csv - !ref <output_folder>/test-other.csv -# Training parameters +####################### Training Parameters #################################### + # To make Transformers converge, the global bath size should be large enough. # The global batch size is computed as batch_size * n_gpus * grad_accumulation_factor. # Empirically, we found that this value should be >= 128. @@ -102,7 +103,8 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -####################### Model parameters ########################### +####################### Model Parameters ####################################### + # Transformer d_model: 144 nhead: 4 @@ -129,7 +131,7 @@ test_beam_size: 66 lm_weight: 0.60 ctc_weight_decode: 0.40 -############################## models ################################ +############################## Models ########################################## CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd input_shape: (8, 10, 80) @@ -200,12 +202,8 @@ Adam: !name:torch.optim.Adam betas: (0.9, 0.98) eps: 0.000000001 -#SGD: !name:torch.optim.SGD -# lr: !ref <lr_sgd> -# momentum: 0.99 -# nesterov: True +############################## Decoding & optimiser ############################ -# Scorer ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer eos_index: !ref <eos_index> blank_index: !ref <blank_index> @@ -277,57 +275,34 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 4 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 4 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "mean" # Method of dropping chunks - time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 - -# Frequency Drop -freq_drop_length_low: 10 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 20 # Max length for chunks to drop in frequency in the 
spectrogram -freq_drop_count_low: 4 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 4 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "mean" # Method of dropping chunks + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" +# Freq Drop freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 10 + drop_length_high: 20 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -341,6 +316,8 @@ compute_features: !new:speechbrain.lobes.features.Fbank n_fft: !ref <n_fft> n_mels: !ref <n_mels> +############################## Logging and Pretrainer ########################## + train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref <train_log> diff --git a/recipes/LibriSpeech/ASR/transformer/hparams/hyperbranchformer_13M.yaml b/recipes/LibriSpeech/ASR/transformer/hparams/hyperbranchformer_13M.yaml index 7b2912ec33a8d0b6efe84a29048654ee1004442b..4b6ca718f306a9d81a406d4c92c6d8536aa9721a 100644 --- a/recipes/LibriSpeech/ASR/transformer/hparams/hyperbranchformer_13M.yaml +++ b/recipes/LibriSpeech/ASR/transformer/hparams/hyperbranchformer_13M.yaml @@ -40,7 +40,8 @@ test_csv: - !ref <output_folder>/test-clean.csv - !ref <output_folder>/test-other.csv -# Training parameters +############################## Training Parameters ############################# + # To make Transformers converge, the global bath size should be large enough. # The global batch size is computed as batch_size * n_gpus * grad_accumulation_factor. # Empirically, we found that this value should be >= 128. 
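The refactoring pattern repeated across these files swaps a top-level hyperparameter plus !ref indirection for the literal value. In HyperPyYAML the two spellings construct the same object, which is what makes the change purely cosmetic; a quick equivalence check (assuming, as is typical for SpeechBrain modules, that init arguments are stored as attributes of the constructed object):

from hyperpyyaml import load_hyperpyyaml

yaml_with_ref = (
    "time_drop_length_low: 15\n"
    "time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop\n"
    "    drop_length_low: !ref <time_drop_length_low>\n"
)
yaml_inline = (
    "time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop\n"
    "    drop_length_low: 15\n"
)
a = load_hyperpyyaml(yaml_with_ref)
b = load_hyperpyyaml(yaml_inline)
assert a["time_drop"].drop_length_low == b["time_drop"].drop_length_low == 15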
@@ -99,7 +100,8 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -####################### Model parameters ########################### +####################### Model Parameters ####################################### + # Transformer d_model: 144 nhead: 8 @@ -133,7 +135,7 @@ test_beam_size: 66 lm_weight: 0.60 ctc_weight_decode: 0.40 -############################## models ################################ +############################## Models ########################################## CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd input_shape: (8, 10, 80) @@ -206,7 +208,8 @@ Adam: !name:torch.optim.Adam betas: (0.9, 0.98) eps: 0.000000001 -# Scorer +############################## Decoding & optimiser ############################ + ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer eos_index: !ref <eos_index> blank_index: !ref <blank_index> @@ -276,57 +279,34 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 4 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 4 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "mean" # Method of dropping chunks - time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 - -# Frequency Drop -freq_drop_length_low: 10 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 20 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 4 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 4 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "mean" # Method of dropping chunks + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" +# Freq Drop freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 10 + drop_length_high: 20 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -340,6 +320,8 @@ 
compute_features: !new:speechbrain.lobes.features.Fbank n_fft: !ref <n_fft> n_mels: !ref <n_mels> +############################## Logging and Pretrainer ########################## + train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref <train_log> diff --git a/recipes/LibriSpeech/ASR/transformer/hparams/hyperbranchformer_25M.yaml b/recipes/LibriSpeech/ASR/transformer/hparams/hyperbranchformer_25M.yaml index 71d97cdb9f2920b8cb6a86a554e7d59b06148370..2e0242e311442c5ece15bdf30b2287f054a1475d 100644 --- a/recipes/LibriSpeech/ASR/transformer/hparams/hyperbranchformer_25M.yaml +++ b/recipes/LibriSpeech/ASR/transformer/hparams/hyperbranchformer_25M.yaml @@ -40,7 +40,8 @@ test_csv: - !ref <output_folder>/test-clean.csv - !ref <output_folder>/test-other.csv -# Training parameters +############################## Training Parameters ############################# + # To make Transformers converge, the global bath size should be large enough. # The global batch size is computed as batch_size * n_gpus * grad_accumulation_factor. # Empirically, we found that this value should be >= 128. @@ -100,7 +101,7 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -####################### Model parameters ########################### +####################### Model Parameters ####################################### # Transformer d_model: 256 nhead: 8 @@ -134,7 +135,7 @@ test_beam_size: 66 lm_weight: 0.60 ctc_weight_decode: 0.40 -############################## models ################################ +############################## Models ########################################## CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd input_shape: (8, 10, 80) @@ -207,7 +208,8 @@ Adam: !name:torch.optim.Adam betas: (0.9, 0.98) eps: 0.000000001 -# Scorer +############################## Decoding & optimiser ############################ + ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer eos_index: !ref <eos_index> blank_index: !ref <blank_index> @@ -277,57 +279,34 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 4 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 4 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "mean" # Method of dropping chunks - time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 - -# Frequency Drop -freq_drop_length_low: 10 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 20 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 4 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 4 # Max number of 
chunks to drop in frequency in the spectrogram -freq_drop_replace: "mean" # Method of dropping chunks + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" +# Freq Drop freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 10 + drop_length_high: 20 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -335,11 +314,14 @@ fea_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <time_drop>, !ref <freq_drop>, !ref <time_warp>] + compute_features: !new:speechbrain.lobes.features.Fbank sample_rate: !ref <sample_rate> n_fft: !ref <n_fft> n_mels: !ref <n_mels> +############################## Logging and Pretrainer ########################## + train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref <train_log> diff --git a/recipes/LibriSpeech/ASR/transformer/hparams/hyperconformer_22M.yaml b/recipes/LibriSpeech/ASR/transformer/hparams/hyperconformer_22M.yaml index fdf65fde380ce806eb6216bf2fb5cb308c30a011..6e165ed5c65424d1432c3c7c39f6615175830ed5 100644 --- a/recipes/LibriSpeech/ASR/transformer/hparams/hyperconformer_22M.yaml +++ b/recipes/LibriSpeech/ASR/transformer/hparams/hyperconformer_22M.yaml @@ -40,7 +40,8 @@ test_csv: - !ref <output_folder>/test-clean.csv - !ref <output_folder>/test-other.csv -# Training parameters +####################### Training Parameters #################################### + # To make Transformers converge, the global bath size should be large enough. # The global batch size is computed as batch_size * n_gpus * grad_accumulation_factor. # Empirically, we found that this value should be >= 128. 
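Several of the hunks above also delete parallel_augment: False, repeat_augment: 1, and shuffle_augmentations: False, relying on those being the Augmenter defaults. Under that assumption, the per-batch selection logic reduces to roughly the following (a simplified sketch, not the library code; each transform is assumed to map (x, lengths) to (x, lengths)):

import random

def apply_augmentations(x, lengths, augmentations,
                        min_augmentations, max_augmentations, augment_prob):
    # With probability 1 - augment_prob the batch passes through untouched.
    if random.random() > augment_prob:
        return x, lengths
    # Choose how many transforms to apply; without shuffling they run in
    # declaration order, once each (repeat_augment == 1).
    n = random.randint(min_augmentations, max_augmentations)
    for aug in augmentations[:n]:
        x, lengths = aug(x, lengths)
    return x, lengths

With min_augmentations = max_augmentations = 3 and augment_prob = 1.0, all three spectrogram transforms run on every training batch.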
@@ -87,6 +88,7 @@ dynamic_batch_sampler_valid: batch_ordering: !ref <batch_ordering> max_batch_ex: !ref <max_batch_ex> + # Dataloader options train_dataloader_opts: batch_size: !ref <batch_size> @@ -99,7 +101,8 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -####################### Model parameters ########################### +####################### Model Parameters ####################################### + # Transformer d_model: 256 nhead: 8 @@ -132,7 +135,7 @@ test_beam_size: 66 lm_weight: 0.60 ctc_weight_decode: 0.40 -############################## models ################################ +############################## Models ########################################## CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd input_shape: (8, 10, 80) @@ -203,7 +206,8 @@ Adam: !name:torch.optim.Adam betas: (0.9, 0.98) eps: 0.000000001 -# Scorer +####################### Decoding & optimiser ################################### + ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer eos_index: !ref <eos_index> blank_index: !ref <blank_index> @@ -273,57 +277,34 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 4 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 4 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "mean" # Method of dropping chunks - time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 - -# Frequency Drop -freq_drop_length_low: 10 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 20 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 4 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 4 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "mean" # Method of dropping chunks + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" +# Freq Drop freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 10 + drop_length_high: 20 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: 
!new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -337,6 +318,8 @@ compute_features: !new:speechbrain.lobes.features.Fbank n_fft: !ref <n_fft> n_mels: !ref <n_mels> +############################## Logging and Pretrainer ########################## + train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref <train_log> diff --git a/recipes/LibriSpeech/ASR/transformer/hparams/hyperconformer_8M.yaml b/recipes/LibriSpeech/ASR/transformer/hparams/hyperconformer_8M.yaml index 47053b5fff683fd287339d20a5b5cd86e1488a65..fe3bd599c7712a95d2d55300d0073329a068a7e5 100644 --- a/recipes/LibriSpeech/ASR/transformer/hparams/hyperconformer_8M.yaml +++ b/recipes/LibriSpeech/ASR/transformer/hparams/hyperconformer_8M.yaml @@ -40,7 +40,8 @@ test_csv: - !ref <output_folder>/test-clean.csv - !ref <output_folder>/test-other.csv -# Training parameters +####################### Training Parameters #################################### + # To make Transformers converge, the global bath size should be large enough. # The global batch size is computed as batch_size * n_gpus * grad_accumulation_factor. # Empirically, we found that this value should be >= 128. @@ -100,7 +101,8 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -####################### Model parameters ########################### +####################### Model Parameters ####################################### + # Transformer d_model: 144 nhead: 8 @@ -133,7 +135,7 @@ test_beam_size: 66 lm_weight: 0.60 ctc_weight_decode: 0.40 -############################## models ################################ +############################## Models ########################################## CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd input_shape: (8, 10, 80) @@ -204,7 +206,8 @@ Adam: !name:torch.optim.Adam betas: (0.9, 0.98) eps: 0.000000001 -# Scorer +####################### Decoding & optimiser ########################### + ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer eos_index: !ref <eos_index> blank_index: !ref <blank_index> @@ -274,57 +277,34 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 4 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 4 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "mean" # Method of dropping chunks - time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 - -# Frequency Drop -freq_drop_length_low: 10 # Min length for chunks to drop in frequency in the spectrogram 
-freq_drop_length_high: 20 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 4 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 4 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "mean" # Method of dropping chunks + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" +# Freq Drop freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 10 + drop_length_high: 20 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -338,6 +318,8 @@ compute_features: !new:speechbrain.lobes.features.Fbank n_fft: !ref <n_fft> n_mels: !ref <n_mels> +############################## Logging and Pretrainer ########################## + train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref <train_log> diff --git a/recipes/LibriSpeech/ASR/transformer/hparams/train_hf_whisper.yaml b/recipes/LibriSpeech/ASR/transformer/hparams/train_hf_whisper.yaml index 4805d7c6c543ab7369ef6c8b66751c2e90a1ec7d..4891ca61746a15f7af937e5e6bf53ef7ed1c372f 100644 --- a/recipes/LibriSpeech/ASR/transformer/hparams/train_hf_whisper.yaml +++ b/recipes/LibriSpeech/ASR/transformer/hparams/train_hf_whisper.yaml @@ -35,7 +35,8 @@ test_csv: ckpt_interval_minutes: 30 # save checkpoint every N min -# Training parameters +############################## Training Parameters ############################# + number_of_epochs: 1 lr_whisper: 0.00003 sorting: ascending @@ -61,7 +62,7 @@ min_decode_ratio: 0.0 max_decode_ratio: 1.0 test_beam_size: 8 -# Model parameters +####################### Model Parameters ####################################### freeze_whisper: False @@ -74,52 +75,34 @@ valid_loader_kwargs: test_loader_kwargs: batch_size: !ref <test_batch_size> - -# -# Functions and classes -# epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. 
-drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 # Min frequency band dropout probability + drop_freq_high: 1 # Max frequency band dropout probability + drop_freq_count_low: 1 # Min number of frequency bands to drop + drop_freq_count_high: 3 # Max number of frequency bands to drop + drop_freq_width: 0.05 # Width of frequency bands to drop # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -128,6 +111,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + whisper: !new:speechbrain.lobes.models.huggingface_transformers.whisper.Whisper source: !ref <whisper_hub> freeze: !ref <freeze_whisper> @@ -142,6 +127,8 @@ nll_loss: !name:speechbrain.nnet.losses.nll_loss modules: whisper: !ref <whisper> +############################## Decoding & optimiser ############################ + whisper_opt_class: !name:torch.optim.AdamW lr: !ref <lr_whisper> weight_decay: 0.01 @@ -167,6 +154,8 @@ lr_annealing_whisper: !new:speechbrain.nnet.schedulers.NewBobScheduler annealing_factor: 0.9 patient: 0 +############################## Logging and Pretrainer ########################## + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref <save_folder> recoverables: diff --git a/recipes/LibriSpeech/ASR/transformer/hparams/transformer.yaml b/recipes/LibriSpeech/ASR/transformer/hparams/transformer.yaml index 36626f0d9e3c958ac7f25afbfe69a66e3a42f143..173453e9d4b992e109f5314261aeb129e14ddc79 100644 --- a/recipes/LibriSpeech/ASR/transformer/hparams/transformer.yaml +++ b/recipes/LibriSpeech/ASR/transformer/hparams/transformer.yaml @@ -42,7 +42,8 @@ test_csv: ckpt_interval_minutes: 30 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### + # To make Transformers converge, the global bath size should be large enough. # The global batch size is computed as batch_size * n_gpus * grad_accumulation_factor.
# Empirically, we found that this value should be >= 128. @@ -116,7 +117,8 @@ test_dataloader_opts: padding_kwargs: value: !ref <pad_index> -####################### Model parameters ########################### +####################### Model Parameters ####################################### + # Transformer d_model: 512 nhead: 4 @@ -142,7 +144,7 @@ test_beam_size: 66 lm_weight: 0.60 ctc_weight_decode: 0.40 -############################## models ################################ +############################## Models ########################################## CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd input_shape: (8, 10, 80) @@ -210,7 +212,8 @@ Adam: !name:torch.optim.Adam eps: 0.000000001 -# Scorer +####################### Decoding & optimiser ################################### + ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer eos_index: !ref <eos_index> blank_index: !ref <blank_index> @@ -284,57 +287,34 @@ normalize: !new:speechbrain.processing.features.InputNormalization norm_type: global update_until_epoch: 4 -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 4 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 4 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "mean" # Method of dropping chunks - time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 - -# Frequency Drop -freq_drop_length_low: 10 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 20 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 4 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 4 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "mean" # Method of dropping chunks + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" +# Freq Drop freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 10 + drop_length_high: 20 + drop_count_low: 4 + drop_count_high: 4 + replace: "mean" dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -348,6 +328,8 @@ 
compute_features: !new:speechbrain.lobes.features.Fbank n_fft: !ref <n_fft> n_mels: !ref <n_mels> +############################## Logging and Pretrainer ########################## + train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref <train_log> diff --git a/recipes/LibriSpeech/ASR/transformer/train.py b/recipes/LibriSpeech/ASR/transformer/train.py index b69763e26fc4b21809c90d3c2c4f46039f75d8f4..292d7cc4249f6108f0cfba6f732073260ab80d9c 100644 --- a/recipes/LibriSpeech/ASR/transformer/train.py +++ b/recipes/LibriSpeech/ASR/transformer/train.py @@ -114,16 +114,16 @@ class ASR(sb.core.Brain): tokens, tokens_lens = batch.tokens if stage == sb.Stage.TRAIN: + # Labels must be extended if parallel augmentation or concatenated + # augmentation was performed on the input (increasing the batch dimension) if hasattr(self.hparams, "fea_augment"): - tokens = self.hparams.fea_augment.replicate_labels(tokens) - tokens_lens = self.hparams.fea_augment.replicate_labels( - tokens_lens - ) - tokens_eos = self.hparams.fea_augment.replicate_labels( - tokens_eos - ) - tokens_eos_lens = self.hparams.fea_augment.replicate_labels( - tokens_eos_lens + ( + tokens, + tokens_lens, + tokens_eos, + tokens_eos_lens, + ) = self.hparams.fea_augment.replicate_multiple_labels( + tokens, tokens_lens, tokens_eos, tokens_eos_lens ) loss_seq = self.hparams.seq_cost( diff --git a/recipes/LibriSpeech/G2P/hparams/hparams_g2p_rnn.yaml b/recipes/LibriSpeech/G2P/hparams/hparams_g2p_rnn.yaml index a565905c84d8c912f93657ba4515374b087fad3e..f487ffbe1ce7aa22c06cfd7fd11e4c6568371a68 100644 --- a/recipes/LibriSpeech/G2P/hparams/hparams_g2p_rnn.yaml +++ b/recipes/LibriSpeech/G2P/hparams/hparams_g2p_rnn.yaml @@ -95,7 +95,7 @@ homograph_loss_weight: 2.0 lr: 0.002 save_for_pretrained: True -# Model parameters +####################### Model Parameters ####################################### output_neurons: !apply:speechbrain.utils.hparams.choice value: !ref <phn_tokenize> choices: diff --git a/recipes/LibriSpeech/G2P/hparams/hparams_g2p_transformer.yaml b/recipes/LibriSpeech/G2P/hparams/hparams_g2p_transformer.yaml index c75cd97bfb2cc5bdbfd847fe6c1a185a3d4ca033..e1c0f44c79ad48982beb34c64f81cff2ae326deb 100644 --- a/recipes/LibriSpeech/G2P/hparams/hparams_g2p_transformer.yaml +++ b/recipes/LibriSpeech/G2P/hparams/hparams_g2p_transformer.yaml @@ -95,7 +95,7 @@ lr_dont_halve_until_epoch: 1 lr_patience: 1 save_for_pretrained: True -# Model parameters +####################### Model Parameters ####################################### output_neurons: !apply:speechbrain.utils.hparams.choice value: !ref <phn_tokenize> choices: diff --git a/recipes/LibriSpeech/G2P/hparams/hparams_lm_rnn.yaml b/recipes/LibriSpeech/G2P/hparams/hparams_lm_rnn.yaml index dcb76825915b84f5b2f253bd4f40e37b481b617a..7e1b7bc4af0d06d9258a2c5b85be9d7074e3ce38 100644 --- a/recipes/LibriSpeech/G2P/hparams/hparams_lm_rnn.yaml +++ b/recipes/LibriSpeech/G2P/hparams/hparams_lm_rnn.yaml @@ -50,7 +50,7 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger tokenizer_file: <output_folder>/save/phoneme_tokenizer.model -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 20 batch_size: 80 lr: 0.001 @@ -68,7 +68,7 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -# Model parameters +####################### Model Parameters ####################################### model_dim: !apply:speechbrain.utils.hparams.choice value: !ref <phn_tokenize> choices: diff --git
a/recipes/LibriSpeech/G2P/hparams/hparams_lm_transformer.yaml b/recipes/LibriSpeech/G2P/hparams/hparams_lm_transformer.yaml index 2a9a434d68bc2c6db3e299cdc3f37d3d7733f416..5e319e3d861df3c7a832f6219aaf142d35832a80 100644 --- a/recipes/LibriSpeech/G2P/hparams/hparams_lm_transformer.yaml +++ b/recipes/LibriSpeech/G2P/hparams/hparams_lm_transformer.yaml @@ -39,7 +39,7 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger # Tokenizer model (you must use the same tokenizer for LM and ASR training) tokenizer_file: <output_folder>/save/phoneme_tokenizer.model -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 20 batch_size: 80 lr: 0.001 @@ -57,7 +57,7 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -# Model parameters +####################### Model Parameters ####################################### emb_dim: 256 # dimension of the embeddings transformer_num_heads: 4 diff --git a/recipes/LibriSpeech/LM/hparams/RNNLM.yaml b/recipes/LibriSpeech/LM/hparams/RNNLM.yaml index b061b4fccad7572ba5ddc125e2586b527c7733e8..0896de96032620c15d8b0e0cf19960aed1b953c8 100644 --- a/recipes/LibriSpeech/LM/hparams/RNNLM.yaml +++ b/recipes/LibriSpeech/LM/hparams/RNNLM.yaml @@ -29,7 +29,7 @@ test_transcripts_pattern: "test*/**/*.trans.txt" # Tokenizer model tokenizer_file: https://www.dropbox.com/s/o7gnouwdoqchotj/1000_unigram.model?dl=1 -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 20 batch_size: 80 lr: 0.001 @@ -47,7 +47,7 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -# Model parameters +####################### Model Parameters ####################################### emb_size: 128 activation: !name:torch.nn.LeakyReLU dropout: 0.0 diff --git a/recipes/LibriSpeech/LM/hparams/transformer.yaml b/recipes/LibriSpeech/LM/hparams/transformer.yaml index c79ef576963eb3550fbfdbe56843fcc0302c6e5f..50123a4c3cdbfabbc53f006aa6b132fa3392e9e0 100644 --- a/recipes/LibriSpeech/LM/hparams/transformer.yaml +++ b/recipes/LibriSpeech/LM/hparams/transformer.yaml @@ -29,7 +29,7 @@ test_transcripts_pattern: "test*/**/*.trans.txt" # Tokenizer model tokenizer_file: speechbrain/asr-transformer-transformerlm-librispeech/tokenizer.ckpt -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 20 batch_size: 16 lr: 10 diff --git a/recipes/LibriSpeech/Tokenizer/hparams/1K_unigram_subword_bpe.yaml b/recipes/LibriSpeech/Tokenizer/hparams/1K_unigram_subword_bpe.yaml index b5a9fa60e66b29ad6104311a54917ab0b7ad0421..9dda21f82781ccbcd5bda6fa686a8c5f93eb2cfc 100644 --- a/recipes/LibriSpeech/Tokenizer/hparams/1K_unigram_subword_bpe.yaml +++ b/recipes/LibriSpeech/Tokenizer/hparams/1K_unigram_subword_bpe.yaml @@ -16,7 +16,7 @@ skip_prep: False train_csv: !ref <output_folder>/train.csv valid_csv: !ref <output_folder>/dev-clean.csv -# Training parameters +####################### Training Parameters #################################### token_type: unigram # ["unigram", "bpe", "char"] token_output: 1000 # index(blank/eos/bos/unk) = 0 character_coverage: 1.0 diff --git a/recipes/LibriSpeech/Tokenizer/hparams/5K_unigram_subword_bpe.yaml b/recipes/LibriSpeech/Tokenizer/hparams/5K_unigram_subword_bpe.yaml index c312ce5bbc3c8bc2843966c77d7fc5e97e0a5831..1f328c6f1682dcf2f25b34632b96ecdb96e5b45a 100644 --- a/recipes/LibriSpeech/Tokenizer/hparams/5K_unigram_subword_bpe.yaml +++ 
b/recipes/LibriSpeech/Tokenizer/hparams/5K_unigram_subword_bpe.yaml @@ -16,7 +16,7 @@ skip_prep: False train_csv: !ref <output_folder>/train.csv valid_csv: !ref <output_folder>/dev-clean.csv -# Training parameters +####################### Training Parameters #################################### token_type: unigram # ["unigram", "bpe", "char"] token_output: 5000 # index(blank/eos/bos/unk) = 0 character_coverage: 1.0 diff --git a/recipes/LibriSpeech/self-supervised-learning/wav2vec2/hparams/wav2vec2_base.yaml b/recipes/LibriSpeech/self-supervised-learning/wav2vec2/hparams/wav2vec2_base.yaml index 4806f3e064abaf88de5dc1879f898f3e63b6af77..13ce0d2203fd62dfc84afa29a9ddd7c04cfd66d3 100644 --- a/recipes/LibriSpeech/self-supervised-learning/wav2vec2/hparams/wav2vec2_base.yaml +++ b/recipes/LibriSpeech/self-supervised-learning/wav2vec2/hparams/wav2vec2_base.yaml @@ -48,7 +48,7 @@ test_dataloader_options: batch_size: 8 # DynamicBatching not used at testing time num_workers: 4 -# Training parameters +####################### Training Parameters #################################### lr: 0.0005 warmup: 30000 # This is equivalent to optimizer_step_limit - warmup @@ -63,7 +63,7 @@ mask_prob: 0.65 mask_length: 10 num_negatives: 100 -# Model parameters +####################### Model Parameters ####################################### embedding_dim: 768 extractor_dim: 512 final_dim: 256 diff --git a/recipes/MEDIA/ASR/CTC/hparams/train_hf_wav2vec.yaml b/recipes/MEDIA/ASR/CTC/hparams/train_hf_wav2vec.yaml index 924579a9c3f122db9fea7d150c6e5ef13c897c2d..70ef38de7090847faa4b5ead4d5bf721ccd399ee 100644 --- a/recipes/MEDIA/ASR/CTC/hparams/train_hf_wav2vec.yaml +++ b/recipes/MEDIA/ASR/CTC/hparams/train_hf_wav2vec.yaml @@ -55,7 +55,7 @@ test_dataloader_options: sample_rate: 16000 feats_dim: 1024 -# Training parameters: +####################### Training Parameters #################################### number_of_epochs: 30 lr: 1 lr_wav2vec: 0.0001 @@ -67,7 +67,7 @@ patient: 0 patient_wav2vec: 0 sorting: ascending -# Model parameters: +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dnn_blocks: 3 dnn_neurons: 512 diff --git a/recipes/MEDIA/SLU/CTC/hparams/train_hf_wav2vec_full.yaml b/recipes/MEDIA/SLU/CTC/hparams/train_hf_wav2vec_full.yaml index 7be6f6b94757115d810c6cc07f7c9907ca0e15ed..4f9bad2e7ebead66974cd6f164c3a92f2b6504b9 100644 --- a/recipes/MEDIA/SLU/CTC/hparams/train_hf_wav2vec_full.yaml +++ b/recipes/MEDIA/SLU/CTC/hparams/train_hf_wav2vec_full.yaml @@ -57,7 +57,7 @@ test_dataloader_options: sample_rate: 16000 feats_dim: 1024 -# Training parameters: +####################### Training Parameters #################################### number_of_epochs: 30 lr: 1 lr_wav2vec: 0.0001 @@ -69,7 +69,7 @@ patient: 0 patient_wav2vec: 0 sorting: ascending -# Model parameters: +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dnn_blocks: 3 dnn_neurons: 512 diff --git a/recipes/MEDIA/SLU/CTC/hparams/train_hf_wav2vec_relax.yaml b/recipes/MEDIA/SLU/CTC/hparams/train_hf_wav2vec_relax.yaml index d631a6da8add16c74ffb98b8b5a20b95075173c9..8631e6e885f0e31dfb9448b51a86acf7b40d918a 100644 --- a/recipes/MEDIA/SLU/CTC/hparams/train_hf_wav2vec_relax.yaml +++ b/recipes/MEDIA/SLU/CTC/hparams/train_hf_wav2vec_relax.yaml @@ -57,7 +57,7 @@ test_dataloader_options: sample_rate: 16000 feats_dim: 1024 -# Training parameters: +####################### Training Parameters ####################################
number_of_epochs: 30 lr: 1 lr_wav2vec: 0.0001 @@ -69,7 +69,7 @@ patient: 0 patient_wav2vec: 0 sorting: ascending -# Model parameters: +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dnn_blocks: 3 dnn_neurons: 512 diff --git a/recipes/MultiWOZ/response_generation/gpt/hparams/train_gpt.yaml b/recipes/MultiWOZ/response_generation/gpt/hparams/train_gpt.yaml index 05b16bea3f4af7afee0afd4aafe1d375f61c74f0..5bb3b8ed8fa30c797cd1ad1a3f62bdaac535a0a6 100644 --- a/recipes/MultiWOZ/response_generation/gpt/hparams/train_gpt.yaml +++ b/recipes/MultiWOZ/response_generation/gpt/hparams/train_gpt.yaml @@ -58,7 +58,7 @@ max_history: 5 ignore_index: -100 label_smoothing: 0 -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 4 batch_size: 8 test_batch_size: 4 diff --git a/recipes/MultiWOZ/response_generation/llama2/hparams/train_llama2.yaml b/recipes/MultiWOZ/response_generation/llama2/hparams/train_llama2.yaml index f7fd2b087b2b66ade39816c3245e7b67eb62fcc2..507115e832948e06abc8a8501d66606df252525b 100644 --- a/recipes/MultiWOZ/response_generation/llama2/hparams/train_llama2.yaml +++ b/recipes/MultiWOZ/response_generation/llama2/hparams/train_llama2.yaml @@ -40,7 +40,7 @@ max_history: 2 ignore_index: -100 label_smoothing: 0 -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 4 batch_size: 1 test_batch_size: 1 diff --git a/recipes/REAL-M/sisnr-estimation/hparams/pool_sisnrestimator.yaml b/recipes/REAL-M/sisnr-estimation/hparams/pool_sisnrestimator.yaml index 3f1da09190314e5c8a956eba60f2c94d1be75833..c23c11c53524db10e8e8110202af01e00cfa611e 100644 --- a/recipes/REAL-M/sisnr-estimation/hparams/pool_sisnrestimator.yaml +++ b/recipes/REAL-M/sisnr-estimation/hparams/pool_sisnrestimator.yaml @@ -67,7 +67,7 @@ num_spks: 2 noprogressbar: False sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.0001 diff --git a/recipes/RescueSpeech/ASR/noise-robust/hparams/robust_asr_16k.yaml b/recipes/RescueSpeech/ASR/noise-robust/hparams/robust_asr_16k.yaml index 302dabe57530db6df8f33ec147450e4f75b10fef..10e8e58e4a05a72f94d4c2f9d63aada185b3e236 100644 --- a/recipes/RescueSpeech/ASR/noise-robust/hparams/robust_asr_16k.yaml +++ b/recipes/RescueSpeech/ASR/noise-robust/hparams/robust_asr_16k.yaml @@ -41,7 +41,7 @@ skip_prep: False # longer sentences certainly correspond to "open microphones". 
avoid_if_longer_than: 10.0 -## Model parameters- Enhance model +## Model Parameters - Enhance model dereverberate: False save_audio: True sample_rate: 16000 @@ -54,7 +54,7 @@ use_rand_shift: False min_shift: -8000 max_shift: 8000 -## Training parameters- ASR +####################### Training Parameters - ASR ############################## number_of_epochs: 10 lr_whisper: 0.00003 sorting: ascending diff --git a/recipes/SLURP/NLU/hparams/train.yaml b/recipes/SLURP/NLU/hparams/train.yaml index e2201d96b05c4e94846555b39d65c4b4a1f6c517..7d88d62a9baa3b307c7a9c1c497c67f606146521 100644 --- a/recipes/SLURP/NLU/hparams/train.yaml +++ b/recipes/SLURP/NLU/hparams/train.yaml @@ -28,14 +28,14 @@ asr_tokenizer_file: https://www.dropbox.com/s/o7gnouwdoqchotj/1000_unigram.model slu_tokenizer_file: https://www.dropbox.com/s/tmwq12r5vgcsif9/58_unigram.model?dl=1 skip_prep: False -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 20 batch_size: 16 lr: 0.0003 # token_type: unigram # ["unigram", "bpe", "char"] sorting: random -# Model parameters +####################### Model Parameters ####################################### # sample_rate: 1600 emb_size: 128 dec_neurons: 512 diff --git a/recipes/SLURP/Tokenizer/hparams/tokenizer_bpe58.yaml b/recipes/SLURP/Tokenizer/hparams/tokenizer_bpe58.yaml index 51f805b078449a7594c1a205be73af6c6778e147..bf935024a739e665dad65a5260a57f393f36aa48 100644 --- a/recipes/SLURP/Tokenizer/hparams/tokenizer_bpe58.yaml +++ b/recipes/SLURP/Tokenizer/hparams/tokenizer_bpe58.yaml @@ -14,7 +14,7 @@ train_csv: !ref <output_folder>/train-type=direct.csv valid_csv: !ref <output_folder>/devel-type=direct.csv skip_prep: False -# Training parameters +####################### Training Parameters #################################### token_type: unigram # ["unigram", "bpe", "char"] token_output: 58 # index(blank/eos/bos/unk) = 0 character_coverage: 1.0 diff --git a/recipes/SLURP/direct/hparams/train.yaml b/recipes/SLURP/direct/hparams/train.yaml index 5a42c738c2c823c8d9eac5d6cd1c1c59abb6950f..038d2e59ea32bf9322087d0df6c4fe4cb3380878 100644 --- a/recipes/SLURP/direct/hparams/train.yaml +++ b/recipes/SLURP/direct/hparams/train.yaml @@ -34,7 +34,7 @@ rir_annotation: !ref <save_folder>/rir.csv tokenizer_file: https://www.dropbox.com/s/tmwq12r5vgcsif9/58_unigram.model?dl=1 skip_prep: False -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 20 batch_size: 16 lr: 0.0003 @@ -42,7 +42,7 @@ lr: 0.0003 sorting: random ckpt_interval_minutes: 15 # save checkpoint every N min -# Model parameters +####################### Model Parameters ####################################### sample_rate: 16000 emb_size: 128 dec_neurons: 512 @@ -100,41 +100,31 @@ add_noise: !new:speechbrain.augment.time_domain.AddNoise clean_sample_rate: !ref <sample_rate> num_workers: !ref <num_workers> +############################## Augmentations ################################### + # Speed perturbation -speed_changes: [90, 95, 105, 110] # List of speed changes for time-stretching speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [90, 95, 105, 110] # Frequency drop: randomly drops a number of frequency bands to zero.
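# A hedged sketch of the time-drop module defined a few lines below (assuming
# the speechbrain.augment API used throughout this diff). DropChunk also
# expects relative utterance lengths, so padded regions are not selected:
#
#   import torch
#   from speechbrain.augment.time_domain import DropChunk
#
#   dropper = DropChunk(drop_length_low=1000, drop_length_high=2000,
#                       drop_count_low=1, drop_count_high=3)
#   wavs = torch.rand(4, 16000)                  # [batch, time]
#   lens = torch.tensor([1.0, 0.8, 0.5, 1.0])    # relative lengths
#   dropped = dropper(wavs, lens)                # 1-3 chunks zeroed per utterance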
-drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 3 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 3 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 shuffle_augmentations: True min_augmentations: 1 max_augmentations: 4 @@ -146,7 +136,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] -# Models +############################## Models ########################################## + asr_model_source: speechbrain/asr-crdnn-rnnlm-librispeech slu_enc: !new:speechbrain.nnet.containers.Sequential diff --git a/recipes/SLURP/direct/hparams/train_with_wav2vec2.yaml b/recipes/SLURP/direct/hparams/train_with_wav2vec2.yaml index b383da5cbededb020a662caf4784ad7663761eaa..84222db5f430c3c4df1e5446279b48cd5ceb3eb5 100644 --- a/recipes/SLURP/direct/hparams/train_with_wav2vec2.yaml +++ b/recipes/SLURP/direct/hparams/train_with_wav2vec2.yaml @@ -32,7 +32,7 @@ skip_prep: False # URL for the wav2vec2 model, you can change to benchmark different models wav2vec2_hub: "facebook/hubert-base-ls960" -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 35 batch_size: 6 lr: 0.0003 @@ -47,7 +47,7 @@ freeze_wav2vec2: False #set to true to freeze the CONV part of the wav2vec2 model freeze_wav2vec2_conv: True -# Model parameters +####################### Model Parameters ####################################### sample_rate: 16000 emb_size: 128 dec_neurons: 512 @@ -96,45 +96,31 @@ seq_lin: !new:speechbrain.nnet.linear.Linear input_size: !ref <dec_neurons> n_neurons: !ref <output_neurons> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero.
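# The wav_augment block below keeps concat_original: True, so the clean batch
# is concatenated with its augmented copy and the effective batch size grows.
# A sketch of that behavior (assuming the Augmenter API in this diff; names
# and shapes are illustrative):
#
#   import torch
#   from speechbrain.augment.augmenter import Augmenter
#   from speechbrain.augment.time_domain import DropFreq
#
#   aug = Augmenter(concat_original=True, min_augmentations=1,
#                   max_augmentations=1, augmentations=[DropFreq()])
#   wavs, lens = aug(torch.rand(2, 16000), lengths=torch.ones(2))
#   assert wavs.shape[0] == 4   # originals followed by augmented copies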
-drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 diff --git a/recipes/Switchboard/ASR/CTC/hparams/train_with_wav2vec.yaml b/recipes/Switchboard/ASR/CTC/hparams/train_with_wav2vec.yaml index 2933e0fd4878dc17ec3dde112c13f73d067d65b6..7741680bdbb25e81dd9c1f0216e8d275d9af44aa 100644 --- a/recipes/Switchboard/ASR/CTC/hparams/train_with_wav2vec.yaml +++ b/recipes/Switchboard/ASR/CTC/hparams/train_with_wav2vec.yaml @@ -49,7 +49,7 @@ test_csv: - !ref <output_folder>/test_callhome.csv - !ref <output_folder>/test.csv -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 30 lr: 1.0 lr_wav2vec: 0.0001 @@ -74,7 +74,7 @@ test_dataloader_options: token_type: unigram # ["unigram", "bpe", "char"] character_coverage: 1.0 -# Model parameters +####################### Model Parameters ####################################### wav2vec_output_dim: 1024 dnn_neurons: 1024 freeze_wav2vec: False @@ -109,45 +109,31 @@ kenlm_model_path: null epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. 
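# The speeds list is in percent; judging from the SpeedPerturb resampler, a
# sampled speed scales the waveform length by roughly speed/100, so outputs
# here vary by about +/- 5 % in duration. A minimal sketch (assumptions as
# above):
#
#   import torch
#   from speechbrain.augment.time_domain import SpeedPerturb
#
#   perturb = SpeedPerturb(orig_freq=16000, speeds=[95, 100, 105])
#   wavs = torch.rand(2, 16000)   # [batch, time]
#   out = perturb(wavs)           # out.shape[1] ~= 16000 * speed / 100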
-drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -156,6 +142,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + enc: !new:speechbrain.nnet.containers.Sequential input_shape: [null, null, !ref <wav2vec_output_dim>] linear1: !name:speechbrain.nnet.linear.Linear diff --git a/recipes/Switchboard/ASR/seq2seq/hparams/train_BPE_2000.yaml b/recipes/Switchboard/ASR/seq2seq/hparams/train_BPE_2000.yaml index d20001f1cf5b550c1eb6638c86198b2fb06f9394..743467bcf2ffd6b0ceacb7fc873259b8ddb8f2c1 100644 --- a/recipes/Switchboard/ASR/seq2seq/hparams/train_BPE_2000.yaml +++ b/recipes/Switchboard/ASR/seq2seq/hparams/train_BPE_2000.yaml @@ -57,7 +57,7 @@ test_csv: - !ref <save_folder>/test_callhome.csv - !ref <save_folder>/test.csv -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 20 number_of_ctc_epochs: 5 batch_size: 10 @@ -103,7 +103,7 @@ test_dataloader_opts: num_workers: !ref <num_workers> batch_size: !ref <batch_size> -# Model parameters +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dropout: 0.15 cnn_blocks: 2 @@ -144,57 +144,40 @@ prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_U ext: wav csv_file: !ref <noise_annotation> -# Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation +############################## Augmentations ################################### +# Add noise to input signal add_noise: !new:speechbrain.augment.time_domain.AddNoise csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> + snr_low: 0 + snr_high: 15 noise_sample_rate: !ref <sample_rate> clean_sample_rate: !ref <sample_rate> num_workers: !ref 
<num_workers> # Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 @@ -215,6 +198,8 @@ compute_features: !new:speechbrain.lobes.features.Fbank n_fft: !ref <n_fft> n_mels: !ref <n_mels> +############################## Models ########################################## + enc: !new:speechbrain.lobes.models.CRDNN.CRDNN input_shape: [null, null, !ref <n_mels>] activation: !ref <activation> diff --git a/recipes/Switchboard/ASR/seq2seq/train.py b/recipes/Switchboard/ASR/seq2seq/train.py index 57ce5c9667156577040358760c65794d58f968f2..d0cd3ce91e3248345c76a7f662fe010ad1d92e02 100644 --- a/recipes/Switchboard/ASR/seq2seq/train.py +++ b/recipes/Switchboard/ASR/seq2seq/train.py @@ -127,12 +127,16 @@ class ASR(sb.Brain): tokens_eos, tokens_eos_lens = batch.tokens_eos tokens, tokens_lens = batch.tokens + # Labels must be extended if parallel augmentation or concatenated + # augmentation was performed on the input (increasing the batch dimension) if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): - tokens = self.hparams.wav_augment.replicate_labels(tokens) - tokens_lens = self.hparams.wav_augment.replicate_labels(tokens_lens) - tokens_eos = self.hparams.wav_augment.replicate_labels(tokens_eos) - tokens_eos_lens = self.hparams.wav_augment.replicate_labels( - tokens_eos_lens + ( + tokens, + tokens_lens, + tokens_eos, + tokens_eos_lens, + ) = self.hparams.wav_augment.replicate_multiple_labels( + tokens, tokens_lens, tokens_eos, tokens_eos_lens ) loss_seq = self.hparams.seq_cost( diff --git a/recipes/Switchboard/ASR/transformer/hparams/transformer.yaml
b/recipes/Switchboard/ASR/transformer/hparams/transformer.yaml index bd84a8a19ec109f5c4f331a53fb2aecbf3dee43c..674c037199f6d654512df99d8c32f08aa82cf977 100644 --- a/recipes/Switchboard/ASR/transformer/hparams/transformer.yaml +++ b/recipes/Switchboard/ASR/transformer/hparams/transformer.yaml @@ -51,7 +51,7 @@ test_csv: ckpt_interval_minutes: 30 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### # To make Transformers converge, the global batch size should be large enough. # The global batch size is computed as: # batch_size * n_gpus * grad_accumulation_factor. @@ -96,7 +96,7 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -####################### Model parameters ########################### +####################### Model Parameters ########################### # Transformer transformer_input_size: 1280 d_model: 256 @@ -271,50 +271,32 @@ normalize: !new:speechbrain.processing.features.InputNormalization norm_type: global update_until_epoch: 4 -# Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 5 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 5 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "zeros" # Method of dropping chunks +############################## Augmentations ################################### +# Speed perturbation +speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb + orig_freq: !ref <sample_rate> + speeds: [95, 100, 105] + +# Time Drop time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 5 + drop_count_high: 5 # Frequency Drop -freq_drop_length_low: 25 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 35 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "zeros" # Method of dropping chunks - freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref <freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 25 + drop_length_high: 35 + drop_count_low: 2 + drop_count_high: 2 dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -323,14 +305,7 @@ fea_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <freq_drop>, !ref <time_warp>] - -# Speed perturbation do_speed_perturb: True -speed_changes: [95, 100, 105] 
# List of speed changes for time-stretching - -speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb - orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> compute_features: !new:speechbrain.lobes.features.Fbank sample_rate: !ref <sample_rate> diff --git a/recipes/Switchboard/ASR/transformer/hparams/transformer_finetuned_LM.yaml b/recipes/Switchboard/ASR/transformer/hparams/transformer_finetuned_LM.yaml index 45a765cd1238f0aa2c882dfb46731b3b3c4e3adb..8dd221ca407d4e68e34479cbce9e87f49e6ca542 100644 --- a/recipes/Switchboard/ASR/transformer/hparams/transformer_finetuned_LM.yaml +++ b/recipes/Switchboard/ASR/transformer/hparams/transformer_finetuned_LM.yaml @@ -51,7 +51,7 @@ test_csv: ckpt_interval_minutes: 30 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### # To make Transformers converge, the global batch size should be large enough. # The global batch size is computed as: # batch_size * n_gpus * grad_accumulation_factor. @@ -96,7 +96,7 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -####################### Model parameters ########################### +####################### Model Parameters ########################### # Transformer d_model: 512 nhead: 4 @@ -126,7 +126,7 @@ lm_weight: 0.60 ctc_weight_decode: 0.40 temperature: 1.15 temperature_lm: 1.15 -############################## models ################################ +############################## Models ################################ CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd input_shape: (8, 10, 80) @@ -258,57 +258,32 @@ normalize: !new:speechbrain.processing.features.InputNormalization norm_type: global update_until_epoch: 4 -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Time Drop -time_drop_length_low: 15 # Min length for temporal chunk to drop in spectrogram -time_drop_length_high: 25 # Max length for temporal chunk to drop in spectrogram -time_drop_count_low: 5 # Min number of chunks to drop in time in the spectrogram -time_drop_count_high: 5 # Max number of chunks to drop in time in the spectrogram -time_drop_replace: "zeros" # Method of dropping chunks - time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <time_drop_length_low> - drop_length_high: !ref <time_drop_length_high> - drop_count_low: !ref <time_drop_count_low> - drop_count_high: !ref <time_drop_count_high> - replace: !ref <time_drop_replace> - dim: 1 + drop_length_low: 15 + drop_length_high: 25 + drop_count_low: 5 + drop_count_high: 5 # Frequency Drop -freq_drop_length_low: 25 # Min length for chunks to drop in frequency in the spectrogram -freq_drop_length_high: 35 # Max length for chunks to drop in frequency in the spectrogram -freq_drop_count_low: 2 # Min number of chunks to drop in frequency in the spectrogram -freq_drop_count_high: 2 # Max number of chunks to drop in frequency in the spectrogram -freq_drop_replace: "zeros" # Method of dropping chunks - freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop - drop_length_low: !ref <freq_drop_length_low> - drop_length_high: !ref <freq_drop_length_high> - drop_count_low: !ref <freq_drop_count_low> - drop_count_high: !ref 
<freq_drop_count_high> - replace: !ref <freq_drop_replace> + drop_length_low: 25 + drop_length_high: 35 + drop_count_low: 2 + drop_count_high: 2 dim: 2 # Time warp -time_warp_window: 5 # Length of time warping window -time_warp_mode: "bicubic" # Time warping method - time_warp: !new:speechbrain.augment.freq_domain.Warping - warp_window: !ref <time_warp_window> - warp_mode: !ref <time_warp_mode> - dim: 1 fea_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 diff --git a/recipes/Switchboard/ASR/transformer/train.py b/recipes/Switchboard/ASR/transformer/train.py index 5fb6ebc47dbe64b3ddadd0ee7506282aa10138e6..dcc5279523a545aaa30d958d18b5962e85e03460 100644 --- a/recipes/Switchboard/ASR/transformer/train.py +++ b/recipes/Switchboard/ASR/transformer/train.py @@ -133,16 +133,16 @@ class ASR(sb.core.Brain): tokens, tokens_lens = batch.tokens if stage == sb.Stage.TRAIN: + # Labels must be extended if parallel augmentation or concatenated + # augmentation was performed on the input (increasing the batch dimension) if hasattr(self.hparams, "fea_augment"): - tokens = self.hparams.fea_augment.replicate_labels(tokens) - tokens_lens = self.hparams.fea_augment.replicate_labels( - tokens_lens - ) - tokens_eos = self.hparams.fea_augment.replicate_labels( - tokens_eos - ) - tokens_eos_lens = self.hparams.fea_augment.replicate_labels( - tokens_eos_lens + ( + tokens, + tokens_lens, + tokens_eos, + tokens_eos_lens, + ) = self.hparams.fea_augment.replicate_multiple_labels( + tokens, tokens_lens, tokens_eos, tokens_eos_lens ) loss_seq = self.hparams.seq_cost( diff --git a/recipes/Switchboard/LM/hparams/transformer.yaml b/recipes/Switchboard/LM/hparams/transformer.yaml index 2f27463afa4214e0c31893e57f595e5bab1fbdf8..b501faf55cf7d8830f06abafa2145b02a9d6a465 100644 --- a/recipes/Switchboard/LM/hparams/transformer.yaml +++ b/recipes/Switchboard/LM/hparams/transformer.yaml @@ -36,7 +36,7 @@ test_csv: !ref <save_folder>/test.csv # (e.g. /path/to/2000_unigram.model) tokenizer_file: !PLACEHOLDER -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 100 batch_size: 164 lr: 1 diff --git a/recipes/Switchboard/LM/hparams/transformer_finetune.yaml b/recipes/Switchboard/LM/hparams/transformer_finetune.yaml index f5657c76c2625fe76fac251d9b2e94086b4db23e..5b0860e41a432252b513ccb23afe81508d42ad23 100644 --- a/recipes/Switchboard/LM/hparams/transformer_finetune.yaml +++ b/recipes/Switchboard/LM/hparams/transformer_finetune.yaml @@ -39,7 +39,7 @@ test_csv: !ref <save_folder>/test.csv # instead. E.g. if you want to use your own LM / tokenizer.
pretrained_lm_tokenizer_path: speechbrain/asr-transformer-transformerlm-librispeech -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 5 batch_size: 128 lr: 2 diff --git a/recipes/Switchboard/Tokenizer/hparams/2K_unigram_subword_bpe.yaml b/recipes/Switchboard/Tokenizer/hparams/2K_unigram_subword_bpe.yaml index e6c546bdf9deec07bffa0c0546401f1990c8b33a..d07d83e707b0ae05a49e3c2813a79a1a0baec88e 100644 --- a/recipes/Switchboard/Tokenizer/hparams/2K_unigram_subword_bpe.yaml +++ b/recipes/Switchboard/Tokenizer/hparams/2K_unigram_subword_bpe.yaml @@ -20,7 +20,7 @@ train_csv: !ref <output_folder>/train_lm.csv valid_csv: !ref <output_folder>/dev.csv skip_prep: False -# Training parameters +####################### Training Parameters #################################### token_type: unigram # ["unigram", "bpe", "char"] token_output: 2000 # index(blank/eos/bos/unk) = 0 character_coverage: 1.0 diff --git a/recipes/TIMIT/ASR/CTC/hparams/train.yaml b/recipes/TIMIT/ASR/CTC/hparams/train.yaml index dce350b7e2f2edc7621b7f223c0e83b70a41a301..145fa1a3e362af75bd80e3d435bf0c4ce0ac0d7d 100644 --- a/recipes/TIMIT/ASR/CTC/hparams/train.yaml +++ b/recipes/TIMIT/ASR/CTC/hparams/train.yaml @@ -25,7 +25,7 @@ data_folder_noise: !ref <data_folder>/noise # The noisy sequences for data augmentation NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1 noise_annotation: !ref <save_folder>/noise.csv # The data manifest files are created by the data preparation script -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 50 batch_size: 8 lr: 1.0 @@ -36,7 +36,7 @@ sample_rate: 16000 n_fft: 400 n_mels: 40 -# Model parameters +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dropout: 0.15 cnn_blocks: 2 @@ -69,6 +69,8 @@ test_dataloader_opts: normalize: !new:speechbrain.processing.features.InputNormalization norm_type: global +############################## Augmentations ################################### + # Download and prepare the dataset of noisy sequences for augmentation prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL URL: !ref <NOISE_DATASET_URL> @@ -76,58 +78,38 @@ prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_U ext: wav csv_file: !ref <noise_annotation> - # Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation - add_noise: !new:speechbrain.augment.time_domain.AddNoise csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> + snr_low: 0 + snr_high: 15 noise_sample_rate: !ref <sample_rate> clean_sample_rate: !ref <sample_rate> num_workers: !ref <num_workers> # Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero.
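# A note on the trimmed Augmenter blocks throughout this PR: the deleted keys
# (parallel_augment: False, repeat_augment: 1, shuffle_augmentations: False
# and, where removed, concat_original: False) appear to coincide with the
# Augmenter constructor defaults, so the shorter configs should be
# behavior-preserving. Sketch of the intended equivalence (defaults assumed,
# not verified against every SpeechBrain version):
#
#   from speechbrain.augment.augmenter import Augmenter
#   from speechbrain.augment.time_domain import DropFreq
#
#   explicit = Augmenter(parallel_augment=False, repeat_augment=1,
#                        shuffle_augmentations=False, concat_original=True,
#                        min_augmentations=4, max_augmentations=4,
#                        augment_prob=1.0, augmentations=[DropFreq()])
#   trimmed = Augmenter(concat_original=True, min_augmentations=4,
#                       max_augmentations=4, augment_prob=1.0,
#                       augmentations=[DropFreq()])   # expected: same behavior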
-drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 @@ -137,6 +119,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> diff --git a/recipes/TIMIT/ASR/seq2seq/hparams/train.yaml b/recipes/TIMIT/ASR/seq2seq/hparams/train.yaml index cc3276506d547eb6b02fd2f830d902730164a38f..d61179fa9787d06cdfe128ff4942485725648033 100644 --- a/recipes/TIMIT/ASR/seq2seq/hparams/train.yaml +++ b/recipes/TIMIT/ASR/seq2seq/hparams/train.yaml @@ -22,7 +22,7 @@ test_annotation: !ref <save_folder>/test.json skip_prep: False # Skip data preparation uppercase: False # Must be True when the TIMIT dataset is in the upper-case version -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 50 batch_size: 8 # Used if dynamic_batching is False lr: 0.0003 @@ -34,7 +34,7 @@ sample_rate: 16000 n_fft: 400 n_mels: 40 -# Model parameters +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dropout: 0.15 cnn_blocks: 2 @@ -88,45 +88,30 @@ dynamic_batch_sampler: shuffle: !ref <shuffle> batch_ordering: !ref <batch_ordering> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. 
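# min_augmentations / max_augmentations bound how many of the listed
# augmentations are sampled per batch; with min = max = 3 and exactly three
# transforms listed, all of them are applied every time (in list order, since
# shuffle_augmentations stays at its default). A sketch with illustrative
# modules:
#
#   import torch
#   from speechbrain.augment.augmenter import Augmenter
#   from speechbrain.augment.time_domain import SpeedPerturb, DropFreq, DropChunk
#
#   aug = Augmenter(min_augmentations=3, max_augmentations=3,
#                   augmentations=[SpeedPerturb(16000, speeds=[95, 100, 105]),
#                                  DropFreq(), DropChunk()])
#   wavs, lens = aug(torch.rand(2, 16000), lengths=torch.ones(2))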
-drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -135,6 +120,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + normalize: !new:speechbrain.processing.features.InputNormalization norm_type: global diff --git a/recipes/TIMIT/ASR/seq2seq/hparams/train_with_wav2vec2.yaml b/recipes/TIMIT/ASR/seq2seq/hparams/train_with_wav2vec2.yaml index 9756e9356b2676c1d3f32d4b0f9cf6362e2c013b..705f79e9a8cefc25b3138070b6fcd10682381c03 100644 --- a/recipes/TIMIT/ASR/seq2seq/hparams/train_with_wav2vec2.yaml +++ b/recipes/TIMIT/ASR/seq2seq/hparams/train_with_wav2vec2.yaml @@ -23,7 +23,7 @@ test_annotation: !ref <save_folder>/test.json skip_prep: False # Skip data preparation uppercase: False # Must be True when the TIMIT dataset is in the upper-case version -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 20 batch_size: 8 lr: 0.0003 @@ -33,7 +33,7 @@ sorting: ascending precision: fp32 # bf16, fp16 or fp32 sample_rate: 16000 -# Model parameters +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dnn_layers: 2 dnn_neurons: 1024 @@ -66,45 +66,30 @@ test_dataloader_opts: batch_size: !ref <batch_size> num_workers: !ref <batch_size> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. 
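# Inside the recipes these modules are not called directly: compute_forward
# applies the combined augmenter, and, as the train.py hunks in this diff
# show, the targets are then replicated to match the enlarged batch. A hedged
# sketch of that training-time pattern (variable names are illustrative):
#
#   if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"):
#       wavs, wav_lens = self.hparams.wav_augment(wavs, wav_lens)
#       tokens = self.hparams.wav_augment.replicate_labels(tokens)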
-drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -113,6 +98,7 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> diff --git a/recipes/TIMIT/ASR/transducer/hparams/train.yaml b/recipes/TIMIT/ASR/transducer/hparams/train.yaml index 5b8e53809d679c0829bba2c8662ef2491db5acae..204297dc68591031a53ac15128d3194a551071c4 100644 --- a/recipes/TIMIT/ASR/transducer/hparams/train.yaml +++ b/recipes/TIMIT/ASR/transducer/hparams/train.yaml @@ -28,7 +28,7 @@ data_folder_noise: !ref <data_folder>/noise # The noisy sequences for data augmentation NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1 noise_annotation: !ref <save_folder>/noise.csv # The data manifest files are created by the data preparation script -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 50 batch_size: 8 lr: 1.0 @@ -40,7 +40,7 @@ n_fft: 400 n_mels: 40 -# Model parameters +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dropout: 0.15 cnn_blocks: 2 @@ -92,6 +92,8 @@ compute_features: !new:speechbrain.lobes.features.Fbank normalize: !new:speechbrain.processing.features.InputNormalization norm_type: global +############################## Augmentations ################################### + # Download and prepare the dataset of noisy sequences for augmentation prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL URL: !ref <NOISE_DATASET_URL> @@ -99,58 +101,38 @@ prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_U ext: wav csv_file: !ref <noise_annotation> - # Add noise to input
signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation - add_noise: !new:speechbrain.augment.time_domain.AddNoise csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> + snr_low: 0 + snr_high: 15 noise_sample_rate: !ref <sample_rate> clean_sample_rate: !ref <sample_rate> num_workers: !ref <num_workers> # Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 @@ -160,6 +142,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + enc: !new:speechbrain.lobes.models.CRDNN.CRDNN input_shape: [null, null, !ref <n_mels>] diff --git a/recipes/TIMIT/ASR/transducer/hparams/train_wav2vec.yaml b/recipes/TIMIT/ASR/transducer/hparams/train_wav2vec.yaml index 90f899d26bfb469234b4856c8e5af7a9f81b38f9..9ead09f56ecc8d959455233397cc28ab11d875b6 100644 --- a/recipes/TIMIT/ASR/transducer/hparams/train_wav2vec.yaml +++ b/recipes/TIMIT/ASR/transducer/hparams/train_wav2vec.yaml @@ -28,7 +28,7 @@ test_annotation: !ref <save_folder>/test.json skip_prep: False # Skip data preparation uppercase: False # Must be True when the TIMIT dataset is in the upper-case version -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 20 batch_size: 8 lr: 0.0003 @@ -41,7 +41,7 @@ sample_rate: 16000 # n_fft: 400 # n_mels: 40 -# Model parameters +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU # dropout: 0.15 
dnn_blocks: 1 @@ -74,45 +74,30 @@ test_dataloader_opts: epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False - concat_original: False - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -121,6 +106,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 source: !ref <wav2vec2_hub> output_norm: True diff --git a/recipes/TIMIT/Alignment/hparams/train.yaml b/recipes/TIMIT/Alignment/hparams/train.yaml index 7a2a581d9719c29a3bce62925feebf21630c3b0d..aaf06b7ffc0f61291d5de751c9fe3094876fdef7 100644 --- a/recipes/TIMIT/Alignment/hparams/train.yaml +++ b/recipes/TIMIT/Alignment/hparams/train.yaml @@ -20,7 +20,7 @@ valid_annotation: !ref <data_folder>/dev.json test_annotation: !ref <data_folder>/test.json skip_prep: False # Skip data prep -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 10 batch_size: 256 lr: 0.0003 @@ -40,7 +40,7 @@ phn_set: 60 # {60, 48, 39} output_neurons: 183 blank_index: 182 -# Model parameters +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dnn_blocks: 1 dnn_neurons: 2000 diff --git a/recipes/Tedlium2/ASR/transformer/hparams/branchformer_large.yaml b/recipes/Tedlium2/ASR/transformer/hparams/branchformer_large.yaml 
index 6ffad0b0041d9de2ff5768dd303c29f7b2d81855..2f9f924c71069e6dadffcd638d25692efd1597c5 100644 --- a/recipes/Tedlium2/ASR/transformer/hparams/branchformer_large.yaml +++ b/recipes/Tedlium2/ASR/transformer/hparams/branchformer_large.yaml @@ -34,7 +34,7 @@ valid_csv: !ref <output_folder>/dev/dev.csv test_csv: - !ref <output_folder>/test/test.csv -# Training parameters +####################### Training Parameters #################################### # To make Transformers converge, the global bath size should be large enough. # The global batch size is computed as batch_size * n_gpus * grad_accumulation_factor. # Empirically, we found that this value should be >= 128. @@ -97,7 +97,7 @@ valid_dataloader_opts: test_dataloader_opts: batch_size: 1 -####################### Model parameters ########################### +####################### Model Parameters ########################### # Transformer d_model: 512 nhead: 8 diff --git a/recipes/Tedlium2/Tokenizer/hparams/tedlium2_500_bpe.yaml b/recipes/Tedlium2/Tokenizer/hparams/tedlium2_500_bpe.yaml index a97290f13f49d2cb09271a388529d4ee652f3015..03c91b12692c4415e1c3cdb71b0802467b2505f6 100644 --- a/recipes/Tedlium2/Tokenizer/hparams/tedlium2_500_bpe.yaml +++ b/recipes/Tedlium2/Tokenizer/hparams/tedlium2_500_bpe.yaml @@ -14,7 +14,7 @@ skip_prep: False train_csv: !ref <output_folder>/train/train.csv valid_csv: !ref <output_folder>/dev/dev.csv -# Training parameters +####################### Training Parameters #################################### token_type: bpe # ["unigram", "bpe", "char"] token_output: 500 # index(blank/eos/bos/unk) = 0 character_coverage: 1.0 diff --git a/recipes/UrbanSound8k/SoundClassification/hparams/train_ecapa_tdnn.yaml b/recipes/UrbanSound8k/SoundClassification/hparams/train_ecapa_tdnn.yaml index 1d9613a165bb25336112a8c15f38343d9edb52ae..3ecb1119b460be4c313dd61fc7d6a2a0321bc61f 100644 --- a/recipes/UrbanSound8k/SoundClassification/hparams/train_ecapa_tdnn.yaml +++ b/recipes/UrbanSound8k/SoundClassification/hparams/train_ecapa_tdnn.yaml @@ -48,7 +48,7 @@ skip_manifest_creation: False ckpt_interval_minutes: 15 # save checkpoint every N min -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 25 batch_size: 32 lr: 0.001 diff --git a/recipes/Voicebank/ASR/CTC/hparams/train.yaml b/recipes/Voicebank/ASR/CTC/hparams/train.yaml index 65b833a0bcc33915a60fbe278de1fe2c814c0384..a49bae5fa4caebfd13f87b4fc4a8f24db47af522 100644 --- a/recipes/Voicebank/ASR/CTC/hparams/train.yaml +++ b/recipes/Voicebank/ASR/CTC/hparams/train.yaml @@ -20,7 +20,7 @@ valid_annotation: !ref <output_folder>/valid.json test_annotation: !ref <output_folder>/test.json skip_prep: False # Skip data preparation -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 50 batch_size: 8 sorting: ascending @@ -37,7 +37,7 @@ sample_rate: 16000 n_fft: 400 n_mels: 40 -# Model parameters +####################### Model Parameters ####################################### activation: !name:torch.nn.LeakyReLU dropout: 0.15 cnn_blocks: 2 @@ -61,45 +61,31 @@ compute_features: !new:speechbrain.lobes.features.Fbank n_fft: !ref <n_fft> n_mels: !ref <n_mels> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - 
speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 3 max_augmentations: 3 augment_prob: 1.0 @@ -108,6 +94,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## + model: !new:speechbrain.lobes.models.CRDNN.CRDNN input_shape: [null, null, !ref <n_mels>] activation: !ref <activation> diff --git a/recipes/Voicebank/MTL/ASR_enhance/hparams/enhance_mimic.yaml b/recipes/Voicebank/MTL/ASR_enhance/hparams/enhance_mimic.yaml index 2a96e25db7460d473460889dc326f6589fd2eceb..c3391498a7ad7792331fce172dd820cd15ce9cb1 100644 --- a/recipes/Voicebank/MTL/ASR_enhance/hparams/enhance_mimic.yaml +++ b/recipes/Voicebank/MTL/ASR_enhance/hparams/enhance_mimic.yaml @@ -18,7 +18,7 @@ valid_annotation: !ref <data_folder>/valid.json test_annotation: !ref <data_folder>/test.json skip_prep: False -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 50 batch_size: 8 lr: 0.0001 diff --git a/recipes/Voicebank/MTL/ASR_enhance/hparams/pretrain_perceptual.yaml b/recipes/Voicebank/MTL/ASR_enhance/hparams/pretrain_perceptual.yaml index a86fbc4ccf1cec27c6c4979fe386b146b7418e65..d384d026ae5bb20de42f48f9beb572e93bd65a00 100644 --- a/recipes/Voicebank/MTL/ASR_enhance/hparams/pretrain_perceptual.yaml +++ b/recipes/Voicebank/MTL/ASR_enhance/hparams/pretrain_perceptual.yaml @@ -18,7 +18,7 @@ valid_annotation: !ref <data_folder>/valid.json test_annotation: !ref <data_folder>/test.json skip_prep: False -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 20 ctc_epochs: 4 batch_size: 8 diff --git a/recipes/Voicebank/MTL/ASR_enhance/hparams/robust_asr.yaml b/recipes/Voicebank/MTL/ASR_enhance/hparams/robust_asr.yaml 
index 1bf087d8558d2c788c5829f9f0d5b0b0b68f5cbc..1835342c30d97c7335927fd2f6c17f7e8fe2a18d 100644 --- a/recipes/Voicebank/MTL/ASR_enhance/hparams/robust_asr.yaml +++ b/recipes/Voicebank/MTL/ASR_enhance/hparams/robust_asr.yaml @@ -24,7 +24,7 @@ test_annotation: !ref <data_folder>/test.json noise_annotation: !ref <save_folder>/noise.csv #The data manifest files are created by the data preparation script skip_prep: False -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 30 ctc_epochs: 0 batch_size: 8 @@ -141,6 +141,8 @@ compute_stft: !new:speechbrain.processing.features.STFT spectral_magnitude: !name:speechbrain.processing.features.spectral_magnitude power: 0.5 +############################## Augmentations ################################### + # Download and prepare the dataset of noisy sequences for augmentation prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL URL: !ref <NOISE_DATASET_URL> @@ -148,58 +150,38 @@ prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_U ext: wav csv_file: !ref <noise_annotation> - # Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation - add_noise: !new:speechbrain.augment.time_domain.AddNoise csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> + snr_low: 0 + snr_high: 15 noise_sample_rate: !ref <sample_rate> clean_sample_rate: !ref <sample_rate> num_workers: !ref <num_workers> # Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. 
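A note on the inlining pattern above: HyperPyYAML's !new: tag maps the indented keys directly onto the class constructor, so folding the standalone hyperparameters into the block changes nothing at runtime. A minimal Python sketch of what the inlined DropFreq/DropChunk entries construct follows; the values mirror the YAML, while the dummy batch and the millisecond conversions are my own additions (DropChunk lengths are counted in samples).

# Equivalent Python for the inlined DropFreq/DropChunk hyperparameters (sketch).
import torch
from speechbrain.augment.time_domain import DropChunk, DropFreq

drop_freq = DropFreq(
    drop_freq_low=0,         # band centers span the whole spectrum...
    drop_freq_high=1,        # ...expressed as fractions of Nyquist
    drop_freq_count_low=1,   # drop at least 1 band
    drop_freq_count_high=3,  # and at most 3
    drop_freq_width=0.05,    # width of each dropped band
)
drop_chunk = DropChunk(
    drop_length_low=1000,    # in samples: roughly 62 ms at 16 kHz
    drop_length_high=2000,   # roughly 125 ms at 16 kHz
    drop_count_low=1,
    drop_count_high=5,
)

wavs = torch.randn(4, 16000)   # (batch, time) dummy waveforms
lens = torch.ones(4)           # relative lengths, as SpeechBrain expects
wavs = drop_freq(wavs)
wavs = drop_chunk(wavs, lens)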
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 @@ -209,7 +191,6 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] - fbank: !new:speechbrain.lobes.features.Fbank n_mels: !ref <n_mels> sample_rate: !ref <sample_rate> diff --git a/recipes/VoxCeleb/SpeakerRec/hparams/train_ecapa_tdnn.yaml b/recipes/VoxCeleb/SpeakerRec/hparams/train_ecapa_tdnn.yaml index d7d2be081e1f4f6997b2bcdb6b0ebbbf6953ec40..c93ba21ecb1109e9cf67ce5c02a4255fff16192a 100644 --- a/recipes/VoxCeleb/SpeakerRec/hparams/train_ecapa_tdnn.yaml +++ b/recipes/VoxCeleb/SpeakerRec/hparams/train_ecapa_tdnn.yaml @@ -85,6 +85,7 @@ classifier: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> +############################## Augmentations ################################### # Download and prepare the dataset of noisy sequences for augmentation prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL @@ -95,18 +96,14 @@ prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_U # Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation - add_noise: !new:speechbrain.augment.time_domain.AddNoise csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> + snr_low: 0 + snr_high: 15 noise_sample_rate: !ref <sample_rate> clean_sample_rate: !ref <sample_rate> num_workers: !ref <num_workers> - # Download and prepare the dataset of room impulse responses for augmentation prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL URL: !ref <RIR_DATASET_URL> @@ -122,37 +119,24 @@ add_reverb: !new:speechbrain.augment.time_domain.AddReverb num_workers: !ref <num_workers> # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. 
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter parallel_augment: True concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 diff --git a/recipes/VoxCeleb/SpeakerRec/hparams/train_ecapa_tdnn_mel_spec.yaml b/recipes/VoxCeleb/SpeakerRec/hparams/train_ecapa_tdnn_mel_spec.yaml index 6cb17c6bc7b92a5eaf1592b4d66584f01438ae15..becd8e4d41511e073e5f35f1ba258cff8e8466a4 100644 --- a/recipes/VoxCeleb/SpeakerRec/hparams/train_ecapa_tdnn_mel_spec.yaml +++ b/recipes/VoxCeleb/SpeakerRec/hparams/train_ecapa_tdnn_mel_spec.yaml @@ -104,6 +104,7 @@ classifier: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> +############################## Augmentations ################################### # Download and prepare the dataset of noisy sequences for augmentation prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL @@ -114,18 +115,14 @@ prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_U # Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation - add_noise: !new:speechbrain.augment.time_domain.AddNoise csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> + snr_low: 0 + snr_high: 15 noise_sample_rate: !ref <sample_rate> clean_sample_rate: !ref <sample_rate> num_workers: !ref <num_workers> - # Download and prepare the dataset of room impulse responses for augmentation prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL URL: !ref <RIR_DATASET_URL> @@ -141,37 +138,24 @@ add_reverb: !new:speechbrain.augment.time_domain.AddReverb num_workers: !ref <num_workers> # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. 
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter parallel_augment: True concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 diff --git a/recipes/VoxCeleb/SpeakerRec/hparams/train_resnet.yaml b/recipes/VoxCeleb/SpeakerRec/hparams/train_resnet.yaml index 617457f1a4bf8bd7432f63c71f94c1b3a1d5b2cb..a20786574601e7d61b37848db29cf66493d2255e 100644 --- a/recipes/VoxCeleb/SpeakerRec/hparams/train_resnet.yaml +++ b/recipes/VoxCeleb/SpeakerRec/hparams/train_resnet.yaml @@ -55,7 +55,7 @@ right_frames: 0 deltas: False # Number of speakers -# 1211 for vox1, 5994 for vox2, 7205 for vox1+vox2 +# 1211 for vox1, 5994 for vox2, 7205 for vox1+vox2 out_n_neurons: 7205 num_workers: 4 @@ -85,6 +85,7 @@ classifier: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> +############################## Augmentations ################################### # Download and prepare the dataset of noisy sequences for augmentation prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL @@ -95,18 +96,14 @@ prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_U # Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation - add_noise: !new:speechbrain.augment.time_domain.AddNoise csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> + snr_low: 0 + snr_high: 15 noise_sample_rate: !ref <sample_rate> clean_sample_rate: !ref <sample_rate> num_workers: !ref <num_workers> - # Download and prepare the dataset of room impulse responses for augmentation prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL URL: !ref <RIR_DATASET_URL> @@ -122,37 +119,24 @@ add_reverb: !new:speechbrain.augment.time_domain.AddReverb num_workers: !ref <num_workers> # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. 
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter parallel_augment: True concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 diff --git a/recipes/VoxCeleb/SpeakerRec/hparams/train_x_vectors.yaml b/recipes/VoxCeleb/SpeakerRec/hparams/train_x_vectors.yaml index 8a70462c5ac9ccc35900016a3f810631e0b38819..ab628c681b9729ea2cf431593f524c151c599570 100644 --- a/recipes/VoxCeleb/SpeakerRec/hparams/train_x_vectors.yaml +++ b/recipes/VoxCeleb/SpeakerRec/hparams/train_x_vectors.yaml @@ -88,6 +88,7 @@ classifier: !new:speechbrain.lobes.models.Xvector.Classifier epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> +############################## Augmentations ################################### # Download and prepare the dataset of noisy sequences for augmentation prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL @@ -98,18 +99,14 @@ prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_U # Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation - add_noise: !new:speechbrain.augment.time_domain.AddNoise csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> + snr_low: 0 + snr_high: 15 noise_sample_rate: !ref <sample_rate> clean_sample_rate: !ref <sample_rate> num_workers: !ref <num_workers> - # Download and prepare the dataset of room impulse responses for augmentation prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL URL: !ref <RIR_DATASET_URL> @@ -125,37 +122,24 @@ add_reverb: !new:speechbrain.augment.time_domain.AddReverb num_workers: !ref <num_workers> # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. 
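Across all of these wav_augment hunks, only parallel_augment: False, concat_original: False, repeat_augment: 1, and shuffle_augmentations: False are deleted without replacement, which is consistent with those being the Augmenter constructor defaults; non-default values such as parallel_augment: True above are kept. A short sketch of how the composed augmenter is used; the DropFreq/DropChunk defaults and the dummy batch are placeholders of my own, and min/max are shrunk from the recipes' 4 to keep the sketch small.

# Composing augmentations, mirroring the wav_augment entries above (sketch).
import torch
from speechbrain.augment.augmenter import Augmenter
from speechbrain.augment.time_domain import DropChunk, DropFreq

wav_augment = Augmenter(
    parallel_augment=True,   # each chosen augmentation runs on the clean input
    concat_original=True,    # keep the clean batch in the output as well
    min_augmentations=2,
    max_augmentations=2,
    augment_prob=1.0,        # always augment
    augmentations=[DropFreq(), DropChunk()],
)

wavs = torch.randn(4, 16000)
lens = torch.ones(4)
wavs_aug, lens_aug = wav_augment(wavs, lens)
# With parallel_augment + concat_original, the output batch holds the clean
# examples plus one augmented copy per applied augmentation (here 4+4+4=12).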
-drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter parallel_augment: True concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 diff --git a/recipes/VoxLingua107/lang_id/hparams/train_ecapa.yaml b/recipes/VoxLingua107/lang_id/hparams/train_ecapa.yaml index a2d87248629eba9f9d985ca49962eef04fd32cf4..db29a301b80588ce8b2f0d82b0f4d4da16afa24b 100644 --- a/recipes/VoxLingua107/lang_id/hparams/train_ecapa.yaml +++ b/recipes/VoxLingua107/lang_id/hparams/train_ecapa.yaml @@ -58,6 +58,8 @@ val_dataloader_options: num_workers: 1 batch_size: !ref <batch_size_val> +############################## Augmentations ################################### + # Download and prepare the dataset of noisy sequences for augmentation prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL URL: !ref <NOISE_DATASET_URL> @@ -72,7 +74,6 @@ prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL ext: wav csv_file: !ref <rir_annotation> - # Add reverberation to input signal add_reverb: !new:speechbrain.augment.time_domain.AddReverb csv_file: !ref <rir_annotation> @@ -81,27 +82,21 @@ add_reverb: !new:speechbrain.augment.time_domain.AddReverb num_workers: !ref <num_workers> # Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation add_noise: !new:speechbrain.augment.time_domain.AddNoise csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> + snr_low: 0 + snr_high: 15 noise_sample_rate: !ref <sample_rate> clean_sample_rate: !ref <sample_rate> num_workers: !ref <num_workers> # Speed perturbation -speed_changes: [90, 100, 110] # List of speed changes for time-stretching speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 shuffle_augmentations: True min_augmentations: 1 max_augmentations: 3 diff --git a/recipes/WHAMandWHAMR/enhancement/hparams/cnntransformer-wham-DM.yaml b/recipes/WHAMandWHAMR/enhancement/hparams/cnntransformer-wham-DM.yaml index a38bb8c885792abce888d34e5f0ab4138c3c1625..843c9fb0917fcb5a1aed07668a0426be2d59d558 100644 --- a/recipes/WHAMandWHAMR/enhancement/hparams/cnntransformer-wham-DM.yaml +++ b/recipes/WHAMandWHAMR/enhancement/hparams/cnntransformer-wham-DM.yaml @@ -45,7 +45,7 @@ save_audio: True # Save estimated sources on disk sample_rate: 8000 n_audio_to_save: 20 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 8 lr: 0.0001 diff --git 
a/recipes/WHAMandWHAMR/enhancement/hparams/cnntransformer-whamr-DM.yaml b/recipes/WHAMandWHAMR/enhancement/hparams/cnntransformer-whamr-DM.yaml index af3fcb0d79f942c528b591cbf46d4f2cec0c8e88..b55f05a1169c344207f7fee4a885a425be200b5a 100644 --- a/recipes/WHAMandWHAMR/enhancement/hparams/cnntransformer-whamr-DM.yaml +++ b/recipes/WHAMandWHAMR/enhancement/hparams/cnntransformer-whamr-DM.yaml @@ -45,7 +45,7 @@ save_audio: True # Save estimated sources on disk sample_rate: 8000 n_audio_to_save: 20 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 8 lr: 0.0001 diff --git a/recipes/WHAMandWHAMR/enhancement/hparams/convtasnet-whamr-DM.yaml b/recipes/WHAMandWHAMR/enhancement/hparams/convtasnet-whamr-DM.yaml index 7c2990442e0d631d0ce2b7a2fbb6e0f2edf33103..545428d5ac31f32ee81dd2af7a616040cfed231e 100644 --- a/recipes/WHAMandWHAMR/enhancement/hparams/convtasnet-whamr-DM.yaml +++ b/recipes/WHAMandWHAMR/enhancement/hparams/convtasnet-whamr-DM.yaml @@ -44,7 +44,7 @@ save_audio: True # Save estimated sources on disk sample_rate: 8000 n_audio_to_save: 20 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 10 lr: 0.00015 diff --git a/recipes/WHAMandWHAMR/enhancement/hparams/dprnn-whamr-DM.yaml b/recipes/WHAMandWHAMR/enhancement/hparams/dprnn-whamr-DM.yaml index f3158a6252e95e9186f4cea07cdbacf78f14f3a1..d974b03c11edf0c0db0698f3b7f30d81a4c9a5a0 100644 --- a/recipes/WHAMandWHAMR/enhancement/hparams/dprnn-whamr-DM.yaml +++ b/recipes/WHAMandWHAMR/enhancement/hparams/dprnn-whamr-DM.yaml @@ -44,7 +44,7 @@ save_audio: True # Save estimated sources on disk sample_rate: 8000 n_audio_to_save: 20 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-wham.yaml b/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-wham.yaml index 75c90a0f108c62040ee1f01228d49a6505ab1210..df1935306b47512947cd04962744c0217192811f 100644 --- a/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-wham.yaml +++ b/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-wham.yaml @@ -44,7 +44,7 @@ noprogressbar: False save_audio: False # Save estimated sources on disk sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-whamr-16k-DM.yaml b/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-whamr-16k-DM.yaml index 625801e5178f4383f7b4c65bc7f2c44ec7730e43..dc27834919fca3b9cff38b3a12e65fb56b18c653 100644 --- a/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-whamr-16k-DM.yaml +++ b/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-whamr-16k-DM.yaml @@ -46,7 +46,7 @@ save_audio: True # Save estimated sources on disk sample_rate: 16000 n_audio_to_save: 20 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-whamr-16k.yaml b/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-whamr-16k.yaml index d42060642fd291292f01f862c8d337e180a68a98..d11332c7e0702ea5cae1c778a4f8b7f90c13ed58 100644 --- a/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-whamr-16k.yaml +++ 
b/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-whamr-16k.yaml @@ -44,7 +44,7 @@ save_audio: True # Save estimated sources on disk sample_rate: 16000 n_audio_to_save: 20 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-whamr-DM.yaml b/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-whamr-DM.yaml index 2acd34370d777850006ac1d5c45830675c59c18f..3721698bced2416d6f97521978eb18934a0f3a6c 100644 --- a/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-whamr-DM.yaml +++ b/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-whamr-DM.yaml @@ -44,7 +44,7 @@ save_audio: True # Save estimated sources on disk sample_rate: 8000 n_audio_to_save: 20 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-whamr.yaml b/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-whamr.yaml index 536a46f4960c264449a682b748c7c350b0847187..14a38a06b6b442e3d1b9086e6889a0125a884ef1 100644 --- a/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-whamr.yaml +++ b/recipes/WHAMandWHAMR/enhancement/hparams/sepformer-whamr.yaml @@ -44,7 +44,7 @@ save_audio: True # Save estimated sources on disk sample_rate: 8000 n_audio_to_save: 20 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/WHAMandWHAMR/separation/hparams/sepformer-wham.yaml b/recipes/WHAMandWHAMR/separation/hparams/sepformer-wham.yaml index 1661d68f271740a2af8c1160a137ca7823e6331b..db920e7fab8d969cbebe61a15b4884ce6f843da1 100644 --- a/recipes/WHAMandWHAMR/separation/hparams/sepformer-wham.yaml +++ b/recipes/WHAMandWHAMR/separation/hparams/sepformer-wham.yaml @@ -42,7 +42,7 @@ noprogressbar: False save_audio: False # Save estimated sources on disk sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/WHAMandWHAMR/separation/hparams/sepformer-whamr.yaml b/recipes/WHAMandWHAMR/separation/hparams/sepformer-whamr.yaml index d4bc250d0c4bcddc8385717970e265729c993b3c..8538529a629e4ed99176930e46d778e9c8a12438 100644 --- a/recipes/WHAMandWHAMR/separation/hparams/sepformer-whamr.yaml +++ b/recipes/WHAMandWHAMR/separation/hparams/sepformer-whamr.yaml @@ -40,7 +40,7 @@ num_spks: 2 # set to 3 for wsj0-3mix save_audio: False # Save estimated sources on disk sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/WSJ0Mix/separation/hparams/convtasnet.yaml b/recipes/WSJ0Mix/separation/hparams/convtasnet.yaml index 03833050419273e7888f56e49ba34868c371a6fc..0305c6236d5cb7faa07739321428e4ad97e1a5a4 100644 --- a/recipes/WSJ0Mix/separation/hparams/convtasnet.yaml +++ b/recipes/WSJ0Mix/separation/hparams/convtasnet.yaml @@ -36,7 +36,7 @@ noprogressbar: False save_audio: False # Save estimated sources on disk sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/WSJ0Mix/separation/hparams/dprnn.yaml b/recipes/WSJ0Mix/separation/hparams/dprnn.yaml index 
a78a782666f0cd93c8f696174948df1cc25cdef8..df1952d8c93d034db2f97749865a02a40c45ee22 100644 --- a/recipes/WSJ0Mix/separation/hparams/dprnn.yaml +++ b/recipes/WSJ0Mix/separation/hparams/dprnn.yaml @@ -36,7 +36,7 @@ noprogressbar: False save_audio: False # Save estimated sources on disk sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/WSJ0Mix/separation/hparams/resepformer.yaml b/recipes/WSJ0Mix/separation/hparams/resepformer.yaml index 2b2711f52c0f5fdb9c1f0961d63561b6704ac289..406f2aa76510ddce75fed5fc8e40e2cee3aea08b 100644 --- a/recipes/WSJ0Mix/separation/hparams/resepformer.yaml +++ b/recipes/WSJ0Mix/separation/hparams/resepformer.yaml @@ -37,7 +37,7 @@ num_spks: 2 # set to 3 for wsj0-3mix save_audio: False # Save estimated sources on disk sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/WSJ0Mix/separation/hparams/sepformer-conformerintra.yaml b/recipes/WSJ0Mix/separation/hparams/sepformer-conformerintra.yaml index 15550c9a24f77362c392cf637b0b13cb37ea21b3..2cf2b7ac551547a88df59a1f2f658cc895f0b4be 100644 --- a/recipes/WSJ0Mix/separation/hparams/sepformer-conformerintra.yaml +++ b/recipes/WSJ0Mix/separation/hparams/sepformer-conformerintra.yaml @@ -37,7 +37,7 @@ num_spks: 2 # set to 3 for wsj0-3mix save_audio: False # Save estimated sources on disk sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/WSJ0Mix/separation/hparams/sepformer-customdataset.yaml b/recipes/WSJ0Mix/separation/hparams/sepformer-customdataset.yaml index 82a2d3009029882b4aa6726519cc08197d83fe00..c896f2dfd844ee1dbac3625f3eea38358fcc8d0d 100644 --- a/recipes/WSJ0Mix/separation/hparams/sepformer-customdataset.yaml +++ b/recipes/WSJ0Mix/separation/hparams/sepformer-customdataset.yaml @@ -39,7 +39,7 @@ noprogressbar: False save_audio: True # Save estimated sources on disk sample_rate: 16000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/WSJ0Mix/separation/hparams/sepformer.yaml b/recipes/WSJ0Mix/separation/hparams/sepformer.yaml index 4787fb3aa93a7854fc287774b28097b3b6cdbbb7..77319604d02669a6823e79acdb6aac75f70f88aa 100644 --- a/recipes/WSJ0Mix/separation/hparams/sepformer.yaml +++ b/recipes/WSJ0Mix/separation/hparams/sepformer.yaml @@ -40,7 +40,7 @@ save_audio: True # Save estimated sources on disk n_audio_to_save: 20 sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git a/recipes/WSJ0Mix/separation/hparams/skim.yaml b/recipes/WSJ0Mix/separation/hparams/skim.yaml index 53b312efd5b1e076b6e7e77fc57879ac41e74110..606c7060a5115b863805c835575bd6a6db182cf7 100644 --- a/recipes/WSJ0Mix/separation/hparams/skim.yaml +++ b/recipes/WSJ0Mix/separation/hparams/skim.yaml @@ -37,7 +37,7 @@ num_spks: 2 # set to 3 for wsj0-3mix save_audio: False # Save estimated sources on disk sample_rate: 8000 -# Training parameters +####################### Training Parameters #################################### N_epochs: 200 batch_size: 1 lr: 0.00015 diff --git 
a/recipes/ZaionEmotionDataset/emotion_diarization/hparams/train.yaml b/recipes/ZaionEmotionDataset/emotion_diarization/hparams/train.yaml index 644ca89a27255f1f37a10a15004294d010b78737..0d9601d2068e91db1819a2647a8628657bc0a47c 100644 --- a/recipes/ZaionEmotionDataset/emotion_diarization/hparams/train.yaml +++ b/recipes/ZaionEmotionDataset/emotion_diarization/hparams/train.yaml @@ -30,7 +30,7 @@ train_annotation: !ref <output_folder>/train.json valid_annotation: !ref <output_folder>/valid.json test_annotation: !ref <output_folder>/test.json -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 15 lr: 0.0001 lr_wav2vec: 0.00001 diff --git a/recipes/fluent-speech-commands/Tokenizer/hparams/tokenizer_bpe51.yaml b/recipes/fluent-speech-commands/Tokenizer/hparams/tokenizer_bpe51.yaml index db7c1ddb7287627e1f59ac1374bc774e94f4e302..eff38c7bf42af12f0bb5945cc54209c12ba36551 100644 --- a/recipes/fluent-speech-commands/Tokenizer/hparams/tokenizer_bpe51.yaml +++ b/recipes/fluent-speech-commands/Tokenizer/hparams/tokenizer_bpe51.yaml @@ -13,7 +13,7 @@ train_csv: !ref <output_folder>/train.csv valid_csv: !ref <output_folder>/valid.csv skip_prep: False -# Training parameters +####################### Training Parameters #################################### token_type: unigram # ["unigram", "bpe", "char"] token_output: 51 # index(blank/eos/bos/unk) = 0 character_coverage: 1.0 diff --git a/recipes/fluent-speech-commands/direct/hparams/train.yaml b/recipes/fluent-speech-commands/direct/hparams/train.yaml index a7c072343f0a44a18910ef972ba3d63a34af4bf3..428faf144a9fb1e371bde9c3a6816bfb1e95138d 100644 --- a/recipes/fluent-speech-commands/direct/hparams/train.yaml +++ b/recipes/fluent-speech-commands/direct/hparams/train.yaml @@ -32,14 +32,14 @@ rir_annotation: !ref <save_folder>/rir.csv tokenizer_file: https://www.dropbox.com/s/hvf2huofnq0sjbn/51_unigram.model?dl=1 skip_prep: False -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 6 batch_size: 16 lr: 0.0003 # token_type: unigram # ["unigram", "bpe", "char"] sorting: random -# Model parameters +####################### Model Parameters ####################################### sample_rate: 16000 emb_size: 128 dec_neurons: 512 @@ -65,6 +65,8 @@ dataloader_opts: epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> +############################## Augmentations ################################### + # Download and prepare the dataset of noisy sequences for augmentation prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL URL: !ref <NOISE_DATASET_URL> @@ -87,45 +89,32 @@ add_reverb: !new:speechbrain.augment.time_domain.AddReverb num_workers: !ref <num_workers> # Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation add_noise: !new:speechbrain.augment.time_domain.AddNoise csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> + snr_low: 0 + snr_high: 15 noise_sample_rate: !ref <sample_rate> clean_sample_rate: !ref <sample_rate> num_workers: !ref <num_workers> # Frequency drop: randomly drops a number of frequency bands to zero.
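For readers new to these knobs: AddNoise draws a signal-to-noise ratio uniformly from [snr_low, snr_high] dB for each example and scales the noise to hit it. A back-of-envelope helper below shows what the dB range means; mix_at_snr is a hypothetical function of my own, not the library's internal code, which additionally handles resampling (noise_sample_rate/clean_sample_rate) and variable lengths.

# What a target SNR in dB means for the noise scale (sketch).
import torch

def mix_at_snr(clean: torch.Tensor, noise: torch.Tensor, snr_db: float) -> torch.Tensor:
    p_clean = clean.pow(2).mean()
    p_noise = noise.pow(2).mean().clamp_min(1e-12)
    # SNR_dB = 10 * log10(P_clean / P_noise_scaled), solved for the scale factor.
    scale = torch.sqrt(p_clean / (p_noise * 10 ** (snr_db / 10)))
    return clean + scale * noise

clean, noise = torch.randn(16000), torch.randn(16000)
hard = mix_at_snr(clean, noise, snr_db=0.0)    # snr_low: noise as strong as the speech
easy = mix_at_snr(clean, noise, snr_db=15.0)   # snr_high: noise ~32x weaker in power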
-drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 3 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 3 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 shuffle_augmentations: True min_augmentations: 1 max_augmentations: 4 @@ -136,7 +125,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] -# Models +############################## Models ########################################## + asr_model_source: speechbrain/asr-crdnn-rnnlm-librispeech slu_enc: !new:speechbrain.nnet.containers.Sequential diff --git a/recipes/timers-and-such/LM/hparams/train.yaml b/recipes/timers-and-such/LM/hparams/train.yaml index 485dd54265a57612f862b180d223c79149d3aa3b..f3ba652edbc6effb882d697e13ee3747d7badf9f 100644 --- a/recipes/timers-and-such/LM/hparams/train.yaml +++ b/recipes/timers-and-such/LM/hparams/train.yaml @@ -23,7 +23,7 @@ csv_test_synth: !ref <output_folder>/test-synth-type=decoupled.csv csv_test_real: !ref <output_folder>/test-real-type=decoupled.csv skip_prep: False -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 10 batch_size: 128 lr: 0.0003 diff --git a/recipes/timers-and-such/Tokenizer/hparams/tokenizer_bpe51.yaml b/recipes/timers-and-such/Tokenizer/hparams/tokenizer_bpe51.yaml index 7554a03421cfa18627b4809af2296f6c54d4ea13..2a9f39161d2237e228ee0a9d82def76728a52324 100644 --- a/recipes/timers-and-such/Tokenizer/hparams/tokenizer_bpe51.yaml +++ b/recipes/timers-and-such/Tokenizer/hparams/tokenizer_bpe51.yaml @@ -15,7 +15,7 @@ train_csv: !ref <output_folder>/train-type=direct.csv valid_csv: !ref <output_folder>/dev-real-type=direct.csv -# Training parameters +####################### Training Parameters #################################### token_type: unigram # ["unigram", "bpe", "char"] token_output: 51 # index(blank/eos/bos/unk) = 0 character_coverage: 1.0 diff --git a/recipes/timers-and-such/decoupled/hparams/train_LS_LM.yaml b/recipes/timers-and-such/decoupled/hparams/train_LS_LM.yaml index 212752668aba1ecdabcfffab3831f906c34165b8..1ee56d5612f0a7c0cf18b4afbb14f56bb64523c3 100644 ---
a/recipes/timers-and-such/decoupled/hparams/train_LS_LM.yaml +++ b/recipes/timers-and-such/decoupled/hparams/train_LS_LM.yaml @@ -34,7 +34,7 @@ skip_prep: False ckpt_interval_minutes: 15 # save checkpoint every N min test_on_all_real: False -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 1 batch_size: 16 lr: 0.0003 diff --git a/recipes/timers-and-such/decoupled/hparams/train_TAS_LM.yaml b/recipes/timers-and-such/decoupled/hparams/train_TAS_LM.yaml index fcf6393e68868e7f9b83283bcde5c0a107ab4882..5f0d93d09a22d524731862d3c83823911aea5ef8 100644 --- a/recipes/timers-and-such/decoupled/hparams/train_TAS_LM.yaml +++ b/recipes/timers-and-such/decoupled/hparams/train_TAS_LM.yaml @@ -34,7 +34,7 @@ skip_prep: False ckpt_interval_minutes: 15 # save checkpoint every N min test_on_all_real: False -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 1 batch_size: 16 lr: 0.0003 diff --git a/recipes/timers-and-such/direct/hparams/train.yaml b/recipes/timers-and-such/direct/hparams/train.yaml index 4fb574fc3d82c32575aadef338e63f8b757f1ea4..01909eb5b02891a6bc91b66ed5a360f6e24df5bd 100644 --- a/recipes/timers-and-such/direct/hparams/train.yaml +++ b/recipes/timers-and-such/direct/hparams/train.yaml @@ -38,14 +38,14 @@ data_folder_noise: !ref <data_folder>/noise # The noisy sequencies for data augm NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1 noise_annotation: !ref <save_folder>/noise.csv #The data manifest files are created by the data preparation script -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 1 batch_size: 16 lr: 0.0003 # token_type: unigram # ["unigram", "bpe", "char"] sorting: random -# Model parameters +####################### Model Parameters ####################################### sample_rate: 16000 emb_size: 128 dec_neurons: 512 @@ -71,6 +71,7 @@ dataloader_opts: epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> +############################## Augmentations ################################### # Download and prepare the dataset of noisy sequences for augmentation prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL @@ -80,56 +81,37 @@ prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_U csv_file: !ref <noise_annotation> # Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation - add_noise: !new:speechbrain.augment.time_domain.AddNoise csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> + snr_low: 0 + snr_high: 15 noise_sample_rate: !ref <sample_rate> clean_sample_rate: !ref <sample_rate> num_workers: !ref <num_workers> # Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. 
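One practical consequence of this inlining, worth noting once: values that used to be standalone keys (snr_low, speed_changes, drop_chunk_*, ...) are now constructor arguments inside a !new: block, so they are no longer top-level keys that the usual --key value command-line style can override by name; the keys that remain at the top level still can be. A sketch of loading one of these files with HyperPyYAML; the path and override values are placeholders of my own, and whether data_folder is a !PLACEHOLDER varies by recipe.

# Loading a recipe file; overrides apply to top-level keys (sketch).
from hyperpyyaml import load_hyperpyyaml

with open("hparams/train.yaml") as fin:
    hparams = load_hyperpyyaml(
        fin,
        overrides={"data_folder": "/path/to/data", "number_of_epochs": 2},
    )

# Loading a full recipe file instantiates every object declared in it,
# including models, so the result is ready to use:
wav_augment = hparams["wav_augment"]  # an already-constructed Augmenter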
-drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 @@ -139,8 +121,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] +############################## Models ########################################## -# Models asr_model_source: speechbrain/asr-crdnn-rnnlm-librispeech slu_enc: !new:speechbrain.nnet.containers.Sequential diff --git a/recipes/timers-and-such/direct/hparams/train_with_wav2vec2.yaml b/recipes/timers-and-such/direct/hparams/train_with_wav2vec2.yaml index b9b451e910b9bda29dfd21491c48cf5a317b1619..b9ad3cfc2490fab5fc8f91930e6bac77fbd10bf6 100644 --- a/recipes/timers-and-such/direct/hparams/train_with_wav2vec2.yaml +++ b/recipes/timers-and-such/direct/hparams/train_with_wav2vec2.yaml @@ -37,7 +37,7 @@ ckpt_interval_minutes: 15 # save checkpoint every N min test_on_all_real: False -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 50 batch_size: 8 lr: 0.0004 @@ -49,7 +49,7 @@ freeze_wav2vec: False # token_type: unigram # ["unigram", "bpe", "char"] sorting: ascending -# Model parameters +####################### Model Parameters ####################################### sample_rate: 16000 emb_size: 128 dec_neurons: 512 @@ -171,45 +171,31 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer lr_annealing_wav2vec2: !ref <lr_annealing_wav2vec2> counter: !ref <epoch_counter> -# Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching +############################## Augmentations ################################### +# Speed perturbation speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. 
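The speeds list is given in percent of the original rate, so [95, 100, 105] stretches or squeezes an input by up to 5% (100 leaves it untouched). A minimal sketch of the perturbation on its own, outside the Augmenter; the dummy input is my own, and the output length changes with the sampled speed.

# Speed perturbation as configured in these recipes (sketch).
import torch
from speechbrain.augment.time_domain import SpeedPerturb

speed_perturb = SpeedPerturb(orig_freq=16000, speeds=[95, 100, 105])
wavs = torch.randn(4, 16000)   # (batch, time)
wavs_sp = speed_perturb(wavs)  # one speed is sampled at random per call
print(wavs_sp.shape)           # time dimension grows or shrinks by ~5%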
-drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 diff --git a/recipes/timers-and-such/multistage/hparams/train_LS_LM.yaml b/recipes/timers-and-such/multistage/hparams/train_LS_LM.yaml index 3d1c4e1566187ebe293242ac064c833fdb4fbce1..b804df9f952a3fb90accb9c9a9db0cb5201d2662 100644 --- a/recipes/timers-and-such/multistage/hparams/train_LS_LM.yaml +++ b/recipes/timers-and-such/multistage/hparams/train_LS_LM.yaml @@ -39,7 +39,7 @@ NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.z noise_annotation: !ref <save_folder>/noise.csv #The data manifest files are created by the data preparation script -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 1 batch_size: 16 lr: 0.0003 @@ -73,6 +73,8 @@ dataloader_opts: epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> +############################## Augmentations ################################### + # Download and prepare the dataset of noisy sequences for augmentation prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL URL: !ref <NOISE_DATASET_URL> @@ -81,57 +83,37 @@ prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_U csv_file: !ref <noise_annotation> # Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # Max SNR for noise augmentation - add_noise: !new:speechbrain.augment.time_domain.AddNoise csv_file: !ref <noise_annotation> - snr_low: !ref <snr_low> - snr_high: !ref <snr_high> + snr_low: 0 + snr_high: 15 noise_sample_rate: !ref <sample_rate> clean_sample_rate: !ref <sample_rate> num_workers: !ref <num_workers> # Speed perturbation -speed_changes: [95, 100, 105] # List of speed changes for time-stretching - speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb orig_freq: !ref <sample_rate> - speeds: !ref <speed_changes> + 
speeds: [95, 100, 105] # Frequency drop: randomly drops a number of frequency bands to zero. -drop_freq_low: 0 # Min frequency band dropout probability -drop_freq_high: 1 # Max frequency band dropout probability -drop_freq_count_low: 1 # Min number of frequency bands to drop -drop_freq_count_high: 3 # Max number of frequency bands to drop -drop_freq_width: 0.05 # Width of frequency bands to drop - drop_freq: !new:speechbrain.augment.time_domain.DropFreq - drop_freq_low: !ref <drop_freq_low> - drop_freq_high: !ref <drop_freq_high> - drop_freq_count_low: !ref <drop_freq_count_low> - drop_freq_count_high: !ref <drop_freq_count_high> - drop_freq_width: !ref <drop_freq_width> + drop_freq_low: 0 + drop_freq_high: 1 + drop_freq_count_low: 1 + drop_freq_count_high: 3 + drop_freq_width: 0.05 # Time drop: randomly drops a number of temporal chunks. -drop_chunk_count_low: 1 # Min number of audio chunks to drop -drop_chunk_count_high: 5 # Max number of audio chunks to drop -drop_chunk_length_low: 1000 # Min length of audio chunks to drop -drop_chunk_length_high: 2000 # Max length of audio chunks to drop - drop_chunk: !new:speechbrain.augment.time_domain.DropChunk - drop_length_low: !ref <drop_chunk_length_low> - drop_length_high: !ref <drop_chunk_length_high> - drop_count_low: !ref <drop_chunk_count_low> - drop_count_high: !ref <drop_chunk_count_high> + drop_length_low: 1000 + drop_length_high: 2000 + drop_count_low: 1 + drop_count_high: 5 -# Augmenter: Combines previously defined augmentations to perform data augmentation # Augmenter: Combines previously defined augmentations to perform data augmentation wav_augment: !new:speechbrain.augment.augmenter.Augmenter - parallel_augment: False concat_original: True - repeat_augment: 1 - shuffle_augmentations: False min_augmentations: 4 max_augmentations: 4 augment_prob: 1.0 @@ -141,7 +123,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter !ref <drop_freq>, !ref <drop_chunk>] -# Models +############################## Models ########################################## + asr_model: !apply:speechbrain.inference.ASR.EncoderDecoderASR.from_hparams source: speechbrain/asr-crdnn-rnnlm-librispeech run_opts: {"device":"cuda:0"} diff --git a/recipes/timers-and-such/multistage/hparams/train_TAS_LM.yaml b/recipes/timers-and-such/multistage/hparams/train_TAS_LM.yaml index d21f309c596a8a4bf6e8e1d09e1ab8434a535795..56eb59d20e532432e11da09f8f938762abd3d6fd 100644 --- a/recipes/timers-and-such/multistage/hparams/train_TAS_LM.yaml +++ b/recipes/timers-and-such/multistage/hparams/train_TAS_LM.yaml @@ -39,7 +39,7 @@ NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.z noise_annotation: !ref <save_folder>/noise.csv #The data manifest files are created by the data preparation script -# Training parameters +####################### Training Parameters #################################### number_of_epochs: 1 batch_size: 16 lr: 0.0003 @@ -73,6 +73,8 @@ dataloader_opts: epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref <number_of_epochs> +############################## Augmentations ################################### + # Download and prepare the dataset of noisy sequences for augmentation prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL URL: !ref <NOISE_DATASET_URL> @@ -81,57 +83,37 @@ prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_U csv_file: !ref <noise_annotation> # Add noise to input signal -snr_low: 0 # Min SNR for noise augmentation -snr_high: 15 # 
-
 add_noise: !new:speechbrain.augment.time_domain.AddNoise
     csv_file: !ref <noise_annotation>
-    snr_low: !ref <snr_low>
-    snr_high: !ref <snr_high>
+    snr_low: 0
+    snr_high: 15
     noise_sample_rate: !ref <sample_rate>
     clean_sample_rate: !ref <sample_rate>
     num_workers: !ref <num_workers>

 # Speed perturbation
-speed_changes: [95, 100, 105] # List of speed changes for time-stretching
-
 speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
     orig_freq: !ref <sample_rate>
-    speeds: !ref <speed_changes>
+    speeds: [95, 100, 105]

 # Frequency drop: randomly drops a number of frequency bands to zero.
-drop_freq_low: 0 # Min frequency band dropout probability
-drop_freq_high: 1 # Max frequency band dropout probability
-drop_freq_count_low: 1 # Min number of frequency bands to drop
-drop_freq_count_high: 3 # Max number of frequency bands to drop
-drop_freq_width: 0.05 # Width of frequency bands to drop
-
 drop_freq: !new:speechbrain.augment.time_domain.DropFreq
-    drop_freq_low: !ref <drop_freq_low>
-    drop_freq_high: !ref <drop_freq_high>
-    drop_freq_count_low: !ref <drop_freq_count_low>
-    drop_freq_count_high: !ref <drop_freq_count_high>
-    drop_freq_width: !ref <drop_freq_width>
+    drop_freq_low: 0
+    drop_freq_high: 1
+    drop_freq_count_low: 1
+    drop_freq_count_high: 3
+    drop_freq_width: 0.05

 # Time drop: randomly drops a number of temporal chunks.
-drop_chunk_count_low: 1 # Min number of audio chunks to drop
-drop_chunk_count_high: 5 # Max number of audio chunks to drop
-drop_chunk_length_low: 1000 # Min length of audio chunks to drop
-drop_chunk_length_high: 2000 # Max length of audio chunks to drop
-
 drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
-    drop_length_low: !ref <drop_chunk_length_low>
-    drop_length_high: !ref <drop_chunk_length_high>
-    drop_count_low: !ref <drop_chunk_count_low>
-    drop_count_high: !ref <drop_chunk_count_high>
+    drop_length_low: 1000
+    drop_length_high: 2000
+    drop_count_low: 1
+    drop_count_high: 5

-# Augmenter: Combines previously defined augmentations to perform data augmentation
 # Augmenter: Combines previously defined augmentations to perform data augmentation
 wav_augment: !new:speechbrain.augment.augmenter.Augmenter
-    parallel_augment: False
     concat_original: True
-    repeat_augment: 1
-    shuffle_augmentations: False
     min_augmentations: 4
     max_augmentations: 4
     augment_prob: 1.0
@@ -141,6 +123,8 @@ wav_augment: !new:speechbrain.augment.augmenter.Augmenter
         !ref <drop_freq>,
         !ref <drop_chunk>]

+############################## Models ##########################################
+
 # Models
 asr_model: !apply:speechbrain.inference.ASR.EncoderDecoderASR.from_hparams
     source: speechbrain/asr-crdnn-rnnlm-librispeech
diff --git a/speechbrain/augment/augmenter.py b/speechbrain/augment/augmenter.py
index 25ea174809a8086c56a5ad929854df3bc2b97e9c..55aee785cbc94f7ff996d5c350f2bbd1090029f7 100644
--- a/speechbrain/augment/augmenter.py
+++ b/speechbrain/augment/augmenter.py
@@ -438,20 +438,52 @@ class Augmenter(torch.nn.Module):

         return output, output_lengths

+    def replicate_multiple_labels(self, *args):
+        """
+        Replicates each given label tensor along the batch axis a number of
+        times that corresponds to the number of augmentations. This is needed
+        because parallel and concatenated augmentations enlarge the batch
+        dimension.
+
+        Arguments
+        ---------
+        *args : torch.Tensor
+            One or more label tensors to be replicated.
+
+        Returns
+        -------
+        augmented_labels : list of torch.Tensor
+            The replicated labels, one tensor per input tensor.
+        """
+
+        # Determine whether to apply data augmentation
+        if not self.do_augment:
+            return args
+
+        list_of_augmented_labels = []
+
+        for labels in args:
+            list_of_augmented_labels.append(self.replicate_labels(labels))
+
+        return list_of_augmented_labels
+
     def replicate_labels(self, labels):
         """
         Replicates the labels along the batch axis a number of times that
-        corresponds to the number of augmentations.
+        corresponds to the number of augmentations. This is needed because
+        parallel and concatenated augmentations enlarge the batch dimension.

         Arguments
         ---------
         labels : torch.Tensor
-            Input label tensor to be replicated.
+            The label tensor to be replicated along the batch axis.

         Returns
         -------
         augmented_labels: torch.Tensor
-            Labels corresponding to the augmented input.
+            Labels replicated to match the batch size of the augmented input.
         """

         # Determine whether to apply data augmentation
@@ -477,6 +509,7 @@ class Augmenter(torch.nn.Module):
             )

             augmented_labels = torch.cat(augmented_labels, dim=0)
+
         return augmented_labels

     def check_min_max_augmentations(self):
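
Reviewer note (not part of the patch): the sketch below shows how the label-replication API touched by this diff is meant to be used end to end. It builds an Augmenter the way the updated YAML recipes do, runs it, and then replicates the targets so they stay aligned with the enlarged batch. Tensor shapes, the 16 kHz rate, and the three-augmentation pipeline are illustrative assumptions, not values taken from the recipes.

# Minimal usage sketch of Augmenter.replicate_labels / replicate_multiple_labels.
# All hyperparameters here are illustrative; only the APIs come from the diff.
import torch

from speechbrain.augment.augmenter import Augmenter
from speechbrain.augment.time_domain import DropChunk, DropFreq, SpeedPerturb

augmenter = Augmenter(
    concat_original=True,  # keep the clean batch and append the augmented copy
    min_augmentations=3,
    max_augmentations=3,
    augment_prob=1.0,
    augmentations=[
        SpeedPerturb(orig_freq=16000, speeds=[95, 100, 105]),
        DropFreq(drop_freq_count_low=1, drop_freq_count_high=3),
        DropChunk(drop_length_low=1000, drop_length_high=2000),
    ],
)

wavs = torch.randn(4, 16000)  # [batch, time]: 1 s of fake audio at 16 kHz
lens = torch.ones(4)          # relative lengths
# With concat_original=True the batch doubles: original + one augmented copy.
aug_wavs, aug_lens = augmenter(wavs, lens)

# Labels must grow with the batch; replicate_labels mirrors the Augmenter's
# own concatenation logic so the ordering of clean/augmented rows matches.
tokens = torch.zeros(4, 10, dtype=torch.long)
tokens_rep = augmenter.replicate_labels(tokens)  # -> shape [8, 10]

# The new helper replicates several label tensors in a single call.
tok_lens = torch.ones(4)
tokens_rep, tok_lens_rep = augmenter.replicate_multiple_labels(tokens, tok_lens)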