# set random seed, so that you may reproduce your result.
__set_seed1: !apply:random.seed [1986]
__set_seed2: !apply:numpy.random.seed [1986]
__set_seed3: !apply:torch.manual_seed [1986]
__set_seed4: !apply:torch.cuda.manual_seed_all [1986]
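
# NOTE: usage sketch (assumption: this file is loaded with hyperpyyaml, which resolves the
# !apply/!new/!name/!ref tags used throughout; the exact loader call may differ in this repo):
#   from hyperpyyaml import load_hyperpyyaml
#   with open('cosyvoice.yaml') as f:
#       configs = load_hyperpyyaml(f)
#   llm, flow, hift = configs['llm'], configs['flow'], configs['hift']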

# fixed params
sample_rate: 22050
text_encoder_input_size: 512
llm_input_size: 1024
llm_output_size: 1024
spk_embed_dim: 192
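
# NOTE: the values above are shared hyperparameters; the blocks below pull them in via
# hyperpyyaml references, e.g. `sampling_rate: !ref <sample_rate>`.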

# model params
# For every class/function defined in this repo we use !new:<class> or !name:<function> for
# initialization, so that all corresponding classes/functions can be located from this single yaml.
# For system/third-party classes/functions this is not required.
llm: !new:cosyvoice.llm.llm.TransformerLM
    text_encoder_input_size: !ref <text_encoder_input_size>
    llm_input_size: !ref <llm_input_size>
    llm_output_size: !ref <llm_output_size>
    text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe
    speech_token_size: 4096
    length_normalized_loss: True
    lsm_weight: 0
    spk_embed_dim: !ref <spk_embed_dim>
    text_encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
        input_size: !ref <text_encoder_input_size>
        output_size: 1024
        attention_heads: 16
        linear_units: 4096
        num_blocks: 6
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.0
        normalize_before: True
        input_layer: 'linear'
        pos_enc_layer_type: 'rel_pos_espnet'
        selfattention_layer_type: 'rel_selfattn'
        use_cnn_module: False
        macaron_style: False
        use_dynamic_chunk: False
        use_dynamic_left_chunk: False
        static_chunk_size: 1
    llm: !new:cosyvoice.transformer.encoder.TransformerEncoder
        input_size: !ref <llm_input_size>
        output_size: !ref <llm_output_size>
        attention_heads: 16
        linear_units: 4096
        num_blocks: 14
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.0
        input_layer: 'linear_legacy'
        pos_enc_layer_type: 'rel_pos_espnet'
        selfattention_layer_type: 'rel_selfattn'
        static_chunk_size: 1
    sampling: !name:cosyvoice.utils.common.ras_sampling
        top_p: 0.8
        top_k: 25
        win_size: 10
        tau_r: 0.1
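
    # NOTE: ras_sampling is the token sampling function used at LLM inference time. As we
    # understand it, it draws speech tokens with nucleus/top-k sampling (top_p, top_k) and,
    # if the chosen token already occurs too often within the last win_size tokens
    # (at a rate >= tau_r), falls back to plain random sampling to break repetition loops.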

flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
    input_size: 512
    output_size: 80
    spk_embed_dim: !ref <spk_embed_dim>
    output_type: 'mel'
    vocab_size: 4096
    input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe
    only_mask_loss: True
    encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
        output_size: 512
        attention_heads: 8
        linear_units: 2048
        num_blocks: 6
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        normalize_before: True
        input_layer: 'linear'
        pos_enc_layer_type: 'rel_pos_espnet'
        selfattention_layer_type: 'rel_selfattn'
        input_size: 512
        use_cnn_module: False
        macaron_style: False
    length_regulator: !new:cosyvoice.flow.length_regulator.InterpolateRegulator
        channels: 80
        sampling_ratios: [1, 1, 1, 1]
    decoder: !new:cosyvoice.flow.flow_matching.ConditionalCFM
        in_channels: 240
        n_spks: 1
        spk_emb_dim: 80
        cfm_params: !new:omegaconf.DictConfig
            content:
                sigma_min: 1e-06
                solver: 'euler'
                t_scheduler: 'cosine'
                training_cfg_rate: 0.2
                inference_cfg_rate: 0.7
                reg_loss_type: 'l1'
        estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
            in_channels: 320
            out_channels: 80
            channels: [256, 256]
            dropout: 0.0
            attention_head_dim: 64
            n_blocks: 4
            num_mid_blocks: 12
            num_heads: 8
            act_fn: 'gelu'
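
    # NOTE (our reading of the config; see cosyvoice.flow.flow_matching.ConditionalCFM):
    # the decoder above is the conditional flow-matching module that turns speech tokens into
    # mel spectrograms. The estimator is the network predicting the flow vector field, cfm_params
    # select an euler ODE solver with a cosine timestep schedule, and training_cfg_rate /
    # inference_cfg_rate are the classifier-free guidance settings.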

hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
    in_channels: 80
    base_channels: 512
    nb_harmonics: 8
    sampling_rate: !ref <sample_rate>
    nsf_alpha: 0.1
    nsf_sigma: 0.003
    nsf_voiced_threshold: 10
    upsample_rates: [8, 8]
    upsample_kernel_sizes: [16, 16]
    istft_params:
        n_fft: 16
        hop_len: 4
    resblock_kernel_sizes: [3, 7, 11]
    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
    source_resblock_kernel_sizes: [7, 11]
    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
    lrelu_slope: 0.1
    audio_limit: 0.99
    f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
        num_class: 1
        in_channels: 80
        cond_channels: 512

# gan related module
mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
    n_fft: 1024
    num_mels: 80
    sampling_rate: !ref <sample_rate>
    hop_size: 256
    win_size: 1024
    fmin: 0
    fmax: null
    center: False
hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
    generator: !ref <hift>
    discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
        mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
        mrd: !new:cosyvoice.hifigan.discriminator.MultiResolutionDiscriminator
    mel_spec_transform: [
        !ref <mel_spec_transform1>
    ]

# processor functions
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
    multilingual: True
    num_languages: 100
    language: 'en'
    task: 'transcribe'
allowed_special: 'all'
tokenize: !name:cosyvoice.dataset.processor.tokenize
    get_tokenizer: !ref <get_tokenizer>
    allowed_special: !ref <allowed_special>
filter: !name:cosyvoice.dataset.processor.filter
    max_length: 40960
    min_length: 0
    token_max_length: 200
    token_min_length: 1
resample: !name:cosyvoice.dataset.processor.resample
    resample_rate: !ref <sample_rate>
truncate: !name:cosyvoice.dataset.processor.truncate
    truncate_length: 24576 # must be a multiple of hop_size (24576 = 96 * 256)
feat_extractor: !name:matcha.utils.audio.mel_spectrogram
    n_fft: 1024
    num_mels: 80
    sampling_rate: !ref <sample_rate>
    hop_size: 256
    win_size: 1024
    fmin: 0
    fmax: 8000
    center: False
compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
    feat_extractor: !ref <feat_extractor>
compute_f0: !name:cosyvoice.dataset.processor.compute_f0
    sample_rate: !ref <sample_rate>
    hop_size: 256
parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
    normalize: True
shuffle: !name:cosyvoice.dataset.processor.shuffle
    shuffle_size: 1000
sort: !name:cosyvoice.dataset.processor.sort
    sort_size: 500 # sort_size should be less than shuffle_size
batch: !name:cosyvoice.dataset.processor.batch
    batch_type: 'dynamic'
    max_frames_in_batch: 2000 # change to 1400 for gan training on a 16 GB V100
padding: !name:cosyvoice.dataset.processor.padding
    use_spk_embedding: False # change to True during sft

# dataset processor pipeline
data_pipeline: [
    !ref <parquet_opener>,
    !ref <tokenize>,
    !ref <filter>,
    !ref <resample>,
    !ref <compute_fbank>,
    !ref <parse_embedding>,
    !ref <shuffle>,
    !ref <sort>,
    !ref <batch>,
    !ref <padding>,
]
data_pipeline_gan: [
    !ref <parquet_opener>,
    !ref <tokenize>,
    !ref <filter>,
    !ref <resample>,
    !ref <truncate>,
    !ref <compute_fbank>,
    !ref <compute_f0>,
    !ref <parse_embedding>,
    !ref <shuffle>,
    !ref <sort>,
    !ref <batch>,
    !ref <padding>,
]
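
# NOTE: each pipeline entry above is a !name: partial of a processor function; as we understand
# the dataset code, they are applied in list order to the parquet data stream. Compared with
# data_pipeline (llm/flow training), data_pipeline_gan additionally truncates audio to a fixed
# length and computes F0, which gan training uses.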

# llm flow train conf
train_conf:
    optim: adam
    optim_conf:
        lr: 0.001 # change to 1e-5 during sft
    scheduler: warmuplr # change to constantlr during sft
    scheduler_conf:
        warmup_steps: 2500
    max_epoch: 200
    grad_clip: 5
    accum_grad: 2
    log_interval: 100
    save_per_step: -1

# gan train conf
train_conf_gan:
    optim: adam
    optim_conf:
        lr: 0.0002 # use small lr for gan training
    scheduler: constantlr
    optim_d: adam
    optim_conf_d:
        lr: 0.0002 # use small lr for gan training
    scheduler_d: constantlr
    max_epoch: 200
    grad_clip: 5
    accum_grad: 1 # in gan training, accum_grad must be 1
    log_interval: 100
    save_per_step: -1