# set random seed, so that you may reproduce your result.
__set_seed1: !apply:random.seed [1986]
__set_seed2: !apply:numpy.random.seed [1986]
__set_seed3: !apply:torch.manual_seed [1986]
__set_seed4: !apply:torch.cuda.manual_seed_all [1986]
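# note: with hyperpyyaml, !apply calls the named function at load time (as we understand it),
# so all four seeds are set as a side effect of parsing this file.
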
# fixed params
sample_rate: 22050
text_encoder_input_size: 512
llm_input_size: 1024
llm_output_size: 1024
spk_embed_dim: 192

# model params
# for all classes/functions included in this repo, we use !<name> or !<new> for initialization, so that users can find every corresponding class/function from this single yaml.
# for system/third_party classes/functions, we do not require this.
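# (hyperpyyaml tag semantics, as we understand them: !new: instantiates the class at load time
# with the nested keys as kwargs, !name: builds a callable/partial to be invoked later by the
# training code, and !ref <key> refers to another node of this yaml.)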
hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
    in_channels: 80
    base_channels: 512
    nb_harmonics: 8
    sampling_rate: !ref <sample_rate>
    nsf_alpha: 0.1
    nsf_sigma: 0.003
    nsf_voiced_threshold: 10
    upsample_rates: [8, 8]
    upsample_kernel_sizes: [16, 16]
    istft_params:
        n_fft: 16
        hop_len: 4
    resblock_kernel_sizes: [3, 7, 11]
    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
    source_resblock_kernel_sizes: [7, 11]
    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
    lrelu_slope: 0.1
    audio_limit: 0.99
    f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
        num_class: 1
        in_channels: 80
        cond_channels: 512
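# note (our reading of HiFTGenerator, verify against the generator code): upsample_rates [8, 8]
# followed by an iSTFT with hop_len 4 expands each mel frame into 8 * 8 * 4 = 256 samples,
# which matches the hop_size of 256 used by mel_spec_transform1 / feat_extractor below.
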
mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
    n_fft: 1024
    num_mels: 80
    sampling_rate: !ref <sample_rate>
    hop_size: 256
    win_size: 1024
    fmin: 0
    fmax: 8000
    center: False

hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
    generator: !ref <hift>
    discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
        mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
        mrd: !new:cosyvoice.hifigan.discriminator.MultiResolutionDiscriminator
    mel_spec_transform: [
        !ref <mel_spec_transform1>
    ]
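# note: as we understand cosyvoice.hifigan.hifigan.HiFiGan, this wrapper pairs the generator with
# the multi-period (mpd) and multi-resolution (mrd) discriminators for adversarial training, and
# mel_spec_transform is presumably used for the mel-spectrogram reconstruction loss.
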
# processor functions
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with the CosyVoice-300M-25Hz recipe
    multilingual: True
    num_languages: 100
    language: 'en'
    task: 'transcribe'
tokenize: !name:cosyvoice.dataset.processor.tokenize
    get_tokenizer: !ref <get_tokenizer>
    allowed_special: 'all'
filter: !name:cosyvoice.dataset.processor.filter
    max_length: 40960
    min_length: 0
    token_max_length: 200
    token_min_length: 1
resample: !name:cosyvoice.dataset.processor.resample
    resample_rate: !ref <sample_rate>
truncate: !name:cosyvoice.dataset.processor.truncate
    truncate_length: 24576 # must be a multiple of hop_size
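# note: 24576 = 96 * 256, i.e. exactly 96 feature frames (~1.1 s at 22050 Hz), so the truncated
# waveform stays aligned with the mel frames.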
feat_extractor: !name:matcha.utils.audio.mel_spectrogram
    n_fft: 1024
    num_mels: 80
    sampling_rate: !ref <sample_rate>
    hop_size: 256
    win_size: 1024
    fmin: 0
    fmax: 8000
    center: False
compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
    feat_extractor: !ref <feat_extractor>
pitch_extractor: !name:torchaudio.functional.compute_kaldi_pitch
    sample_rate: !ref <sample_rate>
    frame_length: 46.4 # match feat_extractor win_size/sampling_rate
    frame_shift: 11.6 # match feat_extractor hop_size/sampling_rate
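# note: compute_kaldi_pitch takes frame_length/frame_shift in milliseconds, so 46.4 ms matches
# win_size / sampling_rate = 1024 / 22050 s and 11.6 ms matches hop_size / sampling_rate =
# 256 / 22050 s, keeping the F0 track frame-aligned with the mel features.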
compute_f0: !name:cosyvoice.dataset.processor.compute_f0
    pitch_extractor: !ref <pitch_extractor>
parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
    normalize: True
shuffle: !name:cosyvoice.dataset.processor.shuffle
    shuffle_size: 1000
sort: !name:cosyvoice.dataset.processor.sort
    sort_size: 500 # sort_size should be less than shuffle_size
batch: !name:cosyvoice.dataset.processor.batch
    batch_type: 'dynamic'
    max_frames_in_batch: 1200
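# note: with batch_type 'dynamic', utterances are accumulated until the total number of feature
# frames reaches max_frames_in_batch, i.e. roughly 1200 * 256 / 22050 ~ 14 s of audio per batch
# (as we understand the batch processor).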
padding: !name:cosyvoice.dataset.processor.padding
    use_spk_embedding: False # change to True during sft

# dataset processor pipeline
data_pipeline: [
    !ref <parquet_opener>,
    !ref <tokenize>,
    !ref <filter>,
    !ref <resample>,
    !ref <truncate>,
    !ref <compute_fbank>,
    !ref <compute_f0>,
    !ref <parse_embedding>,
    !ref <shuffle>,
    !ref <sort>,
    !ref <batch>,
    !ref <padding>,
]
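# note: as we understand the cosyvoice/wenet-style Dataset, each entry above is a !name partial
# that wraps the previous stage's sample iterator, so the stages run lazily in the listed order:
# open parquet shards -> tokenize -> filter -> resample -> truncate -> extract mel/F0 -> parse
# speaker embedding -> shuffle -> sort -> batch -> pad.
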
# train conf
train_conf:
    optim: adam
    optim_conf:
        lr: 0.002 # change to 0.001 if you want to train flow from scratch
    scheduler: warmuplr
    scheduler_conf:
        warmup_steps: 25000
    optim_d: adam
    optim_conf_d:
        lr: 0.002 # change to 0.001 if you want to train flow from scratch
    scheduler_d: warmuplr
    scheduler_conf_d:
        warmup_steps: 25000
    max_epoch: 200
    grad_clip: 5
    accum_grad: 2
    log_interval: 100
    save_per_step: -1
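# a minimal loading sketch (illustrative only; the filename and variable names below are our own,
# not the exact training entrypoint):
#   from hyperpyyaml import load_hyperpyyaml
#   with open('cosyvoice.yaml') as f:
#       configs = load_hyperpyyaml(f)
#   hifigan = configs['hifigan']            # instantiated by !new at load time
#   pipeline = configs['data_pipeline']     # list of !name partials, applied stage by stage
#   train_conf = configs['train_conf']      # plain dict consumed by the trainer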