wuyue
/
CosyVoice
forked from github-repo/CosyVoice


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141
							# set random seed, so that you may reproduce your result.
__set_seed1: !apply:random.seed [1986]
__set_seed2: !apply:numpy.random.seed [1986]
__set_seed3: !apply:torch.manual_seed [1986]
__set_seed4: !apply:torch.cuda.manual_seed_all [1986]

# fixed params
sample_rate: 22050
text_encoder_input_size: 512
llm_input_size: 1024
llm_output_size: 1024
spk_embed_dim: 192

# model params
# for all class/function included in this repo, we use !<name> or !<new> for intialization, so that user may find all corresponding class/function according to one single yaml.
# for system/third_party class/function, we do not require this.
hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
    in_channels: 80
    base_channels: 512
    nb_harmonics: 8
    sampling_rate: !ref <sample_rate>
    nsf_alpha: 0.1
    nsf_sigma: 0.003
    nsf_voiced_threshold: 10
    upsample_rates: [8, 8]
    upsample_kernel_sizes: [16, 16]
    istft_params:
        n_fft: 16
        hop_len: 4
    resblock_kernel_sizes: [3, 7, 11]
    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
    source_resblock_kernel_sizes: [7, 11]
    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
    lrelu_slope: 0.1
    audio_limit: 0.99
    f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
        num_class: 1
        in_channels: 80
        cond_channels: 512

mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
    n_fft: 1024
    num_mels: 80
    sampling_rate: !ref <sample_rate>
    hop_size: 256
    win_size: 1024
    fmin: 0
    fmax: 8000
    center: False
hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
    generator: !ref <hift>
    discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
        mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
        mrd: !new:cosyvoice.hifigan.discriminator.MultiResolutionDiscriminator
    mel_spec_transform: [
        !ref <mel_spec_transform1>
    ]

# processor functions
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
    multilingual: True
    num_languages: 100
    language: 'en'
    task: 'transcribe'
tokenize: !name:cosyvoice.dataset.processor.tokenize
    get_tokenizer: !ref <get_tokenizer>
    allowed_special: 'all'
filter: !name:cosyvoice.dataset.processor.filter
    max_length: 40960
    min_length: 0
    token_max_length: 200
    token_min_length: 1
resample: !name:cosyvoice.dataset.processor.resample
    resample_rate: !ref <sample_rate>
truncate: !name:cosyvoice.dataset.processor.truncate
    truncate_length: 24576 # must be a multiplier of hop_size
feat_extractor: !name:matcha.utils.audio.mel_spectrogram
    n_fft: 1024
    num_mels: 80
    sampling_rate: !ref <sample_rate>
    hop_size: 256
    win_size: 1024
    fmin: 0
    fmax: 8000
    center: False
compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
    feat_extractor: !ref <feat_extractor>
pitch_extractor: !name:torchaudio.functional.compute_kaldi_pitch
    sample_rate: !ref <sample_rate>
    frame_length: 46.4 # match feat_extractor win_size/sampling_rate
    frame_shift: 11.6 # match feat_extractor hop_size/sampling_rate
compute_f0: !name:cosyvoice.dataset.processor.compute_f0
    pitch_extractor: !ref <pitch_extractor>
parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
    normalize: True
shuffle: !name:cosyvoice.dataset.processor.shuffle
    shuffle_size: 1000
sort: !name:cosyvoice.dataset.processor.sort
    sort_size: 500  # sort_size should be less than shuffle_size
batch: !name:cosyvoice.dataset.processor.batch
    batch_type: 'dynamic'
    max_frames_in_batch: 1200
padding: !name:cosyvoice.dataset.processor.padding
    use_spk_embedding: False # change to True during sft

# dataset processor pipeline
data_pipeline: [
    !ref <parquet_opener>,
    !ref <tokenize>,
    !ref <filter>,
    !ref <resample>,
    !ref <truncate>,
    !ref <compute_fbank>,
    !ref <compute_f0>,
    !ref <parse_embedding>,
    !ref <shuffle>,
    !ref <sort>,
    !ref <batch>,
    !ref <padding>,
]

# train conf
train_conf:
    optim: adam
    optim_conf:
        lr: 0.002 # change to 0.001 if you want to train flow from scratch
    scheduler: warmuplr
    scheduler_conf:
        warmup_steps: 25000
    optim_d: adam
    optim_conf_d:
        lr: 0.002 # change to 0.001 if you want to train flow from scratch
    scheduler_d: warmuplr
    scheduler_conf_d:
        warmup_steps: 25000
    max_epoch: 200
    grad_clip: 5
    accum_grad: 2
    log_interval: 100
    save_per_step: -1