4 달 전 · b048a2d6db
--- a/cosyvoice/llm/llm.py
+++ b/cosyvoice/llm/llm.py
@@ -1,5 +1,5 @@
 
				 # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
			
 
				-#               2025 Alibaba Inc (authors: Xiang Lyu, Yabin Li, Qihua)
			
 
				+#               2025 Alibaba Inc (authors: Xiang Lyu, Yabin Li, Qihua, Shengqiang Li)
			
 
				 #
			
 
				 # Licensed under the Apache License, Version 2.0 (the "License");
			
 
				 # you may not use this file except in compliance with the License.
			
@@ -420,8 +420,8 @@ class Qwen2LM(TransformerLM):
 
				         rejected_lm_mask = rejected_lm_target == IGNORE_ID
			
 
				         chosen_logps = torch.gather(chosen_logits.log_softmax(dim=-1), dim=2, index=chosen_lm_target.masked_fill(chosen_lm_mask, 0).unsqueeze(dim=-1)).squeeze(dim=-1)
			
 
				         rejected_logps = torch.gather(rejected_logits.log_softmax(dim=-1), dim=2, index=rejected_lm_target.masked_fill(rejected_lm_mask, 0).unsqueeze(dim=-1)).squeeze(dim=-1)
			
 
				-        chosen_logps = (chosen_logps * chosen_lm_mask).mean(dim=-1)
			
 
				-        rejected_logps = (rejected_logps * chosen_lm_mask).mean(dim=-1)
			
 
				+        chosen_logps = (chosen_logps * chosen_lm_mask).sum(dim=-1) / chosen_lm_mask.sum(dim=-1)
			
 
				+        rejected_logps = (rejected_logps * rejected_lm_mask).sum(dim=-1) / rejected_lm_mask.sum(dim=-1)
			
 
				         return {'loss': loss, 'acc': acc, 'chosen_logps': chosen_logps, 'rejected_logps': rejected_logps}
			
 
				 
			
 
				     @torch.inference_mode()
			
--- a/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml
+++ b/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml
@@ -1,257 +0,0 @@
 
				-# set random seed, so that you may reproduce your result.
			
 
				-__set_seed1: !apply:random.seed [1986]
			
 
				-__set_seed2: !apply:numpy.random.seed [1986]
			
 
				-__set_seed3: !apply:torch.manual_seed [1986]
			
 
				-__set_seed4: !apply:torch.cuda.manual_seed_all [1986]
			
 
				-
			
 
				-# fixed params
			
 
				-sample_rate: 22050
			
 
				-text_encoder_input_size: 512
			
 
				-llm_input_size: 1024
			
 
				-llm_output_size: 1024
			
 
				-spk_embed_dim: 192
			
 
				-
			
 
				-# model params
			
 
				-# for all classes/functions included in this repo, we use !<name> or !<new> for initialization, so that users can find every corresponding class/function from a single yaml.
			
 
				-# for system/third_party class/function, we do not require this.
			
 
				-llm: !new:cosyvoice.llm.llm.TransformerLM
			
 
				-    text_encoder_input_size: !ref <text_encoder_input_size>
			
 
				-    llm_input_size: !ref <llm_input_size>
			
 
				-    llm_output_size: !ref <llm_output_size>
			
 
				-    text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe
			
 
				-    speech_token_size: 4096
			
 
				-    length_normalized_loss: True
			
 
				-    lsm_weight: 0
			
 
				-    spk_embed_dim: !ref <spk_embed_dim>
			
 
				-    text_encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
			
 
				-        input_size: !ref <text_encoder_input_size>
			
 
				-        output_size: 1024
			
 
				-        attention_heads: 8
			
 
				-        linear_units: 2048
			
 
				-        num_blocks: 3
			
 
				-        dropout_rate: 0.1
			
 
				-        positional_dropout_rate: 0.1
			
 
				-        attention_dropout_rate: 0.0
			
 
				-        normalize_before: True
			
 
				-        input_layer: 'linear'
			
 
				-        pos_enc_layer_type: 'rel_pos_espnet'
			
 
				-        selfattention_layer_type: 'rel_selfattn'
			
 
				-        use_cnn_module: False
			
 
				-        macaron_style: False
			
 
				-        use_dynamic_chunk: False
			
 
				-        use_dynamic_left_chunk: False
			
 
				-        static_chunk_size: 1
			
 
				-    llm: !new:cosyvoice.transformer.encoder.TransformerEncoder
			
 
				-        input_size: !ref <llm_input_size>
			
 
				-        output_size: !ref <llm_output_size>
			
 
				-        attention_heads: 8
			
 
				-        linear_units: 2048
			
 
				-        num_blocks: 7
			
 
				-        dropout_rate: 0.1
			
 
				-        positional_dropout_rate: 0.1
			
 
				-        attention_dropout_rate: 0.0
			
 
				-        input_layer: 'linear_legacy'
			
 
				-        pos_enc_layer_type: 'rel_pos_espnet'
			
 
				-        selfattention_layer_type: 'rel_selfattn'
			
 
				-        static_chunk_size: 1
			
 
				-    sampling: !name:cosyvoice.utils.common.ras_sampling
			
 
				-        top_p: 0.8
			
 
				-        top_k: 25
			
 
				-        win_size: 10
			
 
				-        tau_r: 0.1
			
 
				-
			
 
				-flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
			
 
				-    input_size: 512
			
 
				-    output_size: 80
			
 
				-    spk_embed_dim: !ref <spk_embed_dim>
			
 
				-    output_type: 'mel'
			
 
				-    vocab_size: 4096
			
 
				-    input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe
			
 
				-    only_mask_loss: True
			
 
				-    encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
			
 
				-        output_size: 512
			
 
				-        attention_heads: 4
			
 
				-        linear_units: 1024
			
 
				-        num_blocks: 3
			
 
				-        dropout_rate: 0.1
			
 
				-        positional_dropout_rate: 0.1
			
 
				-        attention_dropout_rate: 0.1
			
 
				-        normalize_before: True
			
 
				-        input_layer: 'linear'
			
 
				-        pos_enc_layer_type: 'rel_pos_espnet'
			
 
				-        selfattention_layer_type: 'rel_selfattn'
			
 
				-        input_size: 512
			
 
				-        use_cnn_module: False
			
 
				-        macaron_style: False
			
 
				-    length_regulator: !new:cosyvoice.flow.length_regulator.InterpolateRegulator
			
 
				-        channels: 80
			
 
				-        sampling_ratios: [1, 1, 1, 1]
			
 
				-    decoder: !new:cosyvoice.flow.flow_matching.ConditionalCFM
			
 
				-        in_channels: 240
			
 
				-        n_spks: 1
			
 
				-        spk_emb_dim: 80
			
 
				-        cfm_params: !new:omegaconf.DictConfig
			
 
				-            content:
			
 
				-                sigma_min: 1e-06
			
 
				-                solver: 'euler'
			
 
				-                t_scheduler: 'cosine'
			
 
				-                training_cfg_rate: 0.2
			
 
				-                inference_cfg_rate: 0.7
			
 
				-                reg_loss_type: 'l1'
			
 
				-        estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
			
 
				-            in_channels: 320
			
 
				-            out_channels: 80
			
 
				-            channels: [256, 256]
			
 
				-            dropout: 0.0
			
 
				-            attention_head_dim: 64
			
 
				-            n_blocks: 4
			
 
				-            num_mid_blocks: 8
			
 
				-            num_heads: 8
			
 
				-            act_fn: 'gelu'
			
 
				-
			
 
				-hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
			
 
				-    in_channels: 80
			
 
				-    base_channels: 512
			
 
				-    nb_harmonics: 8
			
 
				-    sampling_rate: !ref <sample_rate>
			
 
				-    nsf_alpha: 0.1
			
 
				-    nsf_sigma: 0.003
			
 
				-    nsf_voiced_threshold: 10
			
 
				-    upsample_rates: [8, 8]
			
 
				-    upsample_kernel_sizes: [16, 16]
			
 
				-    istft_params:
			
 
				-        n_fft: 16
			
 
				-        hop_len: 4
			
 
				-    resblock_kernel_sizes: [3, 7, 11]
			
 
				-    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
			
 
				-    source_resblock_kernel_sizes: [7, 11]
			
 
				-    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
			
 
				-    lrelu_slope: 0.1
			
 
				-    audio_limit: 0.99
			
 
				-    f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
			
 
				-        num_class: 1
			
 
				-        in_channels: 80
			
 
				-        cond_channels: 512
			
 
				-
			
 
				-# gan related module
			
 
				-mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
			
 
				-    n_fft: 1024
			
 
				-    num_mels: 80
			
 
				-    sampling_rate: !ref <sample_rate>
			
 
				-    hop_size: 256
			
 
				-    win_size: 1024
			
 
				-    fmin: 0
			
 
				-    fmax: null
			
 
				-    center: False
			
 
				-hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
			
 
				-    generator: !ref <hift>
			
 
				-    discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
			
 
				-        mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
			
 
				-        mrd: !new:cosyvoice.hifigan.discriminator.MultiResSpecDiscriminator
			
 
				-    mel_spec_transform: [
			
 
				-        !ref <mel_spec_transform1>
			
 
				-    ]
			
 
				-
			
 
				-# processor functions
			
 
				-parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
			
 
				-get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
			
 
				-    multilingual: True
			
 
				-    num_languages: 100
			
 
				-    language: 'en'
			
 
				-    task: 'transcribe'
			
 
				-allowed_special: 'all'
			
 
				-tokenize: !name:cosyvoice.dataset.processor.tokenize
			
 
				-    get_tokenizer: !ref <get_tokenizer>
			
 
				-    allowed_special: !ref <allowed_special>
			
 
				-filter: !name:cosyvoice.dataset.processor.filter
			
 
				-    max_length: 40960
			
 
				-    min_length: 0
			
 
				-    token_max_length: 200
			
 
				-    token_min_length: 1
			
 
				-resample: !name:cosyvoice.dataset.processor.resample
			
 
				-    resample_rate: !ref <sample_rate>
			
 
				-truncate: !name:cosyvoice.dataset.processor.truncate
			
 
				-    truncate_length: 24576 # must be a multiple of hop_size
			
 
				-feat_extractor: !name:matcha.utils.audio.mel_spectrogram
			
 
				-    n_fft: 1024
			
 
				-    num_mels: 80
			
 
				-    sampling_rate: !ref <sample_rate>
			
 
				-    hop_size: 256
			
 
				-    win_size: 1024
			
 
				-    fmin: 0
			
 
				-    fmax: 8000
			
 
				-    center: False
			
 
				-compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
			
 
				-    feat_extractor: !ref <feat_extractor>
			
 
				-compute_f0: !name:cosyvoice.dataset.processor.compute_f0
			
 
				-    sample_rate: !ref <sample_rate>
			
 
				-    hop_size: 256
			
 
				-parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
			
 
				-    normalize: True
			
 
				-shuffle: !name:cosyvoice.dataset.processor.shuffle
			
 
				-    shuffle_size: 1000
			
 
				-sort: !name:cosyvoice.dataset.processor.sort
			
 
				-    sort_size: 500  # sort_size should be less than shuffle_size
			
 
				-batch: !name:cosyvoice.dataset.processor.batch
			
 
				-    batch_type: 'dynamic'
			
 
				-    max_frames_in_batch: 12000
			
 
				-padding: !name:cosyvoice.dataset.processor.padding
			
 
				-    use_spk_embedding: False # change to True during sft
			
 
				-
			
 
				-# dataset processor pipeline
			
 
				-data_pipeline: [
			
 
				-    !ref <parquet_opener>,
			
 
				-    !ref <tokenize>,
			
 
				-    !ref <filter>,
			
 
				-    !ref <resample>,
			
 
				-    !ref <compute_fbank>,
			
 
				-    !ref <parse_embedding>,
			
 
				-    !ref <shuffle>,
			
 
				-    !ref <sort>,
			
 
				-    !ref <batch>,
			
 
				-    !ref <padding>,
			
 
				-]
			
 
				-data_pipeline_gan: [
			
 
				-    !ref <parquet_opener>,
			
 
				-    !ref <tokenize>,
			
 
				-    !ref <filter>,
			
 
				-    !ref <resample>,
			
 
				-    !ref <truncate>,
			
 
				-    !ref <compute_fbank>,
			
 
				-    !ref <compute_f0>,
			
 
				-    !ref <parse_embedding>,
			
 
				-    !ref <shuffle>,
			
 
				-    !ref <sort>,
			
 
				-    !ref <batch>,
			
 
				-    !ref <padding>,
			
 
				-]
			
 
				-
			
 
				-# llm flow train conf
			
 
				-train_conf:
			
 
				-    optim: adam
			
 
				-    optim_conf:
			
 
				-        lr: 0.002 # change to 0.001 if you want to train flow from scratch
			
 
				-    scheduler: warmuplr
			
 
				-    scheduler_conf:
			
 
				-        warmup_steps: 25000
			
 
				-    max_epoch: 200
			
 
				-    grad_clip: 5
			
 
				-    accum_grad: 2
			
 
				-    log_interval: 100
			
 
				-    save_per_step: -1
			
 
				-
			
 
				-# gan train conf
			
 
				-train_conf_gan:
			
 
				-    optim: adam
			
 
				-    optim_conf:
			
 
				-        lr: 0.0002 # use small lr for gan training
			
 
				-    scheduler: constantlr
			
 
				-    optim_d: adam
			
 
				-    optim_conf_d:
			
 
				-        lr: 0.0002 # use small lr for gan training
			
 
				-    scheduler_d: constantlr
			
 
				-    max_epoch: 200
			
 
				-    grad_clip: 5
			
 
				-    accum_grad: 1 # in gan training, accum_grad must be 1
			
 
				-    log_interval: 100
			
 
				-    save_per_step: -1
			
--- a/examples/libritts/cosyvoice/conf/cosyvoice_dpo.yaml
+++ b/examples/libritts/cosyvoice/conf/cosyvoice_dpo.yaml
@@ -1,226 +0,0 @@
 
				-# set random seed, so that you may reproduce your result.
			
 
				-__set_seed1: !apply:random.seed [1986]
			
 
				-__set_seed2: !apply:numpy.random.seed [1986]
			
 
				-__set_seed3: !apply:torch.manual_seed [1986]
			
 
				-__set_seed4: !apply:torch.cuda.manual_seed_all [1986]
			
 
				-
			
 
				-# fixed params
			
 
				-sample_rate: 24000   # 16000 for llm, 24000 for cfm
			
 
				-llm_input_size: 896
			
 
				-llm_output_size: 896
			
 
				-spk_embed_dim: 192
			
 
				-qwen_pretrain_path: 'CosyVoice2-0.5B/CosyVoice-BlankEN'
			
 
				-
			
 
				-# model params
			
 
				-# for all classes/functions included in this repo, we use !<name> or !<new> for initialization, so that users can find every corresponding class/function from a single yaml.
			
 
				-# for system/third_party class/function, we do not require this.
			
 
				-llm: !new:cosyvoice.llm.llm_dpo.Qwen2LM
			
 
				-    llm_input_size: !ref <llm_input_size>
			
 
				-    llm_output_size: !ref <llm_output_size>
			
 
				-    speech_token_size: 6561
			
 
				-    length_normalized_loss: True
			
 
				-    lsm_weight: 0
			
 
				-    dpo: True
			
 
				-    llm: !new:cosyvoice.llm.llm.Qwen2Encoder
			
 
				-        pretrain_path: !ref <qwen_pretrain_path>
			
 
				-    sampling: !name:cosyvoice.utils.common.ras_sampling
			
 
				-        top_p: 0.8
			
 
				-        top_k: 25
			
 
				-        win_size: 10
			
 
				-        tau_r: 0.1
			
 
				-flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
			
 
				-    input_size: 512
			
 
				-    output_size: 80
			
 
				-    spk_embed_dim: !ref <spk_embed_dim>
			
 
				-    output_type: 'mel'
			
 
				-    vocab_size: 6561
			
 
				-    input_frame_rate: 25
			
 
				-    only_mask_loss: True
			
 
				-    token_mel_ratio: 2
			
 
				-    pre_lookahead_len: 3
			
 
				-    encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
			
 
				-        output_size: 512
			
 
				-        attention_heads: 8
			
 
				-        linear_units: 2048
			
 
				-        num_blocks: 6
			
 
				-        dropout_rate: 0.1
			
 
				-        positional_dropout_rate: 0.1
			
 
				-        attention_dropout_rate: 0.1
			
 
				-        normalize_before: True
			
 
				-        input_layer: 'linear'
			
 
				-        pos_enc_layer_type: 'rel_pos_espnet'
			
 
				-        selfattention_layer_type: 'rel_selfattn'
			
 
				-        input_size: 512
			
 
				-        use_cnn_module: False
			
 
				-        macaron_style: False
			
 
				-    decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
			
 
				-        in_channels: 240
			
 
				-        n_spks: 1
			
 
				-        spk_emb_dim: 80
			
 
				-        cfm_params: !new:omegaconf.DictConfig
			
 
				-            content:
			
 
				-                sigma_min: 1e-06
			
 
				-                solver: 'euler'
			
 
				-                t_scheduler: 'cosine'
			
 
				-                training_cfg_rate: 0.2
			
 
				-                inference_cfg_rate: 0.7
			
 
				-                reg_loss_type: 'l1'
			
 
				-        estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
			
 
				-            in_channels: 320
			
 
				-            out_channels: 80
			
 
				-            causal: True
			
 
				-            channels: [256]
			
 
				-            dropout: 0.0
			
 
				-            attention_head_dim: 64
			
 
				-            n_blocks: 4
			
 
				-            num_mid_blocks: 12
			
 
				-            num_heads: 8
			
 
				-            act_fn: 'gelu'
			
 
				-
			
 
				-hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
			
 
				-    in_channels: 80
			
 
				-    base_channels: 512
			
 
				-    nb_harmonics: 8
			
 
				-    sampling_rate: !ref <sample_rate>
			
 
				-    nsf_alpha: 0.1
			
 
				-    nsf_sigma: 0.003
			
 
				-    nsf_voiced_threshold: 10
			
 
				-    upsample_rates: [8, 5, 3]
			
 
				-    upsample_kernel_sizes: [16, 11, 7]
			
 
				-    istft_params:
			
 
				-        n_fft: 16
			
 
				-        hop_len: 4
			
 
				-    resblock_kernel_sizes: [3, 7, 11]
			
 
				-    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
			
 
				-    source_resblock_kernel_sizes: [7, 7, 11]
			
 
				-    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
			
 
				-    lrelu_slope: 0.1
			
 
				-    audio_limit: 0.99
			
 
				-    f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
			
 
				-        num_class: 1
			
 
				-        in_channels: 80
			
 
				-        cond_channels: 512
			
 
				-
			
 
				-# gan related module
			
 
				-mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
			
 
				-    n_fft: 1024
			
 
				-    num_mels: 80
			
 
				-    sampling_rate: !ref <sample_rate>
			
 
				-    hop_size: 256
			
 
				-    win_size: 1024
			
 
				-    fmin: 0
			
 
				-    fmax: null
			
 
				-    center: False
			
 
				-hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
			
 
				-    generator: !ref <hift>
			
 
				-    discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
			
 
				-        mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
			
 
				-        mrd: !new:cosyvoice.hifigan.discriminator.MultiResolutionDiscriminator
			
 
				-    mel_spec_transform: [
			
 
				-        !ref <mel_spec_transform1>
			
 
				-    ]
			
 
				-
			
 
				-# processor functions
			
 
				-parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
			
 
				-get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
			
 
				-    multilingual: True
			
 
				-    num_languages: 100
			
 
				-    language: 'en'
			
 
				-    task: 'transcribe'
			
 
				-allowed_special: 'all'
			
 
				-tokenize: !name:cosyvoice.dataset.processor.tokenize
			
 
				-    get_tokenizer: !ref <get_tokenizer>
			
 
				-    allowed_special: !ref <allowed_special>
			
 
				-filter: !name:cosyvoice.dataset.processor.filter
			
 
				-    max_length: 40960
			
 
				-    min_length: 0
			
 
				-    token_max_length: 200
			
 
				-    token_min_length: 1
			
 
				-resample: !name:cosyvoice.dataset.processor.resample
			
 
				-    resample_rate: !ref <sample_rate>
			
 
				-truncate: !name:cosyvoice.dataset.processor.truncate
			
 
				-    truncate_length: 24576 # must be a multiple of hop_size
			
 
				-feat_extractor: !name:matcha.utils.audio.mel_spectrogram
			
 
				-    n_fft: 1024
			
 
				-    num_mels: 80
			
 
				-    sampling_rate: !ref <sample_rate>
			
 
				-    hop_size: 256
			
 
				-    win_size: 1024
			
 
				-    fmin: 0
			
 
				-    fmax: 8000
			
 
				-    center: False
			
 
				-compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
			
 
				-    feat_extractor: !ref <feat_extractor>
			
 
				-compute_f0: !name:cosyvoice.dataset.processor.compute_f0
			
 
				-    sample_rate: !ref <sample_rate>
			
 
				-    hop_size: 256
			
 
				-parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
			
 
				-    normalize: True
			
 
				-shuffle: !name:cosyvoice.dataset.processor.shuffle
			
 
				-    shuffle_size: 1000
			
 
				-sort: !name:cosyvoice.dataset.processor.sort
			
 
				-    sort_size: 500  # sort_size should be less than shuffle_size
			
 
				-batch: !name:cosyvoice.dataset.processor.batch
			
 
				-    batch_type: 'dynamic'
			
 
				-    max_frames_in_batch: 2000 # change to 1400 in gan train on v100 16g
			
 
				-padding: !name:cosyvoice.dataset.processor.padding
			
 
				-    use_spk_embedding: True # change to True during sft
			
 
				-    dpo: True
			
 
				-
			
 
				-# dataset processor pipeline
			
 
				-data_pipeline: [
			
 
				-    !ref <parquet_opener>,
			
 
				-    !ref <tokenize>,
			
 
				-    !ref <filter>,
			
 
				-    !ref <resample>,
			
 
				-    !ref <compute_fbank>,
			
 
				-    !ref <parse_embedding>,
			
 
				-    !ref <shuffle>,
			
 
				-    !ref <sort>,
			
 
				-    !ref <batch>,
			
 
				-    !ref <padding>,
			
 
				-]
			
 
				-data_pipeline_gan: [
			
 
				-    !ref <parquet_opener>,
			
 
				-    !ref <tokenize>,
			
 
				-    !ref <filter>,
			
 
				-    !ref <resample>,
			
 
				-    !ref <truncate>,
			
 
				-    !ref <compute_fbank>,
			
 
				-    !ref <compute_f0>,
			
 
				-    !ref <parse_embedding>,
			
 
				-    !ref <shuffle>,
			
 
				-    !ref <sort>,
			
 
				-    !ref <batch>,
			
 
				-    !ref <padding>,
			
 
				-]
			
 
				-
			
 
				-# llm flow train conf
			
 
				-train_conf:
			
 
				-    optim: adam
			
 
				-    optim_conf:
			
 
				-        lr: 0.00001 # change to 1e-5 during sft
			
 
				-    scheduler: warmuplr # change to constantlr during sft
			
 
				-    scheduler_conf:
			
 
				-        warmup_steps: 25000
			
 
				-    max_epoch: 200
			
 
				-    grad_clip: 5
			
 
				-    accum_grad: 2
			
 
				-    log_interval: 100
			
 
				-    save_per_step: -1
			
 
				-
			
 
				-# gan train conf
			
 
				-train_conf_gan:
			
 
				-    optim: adam
			
 
				-    optim_conf:
			
 
				-        lr: 0.0002 # use small lr for gan training
			
 
				-    scheduler: constantlr
			
 
				-    optim_d: adam
			
 
				-    optim_conf_d:
			
 
				-        lr: 0.0002 # use small lr for gan training
			
 
				-    scheduler_d: constantlr
			
 
				-    max_epoch: 200
			
 
				-    grad_clip: 5
			
 
				-    accum_grad: 1 # in gan training, accum_grad must be 1
			
 
				-    log_interval: 100
			
 
				-    save_per_step: -1
			
--- a/examples/libritts/cosyvoice/run.sh
+++ b/examples/libritts/cosyvoice/run.sh
@@ -60,7 +60,7 @@ num_workers=2
 
				 prefetch=100
			
 
				 train_engine=torch_ddp
			
 
				 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
			
 
				-  echo "Run train. We only support llm traning for now. If your want to train from scratch, please use conf/cosyvoice.fromscratch.yaml"
			
 
				+  echo "Run train. We only support llm traning for now"
			
 
				   if [ $train_engine == 'deepspeed' ]; then
			
 
				     echo "Notice deepspeed has its own optimizer config. Modify conf/ds_stage2.json if necessary"
			
 
				   fi
			
--- a/examples/libritts/cosyvoice2/run.sh
+++ b/examples/libritts/cosyvoice2/run.sh
@@ -60,7 +60,7 @@ num_workers=2
 
				 prefetch=100
			
 
				 train_engine=torch_ddp
			
 
				 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
			
 
				-  echo "Run train. We only support llm traning for now. If your want to train from scratch, please use conf/cosyvoice.fromscratch.yaml"
			
 
				+  echo "Run train. We only support llm traning for now"
			
 
				   if [ $train_engine == 'deepspeed' ]; then
			
 
				     echo "Notice deepspeed has its own optimizer config. Modify conf/ds_stage2.json if necessary"
			
 
				   fi
			
--- a/examples/libritts/cosyvoice2/run_dpo.sh
+++ b/examples/libritts/cosyvoice2/run_dpo.sh
@@ -70,7 +70,7 @@ num_workers=2
 
				 prefetch=100
			
 
				 train_engine=torch_ddp
			
 
				 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
			
 
				-  echo "Run train. We only support llm traning for now. If your want to train from scratch, please use conf/cosyvoice.fromscratch.yaml"
			
 
				+  echo "Run train. We only support llm traning for now"
			
 
				   if [ $train_engine == 'deepspeed' ]; then
			
 
				     echo "Notice deepspeed has its own optimizer config. Modify conf/ds_stage2.json if necessary"
			
 
				   fi