4 months ago · 7baefaf0f2
--- a/examples/libritts/cosyvoice3/conf/cosyvoice3.yaml
+++ b/examples/libritts/cosyvoice3/conf/cosyvoice3.yaml
@@ -20,7 +20,7 @@ num_decoding_left_chunks: -1 # streaming inference flow decoder left chunk size,
 
				 # model params
			
 
				 # for all classes/functions included in this repo, we use !<name> or !<new> for initialization, so that users can find every corresponding class/function from this single yaml file.
			
 
				 # for system/third_party class/function, we do not require this.
			
 
				-llm: !new:cosyvoice.llm.llm.Qwen2LM
			
 
				+llm: !new:cosyvoice.llm.llm.CosyVoice3LM
			
 
				     llm_input_size: !ref <llm_input_size>
			
 
				     llm_output_size: !ref <llm_output_size>
			
 
				     speech_token_size: 6561
			
@@ -35,8 +35,8 @@ llm: !new:cosyvoice.llm.llm.Qwen2LM
 
				         win_size: 10
			
 
				         tau_r: 0.1
			
 
				 
			
 
				-flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
			
 
				-    input_size: 512
			
 
				+flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithDiT
			
 
				+    input_size: 80
			
 
				     output_size: 80
			
 
				     spk_embed_dim: !ref <spk_embed_dim>
			
 
				     output_type: 'mel'
			
@@ -45,22 +45,10 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
 
				     only_mask_loss: True
			
 
				     token_mel_ratio: !ref <token_mel_ratio>
			
 
				     pre_lookahead_len: 3
			
 
				-    encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
			
 
				-        output_size: 512
			
 
				-        attention_heads: 8
			
 
				-        linear_units: 2048
			
 
				-        num_blocks: 6
			
 
				-        dropout_rate: 0.1
			
 
				-        positional_dropout_rate: 0.1
			
 
				-        attention_dropout_rate: 0.1
			
 
				-        normalize_before: True
			
 
				-        input_layer: 'linear'
			
 
				-        pos_enc_layer_type: 'rel_pos_espnet'
			
 
				-        selfattention_layer_type: 'rel_selfattn'
			
 
				-        input_size: 512
			
 
				-        use_cnn_module: False
			
 
				-        macaron_style: False
			
 
				-        static_chunk_size: !ref <chunk_size>
			
 
				+    pre_lookahead_layer: !new:cosyvoice.transformer.upsample_encoder.PreLookaheadLayer
			
 
				+        in_channels: 80
			
 
				+        channels: 1024
			
 
				+        pre_lookahead_len: 3
			
 
				     decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
			
 
				         in_channels: 240
			
 
				         n_spks: 1
			
@@ -73,20 +61,20 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
 
				                 training_cfg_rate: 0.2
			
 
				                 inference_cfg_rate: 0.7
			
 
				                 reg_loss_type: 'l1'
			
 
				-        estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder
			
 
				-            in_channels: 320
			
 
				+        estimator: !new:cosyvoice.flow.DiT.dit.DiT
			
 
				+            dim: 1024
			
 
				+            depth: 22
			
 
				+            heads: 16
			
 
				+            dim_head: 64
			
 
				+            ff_mult: 2
			
 
				+            mel_dim: 80
			
 
				+            mu_dim: 80
			
 
				+            spk_dim: 80
			
 
				             out_channels: 80
			
 
				-            channels: [256]
			
 
				-            dropout: 0.0
			
 
				-            attention_head_dim: 64
			
 
				-            n_blocks: 4
			
 
				-            num_mid_blocks: 12
			
 
				-            num_heads: 8
			
 
				-            act_fn: 'gelu'
			
 
				             static_chunk_size: !ref <chunk_size> * <token_mel_ratio>
			
 
				             num_decoding_left_chunks: !ref <num_decoding_left_chunks>
			
 
				 
			
 
				-hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
			
 
				+hift: !new:cosyvoice.hifigan.generator.CausalHiFTGenerator
			
 
				     in_channels: 80
			
 
				     base_channels: 512
			
 
				     nb_harmonics: 8
			
@@ -105,7 +93,8 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
 
				     source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
			
 
				     lrelu_slope: 0.1
			
 
				     audio_limit: 0.99
			
 
				-    f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
			
 
				+    conv_pre_look_right: 4
			
 
				+    f0_predictor: !new:cosyvoice.hifigan.f0_predictor.CausalConvRNNF0Predictor
			
 
				         num_class: 1
			
 
				         in_channels: 80
			
 
				         cond_channels: 512