# cosyvoice3.yaml
  1. # set random seed, so that you may reproduce your result.
  2. __set_seed1: !apply:random.seed [1986]
  3. __set_seed2: !apply:numpy.random.seed [1986]
  4. __set_seed3: !apply:torch.manual_seed [1986]
  5. __set_seed4: !apply:torch.cuda.manual_seed_all [1986]
  6. # fixed params
  7. sample_rate: 24000
  8. llm_input_size: 896
  9. llm_output_size: 896
  10. spk_embed_dim: 192
  11. qwen_pretrain_path: ''
  12. token_frame_rate: 25
  13. token_mel_ratio: 2
  14. # stream related params
  15. chunk_size: 25 # streaming inference chunk size, in token
  16. num_decoding_left_chunks: -1 # streaming inference flow decoder left chunk size, <0 means use all left chunks
  17. # model params
  18. # for all class/function included in this repo, we use !<name> or !<new> for intialization, so that user may find all corresponding class/function according to one single yaml.
  19. # for system/third_party class/function, we do not require this.
  20. llm: !new:cosyvoice.llm.llm.CosyVoice3LM
  21. llm_input_size: !ref <llm_input_size>
  22. llm_output_size: !ref <llm_output_size>
  23. speech_token_size: 6561
  24. length_normalized_loss: True
  25. lsm_weight: 0
  26. mix_ratio: [5, 15]
  27. llm: !new:cosyvoice.llm.llm.Qwen2Encoder
  28. pretrain_path: !ref <qwen_pretrain_path>
  29. sampling: !name:cosyvoice.utils.common.ras_sampling
  30. top_p: 0.8
  31. top_k: 25
  32. win_size: 10
  33. tau_r: 0.1
  34. flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithDiT
  35. input_size: 80
  36. output_size: 80
  37. spk_embed_dim: !ref <spk_embed_dim>
  38. output_type: 'mel'
  39. vocab_size: 6561
  40. input_frame_rate: !ref <token_frame_rate>
  41. only_mask_loss: True
  42. token_mel_ratio: !ref <token_mel_ratio>
  43. pre_lookahead_len: 3
  44. pre_lookahead_layer: !new:cosyvoice.transformer.upsample_encoder.PreLookaheadLayer
  45. in_channels: 80
  46. channels: 1024
  47. pre_lookahead_len: 3
  48. decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
  49. in_channels: 240
  50. n_spks: 1
  51. spk_emb_dim: 80
  52. cfm_params: !new:omegaconf.DictConfig
  53. content:
  54. sigma_min: 1e-06
  55. solver: 'euler'
  56. t_scheduler: 'cosine'
  57. training_cfg_rate: 0.2
  58. inference_cfg_rate: 0.7
  59. reg_loss_type: 'l1'
  60. estimator: !new:cosyvoice.flow.DiT.dit.DiT
  61. dim: 1024
  62. depth: 22
  63. heads: 16
  64. dim_head: 64
  65. ff_mult: 2
  66. mel_dim: 80
  67. mu_dim: 80
  68. spk_dim: 80
  69. out_channels: 80
  70. static_chunk_size: !ref <chunk_size> * <token_mel_ratio>
  71. num_decoding_left_chunks: !ref <num_decoding_left_chunks>
  72. hift: !new:cosyvoice.hifigan.generator.CausalHiFTGenerator
  73. in_channels: 80
  74. base_channels: 512
  75. nb_harmonics: 8
  76. sampling_rate: !ref <sample_rate>
  77. nsf_alpha: 0.1
  78. nsf_sigma: 0.003
  79. nsf_voiced_threshold: 10
  80. upsample_rates: [8, 5, 3]
  81. upsample_kernel_sizes: [16, 11, 7]
  82. istft_params:
  83. n_fft: 16
  84. hop_len: 4
  85. resblock_kernel_sizes: [3, 7, 11]
  86. resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
  87. source_resblock_kernel_sizes: [7, 7, 11]
  88. source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
  89. lrelu_slope: 0.1
  90. audio_limit: 0.99
  91. conv_pre_look_right: 4
  92. f0_predictor: !new:cosyvoice.hifigan.f0_predictor.CausalConvRNNF0Predictor
  93. num_class: 1
  94. in_channels: 80
  95. cond_channels: 512
  96. # gan related module
  97. mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
  98. n_fft: 1920
  99. num_mels: 80
  100. sampling_rate: !ref <sample_rate>
  101. hop_size: 480
  102. win_size: 1920
  103. fmin: 0
  104. fmax: null
  105. center: False
  106. hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
  107. generator: !ref <hift>
  108. discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
  109. mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
  110. mrd: !new:cosyvoice.hifigan.discriminator.MultiResSpecDiscriminator
  111. mel_spec_transform: [
  112. !ref <mel_spec_transform1>
  113. ]
  114. # processor functions
  115. parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
  116. get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
  117. token_path: !ref <qwen_pretrain_path>
  118. skip_special_tokens: True
  119. version: cosyvoice3
  120. allowed_special: 'all'
  121. tokenize: !name:cosyvoice.dataset.processor.tokenize
  122. get_tokenizer: !ref <get_tokenizer>
  123. allowed_special: !ref <allowed_special>
  124. filter: !name:cosyvoice.dataset.processor.filter
  125. max_length: 6000
  126. min_length: 100
  127. token_max_length: 200
  128. token_min_length: 1
  129. resample: !name:cosyvoice.dataset.processor.resample
  130. resample_rate: !ref <sample_rate>
  131. truncate: !name:cosyvoice.dataset.processor.truncate
  132. truncate_length: 24960 # must be a multiplier of hop_size and token_mel_ratio
  133. feat_extractor: !name:matcha.utils.audio.mel_spectrogram
  134. n_fft: 1920
  135. num_mels: 80
  136. sampling_rate: !ref <sample_rate>
  137. hop_size: 480
  138. win_size: 1920
  139. fmin: 0
  140. fmax: null
  141. center: False
  142. compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
  143. feat_extractor: !ref <feat_extractor>
  144. num_frames: 960
  145. compute_whisper_fbank: !name:cosyvoice.dataset.processor.compute_whisper_fbank
  146. num_frames: 960
  147. compute_f0: !name:cosyvoice.dataset.processor.compute_f0
  148. sample_rate: !ref <sample_rate>
  149. hop_size: 480
  150. parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
  151. normalize: True
  152. shuffle: !name:cosyvoice.dataset.processor.shuffle
  153. shuffle_size: 1000
  154. sort: !name:cosyvoice.dataset.processor.sort
  155. sort_size: 500 # sort_size should be less than shuffle_size
  156. batch: !name:cosyvoice.dataset.processor.batch
  157. batch_type: 'dynamic'
  158. max_frames_in_batch: 2000
  159. padding: !name:cosyvoice.dataset.processor.padding
  160. use_spk_embedding: False # change to True during sft
  161. # dataset processor pipeline
  162. data_pipeline: [
  163. !ref <parquet_opener>,
  164. !ref <tokenize>,
  165. !ref <filter>,
  166. !ref <resample>,
  167. !ref <compute_fbank>,
  168. !ref <parse_embedding>,
  169. !ref <compute_whisper_fbank>,
  170. !ref <shuffle>,
  171. !ref <sort>,
  172. !ref <batch>,
  173. !ref <padding>,
  174. ]
  175. data_pipeline_gan: [
  176. !ref <parquet_opener>,
  177. !ref <tokenize>,
  178. !ref <filter>,
  179. !ref <resample>,
  180. !ref <truncate>,
  181. !ref <compute_fbank>,
  182. !ref <compute_f0>,
  183. !ref <parse_embedding>,
  184. !ref <shuffle>,
  185. !ref <sort>,
  186. !ref <batch>,
  187. !ref <padding>,
  188. ]
  189. # llm flow train conf
  190. train_conf:
  191. optim: adam
  192. optim_conf:
  193. lr: 1e-5 # change to 1e-5 during sft
  194. scheduler: constantlr # change to constantlr during sft
  195. scheduler_conf:
  196. warmup_steps: 2500
  197. max_epoch: 200
  198. grad_clip: 5
  199. accum_grad: 2
  200. log_interval: 100
  201. save_per_step: -1
  202. # gan train conf
  203. train_conf_gan:
  204. optim: adam
  205. optim_conf:
  206. lr: 0.0002 # use small lr for gan training
  207. scheduler: constantlr
  208. optim_d: adam
  209. optim_conf_d:
  210. lr: 0.0002 # use small lr for gan training
  211. scheduler_d: constantlr
  212. max_epoch: 200
  213. grad_clip: 5
  214. accum_grad: 1 # in gan training, accum_grad must be 1
  215. log_interval: 100
  216. save_per_step: -1