# cosyvoice2.yaml
  1. # set random seed, so that you may reproduce your result.
  2. __set_seed1: !apply:random.seed [1986]
  3. __set_seed2: !apply:numpy.random.seed [1986]
  4. __set_seed3: !apply:torch.manual_seed [1986]
  5. __set_seed4: !apply:torch.cuda.manual_seed_all [1986]
  6. # fixed params
  7. sample_rate: 24000
  8. llm_input_size: 896
  9. llm_output_size: 896
  10. spk_embed_dim: 192
  11. qwen_pretrain_path: ''
  12. token_frame_rate: 25
  13. token_mel_ratio: 2
  14. # model params
  15. # for all class/function included in this repo, we use !<name> or !<new> for intialization, so that user may find all corresponding class/function according to one single yaml.
  16. # for system/third_party class/function, we do not require this.
  17. llm: !new:cosyvoice.llm.llm.Qwen2LM
  18. llm_input_size: !ref <llm_input_size>
  19. llm_output_size: !ref <llm_output_size>
  20. speech_token_size: 6561
  21. length_normalized_loss: True
  22. lsm_weight: 0
  23. mix_ratio: [5, 15]
  24. llm: !new:cosyvoice.llm.llm.Qwen2Encoder
  25. pretrain_path: !ref <qwen_pretrain_path>
  26. sampling: !name:cosyvoice.utils.common.ras_sampling
  27. top_p: 0.8
  28. top_k: 25
  29. win_size: 10
  30. tau_r: 0.1
  31. flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
  32. input_size: 512
  33. output_size: 80
  34. spk_embed_dim: !ref <spk_embed_dim>
  35. output_type: 'mel'
  36. vocab_size: 6561
  37. input_frame_rate: !ref <token_frame_rate>
  38. only_mask_loss: True
  39. token_mel_ratio: !ref <token_mel_ratio>
  40. pre_lookahead_len: 3
  41. encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
  42. output_size: 512
  43. attention_heads: 8
  44. linear_units: 2048
  45. num_blocks: 6
  46. dropout_rate: 0.1
  47. positional_dropout_rate: 0.1
  48. attention_dropout_rate: 0.1
  49. normalize_before: True
  50. input_layer: 'linear'
  51. pos_enc_layer_type: 'rel_pos_espnet'
  52. selfattention_layer_type: 'rel_selfattn'
  53. input_size: 512
  54. use_cnn_module: False
  55. macaron_style: False
  56. use_dynamic_chunk: True
  57. decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
  58. in_channels: 240
  59. n_spks: 1
  60. spk_emb_dim: 80
  61. cfm_params: !new:omegaconf.DictConfig
  62. content:
  63. sigma_min: 1e-06
  64. solver: 'euler'
  65. t_scheduler: 'cosine'
  66. training_cfg_rate: 0.2
  67. inference_cfg_rate: 0.7
  68. reg_loss_type: 'l1'
  69. estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder
  70. in_channels: 320
  71. out_channels: 80
  72. channels: [256]
  73. dropout: 0.0
  74. attention_head_dim: 64
  75. n_blocks: 4
  76. num_mid_blocks: 12
  77. num_heads: 8
  78. act_fn: 'gelu'
  79. static_chunk_size: !ref <token_frame_rate> * <token_mel_ratio> # here we use static_chunk_size because we want to fix kv cache size during inference
  80. num_decoding_left_chunks: 2
  81. hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
  82. in_channels: 80
  83. base_channels: 512
  84. nb_harmonics: 8
  85. sampling_rate: !ref <sample_rate>
  86. nsf_alpha: 0.1
  87. nsf_sigma: 0.003
  88. nsf_voiced_threshold: 10
  89. upsample_rates: [8, 5, 3]
  90. upsample_kernel_sizes: [16, 11, 7]
  91. istft_params:
  92. n_fft: 16
  93. hop_len: 4
  94. resblock_kernel_sizes: [3, 7, 11]
  95. resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
  96. source_resblock_kernel_sizes: [7, 7, 11]
  97. source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
  98. lrelu_slope: 0.1
  99. audio_limit: 0.99
  100. f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
  101. num_class: 1
  102. in_channels: 80
  103. cond_channels: 512
  104. # gan related module
  105. mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
  106. n_fft: 1024
  107. num_mels: 80
  108. sampling_rate: !ref <sample_rate>
  109. hop_size: 256
  110. win_size: 1024
  111. fmin: 0
  112. fmax: null
  113. center: False
  114. hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
  115. generator: !ref <hift>
  116. discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
  117. mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
  118. mrd: !new:cosyvoice.hifigan.discriminator.MultiResolutionDiscriminator
  119. mel_spec_transform: [
  120. !ref <mel_spec_transform1>
  121. ]
  122. # processor functions
  123. parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
  124. get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
  125. token_path: !ref <qwen_pretrain_path>
  126. skip_special_tokens: True
  127. allowed_special: 'all'
  128. tokenize: !name:cosyvoice.dataset.processor.tokenize
  129. get_tokenizer: !ref <get_tokenizer>
  130. allowed_special: !ref <allowed_special>
  131. filter: !name:cosyvoice.dataset.processor.filter
  132. max_length: 40960
  133. min_length: 100
  134. token_max_length: 200
  135. token_min_length: 1
  136. resample: !name:cosyvoice.dataset.processor.resample
  137. resample_rate: !ref <sample_rate>
  138. truncate: !name:cosyvoice.dataset.processor.truncate
  139. truncate_length: 24480 # must be a multiplier of hop_size
  140. feat_extractor: !name:matcha.utils.audio.mel_spectrogram
  141. n_fft: 1920
  142. num_mels: 80
  143. sampling_rate: !ref <sample_rate>
  144. hop_size: 480
  145. win_size: 1920
  146. fmin: 0
  147. fmax: 8000
  148. center: False
  149. compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
  150. feat_extractor: !ref <feat_extractor>
  151. # pitch_extractor: !name:torchaudio.functional.compute_kaldi_pitch # TODO need to replace it
  152. # sample_rate: !ref <sample_rate>
  153. # frame_length: 46.4 # match feat_extractor win_size/sampling_rate
  154. # frame_shift: 11.6 # match feat_extractor hop_size/sampling_rate
  155. # compute_f0: !name:cosyvoice.dataset.processor.compute_f0
  156. # pitch_extractor: !ref <pitch_extractor>
  157. parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
  158. normalize: True
  159. shuffle: !name:cosyvoice.dataset.processor.shuffle
  160. shuffle_size: 1000
  161. sort: !name:cosyvoice.dataset.processor.sort
  162. sort_size: 500 # sort_size should be less than shuffle_size
  163. batch: !name:cosyvoice.dataset.processor.batch
  164. batch_type: 'dynamic'
  165. max_frames_in_batch: 2500
  166. padding: !name:cosyvoice.dataset.processor.padding
  167. use_spk_embedding: False # change to True during sft
  168. # dataset processor pipeline
  169. data_pipeline: [
  170. !ref <parquet_opener>,
  171. !ref <tokenize>,
  172. !ref <filter>,
  173. !ref <resample>,
  174. !ref <compute_fbank>,
  175. !ref <parse_embedding>,
  176. !ref <shuffle>,
  177. !ref <sort>,
  178. !ref <batch>,
  179. !ref <padding>,
  180. ]
  181. # data_pipeline_gan: [
  182. # !ref <parquet_opener>,
  183. # !ref <tokenize>,
  184. # !ref <filter>,
  185. # !ref <resample>,
  186. # !ref <truncate>,
  187. # !ref <compute_fbank>,
  188. # !ref <compute_f0>,
  189. # !ref <parse_embedding>,
  190. # !ref <shuffle>,
  191. # !ref <sort>,
  192. # !ref <batch>,
  193. # !ref <padding>,
  194. # ]
  195. # llm flow train conf
  196. train_conf:
  197. optim: adam
  198. optim_conf:
  199. lr: 1e-5 # change to 1e-5 during sft
  200. scheduler: constantlr # change to constantlr during sft
  201. scheduler_conf:
  202. warmup_steps: 2500
  203. max_epoch: 200
  204. grad_clip: 5
  205. accum_grad: 2
  206. log_interval: 100
  207. save_per_step: -1
  208. # gan train conf
  209. train_conf_gan:
  210. optim: adam
  211. optim_conf:
  212. lr: 0.0002 # use small lr for gan training
  213. scheduler: constantlr
  214. optim_d: adam
  215. optim_conf_d:
  216. lr: 0.0002 # use small lr for gan training
  217. scheduler_d: constantlr
  218. max_epoch: 200
  219. grad_clip: 5
  220. accum_grad: 1 # in gan training, accum_grad must be 1
  221. log_interval: 100
  222. save_per_step: -1