# cosyvoice.yaml

# set random seed, so that you may reproduce your result.
__set_seed1: !apply:random.seed [1986]
__set_seed2: !apply:numpy.random.seed [1986]
__set_seed3: !apply:torch.manual_seed [1986]
__set_seed4: !apply:torch.cuda.manual_seed_all [1986]

# fixed params
sample_rate: 22050
text_encoder_input_size: 512
llm_input_size: 1024
llm_output_size: 1024
spk_embed_dim: 192

# model params
# for all classes/functions included in this repo, we use !<name> or !<new> for initialization, so that users can find every corresponding class/function from this single yaml.
# for system/third_party classes/functions, we do not require this.
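# a rough cheat-sheet for the tags below (assumption: this file is parsed with hyperpyyaml):
# !new:pkg.mod.Class builds Class(**nested_kwargs), !name:pkg.mod.func binds the nested kwargs as a
# partial function, !ref <key> substitutes a value defined earlier in this yaml, and !apply:func [args]
# calls func at load time (used for the seeds above). a commented-out, purely illustrative example
# with a hypothetical key name:
# toy_encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
#     input_size: !ref <text_encoder_input_size>
#     output_size: 1024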
llm: !new:cosyvoice.llm.llm.TransformerLM
    text_encoder_input_size: !ref <text_encoder_input_size>
    llm_input_size: !ref <llm_input_size>
    llm_output_size: !ref <llm_output_size>
    text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe
    speech_token_size: 4096
    length_normalized_loss: True
    lsm_weight: 0
    spk_embed_dim: !ref <spk_embed_dim>
    text_encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
        input_size: !ref <text_encoder_input_size>
        output_size: 1024
        attention_heads: 16
        linear_units: 4096
        num_blocks: 6
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.0
        normalize_before: True
        input_layer: 'linear'
        pos_enc_layer_type: 'rel_pos_espnet'
        selfattention_layer_type: 'rel_selfattn'
        use_cnn_module: False
        macaron_style: False
        use_dynamic_chunk: False
        use_dynamic_left_chunk: False
        static_chunk_size: 1
    llm: !new:cosyvoice.transformer.encoder.TransformerEncoder
        input_size: !ref <llm_input_size>
        output_size: !ref <llm_output_size>
        attention_heads: 16
        linear_units: 4096
        num_blocks: 14
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.0
        input_layer: 'linear_legacy'
        pos_enc_layer_type: 'rel_pos_espnet'
        selfattention_layer_type: 'rel_selfattn'
        static_chunk_size: 1
    sampling: !name:cosyvoice.utils.common.ras_sampling
        top_p: 0.8
        top_k: 25
        win_size: 10
        tau_r: 0.1

flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
    input_size: 512
    output_size: 80
    spk_embed_dim: !ref <spk_embed_dim>
    output_type: 'mel'
    vocab_size: 4096
    input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe
    only_mask_loss: True
    encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
        output_size: 512
        attention_heads: 8
        linear_units: 2048
        num_blocks: 6
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        normalize_before: True
        input_layer: 'linear'
        pos_enc_layer_type: 'rel_pos_espnet'
        selfattention_layer_type: 'rel_selfattn'
        input_size: 512
        use_cnn_module: False
        macaron_style: False
    length_regulator: !new:cosyvoice.flow.length_regulator.InterpolateRegulator
        channels: 80
        sampling_ratios: [1, 1, 1, 1]
    decoder: !new:cosyvoice.flow.flow_matching.ConditionalCFM
        in_channels: 240
        n_spks: 1
        spk_emb_dim: 80
        cfm_params: !new:omegaconf.DictConfig
            content:
                sigma_min: 1e-06
                solver: 'euler'
                t_scheduler: 'cosine'
                training_cfg_rate: 0.2
                inference_cfg_rate: 0.7
                reg_loss_type: 'l1'
        estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
            in_channels: 320
            out_channels: 80
            channels: [256, 256]
            dropout: 0.0
            attention_head_dim: 64
            n_blocks: 4
            num_mid_blocks: 12
            num_heads: 8
            act_fn: 'gelu'

hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
    in_channels: 80
    base_channels: 512
    nb_harmonics: 8
    sampling_rate: !ref <sample_rate>
    nsf_alpha: 0.1
    nsf_sigma: 0.003
    nsf_voiced_threshold: 10
    upsample_rates: [8, 8]
    upsample_kernel_sizes: [16, 16]
    istft_params:
        n_fft: 16
        hop_len: 4
    resblock_kernel_sizes: [3, 7, 11]
    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
    source_resblock_kernel_sizes: [7, 11]
    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
    lrelu_slope: 0.1
    audio_limit: 0.99
    f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
        num_class: 1
        in_channels: 80
        cond_channels: 512

# gan related module
mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
    n_fft: 1024
    num_mels: 80
    sampling_rate: !ref <sample_rate>
    hop_size: 256
    win_size: 1024
    fmin: 0
    fmax: null
    center: False
hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
    generator: !ref <hift>
    discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
        mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
        mrd: !new:cosyvoice.hifigan.discriminator.MultiResolutionDiscriminator
    mel_spec_transform: [
        !ref <mel_spec_transform1>
    ]
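# note: hifigan wraps the hift generator defined above with multi-period (mpd) and multi-resolution
# (mrd) discriminators for the adversarial training stage; mel_spec_transform1 presumably supplies
# the mel features for the gan reconstruction loss.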

# processor functions
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
    multilingual: True
    num_languages: 100
    language: 'en'
    task: 'transcribe'
allowed_special: 'all'
tokenize: !name:cosyvoice.dataset.processor.tokenize
    get_tokenizer: !ref <get_tokenizer>
    allowed_special: !ref <allowed_special>
filter: !name:cosyvoice.dataset.processor.filter
    max_length: 40960
    min_length: 0
    token_max_length: 200
    token_min_length: 1
resample: !name:cosyvoice.dataset.processor.resample
    resample_rate: !ref <sample_rate>
truncate: !name:cosyvoice.dataset.processor.truncate
    truncate_length: 24576 # must be a multiple of hop_size
feat_extractor: !name:matcha.utils.audio.mel_spectrogram
    n_fft: 1024
    num_mels: 80
    sampling_rate: !ref <sample_rate>
    hop_size: 256
    win_size: 1024
    fmin: 0
    fmax: 8000
    center: False
compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
    feat_extractor: !ref <feat_extractor>
compute_f0: !name:cosyvoice.dataset.processor.compute_f0
    sample_rate: !ref <sample_rate>
    hop_size: 256
parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
    normalize: True
shuffle: !name:cosyvoice.dataset.processor.shuffle
    shuffle_size: 1000
sort: !name:cosyvoice.dataset.processor.sort
    sort_size: 500 # sort_size should be less than shuffle_size
batch: !name:cosyvoice.dataset.processor.batch
    batch_type: 'dynamic'
    max_frames_in_batch: 2000 # change to 1400 for gan training on a V100 16GB
padding: !name:cosyvoice.dataset.processor.padding
    use_spk_embedding: False # change to True during sft

# dataset processor pipeline
data_pipeline: [
    !ref <parquet_opener>,
    !ref <tokenize>,
    !ref <filter>,
    !ref <resample>,
    !ref <compute_fbank>,
    !ref <parse_embedding>,
    !ref <shuffle>,
    !ref <sort>,
    !ref <batch>,
    !ref <padding>,
]
data_pipeline_gan: [
    !ref <parquet_opener>,
    !ref <tokenize>,
    !ref <filter>,
    !ref <resample>,
    !ref <truncate>,
    !ref <compute_fbank>,
    !ref <compute_f0>,
    !ref <parse_embedding>,
    !ref <shuffle>,
    !ref <sort>,
    !ref <batch>,
    !ref <padding>,
]
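# note: each !ref entry above resolves to the corresponding !name processor defined earlier, and the
# stages are applied to the parquet sample stream in list order; data_pipeline_gan additionally
# truncates audio to a fixed length and computes f0, which the gan (hift) stage presumably needs for
# its NSF source module.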

# llm flow train conf
train_conf:
    optim: adam
    optim_conf:
        lr: 0.001 # change to 1e-5 during sft
    scheduler: warmuplr # change to constantlr during sft
    scheduler_conf:
        warmup_steps: 2500
    max_epoch: 200
    grad_clip: 5
    accum_grad: 2
    log_interval: 100
    save_per_step: -1

# gan train conf
train_conf_gan:
    optim: adam
    optim_conf:
        lr: 0.0002 # use small lr for gan training
    scheduler: constantlr
    optim_d: adam
    optim_conf_d:
        lr: 0.0002 # use small lr for gan training
    scheduler_d: constantlr
    max_epoch: 200
    grad_clip: 5
    accum_grad: 1 # in gan training, accum_grad must be 1
    log_interval: 100
    save_per_step: -1
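
# loading sketch (an assumption about usage, not a key in this config): the cosyvoice recipes
# typically parse this file with hyperpyyaml, roughly
#     configs = load_hyperpyyaml(f)
# after which configs['llm'], configs['flow'] and configs['hift'] are instantiated torch modules,
# and the !name entries (processors, feat_extractor, sampling) are callables for the data pipeline.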