cosyvoice_dpo.yaml 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226
  1. # set random seed, so that you may reproduce your result.
  2. __set_seed1: !apply:random.seed [1986]
  3. __set_seed2: !apply:numpy.random.seed [1986]
  4. __set_seed3: !apply:torch.manual_seed [1986]
  5. __set_seed4: !apply:torch.cuda.manual_seed_all [1986]
  6. # fixed params
  7. sample_rate: 24000 # 16000 for llm, 24000 for cfm
  8. llm_input_size: 896
  9. llm_output_size: 896
  10. spk_embed_dim: 192
  11. qwen_pretrain_path: 'CosyVoice2-0.5B/CosyVoice-BlankEN'
  12. # model params
  13. # for all classes/functions included in this repo, we use !<name> or !<new> for initialization, so that users can find every corresponding class/function from this single yaml.
  14. # for system/third_party class/function, we do not require this.
  15. llm: !new:cosyvoice.llm.llm_dpo.Qwen2LM
  16. llm_input_size: !ref <llm_input_size>
  17. llm_output_size: !ref <llm_output_size>
  18. speech_token_size: 6561
  19. length_normalized_loss: True
  20. lsm_weight: 0
  21. dpo: True
  22. llm: !new:cosyvoice.llm.llm.Qwen2Encoder
  23. pretrain_path: !ref <qwen_pretrain_path>
  24. sampling: !name:cosyvoice.utils.common.ras_sampling
  25. top_p: 0.8
  26. top_k: 25
  27. win_size: 10
  28. tau_r: 0.1
  29. flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
  30. input_size: 512
  31. output_size: 80
  32. spk_embed_dim: !ref <spk_embed_dim>
  33. output_type: 'mel'
  34. vocab_size: 6561
  35. input_frame_rate: 25
  36. only_mask_loss: True
  37. token_mel_ratio: 2
  38. pre_lookahead_len: 3
  39. encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
  40. output_size: 512
  41. attention_heads: 8
  42. linear_units: 2048
  43. num_blocks: 6
  44. dropout_rate: 0.1
  45. positional_dropout_rate: 0.1
  46. attention_dropout_rate: 0.1
  47. normalize_before: True
  48. input_layer: 'linear'
  49. pos_enc_layer_type: 'rel_pos_espnet'
  50. selfattention_layer_type: 'rel_selfattn'
  51. input_size: 512
  52. use_cnn_module: False
  53. macaron_style: False
  54. decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
  55. in_channels: 240
  56. n_spks: 1
  57. spk_emb_dim: 80
  58. cfm_params: !new:omegaconf.DictConfig
  59. content:
  60. sigma_min: 1e-06
  61. solver: 'euler'
  62. t_scheduler: 'cosine'
  63. training_cfg_rate: 0.2
  64. inference_cfg_rate: 0.7
  65. reg_loss_type: 'l1'
  66. estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
  67. in_channels: 320
  68. out_channels: 80
  69. causal: True
  70. channels: [256]
  71. dropout: 0.0
  72. attention_head_dim: 64
  73. n_blocks: 4
  74. num_mid_blocks: 12
  75. num_heads: 8
  76. act_fn: 'gelu'
  77. hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
  78. in_channels: 80
  79. base_channels: 512
  80. nb_harmonics: 8
  81. sampling_rate: !ref <sample_rate>
  82. nsf_alpha: 0.1
  83. nsf_sigma: 0.003
  84. nsf_voiced_threshold: 10
  85. upsample_rates: [8, 5, 3]
  86. upsample_kernel_sizes: [16, 11, 7]
  87. istft_params:
  88. n_fft: 16
  89. hop_len: 4
  90. resblock_kernel_sizes: [3, 7, 11]
  91. resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
  92. source_resblock_kernel_sizes: [7, 7, 11]
  93. source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
  94. lrelu_slope: 0.1
  95. audio_limit: 0.99
  96. f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
  97. num_class: 1
  98. in_channels: 80
  99. cond_channels: 512
  100. # gan related module
  101. mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
  102. n_fft: 1024
  103. num_mels: 80
  104. sampling_rate: !ref <sample_rate>
  105. hop_size: 256
  106. win_size: 1024
  107. fmin: 0
  108. fmax: null
  109. center: False
  110. hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
  111. generator: !ref <hift>
  112. discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
  113. mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
  114. mrd: !new:cosyvoice.hifigan.discriminator.MultiResolutionDiscriminator
  115. mel_spec_transform: [
  116. !ref <mel_spec_transform1>
  117. ]
  118. # processor functions
  119. parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
  120. get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
  121. multilingual: True
  122. num_languages: 100
  123. language: 'en'
  124. task: 'transcribe'
  125. allowed_special: 'all'
  126. tokenize: !name:cosyvoice.dataset.processor.tokenize
  127. get_tokenizer: !ref <get_tokenizer>
  128. allowed_special: !ref <allowed_special>
  129. filter: !name:cosyvoice.dataset.processor.filter
  130. max_length: 40960
  131. min_length: 0
  132. token_max_length: 200
  133. token_min_length: 1
  134. resample: !name:cosyvoice.dataset.processor.resample
  135. resample_rate: !ref <sample_rate>
  136. truncate: !name:cosyvoice.dataset.processor.truncate
  137. truncate_length: 24576 # must be a multiple of hop_size
  138. feat_extractor: !name:matcha.utils.audio.mel_spectrogram
  139. n_fft: 1024
  140. num_mels: 80
  141. sampling_rate: !ref <sample_rate>
  142. hop_size: 256
  143. win_size: 1024
  144. fmin: 0
  145. fmax: 8000
  146. center: False
  147. compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
  148. feat_extractor: !ref <feat_extractor>
  149. compute_f0: !name:cosyvoice.dataset.processor.compute_f0
  150. sample_rate: !ref <sample_rate>
  151. hop_size: 256
  152. parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
  153. normalize: True
  154. shuffle: !name:cosyvoice.dataset.processor.shuffle
  155. shuffle_size: 1000
  156. sort: !name:cosyvoice.dataset.processor.sort
  157. sort_size: 500 # sort_size should be less than shuffle_size
  158. batch: !name:cosyvoice.dataset.processor.batch
  159. batch_type: 'dynamic'
  160. max_frames_in_batch: 2000 # change to 1400 in gan train on v100 16g
  161. padding: !name:cosyvoice.dataset.processor.padding
  162. use_spk_embedding: True # already True here, which is the setting used during sft
  163. dpo: True
  164. # dataset processor pipeline
  165. data_pipeline: [
  166. !ref <parquet_opener>,
  167. !ref <tokenize>,
  168. !ref <filter>,
  169. !ref <resample>,
  170. !ref <compute_fbank>,
  171. !ref <parse_embedding>,
  172. !ref <shuffle>,
  173. !ref <sort>,
  174. !ref <batch>,
  175. !ref <padding>,
  176. ]
  177. data_pipeline_gan: [
  178. !ref <parquet_opener>,
  179. !ref <tokenize>,
  180. !ref <filter>,
  181. !ref <resample>,
  182. !ref <truncate>,
  183. !ref <compute_fbank>,
  184. !ref <compute_f0>,
  185. !ref <parse_embedding>,
  186. !ref <shuffle>,
  187. !ref <sort>,
  188. !ref <batch>,
  189. !ref <padding>,
  190. ]
  191. # llm flow train conf
  192. train_conf:
  193. optim: adam
  194. optim_conf:
  195. lr: 0.00001 # i.e. 1e-5, which is already the value used during sft
  196. scheduler: warmuplr # change to constantlr during sft
  197. scheduler_conf:
  198. warmup_steps: 25000
  199. max_epoch: 200
  200. grad_clip: 5
  201. accum_grad: 2
  202. log_interval: 100
  203. save_per_step: -1
  204. # gan train conf
  205. train_conf_gan:
  206. optim: adam
  207. optim_conf:
  208. lr: 0.0002 # use small lr for gan training
  209. scheduler: constantlr
  210. optim_d: adam
  211. optim_conf_d:
  212. lr: 0.0002 # use small lr for gan training
  213. scheduler_d: constantlr
  214. max_epoch: 200
  215. grad_clip: 5
  216. accum_grad: 1 # in gan training, accum_grad must be 1
  217. log_interval: 100
  218. save_per_step: -1