cosyvoice.hifigan.yaml 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141
  1. # set random seed, so that you may reproduce your result.
  2. __set_seed1: !apply:random.seed [1986]
  3. __set_seed2: !apply:numpy.random.seed [1986]
  4. __set_seed3: !apply:torch.manual_seed [1986]
  5. __set_seed4: !apply:torch.cuda.manual_seed_all [1986]
  6. # fixed params
  7. sample_rate: 22050
  8. text_encoder_input_size: 512
  9. llm_input_size: 1024
  10. llm_output_size: 1024
  11. spk_embed_dim: 192
  12. # model params
  13. # for every class/function included in this repo, we use !<name> or !<new> for initialization, so that users can find all corresponding classes/functions from this single yaml.
  14. # for system/third_party class/function, we do not require this.
  15. hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
  16. in_channels: 80
  17. base_channels: 512
  18. nb_harmonics: 8
  19. sampling_rate: !ref <sample_rate>
  20. nsf_alpha: 0.1
  21. nsf_sigma: 0.003
  22. nsf_voiced_threshold: 10
  23. upsample_rates: [8, 8]
  24. upsample_kernel_sizes: [16, 16]
  25. istft_params:
  26. n_fft: 16
  27. hop_len: 4
  28. resblock_kernel_sizes: [3, 7, 11]
  29. resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
  30. source_resblock_kernel_sizes: [7, 11]
  31. source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
  32. lrelu_slope: 0.1
  33. audio_limit: 0.99
  34. f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
  35. num_class: 1
  36. in_channels: 80
  37. cond_channels: 512
  38. mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
  39. n_fft: 1024
  40. num_mels: 80
  41. sampling_rate: !ref <sample_rate>
  42. hop_size: 256
  43. win_size: 1024
  44. fmin: 0
  45. fmax: 8000
  46. center: False
  47. hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
  48. generator: !ref <hift>
  49. discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
  50. mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
  51. mrd: !new:cosyvoice.hifigan.discriminator.MultiResolutionDiscriminator
  52. mel_spec_transform: [
  53. !ref <mel_spec_transform1>
  54. ]
  55. # processor functions
  56. parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
  57. get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
  58. multilingual: True
  59. num_languages: 100
  60. language: 'en'
  61. task: 'transcribe'
  62. tokenize: !name:cosyvoice.dataset.processor.tokenize
  63. get_tokenizer: !ref <get_tokenizer>
  64. allowed_special: 'all'
  65. filter: !name:cosyvoice.dataset.processor.filter
  66. max_length: 40960
  67. min_length: 0
  68. token_max_length: 200
  69. token_min_length: 1
  70. resample: !name:cosyvoice.dataset.processor.resample
  71. resample_rate: !ref <sample_rate>
  72. truncate: !name:cosyvoice.dataset.processor.truncate
  73. truncate_length: 24576 # must be a multiple of hop_size
  74. feat_extractor: !name:matcha.utils.audio.mel_spectrogram
  75. n_fft: 1024
  76. num_mels: 80
  77. sampling_rate: !ref <sample_rate>
  78. hop_size: 256
  79. win_size: 1024
  80. fmin: 0
  81. fmax: 8000
  82. center: False
  83. compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
  84. feat_extractor: !ref <feat_extractor>
  85. pitch_extractor: !name:torchaudio.functional.compute_kaldi_pitch
  86. sample_rate: !ref <sample_rate>
  87. frame_length: 46.4 # in ms; matches feat_extractor win_size/sampling_rate (1024/22050 s)
  88. frame_shift: 11.6 # in ms; matches feat_extractor hop_size/sampling_rate (256/22050 s)
  89. compute_f0: !name:cosyvoice.dataset.processor.compute_f0
  90. pitch_extractor: !ref <pitch_extractor>
  91. parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
  92. normalize: True
  93. shuffle: !name:cosyvoice.dataset.processor.shuffle
  94. shuffle_size: 1000
  95. sort: !name:cosyvoice.dataset.processor.sort
  96. sort_size: 500 # sort_size should be less than shuffle_size
  97. batch: !name:cosyvoice.dataset.processor.batch
  98. batch_type: 'dynamic'
  99. max_frames_in_batch: 1200
  100. padding: !name:cosyvoice.dataset.processor.padding
  101. use_spk_embedding: False # change to True during sft
  102. # dataset processor pipeline
  103. data_pipeline: [
  104. !ref <parquet_opener>,
  105. !ref <tokenize>,
  106. !ref <filter>,
  107. !ref <resample>,
  108. !ref <truncate>,
  109. !ref <compute_fbank>,
  110. !ref <compute_f0>,
  111. !ref <parse_embedding>,
  112. !ref <shuffle>,
  113. !ref <sort>,
  114. !ref <batch>,
  115. !ref <padding>,
  116. ]
  117. # train conf
  118. train_conf:
  119. optim: adam
  120. optim_conf:
  121. lr: 0.002 # change to 0.001 if you want to train flow from scratch
  122. scheduler: warmuplr
  123. scheduler_conf:
  124. warmup_steps: 25000
  125. optim_d: adam
  126. optim_conf_d:
  127. lr: 0.002 # change to 0.001 if you want to train flow from scratch
  128. scheduler_d: warmuplr
  129. scheduler_conf_d:
  130. warmup_steps: 25000
  131. max_epoch: 200
  132. grad_clip: 5
  133. accum_grad: 2
  134. log_interval: 100
  135. save_per_step: -1