# example.py
  1. import sys
  2. sys.path.append('third_party/Matcha-TTS')
  3. from cosyvoice.cli.cosyvoice import AutoModel
  4. import torchaudio
  5. def cosyvoice_example():
  6. """ CosyVoice Usage, check https://fun-audio-llm.github.io/ for more details
  7. """
  8. cosyvoice = AutoModel(model_dir='pretrained_models/CosyVoice-300M-SFT')
  9. # sft usage
  10. print(cosyvoice.list_available_spks())
  11. # change stream=True for chunk stream inference
  12. for i, j in enumerate(cosyvoice.inference_sft('你好,我是通义生成式语音大模型,请问有什么可以帮您的吗?', '中文女', stream=False)):
  13. torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
  14. cosyvoice = AutoModel(model_dir='pretrained_models/CosyVoice-300M')
  15. # zero_shot usage
  16. for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav')):
  17. torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
  18. # cross_lingual usage, <|zh|><|en|><|ja|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean
  19. for i, j in enumerate(cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.',
  20. './asset/cross_lingual_prompt.wav')):
  21. torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
  22. # vc usage
  23. for i, j in enumerate(cosyvoice.inference_vc('./asset/cross_lingual_prompt.wav', './asset/zero_shot_prompt.wav')):
  24. torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
  25. cosyvoice = AutoModel(model_dir='pretrained_models/CosyVoice-300M-Instruct')
  26. # instruct usage, support <laughter></laughter><strong></strong>[laughter][breath]
  27. for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的<strong>勇气</strong>与<strong>智慧</strong>。', '中文男',
  28. 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.<|endofprompt|>')):
  29. torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
  30. def cosyvoice2_example():
  31. """ CosyVoice2 Usage, check https://funaudiollm.github.io/cosyvoice2/ for more details
  32. """
  33. cosyvoice = AutoModel(model_dir='pretrained_models/CosyVoice2-0.5B')
  34. # NOTE if you want to reproduce the results on https://funaudiollm.github.io/cosyvoice2, please add text_frontend=False during inference
  35. # zero_shot usage
  36. for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav')):
  37. torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
  38. # save zero_shot spk for future usage
  39. assert cosyvoice.add_zero_shot_spk('希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', 'my_zero_shot_spk') is True
  40. for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '', '', zero_shot_spk_id='my_zero_shot_spk')):
  41. torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
  42. cosyvoice.save_spkinfo()
  43. # fine grained control, for supported control, check cosyvoice/tokenizer/tokenizer.py#L248
  44. for i, j in enumerate(cosyvoice.inference_cross_lingual('在他讲述那个荒诞故事的过程中,他突然[laughter]停下来,因为他自己也被逗笑了[laughter]。', './asset/zero_shot_prompt.wav')):
  45. torchaudio.save('fine_grained_control_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
  46. # instruct usage
  47. for i, j in enumerate(cosyvoice.inference_instruct2('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '用四川话说这句话<|endofprompt|>', './asset/zero_shot_prompt.wav')):
  48. torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
  49. # bistream usage, you can use generator as input, this is useful when using text llm model as input
  50. # NOTE you should still have some basic sentence split logic because llm can not handle arbitrary sentence length
  51. def text_generator():
  52. yield '收到好友从远方寄来的生日礼物,'
  53. yield '那份意外的惊喜与深深的祝福'
  54. yield '让我心中充满了甜蜜的快乐,'
  55. yield '笑容如花儿般绽放。'
  56. for i, j in enumerate(cosyvoice.inference_zero_shot(text_generator(), '希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', stream=False)):
  57. torchaudio.save('zero_shot_bistream_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
  58. def cosyvoice3_example():
  59. """ CosyVoice3 Usage, check https://funaudiollm.github.io/cosyvoice3/ for more details
  60. """
  61. cosyvoice = AutoModel(model_dir='pretrained_models/Fun-CosyVoice3-0.5B')
  62. # zero_shot usage
  63. for i, j in enumerate(cosyvoice.inference_zero_shot('八百标兵奔北坡,北坡炮兵并排跑,炮兵怕把标兵碰,标兵怕碰炮兵炮。', 'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。',
  64. './asset/zero_shot_prompt.wav', stream=False)):
  65. torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
  66. # fine grained control, for supported control, check cosyvoice/tokenizer/tokenizer.py#L280
  67. for i, j in enumerate(cosyvoice.inference_cross_lingual('You are a helpful assistant.<|endofprompt|>[breath]因为他们那一辈人[breath]在乡里面住的要习惯一点,[breath]邻居都很活络,[breath]嗯,都很熟悉。[breath]',
  68. './asset/zero_shot_prompt.wav', stream=False)):
  69. torchaudio.save('fine_grained_control_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
  70. # instruct usage, for supported control, check cosyvoice/utils/common.py#L28
  71. for i, j in enumerate(cosyvoice.inference_instruct2('好少咯,一般系放嗰啲国庆啊,中秋嗰啲可能会咯。', 'You are a helpful assistant. 请用广东话表达。<|endofprompt|>',
  72. './asset/zero_shot_prompt.wav', stream=False)):
  73. torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
  74. for i, j in enumerate(cosyvoice.inference_instruct2('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', 'You are a helpful assistant. 请用尽可能快地语速说一句话。<|endofprompt|>',
  75. './asset/zero_shot_prompt.wav', stream=False)):
  76. torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
  77. # hotfix usage
  78. for i, j in enumerate(cosyvoice.inference_zero_shot('高管也通过电话、短信、微信等方式对报道[j][ǐ]予好评。', 'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。',
  79. './asset/zero_shot_prompt.wav', stream=False)):
  80. torchaudio.save('hotfix_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
  81. # NOTE for Japanese usage, you must translate it to katakana.
  82. # 歴史的世界においては、過去は単に過ぎ去ったものではない、プラトンのいう如く非有が有である。 -> レキシ テキ セカイ ニ オイ テ ワ、カコ ワ タンニ スギサッ タ モノ デ ワ ナイ、プラトン ノ イウ ゴトク ヒ ユー ガ ユー デ アル。
  83. for i, j in enumerate(cosyvoice.inference_cross_lingual('You are a helpful assistant.<|endofprompt|>レキシ テキ セカイ ニ オイ テ ワ、カコ ワ タンニ スギサッ タ モノ デ ワ ナイ、プラトン ノ イウ ゴトク ヒ ユー ガ ユー デ アル。',
  84. './asset/zero_shot_prompt.wav', stream=False)):
  85. torchaudio.save('japanese_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
  86. def main():
  87. # cosyvoice_example()
  88. # cosyvoice2_example()
  89. cosyvoice3_example()
  90. if __name__ == '__main__':
  91. main()