test1.py

import sys
sys.path.append('third_party/Matcha-TTS')
from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
from cosyvoice.utils.file_utils import load_wav
import torchaudio  # type: ignore
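# the constructor flags below are assumed to work as follows: load_jit / load_trt / fp16 toggle optional
# TorchScript / TensorRT / half-precision acceleration, and use_flow_cache enables flow-decoder caching for streaming inference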
cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, fp16=False, use_flow_cache=False)
# NOTE: to reproduce the results at https://funaudiollm.github.io/cosyvoice2, pass text_frontend=False during inference
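# for example (assuming text_frontend is accepted as a keyword argument by the inference_* methods):
#   cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=False, text_frontend=False)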
# zero_shot usage
prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)
for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
    torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
# save zero_shot spk for future usage
assert cosyvoice.add_zero_shot_spk('希望你以后能够做的比我还好呦。', prompt_speech_16k, 'my_zero_shot_spk') is True
for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '', '', zero_shot_spk_id='my_zero_shot_spk', stream=False)):
    torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
cosyvoice.save_spkinfo()
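# save_spkinfo presumably persists the registered speaker info alongside the model,
# so 'my_zero_shot_spk' can be reused via zero_shot_spk_id in a later session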
# fine-grained control; for the supported control tokens, check cosyvoice/tokenizer/tokenizer.py#L248
for i, j in enumerate(cosyvoice.inference_cross_lingual('在他讲述那个荒诞故事的过程中,他突然[laughter]停下来,因为他自己也被逗笑了[laughter]。', prompt_speech_16k, stream=False)):
    torchaudio.save('fine_grained_control_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
# instruct usage
for i, j in enumerate(cosyvoice.inference_instruct2('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '用四川话说这句话', prompt_speech_16k, stream=False)):
    torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
# bistream usage: you can pass a generator as input, which is useful when the text comes from a text LLM
# NOTE: you should still apply some basic sentence-split logic, because the LLM cannot handle arbitrary sentence lengths
# (see the sentence_generator sketch at the end of this file)
def text_generator():
    yield '收到好友从远方寄来的生日礼物,'
    yield '那份意外的惊喜与深深的祝福'
    yield '让我心中充满了甜蜜的快乐,'
    yield '笑容如花儿般绽放。'
for i, j in enumerate(cosyvoice.inference_zero_shot(text_generator(), '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
    torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
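
# A minimal sketch (an assumption, not part of the original example) of the "basic sentence split logic"
# mentioned above: buffer a token stream from a text LLM and only yield complete sentences to CosyVoice.
# llm_token_stream() below is a hypothetical placeholder for your LLM's streaming output.
def sentence_generator(token_stream, delimiters='。！？!?'):
    buffer = ''
    for token in token_stream:
        buffer += token
        # flush whenever the buffer ends with a sentence delimiter
        if buffer and buffer[-1] in delimiters:
            yield buffer
            buffer = ''
    if buffer:  # flush any trailing partial sentence
        yield buffer

# for i, j in enumerate(cosyvoice.inference_zero_shot(sentence_generator(llm_token_stream()), '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
#     torchaudio.save('llm_stream_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)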