# main.py — CosyVoice SFT text-to-speech FastAPI server
  1. import io,time
  2. from fastapi import FastAPI, Response
  3. from fastapi.responses import HTMLResponse
  4. from cosyvoice.cli.cosyvoice import CosyVoice
  5. import torchaudio
  6. cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT')
  7. # sft usage
  8. print(cosyvoice.list_avaliable_spks())
  9. app = FastAPI()
  10. @app.get("/api/voice/tts")
  11. async def tts(query: str, role: str):
  12. start = time.process_time()
  13. output = cosyvoice.inference_sft(query, role)
  14. end = time.process_time()
  15. print("infer time:", end-start, "seconds")
  16. buffer = io.BytesIO()
  17. torchaudio.save(buffer, output['tts_speech'], 22050, format="wav")
  18. buffer.seek(0)
  19. return Response(content=buffer.read(-1), media_type="audio/wav")
  20. @app.get("/api/voice/roles")
  21. async def roles():
  22. return {"roles": cosyvoice.list_avaliable_spks()}
  23. @app.get("/", response_class=HTMLResponse)
  24. async def root():
  25. return """
  26. <!DOCTYPE html>
  27. <html lang=zh-cn>
  28. <head>
  29. <meta charset=utf-8>
  30. <title>Api information</title>
  31. </head>
  32. <body>
  33. Get the supported tones from the Roles API first, then enter the tones and textual content in the TTS API for synthesis. <a href='./docs'>Documents of API</a>
  34. </body>
  35. </html>
  36. """