Ver Fonte

add download models script and fastapi server to serve tts

iflamed há 1 ano atrás
pai
commit
fff6f9f1e0
4 ficheiros alterados com 61 adições e 9 exclusões
  1. 12 8
      README.md
  2. 6 0
      download.py
  3. 40 0
      main.py
  4. 3 1
      requirements.txt

+ 12 - 8
README.md

@@ -37,17 +37,13 @@ We strongly recommend that you download our pretrained `CosyVoice-300M` `CosyVoi
 
 
 If you are expert in this field, and you are only interested in training your own CosyVoice model from scratch, you can skip this step.
 If you are expert in this field, and you are only interested in training your own CosyVoice model from scratch, you can skip this step.
 
 
-``` python
-# SDK模型下载
-from modelscope import snapshot_download
-snapshot_download('iic/CosyVoice-300M', local_dir='pretrained_models/CosyVoice-300M')
-snapshot_download('iic/CosyVoice-300M-SFT', local_dir='pretrained_models/CosyVoice-300M-SFT')
-snapshot_download('iic/CosyVoice-300M-Instruct', local_dir='pretrained_models/CosyVoice-300M-Instruct')
-snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice-ttsfrd')
+Download the models with the provided Python script.
+``` shell
+python download.py
 ```
 ```
 
 
+To download the models with git, install `git lfs` first.
 ``` sh
 ``` sh
-# git模型下载,请确保已安装git lfs
 mkdir -p pretrained_models
 mkdir -p pretrained_models
 git clone https://www.modelscope.cn/iic/CosyVoice-300M.git pretrained_models/CosyVoice-300M
 git clone https://www.modelscope.cn/iic/CosyVoice-300M.git pretrained_models/CosyVoice-300M
 git clone https://www.modelscope.cn/iic/CosyVoice-300M-SFT.git pretrained_models/CosyVoice-300M-SFT
 git clone https://www.modelscope.cn/iic/CosyVoice-300M-SFT.git pretrained_models/CosyVoice-300M-SFT
@@ -120,6 +116,14 @@ python3 webui.py --port 50000 --model_dir pretrained_models/CosyVoice-300M
 For advanced user, we have provided train and inference scripts in `examples/libritts/cosyvoice/run.sh`.
 For advanced user, we have provided train and inference scripts in `examples/libritts/cosyvoice/run.sh`.
 You can get familiar with CosyVoice following this recipe.
 You can get familiar with CosyVoice following this recipe.
 
 
+**Serve with FastAPI**
+```sh
+# For development
+fastapi dev --port 3003
+# For production
+fastapi run --port 3003
+```
+
 **Build for deployment**
 **Build for deployment**
 
 
 Optionally, if you want to use grpc for service deployment,
 Optionally, if you want to use grpc for service deployment,

+ 6 - 0
download.py

@@ -0,0 +1,6 @@
+# SDK模型下载
+from modelscope import snapshot_download
+snapshot_download('iic/CosyVoice-300M', local_dir='pretrained_models/CosyVoice-300M')
+snapshot_download('iic/CosyVoice-300M-SFT', local_dir='pretrained_models/CosyVoice-300M-SFT')
+snapshot_download('iic/CosyVoice-300M-Instruct', local_dir='pretrained_models/CosyVoice-300M-Instruct')
+snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice-ttsfrd')

+ 40 - 0
main.py

@@ -0,0 +1,40 @@
+import io,time
+from fastapi import FastAPI, Response
+from fastapi.responses import HTMLResponse
+from cosyvoice.cli.cosyvoice import CosyVoice
+import torchaudio
+
+cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT')
+# sft usage
+print(cosyvoice.list_avaliable_spks())
+app = FastAPI()
+
+@app.get("/api/voice/tts")
+async def tts(query: str, role: str):
+    start = time.process_time()
+    output = cosyvoice.inference_sft(query, role)
+    end = time.process_time()
+    print("infer time:", end-start, "seconds")
+    buffer = io.BytesIO()
+    torchaudio.save(buffer, output['tts_speech'], 22050, format="wav")
+    buffer.seek(0)
+    return Response(content=buffer.read(-1), media_type="audio/wav")
+
+@app.get("/api/voice/roles")
+async def roles():
+    return {"roles": cosyvoice.list_avaliable_spks()}
+
+@app.get("/", response_class=HTMLResponse)
+async def root():
+    return """
+    <!DOCTYPE html>
+    <html lang=zh-cn>
+        <head>
+            <meta charset=utf-8>
+            <title>Api information</title>
+        </head>
+        <body>
+            Get the supported tones from the Roles API first, then enter the tones and textual content in the TTS API for synthesis. <a href='./docs'>Documents of API</a>
+        </body>
+    </html>
+    """

+ 3 - 1
requirements.txt

@@ -25,4 +25,6 @@ soundfile==0.12.1
 tensorboard==2.14.0
 tensorboard==2.14.0
 torch==2.0.1
 torch==2.0.1
 torchaudio==2.0.2
 torchaudio==2.0.2
-wget==3.2
+wget==3.2
+fastapi==0.111.0
+fastapi-cli==0.0.4