|
|
@@ -26,6 +26,10 @@
|
|
|
|
|
|
## Roadmap
|
|
|
|
|
|
+- [x] 2025/05
|
|
|
+
|
|
|
+ - [x] add cosyvoice 2.0 vllm support
|
|
|
+
|
|
|
- [x] 2024/12
|
|
|
|
|
|
- [x] 25hz cosyvoice 2.0 released
|
|
|
@@ -126,7 +130,7 @@ import torchaudio
|
|
|
|
|
|
**CosyVoice2 Usage**
|
|
|
```python
|
|
|
-cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, fp16=False)
|
|
|
+cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, load_vllm=False, fp16=False)
|
|
|
|
|
|
# NOTE if you want to reproduce the results on https://funaudiollm.github.io/cosyvoice2, please add text_frontend=False during inference
|
|
|
# zero_shot usage
|
|
|
@@ -159,6 +163,27 @@ for i, j in enumerate(cosyvoice.inference_zero_shot(text_generator(), '希望你
|
|
|
torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
|
|
|
```
|
|
|
|
|
|
+If you want to use vllm for inference, please install `vllm==v0.9.0`. Older vllm versions do not support CosyVoice2 inference.
|
|
|
+
|
|
|
+Notice that `vllm==v0.9.0` has a lot of specific requirements, for example `torch==2.7.0`. You can create a new env so that your old env is not corrupted in case your hardware does not support vllm.
|
|
|
+
|
|
|
+``` sh
|
|
|
+conda create -n cosyvoice_vllm --clone cosyvoice
|
|
|
+pip install vllm==v0.9.0 -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com
|
|
|
+conda activate cosyvoice_vllm
|
|
|
+```
|
|
|
+
|
|
|
+```python
|
|
|
+import sys
|
|
|
+sys.path.append('third_party/Matcha-TTS')
|
|
|
+from cosyvoice.cli.cosyvoice import CosyVoice2
|
|
|
+from vllm import ModelRegistry
|
|
|
+from cosyvoice.vllm.cosyvoice2 import CosyVoice2ForCausalLM
|
|
|
+ModelRegistry.register_model("CosyVoice2ForCausalLM", CosyVoice2ForCausalLM)
|
|
|
+
|
|
|
+cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, load_vllm=True, fp16=False)
|
|
|
+```
|
|
|
+
|
|
|
**CosyVoice Usage**
|
|
|
```python
|
|
|
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=False, load_trt=False, fp16=False)
|