1 maand geleden · bfa835a74b
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
				 
			
 
				 ## 👉🏻 CosyVoice 👈🏻
			
 
				 
			
 
				-**CosyVoice 3.0**: [Demos](https://funaudiollm.github.io/cosyvoice3/); [Paper](https://arxiv.org/abs/2505.17589); [CV3-Eval](https://github.com/FunAudioLLM/CV3-Eval)
			
 
				+**CosyVoice 3.0**: [Demos](https://funaudiollm.github.io/cosyvoice3/); [Paper](https://arxiv.org/abs/2505.17589); [Modelscope](https://www.modelscope.cn/studios/iic/CosyVoice3-0.5B); [CV3-Eval](https://github.com/FunAudioLLM/CV3-Eval)
			
 
				 
			
 
				 **CosyVoice 2.0**: [Demos](https://funaudiollm.github.io/cosyvoice2/); [Paper](https://arxiv.org/abs/2412.10117); [Modelscope](https://www.modelscope.cn/studios/iic/CosyVoice2-0.5B); [HuggingFace](https://huggingface.co/spaces/FunAudioLLM/CosyVoice2-0.5B)
			
 
				 
			
@@ -29,6 +29,11 @@
 
				 
			
 
				 ## Roadmap
			
 
				 
			
 
				+- [x] 2025/12
			
 
				+
			
 
				+    - [x] release cosyvoice3-0.5B base model and its training/inference script
			
 
				+    - [x] release cosyvoice3-0.5B modelscope gradio space
			
 
				+
			
 
				 - [x] 2025/08
			
 
				 
			
 
				     - [x] Thanks to the contribution from NVIDIA Yuekai Zhang, add triton trtllm runtime support and cosyvoice2 grpo training support
			
@@ -96,6 +101,7 @@ We strongly recommend that you download our pretrained `CosyVoice2-0.5B` `CosyVo
 
				 ``` python
			
 
				 # SDK模型下载
			
 
				 from modelscope import snapshot_download
			
 
				+snapshot_download('iic/CosyVoice3-0.5B', local_dir='pretrained_models/CosyVoice3-0.5B')
			
 
				 snapshot_download('iic/CosyVoice2-0.5B', local_dir='pretrained_models/CosyVoice2-0.5B')
			
 
				 snapshot_download('iic/CosyVoice-300M', local_dir='pretrained_models/CosyVoice-300M')
			
 
				 snapshot_download('iic/CosyVoice-300M-SFT', local_dir='pretrained_models/CosyVoice-300M-SFT')
			
@@ -103,16 +109,6 @@ snapshot_download('iic/CosyVoice-300M-Instruct', local_dir='pretrained_models/Co
 
				 snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice-ttsfrd')
			
 
				 ```
			
 
				 
			
 
				-``` sh
			
 
				-# git模型下载，请确保已安装git lfs
			
 
				-mkdir -p pretrained_models
			
 
				-git clone https://www.modelscope.cn/iic/CosyVoice2-0.5B.git pretrained_models/CosyVoice2-0.5B
			
 
				-git clone https://www.modelscope.cn/iic/CosyVoice-300M.git pretrained_models/CosyVoice-300M
			
 
				-git clone https://www.modelscope.cn/iic/CosyVoice-300M-SFT.git pretrained_models/CosyVoice-300M-SFT
			
 
				-git clone https://www.modelscope.cn/iic/CosyVoice-300M-Instruct.git pretrained_models/CosyVoice-300M-Instruct
			
 
				-git clone https://www.modelscope.cn/iic/CosyVoice-ttsfrd.git pretrained_models/CosyVoice-ttsfrd
			
 
				-```
			
 
				-
			
 
				 Optionally, you can unzip the `ttsfrd` resource and install the `ttsfrd` package for better text normalization performance.
			
 
				 
			
 
				 Note that this step is not necessary. If you do not install the `ttsfrd` package, we will use wetext by default.
			
@@ -127,49 +123,9 @@ pip install ttsfrd-0.4.2-cp310-cp310-linux_x86_64.whl
 
				 ### Basic Usage
			
 
				 
			
 
				 We strongly recommend using `CosyVoice2-0.5B` for better performance.
			
 
				-Follow the code below for detailed usage of each model.
			
 
				-
			
 
				-``` python
			
 
				-import sys
			
 
				-sys.path.append('third_party/Matcha-TTS')
			
 
				-from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
			
 
				-from cosyvoice.utils.file_utils import load_wav
			
 
				-import torchaudio
			
 
				-```
			
 
				-
			
 
				-#### CosyVoice2 Usage
			
 
				-```python
			
 
				-cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, load_vllm=False, fp16=False)
			
 
				-
			
 
				-# NOTE if you want to reproduce the results on https://funaudiollm.github.io/cosyvoice2, please add text_frontend=False during inference
			
 
				-# zero_shot usage
			
 
				-prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)
			
 
				-for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物，那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐，笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
			
 
				-    torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
			
 
				-
			
 
				-# save zero_shot spk for future usage
			
 
				-assert cosyvoice.add_zero_shot_spk('希望你以后能够做的比我还好呦。', prompt_speech_16k, 'my_zero_shot_spk') is True
			
 
				-for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物，那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐，笑容如花儿般绽放。', '', '', zero_shot_spk_id='my_zero_shot_spk', stream=False)):
			
 
				-    torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
			
 
				-cosyvoice.save_spkinfo()
			
 
				-
			
 
				-# fine grained control, for supported control, check cosyvoice/tokenizer/tokenizer.py#L248
			
 
				-for i, j in enumerate(cosyvoice.inference_cross_lingual('在他讲述那个荒诞故事的过程中，他突然[laughter]停下来，因为他自己也被逗笑了[laughter]。', prompt_speech_16k, stream=False)):
			
 
				-    torchaudio.save('fine_grained_control_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
			
 
				-
			
 
				-# instruct usage
			
 
				-for i, j in enumerate(cosyvoice.inference_instruct2('收到好友从远方寄来的生日礼物，那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐，笑容如花儿般绽放。', '用四川话说这句话', prompt_speech_16k, stream=False)):
			
 
				-    torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
			
 
				-
			
 
				-# bistream usage, you can use generator as input, this is useful when using text llm model as input
			
 
				-# NOTE you should still have some basic sentence split logic because llm can not handle arbitrary sentence length
			
 
				-def text_generator():
			
 
				-    yield '收到好友从远方寄来的生日礼物，'
			
 
				-    yield '那份意外的惊喜与深深的祝福'
			
 
				-    yield '让我心中充满了甜蜜的快乐，'
			
 
				-    yield '笑容如花儿般绽放。'
			
 
				-for i, j in enumerate(cosyvoice.inference_zero_shot(text_generator(), '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
			
 
				-    torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
			
 
				+Follow the code in `example.py` for detailed usage of each model.
			
 
				+```sh
			
 
				+python example.py
			
 
				 ```
			
 
				 
			
 
				 #### CosyVoice2 vllm Usage
			
@@ -184,36 +140,6 @@ pip install vllm==v0.9.0 transformers==4.51.3 -i https://mirrors.aliyun.com/pypi
 
				 python vllm_example.py
			
 
				 ```
			
 
				 
			
 
				-#### CosyVoice Usage
			
 
				-```python
			
 
				-cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=False, load_trt=False, fp16=False)
			
 
				-# sft usage
			
 
				-print(cosyvoice.list_available_spks())
			
 
				-# change stream=True for chunk stream inference
			
 
				-for i, j in enumerate(cosyvoice.inference_sft('你好，我是通义生成式语音大模型，请问有什么可以帮您的吗？', '中文女', stream=False)):
			
 
				-    torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
			
 
				-
			
 
				-cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M')
			
 
				-# zero_shot usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean
			
 
				-prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)
			
 
				-for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物，那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐，笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
			
 
				-    torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
			
 
				-# cross_lingual usage
			
 
				-prompt_speech_16k = load_wav('./asset/cross_lingual_prompt.wav', 16000)
			
 
				-for i, j in enumerate(cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', prompt_speech_16k, stream=False)):
			
 
				-    torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
			
 
				-# vc usage
			
 
				-prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)
			
 
				-source_speech_16k = load_wav('./asset/cross_lingual_prompt.wav', 16000)
			
 
				-for i, j in enumerate(cosyvoice.inference_vc(source_speech_16k, prompt_speech_16k, stream=False)):
			
 
				-    torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
			
 
				-
			
 
				-cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-Instruct')
			
 
				-# instruct usage, support <laughter></laughter><strong></strong>[laughter][breath]
			
 
				-for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时，他展现了非凡的<strong>勇气</strong>与<strong>智慧</strong>。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.', stream=False)):
			
 
				-    torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
			
 
				-```
			
 
				-
			
 
				 #### Start web demo
			
 
				 
			
 
				 You can use our web demo page to get familiar with CosyVoice quickly.
			
--- a/cosyvoice/cli/cosyvoice.py
+++ b/cosyvoice/cli/cosyvoice.py
@@ -182,7 +182,7 @@ class CosyVoice2(CosyVoice):
 
				         raise NotImplementedError('inference_instruct is not implemented for CosyVoice2!')
			
 
				 
			
 
				     def inference_instruct2(self, tts_text, instruct_text, prompt_wav, zero_shot_spk_id='', stream=False, speed=1.0, text_frontend=True):
			
 
				-        assert isinstance(self.model, CosyVoice2Model), 'inference_instruct2 is only implemented for CosyVoice2!'
			
 
				+        assert isinstance(self.model, CosyVoice2Model) or isinstance(self.model, CosyVoice3Model), 'inference_instruct2 is only implemented for CosyVoice2 and CosyVoice3!'
			
 
				         for i in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)):
			
 
				             model_input = self.frontend.frontend_instruct2(i, instruct_text, prompt_wav, self.sample_rate, zero_shot_spk_id)
			
 
				             start_time = time.time()
			
@@ -194,7 +194,7 @@ class CosyVoice2(CosyVoice):
 
				                 start_time = time.time()
			
 
				 
			
 
				 
			
 
				-class CosyVoice3(CosyVoice):
			
 
				+class CosyVoice3(CosyVoice2):
			
 
				 
			
 
				     def __init__(self, model_dir, load_jit=False, load_trt=False, load_vllm=False, fp16=False, trt_concurrent=1):
			
 
				         self.instruct = True if '-Instruct' in model_dir else False
			
--- a/cosyvoice/utils/common.py
+++ b/cosyvoice/utils/common.py
@@ -25,6 +25,32 @@ import torch
 
				 
			
 
				 IGNORE_ID = -1
			
 
				 
			
 
				+instruct_list = ["You are a helpful assistant. 请用广东话表达。",
			
 
				+                 "You are a helpful assistant. 请用东北话表达。",
			
 
				+                 "You are a helpful assistant. 请用甘肃话表达。",
			
 
				+                 "You are a helpful assistant. 请用贵州话表达。",
			
 
				+                 "You are a helpful assistant. 请用河南话表达。",
			
 
				+                 "You are a helpful assistant. 请用湖北话表达。",
			
 
				+                 "You are a helpful assistant. 请用湖南话表达。",
			
 
				+                 "You are a helpful assistant. 请用江西话表达。",
			
 
				+                 "You are a helpful assistant. 请用闽南话表达。",
			
 
				+                 "You are a helpful assistant. 请用宁夏话表达。",
			
 
				+                 "You are a helpful assistant. 请用山西话表达。",
			
 
				+                 "You are a helpful assistant. 请用陕西话表达。",
			
 
				+                 "You are a helpful assistant. 请用山东话表达。",
			
 
				+                 "You are a helpful assistant. 请用上海话表达。",
			
 
				+                 "You are a helpful assistant. 请用四川话表达。",
			
 
				+                 "You are a helpful assistant. 请用天津话表达。",
			
 
				+                 "You are a helpful assistant. 请用云南话表达。",
			
 
				+                 "You are a helpful assistant. Please say a sentence as loudly as possible.",
			
 
				+                 "You are a helpful assistant. Please say a sentence in a very soft voice.",
			
 
				+                 "You are a helpful assistant. 请用尽可能慢地语速说一句话。",
			
 
				+                 "You are a helpful assistant. 请用尽可能快地语速说一句话。",
			
 
				+                 "You are a helpful assistant. 请非常开心地说一句话。",
			
 
				+                 "You are a helpful assistant. 请非常伤心地说一句话。",
			
 
				+                 "You are a helpful assistant. 请非常生气地说一句话。",
			
 
				+                 "You are a helpful assistant. 我想体验一下小猪佩奇风格，可以吗？",
			
 
				+                 "You are a helpful assistant. 你可以尝试用机器人的方式解答吗？"]
			
 
				 
			
 
				 def pad_list(xs: List[torch.Tensor], pad_value: int):
			
 
				     """Perform padding for the list of tensors.
			
--- a/example.py
+++ b/example.py
@@ -0,0 +1,97 @@
 
				+import sys
			
 
				+sys.path.append('third_party/Matcha-TTS')
			
 
				+from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2, CosyVoice3
			
 
				+from cosyvoice.utils.file_utils import load_wav
			
 
				+import torchaudio
			
 
				+
			
 
				+
			
 
				+def cosyvoice_example():
			
 
				+    """ CosyVoice Usage, check https://fun-audio-llm.github.io/ for more details
			
 
				+    """
			
 
				+    cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=False, load_trt=False, fp16=False)
			
 
				+    # sft usage
			
 
				+    print(cosyvoice.list_available_spks())
			
 
				+    # change stream=True for chunk stream inference
			
 
				+    for i, j in enumerate(cosyvoice.inference_sft('你好，我是通义生成式语音大模型，请问有什么可以帮您的吗？', '中文女', stream=False)):
			
 
				+        torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
			
 
				+
			
 
				+    cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M')
			
 
				+    # zero_shot usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean
			
 
				+    for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物，那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐，笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', stream=False)):
			
 
				+        torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
			
 
				+    # cross_lingual usage
			
 
				+    for i, j in enumerate(cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', './asset/cross_lingual_prompt.wav', stream=False)):
			
 
				+        torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
			
 
				+    # vc usage
			
 
				+    for i, j in enumerate(cosyvoice.inference_vc('./asset/zero_shot_prompt.wav', './asset/cross_lingual_prompt.wav', stream=False)):
			
 
				+        torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
			
 
				+
			
 
				+    cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-Instruct')
			
 
				+    # instruct usage, support <laughter></laughter><strong></strong>[laughter][breath]
			
 
				+    for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时，他展现了非凡的<strong>勇气</strong>与<strong>智慧</strong>。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.', stream=False)):
			
 
				+        torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
			
 
				+
			
 
				+def cosyvoice2_example():
			
 
				+    """ CosyVoice2 Usage, check https://funaudiollm.github.io/cosyvoice2/ for more details
			
 
				+    """
			
 
				+    cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, load_vllm=False, fp16=False)
			
 
				+
			
 
				+    # NOTE if you want to reproduce the results on https://funaudiollm.github.io/cosyvoice2, please add text_frontend=False during inference
			
 
				+    # zero_shot usage
			
 
				+    for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物，那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐，笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', stream=False)):
			
 
				+        torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
			
 
				+
			
 
				+    # save zero_shot spk for future usage
			
 
				+    assert cosyvoice.add_zero_shot_spk('希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', 'my_zero_shot_spk') is True
			
 
				+    for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物，那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐，笑容如花儿般绽放。', '', '', zero_shot_spk_id='my_zero_shot_spk', stream=False)):
			
 
				+        torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
			
 
				+    cosyvoice.save_spkinfo()
			
 
				+
			
 
				+    # fine grained control, for supported control, check cosyvoice/tokenizer/tokenizer.py#L248
			
 
				+    for i, j in enumerate(cosyvoice.inference_cross_lingual('在他讲述那个荒诞故事的过程中，他突然[laughter]停下来，因为他自己也被逗笑了[laughter]。', './asset/zero_shot_prompt.wav', stream=False)):
			
 
				+        torchaudio.save('fine_grained_control_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
			
 
				+
			
 
				+    # instruct usage
			
 
				+    for i, j in enumerate(cosyvoice.inference_instruct2('收到好友从远方寄来的生日礼物，那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐，笑容如花儿般绽放。', '用四川话说这句话', './asset/zero_shot_prompt.wav', stream=False)):
			
 
				+        torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
			
 
				+
			
 
				+    # bistream usage, you can use generator as input, this is useful when using text llm model as input
			
 
				+    # NOTE you should still have some basic sentence split logic because llm can not handle arbitrary sentence length
			
 
				+    def text_generator():
			
 
				+        yield '收到好友从远方寄来的生日礼物，'
			
 
				+        yield '那份意外的惊喜与深深的祝福'
			
 
				+        yield '让我心中充满了甜蜜的快乐，'
			
 
				+        yield '笑容如花儿般绽放。'
			
 
				+    for i, j in enumerate(cosyvoice.inference_zero_shot(text_generator(), '希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', stream=False)):
			
 
				+        torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
			
 
				+
			
 
				+def cosyvoice3_example():
			
 
				+    """ CosyVoice3 Usage, check https://funaudiollm.github.io/cosyvoice3/ for more details
			
 
				+    """
			
 
				+    cosyvoice = CosyVoice3('pretrained_models/CosyVoice3-0.5B', load_jit=False, load_trt=False, fp16=False)
			
 
				+    # zero_shot usage
			
 
				+    for i, j in enumerate(cosyvoice.inference_zero_shot('八百标兵奔北坡，北坡炮兵并排跑，炮兵怕把标兵碰，标兵怕碰炮兵炮。', '希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', stream=False)):
			
 
				+        torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
			
 
				+
			
 
				+    # fine grained control, for supported control, check cosyvoice/tokenizer/tokenizer.py#L280
			
 
				+    for i, j in enumerate(cosyvoice.inference_cross_lingual('[breath]因为他们那一辈人[breath]在乡里面住的要习惯一点，[breath]邻居都很活络，[breath]嗯，都很熟悉。[breath]', './asset/zero_shot_prompt.wav', stream=False)):
			
 
				+        torchaudio.save('fine_grained_control_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
			
 
				+
			
 
				+    # instruct usage
			
 
				+    for i, j in enumerate(cosyvoice.inference_instruct2('好少咯，一般系放嗰啲国庆啊，中秋嗰啲可能会咯。', 'You are a helpful assistant. 请用广东话表达。', './asset/zero_shot_prompt.wav', stream=False)):
			
 
				+        torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
			
 
				+    for i, j in enumerate(cosyvoice.inference_instruct2('收到好友从远方寄来的生日礼物，那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐，笑容如花儿般绽放。', 'You are a helpful assistant. 请用尽可能快地语速说一句话。', './asset/zero_shot_prompt.wav', stream=False)):
			
 
				+        torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
			
 
				+
			
 
				+    # hotfix usage
			
 
				+    for i, j in enumerate(cosyvoice.inference_zero_shot('高管也通过电话、短信、微信等方式对报道[j][ǐ]予好评。', '希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', stream=False)):
			
 
				+        torchaudio.save('hotfix_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
			
 
				+
			
 
				+def main():
			
 
				+    # cosyvoice_example()
			
 
				+    cosyvoice2_example()
			
 
				+    cosyvoice3_example()
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    main()