|
|
@@ -71,6 +71,7 @@ If you are expert in this field, and you are only interested in training your ow
|
|
|
# SDK模型下载
|
|
|
from modelscope import snapshot_download
|
|
|
snapshot_download('iic/CosyVoice-300M', local_dir='pretrained_models/CosyVoice-300M')
|
|
|
+snapshot_download('iic/CosyVoice-300M-25Hz', local_dir='pretrained_models/CosyVoice-300M-25Hz')
|
|
|
snapshot_download('iic/CosyVoice-300M-SFT', local_dir='pretrained_models/CosyVoice-300M-SFT')
|
|
|
snapshot_download('iic/CosyVoice-300M-Instruct', local_dir='pretrained_models/CosyVoice-300M-Instruct')
|
|
|
snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice-ttsfrd')
|
|
|
@@ -80,6 +81,7 @@ snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice
|
|
|
# git模型下载,请确保已安装git lfs
|
|
|
mkdir -p pretrained_models
|
|
|
git clone https://www.modelscope.cn/iic/CosyVoice-300M.git pretrained_models/CosyVoice-300M
|
|
|
+git clone https://www.modelscope.cn/iic/CosyVoice-300M-25Hz.git pretrained_models/CosyVoice-300M-25Hz
|
|
|
git clone https://www.modelscope.cn/iic/CosyVoice-300M-SFT.git pretrained_models/CosyVoice-300M-SFT
|
|
|
git clone https://www.modelscope.cn/iic/CosyVoice-300M-Instruct.git pretrained_models/CosyVoice-300M-Instruct
|
|
|
git clone https://www.modelscope.cn/iic/CosyVoice-ttsfrd.git pretrained_models/CosyVoice-ttsfrd
|
|
|
@@ -118,7 +120,7 @@ print(cosyvoice.list_avaliable_spks())
|
|
|
for i, j in enumerate(cosyvoice.inference_sft('你好,我是通义生成式语音大模型,请问有什么可以帮您的吗?', '中文女', stream=False)):
|
|
|
    torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], 22050)
|
|
|
|
|
|
-cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-25Hz')
|
|
|
+cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-25Hz')  # or use pretrained_models/CosyVoice-300M for 50Hz inference
|
|
|
# zero_shot usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean
|
|
|
prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
|
|
|
for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
|
|
|
@@ -127,18 +129,16 @@ for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来
|
|
|
prompt_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
|
|
|
for i, j in enumerate(cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', prompt_speech_16k, stream=False)):
|
|
|
    torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], 22050)
|
|
|
-
|
|
|
-cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-Instruct')
|
|
|
-# instruct usage, support <laughter></laughter><strong></strong>[laughter][breath]
|
|
|
-for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的<strong>勇气</strong>与<strong>智慧</strong>。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.', stream=False)):
|
|
|
-    torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], 22050)
|
|
|
-
|
|
|
-cosyvoice = CosyVoice('pretrained_models/CosyVoice-VC')
|
|
|
# vc usage
|
|
|
prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
|
|
|
source_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
|
|
|
for i, j in enumerate(cosyvoice.inference_vc(source_speech_16k, prompt_speech_16k, stream=False)):
|
|
|
    torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], 22050)
|
|
|
+
|
|
|
+cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-Instruct')
|
|
|
+# instruct usage, support <laughter></laughter><strong></strong>[laughter][breath]
|
|
|
+for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的<strong>勇气</strong>与<strong>智慧</strong>。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.', stream=False)):
|
|
|
+    torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], 22050)
|
|
|
```
|
|
|
|
|
|
**Start web demo**
|