7 hónapja · afb1a70f7a
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,6 +3,8 @@
 
				 conformer==0.3.2
			
 
				 deepspeed==0.14.2; sys_platform == 'linux'
			
 
				 diffusers==0.29.0
			
 
				+fastapi==0.115.6
			
 
				+fastapi-cli==0.0.4
			
 
				 gdown==5.1.0
			
 
				 gradio==5.4.0
			
 
				 grpcio==1.57.0
			
@@ -34,7 +36,5 @@ torch==2.3.1
 
				 torchaudio==2.3.1
			
 
				 transformers==4.40.1
			
 
				 uvicorn==0.30.0
			
 
				-wget==3.2
			
 
				-fastapi==0.115.6
			
 
				-fastapi-cli==0.0.4
			
 
				 WeTextProcessing==1.0.3
			
 
				+wget==3.2
			
--- a/test1.py
+++ b/test1.py
@@ -0,0 +1,37 @@
 
				+import sys
			
 
				+sys.path.append('third_party/Matcha-TTS')
			
 
				+from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
			
 
				+from cosyvoice.utils.file_utils import load_wav
			
 
				+import torchaudio # type: ignore
			
 
				+
			
 
				+cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, fp16=False, use_flow_cache=False)
			
 
				+
			
 
				+# NOTE: to reproduce the results on https://funaudiollm.github.io/cosyvoice2, pass text_frontend=False during inference.
			
 
				+# Zero-shot usage: each result yielded by inference_zero_shot is saved as zero_shot_<i>.wav.
			
 
				+prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)  # 16000 is presumably the target sample rate in Hz - confirm against load_wav
			
 
				+for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物，那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐，笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
			
 
				+    torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
			
 
				+
			
 
				+# save zero_shot spk for future usage
			
 
				+assert cosyvoice.add_zero_shot_spk('希望你以后能够做的比我还好呦。', prompt_speech_16k, 'my_zero_shot_spk') is True
			
 
				+for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物，那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐，笑容如花儿般绽放。', '', '', zero_shot_spk_id='my_zero_shot_spk', stream=False)):
			
 
				+    torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
			
 
				+cosyvoice.save_spkinfo()
			
 
				+
			
 
				+# Fine-grained control (e.g. the [laughter] markers in the text below); for the supported controls, see cosyvoice/tokenizer/tokenizer.py#L248.
			
 
				+for i, j in enumerate(cosyvoice.inference_cross_lingual('在他讲述那个荒诞故事的过程中，他突然[laughter]停下来，因为他自己也被逗笑了[laughter]。', prompt_speech_16k, stream=False)):
			
 
				+    torchaudio.save('fine_grained_control_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
			
 
				+
			
 
				+# Instruct usage: the second argument is Chinese for "say this sentence in Sichuanese" (presumably the instruction text - confirm against inference_instruct2).
			
 
				+for i, j in enumerate(cosyvoice.inference_instruct2('收到好友从远方寄来的生日礼物，那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐，笑容如花儿般绽放。', '用四川话说这句话', prompt_speech_16k, stream=False)):
			
 
				+    torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
			
 
				+
			
 
				+# Bistream usage: you can pass a generator as the text input, which is useful when the text comes from a text LLM.
			
 
				+# NOTE: you should still apply some basic sentence-splitting logic, because the LLM cannot handle arbitrary sentence lengths.
			
 
				+def text_generator():  # generator that yields one sentence as four consecutive text fragments
			
 
				+    yield '收到好友从远方寄来的生日礼物，'
			
 
				+    yield '那份意外的惊喜与深深的祝福'
			
 
				+    yield '让我心中充满了甜蜜的快乐，'
			
 
				+    yield '笑容如花儿般绽放。'
			
 
				+for i, j in enumerate(cosyvoice.inference_zero_shot(text_generator(), '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
			
 
				+    torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)