|
|
@@ -67,6 +67,8 @@ class CosyVoice:
|
|
|
def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, stream=False, speed=1.0):
|
|
|
prompt_text = self.frontend.text_normalize(prompt_text, split=False)
|
|
|
for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
|
|
|
+ if len(i) < 0.5 * len(prompt_text):
|
|
|
+ logging.warning('synthesis text {} too short than prompt text {}, this may lead to bad performance'.format(i, prompt_text))
|
|
|
model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k)
|
|
|
start_time = time.time()
|
|
|
logging.info('synthesis text {}'.format(i))
|