1 年之前 · 6d22d0b76f
--- a/cosyvoice/cli/cosyvoice.py
+++ b/cosyvoice/cli/cosyvoice.py
@@ -67,6 +67,8 @@ class CosyVoice:
 
				     def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, stream=False, speed=1.0):
			
 
				         prompt_text = self.frontend.text_normalize(prompt_text, split=False)
			
 
				         for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
			
 
				+            if len(i) < 0.5 * len(prompt_text):
			
 
				+                logging.warning('synthesis text {} too short than prompt text {}, this may lead to bad performance'.format(i, prompt_text))
			
 
				             model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k)
			
 
				             start_time = time.time()
			
 
				             logging.info('synthesis text {}'.format(i))
			
--- a/cosyvoice/llm/llm.py
+++ b/cosyvoice/llm/llm.py
@@ -202,6 +202,9 @@ class TransformerLM(torch.nn.Module):
 
				                                                                   att_mask=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]),
			
 
				                                                                                                  device=lm_input.device)).to(torch.bool))
			
 
				             logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
			
 
				+            # mask the stop token (id == speech_token_size) at step 0 so decoding cannot end on the very first token
			
 
				+            if i == 0:
			
 
				+                logp[:, self.speech_token_size] = -float('inf')
			
 
				             top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True if i < min_len else False).item()
			
 
				             if top_ids == self.speech_token_size:
			
 
				                 break
			
--- a/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml
+++ b/examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml
@@ -141,7 +141,7 @@ mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
 
				     hop_size: 256
			
 
				     win_size: 1024
			
 
				     fmin: 0
			
 
				-    fmax: 8000
			
 
				+    fmax: null
			
 
				     center: False
			
 
				 hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
			
 
				     generator: !ref <hift>
			
--- a/examples/libritts/cosyvoice/conf/cosyvoice.yaml
+++ b/examples/libritts/cosyvoice/conf/cosyvoice.yaml
@@ -141,7 +141,7 @@ mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
 
				     hop_size: 256
			
 
				     win_size: 1024
			
 
				     fmin: 0
			
 
				-    fmax: 8000
			
 
				+    fmax: null
			
 
				     center: False
			
 
				 hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
			
 
				     generator: !ref <hift>
			
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,6 +6,7 @@ gdown==5.1.0
 
				 gradio==4.32.2
			
 
				 grpcio==1.57.0
			
 
				 grpcio-tools==1.57.0
			
 
				+huggingface-hub==0.23.5
			
 
				 hydra-core==1.3.2
			
 
				 HyperPyYAML==1.2.2
			
 
				 inflect==7.3.1