@@ -40,6 +40,8 @@ class CosyVoiceModel:
         # hift cache
         self.mel_cache_len = 20
         self.source_cache_len = int(self.mel_cache_len * 256)
+        # speech fade in out
+        self.speech_window = np.hamming(2 * self.source_cache_len)
         # rtf and decoding related
         self.stream_scale_factor = 1
         assert self.stream_scale_factor >= 1, 'stream_scale_factor should be greater than 1, change it according to your actual rtf'
@@ -114,13 +116,19 @@ class CosyVoiceModel:
             self.mel_overlap_dict[uuid] = tts_mel[:, :, -self.mel_overlap_len:]
             tts_mel = tts_mel[:, :, :-self.mel_overlap_len]
             tts_speech, tts_source = self.hift.inference(mel=tts_mel, cache_source=hift_cache_source)
-            self.hift_cache_dict[uuid] = {'source': tts_source[:, :, -self.source_cache_len:], 'mel': tts_mel[:, :, -self.mel_cache_len:]}
+            if self.hift_cache_dict[uuid] is not None:
+                tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
+            self.hift_cache_dict[uuid] = {'mel': tts_mel[:, :, -self.mel_cache_len:],
+                                          'source': tts_source[:, :, -self.source_cache_len:],
+                                          'speech': tts_speech[:, -self.source_cache_len:]}
             tts_speech = tts_speech[:, :-self.source_cache_len]
         else:
             if speed != 1.0:
                 assert self.hift_cache_dict[uuid] is None, 'speed change only support non-stream inference mode'
                 tts_mel = F.interpolate(tts_mel, size=int(tts_mel.shape[2] / speed), mode='linear')
             tts_speech, tts_source = self.hift.inference(mel=tts_mel, cache_source=hift_cache_source)
+            if self.hift_cache_dict[uuid] is not None:
+                tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
         return tts_speech

     def inference(self, text, flow_embedding, llm_embedding=torch.zeros(0, 192),