@@ -17,6 +17,7 @@ import random
 import time
 import threading
 from typing import Dict, Optional, Callable, List, Generator
+import numpy as np
 import torch
 from torch import nn
 import torch.nn.functional as F
@@ -216,7 +217,7 @@ class TransformerLM(torch.nn.Module):
                                                                   att_mask=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]),
                                                                                                  device=lm_input.device)).to(torch.bool))
             logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
-            top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True if i < min_len else False).item()
+            top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True if i < min_len else False)
             if top_ids == self.eos_token:
                 break
             # in stream mode, yield token one by one
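
Why dropping .item() here is safe: sampling_ids now hands back a one-element tensor, and PyTorch gives such tensors a truth value, so the eos check below the sampling call behaves exactly as it did with a plain int. A minimal sketch (illustrative values, not the model's real ids):

    import torch

    eos_token = 2                      # illustrative value, not the real eos id
    top_ids = torch.tensor([2])        # what sampling_ids yields once .item() is dropped
    if top_ids == eos_token:           # one-element tensor -> valid truth value in `if`
        print("eos reached, loop breaks as before")
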
@@ -544,7 +545,7 @@ class Qwen2LM(TransformerLM):
         cache = None
         # NOTE init prompt_text as text_cache as it is basically impossible prompt_speech_token/prompt_text < 15/5
         text_cache = self.llm.model.model.embed_tokens(prompt_text)
-        next_fill_index = -1
+        next_fill_index = (int(prompt_speech_token.shape[1] / self.mix_ratio[1]) + 1) * self.mix_ratio[1] - prompt_speech_token.shape[1]
         for this_text in text:
             text_cache = torch.concat([text_cache, self.llm.model.model.embed_tokens(this_text)], dim=1)
             # prompt_speech_token_emb not empty, try append to lm_input
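
The new initialization schedules the first fill_token so that the prompt's speech tokens are padded out to a whole mix_ratio[1]-sized block, instead of starting from the -1 sentinel. A worked sketch of the arithmetic, assuming mix_ratio = [5, 15] as the 15/5 ratio in the NOTE above suggests:

    def first_fill_index(prompt_speech_len: int, speech_ratio: int = 15) -> int:
        # same arithmetic as the + line above, with // replacing int(x / y)
        return (prompt_speech_len // speech_ratio + 1) * speech_ratio - prompt_speech_len

    assert first_fill_index(63) == 12   # 63 = 4 full blocks + 3; 12 tokens finish block 5
    assert first_fill_index(75) == 15   # exact multiple: a whole new block before the first fill
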
@@ -582,7 +583,7 @@ class Qwen2LM(TransformerLM):
                 top_ids = self.fill_token
                 next_fill_index += (self.mix_ratio[1] + 1)
             else:
-                top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True).item()
+                top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True)
             if top_ids == self.fill_token:
                 next_fill_index = len(out_tokens) + self.mix_ratio[1] + 1
                 logging.info('fill_token index {} next fill_token index {}'.format(len(out_tokens), next_fill_index))
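
After the first fill_token, both branches above keep the schedule periodic: each fill lands mix_ratio[1] + 1 positions after the previous one (one speech block plus the fill_token itself). A small sketch of the resulting positions, under the same assumed mix_ratio = [5, 15]:

    mix_ratio = [5, 15]                       # assumed text/speech interleave ratio
    next_fill_index = 12                      # e.g. first_fill_index(63) from the sketch above
    schedule = []
    for _ in range(3):                        # positions of the first three fill tokens
        schedule.append(next_fill_index)
        next_fill_index += mix_ratio[1] + 1   # one speech block plus the fill_token itself
    assert schedule == [12, 28, 44]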