lyuxiang.lx 1 month ago
parent
commit
56d9876037

+ 2 - 2
cosyvoice/cli/frontend.py

@@ -193,13 +193,13 @@ class CosyVoiceFrontEnd:
         model_input = self.frontend_sft(tts_text, spk_id)
         # in instruct mode, we remove spk_embedding in llm due to information leakage
         del model_input['llm_embedding']
-        instruct_text_token, instruct_text_token_len = self._extract_text_token(instruct_text + '<endofprompt>')
+        instruct_text_token, instruct_text_token_len = self._extract_text_token(instruct_text)
         model_input['prompt_text'] = instruct_text_token
         model_input['prompt_text_len'] = instruct_text_token_len
         return model_input
 
     def frontend_instruct2(self, tts_text, instruct_text, prompt_wav, resample_rate, zero_shot_spk_id):
-        model_input = self.frontend_zero_shot(tts_text, instruct_text + '<|endofprompt|>', prompt_wav, resample_rate, zero_shot_spk_id)
+        model_input = self.frontend_zero_shot(tts_text, instruct_text, prompt_wav, resample_rate, zero_shot_spk_id)
         del model_input['llm_prompt_speech_token']
         del model_input['llm_prompt_speech_token_len']
         return model_input
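
With this change the frontend no longer appends the end-of-prompt marker itself; the caller owns it. A minimal sketch of the new calling convention for instruct2, assuming the standard CosyVoice2 checkpoint and the repo's bundled prompt wav (mirroring example.py further down):

import torchaudio
from cosyvoice.cli.cosyvoice import CosyVoice2

cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B')
# The frontend no longer appends the marker, so it must live in instruct_text:
instruct_text = '用四川话说这句话<|endofprompt|>'
for i, j in enumerate(cosyvoice.inference_instruct2(
        '收到好友从远方寄来的生日礼物。', instruct_text,
        './asset/zero_shot_prompt.wav', stream=False)):
    torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)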

+ 3 - 3
cosyvoice/cli/model.py

@@ -129,7 +129,7 @@ class CosyVoiceModel:
 
     def token2wav(self, token, prompt_token, prompt_feat, embedding, uuid, finalize=False, speed=1.0):
         with torch.cuda.amp.autocast(self.fp16):
-            tts_mel, self.flow_cache_dict[uuid] = self.flow.inference(token=token.to(self.device),
+            tts_mel, self.flow_cache_dict[uuid] = self.flow.inference(token=token.to(self.device, dtype=torch.int32),
                                                                       token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
                                                                       prompt_token=prompt_token.to(self.device),
                                                                       prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
@@ -284,7 +284,7 @@ class CosyVoice2Model(CosyVoiceModel):
 
     def token2wav(self, token, prompt_token, prompt_feat, embedding, token_offset, uuid, stream=False, finalize=False, speed=1.0):
         with torch.cuda.amp.autocast(self.fp16):
-            tts_mel, _ = self.flow.inference(token=token.to(self.device),
+            tts_mel, _ = self.flow.inference(token=token.to(self.device, dtype=torch.int32),
                                              token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
                                              prompt_token=prompt_token.to(self.device),
                                              prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
@@ -413,7 +413,7 @@ class CosyVoice3Model(CosyVoice2Model):
 
     def token2wav(self, token, prompt_token, prompt_feat, embedding, token_offset, uuid, stream=False, finalize=False, speed=1.0):
         with torch.cuda.amp.autocast(self.fp16):
-            tts_mel, _ = self.flow.inference(token=token.to(self.device),
+            tts_mel, _ = self.flow.inference(token=token.to(self.device, dtype=torch.int32),
                                              token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
                                              prompt_token=prompt_token.to(self.device),
                                              prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
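
All three token2wav variants get the same one-line fix: the speech-token tensor is cast to int32 on its way into flow.inference. A plausible motive (an assumption; the commit message is not shown here): with the sampling path now emitting plain Python ints (see llm.py and common.py below), tokens collected into a tensor default to int64, and the explicit cast pins the dtype the flow model expects. A minimal sketch of the pattern:

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Plain Python ints collected from sampling produce an int64 tensor by default...
token = torch.tensor([[12, 7, 901]])
assert token.dtype == torch.int64
# ...so the cast added in this commit pins the dtype in one call:
token = token.to(device, dtype=torch.int32)
assert token.dtype == torch.int32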

+ 3 - 3
cosyvoice/llm/llm.py

@@ -155,7 +155,7 @@ class TransformerLM(torch.nn.Module):
         num_trials, max_trials = 0, 100
         while True:
             top_ids = self.sampling(weighted_scores, decoded_tokens, sampling)
-            if (not ignore_eos) or (self.speech_token_size not in top_ids):
+            if (not ignore_eos) or (top_ids < self.speech_token_size):
                 break
             num_trials += 1
             if num_trials > max_trials:
@@ -506,7 +506,7 @@ class Qwen2LM(TransformerLM):
                                                           masks=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]), device=lm_input.device)).to(torch.bool),
                                                           cache=cache)
                 logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
-                top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True if i < min_len else False).item()
+                top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True if i < min_len else False)
                 if top_ids in self.stop_token_ids:
                     break
                 # in stream mode, yield token one by one
@@ -654,7 +654,7 @@ class CosyVoice3LM(Qwen2LM):
         self.mix_ratio = mix_ratio
 
         # 5. vllm related
-        self.stop_token_ids = [speech_token_size + i for i in range(4)]
+        self.stop_token_ids = [speech_token_size + i for i in range(200)]
         self.vllm_output_queue = {}
 
     def forward(

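These three hunks follow from one decision: the samplers in cosyvoice/utils/common.py (next file) now return a plain Python int instead of a 1-element tensor. sampling_ids therefore swaps the tensor membership test for an integer comparison, and the Qwen2LM call site drops its .item(). Widening stop_token_ids from 4 to 200 IDs above speech_token_size presumably tracks a larger reserved special-token block in CosyVoice3; that reading is an inference, not stated in the diff. A minimal sketch of the updated EOS check, with an illustrative token-size constant:

# Names mirror TransformerLM.sampling_ids; speech_token_size is illustrative.
speech_token_size = 6561
stop_token_ids = [speech_token_size + i for i in range(200)]

def accept_token(top_ids: int, ignore_eos: bool) -> bool:
    # top_ids is a plain int now, so an integer comparison replaces the old
    # `self.speech_token_size not in top_ids` tensor membership test.
    return (not ignore_eos) or (top_ids < speech_token_size)

assert accept_token(100, ignore_eos=True)                      # ordinary speech token
assert not accept_token(speech_token_size, ignore_eos=True)    # EOS-range token, resample
assert speech_token_size + 3 in stop_token_ids
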
+ 28 - 28
cosyvoice/utils/common.py

@@ -25,32 +25,32 @@ import torch
 
 IGNORE_ID = -1
 
-instruct_list = ["You are a helpful assistant. 请用广东话表达。",
-                 "You are a helpful assistant. 请用东北话表达。",
-                 "You are a helpful assistant. 请用甘肃话表达。",
-                 "You are a helpful assistant. 请用贵州话表达。",
-                 "You are a helpful assistant. 请用河南话表达。",
-                 "You are a helpful assistant. 请用湖北话表达。",
-                 "You are a helpful assistant. 请用湖南话表达。",
-                 "You are a helpful assistant. 请用江西话表达。",
-                 "You are a helpful assistant. 请用闽南话表达。",
-                 "You are a helpful assistant. 请用宁夏话表达。",
-                 "You are a helpful assistant. 请用山西话表达。",
-                 "You are a helpful assistant. 请用陕西话表达。",
-                 "You are a helpful assistant. 请用山东话表达。",
-                 "You are a helpful assistant. 请用上海话表达。",
-                 "You are a helpful assistant. 请用四川话表达。",
-                 "You are a helpful assistant. 请用天津话表达。",
-                 "You are a helpful assistant. 请用云南话表达。",
-                 "You are a helpful assistant. Please say a sentence as loudly as possible.",
-                 "You are a helpful assistant. Please say a sentence in a very soft voice.",
-                 "You are a helpful assistant. 请用尽可能慢地语速说一句话。",
-                 "You are a helpful assistant. 请用尽可能快地语速说一句话。",
-                 "You are a helpful assistant. 请非常开心地说一句话。",
-                 "You are a helpful assistant. 请非常伤心地说一句话。",
-                 "You are a helpful assistant. 请非常生气地说一句话。",
-                 "You are a helpful assistant. 我想体验一下小猪佩奇风格,可以吗?",
-                 "You are a helpful assistant. 你可以尝试用机器人的方式解答吗?"]
+instruct_list = ["You are a helpful assistant. 请用广东话表达。<endofprompt>",
+                 "You are a helpful assistant. 请用东北话表达。<endofprompt>",
+                 "You are a helpful assistant. 请用甘肃话表达。<endofprompt>",
+                 "You are a helpful assistant. 请用贵州话表达。<endofprompt>",
+                 "You are a helpful assistant. 请用河南话表达。<endofprompt>",
+                 "You are a helpful assistant. 请用湖北话表达。<endofprompt>",
+                 "You are a helpful assistant. 请用湖南话表达。<endofprompt>",
+                 "You are a helpful assistant. 请用江西话表达。<endofprompt>",
+                 "You are a helpful assistant. 请用闽南话表达。<endofprompt>",
+                 "You are a helpful assistant. 请用宁夏话表达。<endofprompt>",
+                 "You are a helpful assistant. 请用山西话表达。<endofprompt>",
+                 "You are a helpful assistant. 请用陕西话表达。<endofprompt>",
+                 "You are a helpful assistant. 请用山东话表达。<endofprompt>",
+                 "You are a helpful assistant. 请用上海话表达。<endofprompt>",
+                 "You are a helpful assistant. 请用四川话表达。<endofprompt>",
+                 "You are a helpful assistant. 请用天津话表达。<endofprompt>",
+                 "You are a helpful assistant. 请用云南话表达。<endofprompt>",
+                 "You are a helpful assistant. Please say a sentence as loudly as possible.<endofprompt>",
+                 "You are a helpful assistant. Please say a sentence in a very soft voice.<endofprompt>",
+                 "You are a helpful assistant. 请用尽可能慢地语速说一句话。<endofprompt>",
+                 "You are a helpful assistant. 请用尽可能快地语速说一句话。<endofprompt>",
+                 "You are a helpful assistant. 请非常开心地说一句话。<endofprompt>",
+                 "You are a helpful assistant. 请非常伤心地说一句话。<endofprompt>",
+                 "You are a helpful assistant. 请非常生气地说一句话。<endofprompt>",
+                 "You are a helpful assistant. 我想体验一下小猪佩奇风格,可以吗?<endofprompt>",
+                 "You are a helpful assistant. 你可以尝试用机器人的方式解答吗?<endofprompt>"]
 
 def pad_list(xs: List[torch.Tensor], pad_value: int):
     """Perform padding for the list of tensors.
@@ -156,12 +156,12 @@ def nucleus_sampling(weighted_scores, top_p=0.8, top_k=25):
             break
     prob = torch.tensor(prob).to(weighted_scores)
     indices = torch.tensor(indices, dtype=torch.long).to(weighted_scores.device)
-    top_ids = indices[prob.multinomial(1, replacement=True)]
+    top_ids = indices[prob.multinomial(1, replacement=True)].item()
     return top_ids
 
 
 def random_sampling(weighted_scores, decoded_tokens, sampling):
-    top_ids = weighted_scores.softmax(dim=0).multinomial(1, replacement=True)
+    top_ids = weighted_scores.softmax(dim=0).multinomial(1, replacement=True).item()
     return top_ids
 
 
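Both samplers now unwrap their result with .item(), handing back a plain Python int instead of a 1-element tensor; this is the contract the llm.py changes above rely on. A minimal self-contained check:

import torch

def random_sampling(weighted_scores, decoded_tokens, sampling):
    # multinomial returns a 1-element tensor; .item() unwraps it to a Python int
    top_ids = weighted_scores.softmax(dim=0).multinomial(1, replacement=True).item()
    return top_ids

top_ids = random_sampling(torch.randn(10), decoded_tokens=[], sampling=None)
assert isinstance(top_ids, int)  # safe to compare with <, test against lists, etc.
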

+ 7 - 7
example.py

@@ -28,7 +28,7 @@ def cosyvoice_example():
 
     cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-Instruct')
     # instruct usage, support <laughter></laughter><strong></strong>[laughter][breath]
-    for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的<strong>勇气</strong>与<strong>智慧</strong>。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.', stream=False)):
+    for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的<strong>勇气</strong>与<strong>智慧</strong>。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.<|endofprompt|>', stream=False)):
         torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
 
 def cosyvoice2_example():
@@ -52,7 +52,7 @@ def cosyvoice2_example():
         torchaudio.save('fine_grained_control_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
 
     # instruct usage
-    for i, j in enumerate(cosyvoice.inference_instruct2('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '用四川话说这句话', './asset/zero_shot_prompt.wav', stream=False)):
+    for i, j in enumerate(cosyvoice.inference_instruct2('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '用四川话说这句话<|endofprompt|>', './asset/zero_shot_prompt.wav', stream=False)):
         torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
 
     # bistream usage, you can use generator as input, this is useful when using text llm model as input
@@ -70,21 +70,21 @@ def cosyvoice3_example():
     """
     cosyvoice = CosyVoice3('pretrained_models/CosyVoice3-0.5B', load_jit=False, load_trt=False, fp16=False)
     # zero_shot usage
-    for i, j in enumerate(cosyvoice.inference_zero_shot('八百标兵奔北坡,北坡炮兵并排跑,炮兵怕把标兵碰,标兵怕碰炮兵炮。', '希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', stream=False)):
+    for i, j in enumerate(cosyvoice.inference_zero_shot('八百标兵奔北坡,北坡炮兵并排跑,炮兵怕把标兵碰,标兵怕碰炮兵炮。', 'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', stream=False)):
         torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
 
     # fine grained control, for supported control, check cosyvoice/tokenizer/tokenizer.py#L280
-    for i, j in enumerate(cosyvoice.inference_cross_lingual('[breath]因为他们那一辈人[breath]在乡里面住的要习惯一点,[breath]邻居都很活络,[breath]嗯,都很熟悉。[breath]', './asset/zero_shot_prompt.wav', stream=False)):
+    for i, j in enumerate(cosyvoice.inference_cross_lingual('You are a helpful assistant.<|endofprompt|>[breath]因为他们那一辈人[breath]在乡里面住的要习惯一点,[breath]邻居都很活络,[breath]嗯,都很熟悉。[breath]', './asset/zero_shot_prompt.wav', stream=False)):
         torchaudio.save('fine_grained_control_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
 
     # instruct usage
-    for i, j in enumerate(cosyvoice.inference_instruct2('好少咯,一般系放嗰啲国庆啊,中秋嗰啲可能会咯。', 'You are a helpful assistant. 请用广东话表达。', './asset/zero_shot_prompt.wav', stream=False)):
+    for i, j in enumerate(cosyvoice.inference_instruct2('好少咯,一般系放嗰啲国庆啊,中秋嗰啲可能会咯。', 'You are a helpful assistant. 请用广东话表达。<|endofprompt|>', './asset/zero_shot_prompt.wav', stream=False)):
         torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
-    for i, j in enumerate(cosyvoice.inference_instruct2('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', 'You are a helpful assistant. 请用尽可能快地语速说一句话。', './asset/zero_shot_prompt.wav', stream=False)):
+    for i, j in enumerate(cosyvoice.inference_instruct2('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', 'You are a helpful assistant. 请用尽可能快地语速说一句话。<|endofprompt|>', './asset/zero_shot_prompt.wav', stream=False)):
         torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
 
     # hotfix usage
-    for i, j in enumerate(cosyvoice.inference_zero_shot('高管也通过电话、短信、微信等方式对报道[j][ǐ]予好评。', '希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', stream=False)):
+    for i, j in enumerate(cosyvoice.inference_zero_shot('高管也通过电话、短信、微信等方式对报道[j][ǐ]予好评。', 'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', stream=False)):
         torchaudio.save('hotfix_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
 
 def main():

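The example updates all follow one convention: instruct text now ends with an explicit <|endofprompt|> marker, and CosyVoice3 zero-shot prompt transcripts gain a "You are a helpful assistant." system prefix in front of it. A sketch of that convention as a helper (the function is hypothetical, not part of the repo):

ENDOFPROMPT = '<|endofprompt|>'

def build_prompt_text(prompt_transcript: str,
                      system: str = 'You are a helpful assistant.') -> str:
    # Hypothetical helper mirroring what the updated examples inline by hand.
    return system + ENDOFPROMPT + prompt_transcript

print(build_prompt_text('希望你以后能够做的比我还好呦。'))
# You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。
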
+ 0 - 0
examples/libritts/cosyvoice3/conf/cosyvoice2.yaml → examples/libritts/cosyvoice3/conf/cosyvoice3.yaml


+ 5 - 5
examples/libritts/cosyvoice3/run.sh

@@ -7,7 +7,7 @@ stop_stage=3
 
 data_url=www.openslr.org/resources/60
 data_dir=/mnt/lyuxiang.lx/data/tts/openslr/libritts
-pretrained_model_dir=../../../pretrained_models/CosyVoice2-0.5B
+pretrained_model_dir=../../../pretrained_models/CosyVoice3-0.5B
 
 if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
   echo "Data Download"
@@ -36,7 +36,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
   echo "Extract discrete speech token, you will get utt2speech_token.pt in data/$x dir"
   for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
     tools/extract_speech_token.py --dir data/$x \
-      --onnx_path $pretrained_model_dir/speech_tokenizer_v2.onnx
+      --onnx_path $pretrained_model_dir/speech_tokenizer_v3.onnx
   done
 fi
 
@@ -72,14 +72,14 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
         --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \
       cosyvoice/bin/train.py \
       --train_engine $train_engine \
-      --config conf/cosyvoice2.yaml \
+      --config conf/cosyvoice3.yaml \
       --train_data data/train.data.list \
       --cv_data data/dev.data.list \
       --qwen_pretrain_path $pretrained_model_dir/CosyVoice-BlankEN \
       --model $model \
       --checkpoint $pretrained_model_dir/$model.pt \
-      --model_dir `pwd`/exp/cosyvoice2/$model/$train_engine \
-      --tensorboard_dir `pwd`/tensorboard/cosyvoice2/$model/$train_engine \
+      --model_dir `pwd`/exp/cosyvoice3/$model/$train_engine \
+      --tensorboard_dir `pwd`/tensorboard/cosyvoice3/$model/$train_engine \
       --ddp.dist_backend $dist_backend \
       --num_workers ${num_workers} \
       --prefetch ${prefetch} \

+ 1 - 1
vllm_example.py

@@ -24,7 +24,7 @@ def cosyvoice3_example():
     cosyvoice = CosyVoice3('pretrained_models/CosyVoice3-0.5B', load_trt=True, load_vllm=True, fp16=True)
     for i in tqdm(range(100)):
         set_all_random_seed(i)
-        for _, _ in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', stream=False)):
+        for _, _ in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', 'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。', './asset/zero_shot_prompt.wav', stream=False)):
             continue
 
 def main():