4 months ago · ff0d05c380
--- a/cosyvoice/llm/llm.py
+++ b/cosyvoice/llm/llm.py
@@ -698,7 +698,7 @@ class CosyVoice3LM(Qwen2LM):
 
				         lm_output, lm_output_mask = self.llm(lm_input, lm_input_len.to(device))
			
 
				         logits = self.llm_decoder(lm_output)
			
 
				         loss = self.criterion_ce(logits, lm_target.to(device))
			
 
				-        acc = th_accuracy(logits.view(-1, self.speech_token_size + 3), lm_target, ignore_label=IGNORE_ID)
			
 
				+        acc = th_accuracy(logits.view(-1, self.speech_token_size + 200), lm_target, ignore_label=IGNORE_ID)
			
 
				         return {'loss': loss, 'acc': acc}
			
 
				 
			
 
				     @torch.inference_mode()
			
--- a/examples/libritts/cosyvoice3/conf/cosyvoice3.yaml
+++ b/examples/libritts/cosyvoice3/conf/cosyvoice3.yaml
@@ -20,7 +20,7 @@ num_decoding_left_chunks: -1 # streaming inference flow decoder left chunk size,
 
				 # model params
			
 
				 # for all class/function included in this repo, we use !<name> or !<new> for initialization, so that user may find all corresponding class/function according to one single yaml.
			
 
				 # for system/third_party class/function, we do not require this.
			
 
				-llm: !new:cosyvoice.llm.llm.Qwen2LM
			
 
				+llm: !new:cosyvoice.llm.llm.CosyVoice3LM
			
 
				     llm_input_size: !ref <llm_input_size>
			
 
				     llm_output_size: !ref <llm_output_size>
			
 
				     speech_token_size: 6561
			
@@ -231,4 +231,4 @@ train_conf_gan:
 
				     grad_clip: 5
			
 
				     accum_grad: 1 # in gan training, accum_grad must be 1
			
 
				     log_interval: 100
			
 
				-    save_per_step: -1
			
 
				+    save_per_step: -1