1 year ago · e141634da1
--- a/cosyvoice/flow/flow.py
+++ b/cosyvoice/flow/flow.py
@@ -113,7 +113,7 @@ class MaskedDiffWithXvec(torch.nn.Module):
 
				         # concat text and prompt_text
			
 
				         token_len1, token_len2 = prompt_token.shape[1], token.shape[1]
			
 
				         token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len
			
 
				-        mask = (~make_pad_mask(token_len)).to(embedding.dtype).unsqueeze(-1).to(embedding)
			
 
				+        mask = (~make_pad_mask(token_len)).unsqueeze(-1).to(embedding)
			
 
				         token = self.input_embedding(torch.clamp(token, min=0)) * mask
			
 
				 
			
 
				         # text encode
			
--- a/cosyvoice/flow/flow_matching.py
+++ b/cosyvoice/flow/flow_matching.py
@@ -14,8 +14,6 @@
 
				 import torch
			
 
				 import torch.nn.functional as F
			
 
				 from matcha.models.components.flow_matching import BASECFM
			
 
				-import onnxruntime as ort
			
 
				-import numpy as np
			
 
				 
			
 
				 class ConditionalCFM(BASECFM):
			
 
				     def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, estimator: torch.nn.Module = None):