1 éve · e141634da1
--- a/cosyvoice/flow/flow.py
+++ b/cosyvoice/flow/flow.py
@@ -113,7 +113,7 @@ class MaskedDiffWithXvec(torch.nn.Module):
         # concat text and prompt_text
         token_len1, token_len2 = prompt_token.shape[1], token.shape[1]
         token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len
-        mask = (~make_pad_mask(token_len)).to(embedding.dtype).unsqueeze(-1).to(embedding)
+        mask = (~make_pad_mask(token_len)).unsqueeze(-1).to(embedding)
         token = self.input_embedding(torch.clamp(token, min=0)) * mask
 
         # text encode
--- a/cosyvoice/flow/flow_matching.py
+++ b/cosyvoice/flow/flow_matching.py
@@ -14,8 +14,6 @@
 import torch
 import torch.nn.functional as F
 from matcha.models.components.flow_matching import BASECFM
-import onnxruntime as ort
-import numpy as np
 
 class ConditionalCFM(BASECFM):
     def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, estimator: torch.nn.Module = None):