@@ -113,7 +113,7 @@ class MaskedDiffWithXvec(torch.nn.Module):
# concat text and prompt_text
token_len1, token_len2 = prompt_token.shape[1], token.shape[1]
token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len
- mask = (~make_pad_mask(token_len)).to(embedding.dtype).unsqueeze(-1).to(embedding)
+ mask = (~make_pad_mask(token_len)).unsqueeze(-1).to(embedding)
token = self.input_embedding(torch.clamp(token, min=0)) * mask
# text encode
@@ -14,8 +14,6 @@
import torch
import torch.nn.functional as F
from matcha.models.components.flow_matching import BASECFM
-import onnxruntime as ort
-import numpy as np
class ConditionalCFM(BASECFM):
def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, estimator: torch.nn.Module = None):