瀏覽代碼

[feature] modify pad to trim

Conflicts:
	cosyvoice/dataset/processor.py
burkliu 7 月之前
父節點
當前提交
fbab274b6a
共有 1 個文件被更改,包括 8 次插入2 次删除
  1. 8 2
      cosyvoice/dataset/processor.py

+ 8 - 2
cosyvoice/dataset/processor.py

@@ -159,6 +159,7 @@ def truncate(data, truncate_length=24576, mode='train'):
 
 def compute_fbank(data,
                   feat_extractor,
+                  token_mel_ratio=0,
                   mode='train'):
     """ Extract fbank
 
@@ -174,8 +175,13 @@ def compute_fbank(data,
         assert 'utt' in sample
         assert 'text_token' in sample
         waveform = sample['speech']
-        mat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1)
-        sample['speech_feat'] = mat
+        feat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1)
+        if token_mel_ratio != 0:
+            # trim to align speech_token and speech_feat
+            token_len = int(min(feat.shape[0] / token_mel_ratio, sample["speech_token"].shape[0]))
+            feat = feat[:token_mel_ratio * token_len]
+            sample["speech_token"] = sample["speech_token"][:token_len]
+        sample['speech_feat'] = feat
         yield sample