1 год назад · fbab274b6a
--- a/cosyvoice/dataset/processor.py
+++ b/cosyvoice/dataset/processor.py
@@ -159,6 +159,7 @@ def truncate(data, truncate_length=24576, mode='train'):
 
				 
			
 
				 def compute_fbank(data,
			
 
				                   feat_extractor,
			
 
				+                  token_mel_ratio=0,
			
 
				                   mode='train'):
			
 
				     """ Extract fbank
			
 
				 
			
@@ -174,8 +175,13 @@ def compute_fbank(data,
 
				         assert 'utt' in sample
			
 
				         assert 'text_token' in sample
			
 
				         waveform = sample['speech']
			
 
				-        mat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1)
			
 
				-        sample['speech_feat'] = mat
			
 
				+        feat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1)
			
 
				+        if token_mel_ratio != 0:
			
 
				+            # trim to align speech_token and speech_feat
			
 
				+            token_len = int(min(feat.shape[0] / token_mel_ratio, sample["speech_token"].shape[0]))
			
 
				+            feat = feat[:token_mel_ratio * token_len]
			
 
				+            sample["speech_token"] = sample["speech_token"][:token_len]
			
 
				+        sample['speech_feat'] = feat
			
 
				         yield sample