|
@@ -2,7 +2,8 @@ import argparse
|
|
|
import logging
|
|
|
import os
|
|
|
from tqdm import tqdm
|
|
|
-import torch, torchaudio
|
|
|
+import torch
|
|
|
+import torchaudio
|
|
|
from cosyvoice.cli.cosyvoice import CosyVoice2
|
|
|
from cosyvoice.utils.file_utils import load_wav
|
|
|
|
|
@@ -30,7 +31,7 @@ def main():
|
|
|
if prompt_speech_16k.shape[1] >= 30 * 16000:
|
|
|
continue
|
|
|
speech_list = []
|
|
|
- for i, j in enumerate(cosyvoice.inference_zero_shot(utt2text[utt], utt2text[utt], prompt_speech_16k, stream=False, text_frontend=False)):
|
|
|
+ for _, j in enumerate(cosyvoice.inference_zero_shot(utt2text[utt], utt2text[utt], prompt_speech_16k, stream=False, text_frontend=False)):
|
|
|
speech_list.append(j['tts_speech'])
|
|
|
negative_wav = os.path.abspath('{}/wav/{}'.format(args.des_dir, os.path.basename(wav)))
|
|
|
torchaudio.save(negative_wav, torch.concat(speech_list, dim=1), cosyvoice.sample_rate, backend='soundfile')
|