Procházet zdrojové kódy

Merge pull request #182 from v3ucn/speed_change_sox_version

Speed change sox version
Xiang Lyu před 1 rokem
rodič
revize
2895d99b9a
3 změněné soubory, 28 přidání a 7 odebrání
  1. 1 1
      README.md
  2. 12 0
      cosyvoice/utils/file_utils.py
  3. 15 6
      webui.py

+ 1 - 1
README.md

@@ -156,4 +156,4 @@ You can also scan the QR code to join our official Dingding chat group.
 5. We borrowed a lot of code from [WeNet](https://github.com/wenet-e2e/wenet).
 
 ## Disclaimer
-The content provided above is for academic purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal.
+The content provided above is for academic purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal.

+ 12 - 0
cosyvoice/utils/file_utils.py

@@ -39,3 +39,15 @@ def load_wav(wav, target_sr):
         assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr)
         speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech)
     return speech
+
+def speed_change(waveform, sample_rate, speed_factor: str):  # run sox "tempo" + "rate" effects on waveform
+    effects = [
+        ["tempo", speed_factor],  # sox "tempo" argument; passed through verbatim, so it must already be a string
+        ["rate", f"{sample_rate}"]  # sox "rate" argument: the same sample_rate that is given as the input rate below
+    ]
+    augmented_waveform, new_sample_rate = torchaudio.sox_effects.apply_effects_tensor(
+        waveform,
+        sample_rate,
+        effects
+    )
+    return augmented_waveform, new_sample_rate  # the (waveform, sample rate) pair returned by apply_effects_tensor

+ 15 - 6
webui.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Liu Yue)
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -28,7 +28,7 @@ import logging
 logging.getLogger('matplotlib').setLevel(logging.WARNING)
 
 from cosyvoice.cli.cosyvoice import CosyVoice
-from cosyvoice.utils.file_utils import load_wav
+from cosyvoice.utils.file_utils import load_wav, speed_change
 
 logging.basicConfig(level=logging.DEBUG,
                     format='%(asctime)s %(levelname)s %(message)s')
@@ -66,7 +66,7 @@ instruct_dict = {'预训练音色': '1. 选择预训练音色\n2. 点击生成
 def change_instruction(mode_checkbox_group):
     return instruct_dict[mode_checkbox_group]
 
-def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed):
+def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed, speed_factor):
     if prompt_wav_upload is not None:
         prompt_wav = prompt_wav_upload
     elif prompt_wav_record is not None:
@@ -132,7 +132,16 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro
         logging.info('get instruct inference request')
         set_all_random_seed(seed)
         output = cosyvoice.inference_instruct(tts_text, sft_dropdown, instruct_text)
-    audio_data = output['tts_speech'].numpy().flatten()
+    
+    if speed_factor != 1.0:
+        try:
+            audio_data, sample_rate = speed_change(output["tts_speech"], target_sr, str(speed_factor))
+            audio_data = audio_data.numpy().flatten()
+        except Exception as e:
+            print(f"Failed to change speed of audio: \n{e}")
+    else:
+        audio_data = output['tts_speech'].numpy().flatten()
+
     return (target_sr, audio_data)
 
 def main():
@@ -141,7 +150,7 @@ def main():
         gr.Markdown("#### 请输入需要合成的文本,选择推理模式,并按照提示步骤进行操作")
 
         tts_text = gr.Textbox(label="输入合成文本", lines=1, value="我是通义实验室语音团队全新推出的生成式语音大模型,提供舒适自然的语音合成能力。")
-
+        speed_factor = gr.Slider(minimum=0.25, maximum=4, step=0.05, label="语速调节", value=1.0, interactive=True)
         with gr.Row():
             mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='选择推理模式', value=inference_mode_list[0])
             instruction_text = gr.Text(label="操作步骤", value=instruct_dict[inference_mode_list[0]], scale=0.5)
@@ -162,7 +171,7 @@ def main():
 
         seed_button.click(generate_seed, inputs=[], outputs=seed)
         generate_button.click(generate_audio,
-                              inputs=[tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed],
+                              inputs=[tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed, speed_factor],
                               outputs=[audio_output])
         mode_checkbox_group.change(fn=change_instruction, inputs=[mode_checkbox_group], outputs=[instruction_text])
     demo.queue(max_size=4, default_concurrency_limit=2)