před 1 rokem · 2895d99b9a
--- a/README.md
+++ b/README.md
@@ -156,4 +156,4 @@ You can also scan the QR code to join our official Dingding chat group.
 
				 5. We borrowed a lot of code from [WeNet](https://github.com/wenet-e2e/wenet).
			
 
				 
			
 
				 ## Disclaimer
			
 
				-The content provided above is for academic purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal.
			
 
				+The content provided above is for academic purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal.
			
--- a/cosyvoice/utils/file_utils.py
+++ b/cosyvoice/utils/file_utils.py
@@ -39,3 +39,15 @@ def load_wav(wav, target_sr):
 
				         assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr)
			
 
				         speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech)
			
 
				     return speech
			
 
				+
			
 
				def speed_change(waveform, sample_rate, speed_factor: str):
				    """Apply the sox ``tempo`` effect to ``waveform`` and return the result.

				    The effect chain is ``tempo <speed_factor>`` followed by
				    ``rate <sample_rate>``, run through
				    ``torchaudio.sox_effects.apply_effects_tensor``.

				    Args:
				        waveform: Audio tensor forwarded unchanged to
				            ``apply_effects_tensor``.
				        sample_rate: Sample rate of ``waveform``; also used as the
				            argument of the ``rate`` effect.
				        speed_factor: Tempo multiplier. A numeric string (the original
				            contract) or any value accepted by ``float()``.

				    Returns:
				        The ``(augmented_waveform, new_sample_rate)`` pair produced by
				        ``apply_effects_tensor``.

				    Raises:
				        ValueError: If ``speed_factor`` is not a number, or is not a
				            positive finite value.
				    """
				    # Validate before touching sox so a bad factor is reported clearly
				    # instead of being passed on to the audio backend.
				    try:
				        factor = float(speed_factor)
				    except (TypeError, ValueError) as err:
				        raise ValueError(
				            f"speed_factor must be a number, got {speed_factor!r}"
				        ) from err
				    # NaN fails both comparisons, so it is rejected here together with
				    # zero, negative and infinite values.
				    if not 0 < factor < float("inf"):
				        raise ValueError(
				            f"speed_factor must be a positive finite number, got {speed_factor!r}"
				        )

				    effects = [
				        ["tempo", str(factor)],  # sox effect arguments are strings
				        ["rate", f"{sample_rate}"],
				    ]
				    augmented_waveform, new_sample_rate = torchaudio.sox_effects.apply_effects_tensor(
				        waveform,
				        sample_rate,
				        effects,
				    )
				    return augmented_waveform, new_sample_rate
			
--- a/webui.py
+++ b/webui.py
@@ -1,4 +1,4 @@
 
				-# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
			
 
				+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Liu Yue)
			
 
				 #
			
 
				 # Licensed under the Apache License, Version 2.0 (the "License");
			
 
				 # you may not use this file except in compliance with the License.
			
@@ -28,7 +28,7 @@ import logging
 
				 logging.getLogger('matplotlib').setLevel(logging.WARNING)
			
 
				 
			
 
				 from cosyvoice.cli.cosyvoice import CosyVoice
			
 
				-from cosyvoice.utils.file_utils import load_wav
			
 
				+from cosyvoice.utils.file_utils import load_wav, speed_change
			
 
				 
			
 
				 logging.basicConfig(level=logging.DEBUG,
			
 
				                     format='%(asctime)s %(levelname)s %(message)s')
			
@@ -66,7 +66,7 @@ instruct_dict = {'预训练音色': '1. 选择预训练音色\n2. 点击生成
 
				 def change_instruction(mode_checkbox_group):
			
 
				     return instruct_dict[mode_checkbox_group]
			
 
				 
			
 
				-def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed):
			
 
				+def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed, speed_factor):
			
 
				     if prompt_wav_upload is not None:
			
 
				         prompt_wav = prompt_wav_upload
			
 
				     elif prompt_wav_record is not None:
			
@@ -132,7 +132,16 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro
 
				         logging.info('get instruct inference request')
			
 
				         set_all_random_seed(seed)
			
 
				         output = cosyvoice.inference_instruct(tts_text, sft_dropdown, instruct_text)
			
 
				-    audio_data = output['tts_speech'].numpy().flatten()
			
 
				+    
			
 
				+    if speed_factor != 1.0:
			
 
				+        try:
			
 
				+            audio_data, sample_rate = speed_change(output["tts_speech"], target_sr, str(speed_factor))
			
 
				+            audio_data = audio_data.numpy().flatten()
			
 
				+        except Exception as e:
			
 
				+            print(f"Failed to change speed of audio: \n{e}")
			
 
				+    else:
			
 
				+        audio_data = output['tts_speech'].numpy().flatten()
			
 
				+
			
 
				     return (target_sr, audio_data)
			
 
				 
			
 
				 def main():
			
@@ -141,7 +150,7 @@ def main():
 
				         gr.Markdown("#### 请输入需要合成的文本，选择推理模式，并按照提示步骤进行操作")
			
 
				 
			
 
				         tts_text = gr.Textbox(label="输入合成文本", lines=1, value="我是通义实验室语音团队全新推出的生成式语音大模型，提供舒适自然的语音合成能力。")
			
 
				-
			
 
				+        speed_factor = gr.Slider(minimum=0.25, maximum=4, step=0.05, label="语速调节", value=1.0, interactive=True)
			
 
				         with gr.Row():
			
 
				             mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='选择推理模式', value=inference_mode_list[0])
			
 
				             instruction_text = gr.Text(label="操作步骤", value=instruct_dict[inference_mode_list[0]], scale=0.5)
			
@@ -162,7 +171,7 @@ def main():
 
				 
			
 
				         seed_button.click(generate_seed, inputs=[], outputs=seed)
			
 
				         generate_button.click(generate_audio,
			
 
				-                              inputs=[tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed],
			
 
				+                              inputs=[tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed, speed_factor],
			
 
				                               outputs=[audio_output])
			
 
				         mode_checkbox_group.change(fn=change_instruction, inputs=[mode_checkbox_group], outputs=[instruction_text])
			
 
				     demo.queue(max_size=4, default_concurrency_limit=2)