
+ 87 - 0
runtime/triton_trtllm/README.Cosyvoice3.md

@@ -0,0 +1,87 @@
+## Accelerating CosyVoice3 with NVIDIA Triton Inference Server and TensorRT-LLM
+
+Contributed by Yuekai Zhang (NVIDIA).
+
+### Quick Start
+
+Launch the service directly with Docker Compose:
+```sh
+docker compose -f docker-compose.cosyvoice3.yml up
+```
+
+### Build the Docker Image
+
+To build the image from scratch:
+```sh
+docker build . -f Dockerfile.server -t soar97/triton-cosyvoice:25.06
+```
+
+### Run a Docker Container
+```sh
+your_mount_dir=/mnt:/mnt
+docker run -it --name "cosyvoice-server" --gpus all --net host -v $your_mount_dir --shm-size=2g soar97/triton-cosyvoice:25.06
+```
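+
+Once inside the container, you can sanity-check that the GPU was passed through correctly:
+```sh
+# should list the GPU(s) made visible by --gpus all
+nvidia-smi
+```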
+
+### Understanding `run_cosyvoice3.sh`
+
+The `run_cosyvoice3.sh` script orchestrates the entire workflow through numbered stages.
+
+You can run a subset of stages with:
+```sh
+bash run_cosyvoice3.sh <start_stage> <stop_stage>
+```
+- `<start_stage>`: The stage to start from.
+- `<stop_stage>`: The stage to stop after.
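+
+For example, `bash run_cosyvoice3.sh -1 2` prepares everything (clone, checkpoint download, engine build, model repository) without launching the server.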
+
+**Stages:**
+
+- **Stage -1**: Clones the `CosyVoice` repository.
+- **Stage 0**: Downloads the `Fun-CosyVoice3-0.5B-2512` model and its HuggingFace LLM checkpoint.
+- **Stage 1**: Converts the HuggingFace checkpoint for the LLM to the TensorRT-LLM format and builds the TensorRT engines.
+- **Stage 2**: Creates the Triton model repository, including configurations for `cosyvoice3`, `token2wav`, `vocoder`, `audio_tokenizer`, and `speaker_embedding`.
+- **Stage 3**: Launches the Triton Inference Server for the Token2Wav module and deploys the CosyVoice3 LLM via `trtllm-serve`.
+- **Stage 4**: Runs the gRPC benchmark client for performance testing.
+- **Stage 5**: Runs the offline TTS inference benchmark.
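+
+Each stage is wrapped in a simple range check, which is what makes running any contiguous subset possible. The pattern, taken from the script itself (stage body elided here):
+```sh
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+    # ... stage 2 work ...
+fi
+```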
+
+### Export Models and Launch Server
+
+Inside the Docker container, prepare the models and start the Triton server by running stages 0-3:
+```sh
+# This command runs stages 0, 1, 2, and 3
+bash run_cosyvoice3.sh 0 3
+```
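+
+Once stage 3 finishes, the Triton HTTP endpoint (port 8000 by default) should report ready via the standard KServe v2 health API:
+```sh
+# returns HTTP 200 once all Triton models are loaded
+curl -sf http://localhost:8000/v2/health/ready && echo "server ready"
+```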
+
+### Benchmark in Client-Server Mode
+
+To benchmark the running Triton server, run stage 4:
+```sh
+bash run_cosyvoice3.sh 4 4
+
+# You can customize parameters such as the number of tasks inside the script.
+```
+The following results were obtained on a single NVIDIA L20 GPU.
+
+#### Streaming TTS (Concurrent Tasks = 4)
+
+**First Chunk Latency**
+
+| Concurrent Tasks | Average (ms) | 50th Percentile (ms) | 90th Percentile (ms) | 95th Percentile (ms) | 99th Percentile (ms) |
+| ---------------- | ------------ | -------------------- | -------------------- | -------------------- | -------------------- |
+| 4                | 750.42       | 740.31               | 941.05               | 977.55               | 1002.37              |
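+
+First-chunk latency is, presumably, measured from request submission to receipt of the first streamed audio chunk, i.e., the delay before playback can begin.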
+
+### Benchmark in Offline Inference Mode
+
+To benchmark offline inference, run stage 5:
+```sh
+bash run_cosyvoice3.sh 5 5
+```
+
+#### Offline TTS (CosyVoice3 0.5B LLM + Token2Wav with TensorRT)
+
+| Backend | Batch Size | llm_time (s) | token2wav_time (s) | pipeline_time (s) | RTF    |
+|---------|------------|--------------|--------------------|--------------------|--------|
+| TRTLLM  | 1          | 13.21        | 5.72               | 19.48              | 0.1091 |
+| TRTLLM  | 2          | 8.46         | 6.02               | 14.91              | 0.0822 |
+| TRTLLM  | 4          | 5.07         | 5.95               | 11.43              | 0.0630 |
+| TRTLLM  | 8          | 2.98         | 6.11               | 9.53               | 0.0562 |
+| TRTLLM  | 16         | 2.12         | 6.27               | 8.83               | 0.0501 |
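+
+RTF (real-time factor) here is total processing time divided by the duration of the synthesized audio, so lower is better; e.g., at batch size 1, a pipeline time of 19.48 s at an RTF of 0.1091 corresponds to roughly 19.48 / 0.1091 ≈ 178.6 s of generated audio.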

+ 37 - 0
runtime/triton_trtllm/README.md

@@ -0,0 +1,37 @@
+# Accelerating CosyVoice with NVIDIA Triton Inference Server and TensorRT-LLM
+
+Contributed by Yuekai Zhang (NVIDIA).
+
+This repository provides three acceleration solutions for CosyVoice, each targeting a different model version and Token2Wav architecture. All solutions use TensorRT-LLM for LLM acceleration and NVIDIA Triton Inference Server for serving.
+
+## Solutions
+
+### [CosyVoice3](README.Cosyvoice3.md)
+
+Acceleration solution for [Fun-CosyVoice3-0.5B-2512](https://huggingface.co/FunAudioLLM/Fun-CosyVoice3-0.5B-2512), the latest CosyVoice model. The pipeline includes `audio_tokenizer`, `speaker_embedding`, `token2wav`, and `vocoder` modules managed by Triton, with the LLM served via `trtllm-serve`.
+
+### [CosyVoice2 + UNet Token2Wav](README.Cosyvoice2.Unet.md)
+
+The baseline acceleration solution for CosyVoice2, using the original UNet-based flow-matching Token2Wav module.
+
+### [CosyVoice2 + DiT Token2Wav](README.Cosyvoice2.DiT.md)
+
+Replaces the UNet Token2Wav with a DiT-based Token2Wav module from [Step-Audio2](https://github.com/stepfun-ai/Step-Audio-2). Supports disaggregated deployment where the LLM and Token2Wav run on separate GPUs for better resource utilization under high concurrency.
+
+## Quick Start
+
+Each solution can be launched with a single Docker Compose command:
+
+```sh
+# CosyVoice3
+docker compose -f docker-compose.cosyvoice3.yml up
+
+# CosyVoice2 + UNet Token2Wav
+docker compose -f docker-compose.cosyvoice2.unet.yml up
+
+# CosyVoice2 + DiT Token2Wav
+docker compose -f docker-compose.cosyvoice2.dit.yml up
+```
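+
+Note that `docker-compose.cosyvoice3.yml` forwards a `MODEL_ID` environment variable into the container; if your setup uses it, it can be passed inline (hypothetical value shown):
+```sh
+MODEL_ID=FunAudioLLM/Fun-CosyVoice3-0.5B-2512 docker compose -f docker-compose.cosyvoice3.yml up
+```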
+

+ 20 - 0
runtime/triton_trtllm/docker-compose.cosyvoice3.yml

@@ -0,0 +1,20 @@
+services:
+  tts:
+    image: soar97/triton-cosyvoice:25.06
+    shm_size: '1gb'
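+    # Triton's standard endpoints: 8000 = HTTP, 8001 = gRPC, 8002 = Prometheus metrics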
+    ports:
+      - "8000:8000"
+      - "8001:8001"
+      - "8002:8002"
+    environment:
+      - PYTHONIOENCODING=utf-8
+      - MODEL_ID=${MODEL_ID}
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              device_ids: ['0']
+              capabilities: [gpu]
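+    # clone CosyVoice on startup, then run stages 0-3 (download models, build engines, create repo, launch server)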
+    command: >
+      /bin/bash -c "cd /workspace && git clone https://github.com/FunAudioLLM/CosyVoice.git && cd CosyVoice && git submodule update --init --recursive && cd runtime/triton_trtllm && bash run_cosyvoice3.sh 0 3"

+ 12 - 10
runtime/triton_trtllm/run_cosyvoice3.sh

@@ -1,8 +1,7 @@
 #!/bin/bash
 # Copyright (c) 2026 NVIDIA (authors: Yuekai Zhang)
 export CUDA_VISIBLE_DEVICES=0
-# cosyvoice_path=/workspace/CosyVoice
-cosyvoice_path=/workspace_yuekai/tts/CosyVoice
+cosyvoice_path=/workspace/CosyVoice
 
 export PYTHONPATH=${cosyvoice_path}:$PYTHONPATH
 export PYTHONPATH=${cosyvoice_path}/third_party/Matcha-TTS:$PYTHONPATH
@@ -24,7 +23,6 @@ bls_instance_num=10
 if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
 
     echo "Cloning CosyVoice"
-    pip3 install --upgrade x_transformers s3tokenizer
     git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git $cosyvoice_path
     cd $cosyvoice_path
     git submodule update --init --recursive
@@ -33,6 +31,10 @@ fi
 
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
     echo "Downloading CosyVoice3 Checkpoints"
+    # upgrade s3tokenizer (and x_transformers) if the installed version is not 0.3.0
+    if [ "$(pip3 show s3tokenizer | awk '/^Version:/ {print $2}')" != "0.3.0" ]; then
+        pip3 install --upgrade x_transformers s3tokenizer
+    fi
     huggingface-cli download --local-dir $huggingface_llm_local_dir yuekai/Fun-CosyVoice3-0.5B-2512-LLM-HF
     huggingface-cli download --local-dir $cosyvoice3_official_model_dir yuekai/Fun-CosyVoice3-0.5B-2512-FP16-ONNX
     huggingface-cli download --local-dir $cosyvoice3_official_model_dir FunAudioLLM/Fun-CosyVoice3-0.5B-2512
@@ -76,7 +78,7 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
     LLM_TOKENIZER_DIR=$huggingface_llm_local_dir
     BLS_INSTANCE_NUM=$bls_instance_num
     TRITON_MAX_BATCH_SIZE=1
-    DECOUPLED_MODE=True
+    DECOUPLED_MODE=True # False for offline TTS
 
     python3 scripts/fill_template.py -i ${model_repo}/cosyvoice3/config.pbtxt model_dir:${MODEL_DIR},bls_instance_num:${BLS_INSTANCE_NUM},llm_tokenizer_dir:${LLM_TOKENIZER_DIR},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS}
     python3 scripts/fill_template.py -i ${model_repo}/token2wav/config.pbtxt model_dir:${MODEL_DIR},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS}
@@ -111,17 +113,17 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
 fi
 
 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
-    echo "stage 10: Python script CosyVoice3 TTS (LLM + CosyVoice3 Token2Wav) inference"
+    echo "stage 5: Python script CosyVoice3 TTS (LLM + CosyVoice3 Token2Wav) inference"
 
     datasets=(wenetspeech4tts) # wenetspeech4tts
-    backend=trtllm-serve  # hf, trtllm, vllm, trtllm-serve
+    backend=trtllm  # hf, trtllm, vllm, trtllm-serve
 
-    batch_sizes=(1)
+    batch_sizes=(16 8 4 2 1)
 token2wav_batch_size=1 # only batch size 1 is supported for now
 
     for batch_size in ${batch_sizes[@]}; do
       for dataset in ${datasets[@]}; do
-        output_dir=./cosyvoice3_${dataset}_${backend}_llm_batch_size_${batch_size}_token2wav_batch_size_${token2wav_batch_size}_streaming_trt
+        output_dir=./cosyvoice3_${dataset}_${backend}_llm_batch_size_${batch_size}_token2wav_batch_size_${token2wav_batch_size}_offline_tts_trt
         CUDA_VISIBLE_DEVICES=0 \
             python3 infer_cosyvoice3.py \
                 --output-dir $output_dir \
@@ -130,8 +132,8 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
                 --backend $backend \
                 --batch-size $batch_size --token2wav-batch-size $token2wav_batch_size \
                 --engine-dir $trt_engines_dir \
-                --enable-trt --streaming\
-                --epoch 1 \
+                --enable-trt \
+                --epoch 3 \
                 --split-name ${dataset} || exit 1
       done
     done