6 months ago · 1fc8435146
--- a/runtime/triton_trtllm/README.DIT.md
+++ b/runtime/triton_trtllm/README.DIT.md
@@ -45,7 +45,8 @@ bash run_stepaudio2_dit_token2wav.sh <start_stage> <stop_stage>
 
				 - **Stage 4**: Runs the gRPC benchmark client for performance testing.
			
 
				 - **Stage 5**: Runs the offline TTS inference benchmark test.
			
 
				 - **Stage 6**: Runs a standalone inference script for the Step-Audio2-mini DiT Token2Wav model.
			
 
				-
			
 
				+- **Stage 7**: Launches servers in a disaggregated setup, with the LLM on GPU 0 and two Token2Wav servers on each of GPUs 1-3.
			
 
				+- **Stage 8**: Runs the benchmark client for the disaggregated server configuration.
			
 
				 ### Export Models and Launch Server
			
 
				 
			
 
				 Inside the Docker container, prepare the models and start the Triton server by running stages 0-3:
			
@@ -100,6 +101,40 @@ The following results were obtained by decoding on a single L20 GPU with the `yu
 
				 | TRTLLM | 16 | 2.01 |  5.03 | 0.0292 |
			
 
				 
			
 
				 
			
 
				+### Disaggregated Server
			
 
				+When the LLM and token2wav components are deployed on the same GPU, they compete for resources. To optimize performance, we use a disaggregated setup where the LLM is deployed on one dedicated L20 GPU, taking advantage of in-flight batching for inference. The token2wav module is deployed on separate, dedicated GPUs.
			
 
				+
			
 
				+The table below shows the first-chunk latency results for this configuration. In our tests, we deploy two token2wav instances on each dedicated token2wav GPU.
			
 
				+
			
 
				+| token2wav_num_gpu | concurrent_task_per_instance | concurrent_task_per_gpu | avg (ms) | p50 (ms) | p90 (ms) | p99 (ms) |
			
 
				+|---|---|---|---|---|---|---|
			
 
				+| 1 | 1 | 1.00 | 218.53 | 217.86 | 254.07 | 296.49 |
			
 
				+| 2 | 1 | 1.33 | 218.82 | 219.21 | 256.62 | 303.13 |
			
 
				+| 3 | 1 | 1.50 | 229.08 | 223.27 | 302.13 | 324.41 |
			
 
				+| 4 | 1 | 1.60 | 203.87 | 198.23 | 254.92 | 279.31 |
			
 
				+| 1 | 2 | 2.00 | 293.46 | 280.53 | 370.81 | 407.40 |
			
 
				+| 2 | 2 | 2.67 | 263.38 | 236.84 | 350.82 | 397.39 |
			
 
				+| 3 | 2 | 3.00 | 308.09 | 275.48 | 385.22 | 521.45 |
			
 
				+| 4 | 2 | 3.20 | 271.85 | 253.25 | 359.03 | 387.91 |
			
 
				+| 1 | 3 | 3.00 | 389.15 | 373.01 | 469.22 | 542.89 |
			
 
				+| 2 | 3 | 4.00 | 403.48 | 394.80 | 481.24 | 507.75 |
			
 
				+| 3 | 3 | 4.50 | 406.33 | 391.28 | 495.43 | 571.29 |
			
 
				+| 4 | 3 | 4.80 | 436.72 | 383.81 | 638.44 | 879.23 |
			
 
				+| 1 | 4 | 4.00 | 520.12 | 493.98 | 610.38 | 739.85 |
			
 
				+| 2 | 4 | 5.33 | 494.60 | 490.50 | 605.93 | 708.09 |
			
 
				+| 3 | 4 | 6.00 | 538.23 | 508.33 | 687.62 | 736.96 |
			
 
				+| 4 | 4 | 6.40 | 579.68 | 546.20 | 721.53 | 958.04 |
			
 
				+| 1 | 5 | 5.00 | 635.02 | 623.30 | 786.85 | 819.84 |
			
 
				+| 2 | 5 | 6.67 | 598.23 | 617.09 | 741.00 | 788.96 |
			
 
				+| 3 | 5 | 7.50 | 644.78 | 684.40 | 786.45 | 1009.45 |
			
 
				+| 4 | 5 | 8.00 | 733.92 | 642.26 | 1024.79 | 1281.55 |
			
 
				+| 1 | 6 | 6.00 | 715.38 | 745.68 | 887.04 | 906.68 |
			
 
				+| 2 | 6 | 8.00 | 748.31 | 753.94 | 873.59 | 1007.14 |
			
 
				+| 3 | 6 | 9.00 | 900.27 | 822.28 | 1431.14 | 1800.23 |
			
 
				+| 4 | 6 | 9.60 | 857.54 | 820.33 | 1150.30 | 1298.53 |
			
 
				+
			
 
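				+The `concurrent_task_per_gpu` value is the total number of concurrent tasks averaged over all GPUs in use (the token2wav GPUs plus the single LLM GPU). It is calculated as: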
			
 
				+`concurrent_task_per_gpu = concurrent_task_per_instance * num_token2wav_instance_per_gpu (2) * token2wav_gpus / (token2wav_gpus + llm_gpus (1))`
			
 
				 
			
 
				 ### Acknowledgements
			
 
				 
			
--- a/runtime/triton_trtllm/client_grpc.py
+++ b/runtime/triton_trtllm/client_grpc.py
@@ -134,6 +134,8 @@ def write_triton_stats(stats, summary_file):
 
				                 compute_output = batch["compute_output"]
			
 
				                 compute_infer = batch["compute_infer"]
			
 
				                 batch_count = int(compute_infer["count"])
			
 
				+                if batch_count == 0:
			
 
				+                    continue
			
 
				                 assert compute_infer["count"] == compute_output["count"] == compute_input["count"]
			
 
				                 compute_infer_time_ms = int(compute_infer["ns"]) / 1e6
			
 
				                 compute_input_time_ms = int(compute_input["ns"]) / 1e6
			
--- a/runtime/triton_trtllm/run_stepaudio2_dit_token2wav.sh
+++ b/runtime/triton_trtllm/run_stepaudio2_dit_token2wav.sh
@@ -20,7 +20,7 @@ trt_weights_dir=./trt_weights_${trt_dtype}
 
				 trt_engines_dir=./trt_engines_${trt_dtype}
			
 
				 
			
 
				 model_repo=./model_repo_cosyvoice2_dit
			
 
				-bls_instance_num=4
			
 
				+bls_instance_num=10
			
 
				 
			
 
				 if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
			
 
				 
			
@@ -58,7 +58,7 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
 
				     echo "Building TensorRT engines"
			
 
				     trtllm-build --checkpoint_dir $trt_weights_dir \
			
 
				                 --output_dir $trt_engines_dir \
			
 
				-                --max_batch_size 16 \
			
 
				+                --max_batch_size 64 \
			
 
				                 --max_num_tokens 32768 \
			
 
				                 --gemm_plugin $trt_dtype || exit 1
			
 
				 
			
@@ -100,14 +100,14 @@ fi
 
				 
			
 
				 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
			
 
				    echo "Starting Token2wav Triton server and Cosyvoice2 llm using trtllm-serve"
			
 
				-   mpirun -np 1 --allow-run-as-root --oversubscribe trtllm-serve serve --tokenizer $huggingface_model_local_dir $trt_engines_dir --max_batch_size 16  --kv_cache_free_gpu_memory_fraction 0.4 &
			
 
				+   mpirun -np 1 --allow-run-as-root --oversubscribe trtllm-serve serve --tokenizer $huggingface_model_local_dir $trt_engines_dir --max_batch_size 64  --kv_cache_free_gpu_memory_fraction 0.4 &
			
 
				    tritonserver --model-repository $model_repo --http-port 18000 &
			
 
				    wait
			
 
				     # Test using curl
			
 
				     # curl http://localhost:8000/v1/chat/completions \
			
 
				     #     -H "Content-Type: application/json" \
			
 
				     #     -d '{
			
 
				-    #         "model": "trt_engines_bfloat16",
			
 
				+    #         "model": "",
			
 
				     #         "messages":[{"role": "user", "content": "Where is New York?"},
			
 
				     #                     {"role": "assistant", "content": "<|s_1708|><|s_2050|><|s_2159|>"}],
			
 
				     #         "max_tokens": 512,
			
@@ -172,3 +172,54 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
 
				 fi
			
 
				 
			
 
				 
			
 
				+if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
			
 
				+   echo "Disaggregated Server: LLM and Token2wav on different GPUs"
			
 
				+   echo "Starting LLM server on GPU 0"
			
 
				+   export CUDA_VISIBLE_DEVICES=0
			
 
				+   mpirun -np 1 --allow-run-as-root --oversubscribe trtllm-serve serve --tokenizer $huggingface_model_local_dir $trt_engines_dir --max_batch_size 64  --kv_cache_free_gpu_memory_fraction 0.4 &
			
 
				+   echo "Starting Token2wav server on GPUs 1-3"
			
 
				+   Token2wav_num_gpus=3
			
 
				+   http_port=17000
			
 
				+   grpc_port=18000
			
 
				+   metrics_port=16000
			
 
				+   for i in $(seq 0 $(($Token2wav_num_gpus - 1))); do
			
 
				+       echo "Starting server on GPU $i"
			
 
				+       http_port=$((http_port + 1))
			
 
				+       grpc_port=$((grpc_port + 1))
			
 
				+       metrics_port=$((metrics_port + 1))
			
 
				+       # Two instances of Token2wav server on the same GPU
			
 
				+       CUDA_VISIBLE_DEVICES=$(($i + 1)) tritonserver --model-repository $model_repo --http-port $http_port --grpc-port $grpc_port --metrics-port $metrics_port &
			
 
				+       http_port=$((http_port + 1))
			
 
				+       grpc_port=$((grpc_port + 1))
			
 
				+       metrics_port=$((metrics_port + 1))
			
 
				+       CUDA_VISIBLE_DEVICES=$(($i + 1)) tritonserver --model-repository $model_repo --http-port $http_port --grpc-port $grpc_port --metrics-port $metrics_port &
			
 
				+   done
			
 
				+   wait
			
 
				+fi
			
 
				+
			
 
				+if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
			
 
				+    echo "Running benchmark client for Disaggregated Server"
			
 
				+    per_gpu_instances=2
			
 
				+    mode=streaming
			
 
				+    BLS_INSTANCE_NUM=$bls_instance_num
			
 
				+    Token2wav_num_gpus=(1 2 3)
			
 
				+    concurrent_tasks=(1 2 3 4 5 6)
			
 
				+    for n_gpu in ${Token2wav_num_gpus[@]}; do
			
 
				+        echo "Test 1 GPU for LLM server and $n_gpu GPUs for Token2wav servers"
			
 
				+        for concurrent_task in ${concurrent_tasks[@]}; do
			
 
				+            num_instances=$((per_gpu_instances * n_gpu))
			
 
				+            for i in $(seq 1 $num_instances); do
			
 
				+                port=$(($i + 18000))
			
 
				+                python3 client_grpc.py \
			
 
				+                    --server-addr localhost \
			
 
				+                    --server-port $port \
			
 
				+                    --model-name cosyvoice2_dit \
			
 
				+                    --num-tasks $concurrent_task \
			
 
				+                    --mode $mode \
			
 
				+                    --huggingface-dataset yuekai/seed_tts_cosy2 \
			
 
				+                    --log-dir ./log_disagg_concurrent_tasks_${concurrent_task}_per_instance_total_token2wav_instances_${num_instances}_port_${port} &
			
 
				+            done
			
 
				+            wait
			
 
				+        done
			
 
				+    done
			
 
				+fi