root 4 months ago
parent commit 07cbc51cd1

+ 52 - 49
runtime/triton_trtllm/client_grpc.py

@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 # Copyright      2022  Xiaomi Corp.        (authors: Fangjun Kuang)
 #                2023  Nvidia              (authors: Yuekai Zhang)
 #                2023  Recurrent.ai        (authors: Songtao Shi)
@@ -46,7 +45,7 @@ import asyncio
 import json
 import queue  # Added
 import uuid  # Added
-import functools # Added
+import functools  # Added
 
 import os
 import time
@@ -56,9 +55,9 @@ from pathlib import Path
 import numpy as np
 import soundfile as sf
 import tritonclient
-import tritonclient.grpc.aio as grpcclient_aio # Renamed original import
-import tritonclient.grpc as grpcclient_sync # Added sync client import
-from tritonclient.utils import np_to_triton_dtype, InferenceServerException # Added InferenceServerException
+import tritonclient.grpc.aio as grpcclient_aio  # Renamed original import
+import tritonclient.grpc as grpcclient_sync  # Added sync client import
+from tritonclient.utils import np_to_triton_dtype, InferenceServerException  # Added InferenceServerException
 
 
 # --- Added UserData and callback ---
@@ -76,9 +75,10 @@ class UserData:
             return self._first_chunk_time - self._start_time
         return None
 
+
 def callback(user_data, result, error):
     if user_data._first_chunk_time is None and not error:
-        user_data._first_chunk_time = time.time() # Record time of first successful chunk
+        user_data._first_chunk_time = time.time()  # Record time of first successful chunk
     if error:
         user_data._completed_requests.put(error)
     else:
@@ -206,8 +206,11 @@ def get_args():
         "--model-name",
         "--model-name",
         type=str,
         type=str,
         default="f5_tts",
         default="f5_tts",
-        choices=["f5_tts", "spark_tts", "cosyvoice2"],
-        help="triton model_repo module name to request: transducer for k2, attention_rescoring for wenet offline, streaming_wenet for wenet streaming, infer_pipeline for paraformer large offline",
+        choices=[
+            "f5_tts",
+            "spark_tts",
+            "cosyvoice2"],
+        help="triton model_repo module name to request",
     )
     )
 
 
     parser.add_argument(
     parser.add_argument(
@@ -273,13 +276,14 @@ def load_audio(wav_path, target_sample_rate=16000):
         waveform = resample(waveform, num_samples)
     return waveform, target_sample_rate
 
+
 def prepare_request_input_output(
-    protocol_client, # Can be grpcclient_aio or grpcclient_sync
+    protocol_client,  # Can be grpcclient_aio or grpcclient_sync
     waveform,
     reference_text,
     target_text,
     sample_rate=16000,
-    padding_duration: int = None # Optional padding for offline mode
+    padding_duration: int = None  # Optional padding for offline mode
 ):
     """Prepares inputs for Triton inference (offline or streaming)."""
     assert len(waveform.shape) == 1, "waveform should be 1D"
@@ -291,9 +295,9 @@ def prepare_request_input_output(
         # Estimate target duration based on text length ratio (crude estimation)
         # Avoid division by zero if reference_text is empty
         if reference_text:
-             estimated_target_duration = duration / len(reference_text) * len(target_text)
+            estimated_target_duration = duration / len(reference_text) * len(target_text)
         else:
-             estimated_target_duration = duration # Assume target duration similar to reference if no text
+            estimated_target_duration = duration  # Assume target duration similar to reference if no text
 
         # Calculate required samples based on estimated total duration
         required_total_samples = padding_duration * sample_rate * (
@@ -329,6 +333,7 @@ prepare_request_input_output(
 
     return inputs, outputs
 
+
 def run_sync_streaming_inference(
     sync_triton_client: tritonclient.grpc.InferenceServerClient,
     model_name: str,
@@ -342,7 +347,7 @@ def run_sync_streaming_inference(
 ):
     """Helper function to run the blocking sync streaming call."""
     start_time_total = time.time()
-    user_data.record_start_time() # Record start time for first chunk latency calculation
+    user_data.record_start_time()  # Record start time for first chunk latency calculation
 
     # Establish stream
     sync_triton_client.start_stream(callback=functools.partial(callback, user_data))
@@ -360,11 +365,11 @@ def run_sync_streaming_inference(
     audios = []
     while True:
         try:
-            result = user_data._completed_requests.get() # Add timeout
+            result = user_data._completed_requests.get()  # Add timeout
             if isinstance(result, InferenceServerException):
                 print(f"Received InferenceServerException: {result}")
                 sync_triton_client.stop_stream()
-                return None, None, None # Indicate error
+                return None, None, None  # Indicate error
             # Get response metadata
             response = result.get_response()
             final = response.parameters["triton_final_response"].bool_param
@@ -372,15 +377,15 @@ def run_sync_streaming_inference(
                 break
 
             audio_chunk = result.as_numpy("waveform").reshape(-1)
-            if audio_chunk.size > 0: # Only append non-empty chunks
-                 audios.append(audio_chunk)
+            if audio_chunk.size > 0:  # Only append non-empty chunks
+                audios.append(audio_chunk)
             else:
                 print("Warning: received empty audio chunk.")
 
         except queue.Empty:
             print(f"Timeout waiting for response for request id {request_id}")
             sync_triton_client.stop_stream()
-            return None, None, None # Indicate error
+            return None, None, None  # Indicate error
 
     sync_triton_client.stop_stream()
     end_time_total = time.time()
@@ -398,19 +403,19 @@ def run_sync_streaming_inference(
         # Simplified reconstruction based on client_grpc_streaming.py
         if not audios:
             print("Warning: No audio chunks received.")
-            reconstructed_audio = np.array([], dtype=np.float32) # Empty array
+            reconstructed_audio = np.array([], dtype=np.float32)  # Empty array
         elif len(audios) == 1:
             reconstructed_audio = audios[0]
         else:
-            reconstructed_audio = audios[0][:-cross_fade_samples] # Start with first chunk minus overlap
+            reconstructed_audio = audios[0][:-cross_fade_samples]  # Start with first chunk minus overlap
             for i in range(1, len(audios)):
-                 # Cross-fade section
-                 cross_faded_overlap = (audios[i][:cross_fade_samples] * fade_in +
-                                        audios[i - 1][-cross_fade_samples:] * fade_out)
-                 # Middle section of the current chunk
-                 middle_part = audios[i][cross_fade_samples:-cross_fade_samples]
-                 # Concatenate
-                 reconstructed_audio = np.concatenate([reconstructed_audio, cross_faded_overlap, middle_part])
+                # Cross-fade section
+                cross_faded_overlap = (audios[i][:cross_fade_samples] * fade_in +
+                                       audios[i - 1][-cross_fade_samples:] * fade_out)
+                # Middle section of the current chunk
+                middle_part = audios[i][cross_fade_samples:-cross_fade_samples]
+                # Concatenate
+                reconstructed_audio = np.concatenate([reconstructed_audio, cross_faded_overlap, middle_part])
             # Add the last part of the final chunk
             reconstructed_audio = np.concatenate([reconstructed_audio, audios[-1][-cross_fade_samples:]])
 
@@ -421,11 +426,11 @@ def run_sync_streaming_inference(
             sf.write(audio_save_path, reconstructed_audio, save_sample_rate, "PCM_16")
         else:
             print("Warning: No audio chunks received or reconstructed.")
-            actual_duration = 0 # Set duration to 0 if no audio
+            actual_duration = 0  # Set duration to 0 if no audio
 
     else:
-         print("Warning: No audio chunks received.")
-         actual_duration = 0
+        print("Warning: No audio chunks received.")
+        actual_duration = 0
 
     return total_request_latency, first_chunk_latency, actual_duration
 
@@ -433,7 +438,7 @@ def run_sync_streaming_inference(
 async def send_streaming(
     manifest_item_list: list,
     name: str,
-    server_url: str, # Changed from sync_triton_client
+    server_url: str,  # Changed from sync_triton_client
     protocol_client: types.ModuleType,
     log_interval: int,
     model_name: str,
@@ -445,11 +450,11 @@ async def send_streaming(
     total_duration = 0.0
     latency_data = []
     task_id = int(name[5:])
-    sync_triton_client = None # Initialize client variable
+    sync_triton_client = None  # Initialize client variable
 
-    try: # Wrap in try...finally to ensure client closing
+    try:  # Wrap in try...finally to ensure client closing
         print(f"{name}: Initializing sync client for streaming...")
-        sync_triton_client = grpcclient_sync.InferenceServerClient(url=server_url, verbose=False) # Create client here
+        sync_triton_client = grpcclient_sync.InferenceServerClient(url=server_url, verbose=False)  # Create client here
 
         print(f"{name}: Starting streaming processing for {len(manifest_item_list)} items.")
         for i, item in enumerate(manifest_item_list):
@@ -491,8 +496,7 @@ async def send_streaming(
                     latency_data.append((total_request_latency, first_chunk_latency, actual_duration))
                     total_duration += actual_duration
                 else:
-                     print(f"{name}: Item {i} failed.")
-
+                    print(f"{name}: Item {i} failed.")
 
             except FileNotFoundError:
                 print(f"Error: Audio file not found for item {i}: {item['audio_filepath']}")
@@ -501,8 +505,7 @@ async def send_streaming(
                 import traceback
                 traceback.print_exc()
 
-
-    finally: # Ensure client is closed
+    finally:  # Ensure client is closed
         if sync_triton_client:
             try:
                 print(f"{name}: Closing sync client...")
@@ -510,10 +513,10 @@ async def send_streaming(
             except Exception as e:
                 print(f"{name}: Error closing sync client: {e}")
 
-
     print(f"{name}: Finished streaming processing. Total duration synthesized: {total_duration:.4f}s")
     return total_duration, latency_data
 
+
 async def send(
     manifest_item_list: list,
     name: str,
@@ -605,6 +608,7 @@ def split_data(data, k):
 
     return result
 
+
 async def main():
     args = get_args()
     url = f"{args.server_addr}:{args.server_port}"
@@ -622,7 +626,7 @@ async def main():
         # Use the sync client for streaming tasks, handled via asyncio.to_thread
         # We will create one sync client instance PER TASK inside send_streaming.
         # triton_client = grpcclient_sync.InferenceServerClient(url=url, verbose=False) # REMOVED: Client created per task now
-        protocol_client = grpcclient_sync # protocol client for input prep
+        protocol_client = grpcclient_sync  # protocol client for input prep
     else:
         raise ValueError(f"Invalid mode: {args.mode}")
     # --- End Client Initialization ---
@@ -682,11 +686,11 @@ async def main():
                 )
             )
         elif args.mode == "streaming":
-             task = asyncio.create_task(
+            task = asyncio.create_task(
                 send_streaming(
                     manifest_item_list[i],
                     name=f"task-{i}",
-                    server_url=url, # Pass URL instead of client
+                    server_url=url,  # Pass URL instead of client
                     protocol_client=protocol_client,
                     log_interval=args.log_interval,
                     model_name=args.model_name,
@@ -709,16 +713,15 @@ async def main():
     for ans in ans_list:
         if ans:
             total_duration += ans[0]
-            latency_data.extend(ans[1]) # Use extend for list of lists
+            latency_data.extend(ans[1])  # Use extend for list of lists
         else:
-             print("Warning: A task returned None, possibly due to an error.")
-
+            print("Warning: A task returned None, possibly due to an error.")
 
     if total_duration == 0:
         print("Total synthesized duration is zero. Cannot calculate RTF or latency percentiles.")
         rtf = float('inf')
     else:
-         rtf = elapsed / total_duration
+        rtf = elapsed / total_duration
 
     s = f"Mode: {args.mode}\n"
     s += f"RTF: {rtf:.4f}\n"
@@ -759,7 +762,7 @@ async def main():
                 s += f"total_request_latency_99_percentile_ms: {np.percentile(total_latency_list, 99) * 1000.0:.2f}\n"
                 s += f"total_request_latency_99_percentile_ms: {np.percentile(total_latency_list, 99) * 1000.0:.2f}\n"
                 s += f"average_total_request_latency_ms: {avg_total_latency_ms:.2f}\n"
                 s += f"average_total_request_latency_ms: {avg_total_latency_ms:.2f}\n"
             else:
             else:
-                 s += "No total request latency data collected.\n"
+                s += "No total request latency data collected.\n"
 
 
             s += "\n--- First Chunk Latency ---\n"
             s += "\n--- First Chunk Latency ---\n"
             if first_chunk_latency_list:
             if first_chunk_latency_list:
@@ -772,7 +775,7 @@ async def main():
                 s += f"first_chunk_latency_99_percentile_ms: {np.percentile(first_chunk_latency_list, 99) * 1000.0:.2f}\n"
                 s += f"first_chunk_latency_99_percentile_ms: {np.percentile(first_chunk_latency_list, 99) * 1000.0:.2f}\n"
                 s += f"average_first_chunk_latency_ms: {avg_first_chunk_latency_ms:.2f}\n"
                 s += f"average_first_chunk_latency_ms: {avg_first_chunk_latency_ms:.2f}\n"
             else:
             else:
-                 s += "No first chunk latency data collected (check for errors or if all requests failed before first chunk).\n"
+                s += "No first chunk latency data collected (check for errors or if all requests failed before first chunk).\n"
     else:
     else:
         s += "No latency data collected.\n"
         s += "No latency data collected.\n"
     # --- End Statistics Reporting ---
     # --- End Statistics Reporting ---
@@ -785,7 +788,7 @@ async def main():
     elif args.reference_audio:
         name = Path(args.reference_audio).stem
     else:
-        name = "results" # Default name if no manifest/split/audio provided
+        name = "results"  # Default name if no manifest/split/audio provided
     with open(f"{args.log_dir}/rtf-{name}.txt", "w") as f:
         f.write(s)
 

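Note on the streaming path above: it drives tritonclient's synchronous gRPC stream from a callback that feeds a queue, then stitches the returned chunks with a linear cross-fade. The following condensed sketch is illustrative only; the fade windows and the enable_empty_final_response flag are assumptions not visible in this excerpt, and inputs/outputs are assumed to come from prepare_request_input_output().

# Minimal sketch of the sync-streaming pattern used by send_streaming()/run_sync_streaming_inference().
import functools
import queue

import numpy as np
import tritonclient.grpc as grpcclient_sync
from tritonclient.utils import InferenceServerException


def stream_once(server_url, model_name, inputs, outputs, request_id, cross_fade_samples):
    completed = queue.Queue()
    client = grpcclient_sync.InferenceServerClient(url=server_url, verbose=False)
    # Every streamed response (or error) is pushed into the queue by the callback.
    client.start_stream(callback=functools.partial(
        lambda q, result, error: q.put(error if error else result), completed))
    client.async_stream_infer(model_name, inputs, request_id=request_id,
                              outputs=outputs, enable_empty_final_response=True)

    chunks = []
    while True:
        result = completed.get()
        if isinstance(result, InferenceServerException):
            break  # the real client also stops the stream and reports the failure
        response = result.get_response()
        if response.parameters["triton_final_response"].bool_param:
            break
        chunk = result.as_numpy("waveform").reshape(-1)
        if chunk.size > 0:
            chunks.append(chunk)
    client.stop_stream()
    client.close()

    # Stitch adjacent chunks with a linear cross-fade over the overlap region.
    if len(chunks) <= 1:
        return chunks[0] if chunks else np.array([], dtype=np.float32)
    fade_in = np.linspace(0.0, 1.0, cross_fade_samples)
    fade_out = 1.0 - fade_in
    audio = chunks[0][:-cross_fade_samples]
    for prev, cur in zip(chunks, chunks[1:]):
        overlap = cur[:cross_fade_samples] * fade_in + prev[-cross_fade_samples:] * fade_out
        audio = np.concatenate([audio, overlap, cur[cross_fade_samples:-cross_fade_samples]])
    return np.concatenate([audio, chunks[-1][-cross_fade_samples:]])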
+ 13 - 9
runtime/triton_trtllm/client_http.py

@@ -29,6 +29,7 @@ import json
 import numpy as np
 import argparse
 
+
 def get_args():
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter
@@ -67,9 +68,10 @@ def get_args():
         type=str,
         default="spark_tts",
         choices=[
-            "f5_tts", "spark_tts", "cosyvoice2"
-        ],
-        help="triton model_repo module name to request: transducer for k2, attention_rescoring for wenet offline, streaming_wenet for wenet streaming, infer_pipeline for paraformer large offline",
+            "f5_tts",
+            "spark_tts",
+            "cosyvoice2"],
+        help="triton model_repo module name to request",
     )
 
     parser.add_argument(
@@ -80,6 +82,7 @@ def get_args():
     )
     return parser.parse_args()
 
+
 def prepare_request(
     waveform,
     reference_text,
@@ -97,7 +100,7 @@ def prepare_request(
                 1,
                 padding_duration
                 * sample_rate
-                * ((int(duration) // padding_duration) + 1),
+                * ((int(len(waveform) / sample_rate) // padding_duration) + 1),
             ),
             dtype=np.float32,
         )
@@ -105,11 +108,11 @@ def prepare_request(
         samples[0, : len(waveform)] = waveform
     else:
         samples = waveform
-        
+
     samples = samples.reshape(1, -1).astype(np.float32)
 
     data = {
-        "inputs":[
+        "inputs": [
             {
                 "name": "reference_wav",
                 "shape": samples.shape,
@@ -139,16 +142,17 @@ def prepare_request(
 
     return data
 
+
 if __name__ == "__main__":
     args = get_args()
     server_url = args.server_url
     if not server_url.startswith(("http://", "https://")):
         server_url = f"http://{server_url}"
-    
+
     url = f"{server_url}/v2/models/{args.model_name}/infer"
     waveform, sr = sf.read(args.reference_audio)
     assert sr == 16000, "sample rate hardcoded in server"
-    
+
     samples = np.array(waveform, dtype=np.float32)
     data = prepare_request(samples, args.reference_text, args.target_text)
 
@@ -166,4 +170,4 @@ if __name__ == "__main__":
         sample_rate = 16000
     else:
         sample_rate = 24000
-    sf.write(args.output_audio, audio, sample_rate, "PCM_16")
+    sf.write(args.output_audio, audio, sample_rate, "PCM_16")

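For reference, the padded-buffer change in prepare_request() now derives the buffer length from the waveform itself, rounding up to the next multiple of padding_duration seconds. A quick worked example (the values are illustrative, not from the commit):

# Worked example of the padded-buffer sizing in prepare_request().
import numpy as np

sample_rate = 16000
padding_duration = 10                                            # pad to a multiple of 10 s
waveform = np.zeros(int(7.2 * sample_rate), dtype=np.float32)    # ~7.2 s reference clip

num_padded = padding_duration * sample_rate * (
    (int(len(waveform) / sample_rate) // padding_duration) + 1)  # 160000 samples = 10 s
samples = np.zeros((1, num_padded), dtype=np.float32)
samples[0, : len(waveform)] = waveform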
+ 11 - 10
runtime/triton_trtllm/model_repo/audio_tokenizer/1/model.py

@@ -35,33 +35,34 @@ import s3tokenizer
 
 ORIGINAL_VOCAB_SIZE = 151663
 
+
 class TritonPythonModel:
     """Triton Python model for audio tokenization.
-    
+
     This model takes reference audio input and extracts semantic tokens
     using s3tokenizer.
     """
 
     def initialize(self, args):
         """Initialize the model.
-        
+
         Args:
             args: Dictionary containing model configuration
         """
         # Parse model parameters
         parameters = json.loads(args['model_config'])['parameters']
         model_params = {k: v["string_value"] for k, v in parameters.items()}
-        
+
         self.device = torch.device("cuda")
         model_path = os.path.join(model_params["model_dir"], "speech_tokenizer_v2.onnx")
         self.audio_tokenizer = s3tokenizer.load_model(model_path).to(self.device)
 
     def execute(self, requests):
         """Execute inference on the batched requests.
-        
+
         Args:
             requests: List of inference requests
-            
+
         Returns:
             List of inference responses containing tokenized outputs
         """
@@ -79,18 +80,18 @@ class TritonPythonModel:
             # Prepare inputs
             wav = wav_array[:, :wav_len].squeeze(0)
             mels.append(s3tokenizer.log_mel_spectrogram(wav))
-            
+
         mels, mels_lens = s3tokenizer.padding(mels)
         codes, codes_lens = self.audio_tokenizer.quantize(mels.to(self.device), mels_lens.to(self.device))
         codes = codes.clone() + ORIGINAL_VOCAB_SIZE
-        
+
         responses = []
         for i in range(len(requests)):
-            prompt_speech_tokens = codes[i, :codes_lens[i].item()]            
+            prompt_speech_tokens = codes[i, :codes_lens[i].item()]
             prompt_speech_tokens_tensor = pb_utils.Tensor.from_dlpack(
                 "prompt_speech_tokens", to_dlpack(prompt_speech_tokens))
             inference_response = pb_utils.InferenceResponse(
                 output_tensors=[prompt_speech_tokens_tensor])
             responses.append(inference_response)
-                             
-        return responses
+
+        return responses

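The tokenizer model above is a thin wrapper around s3tokenizer. Outside Triton the same flow looks roughly like the sketch below; the file paths and the 16 kHz mono input are assumptions, and the s3tokenizer calls simply mirror those in model.py:

# Standalone sketch of the tokenization flow wrapped by the model above.
import s3tokenizer
import soundfile as sf
import torch

ORIGINAL_VOCAB_SIZE = 151663  # offset so speech-token ids land above the text vocabulary

device = torch.device("cuda")
tokenizer = s3tokenizer.load_model("/workspace/models/speech_tokenizer_v2.onnx").to(device)

wav, sr = sf.read("reference.wav", dtype="float32")              # expected: 16 kHz mono
mel = s3tokenizer.log_mel_spectrogram(torch.from_numpy(wav))
mels, mels_lens = s3tokenizer.padding([mel])
codes, codes_lens = tokenizer.quantize(mels.to(device), mels_lens.to(device))
prompt_speech_tokens = codes[0, :codes_lens[0].item()] + ORIGINAL_VOCAB_SIZE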
+ 61 - 45
runtime/triton_trtllm/model_repo/cosyvoice2/1/model.py

@@ -42,16 +42,17 @@ import onnxruntime
 
 from matcha.utils.audio import mel_spectrogram
 
+
 class TritonPythonModel:
     """Triton Python model for Spark TTS.
-    
+
     This model orchestrates the end-to-end TTS pipeline by coordinating
     between audio tokenizer, LLM, and vocoder components.
     """
-    
+
     def initialize(self, args):
         """Initialize the model.
-        
+
         Args:
             args: Dictionary containing model configuration
         """
@@ -116,58 +117,58 @@ class TritonPythonModel:
             "input_ids": input_ids,
             "input_ids": input_ids,
             "input_lengths": np.array([[input_ids.shape[1]]], dtype=np.int32),
             "input_lengths": np.array([[input_ids.shape[1]]], dtype=np.int32),
         }
         }
-        
+
         # Convert inputs to Triton tensors
         # Convert inputs to Triton tensors
         input_tensor_list = [
         input_tensor_list = [
             pb_utils.Tensor(k, v) for k, v in input_dict.items()
             pb_utils.Tensor(k, v) for k, v in input_dict.items()
         ]
         ]
-        
+
         # Create and execute inference request
         # Create and execute inference request
         llm_request = pb_utils.InferenceRequest(
         llm_request = pb_utils.InferenceRequest(
             model_name="tensorrt_llm",
             model_name="tensorrt_llm",
             requested_output_names=["output_ids", "sequence_length"],
             requested_output_names=["output_ids", "sequence_length"],
             inputs=input_tensor_list,
             inputs=input_tensor_list,
         )
         )
-        
+
         llm_responses = llm_request.exec(decoupled=self.decoupled)
         llm_responses = llm_request.exec(decoupled=self.decoupled)
         if self.decoupled:
         if self.decoupled:
             for llm_response in llm_responses:
             for llm_response in llm_responses:
                 if llm_response.has_error():
                 if llm_response.has_error():
                     raise pb_utils.TritonModelException(llm_response.error().message())
                     raise pb_utils.TritonModelException(llm_response.error().message())
-                
+
                 # Extract and process output
                 # Extract and process output
                 output_ids = pb_utils.get_output_tensor_by_name(
                 output_ids = pb_utils.get_output_tensor_by_name(
                     llm_response, "output_ids").as_numpy()
                     llm_response, "output_ids").as_numpy()
                 seq_lens = pb_utils.get_output_tensor_by_name(
                 seq_lens = pb_utils.get_output_tensor_by_name(
                     llm_response, "sequence_length").as_numpy()
                     llm_response, "sequence_length").as_numpy()
-                
+
                 # Get actual output IDs up to the sequence length
                 # Get actual output IDs up to the sequence length
                 actual_output_ids = output_ids[0][0][:seq_lens[0][0]]
                 actual_output_ids = output_ids[0][0][:seq_lens[0][0]]
-                
+
                 yield actual_output_ids
                 yield actual_output_ids
         else:
         else:
             llm_response = llm_responses
             llm_response = llm_responses
             if llm_response.has_error():
             if llm_response.has_error():
                 raise pb_utils.TritonModelException(llm_response.error().message())
                 raise pb_utils.TritonModelException(llm_response.error().message())
-            
+
             # Extract and process output
             # Extract and process output
             output_ids = pb_utils.get_output_tensor_by_name(
             output_ids = pb_utils.get_output_tensor_by_name(
                 llm_response, "output_ids").as_numpy()
                 llm_response, "output_ids").as_numpy()
             seq_lens = pb_utils.get_output_tensor_by_name(
             seq_lens = pb_utils.get_output_tensor_by_name(
                 llm_response, "sequence_length").as_numpy()
                 llm_response, "sequence_length").as_numpy()
-            
+
             # Get actual output IDs up to the sequence length
             # Get actual output IDs up to the sequence length
             actual_output_ids = output_ids[0][0][:seq_lens[0][0]]
             actual_output_ids = output_ids[0][0][:seq_lens[0][0]]
-            
-            yield actual_output_ids    
-                
+
+            yield actual_output_ids
+
     def forward_audio_tokenizer(self, wav, wav_len):
     def forward_audio_tokenizer(self, wav, wav_len):
         """Forward pass through the audio tokenizer component.
         """Forward pass through the audio tokenizer component.
-        
+
         Args:
         Args:
             wav: Input waveform tensor
             wav: Input waveform tensor
             wav_len: Waveform length tensor
             wav_len: Waveform length tensor
-            
+
         Returns:
         Returns:
             Tuple of global and semantic tokens
             Tuple of global and semantic tokens
         """
         """
@@ -176,26 +177,31 @@ class TritonPythonModel:
             requested_output_names=['prompt_speech_tokens'],
             inputs=[wav, wav_len]
         )
-        
+
         inference_response = inference_request.exec()
         if inference_response.has_error():
             raise pb_utils.TritonModelException(inference_response.error().message())
-        
+
         # Extract and convert output tensors
         prompt_speech_tokens = pb_utils.get_output_tensor_by_name(inference_response, 'prompt_speech_tokens')
         prompt_speech_tokens = torch.utils.dlpack.from_dlpack(prompt_speech_tokens.to_dlpack()).cpu()
 
         return prompt_speech_tokens
 
-    def forward_token2wav(self, prompt_speech_tokens: torch.Tensor, prompt_speech_feat: torch.Tensor, prompt_spk_embedding: torch.Tensor, target_speech_tokens: torch.Tensor) -> torch.Tensor:
+    def forward_token2wav(
+            self,
+            prompt_speech_tokens: torch.Tensor,
+            prompt_speech_feat: torch.Tensor,
+            prompt_spk_embedding: torch.Tensor,
+            target_speech_tokens: torch.Tensor) -> torch.Tensor:
         """Forward pass through the vocoder component.
-        
+
         Args:
             prompt_speech_tokens: Prompt speech tokens tensor
             prompt_speech_feat: Prompt speech feat tensor
             prompt_spk_embedding: Prompt spk embedding tensor
             target_speech_tokens: Target speech tokens tensor
-            
+
         Returns:
             Generated waveform tensor
         """
@@ -203,22 +209,22 @@ class TritonPythonModel:
         prompt_speech_feat_tensor = pb_utils.Tensor.from_dlpack("prompt_speech_feat", to_dlpack(prompt_speech_feat))
         prompt_spk_embedding_tensor = pb_utils.Tensor.from_dlpack("prompt_spk_embedding", to_dlpack(prompt_spk_embedding))
         target_speech_tokens_tensor = pb_utils.Tensor.from_dlpack("target_speech_tokens", to_dlpack(target_speech_tokens))
-        
+
         # Create and execute inference request
         inference_request = pb_utils.InferenceRequest(
             model_name='token2wav',
             requested_output_names=['waveform'],
             inputs=[prompt_speech_tokens_tensor, prompt_speech_feat_tensor, prompt_spk_embedding_tensor, target_speech_tokens_tensor]
         )
-        
+
         inference_response = inference_request.exec()
         if inference_response.has_error():
             raise pb_utils.TritonModelException(inference_response.error().message())
-        
+
         # Extract and convert output waveform
         waveform = pb_utils.get_output_tensor_by_name(inference_response, 'waveform')
         waveform = torch.utils.dlpack.from_dlpack(waveform.to_dlpack()).cpu()
-        
+
         return waveform
 
     def parse_input(self, text, prompt_text, prompt_speech_tokens):
@@ -231,43 +237,53 @@
 
     def _extract_spk_embedding(self, speech):
         feat = kaldi.fbank(speech,
-                            num_mel_bins=80,
-                            dither=0,
-                            sample_frequency=16000)
+                           num_mel_bins=80,
+                           dither=0,
+                           sample_frequency=16000)
         feat = feat - feat.mean(dim=0, keepdim=True)
         embedding = self.campplus_session.run(None,
-                                                {self.campplus_session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()})[0].flatten().tolist()
+                                              {self.campplus_session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()})[0].flatten().tolist()
         embedding = torch.tensor([embedding]).to(self.device).half()
         return embedding
 
-
     def _extract_speech_feat(self, speech):
-        speech_feat = mel_spectrogram(speech, n_fft=1920, num_mels=80, sampling_rate=24000, hop_size=480, win_size=1920, fmin=0, fmax=8000).squeeze(dim=0).transpose(0, 1).to(self.device)
+        speech_feat = mel_spectrogram(
+            speech,
+            n_fft=1920,
+            num_mels=80,
+            sampling_rate=24000,
+            hop_size=480,
+            win_size=1920,
+            fmin=0,
+            fmax=8000).squeeze(
+            dim=0).transpose(
+            0,
+            1).to(
+                self.device)
         speech_feat = speech_feat.unsqueeze(dim=0)
         return speech_feat
 
     def execute(self, requests):
         """Execute inference on the batched requests.
-        
+
         Args:
             requests: List of inference requests
-            
+
         Returns:
             List of inference responses containing generated audio
         """
         responses = []
-        
+
         for request in requests:
             # Extract input tensors
             wav = pb_utils.get_input_tensor_by_name(request, "reference_wav")
             wav_len = pb_utils.get_input_tensor_by_name(request, "reference_wav_len")
-            
+
             # Process reference audio through audio tokenizer
 
             prompt_speech_tokens = self.forward_audio_tokenizer(wav, wav_len)
             prompt_speech_tokens = prompt_speech_tokens.unsqueeze(0)
 
-
             wav_tensor = wav.as_numpy()
             wav_tensor = torch.from_numpy(wav_tensor)[:, :wav_len.as_numpy()[0][0]]
             prompt_speech_resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=24000)(wav_tensor)
@@ -275,20 +291,20 @@ class TritonPythonModel:
             token_len = min(int(speech_feat.shape[1] / 2), prompt_speech_tokens.shape[-1])
             prompt_speech_feat = speech_feat[:, :2 * token_len].contiguous().half()
             prompt_speech_tokens = prompt_speech_tokens[:, :token_len].contiguous()
-            
+
             reference_text = pb_utils.get_input_tensor_by_name(request, "reference_text").as_numpy()
             reference_text = reference_text[0][0].decode('utf-8')
-            
+
             target_text = pb_utils.get_input_tensor_by_name(request, "target_text").as_numpy()
             target_text = target_text[0][0].decode('utf-8')
-            
+
             # Prepare prompt for LLM
             input_ids = self.parse_input(
                 text=target_text,
                 prompt_text=reference_text,
                 prompt_speech_tokens=prompt_speech_tokens,
             )
-            
+
             # Generate semantic tokens with LLM
             generated_ids_iter = self.forward_llm(input_ids)
 
@@ -305,13 +321,13 @@ class TritonPythonModel:
                 generated_ids = torch.tensor(generated_ids).unsqueeze(0).to(torch.int32).to(self.device)
                 prompt_spk_embedding = self._extract_spk_embedding(wav_tensor)
                 audio = self.forward_token2wav(prompt_speech_tokens, prompt_speech_feat, prompt_spk_embedding, generated_ids)
-                
+
                 # Prepare response
                 audio_tensor = pb_utils.Tensor.from_dlpack("waveform", to_dlpack(audio))
                 inference_response = pb_utils.InferenceResponse(output_tensors=[audio_tensor])
                 response_sender.send(inference_response)
                 response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
-                self.logger.log_info(f"send tritonserver_response_complete_final to end")
+                self.logger.log_info("send tritonserver_response_complete_final to end")
             else:
                 generated_ids = next(generated_ids_iter)
                 generated_ids = torch.tensor(generated_ids).unsqueeze(0).to(self.device)
@@ -320,11 +336,11 @@
 
                 prompt_spk_embedding = self._extract_spk_embedding(wav_tensor)
                 audio = self.forward_token2wav(prompt_speech_tokens, prompt_speech_feat, prompt_spk_embedding, generated_ids)
-                
+
                 # Prepare response
                 audio_tensor = pb_utils.Tensor.from_dlpack("waveform", to_dlpack(audio))
                 inference_response = pb_utils.InferenceResponse(output_tensors=[audio_tensor])
                 responses.append(inference_response)
-            
+
         if not self.decoupled:
-            return responses
+            return responses

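One detail in the prompt preparation above: mel features are extracted at 24 kHz with hop_size=480 (50 frames per second), while the speech tokens run at roughly 25 per second, so there are about two feature frames per token. That ratio is why the code keeps 2 * token_len feature frames for token_len prompt tokens; a quick check of the arithmetic (the clip length is an arbitrary example):

# Frame-rate bookkeeping behind the `token_len` / `2 * token_len` trim above
# (the 5 s clip length is an arbitrary example; the 25 Hz token rate is approximate).
sampling_rate = 24000
hop_size = 480
token_rate = 25

clip_seconds = 5
mel_frames = clip_seconds * sampling_rate // hop_size   # 250 frames (50 per second)
speech_tokens = clip_seconds * token_rate               # 125 tokens
token_len = min(mel_frames // 2, speech_tokens)         # 125
kept_feature_frames = 2 * token_len                     # 250, i.e. the full feature length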
+ 11 - 13
runtime/triton_trtllm/model_repo/token2wav/1/model.py

@@ -44,6 +44,7 @@ logger = logging.getLogger(__name__)
 
 ORIGINAL_VOCAB_SIZE = 151663
 
+
 class CosyVoice2:
 
     def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False, trt_concurrent=1):
@@ -66,6 +67,7 @@ class CosyVoice2:
                                 trt_concurrent,
                                 self.fp16)
 
+
 class CosyVoice2Model:
 
     def __init__(self,
@@ -109,16 +111,17 @@ class CosyVoice2Model:
         input_names = ["x", "mask", "mu", "cond"]
         input_names = ["x", "mask", "mu", "cond"]
         return {'min_shape': min_shape, 'opt_shape': opt_shape, 'max_shape': max_shape, 'input_names': input_names}
         return {'min_shape': min_shape, 'opt_shape': opt_shape, 'max_shape': max_shape, 'input_names': input_names}
 
 
+
 class TritonPythonModel:
 class TritonPythonModel:
     """Triton Python model for vocoder.
     """Triton Python model for vocoder.
-    
+
     This model takes global and semantic tokens as input and generates audio waveforms
     This model takes global and semantic tokens as input and generates audio waveforms
     using the BiCodec vocoder.
     using the BiCodec vocoder.
     """
     """
 
 
     def initialize(self, args):
     def initialize(self, args):
         """Initialize the model.
         """Initialize the model.
-        
+
         Args:
         Args:
             args: Dictionary containing model configuration
             args: Dictionary containing model configuration
         """
         """
@@ -126,24 +129,23 @@ class TritonPythonModel:
         parameters = json.loads(args['model_config'])['parameters']
         model_params = {key: value["string_value"] for key, value in parameters.items()}
         model_dir = model_params["model_dir"]
-        
+
         # Initialize device and vocoder
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         logger.info(f"Initializing vocoder from {model_dir} on {self.device}")
-        
+
         self.token2wav_model = CosyVoice2(
             model_dir, load_jit=True, load_trt=True, fp16=True
         )
 
         logger.info("Token2Wav initialized successfully")
 
-
     def execute(self, requests):
         """Execute inference on the batched requests.
-        
+
         Args:
             requests: List of inference requests
-            
+
         Returns:
             List of inference responses containing generated waveforms
         """
@@ -163,7 +165,7 @@ class TritonPythonModel:
             # shift the speech tokens according to the original vocab size
             prompt_speech_tokens = prompt_speech_tokens - ORIGINAL_VOCAB_SIZE
             target_speech_tokens = target_speech_tokens - ORIGINAL_VOCAB_SIZE
-            
+
             tts_mel, _ = self.token2wav_model.model.flow.inference(
                 token=target_speech_tokens,
                 token_len=torch.tensor([target_speech_tokens.shape[1]], dtype=torch.int32).to(
@@ -189,9 +191,5 @@ class TritonPythonModel:
             wav_tensor = pb_utils.Tensor.from_dlpack("waveform", to_dlpack(audio_hat))
             inference_response = pb_utils.InferenceResponse(output_tensors=[wav_tensor])
             responses.append(inference_response)
-                             
-        return responses
-
-
-
 
+        return responses

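ORIGINAL_VOCAB_SIZE appears at both ends of the pipeline: audio_tokenizer shifts the speech-token ids up by 151663 so they can share the LLM's id space with text tokens, and token2wav shifts them back down before flow.inference(). A tiny round-trip sketch (the token values are made up for illustration):

# Round trip of the vocab-size offset shared by audio_tokenizer and token2wav.
import torch

ORIGINAL_VOCAB_SIZE = 151663

raw_codes = torch.tensor([[12, 407, 3315]])     # ids produced by the speech tokenizer
llm_ids = raw_codes + ORIGINAL_VOCAB_SIZE       # ids as seen by the TensorRT-LLM model
vocoder_ids = llm_ids - ORIGINAL_VOCAB_SIZE     # ids handed to flow.inference()
assert torch.equal(vocoder_ids, raw_codes)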
+ 14 - 26
runtime/triton_trtllm/scripts/convert_checkpoint.py

@@ -35,8 +35,7 @@ def parse_arguments():
         type=str,
         default='auto',
         choices=['auto', 'float16', 'bfloat16', 'float32'],
-        help=
-        "The data type for the model weights and activations if not quantized. "
+        help="The data type for the model weights and activations if not quantized. "
         "If 'auto', the data type is automatically inferred from the source model; "
         "however, if the source dtype is float32, it is converted to float16.")
     parser.add_argument(
@@ -49,8 +48,7 @@ def parse_arguments():
         '--disable_weight_only_quant_plugin',
         default=False,
         action="store_true",
-        help=
-        'By default, using plugin implementation for weight quantization. Enabling disable_weight_only_quant_plugin flag will use ootb implementation instead of plugin.'
+        help='By default, using plugin implementation for weight quantization. Enabling disable_weight_only_quant_plugin flag will use ootb implementation instead of plugin.'
         'You must also use --use_weight_only for that argument to have an impact.'
     )
     parser.add_argument(
@@ -60,16 +58,14 @@ def parse_arguments():
         nargs='?',
         default='int8',
         choices=['int8', 'int4', 'int4_gptq'],
-        help=
-        'Define the precision for the weights when using weight-only quantization.'
+        help='Define the precision for the weights when using weight-only quantization.'
         'You must also use --use_weight_only for that argument to have an impact.'
     )
     parser.add_argument(
         '--calib_dataset',
         type=str,
         default='ccdv/cnn_dailymail',
-        help=
-        "The huggingface dataset name or the local directory of the dataset for calibration."
+        help="The huggingface dataset name or the local directory of the dataset for calibration."
     )
     parser.add_argument(
         "--smoothquant",
@@ -83,31 +79,27 @@ def parse_arguments():
         '--per_channel',
         action="store_true",
         default=False,
-        help=
-        'By default, we use a single static scaling factor for the GEMM\'s result. '
+        help='By default, we use a single static scaling factor for the GEMM\'s result. '
         'per_channel instead uses a different static scaling factor for each channel. '
         'The latter is usually more accurate, but a little slower.')
     parser.add_argument(
         '--per_token',
         action="store_true",
         default=False,
-        help=
-        'By default, we use a single static scaling factor to scale activations in the int8 range. '
+        help='By default, we use a single static scaling factor to scale activations in the int8 range. '
         'per_token chooses at run time, and for each token, a custom scaling factor. '
         'The latter is usually more accurate, but a little slower.')
     parser.add_argument(
         '--int8_kv_cache',
         default=False,
         action="store_true",
-        help=
-        'By default, we use dtype for KV cache. int8_kv_cache chooses int8 quantization for KV'
+        help='By default, we use dtype for KV cache. int8_kv_cache chooses int8 quantization for KV'
     )
     parser.add_argument(
         '--per_group',
         default=False,
         action="store_true",
-        help=
-        'By default, we use a single static scaling factor to scale weights in the int4 range. '
+        help='By default, we use a single static scaling factor to scale weights in the int4 range. '
         'per_group chooses at run time, and for each group, a custom scaling factor. '
         'The flag is built for GPTQ/AWQ quantization.')
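Note: the --per_channel and --per_token help strings above contrast one static scale for a whole GEMM operand with finer-grained scales. A short NumPy sketch of that difference (illustrative only, not the converter's code):

```python
import numpy as np

# Illustrative only: symmetric int8 quantization of a weight matrix W
# (out_features x in_features) with per-tensor vs. per-channel scales.
W = np.random.randn(4, 8).astype(np.float32)

# Per-tensor: a single static scaling factor for the whole operand.
scale_tensor = np.abs(W).max() / 127.0
W_q_tensor = np.clip(np.round(W / scale_tensor), -127, 127).astype(np.int8)

# Per-channel: a different static scaling factor for each output channel (row).
scale_channel = np.abs(W).max(axis=1, keepdims=True) / 127.0
W_q_channel = np.clip(np.round(W / scale_channel), -127, 127).astype(np.int8)

# Dequantize and compare reconstruction error; per-channel is usually more accurate.
print("per-tensor error:", np.abs(W_q_tensor * scale_tensor - W).mean())
print("per-channel error:", np.abs(W_q_channel * scale_channel - W).mean())
```

--per_token applies the same idea to activations, choosing a scale per token at run time, and --per_group does it per weight group for the int4 GPTQ/AWQ path.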
 
 
@@ -121,16 +113,14 @@ def parse_arguments():
         '--use_parallel_embedding',
         action="store_true",
         default=False,
-        help=
-        'By default embedding parallelism is disabled. By setting this flag, embedding parallelism is enabled'
+        help='By default embedding parallelism is disabled. By setting this flag, embedding parallelism is enabled'
     )
     parser.add_argument(
         '--embedding_sharding_dim',
         type=int,
         default=0,
         choices=[0, 1],
-        help=
-        'By default the embedding lookup table is sharded along vocab dimension (embedding_sharding_dim=0). '
+        help='By default the embedding lookup table is sharded along vocab dimension (embedding_sharding_dim=0). '
         'To shard it along hidden dimension, set embedding_sharding_dim=1'
         'Note: embedding sharing is only enabled when embedding_sharding_dim = 0'
     )
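Note: a rough sketch of the two --embedding_sharding_dim choices described above (illustrative shapes, not the converter's implementation):

```python
import numpy as np

# Illustrative only: shard an embedding lookup table across tp_size ranks.
vocab_size, hidden_size, tp_size = 32000, 4096, 2
embedding = np.zeros((vocab_size, hidden_size), dtype=np.float16)

# embedding_sharding_dim=0: shard along the vocab dimension (one row block per rank).
vocab_shards = np.split(embedding, tp_size, axis=0)   # each shard is (16000, 4096)

# embedding_sharding_dim=1: shard along the hidden dimension (one column block per rank).
hidden_shards = np.split(embedding, tp_size, axis=1)  # each shard is (32000, 2048)
```

As the help text notes, embedding sharing is only available with the vocab-dimension layout (embedding_sharding_dim=0).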
@@ -147,15 +137,13 @@ def parse_arguments():
         '--moe_tp_size',
         type=int,
         default=-1,
-        help=
-        'N-way tensor parallelism size for MOE, default is tp_size, which will do tp-only for MoE'
+        help='N-way tensor parallelism size for MOE, default is tp_size, which will do tp-only for MoE'
     )
     parser.add_argument(
         '--moe_ep_size',
         type=int,
         default=-1,
-        help=
-        'N-way expert parallelism size for MOE, default is 1, which will do tp-only for MoE'
+        help='N-way expert parallelism size for MOE, default is 1, which will do tp-only for MoE'
     )
     args = parser.parse_args()
     return args
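Note: a small sketch of how --moe_ep_size and --moe_tp_size partition a MoE layer (illustrative numbers only, not the converter's code):

```python
# Illustrative only: with 8 experts, moe_ep_size=2 gives each expert-parallel
# group 4 experts, while moe_tp_size=2 splits every expert's weights in half.
num_experts, moe_ep_size, moe_tp_size = 8, 2, 2
experts_per_ep_rank = num_experts // moe_ep_size

for ep_rank in range(moe_ep_size):
    owned = list(range(ep_rank * experts_per_ep_rank,
                       (ep_rank + 1) * experts_per_ep_rank))
    print(f"EP group {ep_rank}: experts {owned}, "
          f"each expert's weights split across {moe_tp_size} TP ranks")
```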
@@ -249,7 +237,7 @@ def convert_and_save_hf(args):
                                                trust_remote_code=True)
         quant_config, override_fields = update_quant_config_from_hf(
             quant_config, hf_config, override_fields)
-    except:
+    except BaseException:
         logger.warning("AutoConfig cannot load the huggingface config.")
 
     if args.smoothquant is not None or args.int8_kv_cache:
@@ -339,4 +327,4 @@ def main():
 
 
 if __name__ == '__main__':
-    main()
+    main()

+ 2 - 3
runtime/triton_trtllm/scripts/fill_template.py

@@ -1,4 +1,4 @@
-#! /usr/bin/env python3
+# /usr/bin/env python3
 from argparse import ArgumentParser
 from string import Template
 
@@ -59,8 +59,7 @@ if __name__ == "__main__":
     parser.add_argument("file_path", help="path of the .pbtxt to modify")
     parser.add_argument(
         "substitutions",
-        help=
-        "substitutions to perform, in the format variable_name_1:value_1,variable_name_2:value_2..."
+        help="substitutions to perform, in the format variable_name_1:value_1,variable_name_2:value_2..."
     )
     parser.add_argument("--in_place",
                         "-i",
                         "-i",

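Note: the substitutions argument above takes a variable_name:value list. A self-contained sketch of how such a string can drive string.Template substitution on a pbtxt-style file (assumed behaviour, not necessarily the script's exact code):

```python
from string import Template

# Illustrative only: parse "variable_name_1:value_1,variable_name_2:value_2..."
# and substitute the values into a pbtxt-style template.
pbtxt = Template('max_batch_size: ${max_batch_size}\nbackend: "${backend}"')
substitutions = "max_batch_size:16,backend:python"

mapping = dict(pair.split(":", 1) for pair in substitutions.split(","))
print(pbtxt.safe_substitute(mapping))
# max_batch_size: 16
# backend: "python"
```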
+ 1 - 2
runtime/triton_trtllm/scripts/test_llm.py

@@ -46,7 +46,6 @@ def parse_arguments(args=None):
     parser.add_argument('--top_k', type=int, default=50)
     parser.add_argument('--top_p', type=float, default=0.95)
 
-
     return parser.parse_args(args=args)
 
 
@@ -60,7 +59,7 @@ def parse_input(tokenizer,
         input_ids = tokenizer.encode(
             curr_text)
         batch_input_ids.append(input_ids)
- 
+
     batch_input_ids = [
         torch.tensor(x, dtype=torch.int32) for x in batch_input_ids
     ]