rename files

root, 1 month ago
parent commit c7686faccc

+ 1 - 1
runtime/triton_trtllm/README.DIT.md → runtime/triton_trtllm/README.Cosyvoice2.DiT.md

@@ -8,7 +8,7 @@ This document describes how to accelerate CosyVoice with a DiT-based Token2Wav m
 
 Launch the service directly with Docker Compose:
 ```sh
-docker compose -f docker-compose.dit.yml up
+docker compose -f docker-compose.cosyvoice2.dit.yml up
 ```
 
 ### Build the Docker Image

+ 1 - 1
runtime/triton_trtllm/README.md → runtime/triton_trtllm/README.Cosyvoice2.Unet.md

@@ -6,7 +6,7 @@ Contributed by Yuekai Zhang (NVIDIA).
 
 Launch the service directly with Docker Compose:
 ```sh
-docker compose up
+docker compose -f docker-compose.cosyvoice2.unet.yml up
 ```
 
 ### Build the Docker Image

+ 0 - 0
runtime/triton_trtllm/docker-compose.dit.yml → runtime/triton_trtllm/docker-compose.cosyvoice2.dit.yml


+ 0 - 0
runtime/triton_trtllm/docker-compose.yml → runtime/triton_trtllm/docker-compose.cosyvoice2.unet.yml
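Since `docker-compose.yml` no longer exists under its default name after these renames, each stack now has to be selected explicitly with `-f`. A minimal sketch of the resulting launch/teardown commands, assuming the service definitions themselves are unchanged:

```sh
# DiT-based Token2Wav stack
docker compose -f docker-compose.cosyvoice2.dit.yml up -d
docker compose -f docker-compose.cosyvoice2.dit.yml down

# Original U-Net stack
docker compose -f docker-compose.cosyvoice2.unet.yml up -d
docker compose -f docker-compose.cosyvoice2.unet.yml down
```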


+ 1 - 0
runtime/triton_trtllm/requirements.txt

@@ -12,3 +12,4 @@ pyworld
 openai-whisper
 tritonclient
 modelscope
+x_transformers
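`x_transformers` is the new dependency pulled in for the CosyVoice3 LLM code; a quick hedged check that the updated requirements resolve (no version is pinned in the diff, so only importability is verified):

```sh
pip3 install -r runtime/triton_trtllm/requirements.txt
python3 -c "import x_transformers"   # exits silently if the new dependency resolved
```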

+ 21 - 135
runtime/triton_trtllm/run_cosyvoice3.sh

@@ -10,20 +10,21 @@ export PYTHONPATH=${cosyvoice_path}/third_party/Matcha-TTS:$PYTHONPATH
 stage=$1
 stop_stage=$2
 
-huggingface_model_local_dir=./hf_cosyvoice3_llm
-model_scope_model_local_dir=/workspace_yuekai/HF/Fun-CosyVoice3-0.5B-2512
+huggingface_llm_local_dir=$cosyvoice_path/runtime/triton_trtllm/hf_cosyvoice3_llm
+cosyvoice3_official_model_dir=$cosyvoice_path/runtime/triton_trtllm/Fun-CosyVoice3-0.5B-2512
 
 trt_dtype=bfloat16
-trt_weights_dir=./trt_weights_${trt_dtype}
-trt_engines_dir=./trt_engines_${trt_dtype}
+trt_weights_dir=$cosyvoice_path/runtime/triton_trtllm/trt_weights_${trt_dtype}
+trt_engines_dir=$cosyvoice_path/runtime/triton_trtllm/trt_engines_${trt_dtype}
 
-model_repo_src=./model_repo_cosyvoice3
-model_repo=./deploy_cosyvoice3
-bls_instance_num=1
+model_repo_src=$cosyvoice_path/runtime/triton_trtllm/model_repo_cosyvoice3
+model_repo=$cosyvoice_path/runtime/triton_trtllm/model_repo_cosyvoice3_copy
+bls_instance_num=10
 
 if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
 
     echo "Cloning CosyVoice"
+    pip3 install --upgrade x_transformers s3tokenizer
     git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git $cosyvoice_path
     cd $cosyvoice_path
     git submodule update --init --recursive
@@ -31,23 +32,16 @@ if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
 fi
 
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
-    echo ""
-    # see https://github.com/nvidia-china-sae/mair-hub/blob/main/rl-tutorial/cosyvoice_llm/pretrained_to_huggingface.py
-    # huggingface-cli download --local-dir $huggingface_model_local_dir yuekai/cosyvoice2_llm
-    # modelscope download --model iic/CosyVoice2-0.5B --local_dir $model_scope_model_local_dir
-
-    # pip3 install --upgrade x_transformers s3tokenizer 
-    # pip install -U nvidia-modelopt[all]
-    python3 scripts/convert_cosyvoice3_to_hf.py \
-        --model-dir $model_scope_model_local_dir \
-        --output-dir $huggingface_model_local_dir || exit 1 # TODO: output dir should be here
-
+    echo "Downloading CosyVoice3 Checkpoints"
+    huggingface-cli download --local-dir $huggingface_llm_local_dir yuekai/Fun-CosyVoice3-0.5B-2512-LLM-HF
+    huggingface-cli download --local-dir $cosyvoice3_official_model_dir yuekai/Fun-CosyVoice3-0.5B-2512-FP16-ONNX
+    huggingface-cli download --local-dir $cosyvoice3_official_model_dir FunAudioLLM/Fun-CosyVoice3-0.5B-2512
 fi
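The converter and the deploy stages below expect the LLM checkpoint and tokenizer files to be present locally after these downloads; a small sanity check (only `llm.pt` is confirmed by `scripts/convert_cosyvoice3_to_hf.py`, the rest is an assumption about the repository layout):

```sh
ls $huggingface_llm_local_dir                    # merged HF-format LLM checkpoint + tokenizer
ls $cosyvoice3_official_model_dir/llm.pt         # raw CosyVoice3 LLM weights read by the converter
```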
 
 
 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
     echo "Converting checkpoint to TensorRT weights"
-    python3 scripts/convert_checkpoint.py --model_dir $huggingface_model_local_dir \
+    python3 scripts/convert_checkpoint.py --model_dir $huggingface_llm_local_dir \
                                 --output_dir $trt_weights_dir \
                                 --dtype $trt_dtype || exit 1
 
@@ -60,7 +54,7 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
 
     echo "Testing TensorRT engines"
     python3 ./scripts/test_llm.py --input_text "你好,请问你叫什么?" \
-                    --tokenizer_dir $huggingface_model_local_dir \
+                    --tokenizer_dir $huggingface_llm_local_dir \
                     --top_k 50 --top_p 0.95 --temperature 0.8 \
                     --engine_dir=$trt_engines_dir  || exit 1
 fi
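The engine build between the weight conversion and the test above is elided from this hunk; per the converter's own docstring it is a `trtllm-build` call along these lines (every flag beyond the two directories is an assumption and should be checked against the full script):

```sh
trtllm-build --checkpoint_dir $trt_weights_dir \
             --output_dir $trt_engines_dir \
             --max_batch_size 64    # assumed to match the trtllm-serve --max_batch_size used below
```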
@@ -78,8 +72,8 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
     cp -r ${model_repo_src}/speaker_embedding $model_repo/
 
     MAX_QUEUE_DELAY_MICROSECONDS=0
-    MODEL_DIR=$model_scope_model_local_dir
-    LLM_TOKENIZER_DIR=$huggingface_model_local_dir
+    MODEL_DIR=$cosyvoice3_official_model_dir
+    LLM_TOKENIZER_DIR=$huggingface_llm_local_dir
     BLS_INSTANCE_NUM=$bls_instance_num
     TRITON_MAX_BATCH_SIZE=1
     DECOUPLED_MODE=True
@@ -92,44 +86,16 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
 
 fi
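The template-filling call that consumes these variables is elided from the hunk; a hypothetical sketch of what such a substitution step typically looks like in Triton + TensorRT-LLM deployments (the script name, config path, and key names here are assumptions, not taken from this repository):

```sh
# Hypothetical: substitute the variables above into the Triton config template
python3 scripts/fill_template.py -i $model_repo/cosyvoice3/config.pbtxt \
    "triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE},bls_instance_num:${BLS_INSTANCE_NUM},llm_tokenizer_dir:${LLM_TOKENIZER_DIR},model_dir:${MODEL_DIR},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS}"
```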
 
-if [ $stage -le 30 ] && [ $stop_stage -ge 30 ]; then
-    echo "Starting CosyVoice3 Triton server and LLM using trtllm-serve"
-    CUDA_VISIBLE_DEVICES=0 mpirun -np 1 --allow-run-as-root --oversubscribe trtllm-serve serve --tokenizer $huggingface_model_local_dir $trt_engines_dir --max_batch_size 64  --kv_cache_free_gpu_memory_fraction 0.4
-fi
-
-
-if [ $stage -le 40 ] && [ $stop_stage -ge 40 ]; then
-
-   CUDA_VISIBLE_DEVICES=1 tritonserver --model-repository $model_repo --http-port 18000 --grpc-port 18001 --metrics-port 18002 &
-fi
-
-
 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
    echo "Starting CosyVoice3 Triton server and LLM using trtllm-serve"
-   CUDA_VISIBLE_DEVICES=0 mpirun -np 1 --allow-run-as-root --oversubscribe trtllm-serve serve --tokenizer $huggingface_model_local_dir $trt_engines_dir --max_batch_size 64  --kv_cache_free_gpu_memory_fraction 0.4 &
-   CUDA_VISIBLE_DEVICES=0,1,2,3 tritonserver --model-repository $model_repo --http-port 18000 --grpc-port 18001 --metrics-port 18002 &
+   CUDA_VISIBLE_DEVICES=0 mpirun -np 1 --allow-run-as-root --oversubscribe trtllm-serve serve --tokenizer $huggingface_llm_local_dir $trt_engines_dir --max_batch_size 64  --kv_cache_free_gpu_memory_fraction 0.4 &
+   CUDA_VISIBLE_DEVICES=0 tritonserver --model-repository $model_repo --http-port 18000 --grpc-port 18001 --metrics-port 18002 &
    wait
-    # Test using curl
-    # curl http://localhost:8000/v1/chat/completions \
-    #     -H "Content-Type: application/json" \
-    #     -d '{
-    #         "model": "",
-    #         "messages":[{"role": "user", "content": "Where is New York?"},
-    #                     {"role": "assistant", "content": "<|s_1708|><|s_2050|><|s_2159|>"}],
-    #         "max_tokens": 512,
-    #         "temperature": 0.8,
-    #         "top_p": 0.95,
-    #         "top_k": 50,
-    #         "stop": ["<|eos1|>"],
-    #         "repetition_penalty": 1.2,
-    #         "stream": false
-    #     }'
 fi
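With the commented-out curl example removed, a quick way to smoke-test the combined deployment is the gRPC client used by the benchmark stages; a minimal single-request sketch against the Triton gRPC port started above (the model name is copied from the removed disaggregated benchmark and may need to be adjusted for the CosyVoice3 model repository):

```sh
python3 client_grpc.py \
    --server-addr localhost \
    --server-port 18001 \
    --model-name cosyvoice2_dit \
    --num-tasks 1 \
    --mode streaming \
    --huggingface-dataset yuekai/seed_tts_cosy2 \
    --log-dir ./log_smoke_test
```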
 
 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
     echo "Running benchmark client for CosyVoice3"
     num_task=4
-    mode=offline
     mode=streaming
     BLS_INSTANCE_NUM=$bls_instance_num
 
@@ -145,93 +111,13 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
 fi
 
 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
-  echo "stage 5: Offline TTS (Cosyvoice2 LLM + Step-Audio2-mini DiT Token2Wav) inference using a single python script"
-
-  datasets=(wenetspeech4tts) # wenetspeech4tts, test_zh, zero_shot_zh
-  backend=trtllm # hf, trtllm, vllm, trtllm-serve
-
-  batch_sizes=(16)
-  token2wav_batch_size=1
-
-  for batch_size in ${batch_sizes[@]}; do
-    for dataset in ${datasets[@]}; do
-    output_dir=./${dataset}_${backend}_llm_batch_size_${batch_size}_token2wav_batch_size_${token2wav_batch_size}
-    CUDA_VISIBLE_DEVICES=1 \
-        python3 offline_inference.py \
-            --output-dir $output_dir \
-            --llm-model-name-or-path $huggingface_model_local_dir \
-            --token2wav-path $step_audio_model_dir/token2wav \
-            --backend $backend \
-            --batch-size $batch_size --token2wav-batch-size $token2wav_batch_size \
-            --engine-dir $trt_engines_dir \
-            --split-name ${dataset} || exit 1
-    done
-  done
-fi
-
-
-
-
-if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
-   echo "Disaggregated Server: LLM and Token2wav on different GPUs"
-   echo "Starting LLM server on GPU 0"
-   export CUDA_VISIBLE_DEVICES=0
-   mpirun -np 1 --allow-run-as-root --oversubscribe trtllm-serve serve --tokenizer $huggingface_model_local_dir $trt_engines_dir --max_batch_size 64  --kv_cache_free_gpu_memory_fraction 0.4 &
-   echo "Starting Token2wav server on GPUs 1-3"
-   Token2wav_num_gpus=3
-   http_port=17000
-   grpc_port=18000
-   metrics_port=16000
-   for i in $(seq 0 $(($Token2wav_num_gpus - 1))); do
-       echo "Starting server on GPU $i"
-       http_port=$((http_port + 1))
-       grpc_port=$((grpc_port + 1))
-       metrics_port=$((metrics_port + 1))
-       # Two instances of Token2wav server on the same GPU
-       CUDA_VISIBLE_DEVICES=$(($i + 1)) tritonserver --model-repository $model_repo --http-port $http_port --grpc-port $grpc_port --metrics-port $metrics_port &
-       http_port=$((http_port + 1))
-       grpc_port=$((grpc_port + 1))
-       metrics_port=$((metrics_port + 1))
-       CUDA_VISIBLE_DEVICES=$(($i + 1)) tritonserver --model-repository $model_repo --http-port $http_port --grpc-port $grpc_port --metrics-port $metrics_port &
-   done
-   wait
-fi
-
-if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
-    echo "Running benchmark client for Disaggregated Server"
-    per_gpu_instances=2
-    mode=streaming
-    BLS_INSTANCE_NUM=$bls_instance_num
-    Token2wav_num_gpus=(1 2 3)
-    concurrent_tasks=(1 2 3 4 5 6)
-    for n_gpu in ${Token2wav_num_gpus[@]}; do
-        echo "Test 1 GPU for LLM server and $n_gpu GPUs for Token2wav servers"
-        for concurrent_task in ${concurrent_tasks[@]}; do
-            num_instances=$((per_gpu_instances * n_gpu))
-            for i in $(seq 1 $num_instances); do
-                port=$(($i + 18000))
-                python3 client_grpc.py \
-                    --server-addr localhost \
-                    --server-port $port \
-                    --model-name cosyvoice2_dit \
-                    --num-tasks $concurrent_task \
-                    --mode $mode \
-                    --huggingface-dataset yuekai/seed_tts_cosy2 \
-                    --log-dir ./log_disagg_concurrent_tasks_${concurrent_task}_per_instance_total_token2wav_instances_${num_instances}_port_${port} &
-            done
-            wait
-        done
-    done
-fi
-
-if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
     echo "stage 10: Python script CosyVoice3 TTS (LLM + CosyVoice3 Token2Wav) inference"
 
     datasets=(wenetspeech4tts) # wenetspeech4tts
     backend=trtllm-serve  # hf, trtllm, vllm, trtllm-serve
 
     batch_sizes=(1)
-    token2wav_batch_size=1
+    token2wav_batch_size=1 # Only batch size 1 is supported for now
 
     for batch_size in ${batch_sizes[@]}; do
       for dataset in ${datasets[@]}; do
@@ -239,8 +125,8 @@ if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
         CUDA_VISIBLE_DEVICES=0 \
             python3 infer_cosyvoice3.py \
                 --output-dir $output_dir \
-                --llm-model-name-or-path $huggingface_model_local_dir \
-                --token2wav-path $model_scope_model_local_dir \
+                --llm-model-name-or-path $huggingface_llm_local_dir \
+                --token2wav-path $cosyvoice3_official_model_dir \
                 --backend $backend \
                 --batch-size $batch_size --token2wav-batch-size $token2wav_batch_size \
                 --engine-dir $trt_engines_dir \

+ 52 - 53
runtime/triton_trtllm/scripts/convert_cosyvoice3_to_hf.py

@@ -13,21 +13,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Конвертация CosyVoice3 LLM в HuggingFace формат с объединёнными embeddings.
+Convert CosyVoice3 LLM to HuggingFace format with merged embeddings.
 
-Этот скрипт:
-1. Загружает CosyVoice3 модель
-2. Расширяет vocab токенизатора с speech токенами
-3. Объединяет speech_embedding в embed_tokens Qwen2
-4. Заменяет lm_head на llm_decoder с расширенным vocab
-5. Сохраняет модель в HuggingFace формате для TRT-LLM конвертации
+This script:
+1. Loads CosyVoice3 model
+2. Extends tokenizer vocab with speech tokens
+3. Merges speech_embedding into embed_tokens of Qwen2
+4. Replaces lm_head with llm_decoder using extended vocab
+5. Saves model in HuggingFace format for TRT-LLM conversion
 
 Usage:
     python scripts/convert_cosyvoice3_to_hf.py \
         --model-dir pretrained_models/Fun-CosyVoice3-0.5B \
         --output-dir pretrained_models/Fun-CosyVoice3-0.5B/hf_merged
 
-После этого можно конвертировать в TRT-LLM:
+Then convert to TRT-LLM:
     trtllm-build --checkpoint_dir <output_dir> --output_dir <trt_engines_dir> ...
 """
 import argparse
@@ -70,7 +70,7 @@ def parse_args():
 
 
 def load_cosyvoice3_model(model_dir: str):
-    """Загружает CosyVoice3 модель для извлечения весов."""
+    """Load CosyVoice3 model for weight extraction."""
     from hyperpyyaml import load_hyperpyyaml
     from cosyvoice.utils.class_utils import get_model_type
     
@@ -86,7 +86,7 @@ def load_cosyvoice3_model(model_dir: str):
             overrides={'qwen_pretrain_path': hf_llm_dir}
         )
     
-    # Загружаем только LLM
+    # Load LLM only
     llm = configs['llm']
     llm_weights_path = os.path.join(model_dir, 'llm.pt')
     llm.load_state_dict(torch.load(llm_weights_path, map_location='cpu'), strict=True)
@@ -98,12 +98,11 @@ def load_cosyvoice3_model(model_dir: str):
 
 
 def get_speech_token_size(llm) -> int:
-    """Определяет размер speech token vocabulary из модели."""
-    # CosyVoice3LM имеет: speech_token_size + 200 в llm_decoder
-    # speech_embedding имеет: speech_token_size + 200
+    """Determine speech token vocabulary size from the model."""
+    # CosyVoice3LM has: speech_token_size + 200 in llm_decoder
+    # speech_embedding has: speech_token_size + 200
     speech_embedding_size = llm.speech_embedding.num_embeddings
-    # Вычитаем 200 специальных токенов (sos, eos, task_id, fill, и т.д.)
-    # Но для безопасности используем полный размер embedding
+    # Use full embedding size (includes speech special tokens)
     return speech_embedding_size
 
 
@@ -113,32 +112,32 @@ def convert_cosyvoice3_to_hf(
     dtype: str = "bfloat16",
 ):
     """
-    Конвертирует CosyVoice3 LLM в HuggingFace формат с объединёнными embeddings.
-    
-    Архитектура объединения:
-    - embed_tokens[0:original_vocab_size] = оригинальные text embeddings
+    Convert CosyVoice3 LLM to HuggingFace format with merged embeddings.
+
+    Merging architecture:
+    - embed_tokens[0:original_vocab_size] = original text embeddings
     - embed_tokens[original_vocab_size:original_vocab_size+speech_token_size] = speech_embedding
     - lm_head[original_vocab_size:original_vocab_size+speech_token_size] = llm_decoder
-    
+
     Args:
-        model_dir: Путь к CosyVoice3 модели
-        output_dir: Путь для сохранения HF модели
-        dtype: Тип данных для сохранения
+        model_dir: Path to CosyVoice3 model
+        output_dir: Path to save HF model
+        dtype: Data type for saving
     """
     logger.info(f"Loading CosyVoice3 model from {model_dir}")
     
-    # 1. Загружаем CosyVoice3 компоненты
+    # 1. Load CosyVoice3 components
     cosyvoice3_llm, hf_llm_dir, configs = load_cosyvoice3_model(model_dir)
     
-    # Извлекаем ключевые компоненты
+    # Extract key components
     qwen_model = cosyvoice3_llm.llm.model  # Qwen2ForCausalLM
-    speech_embedding = cosyvoice3_llm.speech_embedding  # Embedding для speech токенов
-    llm_decoder = cosyvoice3_llm.llm_decoder  # Linear для декодирования в speech токены
+    speech_embedding = cosyvoice3_llm.speech_embedding  # Embedding for speech tokens
+    llm_decoder = cosyvoice3_llm.llm_decoder  # Linear for decoding to speech tokens
     
     speech_token_size = get_speech_token_size(cosyvoice3_llm)
     logger.info(f"Speech token size: {speech_token_size}")
     
-    # 2. Загружаем tokenizer и добавляем CosyVoice3 text special tokens + speech токены
+    # 2. Load tokenizer and add CosyVoice3 text special tokens + speech tokens
     tokenizer = AutoTokenizer.from_pretrained(hf_llm_dir, trust_remote_code=True)
     base_vocab_size = len(tokenizer)
     logger.info(f"Base tokenizer vocab size: {base_vocab_size}")
@@ -210,8 +209,8 @@ def convert_cosyvoice3_to_hf(
     logger.info(f"New tokenizer vocab size: {new_vocab_size}")
     logger.info(f"Added {new_vocab_size - base_vocab_size} tokens total (text special + speech tokens)")
     
-    # 3. Изменяем размер embeddings в Qwen модели
-    # Выравниваем по 128 для эффективности TensorRT
+    # 3. Resize embeddings in Qwen model
+    # Align to 128 for TensorRT efficiency
     padded_vocab_size = ((new_vocab_size + 127) // 128) * 128
     qwen_model.resize_token_embeddings(padded_vocab_size)
     logger.info(f"Resized embeddings to: {padded_vocab_size}")
@@ -219,7 +218,7 @@ def convert_cosyvoice3_to_hf(
     # Speech tokens start after text vocab (base + CosyVoice3 text special tokens)
     speech_token_offset = text_vocab_size
 
-    # 4. Копируем speech_embedding в расширенную часть embed_tokens
+    # 4. Copy speech_embedding into extended embed_tokens
     input_embeddings = qwen_model.get_input_embeddings()
     hidden_size = input_embeddings.weight.shape[1]
     
@@ -228,7 +227,7 @@ def convert_cosyvoice3_to_hf(
     logger.info(f"llm_decoder shape: {llm_decoder.weight.shape}")
     
     with torch.no_grad():
-        # Копируем speech_embedding веса в embed_tokens
+        # Copy speech_embedding weights into embed_tokens
         # Indices: [speech_token_offset, speech_token_offset + speech_token_size)
         src_size = min(speech_embedding.weight.shape[0], actual_speech_tokens)
         input_embeddings.weight[speech_token_offset:speech_token_offset + src_size] = \
@@ -236,12 +235,12 @@ def convert_cosyvoice3_to_hf(
     
     logger.info(f"Copied speech_embedding to embed_tokens[{speech_token_offset}:{speech_token_offset + src_size}]")
     
-    # 5. Создаём новый lm_head с расширенным vocab и копируем llm_decoder
-    # Оригинальный lm_head: hidden_size -> original_vocab_size
-    # Новый lm_head: hidden_size -> padded_vocab_size
+    # 5. Create new lm_head with extended vocab and copy llm_decoder
+    # Original lm_head: hidden_size -> original_vocab_size
+    # New lm_head: hidden_size -> padded_vocab_size
     # llm_decoder: hidden_size -> speech_token_size
-    
-    # Создаём новый lm_head
+
+    # Create new lm_head
     has_bias = llm_decoder.bias is not None
     new_lm_head = torch.nn.Linear(
         in_features=hidden_size,
@@ -250,24 +249,24 @@ def convert_cosyvoice3_to_hf(
     )
     
     with torch.no_grad():
-        # Инициализируем веса:
-        # - Text часть: копируем из оригинального lm_head (или нули)
-        # - Speech часть: копируем из llm_decoder
-        # - Padding: нули
-        
-        # Сначала заполняем нулями и -inf в bias (чтобы text токены не генерировались)
+        # Initialize weights:
+        # - Text part: copy from original lm_head (or zeros)
+        # - Speech part: copy from llm_decoder
+        # - Padding: zeros
+
+        # Fill with zeros and -inf in bias (so text tokens are not generated)
         new_lm_head.weight.data.zero_()
         if has_bias:
             new_lm_head.bias.data.fill_(-float('inf'))
         
-        # Копируем оригинальный lm_head для text токенов (опционально)
+        # Copy original lm_head for text tokens (optional)
         original_lm_head = qwen_model.lm_head
         if original_lm_head is not None and original_lm_head.weight.shape[0] >= text_vocab_size:
             new_lm_head.weight[:text_vocab_size] = original_lm_head.weight[:text_vocab_size]
             if has_bias and original_lm_head.bias is not None:
                 new_lm_head.bias[:text_vocab_size] = original_lm_head.bias[:text_vocab_size]
         
-        # Копируем llm_decoder для speech токенов
+        # Copy llm_decoder for speech tokens
         decoder_size = min(llm_decoder.weight.shape[0], actual_speech_tokens)
         new_lm_head.weight[speech_token_offset:speech_token_offset + decoder_size] = \
             llm_decoder.weight[:decoder_size].to(new_lm_head.weight.dtype)
@@ -276,18 +275,18 @@ def convert_cosyvoice3_to_hf(
             new_lm_head.bias[speech_token_offset:speech_token_offset + decoder_size] = \
                 llm_decoder.bias[:decoder_size].to(new_lm_head.bias.dtype)
         else:
-            # Если llm_decoder не имеет bias, но мы хотим его для text токенов
+            # If llm_decoder has no bias but we want it for text tokens
             pass
     
-    # Заменяем lm_head
+    # Replace lm_head
     qwen_model.lm_head = new_lm_head
     
     logger.info(f"Created new lm_head with shape: {new_lm_head.weight.shape}")
     logger.info(f"Copied llm_decoder to lm_head[{speech_token_offset}:{speech_token_offset + decoder_size}]")
     
-    # 6. Обновляем конфигурацию модели
+    # 6. Update model configuration
     qwen_model.config.vocab_size = padded_vocab_size
-    qwen_model.config.tie_word_embeddings = False  # Embeddings и lm_head теперь разные!
+    qwen_model.config.tie_word_embeddings = False  # Embeddings and lm_head are now different!
     
     # Set EOS token for generation (speech EOS lives inside speech_embedding as <|s_{base_speech_token_size+1}|>)
     base_speech_token_size = getattr(cosyvoice3_llm, "speech_token_size", 6561)
@@ -295,7 +294,7 @@ def convert_cosyvoice3_to_hf(
     eos_id = speech_token_offset + eos_speech_idx
     qwen_model.config.eos_token_id = eos_id
     
-    # Настройки генерации
+    # Generation settings
     qwen_model.generation_config.eos_token_id = eos_id
     qwen_model.generation_config.pad_token_id = eos_id
     qwen_model.generation_config.temperature = 0.8
@@ -304,7 +303,7 @@ def convert_cosyvoice3_to_hf(
     qwen_model.generation_config.repetition_penalty = 1.1
     qwen_model.generation_config.max_new_tokens = 2048
     
-    # 7. Конвертируем в нужный dtype
+    # 7. Convert to target dtype
     dtype_map = {
         "float16": torch.float16,
         "bfloat16": torch.bfloat16,
@@ -313,7 +312,7 @@ def convert_cosyvoice3_to_hf(
     target_dtype = dtype_map[dtype]
     qwen_model.to(target_dtype)
     
-    # 8. Сохраняем модель и tokenizer
+    # 8. Save model and tokenizer
     os.makedirs(output_dir, exist_ok=True)
     
     qwen_model.save_pretrained(output_dir)
@@ -322,7 +321,7 @@ def convert_cosyvoice3_to_hf(
     tokenizer.chat_template = TEMPLATE
     tokenizer.save_pretrained(output_dir)
     
-    # Сохраняем метаданные для TRT-LLM inference
+    # Save metadata for TRT-LLM inference
     metadata = {
         "original_vocab_size": base_vocab_size,
         "text_vocab_size": text_vocab_size,