root 1 month ago
parent
commit
914454e3fd

+ 1 - 1
cosyvoice/dataset/processor.py

@@ -181,7 +181,7 @@ def compute_fbank(data,
 
 
 def compute_whisper_fbank(data, num_frames=-1, mode='train'):
-    """ Extract whisper fbank 
+    """ Extract whisper fbank
 
         Args:
             data: Iterable[{key, wav, label, sample_rate}]

+ 1 - 1
cosyvoice/vllm/cosyvoice2.py

@@ -99,7 +99,7 @@ class CosyVoice2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         sampling_metadata: Optional[SamplingMetadata] = None,
     ) -> Optional[torch.Tensor]:
         if VLLM_V1_ENGINE_ONLY:
-            logits = self.logits_processor(self.lm_head, hidden_states, 
+            logits = self.logits_processor(self.lm_head, hidden_states,
                                            self.lm_head.bias)
         else:
             logits = self.logits_processor(self.lm_head, hidden_states,

+ 1 - 1
example.py

@@ -99,7 +99,7 @@ def cosyvoice3_example():
     # 歴史的世界においては、過去は単に過ぎ去ったものではない、プラトンのいう如く非有が有である。 -> レキシ テキ セカイ ニ オイ テ ワ、カコ ワ タンニ スギサッ タ モノ デ ワ ナイ、プラトン ノ イウ ゴトク ヒ ユー ガ ユー デ アル。
     for i, j in enumerate(cosyvoice.inference_cross_lingual('You are a helpful assistant.<|endofprompt|>レキシ テキ セカイ ニ オイ テ ワ、カコ ワ タンニ スギサッ タ モノ デ ワ ナイ、プラトン ノ イウ ゴトク ヒ ユー ガ ユー デ アル。',
                                                             './asset/zero_shot_prompt.wav', stream=False)):
-        torchaudio.save('japanese_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)    
+        torchaudio.save('japanese_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
 
 
 def main():

+ 40 - 40
runtime/triton_trtllm/scripts/convert_cosyvoice3_to_hf.py

@@ -73,27 +73,27 @@ def load_cosyvoice3_model(model_dir: str):
     """Load CosyVoice3 model for weight extraction."""
     from hyperpyyaml import load_hyperpyyaml
     from cosyvoice.utils.class_utils import get_model_type
-    
+
     hyper_yaml_path = os.path.join(model_dir, 'cosyvoice3.yaml')
     hf_llm_dir = os.path.join(model_dir, 'CosyVoice-BlankEN')
-    
+
     if not os.path.exists(hyper_yaml_path):
         raise ValueError(f'{hyper_yaml_path} not found!')
-    
+
     with open(hyper_yaml_path, 'r') as f:
         configs = load_hyperpyyaml(
-            f, 
+            f,
             overrides={'qwen_pretrain_path': hf_llm_dir}
         )
-    
+
     # Load LLM only
     llm = configs['llm']
     llm_weights_path = os.path.join(model_dir, 'llm.pt')
     llm.load_state_dict(torch.load(llm_weights_path, map_location='cpu'), strict=True)
     llm.eval()
-    
+
     logger.info(f"Loaded CosyVoice3 LLM from {model_dir}")
-    
+
     return llm, hf_llm_dir, configs
 
 
@@ -125,23 +125,23 @@ def convert_cosyvoice3_to_hf(
         dtype: Data type for saving
     """
     logger.info(f"Loading CosyVoice3 model from {model_dir}")
-    
+
     # 1. Load CosyVoice3 components
     cosyvoice3_llm, hf_llm_dir, configs = load_cosyvoice3_model(model_dir)
-    
+
     # Extract key components
     qwen_model = cosyvoice3_llm.llm.model  # Qwen2ForCausalLM
     speech_embedding = cosyvoice3_llm.speech_embedding  # Embedding for speech tokens
     llm_decoder = cosyvoice3_llm.llm_decoder  # Linear for decoding to speech tokens
-    
+
     speech_token_size = get_speech_token_size(cosyvoice3_llm)
     logger.info(f"Speech token size: {speech_token_size}")
-    
+
     # 2. Load tokenizer and add CosyVoice3 text special tokens + speech tokens
     tokenizer = AutoTokenizer.from_pretrained(hf_llm_dir, trust_remote_code=True)
     base_vocab_size = len(tokenizer)
     logger.info(f"Base tokenizer vocab size: {base_vocab_size}")
-    
+
     # IMPORTANT:
     # - In CosyVoice3, LLM speech special tokens (sos/eos/task_id/fill) are INSIDE speech_embedding,
     #   i.e. represented as <|s_6561|>, <|s_6562|>, <|s_6563|>, <|s_6564|>.
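A minimal sketch of the index layout this note describes, assuming the four specials follow the listed order (only eos is confirmed, by the eos_speech_idx = base + 1 computation later in this diff):

    base_speech_token_size = 6561               # <|s_0|> ... <|s_6560|>: ordinary codec tokens
    speech_specials = {
        'sos':     base_speech_token_size,      # <|s_6561|> (assumed order)
        'eos':     base_speech_token_size + 1,  # <|s_6562|> (matches eos_speech_idx below)
        'task_id': base_speech_token_size + 2,  # <|s_6563|> (assumed order)
        'fill':    base_speech_token_size + 3,  # <|s_6564|> (assumed order)
    }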
@@ -185,7 +185,7 @@ def convert_cosyvoice3_to_hf(
     tokenizer.add_special_tokens(special_tokens)
     text_vocab_size = len(tokenizer)
     logger.info(f"Tokenizer vocab after CosyVoice3 text special tokens: {text_vocab_size}")
-    
+
     # Add speech tokens: <|s_0|>, <|s_1|>, ..., <|s_{embedding_size-1}|>
     # IMPORTANT: This range must match speech_embedding.num_embeddings (includes speech special tokens).
     actual_speech_tokens = speech_token_size  # Full embedding size (with speech special tokens)
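The construction of speech_tokens itself falls between these hunks; given the <|s_{i}|> naming above and the asserts below, it presumably resembles this one-liner (a guess, not shown in the diff):

    # Hedged reconstruction of the elided line:
    speech_tokens = [f'<|s_{i}|>' for i in range(actual_speech_tokens)]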
@@ -204,37 +204,37 @@ def convert_cosyvoice3_to_hf(
     assert "<s_6563>" not in speech_tokens
     assert "<s_6564>" not in speech_tokens
     tokenizer.add_tokens(speech_tokens)
-    
+
     new_vocab_size = len(tokenizer)
     logger.info(f"New tokenizer vocab size: {new_vocab_size}")
     logger.info(f"Added {new_vocab_size - base_vocab_size} tokens total (text special + speech tokens)")
-    
+
     # 3. Resize embeddings in Qwen model
     # Align to 128 for TensorRT efficiency
     padded_vocab_size = ((new_vocab_size + 127) // 128) * 128
     qwen_model.resize_token_embeddings(padded_vocab_size)
     logger.info(f"Resized embeddings to: {padded_vocab_size}")
-    
+
     # Speech tokens start after text vocab (base + CosyVoice3 text special tokens)
     speech_token_offset = text_vocab_size
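A worked example of the round-up-to-128 padding and the resulting vocab layout; the sizes are hypothetical, since the real values depend on the tokenizer:

    new_vocab_size = 158_293                                   # hypothetical
    padded_vocab_size = ((new_vocab_size + 127) // 128) * 128
    assert padded_vocab_size == 158_336                        # next multiple of 128
    # Merged vocab layout:
    #   [0, text_vocab_size)                                             text + text special tokens
    #   [speech_token_offset, speech_token_offset + speech_token_size)   copied speech rows
    #   [new_vocab_size, padded_vocab_size)                              padding for TensorRT alignment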
 
     # 4. Copy speech_embedding into extended embed_tokens
     input_embeddings = qwen_model.get_input_embeddings()
     hidden_size = input_embeddings.weight.shape[1]
-    
+
     logger.info(f"Hidden size: {hidden_size}")
     logger.info(f"speech_embedding shape: {speech_embedding.weight.shape}")
     logger.info(f"llm_decoder shape: {llm_decoder.weight.shape}")
-    
+
     with torch.no_grad():
         # Copy speech_embedding weights into embed_tokens
         # Indices: [speech_token_offset, speech_token_offset + speech_token_size)
         src_size = min(speech_embedding.weight.shape[0], actual_speech_tokens)
         input_embeddings.weight[speech_token_offset:speech_token_offset + src_size] = \
             speech_embedding.weight[:src_size].to(input_embeddings.weight.dtype)
-    
+
     logger.info(f"Copied speech_embedding to embed_tokens[{speech_token_offset}:{speech_token_offset + src_size}]")
-    
+
     # 5. Create new lm_head with extended vocab and copy llm_decoder
     # Original lm_head: hidden_size -> original_vocab_size
     # New lm_head: hidden_size -> padded_vocab_size
@@ -247,7 +247,7 @@ def convert_cosyvoice3_to_hf(
         out_features=padded_vocab_size,
         bias=has_bias
     )
-    
+
     with torch.no_grad():
         # Initialize weights:
         # - Text part: copy from original lm_head (or zeros)
@@ -258,42 +258,42 @@ def convert_cosyvoice3_to_hf(
         new_lm_head.weight.data.zero_()
         if has_bias:
             new_lm_head.bias.data.fill_(-float('inf'))
-        
+
         # Copy original lm_head for text tokens (optional)
         original_lm_head = qwen_model.lm_head
         if original_lm_head is not None and original_lm_head.weight.shape[0] >= text_vocab_size:
             new_lm_head.weight[:text_vocab_size] = original_lm_head.weight[:text_vocab_size]
             if has_bias and original_lm_head.bias is not None:
                 new_lm_head.bias[:text_vocab_size] = original_lm_head.bias[:text_vocab_size]
-        
+
         # Copy llm_decoder for speech tokens
         decoder_size = min(llm_decoder.weight.shape[0], actual_speech_tokens)
         new_lm_head.weight[speech_token_offset:speech_token_offset + decoder_size] = \
             llm_decoder.weight[:decoder_size].to(new_lm_head.weight.dtype)
-        
+
         if has_bias:
             new_lm_head.bias[speech_token_offset:speech_token_offset + decoder_size] = \
                 llm_decoder.bias[:decoder_size].to(new_lm_head.bias.dtype)
         else:
             # If llm_decoder has no bias but we want it for text tokens
             pass
-    
+
     # Replace lm_head
     qwen_model.lm_head = new_lm_head
-    
+
     logger.info(f"Created new lm_head with shape: {new_lm_head.weight.shape}")
     logger.info(f"Copied llm_decoder to lm_head[{speech_token_offset}:{speech_token_offset + decoder_size}]")
-    
+
     # 6. Update model configuration
     qwen_model.config.vocab_size = padded_vocab_size
     qwen_model.config.tie_word_embeddings = False  # Embeddings and lm_head are now different!
-    
+
     # Set EOS token for generation (speech EOS lives inside speech_embedding as <|s_{base_speech_token_size+1}|>)
     base_speech_token_size = getattr(cosyvoice3_llm, "speech_token_size", 6561)
     eos_speech_idx = base_speech_token_size + 1
     eos_id = speech_token_offset + eos_speech_idx
     qwen_model.config.eos_token_id = eos_id
-    
+
     # Generation settings
     qwen_model.generation_config.eos_token_id = eos_id
     qwen_model.generation_config.pad_token_id = eos_id
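A worked example of the EOS id arithmetic above; speech_token_offset is hypothetical (it equals text_vocab_size, which depends on the tokenizer), while 6561 is the getattr default:

    speech_token_offset = 151_700                      # hypothetical text_vocab_size
    eos_speech_idx = 6561 + 1                          # <|s_6562|> inside speech_embedding
    eos_id = speech_token_offset + eos_speech_idx      # global id of <|s_6562|> in the merged
                                                       # vocab, since speech tokens were
                                                       # appended starting at the offset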
@@ -302,7 +302,7 @@ def convert_cosyvoice3_to_hf(
     qwen_model.generation_config.top_k = 25
     qwen_model.generation_config.repetition_penalty = 1.1
     qwen_model.generation_config.max_new_tokens = 2048
-    
+
     # 7. Convert to target dtype
     dtype_map = {
         "float16": torch.float16,
@@ -311,16 +311,16 @@ def convert_cosyvoice3_to_hf(
     }
     target_dtype = dtype_map[dtype]
     qwen_model.to(target_dtype)
-    
+
     # 8. Save model and tokenizer
     os.makedirs(output_dir, exist_ok=True)
-    
+
     qwen_model.save_pretrained(output_dir)
-    
+
     TEMPLATE = "{%- for message in messages %}{%- if message['role'] == 'user' %}{{- '<|sos|>' + message['content'] + '<|task_id|>' }}{%- elif message['role'] == 'assistant' %}{{- message['content']}}{%- endif %}{%- endfor %}"
     tokenizer.chat_template = TEMPLATE
     tokenizer.save_pretrained(output_dir)
-    
+
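Given the TEMPLATE above, a quick sanity check of the rendered prompt (the content string is illustrative):

    rendered = tokenizer.apply_chat_template(
        [{'role': 'user', 'content': 'hello there'}],
        tokenize=False,                     # return the raw string, not token ids
    )
    assert rendered == '<|sos|>hello there<|task_id|>'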
     # Save metadata for TRT-LLM inference
     metadata = {
         "original_vocab_size": base_vocab_size,
@@ -332,30 +332,30 @@ def convert_cosyvoice3_to_hf(
         "speech_token_offset": speech_token_offset,
         "dtype": dtype,
     }
-    
+
     import json
     with open(os.path.join(output_dir, "cosyvoice3_metadata.json"), "w") as f:
         json.dump(metadata, f, indent=2)
-    
+
     logger.info(f"Saved HuggingFace model to {output_dir}")
     logger.info(f"Metadata: {metadata}")
-    
+
     return output_dir, metadata
 
 
 def main():
     args = parse_args()
-    
+
     output_dir = args.output_dir
     if output_dir is None:
         output_dir = os.path.join(args.model_dir, "hf_merged")
-    
+
     convert_cosyvoice3_to_hf(
         model_dir=args.model_dir,
         output_dir=output_dir,
         dtype=args.dtype,
     )
-    
+
     print("\n" + "=" * 70)
     print("✅ Conversion complete!")
     print("=" * 70)