@@ -73,27 +73,27 @@ def load_cosyvoice3_model(model_dir: str):
     """Load CosyVoice3 model for weight extraction."""
     from hyperpyyaml import load_hyperpyyaml
     from cosyvoice.utils.class_utils import get_model_type
-
+
     hyper_yaml_path = os.path.join(model_dir, 'cosyvoice3.yaml')
     hf_llm_dir = os.path.join(model_dir, 'CosyVoice-BlankEN')
-
+
     if not os.path.exists(hyper_yaml_path):
         raise ValueError(f'{hyper_yaml_path} not found!')
-
+
     with open(hyper_yaml_path, 'r') as f:
         configs = load_hyperpyyaml(
-            f,
+            f,
             overrides={'qwen_pretrain_path': hf_llm_dir}
         )
-
+
     # Load LLM only
     llm = configs['llm']
     llm_weights_path = os.path.join(model_dir, 'llm.pt')
     llm.load_state_dict(torch.load(llm_weights_path, map_location='cpu'), strict=True)
     llm.eval()
-
+
     logger.info(f"Loaded CosyVoice3 LLM from {model_dir}")
-
+
     return llm, hf_llm_dir, configs

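
For orientation, a minimal sketch of driving this loader (the model path is hypothetical; the attribute names follow the diff above, and the directory layout — cosyvoice3.yaml, CosyVoice-BlankEN/, llm.pt — is implied by the path handling):

```python
# Hypothetical usage of load_cosyvoice3_model defined above.
llm, hf_llm_dir, configs = load_cosyvoice3_model("/models/CosyVoice3-0.5B")

# `llm.llm.model` is the underlying HF Qwen2 causal LM; `speech_embedding`
# and `llm_decoder` are the pieces the conversion below splices into it.
print(type(llm.llm.model).__name__)        # Qwen2ForCausalLM
print(llm.speech_embedding.weight.shape)   # (speech_vocab, hidden_size)
```
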
@@ -125,23 +125,23 @@ def convert_cosyvoice3_to_hf(
         dtype: Data type for saving
     """
     logger.info(f"Loading CosyVoice3 model from {model_dir}")
-
+
     # 1. Load CosyVoice3 components
     cosyvoice3_llm, hf_llm_dir, configs = load_cosyvoice3_model(model_dir)
-
+
     # Extract key components
     qwen_model = cosyvoice3_llm.llm.model  # Qwen2ForCausalLM
     speech_embedding = cosyvoice3_llm.speech_embedding  # Embedding for speech tokens
     llm_decoder = cosyvoice3_llm.llm_decoder  # Linear for decoding to speech tokens
-
+
     speech_token_size = get_speech_token_size(cosyvoice3_llm)
     logger.info(f"Speech token size: {speech_token_size}")
-
+
     # 2. Load tokenizer and add CosyVoice3 text special tokens + speech tokens
     tokenizer = AutoTokenizer.from_pretrained(hf_llm_dir, trust_remote_code=True)
     base_vocab_size = len(tokenizer)
     logger.info(f"Base tokenizer vocab size: {base_vocab_size}")
-
+
     # IMPORTANT:
     # - In CosyVoice3, LLM speech special tokens (sos/eos/task_id/fill) are INSIDE speech_embedding,
     #   i.e. represented as <|s_6561|>, <|s_6562|>, <|s_6563|>, <|s_6564|>.
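
To make that layout concrete, a small sketch of the id arithmetic the comments describe (the indices 6561..6564 come from the examples above; the text vocab size here is purely illustrative):

```python
# Speech special tokens live at the top of the speech embedding, so in the
# merged vocab they become ordinary <|s_k|> tokens offset by the text vocab size.
BASE_SPEECH_TOKENS = 6561                      # <|s_0|> .. <|s_6560|>
SPECIALS = {"sos": 6561, "eos": 6562, "task_id": 6563, "fill": 6564}

def merged_id(speech_idx: int, text_vocab_size: int) -> int:
    """Speech-embedding row index -> token id in the merged HF vocab."""
    return text_vocab_size + speech_idx

print(merged_id(SPECIALS["eos"], 151669))      # 151669 + 6562 = 158231
```
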
@@ -185,7 +185,7 @@ def convert_cosyvoice3_to_hf(
     tokenizer.add_special_tokens(special_tokens)
     text_vocab_size = len(tokenizer)
     logger.info(f"Tokenizer vocab after CosyVoice3 text special tokens: {text_vocab_size}")
-
+
     # Add speech tokens: <|s_0|>, <|s_1|>, ..., <|s_{embedding_size-1}|>
     # IMPORTANT: This range must match speech_embedding.num_embeddings (includes speech special tokens).
     actual_speech_tokens = speech_token_size  # Full embedding size (with speech special tokens)
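
The elided lines between this hunk and the next presumably build the token strings; a sketch of that construction under the stated invariant (variable names taken from the surrounding code):

```python
# Sketch: one <|s_k|> string per row of speech_embedding, so that tokenizer
# ids and embedding rows stay aligned (including the four special rows).
speech_tokens = [f"<|s_{i}|>" for i in range(actual_speech_tokens)]
assert len(speech_tokens) == speech_embedding.num_embeddings
```
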
@@ -204,37 +204,37 @@ def convert_cosyvoice3_to_hf(
     assert "<s_6563>" not in speech_tokens
     assert "<s_6564>" not in speech_tokens
     tokenizer.add_tokens(speech_tokens)
-
+
     new_vocab_size = len(tokenizer)
     logger.info(f"New tokenizer vocab size: {new_vocab_size}")
     logger.info(f"Added {new_vocab_size - base_vocab_size} tokens total (text special + speech tokens)")
-
+
     # 3. Resize embeddings in Qwen model
     # Align to 128 for TensorRT efficiency
     padded_vocab_size = ((new_vocab_size + 127) // 128) * 128
     qwen_model.resize_token_embeddings(padded_vocab_size)
     logger.info(f"Resized embeddings to: {padded_vocab_size}")
-
+
     # Speech tokens start after text vocab (base + CosyVoice3 text special tokens)
     speech_token_offset = text_vocab_size

     # 4. Copy speech_embedding into extended embed_tokens
     input_embeddings = qwen_model.get_input_embeddings()
     hidden_size = input_embeddings.weight.shape[1]
-
+
     logger.info(f"Hidden size: {hidden_size}")
     logger.info(f"speech_embedding shape: {speech_embedding.weight.shape}")
     logger.info(f"llm_decoder shape: {llm_decoder.weight.shape}")
-
+
     with torch.no_grad():
         # Copy speech_embedding weights into embed_tokens
         # Indices: [speech_token_offset, speech_token_offset + speech_token_size)
         src_size = min(speech_embedding.weight.shape[0], actual_speech_tokens)
         input_embeddings.weight[speech_token_offset:speech_token_offset + src_size] = \
             speech_embedding.weight[:src_size].to(input_embeddings.weight.dtype)
-
+
     logger.info(f"Copied speech_embedding to embed_tokens[{speech_token_offset}:{speech_token_offset + src_size}]")
-
+
     # 5. Create new lm_head with extended vocab and copy llm_decoder
     # Original lm_head: hidden_size -> original_vocab_size
     # New lm_head: hidden_size -> padded_vocab_size
@@ -247,7 +247,7 @@ def convert_cosyvoice3_to_hf(
         out_features=padded_vocab_size,
         bias=has_bias
     )
-
+
     with torch.no_grad():
         # Initialize weights:
         # - Text part: copy from original lm_head (or zeros)
@@ -258,42 +258,42 @@ def convert_cosyvoice3_to_hf(
         new_lm_head.weight.data.zero_()
         if has_bias:
             new_lm_head.bias.data.fill_(-float('inf'))
-
+
         # Copy original lm_head for text tokens (optional)
         original_lm_head = qwen_model.lm_head
         if original_lm_head is not None and original_lm_head.weight.shape[0] >= text_vocab_size:
             new_lm_head.weight[:text_vocab_size] = original_lm_head.weight[:text_vocab_size]
             if has_bias and original_lm_head.bias is not None:
                 new_lm_head.bias[:text_vocab_size] = original_lm_head.bias[:text_vocab_size]
-
+
         # Copy llm_decoder for speech tokens
         decoder_size = min(llm_decoder.weight.shape[0], actual_speech_tokens)
         new_lm_head.weight[speech_token_offset:speech_token_offset + decoder_size] = \
             llm_decoder.weight[:decoder_size].to(new_lm_head.weight.dtype)
-
+
         if has_bias:
             new_lm_head.bias[speech_token_offset:speech_token_offset + decoder_size] = \
                 llm_decoder.bias[:decoder_size].to(new_lm_head.bias.dtype)
         else:
             # If llm_decoder has no bias but we want it for text tokens
             pass
-
+
     # Replace lm_head
     qwen_model.lm_head = new_lm_head
-
+
     logger.info(f"Created new lm_head with shape: {new_lm_head.weight.shape}")
     logger.info(f"Copied llm_decoder to lm_head[{speech_token_offset}:{speech_token_offset + decoder_size}]")
-
+
     # 6. Update model configuration
     qwen_model.config.vocab_size = padded_vocab_size
     qwen_model.config.tie_word_embeddings = False  # Embeddings and lm_head are now different!
-
+
     # Set EOS token for generation (speech EOS lives inside speech_embedding as <|s_{base_speech_token_size+1}|>)
     base_speech_token_size = getattr(cosyvoice3_llm, "speech_token_size", 6561)
     eos_speech_idx = base_speech_token_size + 1
     eos_id = speech_token_offset + eos_speech_idx
     qwen_model.config.eos_token_id = eos_id
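
With the defaults above, the id works out as follows (6561 is the fallback from the getattr call; the offset value is hypothetical):

```python
# Worked example of the EOS id arithmetic (illustrative numbers).
base_speech_token_size = 6561
eos_speech_idx = base_speech_token_size + 1      # row of <|s_6562|>
example_offset = 151669                          # hypothetical text_vocab_size
eos_id = example_offset + eos_speech_idx         # 158231
```
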
-
+
     # Generation settings
     qwen_model.generation_config.eos_token_id = eos_id
     qwen_model.generation_config.pad_token_id = eos_id
@@ -302,7 +302,7 @@ def convert_cosyvoice3_to_hf(
     qwen_model.generation_config.top_k = 25
     qwen_model.generation_config.repetition_penalty = 1.1
     qwen_model.generation_config.max_new_tokens = 2048
-
+
     # 7. Convert to target dtype
     dtype_map = {
         "float16": torch.float16,
@@ -311,16 +311,16 @@ def convert_cosyvoice3_to_hf(
     }
     target_dtype = dtype_map[dtype]
     qwen_model.to(target_dtype)
-
+
     # 8. Save model and tokenizer
     os.makedirs(output_dir, exist_ok=True)
-
+
     qwen_model.save_pretrained(output_dir)
-
+
     TEMPLATE = "{%- for message in messages %}{%- if message['role'] == 'user' %}{{- '<|sos|>' + message['content'] + '<|task_id|>' }}{%- elif message['role'] == 'assistant' %}{{- message['content']}}{%- endif %}{%- endfor %}"
     tokenizer.chat_template = TEMPLATE
     tokenizer.save_pretrained(output_dir)
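
For reference, what that Jinja template renders through the standard HF chat-template API (the sample text is illustrative):

```python
# Render a CosyVoice3 prompt with the template saved above.
messages = [{"role": "user", "content": "Hello world"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
print(prompt)  # <|sos|>Hello world<|task_id|>
```
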
-
+
     # Save metadata for TRT-LLM inference
     metadata = {
         "original_vocab_size": base_vocab_size,
@@ -332,30 +332,30 @@ def convert_cosyvoice3_to_hf(
         "speech_token_offset": speech_token_offset,
         "dtype": dtype,
     }
-
+
     import json
     with open(os.path.join(output_dir, "cosyvoice3_metadata.json"), "w") as f:
         json.dump(metadata, f, indent=2)
-
+
     logger.info(f"Saved HuggingFace model to {output_dir}")
     logger.info(f"Metadata: {metadata}")
-
+
     return output_dir, metadata
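
A minimal sketch of consuming the exported directory with plain transformers (the path and prompt are illustrative; EOS stopping behaviour comes from the generation_config saved above):

```python
# Hypothetical downstream usage of the merged checkpoint.
import json
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("hf_merged", torch_dtype="auto")
tok = AutoTokenizer.from_pretrained("hf_merged")
meta = json.load(open("hf_merged/cosyvoice3_metadata.json"))

ids = tok.apply_chat_template(
    [{"role": "user", "content": "Text to synthesize"}], return_tensors="pt"
)
out = model.generate(ids)                        # stops on the speech EOS id
speech_ids = out[0, ids.shape[1]:] - meta["speech_token_offset"]
```
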


 def main():
     args = parse_args()
-
+
     output_dir = args.output_dir
     if output_dir is None:
         output_dir = os.path.join(args.model_dir, "hf_merged")
-
+
     convert_cosyvoice3_to_hf(
         model_dir=args.model_dir,
         output_dir=output_dir,
         dtype=args.dtype,
     )
-
+
     print("\n" + "=" * 70)
     print("✅ Conversion complete!")
     print("=" * 70)