# convert_cosyvoice3_to_hf.py
#!/usr/bin/env python3
# Copyright 2025 CosyVoice3 TRT-LLM Integration
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Convert the CosyVoice3 LLM to HuggingFace format with merged embeddings.

This script:
1. Loads the CosyVoice3 model
2. Extends the tokenizer vocab with speech tokens
3. Merges speech_embedding into Qwen2's embed_tokens
4. Replaces lm_head with llm_decoder over the extended vocab
5. Saves the model in HuggingFace format for TRT-LLM conversion

Usage:
    python scripts/convert_cosyvoice3_to_hf.py \
        --model-dir pretrained_models/Fun-CosyVoice3-0.5B \
        --output-dir pretrained_models/Fun-CosyVoice3-0.5B/hf_merged

Afterwards the result can be converted to TRT-LLM:
    trtllm-build --checkpoint_dir <output_dir> --output_dir <trt_engines_dir> ...
"""
import argparse
import os
import sys
import logging

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig

# Make the repo root and the bundled Matcha-TTS importable when run as a script.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'third_party/Matcha-TTS'))

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
  40. def parse_args():
  41. parser = argparse.ArgumentParser(description="Convert CosyVoice3 to HuggingFace format with merged embeddings")
  42. parser.add_argument(
  43. "--model-dir",
  44. type=str,
  45. default="pretrained_models/Fun-CosyVoice3-0.5B",
  46. help="Path to CosyVoice3 model directory",
  47. )
  48. parser.add_argument(
  49. "--output-dir",
  50. type=str,
  51. default=None,
  52. help="Output directory for HuggingFace model (default: <model-dir>/hf_merged)",
  53. )
  54. parser.add_argument(
  55. "--dtype",
  56. type=str,
  57. default="bfloat16",
  58. choices=["float16", "bfloat16", "float32"],
  59. help="Output dtype for the model",
  60. )
  61. return parser.parse_args()
  62. def load_cosyvoice3_model(model_dir: str):
  63. """Загружает CosyVoice3 модель для извлечения весов."""
  64. from hyperpyyaml import load_hyperpyyaml
  65. from cosyvoice.utils.class_utils import get_model_type
  66. hyper_yaml_path = os.path.join(model_dir, 'cosyvoice3.yaml')
  67. hf_llm_dir = os.path.join(model_dir, 'CosyVoice-BlankEN')
  68. if not os.path.exists(hyper_yaml_path):
  69. raise ValueError(f'{hyper_yaml_path} not found!')
  70. with open(hyper_yaml_path, 'r') as f:
  71. configs = load_hyperpyyaml(
  72. f,
  73. overrides={'qwen_pretrain_path': hf_llm_dir}
  74. )
  75. # Загружаем только LLM
  76. llm = configs['llm']
  77. llm_weights_path = os.path.join(model_dir, 'llm.pt')
  78. llm.load_state_dict(torch.load(llm_weights_path, map_location='cpu'), strict=True)
  79. llm.eval()
  80. logger.info(f"Loaded CosyVoice3 LLM from {model_dir}")
  81. return llm, hf_llm_dir, configs
  82. def get_speech_token_size(llm) -> int:
  83. """Определяет размер speech token vocabulary из модели."""
  84. # CosyVoice3LM имеет: speech_token_size + 200 в llm_decoder
  85. # speech_embedding имеет: speech_token_size + 200
  86. speech_embedding_size = llm.speech_embedding.num_embeddings
  87. # Вычитаем 200 специальных токенов (sos, eos, task_id, fill, и т.д.)
  88. # Но для безопасности используем полный размер embedding
  89. return speech_embedding_size
def convert_cosyvoice3_to_hf(
    model_dir: str,
    output_dir: str,
    dtype: str = "bfloat16",
):
    """
    Convert the CosyVoice3 LLM to HuggingFace format with merged embeddings.

    Merged layout:
    - embed_tokens[0:original_vocab_size] = original text embeddings
    - embed_tokens[original_vocab_size:original_vocab_size+speech_token_size] = speech_embedding
    - lm_head[original_vocab_size:original_vocab_size+speech_token_size] = llm_decoder

    Args:
        model_dir: Path to the CosyVoice3 model
        output_dir: Path where the HF model is saved
        dtype: Data type the model is saved in

    Returns:
        Tuple of (output_dir, metadata dict written to cosyvoice3_metadata.json).
    """
    logger.info(f"Loading CosyVoice3 model from {model_dir}")

    # 1. Load the CosyVoice3 components.
    cosyvoice3_llm, hf_llm_dir, configs = load_cosyvoice3_model(model_dir)

    # Pull out the key components.
    qwen_model = cosyvoice3_llm.llm.model  # Qwen2ForCausalLM
    speech_embedding = cosyvoice3_llm.speech_embedding  # Embedding table for speech tokens
    llm_decoder = cosyvoice3_llm.llm_decoder  # Linear head decoding hidden states into speech tokens
    speech_token_size = get_speech_token_size(cosyvoice3_llm)
    logger.info(f"Speech token size: {speech_token_size}")

    # 2. Load the tokenizer and add CosyVoice3 text special tokens + speech tokens.
    tokenizer = AutoTokenizer.from_pretrained(hf_llm_dir, trust_remote_code=True)
    base_vocab_size = len(tokenizer)
    logger.info(f"Base tokenizer vocab size: {base_vocab_size}")
    # IMPORTANT:
    # - In CosyVoice3, LLM speech special tokens (sos/eos/task_id/fill) are INSIDE speech_embedding,
    #   i.e. represented as <|s_6561|>, <|s_6562|>, <|s_6563|>, <|s_6564|>.
    # - But text-level special tokens like [cough]/[laughter] MUST exist in tokenizer
    #   (mirrors `CosyVoice3Tokenizer` from `cosyvoice/tokenizer/tokenizer.py`).
    special_tokens = {
        'eos_token': '<|endoftext|>',
        'pad_token': '<|endoftext|>',
        'additional_special_tokens': [
            '<|im_start|>', '<|im_end|>', '<|endofprompt|>',
            '[breath]', '<strong>', '</strong>', '[noise]',
            '[laughter]', '[cough]', '[clucking]', '[accent]',
            '[quick_breath]',
            "<laughter>", "</laughter>",
            "[hissing]", "[sigh]", "[vocalized-noise]",
            "[lipsmack]", "[mn]", "<|endofsystem|>",
            # Phoneme tokens (kept consistent with CosyVoice3Tokenizer)
            "[AA]", "[AA0]", "[AA1]", "[AA2]", "[AE]", "[AE0]", "[AE1]", "[AE2]", "[AH]", "[AH0]", "[AH1]", "[AH2]",
            "[AO]", "[AO0]", "[AO1]", "[AO2]", "[AW]", "[AW0]", "[AW1]", "[AW2]", "[AY]", "[AY0]", "[AY1]", "[AY2]",
            "[B]", "[CH]", "[D]", "[DH]", "[EH]", "[EH0]", "[EH1]", "[EH2]", "[ER]", "[ER0]", "[ER1]", "[ER2]", "[EY]",
            "[EY0]", "[EY1]", "[EY2]", "[F]", "[G]", "[HH]", "[IH]", "[IH0]", "[IH1]", "[IH2]", "[IY]", "[IY0]", "[IY1]",
            "[IY2]", "[JH]", "[K]", "[L]", "[M]", "[N]", "[NG]", "[OW]", "[OW0]", "[OW1]", "[OW2]", "[OY]", "[OY0]",
            "[OY1]", "[OY2]", "[P]", "[R]", "[S]", "[SH]", "[T]", "[TH]", "[UH]", "[UH0]", "[UH1]", "[UH2]", "[UW]",
            "[UW0]", "[UW1]", "[UW2]", "[V]", "[W]", "[Y]", "[Z]", "[ZH]",
            "[a]", "[ai]", "[an]", "[ang]", "[ao]", "[b]", "[c]", "[ch]", "[d]", "[e]", "[ei]", "[en]", "[eng]", "[f]",
            "[g]", "[h]", "[i]", "[ian]", "[in]", "[ing]", "[iu]", "[ià]", "[iàn]", "[iàng]", "[iào]", "[iá]", "[ián]",
            "[iáng]", "[iáo]", "[iè]", "[ié]", "[iòng]", "[ióng]", "[iù]", "[iú]", "[iā]", "[iān]", "[iāng]", "[iāo]",
            "[iē]", "[iě]", "[iōng]", "[iū]", "[iǎ]", "[iǎn]", "[iǎng]", "[iǎo]", "[iǒng]", "[iǔ]", "[j]", "[k]", "[l]",
            "[m]", "[n]", "[o]", "[ong]", "[ou]", "[p]", "[q]", "[r]",
            "[s]", "[sh]", "[t]", "[u]", "[uang]", "[ue]",
            "[un]", "[uo]", "[uà]", "[uài]", "[uàn]", "[uàng]", "[uá]", "[uái]", "[uán]", "[uáng]", "[uè]", "[ué]", "[uì]",
            "[uí]", "[uò]", "[uó]", "[uā]", "[uāi]", "[uān]", "[uāng]", "[uē]", "[uě]", "[uī]", "[uō]", "[uǎ]", "[uǎi]",
            "[uǎn]", "[uǎng]", "[uǐ]", "[uǒ]", "[vè]", "[w]", "[x]", "[y]", "[z]", "[zh]", "[à]", "[ài]", "[àn]", "[àng]",
            "[ào]", "[á]", "[ái]", "[án]", "[áng]", "[áo]", "[è]", "[èi]", "[èn]", "[èng]", "[èr]", "[é]", "[éi]", "[én]",
            "[éng]", "[ér]", "[ì]", "[ìn]", "[ìng]", "[í]", "[ín]", "[íng]", "[ò]", "[òng]", "[òu]", "[ó]", "[óng]", "[óu]",
            "[ù]", "[ùn]", "[ú]", "[ún]", "[ā]", "[āi]", "[ān]", "[āng]", "[āo]", "[ē]", "[ēi]", "[ēn]", "[ēng]", "[ě]",
            "[ěi]", "[ěn]", "[ěng]", "[ěr]", "[ī]", "[īn]", "[īng]", "[ō]", "[ōng]", "[ōu]", "[ū]", "[ūn]", "[ǎ]", "[ǎi]",
            "[ǎn]", "[ǎng]", "[ǎo]", "[ǐ]", "[ǐn]", "[ǐng]", "[ǒ]", "[ǒng]", "[ǒu]", "[ǔ]", "[ǔn]", "[ǘ]", "[ǚ]", "[ǜ]"
        ]
    }
    tokenizer.add_special_tokens(special_tokens)
    text_vocab_size = len(tokenizer)
    logger.info(f"Tokenizer vocab after CosyVoice3 text special tokens: {text_vocab_size}")

    # Add speech tokens: <|s_0|>, <|s_1|>, ..., <|s_{embedding_size-1}|>
    # IMPORTANT: This range must match speech_embedding.num_embeddings (includes speech special tokens).
    actual_speech_tokens = speech_token_size  # Full embedding size (with speech special tokens)
    # Rename the four speech special slots to readable token strings:
    # <s_6561> -> <|sos|>, <s_6562> -> <|eos1|>, <s_6563> -> <|task_id|>, <s_6564> -> <|fill|>
    speech_tokens = [f"<|s_{i}|>" for i in range(actual_speech_tokens)]
    speech_tokens[6561] = "<|sos|>"
    speech_tokens[6562] = "<|eos1|>"
    speech_tokens[6563] = "<|task_id|>"
    speech_tokens[6564] = "<|fill|>"
    assert "<s_6561>" not in speech_tokens
    assert "<s_6562>" not in speech_tokens
    assert "<s_6563>" not in speech_tokens
    assert "<s_6564>" not in speech_tokens
    tokenizer.add_tokens(speech_tokens)
    new_vocab_size = len(tokenizer)
    logger.info(f"New tokenizer vocab size: {new_vocab_size}")
    logger.info(f"Added {new_vocab_size - base_vocab_size} tokens total (text special + speech tokens)")

    # 3. Resize the embeddings of the Qwen model.
    # Pad to a multiple of 128 for TensorRT efficiency.
    padded_vocab_size = ((new_vocab_size + 127) // 128) * 128
    qwen_model.resize_token_embeddings(padded_vocab_size)
    logger.info(f"Resized embeddings to: {padded_vocab_size}")

    # Speech tokens start after text vocab (base + CosyVoice3 text special tokens)
    speech_token_offset = text_vocab_size

    # 4. Copy speech_embedding into the extended part of embed_tokens.
    input_embeddings = qwen_model.get_input_embeddings()
    hidden_size = input_embeddings.weight.shape[1]
    logger.info(f"Hidden size: {hidden_size}")
    logger.info(f"speech_embedding shape: {speech_embedding.weight.shape}")
    logger.info(f"llm_decoder shape: {llm_decoder.weight.shape}")
    with torch.no_grad():
        # Copy the speech_embedding weights into embed_tokens.
        # Indices: [speech_token_offset, speech_token_offset + speech_token_size)
        src_size = min(speech_embedding.weight.shape[0], actual_speech_tokens)
        input_embeddings.weight[speech_token_offset:speech_token_offset + src_size] = \
            speech_embedding.weight[:src_size].to(input_embeddings.weight.dtype)
    logger.info(f"Copied speech_embedding to embed_tokens[{speech_token_offset}:{speech_token_offset + src_size}]")

    # 5. Create a new lm_head over the extended vocab and copy llm_decoder into it.
    # Original lm_head: hidden_size -> original_vocab_size
    # New lm_head: hidden_size -> padded_vocab_size
    # llm_decoder: hidden_size -> speech_token_size
    has_bias = llm_decoder.bias is not None
    new_lm_head = torch.nn.Linear(
        in_features=hidden_size,
        out_features=padded_vocab_size,
        bias=has_bias
    )
    with torch.no_grad():
        # Weight initialization:
        # - Text part: copied from the original lm_head (or zeros)
        # - Speech part: copied from llm_decoder
        # - Padding: zeros
        # Start with zero weights and a -inf bias (so text tokens are not generated).
        new_lm_head.weight.data.zero_()
        if has_bias:
            new_lm_head.bias.data.fill_(-float('inf'))
        # Copy the original lm_head for text tokens (optional).
        original_lm_head = qwen_model.lm_head
        if original_lm_head is not None and original_lm_head.weight.shape[0] >= text_vocab_size:
            new_lm_head.weight[:text_vocab_size] = original_lm_head.weight[:text_vocab_size]
            if has_bias and original_lm_head.bias is not None:
                new_lm_head.bias[:text_vocab_size] = original_lm_head.bias[:text_vocab_size]
        # Copy llm_decoder for the speech tokens.
        decoder_size = min(llm_decoder.weight.shape[0], actual_speech_tokens)
        new_lm_head.weight[speech_token_offset:speech_token_offset + decoder_size] = \
            llm_decoder.weight[:decoder_size].to(new_lm_head.weight.dtype)
        if has_bias:
            new_lm_head.bias[speech_token_offset:speech_token_offset + decoder_size] = \
                llm_decoder.bias[:decoder_size].to(new_lm_head.bias.dtype)
        else:
            # llm_decoder has no bias, even though one might be desired for text tokens.
            pass
    # Swap in the new lm_head.
    qwen_model.lm_head = new_lm_head
    logger.info(f"Created new lm_head with shape: {new_lm_head.weight.shape}")
    logger.info(f"Copied llm_decoder to lm_head[{speech_token_offset}:{speech_token_offset + decoder_size}]")

    # 6. Update the model configuration.
    qwen_model.config.vocab_size = padded_vocab_size
    qwen_model.config.tie_word_embeddings = False  # Embeddings and lm_head now differ!
    # Set EOS token for generation (speech EOS lives inside speech_embedding as <|s_{base_speech_token_size+1}|>)
    base_speech_token_size = getattr(cosyvoice3_llm, "speech_token_size", 6561)
    eos_speech_idx = base_speech_token_size + 1
    eos_id = speech_token_offset + eos_speech_idx
    qwen_model.config.eos_token_id = eos_id
    # Generation settings.
    qwen_model.generation_config.eos_token_id = eos_id
    qwen_model.generation_config.pad_token_id = eos_id
    qwen_model.generation_config.temperature = 0.8
    qwen_model.generation_config.top_p = 0.95
    qwen_model.generation_config.top_k = 25
    qwen_model.generation_config.repetition_penalty = 1.1
    qwen_model.generation_config.max_new_tokens = 2048

    # 7. Cast the model to the requested dtype.
    dtype_map = {
        "float16": torch.float16,
        "bfloat16": torch.bfloat16,
        "float32": torch.float32,
    }
    target_dtype = dtype_map[dtype]
    qwen_model.to(target_dtype)

    # 8. Save the model and tokenizer.
    os.makedirs(output_dir, exist_ok=True)
    qwen_model.save_pretrained(output_dir)
    TEMPLATE = "{%- for message in messages %}{%- if message['role'] == 'user' %}{{- '<|sos|>' + message['content'] + '<|task_id|>' }}{%- elif message['role'] == 'assistant' %}{{- message['content']}}{%- endif %}{%- endfor %}"
    tokenizer.chat_template = TEMPLATE
    tokenizer.save_pretrained(output_dir)

    # Save metadata needed for TRT-LLM inference.
    metadata = {
        "original_vocab_size": base_vocab_size,
        "text_vocab_size": text_vocab_size,
        "base_speech_token_size": base_speech_token_size,
        "embedding_size": actual_speech_tokens,
        "padded_vocab_size": padded_vocab_size,
        "eos_token_id": eos_id,
        "speech_token_offset": speech_token_offset,
        "dtype": dtype,
    }
    import json
    with open(os.path.join(output_dir, "cosyvoice3_metadata.json"), "w") as f:
        json.dump(metadata, f, indent=2)
    logger.info(f"Saved HuggingFace model to {output_dir}")
    logger.info(f"Metadata: {metadata}")
    return output_dir, metadata
  289. def main():
  290. args = parse_args()
  291. output_dir = args.output_dir
  292. if output_dir is None:
  293. output_dir = os.path.join(args.model_dir, "hf_merged")
  294. convert_cosyvoice3_to_hf(
  295. model_dir=args.model_dir,
  296. output_dir=output_dir,
  297. dtype=args.dtype,
  298. )
  299. print("\n" + "=" * 70)
  300. print("✅ Conversion complete!")
  301. print("=" * 70)
  302. print(f"\nHuggingFace model saved to: {output_dir}")
  303. print("\nNext steps:")
  304. print("1. Convert to TRT-LLM weights:")
  305. print(f" python -c \"from tensorrt_llm.models import QWenForCausalLM; ...")
  306. print("\n2. Build TRT-LLM engines:")
  307. print(f" trtllm-build --checkpoint_dir <trt_weights_dir> --output_dir <trt_engines_dir> ...")
  308. print("=" * 70)
  309. if __name__ == "__main__":
  310. main()