pretrained_to_huggingface.py

# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
  15. """
  16. Usage: Instruct TTS
  17. python3 infer.py \
  18. --token2wav-path /workspace/CosyVoice2-0.5B \
  19. --prompt-text "吃燕窝就选燕之屋,本节目由26年专注高品质燕窝的燕之屋冠名播出。豆奶牛奶换着喝,营养更均衡,本节目由豆本豆豆奶特约播出。" \
  20. --prompt-speech-path ./assets/prompt_audio.wav \
  21. --model-path ./transformers_cosyvoice2_llm \
  22. --input-text "用四川话说<|endofprompt|>扁担长,板凳宽,扁担绑在板凳上。吃葡萄不吐葡萄皮,不吃葡萄倒吐葡萄皮。"
  23. """
import sys
from argparse import ArgumentParser

import torch
from transformers import AutoTokenizer

# cosyvoice imports Matcha-TTS modules, so extend sys.path before importing it.
sys.path.append("/workspace/CosyVoice/third_party/Matcha-TTS")
from cosyvoice.cli.cosyvoice import CosyVoice2  # noqa: E402
def get_args():
    parser = ArgumentParser()
    parser.add_argument(
        "--pretrained-cosyvoice2-path",
        type=str,
        default="/workspace/CosyVoice2-0.5B",
        help="Path to the pretrained CosyVoice2 model, default to %(default)r",
    )
    parser.add_argument(
        "--save-path",
        type=str,
        default='./transformers_cosyvoice2_llm',
        help="The path to save the converted model, default to %(default)r",
    )
    args = parser.parse_args()
    return args

if __name__ == "__main__":
    args = get_args()
    cosy2_model = CosyVoice2(
        args.pretrained_cosyvoice2_path, load_jit=False, load_trt=False, fp16=False
    )
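    # Pull the pieces we need out of the CosyVoice2 wrapper: the underlying
    # text LLM backbone (a Qwen2 model in CosyVoice2-0.5B), the speech-token
    # embedding table, the speech-token output projection, and the two-row
    # embedding that backs <|sos|> and <|task_id|>.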
    llm = cosy2_model.model.llm.llm.model
    speech_embedding = cosy2_model.model.llm.speech_embedding
    llm_decoder = cosy2_model.model.llm.llm_decoder
    llm_embedding = cosy2_model.model.llm.llm_embedding
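
    # Start from the text tokenizer shipped with CosyVoice2 and register its
    # control tokens (chat markers plus paralinguistic event tags).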
    tokenizer = AutoTokenizer.from_pretrained(
        f"{args.pretrained_cosyvoice2_path}/CosyVoice-BlankEN"
    )
    special_tokens = {
        'eos_token': '<|endoftext|>',
        'pad_token': '<|endoftext|>',
        'additional_special_tokens': [
            '<|im_start|>', '<|im_end|>', '<|endofprompt|>',
            '[breath]', '<strong>', '</strong>', '[noise]',
            '[laughter]', '[cough]', '[clucking]', '[accent]',
            '[quick_breath]',
            "<laughter>", "</laughter>",
            "[hissing]", "[sigh]", "[vocalized-noise]",
            "[lipsmack]", "[mn]"
        ]
    }
    tokenizer.add_special_tokens(special_tokens)
    original_tokenizer_vocab_size = len(tokenizer)
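
    # Grow the vocabulary with one token per speech codec entry
    # (<|s_0|> ... <|s_6560|>), plus the three speech eos variants and the
    # <|sos|>/<|task_id|> markers used by the chat template below.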
  70. original_tokenizer_vocab_size = len(tokenizer)
  71. cosyvoice2_token_size = 6561
  72. new_tokens = [f"<|s_{i}|>" for i in range(cosyvoice2_token_size)] + [
  73. "<|eos1|>", "<|eos2|>", "<|eos3|>", "<|sos|>", "<|task_id|>"
  74. ]
  75. num_added_tokens = tokenizer.add_tokens(new_tokens)
  76. llm.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=128)
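
    # Swap in a new lm_head sized for the enlarged vocabulary. Masking the
    # text rows (below) means HF `generate` can only ever emit speech tokens,
    # matching the behavior of CosyVoice2's original decoder head.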
    vocab_size = llm.get_input_embeddings().weight.shape[0]
    feature_size = speech_embedding.embedding_dim
    new_lm_head = torch.nn.Linear(in_features=feature_size, out_features=vocab_size, bias=True)
    with torch.no_grad():
        # zero the weights of the new lm_head and set every bias to -inf so
        # that all tokens are masked out by default
        new_lm_head.weight.data.zero_()
        new_lm_head.bias.data.fill_(-float('inf'))
        # unmask the 6561 speech tokens and 3 eos variants by copying in the
        # trained speech-token projection (weights and biases)
        new_lm_head.weight[original_tokenizer_vocab_size:original_tokenizer_vocab_size + cosyvoice2_token_size + 3] = llm_decoder.weight
        new_lm_head.bias[original_tokenizer_vocab_size:original_tokenizer_vocab_size + cosyvoice2_token_size + 3] = llm_decoder.bias
    llm.lm_head = new_lm_head
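
    # Copy the trained speech-token input embeddings into the matching rows of
    # the resized embedding table, followed by the two llm_embedding rows that
    # back <|sos|> and <|task_id|>.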
    input_embeddings = llm.get_input_embeddings()
    with torch.no_grad():
        input_embeddings.weight[original_tokenizer_vocab_size:original_tokenizer_vocab_size + cosyvoice2_token_size + 3] = speech_embedding.weight
        input_embeddings.weight[original_tokenizer_vocab_size + cosyvoice2_token_size + 3:original_tokenizer_vocab_size + cosyvoice2_token_size + 3 + 2] = llm_embedding.weight
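
    # Generation should stop on any of the three speech eos variants;
    # llm.config keeps a single eos id, so point it at the first one.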
    eos_token_ids = [
        original_tokenizer_vocab_size + cosyvoice2_token_size,
        original_tokenizer_vocab_size + cosyvoice2_token_size + 1,
        original_tokenizer_vocab_size + cosyvoice2_token_size + 2,
    ]
    llm.generation_config.eos_token_id = eos_token_ids
    llm.generation_config.temperature = 1.0
    llm.generation_config.top_p = 0.8
    llm.generation_config.top_k = 25
    llm.config.eos_token_id = original_tokenizer_vocab_size + cosyvoice2_token_size
    llm.config.vocab_size = vocab_size
    llm.config.tie_word_embeddings = False
    llm.config.use_bias = True
    llm.to(torch.bfloat16)
    llm.save_pretrained(args.save_path)
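
    # Minimal chat template for the converted model: a user turn becomes
    # <|sos|>{content}<|task_id|>, and an assistant turn (speech tokens) is
    # emitted verbatim.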
    TEMPLATE = (
        "{%- for message in messages %}"
        "{%- if message['role'] == 'user' %}"
        "{{- '<|sos|>' + message['content'] + '<|task_id|>' }}"
        "{%- elif message['role'] == 'assistant' %}"
        "{{- message['content']}}"
        "{%- endif %}"
        "{%- endfor %}"
    )
    tokenizer.chat_template = TEMPLATE
    tokenizer.save_pretrained(args.save_path)
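
# A minimal sketch of loading the converted checkpoint, assuming the default
# --save-path above; sampling settings come from the saved generation_config
# once do_sample=True is passed:
#
#   from transformers import AutoModelForCausalLM, AutoTokenizer
#   tok = AutoTokenizer.from_pretrained("./transformers_cosyvoice2_llm")
#   model = AutoModelForCausalLM.from_pretrained("./transformers_cosyvoice2_llm")
#   prompt = tok.apply_chat_template(
#       [{"role": "user", "content": "text to synthesize"}], tokenize=False
#   )
#   ids = tok(prompt, return_tensors="pt").input_ids
#   speech_ids = model.generate(ids, do_sample=True)  # <|s_i|> token ids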