pretrained_to_huggingface.py

#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
  16. """
  17. Usage: Instruct TTS
  18. python3 infer.py \
  19. --token2wav-path /workspace/CosyVoice2-0.5B \
  20. --prompt-text "吃燕窝就选燕之屋,本节目由26年专注高品质燕窝的燕之屋冠名播出。豆奶牛奶换着喝,营养更均衡,本节目由豆本豆豆奶特约播出。" \
  21. --prompt-speech-path ./assets/prompt_audio.wav \
  22. --model-path ./transformers_cosyvoice2_llm \
  23. --input-text "用四川话说<|endofprompt|>扁担长,板凳宽,扁担绑在板凳上。吃葡萄不吐葡萄皮,不吃葡萄倒吐葡萄皮。"
  24. """

import sys
from argparse import ArgumentParser

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Matcha-TTS must be on sys.path before cosyvoice is imported, because the
# cosyvoice flow modules import matcha at import time.
sys.path.append("/workspace/CosyVoice/third_party/Matcha-TTS")
from cosyvoice.cli.cosyvoice import CosyVoice2


def get_args():
    parser = ArgumentParser()
    parser.add_argument(
        "--pretrained-cosyvoice2-path",
        type=str,
        default="/workspace/CosyVoice2-0.5B",
        help="Pretrained CosyVoice2 checkpoint path, default to %(default)r",
    )
    parser.add_argument(
        "--save-path",
        type=str,
        default="./transformers_cosyvoice2_llm",
        help="The path to save the converted Hugging Face model",
    )
    args = parser.parse_args()
    return args
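

# Example invocation (the paths shown are the argparse defaults above):
#   python3 pretrained_to_huggingface.py \
#       --pretrained-cosyvoice2-path /workspace/CosyVoice2-0.5B \
#       --save-path ./transformers_cosyvoice2_llm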


if __name__ == "__main__":
    args = get_args()
    cosy2_model = CosyVoice2(
        args.pretrained_cosyvoice2_path, load_jit=False, load_trt=False, fp16=False
    )
    # Unwrap the underlying Qwen2 causal LM and the CosyVoice2-specific
    # speech-token modules that will be merged into it.
    llm = cosy2_model.model.llm.llm.model
    speech_embedding = cosy2_model.model.llm.speech_embedding
    llm_decoder = cosy2_model.model.llm.llm_decoder
    llm_embedding = cosy2_model.model.llm.llm_embedding

    tokenizer = AutoTokenizer.from_pretrained(
        f"{args.pretrained_cosyvoice2_path}/CosyVoice-BlankEN"
    )
    # Special tokens used by the CosyVoice2 text frontend (prompt delimiters
    # and paralinguistic event tags), registered so the tokenizer never splits
    # them into subwords.
    special_tokens = {
        "eos_token": "<|endoftext|>",
        "pad_token": "<|endoftext|>",
        "additional_special_tokens": [
            "<|im_start|>", "<|im_end|>", "<|endofprompt|>",
            "[breath]", "<strong>", "</strong>", "[noise]",
            "[laughter]", "[cough]", "[clucking]", "[accent]",
            "[quick_breath]",
            "<laughter>", "</laughter>",
            "[hissing]", "[sigh]", "[vocalized-noise]",
            "[lipsmack]", "[mn]",
        ],
    }
    tokenizer.add_special_tokens(special_tokens)
    original_tokenizer_vocab_size = len(tokenizer)

    # Append one text token per CosyVoice2 speech codec token, plus the three
    # speech EOS variants and the <|sos|>/<|task_id|> control tokens.
    cosyvoice2_token_size = 6561
    new_tokens = [f"<|s_{i}|>" for i in range(cosyvoice2_token_size)] + [
        "<|eos1|>", "<|eos2|>", "<|eos3|>", "<|sos|>", "<|task_id|>"
    ]
    num_added_tokens = tokenizer.add_tokens(new_tokens)
    llm.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=128)
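
    # Merged vocabulary layout after the resize (orig = original_tokenizer_vocab_size):
    #   [0, orig)                   original text and special tokens
    #   [orig, orig + 6561)         speech codec tokens <|s_0|> ... <|s_6560|>
    #   [orig + 6561, orig + 6564)  <|eos1|>, <|eos2|>, <|eos3|>
    #   [orig + 6564, orig + 6566)  <|sos|>, <|task_id|>
    #   [orig + 6566, next multiple of 128)  padding rows; their lm_head bias
    #   is left at -inf below, so they can never be sampled.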
    vocab_size = llm.get_input_embeddings().weight.shape[0]
    feature_size = speech_embedding.embedding_dim

    # Build a new lm_head covering the merged vocabulary. Weights start at zero
    # and biases at -inf, so every token outside the copied speech-token range
    # is masked out of the output distribution.
    new_lm_head = torch.nn.Linear(
        in_features=feature_size, out_features=vocab_size, bias=True
    )
    start = original_tokenizer_vocab_size
    end = start + cosyvoice2_token_size + 3  # codec tokens + 3 EOS variants
    with torch.no_grad():
        new_lm_head.weight.zero_()
        new_lm_head.bias.fill_(-float("inf"))
        # Copy the original speech decoder projection into the new head.
        new_lm_head.weight[start:end] = llm_decoder.weight
        new_lm_head.bias[start:end] = llm_decoder.bias
    llm.lm_head = new_lm_head

    input_embeddings = llm.get_input_embeddings()
    with torch.no_grad():
        # Copy the speech codec + EOS embeddings, followed by the two rows of
        # llm_embedding, which hold the <|sos|> and <|task_id|> embeddings.
        input_embeddings.weight[start:end] = speech_embedding.weight
        input_embeddings.weight[end:end + 2] = llm_embedding.weight

    # Any of the three speech EOS variants terminates generation.
    eos_token_ids = [
        original_tokenizer_vocab_size + cosyvoice2_token_size,
        original_tokenizer_vocab_size + cosyvoice2_token_size + 1,
        original_tokenizer_vocab_size + cosyvoice2_token_size + 2,
    ]
    llm.generation_config.eos_token_id = eos_token_ids
    # Sampling defaults for speech-token generation.
    llm.generation_config.temperature = 1.0
    llm.generation_config.top_p = 0.8
    llm.generation_config.top_k = 25
    # Mirror an EOS id (the first of the three variants) on the model config.
    llm.config.eos_token_id = original_tokenizer_vocab_size + cosyvoice2_token_size
    llm.config.vocab_size = vocab_size
    # The merged input embeddings and lm_head now differ, so they must not be
    # tied; the new head also carries a bias, unlike the stock Qwen2 head.
    llm.config.tie_word_embeddings = False
    llm.config.use_bias = True

    llm.to(torch.bfloat16)
    llm.save_pretrained(args.save_path)

    # Chat template: a user turn renders as "<|sos|>{content}<|task_id|>", and
    # an assistant turn (the generated speech tokens) is emitted verbatim.
    TEMPLATE = (
        "{%- for message in messages %}"
        "{%- if message['role'] == 'user' %}"
        "{{- '<|sos|>' + message['content'] + '<|task_id|>' }}"
        "{%- elif message['role'] == 'assistant' %}"
        "{{- message['content']}}"
        "{%- endif %}"
        "{%- endfor %}"
    )
    tokenizer.chat_template = TEMPLATE
    tokenizer.save_pretrained(args.save_path)
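
    # Optional sanity check: a minimal sketch that reloads the saved tokenizer
    # and verifies the vocabulary layout and chat template built above. (The
    # model weights themselves are exercised end-to-end by infer.py.)
    check_tokenizer = AutoTokenizer.from_pretrained(args.save_path)
    assert check_tokenizer.convert_tokens_to_ids("<|s_0|>") == original_tokenizer_vocab_size
    assert check_tokenizer.convert_tokens_to_ids("<|eos1|>") == eos_token_ids[0]
    rendered = check_tokenizer.apply_chat_template(
        [{"role": "user", "content": "hello"}], tokenize=False
    )
    assert rendered == "<|sos|>hello<|task_id|>"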