#!/usr/bin/env bash

set -euo pipefail

stage=-1
stop_stage=4
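
# Only stages N with stage <= N <= stop_stage are executed; edit the two
# variables above to select what runs (e.g. stage=2 and stop_stage=2 run
# only the GRPO training stage).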

log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

export PYTHONPATH=/workspace/CosyVoice

model_scope_model_path=./CosyVoice2-0.5B
sft_model_path=./transformers_cosyvoice2_llm

if [ $stage -le -2 ] && [ $stop_stage -ge -2 ]; then
  log "stage -2: install dependencies locally if pre-built docker image is not available"
  conda create -n cosyvoice2 python=3.10 -y
  # Note: "conda activate" in a non-interactive script may require conda's shell hook
  # to be sourced first, e.g. source "$(conda info --base)/etc/profile.d/conda.sh"
  conda activate cosyvoice2
  # install verl
  git clone https://github.com/yuekaizhang/verl.git -b thread
  cd verl
  USE_MEGATRON=0 bash scripts/install_vllm_sglang_mcore.sh
  pip install --no-deps -e .
  cd -
  # install requirements
  pip install -r requirements.txt
  pip install -U nvidia-pytriton
  git clone https://github.com/yuekaizhang/PytritonSenseVoice.git && cd PytritonSenseVoice && pip install -e .
fi

if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
  log "stage -1: download official CosyVoice2-0.5B LLM model and convert to huggingface compatible checkpoint"
  modelscope download --model iic/CosyVoice2-0.5B --local_dir $model_scope_model_path
  python3 pretrained_to_huggingface.py \
    --pretrained-cosyvoice2-path $model_scope_model_path \
    --save-path $sft_model_path
  # Or, you could use the following command to download the huggingface compatible checkpoint
  # huggingface-cli download --local-dir $sft_model_path yuekai/cosyvoice2_llm
  # Note: we remove the lm_head's bias to make it compatible with the Qwen2.5-0.5B model in Transformers.
fi

data_dir=data/parquet_aishell3

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "stage 0: prepare data into verl format"
  mkdir -p $data_dir
  wget -O data/aishell-3.jsonl https://huggingface.co/datasets/SparkAudio/voxbox/resolve/main/metadata/aishell-3.jsonl
  # total 88035 samples: the first 80000 are used for training and the last 100
  # for testing; the samples in between are left unused
  head -n 80000 data/aishell-3.jsonl > data/train.jsonl
  tail -n 100 data/aishell-3.jsonl > data/test.jsonl
  python prepare_data.py \
    --train_file data/train.jsonl \
    --test_file data/test.jsonl \
    --local_dir $data_dir
fi

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "stage 1: start token2wav asr server for reward function"
  python3 token2wav_asr_server.py --number-of-devices 8
fi
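
# Note: the server command above runs in the foreground and blocks this script,
# so stage 1 is expected to be launched separately from stage 2 (e.g. in another
# terminal). A possible alternative (not part of the original recipe) is to
# background it:
#   nohup python3 token2wav_asr_server.py --number-of-devices 8 > token2wav_asr_server.log 2>&1 &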

exp_name=official_llm_aishell3_grpo

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "stage 2: grpo train"
  export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
  export MKL_SERVICE_FORCE_INTEL=TRUE
  n_gpus_per_node=8
  micro_batch_size=4
  train_batch_size=32
  python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=grpo \
    data.train_files=$data_dir/train.parquet \
    data.val_files=$data_dir/test.parquet \
    data.train_batch_size=$train_batch_size \
    data.max_prompt_length=1024 \
    data.max_response_length=512 \
    data.truncation='error' \
    actor_rollout_ref.model.use_remove_padding=False \
    actor_rollout_ref.model.path=$sft_model_path \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size=32 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=$micro_batch_size \
    actor_rollout_ref.actor.use_kl_loss=False \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=$micro_batch_size \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.rollout.do_sample=true \
    actor_rollout_ref.rollout.temperature=0.8 \
    actor_rollout_ref.rollout.top_p=0.95 \
    actor_rollout_ref.rollout.top_k=25 \
    actor_rollout_ref.rollout.n=4 \
    actor_rollout_ref.rollout.val_kwargs.do_sample=true \
    actor_rollout_ref.rollout.val_kwargs.temperature=0.8 \
    actor_rollout_ref.rollout.val_kwargs.top_p=0.95 \
    actor_rollout_ref.rollout.val_kwargs.top_k=25 \
    reward_model.reward_manager=prime \
    custom_reward_function.path=reward_tts.py \
    custom_reward_function.name=compute_score \
    trainer.project_name='cosyvoice2_grpo' \
    trainer.experiment_name=$exp_name \
    trainer.logger=['console','wandb'] \
    trainer.n_gpus_per_node=$n_gpus_per_node \
    trainer.nnodes=1 \
    trainer.save_freq=100 \
    trainer.test_freq=100 \
    trainer.resume_mode='auto' \
    trainer.total_epochs=1 \
    trainer.val_before_train=False
fi
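
# Checkpoints to evaluate below. With trainer.save_freq=100, a checkpoint is
# saved every 100 global steps; adjust this list to the global_step_* directories
# actually present under ./checkpoints/cosyvoice2_grpo/$exp_name after training.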
steps=(100 200 300 400 500)

for step in "${steps[@]}"; do
  llm_path=./checkpoints/cosyvoice2_grpo/$exp_name/global_step_${step}

  if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
    log "stage 3: merge the model"
    python -m verl.model_merger merge \
      --backend fsdp \
      --local_dir $llm_path/actor \
      --target_dir $llm_path/merged_hf_model || exit 1
  fi

  if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
    log "stage 4: Test the model"
    dataset=zero_shot_zh # from CosyVoice3 test set
    # dataset=test_zh # from seed_tts test set
    output_dir=./outputs_${exp_name}_${step}_${dataset}
    token2wav_path=/workspace/CosyVoice2-0.5B
    model_path=$llm_path/merged_hf_model
    CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
    torchrun --nproc_per_node=8 \
      infer_dataset.py \
      --output-dir $output_dir \
      --llm-model-name-or-path $model_path \
      --token2wav-path $token2wav_path \
      --split-name ${dataset} || exit 1
    bash scripts/compute_wer.sh $output_dir ${dataset}
  fi
done
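
# Note: stage 5 below reuses $llm_path from the last iteration of the loop
# above, i.e. the global_step_500 checkpoint with the default steps list.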

if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
  log "stage 5: Convert the RL trained model to CosyVoice repo format"
  python3 huggingface_to_pretrained.py \
    --hf-cosyvoice2-llm-path $llm_path/merged_hf_model \
    --output-path /workspace/CosyVoice2-0.5B/llm-new.pt
  # You need to manually move llm-new.pt to overwrite /workspace/CosyVoice2-0.5B/llm.pt.
  # However, we found that the accuracy of the RL-trained model drops slightly after this
  # conversion. Please be careful, or use the huggingface-format inference code instead.
fi