8 mēneši atpakaļ · c3250c222f
--- a/examples/magicdata-read/cosyvoice/conf
+++ b/examples/magicdata-read/cosyvoice/conf
@@ -0,0 +1 @@
 
				+../../libritts/cosyvoice/conf
			
--- a/examples/magicdata-read/cosyvoice/conf/cosyvoice.fromscratch.yaml
+++ b/examples/magicdata-read/cosyvoice/conf/cosyvoice.fromscratch.yaml
@@ -1,203 +0,0 @@
 
				-# set random seed, so that you may reproduce your result.
			
 
				-__set_seed1: !apply:random.seed [1986]
			
 
				-__set_seed2: !apply:numpy.random.seed [1986]
			
 
				-__set_seed3: !apply:torch.manual_seed [1986]
			
 
				-__set_seed4: !apply:torch.cuda.manual_seed_all [1986]
			
 
				-
			
 
				-# fixed params
			
 
				-sample_rate: 22050
			
 
				-text_encoder_input_size: 512
			
 
				-llm_input_size: 1024
			
 
				-llm_output_size: 1024
			
 
				-spk_embed_dim: 192
			
 
				-
			
 
				-# model params
			
 
				-# for all class/function included in this repo, we use !<name> or !<new> for intialization, so that user may find all corresponding class/function according to one single yaml.
			
 
				-# for system/third_party class/function, we do not require this.
			
 
				-llm: !new:cosyvoice.llm.llm.TransformerLM
			
 
				-    text_encoder_input_size: !ref <text_encoder_input_size>
			
 
				-    llm_input_size: !ref <llm_input_size>
			
 
				-    llm_output_size: !ref <llm_output_size>
			
 
				-    text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe
			
 
				-    speech_token_size: 4096
			
 
				-    length_normalized_loss: True
			
 
				-    lsm_weight: 0
			
 
				-    spk_embed_dim: !ref <spk_embed_dim>
			
 
				-    text_encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
			
 
				-        input_size: !ref <text_encoder_input_size>
			
 
				-        output_size: 1024
			
 
				-        attention_heads: 8
			
 
				-        linear_units: 2048
			
 
				-        num_blocks: 3
			
 
				-        dropout_rate: 0.1
			
 
				-        positional_dropout_rate: 0.1
			
 
				-        attention_dropout_rate: 0.0
			
 
				-        normalize_before: True
			
 
				-        input_layer: 'linear'
			
 
				-        pos_enc_layer_type: 'rel_pos_espnet'
			
 
				-        selfattention_layer_type: 'rel_selfattn'
			
 
				-        use_cnn_module: False
			
 
				-        macaron_style: False
			
 
				-        use_dynamic_chunk: False
			
 
				-        use_dynamic_left_chunk: False
			
 
				-        static_chunk_size: 1
			
 
				-    llm: !new:cosyvoice.transformer.encoder.TransformerEncoder
			
 
				-        input_size: !ref <llm_input_size>
			
 
				-        output_size: !ref <llm_output_size>
			
 
				-        attention_heads: 8
			
 
				-        linear_units: 2048
			
 
				-        num_blocks: 7
			
 
				-        dropout_rate: 0.1
			
 
				-        positional_dropout_rate: 0.1
			
 
				-        attention_dropout_rate: 0.0
			
 
				-        input_layer: 'linear_legacy'
			
 
				-        pos_enc_layer_type: 'rel_pos_espnet'
			
 
				-        selfattention_layer_type: 'rel_selfattn'
			
 
				-        static_chunk_size: 1
			
 
				-    sampling: !name:cosyvoice.utils.common.ras_sampling
			
 
				-        top_p: 0.8
			
 
				-        top_k: 25
			
 
				-        win_size: 10
			
 
				-        tau_r: 0.1
			
 
				-
			
 
				-flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
			
 
				-    input_size: 512
			
 
				-    output_size: 80
			
 
				-    spk_embed_dim: !ref <spk_embed_dim>
			
 
				-    output_type: 'mel'
			
 
				-    vocab_size: 4096
			
 
				-    input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe
			
 
				-    only_mask_loss: True
			
 
				-    encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
			
 
				-        output_size: 512
			
 
				-        attention_heads: 4
			
 
				-        linear_units: 1024
			
 
				-        num_blocks: 3
			
 
				-        dropout_rate: 0.1
			
 
				-        positional_dropout_rate: 0.1
			
 
				-        attention_dropout_rate: 0.1
			
 
				-        normalize_before: True
			
 
				-        input_layer: 'linear'
			
 
				-        pos_enc_layer_type: 'rel_pos_espnet'
			
 
				-        selfattention_layer_type: 'rel_selfattn'
			
 
				-        input_size: 512
			
 
				-        use_cnn_module: False
			
 
				-        macaron_style: False
			
 
				-    length_regulator: !new:cosyvoice.flow.length_regulator.InterpolateRegulator
			
 
				-        channels: 80
			
 
				-        sampling_ratios: [1, 1, 1, 1]
			
 
				-    decoder: !new:cosyvoice.flow.flow_matching.ConditionalCFM
			
 
				-        in_channels: 240
			
 
				-        n_spks: 1
			
 
				-        spk_emb_dim: 80
			
 
				-        cfm_params: !new:omegaconf.DictConfig
			
 
				-            content:
			
 
				-                sigma_min: 1e-06
			
 
				-                solver: 'euler'
			
 
				-                t_scheduler: 'cosine'
			
 
				-                training_cfg_rate: 0.2
			
 
				-                inference_cfg_rate: 0.7
			
 
				-                reg_loss_type: 'l1'
			
 
				-        estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
			
 
				-            in_channels: 320
			
 
				-            out_channels: 80
			
 
				-            channels: [256, 256]
			
 
				-            dropout: 0.0
			
 
				-            attention_head_dim: 64
			
 
				-            n_blocks: 4
			
 
				-            num_mid_blocks: 8
			
 
				-            num_heads: 8
			
 
				-            act_fn: 'gelu'
			
 
				-
			
 
				-hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
			
 
				-    in_channels: 80
			
 
				-    base_channels: 512
			
 
				-    nb_harmonics: 8
			
 
				-    sampling_rate: !ref <sample_rate>
			
 
				-    nsf_alpha: 0.1
			
 
				-    nsf_sigma: 0.003
			
 
				-    nsf_voiced_threshold: 10
			
 
				-    upsample_rates: [8, 8]
			
 
				-    upsample_kernel_sizes: [16, 16]
			
 
				-    istft_params:
			
 
				-        n_fft: 16
			
 
				-        hop_len: 4
			
 
				-    resblock_kernel_sizes: [3, 7, 11]
			
 
				-    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
			
 
				-    source_resblock_kernel_sizes: [7, 11]
			
 
				-    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
			
 
				-    lrelu_slope: 0.1
			
 
				-    audio_limit: 0.99
			
 
				-    f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
			
 
				-        num_class: 1
			
 
				-        in_channels: 80
			
 
				-        cond_channels: 512
			
 
				-
			
 
				-# processor functions
			
 
				-parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
			
 
				-get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
			
 
				-    multilingual: True
			
 
				-    num_languages: 100
			
 
				-    language: 'en'
			
 
				-    task: 'transcribe'
			
 
				-allowed_special: 'all'
			
 
				-tokenize: !name:cosyvoice.dataset.processor.tokenize
			
 
				-    get_tokenizer: !ref <get_tokenizer>
			
 
				-    allowed_special: !ref <allowed_special>
			
 
				-filter: !name:cosyvoice.dataset.processor.filter
			
 
				-    max_length: 40960
			
 
				-    min_length: 0
			
 
				-    token_max_length: 200
			
 
				-    token_min_length: 1
			
 
				-resample: !name:cosyvoice.dataset.processor.resample
			
 
				-    resample_rate: !ref <sample_rate>
			
 
				-feat_extractor: !name:matcha.utils.audio.mel_spectrogram
			
 
				-    n_fft: 1024
			
 
				-    num_mels: 80
			
 
				-    sampling_rate: !ref <sample_rate>
			
 
				-    hop_size: 256
			
 
				-    win_size: 1024
			
 
				-    fmin: 0
			
 
				-    fmax: 8000
			
 
				-    center: False
			
 
				-compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
			
 
				-    feat_extractor: !ref <feat_extractor>
			
 
				-parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
			
 
				-    normalize: True
			
 
				-shuffle: !name:cosyvoice.dataset.processor.shuffle
			
 
				-    shuffle_size: 1000
			
 
				-sort: !name:cosyvoice.dataset.processor.sort
			
 
				-    sort_size: 500  # sort_size should be less than shuffle_size
			
 
				-batch: !name:cosyvoice.dataset.processor.batch
			
 
				-    batch_type: 'dynamic'
			
 
				-    max_frames_in_batch: 12000
			
 
				-padding: !name:cosyvoice.dataset.processor.padding
			
 
				-    use_spk_embedding: False # change to True during sft
			
 
				-
			
 
				-# dataset processor pipeline
			
 
				-data_pipeline: [
			
 
				-    !ref <parquet_opener>,
			
 
				-    !ref <tokenize>,
			
 
				-    !ref <filter>,
			
 
				-    !ref <resample>,
			
 
				-    !ref <compute_fbank>,
			
 
				-    !ref <parse_embedding>,
			
 
				-    !ref <shuffle>,
			
 
				-    !ref <sort>,
			
 
				-    !ref <batch>,
			
 
				-    !ref <padding>,
			
 
				-]
			
 
				-
			
 
				-# train conf
			
 
				-train_conf:
			
 
				-    optim: adam
			
 
				-    optim_conf:
			
 
				-        lr: 0.002 # change to 0.001 if you want to train flow from scratch
			
 
				-    scheduler: warmuplr
			
 
				-    scheduler_conf:
			
 
				-        warmup_steps: 25000
			
 
				-    max_epoch: 200
			
 
				-    grad_clip: 5
			
 
				-    accum_grad: 2
			
 
				-    log_interval: 100
			
 
				-    save_per_step: -1
			
--- a/examples/magicdata-read/cosyvoice/conf/cosyvoice.yaml
+++ b/examples/magicdata-read/cosyvoice/conf/cosyvoice.yaml
@@ -1,203 +0,0 @@
 
				-# set random seed, so that you may reproduce your result.
			
 
				-__set_seed1: !apply:random.seed [1986]
			
 
				-__set_seed2: !apply:numpy.random.seed [1986]
			
 
				-__set_seed3: !apply:torch.manual_seed [1986]
			
 
				-__set_seed4: !apply:torch.cuda.manual_seed_all [1986]
			
 
				-
			
 
				-# fixed params
			
 
				-sample_rate: 22050
			
 
				-text_encoder_input_size: 512
			
 
				-llm_input_size: 1024
			
 
				-llm_output_size: 1024
			
 
				-spk_embed_dim: 192
			
 
				-
			
 
				-# model params
			
 
				-# for all class/function included in this repo, we use !<name> or !<new> for intialization, so that user may find all corresponding class/function according to one single yaml.
			
 
				-# for system/third_party class/function, we do not require this.
			
 
				-llm: !new:cosyvoice.llm.llm.TransformerLM
			
 
				-    text_encoder_input_size: !ref <text_encoder_input_size>
			
 
				-    llm_input_size: !ref <llm_input_size>
			
 
				-    llm_output_size: !ref <llm_output_size>
			
 
				-    text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe
			
 
				-    speech_token_size: 4096
			
 
				-    length_normalized_loss: True
			
 
				-    lsm_weight: 0
			
 
				-    spk_embed_dim: !ref <spk_embed_dim>
			
 
				-    text_encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
			
 
				-        input_size: !ref <text_encoder_input_size>
			
 
				-        output_size: 1024
			
 
				-        attention_heads: 16
			
 
				-        linear_units: 4096
			
 
				-        num_blocks: 6
			
 
				-        dropout_rate: 0.1
			
 
				-        positional_dropout_rate: 0.1
			
 
				-        attention_dropout_rate: 0.0
			
 
				-        normalize_before: True
			
 
				-        input_layer: 'linear'
			
 
				-        pos_enc_layer_type: 'rel_pos_espnet'
			
 
				-        selfattention_layer_type: 'rel_selfattn'
			
 
				-        use_cnn_module: False
			
 
				-        macaron_style: False
			
 
				-        use_dynamic_chunk: False
			
 
				-        use_dynamic_left_chunk: False
			
 
				-        static_chunk_size: 1
			
 
				-    llm: !new:cosyvoice.transformer.encoder.TransformerEncoder
			
 
				-        input_size: !ref <llm_input_size>
			
 
				-        output_size: !ref <llm_output_size>
			
 
				-        attention_heads: 16
			
 
				-        linear_units: 4096
			
 
				-        num_blocks: 14
			
 
				-        dropout_rate: 0.1
			
 
				-        positional_dropout_rate: 0.1
			
 
				-        attention_dropout_rate: 0.0
			
 
				-        input_layer: 'linear_legacy'
			
 
				-        pos_enc_layer_type: 'rel_pos_espnet'
			
 
				-        selfattention_layer_type: 'rel_selfattn'
			
 
				-        static_chunk_size: 1
			
 
				-    sampling: !name:cosyvoice.utils.common.ras_sampling
			
 
				-        top_p: 0.8
			
 
				-        top_k: 25
			
 
				-        win_size: 10
			
 
				-        tau_r: 0.1
			
 
				-
			
 
				-flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
			
 
				-    input_size: 512
			
 
				-    output_size: 80
			
 
				-    spk_embed_dim: !ref <spk_embed_dim>
			
 
				-    output_type: 'mel'
			
 
				-    vocab_size: 4096
			
 
				-    input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe
			
 
				-    only_mask_loss: True
			
 
				-    encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
			
 
				-        output_size: 512
			
 
				-        attention_heads: 8
			
 
				-        linear_units: 2048
			
 
				-        num_blocks: 6
			
 
				-        dropout_rate: 0.1
			
 
				-        positional_dropout_rate: 0.1
			
 
				-        attention_dropout_rate: 0.1
			
 
				-        normalize_before: True
			
 
				-        input_layer: 'linear'
			
 
				-        pos_enc_layer_type: 'rel_pos_espnet'
			
 
				-        selfattention_layer_type: 'rel_selfattn'
			
 
				-        input_size: 512
			
 
				-        use_cnn_module: False
			
 
				-        macaron_style: False
			
 
				-    length_regulator: !new:cosyvoice.flow.length_regulator.InterpolateRegulator
			
 
				-        channels: 80
			
 
				-        sampling_ratios: [1, 1, 1, 1]
			
 
				-    decoder: !new:cosyvoice.flow.flow_matching.ConditionalCFM
			
 
				-        in_channels: 240
			
 
				-        n_spks: 1
			
 
				-        spk_emb_dim: 80
			
 
				-        cfm_params: !new:omegaconf.DictConfig
			
 
				-            content:
			
 
				-                sigma_min: 1e-06
			
 
				-                solver: 'euler'
			
 
				-                t_scheduler: 'cosine'
			
 
				-                training_cfg_rate: 0.2
			
 
				-                inference_cfg_rate: 0.7
			
 
				-                reg_loss_type: 'l1'
			
 
				-        estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
			
 
				-            in_channels: 320
			
 
				-            out_channels: 80
			
 
				-            channels: [256, 256]
			
 
				-            dropout: 0.0
			
 
				-            attention_head_dim: 64
			
 
				-            n_blocks: 4
			
 
				-            num_mid_blocks: 12
			
 
				-            num_heads: 8
			
 
				-            act_fn: 'gelu'
			
 
				-
			
 
				-hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
			
 
				-    in_channels: 80
			
 
				-    base_channels: 512
			
 
				-    nb_harmonics: 8
			
 
				-    sampling_rate: !ref <sample_rate>
			
 
				-    nsf_alpha: 0.1
			
 
				-    nsf_sigma: 0.003
			
 
				-    nsf_voiced_threshold: 10
			
 
				-    upsample_rates: [8, 8]
			
 
				-    upsample_kernel_sizes: [16, 16]
			
 
				-    istft_params:
			
 
				-        n_fft: 16
			
 
				-        hop_len: 4
			
 
				-    resblock_kernel_sizes: [3, 7, 11]
			
 
				-    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
			
 
				-    source_resblock_kernel_sizes: [7, 11]
			
 
				-    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
			
 
				-    lrelu_slope: 0.1
			
 
				-    audio_limit: 0.99
			
 
				-    f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
			
 
				-        num_class: 1
			
 
				-        in_channels: 80
			
 
				-        cond_channels: 512
			
 
				-
			
 
				-# processor functions
			
 
				-parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
			
 
				-get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
			
 
				-    multilingual: True
			
 
				-    num_languages: 100
			
 
				-    language: 'en'
			
 
				-    task: 'transcribe'
			
 
				-allowed_special: 'all'
			
 
				-tokenize: !name:cosyvoice.dataset.processor.tokenize
			
 
				-    get_tokenizer: !ref <get_tokenizer>
			
 
				-    allowed_special: !ref <allowed_special>
			
 
				-filter: !name:cosyvoice.dataset.processor.filter
			
 
				-    max_length: 40960
			
 
				-    min_length: 0
			
 
				-    token_max_length: 200
			
 
				-    token_min_length: 1
			
 
				-resample: !name:cosyvoice.dataset.processor.resample
			
 
				-    resample_rate: !ref <sample_rate>
			
 
				-feat_extractor: !name:matcha.utils.audio.mel_spectrogram
			
 
				-    n_fft: 1024
			
 
				-    num_mels: 80
			
 
				-    sampling_rate: !ref <sample_rate>
			
 
				-    hop_size: 256
			
 
				-    win_size: 1024
			
 
				-    fmin: 0
			
 
				-    fmax: 8000
			
 
				-    center: False
			
 
				-compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
			
 
				-    feat_extractor: !ref <feat_extractor>
			
 
				-parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
			
 
				-    normalize: True
			
 
				-shuffle: !name:cosyvoice.dataset.processor.shuffle
			
 
				-    shuffle_size: 1000
			
 
				-sort: !name:cosyvoice.dataset.processor.sort
			
 
				-    sort_size: 500  # sort_size should be less than shuffle_size
			
 
				-batch: !name:cosyvoice.dataset.processor.batch
			
 
				-    batch_type: 'dynamic'
			
 
				-    max_frames_in_batch: 2000
			
 
				-padding: !name:cosyvoice.dataset.processor.padding
			
 
				-    use_spk_embedding: False # change to True during sft
			
 
				-
			
 
				-# dataset processor pipeline
			
 
				-data_pipeline: [
			
 
				-    !ref <parquet_opener>,
			
 
				-    !ref <tokenize>,
			
 
				-    !ref <filter>,
			
 
				-    !ref <resample>,
			
 
				-    !ref <compute_fbank>,
			
 
				-    !ref <parse_embedding>,
			
 
				-    !ref <shuffle>,
			
 
				-    !ref <sort>,
			
 
				-    !ref <batch>,
			
 
				-    !ref <padding>,
			
 
				-]
			
 
				-
			
 
				-# train conf
			
 
				-train_conf:
			
 
				-    optim: adam
			
 
				-    optim_conf:
			
 
				-        lr: 0.001 # change to 1e-5 during sft
			
 
				-    scheduler: warmuplr # change to constantlr during sft
			
 
				-    scheduler_conf:
			
 
				-        warmup_steps: 2500
			
 
				-    max_epoch: 200
			
 
				-    grad_clip: 5
			
 
				-    accum_grad: 2
			
 
				-    log_interval: 100
			
 
				-    save_per_step: -1
			
--- a/examples/magicdata-read/cosyvoice/conf/ds_stage2.json
+++ b/examples/magicdata-read/cosyvoice/conf/ds_stage2.json
@@ -1,42 +0,0 @@
 
				-{
			
 
				-  "train_micro_batch_size_per_gpu": 1,
			
 
				-  "gradient_accumulation_steps": 1,
			
 
				-  "steps_per_print": 100,
			
 
				-  "gradient_clipping": 5,
			
 
				-  "fp16": {
			
 
				-    "enabled": false,
			
 
				-    "auto_cast": false,
			
 
				-    "loss_scale": 0,
			
 
				-    "initial_scale_power": 16,
			
 
				-    "loss_scale_window": 256,
			
 
				-    "hysteresis": 2,
			
 
				-    "consecutive_hysteresis": false,
			
 
				-    "min_loss_scale": 1
			
 
				-  },
			
 
				-  "bf16": {
			
 
				-    "enabled": false
			
 
				-  },
			
 
				-  "zero_force_ds_cpu_optimizer": false,
			
 
				-  "zero_optimization": {
			
 
				-    "stage": 2,
			
 
				-    "offload_optimizer": {
			
 
				-      "device": "none",
			
 
				-      "pin_memory": true
			
 
				-    },
			
 
				-    "allgather_partitions": true,
			
 
				-    "allgather_bucket_size": 5e8,
			
 
				-    "overlap_comm": false,
			
 
				-    "reduce_scatter": true,
			
 
				-    "reduce_bucket_size": 5e8,
			
 
				-    "contiguous_gradients" : true
			
 
				-  },
			
 
				-  "optimizer": {
			
 
				-    "type": "AdamW",
			
 
				-    "params": {
			
 
				-        "lr": 0.001,
			
 
				-        "weight_decay": 0.0001,
			
 
				-        "torch_adam": true,
			
 
				-        "adam_w_mode": true
			
 
				-    }
			
 
				-  }
			
 
				-}
			
--- a/examples/magicdata-read/cosyvoice/run.sh
+++ b/examples/magicdata-read/cosyvoice/run.sh
@@ -83,7 +83,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
 
				   fi
			
 
				   cp data/train/parquet/data.list data/train.data.list
			
 
				   cp data/dev/parquet/data.list data/dev.data.list
			
 
				-  for model in llm flow; do
			
 
				+  for model in llm flow hifigan; do
			
 
				     torchrun --nnodes=1 --nproc_per_node=$num_gpus \
			
 
				         --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" \
			
 
				       cosyvoice/bin/train.py \
			
@@ -99,11 +99,26 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
 
				       --num_workers ${num_workers} \
			
 
				       --prefetch ${prefetch} \
			
 
				       --pin_memory \
			
 
				+      --use_amp \
			
 
				       --deepspeed_config ./conf/ds_stage2.json \
			
 
				       --deepspeed.save_states model+optimizer
			
 
				   done
			
 
				 fi
			
 
				 
			
 
				+# average model
			
 
				+average_num=5
			
 
				+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
			
 
				+  for model in llm flow hifigan; do
			
 
				+    decode_checkpoint=`pwd`/exp/cosyvoice/$model/$train_engine/${model}.pt
			
 
				+    echo "do model average and final checkpoint is $decode_checkpoint"
			
 
				+    python cosyvoice/bin/average_model.py \
			
 
				+      --dst_model $decode_checkpoint \
			
 
				+      --src_path `pwd`/exp/cosyvoice/$model/$train_engine  \
			
 
				+      --num ${average_num} \
			
 
				+      --val_best
			
 
				+  done
			
 
				+fi
			
 
				+
			
 
				 if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
			
 
				   echo "Export your model for inference speedup. Remember copy your llm or flow model to model_dir"
			
 
				   python cosyvoice/bin/export_jit.py --model_dir $pretrained_model_dir