# SPDX-License-Identifier: Apache-2.0
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py
# Copyright 2024 The Qwen team.
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
- """Inference-only Qwen2 model compatible with HuggingFace weights."""
from vllm.model_executor.models.qwen2 import *
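# The star-import re-exports everything this wrapper needs from vLLM's stock
# Qwen2 implementation: torch, nn, Qwen2Model, ParallelLMHead,
# LogitsProcessor, AutoWeightsLoader, the pipeline-parallel helpers, and the
# typing aliases used in the signatures below.


# CosyVoice2's LLM is architecturally a plain Qwen2 causal LM, so this class
# mirrors vLLM's Qwen2ForCausalLM and reuses its decoder stack unchanged.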
class CosyVoice2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
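    # Fused-projection bookkeeping: tells vLLM's weight loader (and LoRA)
    # which per-weight checkpoint names are packed into each fused module.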
    packed_modules_mapping = {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
    }
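
    # Build the plain Qwen2 decoder; the output head exists only on the last
    # pipeline-parallel rank, and aliases the input embeddings when the
    # config requests weight tying.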
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
        lora_config = vllm_config.lora_config
        self.config = config
        self.lora_config = lora_config
        self.quant_config = quant_config

        self.model = Qwen2Model(vllm_config=vllm_config,
                                prefix=maybe_prefix(prefix, "model"))

        if get_pp_group().is_last_rank:
            if config.tie_word_embeddings:
                self.lm_head = self.model.embed_tokens
            else:
                self.lm_head = ParallelLMHead(config.vocab_size,
                                              config.hidden_size,
                                              bias=True,
                                              quant_config=quant_config,
                                              prefix=maybe_prefix(
                                                  prefix, "lm_head"))
        else:
            self.lm_head = PPMissingLayer()

        self.logits_processor = LogitsProcessor(config.vocab_size)
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)
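
    # Token-embedding lookups are delegated to the wrapped Qwen2Model.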
    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.model.get_input_embeddings(input_ids)
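
    # On the last pipeline-parallel rank this returns the final hidden
    # states; earlier ranks return IntermediateTensors for the next rank.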
    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        hidden_states = self.model(input_ids, positions, intermediate_tensors,
                                   inputs_embeds)
        return hidden_states
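
    # Unlike stock Qwen2 in vLLM, the lm_head here carries a bias, so it is
    # handed to the logits processor explicitly.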
    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        # A tied head aliases embed_tokens, which has no bias parameter, so
        # fall back to None rather than raising AttributeError.
        bias = getattr(self.lm_head, "bias", None)
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata, bias)
        return logits
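
    # "lm_head." checkpoint entries are skipped when embeddings are tied,
    # since the head then aliases embed_tokens and must not be loaded twice.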
    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        loader = AutoWeightsLoader(
            self,
            skip_prefixes=(["lm_head."]
                           if self.config.tie_word_embeddings else None),
        )
        return loader.load_weights(weights)
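

# Minimal wiring sketch (an illustration, not part of the CosyVoice source):
# vLLM resolves the "CosyVoice2ForCausalLM" architecture string through its
# model registry, so an out-of-tree class like this one must be registered
# before an engine is created. ModelRegistry.register_model is vLLM's
# documented hook for that; the checkpoint path below is a placeholder.
if __name__ == "__main__":
    from vllm import LLM, ModelRegistry

    ModelRegistry.register_model("CosyVoice2ForCausalLM",
                                 CosyVoice2ForCausalLM)

    # Assumes a local HF-format checkpoint whose config.json lists
    # "CosyVoice2ForCausalLM" under "architectures".
    llm = LLM(model="/path/to/cosyvoice2_llm")
    print(llm.generate(["<placeholder prompt>"]))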