diff --git a/examples/offline_inference/qwen2_5/generate_eagle.py b/examples/offline_inference/qwen2_5/generate_eagle.py
new file mode 100644
index 0000000000000000000000000000000000000000..62cca75d88cf361d1551d1d2a2505570e684fd8c
--- /dev/null
+++ b/examples/offline_inference/qwen2_5/generate_eagle.py
@@ -0,0 +1,191 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Copyright 2025 Huawei Technologies Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Example of using EAGLE speculative decoding with Qwen2.5 model.
+
+This example demonstrates how to use EAGLE (or EAGLE3) speculative decoding
+to accelerate inference with Qwen2.5 models on MindSpore backend.
+
+Usage:
+    # Basic EAGLE usage
+    python examples/offline_inference/qwen2_5/generate_eagle.py \\
+        --model Qwen/Qwen2.5-7B-Instruct \\
+        --draft-model [path-to-eagle-draft-model]
+
+    # EAGLE3 usage (if available)
+    python examples/offline_inference/qwen2_5/generate_eagle.py \\
+        --model Qwen/Qwen2.5-7B-Instruct \\
+        --draft-model [path-to-eagle-draft-model] \\
+        --speculative-method eagle3
+
+Note: You need to prepare an EAGLE draft model for the target model.
+Please refer to the EAGLE paper for how to train draft models.
+"""
+
+import argparse
+import time
+from typing import List, Optional
+
+from vllm import LLM, SamplingParams
+
+
+def run_inference(
+    model_path: str,
+    draft_model_path: str,
+    speculative_method: str = "eagle",
+    prompts: Optional[List[str]] = None,
+    max_tokens: int = 256,
+    num_speculative_tokens: int = 2,
+):
+    """
+    Run inference with EAGLE speculative decoding.
+
+    Builds the engine, generates a completion for every prompt, and prints
+    each completion followed by the elapsed time and token throughput.
+
+    Args:
+        model_path: Path to the target model (Qwen2.5).
+        draft_model_path: Path to the EAGLE draft model.
+        speculative_method: Speculative decoding method ("eagle" or "eagle3").
+        prompts: List of input prompts. If None, use default prompts.
+        max_tokens: Maximum number of tokens to generate.
+        num_speculative_tokens: Number of draft tokens proposed per step.
+    """
+    if prompts is None:
+        prompts = [
+            "你好,请介绍一下人工智能的发展历程。",
+            "What is the capital of France?",
+            "解释一下量子计算的基本原理。",
+        ]
+
+    print("模型路径:", model_path)
+    print("草稿模型路径:", draft_model_path)
+    print("推测方法:", speculative_method)
+    print("-" * 80)
+
+    # Configure speculative decoding. vLLM's speculative config names the
+    # draft checkpoint "model" (there is no "draft_model" key) and needs an
+    # explicit number of draft tokens for EAGLE.
+    speculative_config = {
+        "method": speculative_method,
+        "model": draft_model_path,
+        "num_speculative_tokens": num_speculative_tokens,
+    }
+
+    # Initialize LLM with EAGLE. The deprecated `use_v2_block_manager` flag
+    # is deliberately not passed: it only concerned the block manager and
+    # never selected the V1 engine.
+    llm = LLM(
+        model=model_path,
+        speculative_config=speculative_config,
+        trust_remote_code=True,
+        max_model_len=2048,
+    )
+
+    # Set sampling parameters
+    sampling_params = SamplingParams(
+        temperature=0.7,
+        top_p=0.8,
+        max_tokens=max_tokens,
+    )
+
+    print(f"\n开始推理 (共 {len(prompts)} 个提示)...")
+    print("=" * 80)
+
+    # Time generation with a monotonic clock so that adjustments to the
+    # system clock cannot distort the measurement.
+    start_time = time.perf_counter()
+    outputs = llm.generate(prompts, sampling_params)
+    elapsed_time = time.perf_counter() - start_time
+
+    # Print results
+    print(f"\n推理完成! 总耗时: {elapsed_time:.2f}秒")
+    print("=" * 80)
+
+    for i, output in enumerate(outputs, start=1):
+        print(f"\n提示 {i}:")
+        print(f"输入: {output.prompt}")
+        print(f"输出: {output.outputs[0].text}")
+        print("-" * 80)
+
+    # Calculate statistics; guard the division in case no time elapsed.
+    total_tokens = sum(len(output.outputs[0].token_ids) for output in outputs)
+    throughput = total_tokens / elapsed_time if elapsed_time > 0 else 0.0
+
+    print("\n性能统计:")
+    print(f" 总生成tokens: {total_tokens}")
+    print(f" 吞吐量: {throughput:.2f} tokens/s")
+    print("=" * 80)
+
+
+def main():
+    """Parse the command-line arguments and run the EAGLE example."""
+    parser = argparse.ArgumentParser(
+        description="使用EAGLE推测解码进行Qwen2.5模型推理"
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="Qwen/Qwen2.5-7B-Instruct",
+        help="目标模型路径",
+    )
+    parser.add_argument(
+        "--draft-model",
+        type=str,
+        required=True,
+        help="EAGLE草稿模型路径",
+    )
+    parser.add_argument(
+        "--speculative-method",
+        type=str,
+        default="eagle",
+        choices=["eagle", "eagle3"],
+        help="推测解码方法",
+    )
+    parser.add_argument(
+        "--num-speculative-tokens",
+        type=int,
+        default=2,
+        help="每步推测的草稿tokens数",
+    )
+    parser.add_argument(
+        "--max-tokens",
+        type=int,
+        default=256,
+        help="最大生成tokens数",
+    )
+    parser.add_argument(
+        "--prompt",
+        type=str,
+        action="append",
+        help="自定义提示(可多次使用)",
+    )
+
+    args = parser.parse_args()
+
+    # Run inference
+    run_inference(
+        model_path=args.model,
+        draft_model_path=args.draft_model,
+        speculative_method=args.speculative_method,
+        prompts=args.prompt,
+        max_tokens=args.max_tokens,
+        num_speculative_tokens=args.num_speculative_tokens,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/st/python/test_eagle_spec_decode.py b/tests/st/python/test_eagle_spec_decode.py
new file mode 100644
index 0000000000000000000000000000000000000000..bdafbe60bb26bd6ffe40127eb3f6614be9818287
--- /dev/null
+++ b/tests/st/python/test_eagle_spec_decode.py
@@ -0,0 +1,72 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Copyright 2025 Huawei Technologies Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Smoke tests for the EAGLE speculative decoding package.
+
+These tests check that the public EAGLE names can be imported and that the
+classes and the factory expose the expected attributes and behavior.
+"""
+
+import pytest
+
+
+class TestEAGLESpecDecode:
+    """Import and attribute checks for the EAGLE components."""
+
+    def test_eagle_import(self):
+        """All three public EAGLE names must be importable."""
+        try:
+            from vllm_mindspore.v1.spec_decode import (
+                EAGLEModelRunner, EAGLEWorker, get_eagle_model_runner)
+        except ImportError as exc:
+            pytest.fail(f"Failed to import EAGLE components: {exc}")
+
+        # Each name must be bound to a real object, not a None placeholder.
+        components = (EAGLEModelRunner, EAGLEWorker, get_eagle_model_runner)
+        for component in components:
+            assert component is not None
+
+    def test_eagle_model_runner_class(self):
+        """EAGLEModelRunner must define its constructor and metadata hook."""
+        from vllm_mindspore.v1.spec_decode import EAGLEModelRunner
+
+        # Both the constructor and the metadata hook must be present.
+        required = ('__init__', '_prepare_common_attention_metadata')
+        assert all(hasattr(EAGLEModelRunner, name) for name in required)
+
+    def test_eagle_worker_class(self):
+        """EAGLEWorker must define its constructor and runner-class hook."""
+        from vllm_mindspore.v1.spec_decode import EAGLEWorker
+
+        # Both the constructor and the runner-class accessor must be present.
+        required = ('__init__', 'get_model_runner_cls')
+        assert all(hasattr(EAGLEWorker, name) for name in required)
+
+    def test_get_eagle_model_runner_factory(self):
+        """The factory returns None when no speculative config is set."""
+        from vllm_mindspore.v1.spec_decode import get_eagle_model_runner
+
+        # A throwaway class stands in for VllmConfig with speculation off.
+        config_stub = type('MockConfig', (), {'speculative_config': None})()
+        runner = get_eagle_model_runner(config_stub, None)
+        assert runner is None
+
+        # Note: a full integration test would require a real model,
+        # which is beyond the scope of this basic test.
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
diff --git a/vllm_mindspore/v1/spec_decode/__init__.py b/vllm_mindspore/v1/spec_decode/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..8f96cebf1b0b5e84070703f3c169a404e2fcf0f7 100644
--- a/vllm_mindspore/v1/spec_decode/__init__.py
+++ b/vllm_mindspore/v1/spec_decode/__init__.py
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/v0.9.1/vllm/v1/spec_decode/__init__.py
+#
+# Copyright 2025 Huawei Technologies Co., Ltd.
+# Copyright 2024-2025 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from vllm_mindspore.v1.spec_decode.eagle import (  # noqa: F401
+    EAGLEModelRunner, EAGLEWorker, get_eagle_model_runner)
+
+__all__ = [
+    "EAGLEModelRunner",
+    "EAGLEWorker",
+    "get_eagle_model_runner",
+]
diff --git a/vllm_mindspore/v1/spec_decode/eagle.py b/vllm_mindspore/v1/spec_decode/eagle.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a44c9d5ea77675babbee90a60085be3f00432b6
--- /dev/null
+++ b/vllm_mindspore/v1/spec_decode/eagle.py
@@ -0,0 +1,163 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/v0.9.1/vllm/v1/spec_decode/eagle.py
+#
+# Copyright 2025 Huawei Technologies Co., Ltd.
+# Copyright 2024-2025 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)
+implementation for vLLM-MindSpore.
+
+This module provides the EAGLE speculative decoding functionality,
+adapted to work with MindSpore backend.
+"""
+
+from typing import Optional
+
+import torch
+from vllm.config import VllmConfig
+# NOTE(review): confirm that the installed vLLM exports these two names from
+# vllm.v1.spec_decode.eagle; upstream v0.9.1 appears to expose EagleProposer
+# there instead, in which case this import fails at module load.
+from vllm.v1.spec_decode.eagle import (
+    EAGLEModelRunner as VllmEAGLEModelRunner,
+    EAGLEWorker as VllmEAGLEWorker,
+)
+
+from vllm_mindspore.v1.attention.backends.ms_attn import (
+    CommonAttentionMetadata)
+
+
+class EAGLEModelRunner(VllmEAGLEModelRunner):
+    """
+    EAGLE model runner for MindSpore backend.
+
+    Subclasses the vLLM EAGLE model runner and defines
+    ``_prepare_common_attention_metadata`` so that it builds the MindSpore
+    backend's ``CommonAttentionMetadata`` type.
+    """
+
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        device: torch.device,
+        is_draft_model: bool = False,
+    ):
+        """
+        Initialize the EAGLE model runner for MindSpore.
+
+        All arguments are forwarded unchanged to the vLLM base class.
+
+        Args:
+            vllm_config: vLLM configuration object.
+            device: Device to run the model on.
+            is_draft_model: Whether this runner drives the draft model.
+        """
+        super().__init__(vllm_config, device, is_draft_model)
+        # Additional MindSpore-specific initialization can be added here
+
+    def _prepare_common_attention_metadata(
+        self,
+        query_start_loc: torch.Tensor,
+        seq_lens: torch.Tensor,
+    ) -> CommonAttentionMetadata:
+        """
+        Prepare common attention metadata for MindSpore backend.
+
+        The tensors are passed through as-is; no conversion is performed
+        here.
+
+        Args:
+            query_start_loc: Starting location of each query sequence.
+            seq_lens: Length of each sequence.
+
+        Returns:
+            CommonAttentionMetadata wrapping the two tensors.
+        """
+        # No explicit torch -> MindSpore conversion is done in this initial
+        # implementation; tensor compatibility is left to msadapter.
+        return CommonAttentionMetadata(
+            query_start_loc=query_start_loc,
+            seq_lens=seq_lens,
+        )
+
+
+class EAGLEWorker(VllmEAGLEWorker):
+    """
+    EAGLE worker for MindSpore backend.
+
+    Subclasses the vLLM EAGLE worker and exposes the MindSpore
+    ``EAGLEModelRunner`` through ``get_model_runner_cls``.
+    """
+
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        local_rank: int,
+        rank: int,
+        distributed_init_method: str,
+    ):
+        """
+        Initialize the EAGLE worker for MindSpore.
+
+        All arguments are forwarded unchanged to the vLLM base class.
+        """
+        super().__init__(
+            vllm_config,
+            local_rank,
+            rank,
+            distributed_init_method,
+        )
+        # Additional MindSpore-specific initialization can be added here
+
+    def get_model_runner_cls(self):
+        """Return the EAGLE model runner class for MindSpore."""
+        return EAGLEModelRunner
+
+
+def get_eagle_model_runner(
+    vllm_config: VllmConfig,
+    device: torch.device,
+    is_draft_model: bool = False,
+) -> Optional[EAGLEModelRunner]:
+    """
+    Factory function to create an EAGLE model runner.
+
+    Args:
+        vllm_config: vLLM configuration object.
+        device: Device to run the model on.
+        is_draft_model: Whether this is a draft model or target model.
+
+    Returns:
+        EAGLEModelRunner instance if the configured speculative method is
+        "eagle" or "eagle3"; None otherwise, including when no speculative
+        config is set.
+    """
+    speculative_config = vllm_config.speculative_config
+    if speculative_config is None:
+        return None
+
+    # The speculative config can be a plain dict or a config object that
+    # exposes `method` as an attribute; a config object need not provide
+    # `.get()`, so read the field in a way that works for both.
+    if isinstance(speculative_config, dict):
+        method = speculative_config.get("method", "")
+    else:
+        method = getattr(speculative_config, "method", "")
+    if method not in ("eagle", "eagle3"):
+        return None
+
+    return EAGLEModelRunner(vllm_config, device, is_draft_model)