
Commit 3a5fc5e

[Refactor][MoE] remove redundant code after refactoring fused_moe (#2612)
### What this PR does / why we need it?
There is a lot of redundant MoE-related code here, and the structure is not very clear. This PR does the following:
- moves the relatively independent apply_mlp code into a separate file;
- removes the alltoall_buffer and alltoall_seq environment variables;
- removes the code related to alltoall_buffer and alltoall_seq, keeping only a single TokenDispatcher subclass.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
e2e & ut

- vLLM version: v0.10.1.1
- vLLM main: vllm-project/vllm@4071c76

---------

Signed-off-by: Pr0Wh1teGivee <[email protected]>
Signed-off-by: weijinqian_v1 <[email protected]>
Co-authored-by: weijinqian0 <[email protected]>
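For code that imported the relocated helpers, the import-path changes can be read off the updated unit tests below. A minimal sketch, assuming only what those tests show (module paths and exported names are taken from the diffs in this commit, nothing else):

```python
# Minimal sketch of the import changes implied by the updated tests in this
# commit; module paths and names are copied from the diffs below.

# Before this commit the tests imported the MLP helper from the fused_moe module:
#   from vllm_ascend.ops.fused_moe import unified_apply_mlp
# After the refactor it lives in a dedicated module:
from vllm_ascend.ops.layers.moe_mlp import unified_apply_mlp

# The alltoall_seq dispatcher (MoEAlltoAllSeqOverLapDispatcher) is removed; the
# remaining dispatchers are still exposed by the token_dispatcher module:
from vllm_ascend.ops.moe_dispatcher.token_dispatcher import (
    TokenDispatcherWithAll2AllV, TokenDispatcherWithAllGather,
    TokenDispatcherWithMC2, get_token_dispatcher, setup_token_dispatchers)
```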
1 parent 20ae712 commit 3a5fc5e

13 files changed: +417 additions, -1237 deletions


.github/workflows/vllm_ascend_test.yaml

Lines changed: 0 additions & 1 deletion
@@ -279,7 +279,6 @@ jobs:
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeekV3_dbo
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_alltoallv
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W4A8DYNAMIC
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_sp_for_qwen3_moe

tests/e2e/multicard/test_offline_inference_distributed.py

Lines changed: 7 additions & 30 deletions
@@ -108,14 +108,13 @@ def test_models_distributed_pangu():
     ]
     max_tokens = 5

-    with VllmRunner(
-            snapshot_download("vllm-ascend/pangu-pro-moe-pruing"),
-            max_model_len=8192,
-            enforce_eager=True,
-            dtype="auto",
-            tensor_parallel_size=2,
-            distributed_executor_backend="mp",
-    ) as vllm_model:
+    with VllmRunner(snapshot_download("vllm-ascend/pangu-pro-moe-pruing"),
+                    max_model_len=8192,
+                    enforce_eager=True,
+                    dtype="auto",
+                    tensor_parallel_size=2,
+                    distributed_executor_backend="mp",
+                    enable_expert_parallel=True) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)

@@ -141,28 +140,6 @@ def test_models_distributed_topk() -> None:
     vllm_model.generate(example_prompts, sampling_params)


-@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ": "1"})
-def test_models_distributed_alltoallv() -> None:
-    example_prompts = [
-        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
-        "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
-        "Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
-    ]
-    dtype = "half"
-    sampling_params = SamplingParams(max_tokens=5,
-                                     temperature=0.0,
-                                     top_k=50,
-                                     top_p=0.9)
-
-    with VllmRunner(
-            "deepseek-ai/DeepSeek-V2-Lite",
-            dtype=dtype,
-            tensor_parallel_size=2,
-            distributed_executor_backend="mp",
-    ) as vllm_model:
-        vllm_model.generate(example_prompts, sampling_params)
-
-
 def test_models_distributed_Qwen3_W8A8():
     example_prompts = [
         "Hello, my name is",
Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+from unittest.mock import patch
+
+import torch
+
+from tests.ut.base import TestBase
+from vllm_ascend.ops.common_fused_moe import fused_experts_moge
+
+
+class TestFusedExpertsMoGE(TestBase):
+
+    def test_fused_experts_moge(self):
+        with patch('torch_npu.npu_grouped_matmul') as mock_grouped_matmul, \
+             patch('torch_npu.npu_swiglu') as mock_swiglu, \
+             patch('vllm_ascend.utils.is_310p') as mock_is_310p:
+
+            mock_is_310p.return_value = False
+
+            mock_grouped_matmul.side_effect = lambda x, weight, **kwargs: [
+                torch.randn(x[0].shape[0], weight[0].shape[1])
+            ]
+
+            mock_swiglu.side_effect = lambda x: x
+
+            hidden_states = torch.randn(4, 128)
+            w1 = torch.randn(4, 256, 128)
+            w2 = torch.randn(4, 128, 128)
+            topk_weights = torch.rand(4, 1)
+            topk_ids = torch.tensor([[0], [1], [2], [3]], dtype=torch.long)
+            top_k = 1
+            global_num_experts = 4
+
+            moe_parallel_config = type(
+                'MockConfig', (), {
+                    'ep_size': 1,
+                    'tp_size': 1,
+                    'dp_size': 1,
+                    'tp_rank': 0,
+                    'dp_rank': 0,
+                    'ep_rank': 0,
+                    'use_ep': True
+                })()
+
+            output = fused_experts_moge(
+                hidden_states=hidden_states,
+                w1=w1,
+                w2=w2,
+                moe_parallel_config=moe_parallel_config,
+                topk_weights=topk_weights,
+                topk_ids=topk_ids,
+                top_k=top_k,
+                global_num_experts=global_num_experts,
+                apply_router_weight_on_input=True,
+            )
+
+            self.assertEqual(output.shape, (4, 128))

tests/ut/ops/test_fused_ops.py

Lines changed: 40 additions & 48 deletions
@@ -27,9 +27,9 @@
 from vllm_ascend.ascend_forward_context import (FusedMoEState,
                                                 _get_fused_moe_state)
 from vllm_ascend.ops.fused_moe import (AscendFusedMoE,
-                                       AscendUnquantizedFusedMoEMethod,
-                                       unified_apply_mlp)
+                                       AscendUnquantizedFusedMoEMethod)
 from vllm_ascend.ops.layers.experts_selector import select_experts
+from vllm_ascend.ops.layers.moe_mlp import unified_apply_mlp
 from vllm_ascend.utils import AscendSocVersion, adapt_patch

 adapt_patch(True)
@@ -129,36 +129,38 @@ def capture_register(dispatcher_instance):
                                       with_quant=False)

     with patch('torch.distributed.get_rank', return_value=0), \
-        patch('torch.distributed.get_world_size', return_value=4), \
-        patch('vllm_ascend.ops.fused_moe.get_ep_group', return_value=mock_ep_and_mc2_group(mocker)), \
-        patch('vllm_ascend.ops.fused_moe.get_mc2_group', return_value=mock_ep_and_mc2_group(mocker)), \
-        patch('vllm_ascend.ops.fused_moe.get_tp_group', return_value=mock_dp_and_tp_group(mocker)), \
-        patch('vllm.distributed.parallel_state.get_tp_group', return_value=mock_dp_and_tp_group(mocker)), \
-        patch('vllm_ascend.ops.fused_moe.get_dp_group', return_value=mock_dp_and_tp_group(mocker)), \
-        patch('vllm.model_executor.layers.fused_moe.layer.get_dp_group', return_value=mock_dp_and_tp_group(mocker)), \
-        patch('torch.distributed.all_gather'), \
-        patch('torch.distributed.all_to_all_single'), \
-        patch('vllm_ascend.ops.fused_moe.tensor_model_parallel_all_reduce'), \
-        patch('vllm_ascend.ops.fused_moe.data_parallel_reduce_scatter'), \
-        patch('vllm.model_executor.layers.fused_moe.config.get_dp_group',
-              return_value=mock_dp_and_tp_group(mocker)), \
-        patch('vllm_ascend.ops.fused_moe.get_ascend_config',
-              return_value=MagicMock(
-                  torchair_graph_config=MagicMock(enabled=False, enable_multistream_moe=False),
-                  expert_map_path=None
-              )), \
-        patch('vllm_ascend.ops.fused_moe.determine_expert_map',
-              return_value=(3, torch.tensor([0, 1, 2, -1, -1, -1, -1, -1]))), \
-        patch('vllm_ascend.ops.fused_moe.get_forward_context',
-              return_value=mock_forward_context_obj), \
+        patch('torch.distributed.get_world_size', return_value=4), \
+        patch('vllm_ascend.ops.fused_moe.get_ep_group', return_value=mock_ep_and_mc2_group(mocker)), \
+        patch('vllm_ascend.ops.fused_moe.get_mc2_group', return_value=mock_ep_and_mc2_group(mocker)), \
+        patch('vllm_ascend.ops.fused_moe.get_tp_group', return_value=mock_dp_and_tp_group(mocker)), \
+        patch('vllm.distributed.parallel_state.get_tp_group', return_value=mock_dp_and_tp_group(mocker)), \
+        patch('vllm_ascend.ops.fused_moe.get_dp_group', return_value=mock_dp_and_tp_group(mocker)), \
+        patch('vllm.model_executor.layers.fused_moe.layer.get_dp_group', return_value=mock_dp_and_tp_group(mocker)), \
+        patch('torch.distributed.all_gather'), \
+        patch('torch.distributed.all_to_all_single'), \
+        patch('vllm_ascend.ops.fused_moe.tensor_model_parallel_all_reduce'), \
+        patch('vllm_ascend.ops.fused_moe.data_parallel_reduce_scatter'), \
+        patch('vllm.model_executor.layers.fused_moe.config.get_dp_group',
+              return_value=mock_dp_and_tp_group(mocker)), \
+        patch('vllm_ascend.ops.fused_moe.get_ascend_config',
+              return_value=MagicMock(
+                  torchair_graph_config=MagicMock(enabled=False, enable_multistream_moe=False),
+                  expert_map_path=None
+              )), \
+        patch('vllm_ascend.ops.fused_moe.determine_expert_map',
+              return_value=(3, torch.tensor([0, 1, 2, -1, -1, -1, -1, -1]))), \
+        patch('vllm_ascend.ops.fused_moe.get_forward_context',
+              return_value=mock_forward_context_obj), \
         patch('vllm_ascend.ops.fused_moe.get_current_vllm_config',
-              return_value=MagicMock(
-                  parallel_config=MagicMock(tensor_parallel_size=2),
-                  scheduler_config=MagicMock(max_num_seqs=4),
-                  model_config=MagicMock(max_model_len=2048)
-              )), \
+              return_value=MagicMock(
+                  parallel_config=MagicMock(tensor_parallel_size=2),
+                  scheduler_config=MagicMock(max_num_seqs=4),
+                  model_config=MagicMock(max_model_len=2048)
+              )), \
         patch("vllm_ascend.utils.get_ascend_soc_version", return_value=AscendSocVersion.A3), \
-        patch.object(token_dispatcher_module, 'setup_token_dispatchers', mock_setup_token_dispatchers):
+        patch.object(token_dispatcher_module, 'setup_token_dispatchers', mock_setup_token_dispatchers), \
+        patch('vllm_ascend.ops.layers.moe_mlp.get_forward_context',
+              return_value=mock_forward_context_obj):

         yield {
             'mock_forward_context_obj': mock_forward_context_obj,
@@ -441,12 +443,11 @@ def test_apply_without_expert_map(self, moe_method, mock_dist_env,

         assert result.shape == expected_shape

-    @pytest.mark.parametrize("others_param",
-                             [[16, False], [1, True], [1, False], [4, False]])
+    @pytest.mark.parametrize("others_param", [16, 1, 4])
     def test_apply_with_expert_map(self, moe_method, mock_dist_env,
                                    mock_moe_env, others_param):

-        ep_size, alltoall_buffer = others_param
+        ep_size = others_param
         is_prefill = False

         if ep_size == 1:
@@ -464,9 +465,7 @@ def test_apply_with_expert_map(self, moe_method, mock_dist_env,
                                    with_quant=False,
                                    token_dispatcher=selected_token_dispatcher)

-        with patch("vllm_ascend.ops.fused_moe.MOE_ALL2ALL_BUFFER",
-                   alltoall_buffer), \
-            patch("vllm_ascend.ops.fused_moe.get_forward_context", return_value=forward_context), \
+        with patch("vllm_ascend.ops.fused_moe.get_forward_context", return_value=forward_context), \
             patch("vllm_ascend.utils.get_ascend_soc_version", return_value=AscendSocVersion.A3):

             expert_map = torch.tensor([0, 1, 2, -1, -1, -1, -1, -1])
@@ -475,8 +474,6 @@ def test_apply_with_expert_map(self, moe_method, mock_dist_env,
             if ep_size == 1:
                 x = x.view(-1, 2)
             router_logits = torch.randn(8, 8)
-            if alltoall_buffer:
-                moe_method.max_model_len = 1
             layer = MagicMock()

             local_num_experts = 2
@@ -529,26 +526,21 @@ def test_select_experts(self, mock_dist_env, mock_moe_env,

 class TestUnifiedApplyMLP(TestBase):

-    @patch('vllm_ascend.ops.fused_moe.get_forward_context')
-    @patch('vllm_ascend.ops.fused_moe.get_mc2_group')
-    @patch('vllm_ascend.ops.fused_moe.is_310p')
+    @patch('vllm_ascend.ops.layers.moe_mlp.get_forward_context')
+    @patch('vllm_ascend.ops.layers.moe_mlp.is_310p')
     @patch('torch_npu.npu_grouped_matmul')
     @patch('torch_npu.npu_dynamic_quant')
     @patch('torch_npu.npu_dequant_swiglu_quant')
     def test_unified_apply_mlp_with_quantization_mc2(self, mock_npu_dequant,
                                                      mock_npu_dynamic_quant,
                                                      mock_npu_grouped_matmul,
                                                      mock_is_310p,
-                                                     mock_get_mc2_group,
                                                      mock_get_forward_context):

         mock_forward_context = MagicMock()
         mock_forward_context.fused_moe_state = FusedMoEState.MC2
         mock_get_forward_context.return_value = mock_forward_context

-        mock_mc2_group = MagicMock()
-        mock_get_mc2_group.return_value = mock_mc2_group
-
         mock_is_310p.return_value = False

         mock_npu_dynamic_quant.return_value = (torch.randint(-128,
@@ -601,7 +593,7 @@ def test_unified_apply_mlp_with_quantization_mc2(self, mock_npu_dequant,

         self.assertEqual(result.dtype, torch.bfloat16)

-    @patch('vllm_ascend.ops.fused_moe.is_310p')
+    @patch('vllm_ascend.ops.layers.moe_mlp.is_310p')
     @patch('torch_npu.npu_grouped_matmul')
     @patch('torch_npu.npu_swiglu')
     @patch('torch_npu.npu_dynamic_quant')
@@ -643,7 +635,7 @@ def test_unified_apply_mlp_without_quantization(self,
         self.assertEqual(result.shape, hidden_states.shape)
         self.assertEqual(result.dtype, torch.float16)

-    @patch('vllm_ascend.ops.fused_moe.get_forward_context')
+    @patch('vllm_ascend.ops.layers.moe_mlp.get_forward_context')
     @patch('torch_npu.npu_grouped_matmul')
     @patch('torch_npu.npu_swiglu')
     @patch('torch_npu.npu_dynamic_quant')
@@ -703,7 +695,7 @@ def test_unified_apply_mlp_with_quantization_and_dynamic_scale(
         self.assertEqual(result.shape, hidden_states.shape)
         self.assertEqual(result.dtype, torch.bfloat16)

-    @patch('vllm_ascend.ops.fused_moe.is_310p')
+    @patch('vllm_ascend.ops.layers.moe_mlp.is_310p')
     @patch('torch_npu.npu_grouped_matmul')
     @patch('torch_npu.npu_swiglu')
     @patch('torch_npu.npu_dynamic_quant')

tests/ut/ops/test_token_dispatcher.py

Lines changed: 4 additions & 48 deletions
@@ -17,57 +17,13 @@

 from unittest.mock import MagicMock, PropertyMock, patch

-import pytest
 import torch
-from pytest_mock import MockerFixture

-from tests.ut.base import PytestBase, TestBase
+from tests.ut.base import TestBase
 from vllm_ascend.ops.moe_dispatcher.token_dispatcher import (
-    AscendSocVersion, MoEAlltoAllSeqOverLapDispatcher, MoEDispatcherConfig,
-    TokenDispatcherWithAll2AllV, TokenDispatcherWithAllGather,
-    TokenDispatcherWithMC2, _Dispatchers, _register_token_dispatcher,
-    get_token_dispatcher, setup_token_dispatchers)
-
-
-class TestMoEAlltoAllSeqOverLapDispatcher(PytestBase):
-
-    @pytest.fixture
-    def config(self):
-        config = MoEDispatcherConfig()
-        config.set_num_local_experts(2)
-        config.set_num_moe_experts(4)
-        config.set_moe_pad_expert_input_to_capacity(False)
-        config.set_moe_expert_capacity_factor(None)
-        config.set_moe_router_topk(2)
-        config.set_moe_grouped_gemm(False)
-        config.set_group_topk(0)
-        config.set_num_groups(1)
-        config.set_is_fused(False)
-        return config.build()
-
-    def mock_ep_group(self, mocker):
-        mock_group = mocker.MagicMock()
-        mock_group.rank_in_group = 0
-        mock_group.world_size = 2
-        mock_group.device_group = "mock_group"
-        return mock_group
-
-    @pytest.fixture
-    def dispatcher(self, config, mocker: MockerFixture):
-        mocker.patch(
-            "vllm_ascend.ops.moe_dispatcher.token_dispatcher.get_ep_group",
-            return_value=self.mock_ep_group(mocker))
-        mocker.patch("torch.npu.current_device", return_value="cpu")
-        mocker.patch("torch.npu.Stream", return_value=mocker.MagicMock)
-        return MoEAlltoAllSeqOverLapDispatcher(config)
-
-    def test_initialization(self, dispatcher, config):
-        assert dispatcher.num_local_experts == config.num_local_experts
-        assert dispatcher.num_experts == config.num_moe_experts
-        assert dispatcher.local_expert_indices == [0, 1]
-        assert dispatcher.ep_rank == 0
-        assert dispatcher.ep_size == 2
-        assert dispatcher.overlap_stream is not None
+    AscendSocVersion, TokenDispatcherWithAll2AllV,
+    TokenDispatcherWithAllGather, TokenDispatcherWithMC2, _Dispatchers,
+    _register_token_dispatcher, get_token_dispatcher, setup_token_dispatchers)


 class TestTokenDispatcherWithMC2(TestBase):

tests/ut/torchair/ops/test_torchair_fused_moe.py

Lines changed: 3 additions & 8 deletions
@@ -353,8 +353,7 @@ def test_apply_without_expert_map(self, moe_method, mock_dist_env,
         else:
             assert result.shape == x.shape

-    @pytest.mark.parametrize("others_param",
-                             [[16, False], [1, True], [1, False], [4, False]])
+    @pytest.mark.parametrize("others_param", [16, 1, 4])
     def test_apply_with_expert_map(self, moe_method, mock_dist_env,
                                    mock_moe_env, others_param):
         """
@@ -363,22 +362,18 @@ def test_apply_with_expert_map(self, moe_method, mock_dist_env,
         3 test use_select_experts and fused_experts_with_all2all
         4 test use_select_experts and fused_experts
         """
-        ep_size, alltoall_buffer = others_param
+        ep_size = others_param
         is_prefill = False
         forward_context = MagicMock(
             fused_moe_state=_get_fused_moe_state(ep_size, is_prefill, True))
-        with patch("vllm_ascend.torchair.ops.torchair_fused_moe.MOE_ALL2ALL_BUFFER",
-                   alltoall_buffer), \
-            patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context", return_value=forward_context), \
+        with patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context", return_value=forward_context), \
             patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_ascend_soc_version", return_value=AscendSocVersion.A3):
             expert_map = torch.tensor([0, 1, 2, -1, -1, -1, -1, -1])
             moe_method.ep_size = ep_size
             x = torch.randn(8, 2, 2)
             if ep_size == 1:
                 x = x.view(-1, 2)
             router_logits = torch.randn(8, 8)
-            if alltoall_buffer:
-                moe_method.max_model_len = 1
             layer = MagicMock()
             layer.w13_weight = torch.randn(8, 16, 1)
             layer.w2_weight = torch.randn(16, 8, 1)
