Changes from all commits (44 commits)
733566a
init
sfc-gh-yewang Aug 7, 2025
6601ab4
update model_runner
sfc-gh-yewang Aug 7, 2025
5d25b44
update dep version
sfc-gh-yewang Aug 7, 2025
3e6cbfd
skip vllm version checking for now
sfc-gh-yewang Aug 7, 2025
9b6bf61
fix model load
sfc-gh-yewang Aug 7, 2025
810b283
fix Fp8LinearOp
sfc-gh-yewang Aug 7, 2025
4732739
fix embedding layer
sfc-gh-yewang Aug 8, 2025
f00bac6
fix MultiprocExecutor patch
sfc-gh-yewang Aug 8, 2025
b373f7a
disable ulysses for now
sfc-gh-yewang Aug 8, 2025
785d6b0
not using _SP for now
sfc-gh-yewang Aug 8, 2025
e8a6662
fix ulysses bug
sfc-gh-yewang Aug 8, 2025
711ce76
fix swiftkv
sfc-gh-yewang Aug 8, 2025
fa097df
remove from_dict
sfc-gh-yewang Aug 12, 2025
1af5108
use supports_mm_inputs
sfc-gh-yewang Aug 12, 2025
16546a5
add env to skip ulysses patches
sfc-gh-yewang Aug 12, 2025
586a055
update
sfc-gh-yewang Aug 19, 2025
ce17269
uncomment
sfc-gh-yewang Aug 19, 2025
8fa7cf2
update
sfc-gh-yewang Aug 19, 2025
76d1323
accommodate to 0.10.1
sfc-gh-yewang Aug 19, 2025
e4a68ca
update
sfc-gh-yewang Aug 19, 2025
2116ebc
fix ulysses cuda graph capture
sfc-gh-yewang Aug 19, 2025
dd59615
use piecewise cuda graph
sfc-gh-yewang Aug 19, 2025
32c8ae0
update
sfc-gh-yewang Aug 20, 2025
08d930d
Merge branch 'main' into wangye/vllm_0.10.1
sfc-gh-yewang Aug 25, 2025
81948bf
original model -> base model
sfc-gh-mhidayetoglu Sep 4, 2025
276551b
minor changes
sfc-gh-mhidayetoglu Sep 9, 2025
e70d05d
remove skip ulysses
sfc-gh-mhidayetoglu Sep 12, 2025
b3b4bfb
rename
sfc-gh-mhidayetoglu Sep 12, 2025
fbae2ca
bump to v0.10.1
sfc-gh-mhidayetoglu Sep 12, 2025
7028648
bump up to v0.10.1
sfc-gh-mhidayetoglu Sep 12, 2025
ce5cb58
fix module name
sfc-gh-mhidayetoglu Sep 12, 2025
84b46fb
monkeypatch _capture_cudagraphs
sfc-gh-mhidayetoglu Sep 13, 2025
bcbf9f6
fix typo
sfc-gh-mhidayetoglu Sep 13, 2025
b5f0e73
make cases list
sfc-gh-mhidayetoglu Sep 13, 2025
33f9633
implement cudagraph dispatcher
sfc-gh-mhidayetoglu Sep 14, 2025
e31d830
remove comments
sfc-gh-mhidayetoglu Sep 14, 2025
642294a
remove compilation patch
sfc-gh-mhidayetoglu Sep 14, 2025
a92ef6e
cosmetic changes
sfc-gh-mhidayetoglu Sep 14, 2025
eba6d1c
Mert/no code changes (#178)
sfc-gh-mhidayetoglu Sep 14, 2025
b0d7302
Merge branch 'main' into wangye/vllm_0.10.1
sfc-gh-mhidayetoglu Sep 14, 2025
2fc63e7
Merge branch 'main' into wangye/vllm_0.10.1
sfc-gh-mhidayetoglu Sep 22, 2025
49b6c15
Merge branch 'main' into wangye/vllm_0.10.1
sfc-gh-yewang Sep 23, 2025
1152ea8
add moe router module with hierarchical gating support for deepseek-v3
sfc-gh-reyazda Sep 25, 2025
f7e53f7
hack the system to bypass some optimization on arctic-inference
sfc-gh-reyazda Sep 25, 2025
3 changes: 1 addition & 2 deletions README.md
@@ -36,8 +36,7 @@ Arctic Inference achieves high throughput and low latency through a wholistic se
<tbody>
<tr>
<td align="left">
-Arctic Ulysses (<a href="https://www.snowflake.com/en/engineering-blog/ulysses-low-latency-llm-inference/">blog</a>,
-<a href="https://arxiv.org/abs/2507.11830">paper</a>)
+Arctic Ulysses (<a href="https://www.snowflake.com/en/engineering-blog/ulysses-low-latency-llm-inference/">blog</a>)
<br>
Shift Parallelism (<a href="https://www.snowflake.com/en/engineering-blog/arctic-inference-shift-parallelism/">blog</a>)
</td>
2 changes: 2 additions & 0 deletions arctic_inference/envs.py
@@ -30,6 +30,8 @@
lambda: os.getenv("ARCTIC_INFERENCE_SKIP_VERSION_CHECK", "0") == "1",
}

+# temporary workaround for gpt-oss model
+ARCTIC_INFERENCE_SKIP_SPEC_MODEL_CHECK = 1

def __getattr__(name: str) -> Any:
if name in environment_variables:
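For context, a minimal self-contained sketch of the lazy environment-variable pattern this file uses (a registry of name-to-getter lambdas served through a PEP 562 module-level `__getattr__`). Only `ARCTIC_INFERENCE_SKIP_VERSION_CHECK` appears in the diff above; everything else here is illustrative:

```python
# Sketch of the envs.py pattern: values are re-read from the environment on
# every attribute access, so they track the live process environment.
import os
from typing import Any, Callable

environment_variables: dict[str, Callable[[], Any]] = {
    "ARCTIC_INFERENCE_SKIP_VERSION_CHECK":
        lambda: os.getenv("ARCTIC_INFERENCE_SKIP_VERSION_CHECK", "0") == "1",
}

def __getattr__(name: str) -> Any:
    # Called only for attributes not found normally (PEP 562).
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```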
1 change: 1 addition & 0 deletions arctic_inference/op_builder/__init__.py
@@ -0,0 +1 @@
from .moe_ops import MoEOpsBuilder
545 changes: 545 additions & 0 deletions arctic_inference/op_builder/builder.py

Large diffs are not rendered by default.

80 changes: 80 additions & 0 deletions arctic_inference/op_builder/moe_ops.py
@@ -0,0 +1,80 @@

import os

from .builder import CUDAOpBuilder, installed_cuda_version


class MoEOpsBuilder(CUDAOpBuilder):
BUILD_VAR = "AI_BUILD_MOE_OPS"
NAME = "moe_device_ops"

def __init__(self, name=None):
name = self.NAME if name is None else name
super().__init__(name=name)

def absolute_name(self):
return f'arctic_inference.moe_ops.{self.NAME}'

def is_compatible(self, verbose=False):
try:
import torch
except ImportError:
if verbose:
self.warning("Please install torch if trying to pre-compile arctic_inference kernels")
return False

cuda_okay = True
if torch.cuda.is_available(): #ignore-cuda
sys_cuda_major, _ = installed_cuda_version()
torch_cuda_major = int(torch.version.cuda.split('.')[0])
cuda_capability = torch.cuda.get_device_properties(0).major #ignore-cuda
if cuda_capability < 6:
if verbose:
self.warning("NVIDIA Inference is only supported on Pascal and newer architectures")
cuda_okay = False
if cuda_capability >= 8:
if torch_cuda_major < 11 or sys_cuda_major < 11:
if verbose:
self.warning("On Ampere and higher architectures please use CUDA 11+")
cuda_okay = False
return super().is_compatible(verbose) and cuda_okay

def filter_ccs(self, ccs):
ccs_retained = []
ccs_pruned = []
for cc in [cc.split('.') for cc in ccs]:
if int(cc[0]) >= 8:
# Blocked flash has a dependency on Ampere + newer
ccs_retained.append(cc)
else:
ccs_pruned.append(cc)
if len(ccs_pruned) > 0:
self.warning(f"Filtered compute capabilities {ccs_pruned}")
return ccs_retained

def get_prefix(self):
ai_path = self._src_path("arctic_inference")
return "arctic_inference" if os.path.isdir(ai_path) else ".."

def sources(self):
sources = [
"csrc/moe_ops/topk_router.cpp",
"csrc/moe_ops/topk_router.cu",
"csrc/moe_ops/moe_apis.cpp"
]

prefix = self.get_prefix()
sources = [os.path.join(prefix, src) for src in sources]
return sources

def extra_ldflags(self):
return []

def include_paths(self):
sources = [
'csrc/moe_ops/',
]

prefix = self.get_prefix()
sources = [os.path.join(prefix, src) for src in sources]
return sources
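A hypothetical usage sketch of `MoEOpsBuilder`. Since `builder.py` is not rendered in this diff, the `load()` method is assumed to follow the DeepSpeed-style OpBuilder convention (JIT-compile via torch.utils.cpp_extension and cache the resulting module); treat the exact API as an assumption:

```python
# Hypothetical: assumes a DeepSpeed-style OpBuilder.load() in builder.py.
from arctic_inference.op_builder import MoEOpsBuilder

builder = MoEOpsBuilder()
if builder.is_compatible(verbose=True):
    # First call would compile csrc/moe_ops/*; later calls reuse the cached build.
    moe_ops = builder.load()
```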
9 changes: 0 additions & 9 deletions arctic_inference/vllm/config.py
@@ -77,7 +77,6 @@ def __new__(cls, *args, **kwargs):

class SpeculativeConfigPatch(ArcticPatch[SpeculativeConfig]):

-    _orig_from_dict = SpeculativeConfig.__dict__["from_dict"].__wrapped__
    _orig_post_init = SpeculativeConfig.__post_init__

def __new__(cls, *args, **kwargs):
@@ -110,14 +109,6 @@ def __post_init__(self):
        else:
            self._orig_post_init()

-    @classmethod
-    def from_dict(cls, dict_value: dict) -> SpeculativeConfig:
-        """Parse the CLI value for the speculative config."""
-        if cls is SpeculativeConfig:
-            return SpeculativeConfigPatch._orig_from_dict(
-                ArcticSpeculativeConfig, dict_value)
-        return SpeculativeConfigPatch._orig_from_dict(cls, dict_value)


class VllmConfigPatch(ArcticPatch[VllmConfig]):

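The patch pattern that survives this change is easier to see in isolation. A schematic sketch, under the assumption that `ArcticPatch` rebinds the overriding attributes onto the target class (its mechanics are not shown in this diff):

```python
# Schematic of the patch pattern above: keep a handle to the original method,
# override it, and delegate back. Simplified; the real code routes through
# ArcticPatch[SpeculativeConfig].
from vllm.config import SpeculativeConfig

class SpeculativeConfigPatch:
    _orig_post_init = SpeculativeConfig.__post_init__

    def __post_init__(self):
        # Arctic-specific handling would go here; otherwise fall back to
        # vLLM's original __post_init__.
        self._orig_post_init()

# Applying the patch would rebind the override onto the target class, e.g.:
# SpeculativeConfig.__post_init__ = SpeculativeConfigPatch.__post_init__
```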