Changes from all commits (44 commits)
733566a
init
sfc-gh-yewang Aug 7, 2025
6601ab4
update model_runner
sfc-gh-yewang Aug 7, 2025
5d25b44
update dep version
sfc-gh-yewang Aug 7, 2025
3e6cbfd
skip vllm version checking for now
sfc-gh-yewang Aug 7, 2025
9b6bf61
fix model load
sfc-gh-yewang Aug 7, 2025
810b283
fix Fp8LinearOp
sfc-gh-yewang Aug 7, 2025
4732739
fix embedding layer
sfc-gh-yewang Aug 8, 2025
f00bac6
fix MultiprocExecutor patch
sfc-gh-yewang Aug 8, 2025
b373f7a
disable ulysses for now
sfc-gh-yewang Aug 8, 2025
785d6b0
not using _SP for now
sfc-gh-yewang Aug 8, 2025
e8a6662
fix ulysses bug
sfc-gh-yewang Aug 8, 2025
711ce76
fix swiftkv
sfc-gh-yewang Aug 8, 2025
fa097df
remove from_dict
sfc-gh-yewang Aug 12, 2025
1af5108
use supports_mm_inputs
sfc-gh-yewang Aug 12, 2025
16546a5
add env to skip ulysses patches
sfc-gh-yewang Aug 12, 2025
586a055
update
sfc-gh-yewang Aug 19, 2025
ce17269
uncomment
sfc-gh-yewang Aug 19, 2025
8fa7cf2
update
sfc-gh-yewang Aug 19, 2025
76d1323
accommodate to 0.10.1
sfc-gh-yewang Aug 19, 2025
e4a68ca
update
sfc-gh-yewang Aug 19, 2025
2116ebc
fix ulysses cuda graph capture
sfc-gh-yewang Aug 19, 2025
dd59615
use piecewise cuda graph
sfc-gh-yewang Aug 19, 2025
32c8ae0
update
sfc-gh-yewang Aug 20, 2025
08d930d
Merge branch 'main' into wangye/vllm_0.10.1
sfc-gh-yewang Aug 25, 2025
81948bf
original model -> base model
sfc-gh-mhidayetoglu Sep 4, 2025
276551b
minor changes
sfc-gh-mhidayetoglu Sep 9, 2025
e70d05d
remove skip ulysses
sfc-gh-mhidayetoglu Sep 12, 2025
b3b4bfb
rename
sfc-gh-mhidayetoglu Sep 12, 2025
fbae2ca
bump to v0.10.1
sfc-gh-mhidayetoglu Sep 12, 2025
7028648
bump up to v0.10.1
sfc-gh-mhidayetoglu Sep 12, 2025
ce5cb58
fix module name
sfc-gh-mhidayetoglu Sep 12, 2025
84b46fb
monkeypatch _capture_cudagraphs
sfc-gh-mhidayetoglu Sep 13, 2025
bcbf9f6
fix typo
sfc-gh-mhidayetoglu Sep 13, 2025
b5f0e73
make cases list
sfc-gh-mhidayetoglu Sep 13, 2025
33f9633
implement cudagraph dispatcher
sfc-gh-mhidayetoglu Sep 14, 2025
e31d830
remove comments
sfc-gh-mhidayetoglu Sep 14, 2025
642294a
remove compilation patch
sfc-gh-mhidayetoglu Sep 14, 2025
a92ef6e
cosmetic changes
sfc-gh-mhidayetoglu Sep 14, 2025
eba6d1c
Mert/no code changes (#178)
sfc-gh-mhidayetoglu Sep 14, 2025
b0d7302
Merge branch 'main' into wangye/vllm_0.10.1
sfc-gh-mhidayetoglu Sep 14, 2025
2fc63e7
Merge branch 'main' into wangye/vllm_0.10.1
sfc-gh-mhidayetoglu Sep 22, 2025
49b6c15
Merge branch 'main' into wangye/vllm_0.10.1
sfc-gh-yewang Sep 23, 2025
1152ea8
add moe router module with hierarchical gating support for deepseek-v3
sfc-gh-reyazda Sep 25, 2025
f7e53f7
hack the system to bypass some optimization on arctic-inference
sfc-gh-reyazda Sep 25, 2025
3 changes: 1 addition & 2 deletions README.md
@@ -36,8 +36,7 @@ Arctic Inference achieves high throughput and low latency through a wholistic se
<tbody>
<tr>
<td align="left">
-Arctic Ulysses (<a href="https://www.snowflake.com/en/engineering-blog/ulysses-low-latency-llm-inference/">blog</a>,
-<a href="https://arxiv.org/abs/2507.11830">paper</a>)
+Arctic Ulysses (<a href="https://www.snowflake.com/en/engineering-blog/ulysses-low-latency-llm-inference/">blog</a>)
<br>
Shift Parallelism (<a href="https://www.snowflake.com/en/engineering-blog/arctic-inference-shift-parallelism/">blog</a>)
</td>
2 changes: 2 additions & 0 deletions arctic_inference/envs.py
@@ -30,6 +30,8 @@
lambda: os.getenv("ARCTIC_INFERENCE_SKIP_VERSION_CHECK", "0") == "1",
}

+# temporary workaround for gpt-oss model
+ARCTIC_INFERENCE_SKIP_SPEC_MODEL_CHECK = 1

def __getattr__(name: str) -> Any:
if name in environment_variables:
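For context, a minimal self-contained sketch of the lazy environment-variable pattern this file uses (a registry of name-to-getter lambdas served through a PEP 562 module-level `__getattr__`). Only `ARCTIC_INFERENCE_SKIP_VERSION_CHECK` appears in the diff above; everything else here is illustrative:

```python
# Sketch of the envs.py pattern: values are re-read from the environment on
# every attribute access, so they track the live process environment.
import os
from typing import Any, Callable

environment_variables: dict[str, Callable[[], Any]] = {
    "ARCTIC_INFERENCE_SKIP_VERSION_CHECK":
        lambda: os.getenv("ARCTIC_INFERENCE_SKIP_VERSION_CHECK", "0") == "1",
}

def __getattr__(name: str) -> Any:
    # Called only for attributes not found normally (PEP 562).
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```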
1 change: 1 addition & 0 deletions arctic_inference/op_builder/__init__.py
@@ -0,0 +1 @@
from .moe_ops import MoEOpsBuilder
545 changes: 545 additions & 0 deletions arctic_inference/op_builder/builder.py

Large diffs are not rendered by default.

80 changes: 80 additions & 0 deletions arctic_inference/op_builder/moe_ops.py
@@ -0,0 +1,80 @@

import os

from .builder import CUDAOpBuilder, installed_cuda_version


class MoEOpsBuilder(CUDAOpBuilder):
BUILD_VAR = "AI_BUILD_MOE_OPS"
NAME = "moe_device_ops"

def __init__(self, name=None):
name = self.NAME if name is None else name
super().__init__(name=name)

def absolute_name(self):
return f'arctic_inference.moe_ops.{self.NAME}'

def is_compatible(self, verbose=False):
try:
import torch
except ImportError:
if verbose:
self.warning("Please install torch if trying to pre-compile arctic_inference kernels")
return False

cuda_okay = True
if torch.cuda.is_available(): #ignore-cuda
sys_cuda_major, _ = installed_cuda_version()
torch_cuda_major = int(torch.version.cuda.split('.')[0])
cuda_capability = torch.cuda.get_device_properties(0).major #ignore-cuda
if cuda_capability < 6:
if verbose:
self.warning("NVIDIA Inference is only supported on Pascal and newer architectures")
cuda_okay = False
if cuda_capability >= 8:
if torch_cuda_major < 11 or sys_cuda_major < 11:
if verbose:
self.warning("On Ampere and higher architectures please use CUDA 11+")
cuda_okay = False
return super().is_compatible(verbose) and cuda_okay

def filter_ccs(self, ccs):
ccs_retained = []
ccs_pruned = []
for cc in [cc.split('.') for cc in ccs]:
if int(cc[0]) >= 8:
# Blocked flash has a dependency on Ampere + newer
ccs_retained.append(cc)
else:
ccs_pruned.append(cc)
if len(ccs_pruned) > 0:
self.warning(f"Filtered compute capabilities {ccs_pruned}")
return ccs_retained

def get_prefix(self):
ai_path = self._src_path("arctic_inference")
return "arctic_inference" if os.path.isdir(ai_path) else ".."

def sources(self):
sources = [
"csrc/moe_ops/topk_router.cpp",
"csrc/moe_ops/topk_router.cu",
"csrc/moe_ops/moe_apis.cpp"
]

prefix = self.get_prefix()
sources = [os.path.join(prefix, src) for src in sources]
return sources

def extra_ldflags(self):
return []

def include_paths(self):
sources = [
'csrc/moe_ops/',
]

prefix = self.get_prefix()
sources = [os.path.join(prefix, src) for src in sources]
return sources
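A hypothetical usage sketch of `MoEOpsBuilder`. Since `builder.py` is not rendered in this diff, the `load()` method is assumed to follow the DeepSpeed-style OpBuilder convention (JIT-compile via torch.utils.cpp_extension and cache the resulting module); treat the exact API as an assumption:

```python
# Hypothetical: assumes a DeepSpeed-style OpBuilder.load() in builder.py.
from arctic_inference.op_builder import MoEOpsBuilder

builder = MoEOpsBuilder()
if builder.is_compatible(verbose=True):
    # First call would compile csrc/moe_ops/*; later calls reuse the cached build.
    moe_ops = builder.load()
```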
9 changes: 0 additions & 9 deletions arctic_inference/vllm/config.py
@@ -77,7 +77,6 @@ def __new__(cls, *args, **kwargs):

class SpeculativeConfigPatch(ArcticPatch[SpeculativeConfig]):

-    _orig_from_dict = SpeculativeConfig.__dict__["from_dict"].__wrapped__
    _orig_post_init = SpeculativeConfig.__post_init__

def __new__(cls, *args, **kwargs):
@@ -110,14 +109,6 @@ def __post_init__(self):
        else:
            self._orig_post_init()

-    @classmethod
-    def from_dict(cls, dict_value: dict) -> SpeculativeConfig:
-        """Parse the CLI value for the speculative config."""
-        if cls is SpeculativeConfig:
-            return SpeculativeConfigPatch._orig_from_dict(
-                ArcticSpeculativeConfig, dict_value)
-        return SpeculativeConfigPatch._orig_from_dict(cls, dict_value)


class VllmConfigPatch(ArcticPatch[VllmConfig]):

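The patch pattern that survives this change is easier to see in isolation. A schematic sketch, under the assumption that `ArcticPatch` rebinds the overriding attributes onto the target class (its mechanics are not shown in this diff):

```python
# Schematic of the patch pattern above: keep a handle to the original method,
# override it, and delegate back. Simplified; the real code routes through
# ArcticPatch[SpeculativeConfig].
from vllm.config import SpeculativeConfig

class SpeculativeConfigPatch:
    _orig_post_init = SpeculativeConfig.__post_init__

    def __post_init__(self):
        # Arctic-specific handling would go here; otherwise fall back to
        # vLLM's original __post_init__.
        self._orig_post_init()

# Applying the patch would rebind the override onto the target class, e.g.:
# SpeculativeConfig.__post_init__ = SpeculativeConfigPatch.__post_init__
```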