-#include <ATen/Operators.h>
-#include <torch/all.h>
-#include <torch/library.h>
+// LibTorch Stable ABI version of CUDA custom operators
+// This file uses the stable API for cross-version compatibility.
+// See: https://pytorch.org/docs/main/notes/libtorch_stable_abi.html
+
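+// Stable ABI surface: registration macros, the stable Tensor type and ops,
+// and accelerator (device/stream) utilities.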
+#include <torch/csrc/stable/library.h>
+#include <torch/csrc/stable/ops.h>
+#include <torch/csrc/stable/tensor.h>
+#include <torch/csrc/stable/accelerator.h>
+#include <torch/headeronly/core/ScalarType.h>
+#include <torch/headeronly/macros/Macros.h>
+
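+// The AOTI C shim declares aoti_torch_get_current_cuda_stream(), used below
+// in place of ATen's getCurrentCUDAStream().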
+#include <torch/csrc/inductor/aoti_torch/c/shim.h>

 #include <cuda.h>
 #include <cuda_runtime.h>
-#include <ATen/cuda/CUDAContext.h>

 namespace extension_cpp {

@@ -13,21 +21,39 @@ __global__ void muladd_kernel(int numel, const float* a, const float* b, float c
   if (idx < numel) result[idx] = a[idx] * b[idx] + c;
 }

-at::Tensor mymuladd_cuda(const at::Tensor& a, const at::Tensor& b, double c) {
-  TORCH_CHECK(a.sizes() == b.sizes());
-  TORCH_CHECK(a.dtype() == at::kFloat);
-  TORCH_CHECK(b.dtype() == at::kFloat);
-  TORCH_INTERNAL_ASSERT(a.device().type() == at::DeviceType::CUDA);
-  TORCH_INTERNAL_ASSERT(b.device().type() == at::DeviceType::CUDA);
-  at::Tensor a_contig = a.contiguous();
-  at::Tensor b_contig = b.contiguous();
-  at::Tensor result = at::empty(a_contig.sizes(), a_contig.options());
-  const float* a_ptr = a_contig.data_ptr<float>();
-  const float* b_ptr = b_contig.data_ptr<float>();
-  float* result_ptr = result.data_ptr<float>();
+torch::stable::Tensor mymuladd_cuda(
+    const torch::stable::Tensor& a,
+    const torch::stable::Tensor& b,
+    double c) {
+  STD_TORCH_CHECK(a.sizes().equals(b.sizes()), "Tensor sizes must match");
+  STD_TORCH_CHECK(
+      a.scalar_type() == torch::headeronly::ScalarType::Float,
+      "Input tensor a must be float32");
+  STD_TORCH_CHECK(
+      b.scalar_type() == torch::headeronly::ScalarType::Float,
+      "Input tensor b must be float32");
+  STD_TORCH_CHECK(
+      a.device().type() == torch::headeronly::DeviceType::CUDA,
+      "Input tensor a must be on CUDA");
+  STD_TORCH_CHECK(
+      b.device().type() == torch::headeronly::DeviceType::CUDA,
+      "Input tensor b must be on CUDA");
+
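+  // The kernel indexes raw pointers as dense arrays, so work on contiguous
+  // copies of the inputs.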
+  torch::stable::Tensor a_contig = torch::stable::contiguous(a);
+  torch::stable::Tensor b_contig = torch::stable::contiguous(b);
+  torch::stable::Tensor result = torch::stable::empty_like(a_contig);
+
+  const float* a_ptr = a_contig.const_data_ptr<float>();
+  const float* b_ptr = b_contig.const_data_ptr<float>();
+  float* result_ptr = result.mutable_data_ptr<float>();

   int numel = a_contig.numel();
-  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
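+  // Fetch the current CUDA stream for a's device via the C shim; it is
+  // returned as an opaque void*, and any failing error code is surfaced by
+  // TORCH_ERROR_CODE_CHECK.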
+  void* stream_ptr = nullptr;
+  TORCH_ERROR_CODE_CHECK(
+      aoti_torch_get_current_cuda_stream(a.get_device_index(), &stream_ptr));
+  cudaStream_t stream = static_cast<cudaStream_t>(stream_ptr);
+
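+  // One thread per element: ceil(numel / 256) blocks of 256 threads. Note
+  // that the double argument c narrows to the kernel's float parameter.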
   muladd_kernel<<<(numel+255)/256, 256, 0, stream>>>(numel, a_ptr, b_ptr, c, result_ptr);
   return result;
 }
@@ -37,20 +63,38 @@ __global__ void mul_kernel(int numel, const float* a, const float* b, float* res
   if (idx < numel) result[idx] = a[idx] * b[idx];
 }

-at::Tensor mymul_cuda(const at::Tensor& a, const at::Tensor& b) {
-  TORCH_CHECK(a.sizes() == b.sizes());
-  TORCH_CHECK(a.dtype() == at::kFloat);
-  TORCH_CHECK(b.dtype() == at::kFloat);
-  TORCH_INTERNAL_ASSERT(a.device().type() == at::DeviceType::CUDA);
-  TORCH_INTERNAL_ASSERT(b.device().type() == at::DeviceType::CUDA);
-  at::Tensor a_contig = a.contiguous();
-  at::Tensor b_contig = b.contiguous();
-  at::Tensor result = at::empty(a_contig.sizes(), a_contig.options());
-  const float* a_ptr = a_contig.data_ptr<float>();
-  const float* b_ptr = b_contig.data_ptr<float>();
-  float* result_ptr = result.data_ptr<float>();
+torch::stable::Tensor mymul_cuda(
+    const torch::stable::Tensor& a,
+    const torch::stable::Tensor& b) {
+  STD_TORCH_CHECK(a.sizes().equals(b.sizes()), "Tensor sizes must match");
+  STD_TORCH_CHECK(
+      a.scalar_type() == torch::headeronly::ScalarType::Float,
+      "Input tensor a must be float32");
+  STD_TORCH_CHECK(
+      b.scalar_type() == torch::headeronly::ScalarType::Float,
+      "Input tensor b must be float32");
+  STD_TORCH_CHECK(
+      a.device().type() == torch::headeronly::DeviceType::CUDA,
+      "Input tensor a must be on CUDA");
+  STD_TORCH_CHECK(
+      b.device().type() == torch::headeronly::DeviceType::CUDA,
+      "Input tensor b must be on CUDA");
+
+  torch::stable::Tensor a_contig = torch::stable::contiguous(a);
+  torch::stable::Tensor b_contig = torch::stable::contiguous(b);
+  torch::stable::Tensor result = torch::stable::empty_like(a_contig);
+
+  const float* a_ptr = a_contig.const_data_ptr<float>();
+  const float* b_ptr = b_contig.const_data_ptr<float>();
+  float* result_ptr = result.mutable_data_ptr<float>();
+
   int numel = a_contig.numel();
-  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  void* stream_ptr = nullptr;
+  TORCH_ERROR_CODE_CHECK(
+      aoti_torch_get_current_cuda_stream(a.get_device_index(), &stream_ptr));
+  cudaStream_t stream = static_cast<cudaStream_t>(stream_ptr);
+
   mul_kernel<<<(numel+255)/256, 256, 0, stream>>>(numel, a_ptr, b_ptr, result_ptr);
   return result;
 }
@@ -60,32 +104,55 @@ __global__ void add_kernel(int numel, const float* a, const float* b, float* res
   if (idx < numel) result[idx] = a[idx] + b[idx];
 }

-void myadd_out_cuda(const at::Tensor& a, const at::Tensor& b, at::Tensor& out) {
-  TORCH_CHECK(a.sizes() == b.sizes());
-  TORCH_CHECK(b.sizes() == out.sizes());
-  TORCH_CHECK(a.dtype() == at::kFloat);
-  TORCH_CHECK(b.dtype() == at::kFloat);
-  TORCH_CHECK(out.dtype() == at::kFloat);
-  TORCH_CHECK(out.is_contiguous());
-  TORCH_INTERNAL_ASSERT(a.device().type() == at::DeviceType::CUDA);
-  TORCH_INTERNAL_ASSERT(b.device().type() == at::DeviceType::CUDA);
-  TORCH_INTERNAL_ASSERT(out.device().type() == at::DeviceType::CUDA);
-  at::Tensor a_contig = a.contiguous();
-  at::Tensor b_contig = b.contiguous();
-  const float* a_ptr = a_contig.data_ptr<float>();
-  const float* b_ptr = b_contig.data_ptr<float>();
-  float* result_ptr = out.data_ptr<float>();
+// An example of an operator that mutates one of its inputs.
+void myadd_out_cuda(
+    const torch::stable::Tensor& a,
+    const torch::stable::Tensor& b,
+    torch::stable::Tensor& out) {
+  STD_TORCH_CHECK(a.sizes().equals(b.sizes()), "Tensor sizes must match");
+  STD_TORCH_CHECK(b.sizes().equals(out.sizes()), "Output tensor size must match inputs");
+  STD_TORCH_CHECK(
+      a.scalar_type() == torch::headeronly::ScalarType::Float,
+      "Input tensor a must be float32");
+  STD_TORCH_CHECK(
+      b.scalar_type() == torch::headeronly::ScalarType::Float,
+      "Input tensor b must be float32");
+  STD_TORCH_CHECK(
+      out.scalar_type() == torch::headeronly::ScalarType::Float,
+      "Output tensor must be float32");
+  STD_TORCH_CHECK(out.is_contiguous(), "Output tensor must be contiguous");
+  STD_TORCH_CHECK(
+      a.device().type() == torch::headeronly::DeviceType::CUDA,
+      "Input tensor a must be on CUDA");
+  STD_TORCH_CHECK(
+      b.device().type() == torch::headeronly::DeviceType::CUDA,
+      "Input tensor b must be on CUDA");
+  STD_TORCH_CHECK(
+      out.device().type() == torch::headeronly::DeviceType::CUDA,
+      "Output tensor must be on CUDA");
+
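+  // Only the inputs may be copied to contiguous buffers; out is written in
+  // place, which is why it was required to be contiguous above.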
+  torch::stable::Tensor a_contig = torch::stable::contiguous(a);
+  torch::stable::Tensor b_contig = torch::stable::contiguous(b);
+
+  const float* a_ptr = a_contig.const_data_ptr<float>();
+  const float* b_ptr = b_contig.const_data_ptr<float>();
+  float* result_ptr = out.mutable_data_ptr<float>();
+
   int numel = a_contig.numel();
-  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  void* stream_ptr = nullptr;
+  TORCH_ERROR_CODE_CHECK(
+      aoti_torch_get_current_cuda_stream(a.get_device_index(), &stream_ptr));
+  cudaStream_t stream = static_cast<cudaStream_t>(stream_ptr);
+
   add_kernel<<<(numel+255)/256, 256, 0, stream>>>(numel, a_ptr, b_ptr, result_ptr);
 }

-
 // Registers CUDA implementations for mymuladd, mymul, myadd_out
-TORCH_LIBRARY_IMPL(extension_cpp, CUDA, m) {
-  m.impl("mymuladd", &mymuladd_cuda);
-  m.impl("mymul", &mymul_cuda);
-  m.impl("myadd_out", &myadd_out_cuda);
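+// TORCH_BOX wraps each kernel so it can be invoked through the stable ABI's
+// boxed calling convention.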
+STABLE_TORCH_LIBRARY_IMPL(extension_cpp, CUDA, m) {
+  m.impl("mymuladd", TORCH_BOX(&mymuladd_cuda));
+  m.impl("mymul", TORCH_BOX(&mymul_cuda));
+  m.impl("myadd_out", TORCH_BOX(&myadd_out_cuda));
 }

 }