@@ -917,126 +917,24 @@ struct find_mlir_standalone_attention_op
     }
 };
 
-struct find_mlir_gqa_attention_op
+struct find_mlir_attention_op
 {
     mlir_mode dot_mode = mlir_mode::none;
 
-    auto matcher() const { return match::name("gpu::kv_cache_attention"); }
-
-    auto finalize_attention_module(module_ref m) const
-    {
-        eliminate_common_subexpression{}.apply(*m);
-        dead_code_elimination{}.apply(*m);
-    }
+    auto matcher() const { return match::name("group"); }
 
     void apply(module_pass_manager& mpm, const match::matcher_result& r) const
     {
-        auto attn = r.result;
-
-        float scale_val = attn->get_operator().to_value().get("scale", 0.0);
-        std::size_t num_heads = attn->get_operator().to_value().get("num_heads", 32);
-        std::size_t kv_num_heads = attn->get_operator().to_value().get("kv_num_heads", 32);
-        auto kv_num_heads_factor = num_heads / kv_num_heads;
-        auto qkv = attn->inputs().at(0);
-        auto pk = attn->inputs().at(1);
-        auto pv = attn->inputs().at(2);
-        auto csl = attn->inputs().at(3);
-        auto batch_size = pk->get_shape().lens()[0];
-        auto seq_len = qkv->get_shape().lens()[2];
-        auto head_size = qkv->get_shape().lens()[3];
-        auto max_seq_len = pk->get_shape().lens()[2];
-        csl = mpm.get_module().insert_instruction(
-            attn, make_op("multibroadcast", {{"out_lens", {batch_size, num_heads}}}), csl);
-
-        module m_attn;
-        std::vector<instruction_ref> inputs = {qkv, pk, pv, csl};
-        std::unordered_map<instruction_ref, instruction_ref> map_main_to_mattn;
-        m_attn.add_params(inputs, &map_main_to_mattn);
-
-        auto q = m_attn.add_instruction(
-            make_op("slice", {{"axes", {1}}, {"starts", {0}}, {"ends", {num_heads}}}),
-            map_main_to_mattn.at(qkv));
-        auto k = map_main_to_mattn.at(pk);
-        auto v = map_main_to_mattn.at(pv);
-        if(kv_num_heads_factor != 1)
+        auto group = r.result;
+        auto tag = group->get_operator().to_value().get("tag", "");
+        if(tag != "attention")
         {
-            auto kv_new_lens = k->get_shape().lens();
-            kv_new_lens.at(1) = num_heads;
-            k = m_attn.add_instruction(make_op("unsqueeze", {{"axes", {2}}}), k);
-            v = m_attn.add_instruction(make_op("unsqueeze", {{"axes", {2}}}), v);
-            auto kv_unsqueezed_lens = k->get_shape().lens();
-            kv_unsqueezed_lens.at(2) = kv_num_heads_factor;
-            k = m_attn.add_instruction(make_op("multibroadcast", {{"out_lens", kv_unsqueezed_lens}}), k);
-            v = m_attn.add_instruction(make_op("multibroadcast", {{"out_lens", kv_unsqueezed_lens}}), v);
-            k = m_attn.add_instruction(make_op("reshape", {{"dims", kv_new_lens}}), k);
-            v = m_attn.add_instruction(make_op("reshape", {{"dims", kv_new_lens}}), v);
-        }
-        auto kt = m_attn.add_instruction(
-            make_op("transpose", {{"permutation", {0, 1, 3, 2}}}), k);
-        auto gemm1 = m_attn.add_instruction(make_op("dot"), q, kt);
-
-        std::vector<int> range_vec(max_seq_len);
-        std::iota(range_vec.begin(), range_vec.end(), 0);
-        shape range_s{csl->get_shape().type(), {max_seq_len}};
-        auto range = m_attn.add_literal(range_s, range_vec);
-        std::vector<std::size_t> bnsm{batch_size, num_heads, seq_len, max_seq_len};
-        auto bc_range =
-            m_attn.add_instruction(make_op("multibroadcast", {{"out_lens", bnsm}}), range);
-
-        auto scalar_s = shape{qkv->get_shape().type(), {1}};
-        auto ninf =
-            m_attn.add_literal(literal{scalar_s, {-std::numeric_limits<float>::infinity()}});
-        ninf = m_attn.add_instruction(make_op("multibroadcast", {{"out_lens", bnsm}}), ninf);
-
-        if(float_equal(scale_val, 0.0))
-        {
-            scale_val = 1.0f / std::sqrt(static_cast<float>(head_size));
-        }
-        auto scale = m_attn.add_literal(literal{scalar_s, {scale_val}});
-        scale = m_attn.add_instruction(make_op("multibroadcast", {{"out_lens", bnsm}}), scale);
-
-        if(seq_len > 1)
-        {
-            std::vector<int> seq_range_vec(seq_len);
-            std::iota(seq_range_vec.begin(), seq_range_vec.end(), 1);
-            shape seq_range_s{csl->get_shape().type(), {seq_len}};
-            auto seq_range = m_attn.add_literal(seq_range_s, seq_range_vec);
-            seq_range =
-                m_attn.add_instruction(make_op("reshape", {{"dims", {seq_len, 1}}}), seq_range);
-            seq_range =
-                m_attn.add_instruction(make_op("multibroadcast", {{"out_lens", bnsm}}), seq_range);
-            auto causal_mask =
-                m_attn.add_instruction(make_op("greater_or_equal"), bc_range, seq_range);
-            causal_mask = m_attn.add_instruction(
-                make_op("convert", {{"target_type", shape::bool_type}}), causal_mask);
-            gemm1 = m_attn.add_instruction(make_op("where"), causal_mask, ninf, gemm1);
+            return;
         }
 
-        auto bc_csl =
-            m_attn.add_instruction(make_op("reshape", {{"dims", {batch_size, num_heads, 1, 1}}}),
-                                   map_main_to_mattn.at(csl));
-        auto mask_comp =
-            m_attn.add_instruction(make_op("multibroadcast", {{"out_lens", bnsm}}), bc_csl);
-        auto mask = m_attn.add_instruction(make_op("greater_or_equal"), bc_range, mask_comp);
-        mask =
-            m_attn.add_instruction(make_op("convert", {{"target_type", shape::bool_type}}), mask);
-        auto mul = m_attn.add_instruction(make_op("mul"), gemm1, scale);
-        auto where = m_attn.add_instruction(make_op("where"), mask, ninf, mul);
-        auto softmax = m_attn.add_instruction(make_op("softmax", {{"axis", 3}}), where);
-        auto scores = m_attn.add_instruction(make_op("dot"), softmax, v);
-        auto out =
-            m_attn.add_instruction(make_op("transpose", {{"permutation", {0, 2, 1, 3}}}), scores);
-        out = m_attn.add_instruction(make_op("reshape", {{"dims", attn->get_shape().lens()}}), out);
-        m_attn.add_return({out});
-
-        finalize_attention_module(&m_attn);
-        module_ref mpm_attn = mpm.create_module("mlir_attn", std::move(m_attn));
-        mpm_attn->set_bypass();
-
+        auto* m_attn = group->module_inputs()[0];
 
         mpm.get_module().replace_instruction(
-            attn, mlir_op{attn->get_operator()}, mlir_contiguous(mpm, inputs), {mpm_attn});
+            group, mlir_op{group->get_operator()}, mlir_contiguous(mpm, group->inputs()), {m_attn});
     }
 };
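For reference, the replacement matcher after this hunk would read roughly as follows. This is assembled from the context and added lines above rather than copied from the committed file, and it assumes the surrounding MIGraphX helpers (mlir_op, mlir_contiguous, match::name) remain in scope as before, so treat it as a sketch:

struct find_mlir_attention_op
{
    mlir_mode dot_mode = mlir_mode::none;

    auto matcher() const { return match::name("group"); }

    void apply(module_pass_manager& mpm, const match::matcher_result& r) const
    {
        auto group = r.result;
        // Only lift "group" ops that are tagged as attention regions.
        auto tag = group->get_operator().to_value().get("tag", "");
        if(tag != "attention")
        {
            return;
        }

        // The attention submodule arrives as the group's module input and is
        // handed to the MLIR op directly instead of being rebuilt here, which
        // is what makes the removed GQA-specific construction unnecessary.
        auto* m_attn = group->module_inputs()[0];

        mpm.get_module().replace_instruction(
            group, mlir_op{group->get_operator()}, mlir_contiguous(mpm, group->inputs()), {m_attn});
    }
};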
@@ -1198,7 +1096,7 @@ void fuse_mlir::apply(module_pass_manager& mpm) const
         mpm.run_pass(dead_code_elimination{});
     }
 
-    match::find_matches(mpm, find_mlir_gqa_attention_op{mlir_mode::all});
+    match::find_matches(mpm, find_mlir_attention_op{mlir_mode::all});
    mpm.run_pass(dead_code_elimination{});

    match::find_matches(