From 33fe9501717962d26651e198598ae8e0eaeed533 Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Mon, 6 May 2024 14:38:37 -0700 Subject: [PATCH 1/2] [SYCL] Add a test for generated device code for the group_load_store extension Only loads for now. Also adds `detail::naive` property to force usage of the unoptimized implementation, both for testing purposes and potential future experiments/debugging. --- .../oneapi/experimental/group_load_store.hpp | 29 +- .../sycl/ext/oneapi/properties/property.hpp | 3 +- sycl/test/check_device_code/group_load.cpp | 605 ++++++++++++++++++ 3 files changed, 628 insertions(+), 9 deletions(-) create mode 100644 sycl/test/check_device_code/group_load.cpp diff --git a/sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp b/sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp index e697290cac21e..d158758080c86 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp @@ -51,8 +51,15 @@ struct full_group_key inline constexpr full_group_key::value_t full_group; namespace detail { +struct naive_key : detail::compile_time_property_key { + using value_t = property_value; +}; +inline constexpr naive_key::value_t naive; using namespace sycl::detail; +} // namespace detail +#ifdef __SYCL_DEVICE_ONLY__ +namespace detail { template inline constexpr bool verify_load_types = std::is_same_v< @@ -101,7 +108,6 @@ int get_mem_idx(GroupTy g, int vec_or_array_idx) { } } // namespace detail -#ifdef __SYCL_DEVICE_ONLY__ // Load API span overload. template && detail::is_generic_group_v> group_load(Group g, InputIteratorT in_ptr, - span out, Properties properties = {}) { - constexpr bool blocked = detail::isBlocked(properties); - - group_barrier(g); - for (int i = 0; i < out.size(); ++i) - out[i] = in_ptr[detail::get_mem_idx(g, i)]; - group_barrier(g); + span out, Properties props = {}) { + constexpr bool blocked = detail::isBlocked(props); + + if constexpr (props.template has_property()) { + group_barrier(g); + for (int i = 0; i < out.size(); ++i) + out[i] = in_ptr[detail::get_mem_idx(g, i)]; + group_barrier(g); + } else { + using use_naive = + detail::merged_properties_t; + return group_load(g, in_ptr, out, use_naive{}); + } } // Store API span overload. diff --git a/sycl/include/sycl/ext/oneapi/properties/property.hpp b/sycl/include/sycl/ext/oneapi/properties/property.hpp index f1823ee0e0f99..9e10c6f6187e2 100644 --- a/sycl/include/sycl/ext/oneapi/properties/property.hpp +++ b/sycl/include/sycl/ext/oneapi/properties/property.hpp @@ -199,8 +199,9 @@ enum PropKind : uint32_t { DataPlacement = 58, ContiguousMemory = 59, FullGroup = 60, + Naive = 61, // PropKindSize must always be the last value. - PropKindSize = 61, + PropKindSize = 62, }; struct property_key_base_tag {}; diff --git a/sycl/test/check_device_code/group_load.cpp b/sycl/test/check_device_code/group_load.cpp new file mode 100644 index 0000000000000..854664507c32c --- /dev/null +++ b/sycl/test/check_device_code/group_load.cpp @@ -0,0 +1,605 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --functions "group_load" --include-generated-funcs --version 4 +// NOTE: and manually adjusted to follow the related explicit instantiation. +// RUN: %clangxx -O3 -fsycl -fsycl-device-only -S -emit-llvm -fno-sycl-instrument-device-code -o - %s | FileCheck %s + +#include + +using namespace sycl; + +namespace oneapi_exp = sycl::ext::oneapi::experimental; +using namespace sycl::ext::oneapi::experimental; + +using full_group_blocked = + decltype(properties(full_group, data_placement_blocked)); + +using naive_blocked = + decltype(properties(oneapi_exp::detail::naive, data_placement_blocked)); + +using opt_blocked = + decltype(properties(full_group, contiguous_memory, data_placement_blocked)); + +using full_group_striped = + decltype(properties(full_group, data_placement_striped)); + +using naive_striped = + decltype(properties(oneapi_exp::detail::naive, data_placement_striped)); + +using opt_striped = + decltype(properties(full_group, contiguous_memory, data_placement_striped)); + +template +using plain_global_ptr = typename sycl::detail::DecoratedType< + T, access::address_space::global_space>::type *; + +// Ensure `detail::naive` always results in no block loads/stores. +template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< + sycl::sub_group, plain_global_ptr, int, naive_blocked>( + sycl::sub_group, plain_global_ptr, int &, naive_blocked); +// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiNS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSA_INS3_6detail9naive_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESN_SL_RSM_T2_( +// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr addrspace(4) noundef align 4 dereferenceable(4) [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties") align 1 [[PROPERTIES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] comdat !srcloc [[META5:![0-9]+]] !sycl_fixed_targets [[META6:![0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2:[0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 +// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[TMP0]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I]] +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA7:![0-9]+]] +// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(4) [[OUT]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: ret void + +// Check that optimized implementation is selected. +template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< + sycl::sub_group, plain_global_ptr, int, opt_blocked>( + sycl::sub_group, plain_global_ptr, int &, opt_blocked); +// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiNS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESO_SM_RSN_T2_( +// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr addrspace(4) noundef align 4 dereferenceable(4) [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.0") align 1 [[PROPERTIES:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META5]] !sycl_fixed_targets [[META6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 +// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[TMP0]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I_I]] +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(4) [[OUT]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: ret void + +// Check that contiguous_memory can be auto-detected. +template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< + sycl::sub_group, plain_global_ptr, int, full_group_blocked>( + sycl::sub_group, plain_global_ptr, int &, full_group_blocked); +// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiNS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSA_INS3_14full_group_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESM_SK_RSL_T2_( +// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr addrspace(4) noundef align 4 dereferenceable(4) [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.2") align 1 [[PROPERTIES:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META5]] !sycl_fixed_targets [[META6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 +// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[TMP0]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I_I]] +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(4) [[OUT]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: ret void + +// SYCL 2020's accessor can't be statically known to be contiguous. +using accessor_iter_t = accessor::iterator; +// Can't be optimized. +template SYCL_EXTERNAL void +sycl::ext::oneapi::experimental::group_load( + sycl::sub_group, accessor_iter_t, int &, full_group_blocked); +// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupENS0_6detail17accessor_iteratorIKiLi1EEEiNS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSC_INS3_14full_group_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESO_SM_RSN_T2_( +// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr noundef byval(%"class.sycl::_V1::detail::accessor_iterator") align 8 [[IN_PTR:%.*]], ptr addrspace(4) noundef align 4 dereferenceable(4) [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.2") align 1 [[PROPERTIES:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META5]] !sycl_fixed_targets [[META6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[AGG_TMP1_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr [[IN_PTR]], align 8, !tbaa [[TBAA11:![0-9]+]] +// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_IN_PTR_ASCAST_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[IN_PTR]], i64 8 +// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr [[AGG_TMP1_SROA_2_0_IN_PTR_ASCAST_SROA_IDX]], align 8, !tbaa [[TBAA13:![0-9]+]] +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 +// CHECK-NEXT: [[CONV3_I_I:%.*]] = sext i32 [[TMP0]] to i64 +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr addrspace(4) [[AGG_TMP1_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP1_SROA_2_0_COPYLOAD]] +// CHECK-NEXT: [[ADD_PTR_I_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[TMP1]], i64 [[CONV3_I_I]] +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[ADD_PTR_I_I_I_I]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: store i32 [[TMP2]], ptr addrspace(4) [[OUT]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: ret void + +// Explicit property - optimize. +template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< + sycl::sub_group, accessor_iter_t, int, opt_blocked>(sycl::sub_group, + accessor_iter_t, int &, + opt_blocked); +// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupENS0_6detail17accessor_iteratorIKiLi1EEEiNS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSC_INS3_21contiguous_memory_keyEJEEENSC_INS3_14full_group_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESQ_SO_RSP_T2_( +// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr noundef byval(%"class.sycl::_V1::detail::accessor_iterator") align 8 [[IN_PTR:%.*]], ptr addrspace(4) noundef align 4 dereferenceable(4) [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.0") align 1 [[PROPERTIES:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META5]] !sycl_fixed_targets [[META6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[AGG_TMP1_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr [[IN_PTR]], align 8, !tbaa [[TBAA11]] +// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_IN_PTR_ASCAST_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[IN_PTR]], i64 8 +// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr [[AGG_TMP1_SROA_2_0_IN_PTR_ASCAST_SROA_IDX]], align 8, !tbaa [[TBAA13]] +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 +// CHECK-NEXT: [[CONV3_I_I:%.*]] = sext i32 [[TMP0]] to i64 +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr addrspace(4) [[AGG_TMP1_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP1_SROA_2_0_COPYLOAD]] +// CHECK-NEXT: [[ADD_PTR_I_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[TMP1]], i64 [[CONV3_I_I]] +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[ADD_PTR_I_I_I_I]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: store i32 [[TMP2]], ptr addrspace(4) [[OUT]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: ret void + +// Run-time alignment check is needed if type's alignment is less than BlockRead +// requirements. +template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< + sycl::sub_group, plain_global_ptr, char, opt_blocked>( + sycl::sub_group, plain_global_ptr, char &, opt_blocked); +// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ccNS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESO_SM_RSN_T2_( +// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr addrspace(4) noundef align 1 dereferenceable(1) [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.0") align 1 [[PROPERTIES:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META5]] !sycl_fixed_targets [[META6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 +// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[TMP0]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I_I]] +// CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(1) [[ARRAYIDX_I_I]], align 1, !tbaa [[TBAA15:![0-9]+]] +// CHECK-NEXT: store i8 [[TMP1]], ptr addrspace(4) [[OUT]], align 1, !tbaa [[TBAA15]] +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: ret void + +// Four shorts in blocked data layout could be loaded as a single 64-bit +// integer. +template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< + sycl::sub_group, plain_global_ptr, short, 4, opt_blocked>( + sycl::sub_group, plain_global_ptr, span, opt_blocked); +// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ssLm4ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESO_SM_NS0_4spanISN_XT2_EEET3_( +// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.5") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.0") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META16:![0-9]+]] !sycl_fixed_targets [[META6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA11]] +// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA7]], !noalias [[META17:![0-9]+]] +// CHECK-NEXT: [[MUL_I_I:%.*]] = shl i32 [[TMP2]], 2 +// CHECK-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK: for.cond.i: +// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[I_0_I]], 4 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM4ENS3_10PROPERTIESIST5TUPLEIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSA_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSA_INS3_14FULL_GROUP_KEYEJEEENSA_INS3_6DETAIL9NAIVE_KEYEJEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT:%.*]] +// CHECK: for.body.i: +// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 +// CHECK-NEXT: [[ADD_I_I:%.*]] = or disjoint i32 [[MUL_I_I]], [[I_0_I]] +// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I]] +// CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr addrspace(1) [[ARRAYIDX_I]], align 2, !tbaa [[TBAA20:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-NEXT: store i16 [[TMP3]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA20]] +// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP22:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ssLm4ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEENSA_INS3_6detail9naive_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: ret void + +// Check for non-power-of-two size. +template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< + sycl::sub_group, plain_global_ptr, int, 3, opt_blocked>( + sycl::sub_group, plain_global_ptr, span, opt_blocked); +// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm3ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESO_SM_NS0_4spanISN_XT2_EEET3_( +// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.6") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.0") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META16]] !sycl_fixed_targets [[META6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA11]] +// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA7]], !noalias [[META24:![0-9]+]] +// CHECK-NEXT: [[MUL_I_I:%.*]] = mul i32 [[TMP2]], 3 +// CHECK-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK: for.cond.i: +// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[I_0_I]], 3 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM3ENS3_10PROPERTIESIST5TUPLEIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSA_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSA_INS3_14FULL_GROUP_KEYEJEEENSA_INS3_6DETAIL9NAIVE_KEYEJEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT:%.*]] +// CHECK: for.body.i: +// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 +// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[MUL_I_I]], [[I_0_I]] +// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP27:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm3ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEENSA_INS3_6detail9naive_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: ret void + +// Four int elements in blocked data layout don't map directly to any BlockRead +// API. +template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< + sycl::sub_group, plain_global_ptr, int, 4, opt_blocked>( + sycl::sub_group, plain_global_ptr, span, opt_blocked); +// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm4ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESO_SM_NS0_4spanISN_XT2_EEET3_( +// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.7") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.0") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META16]] !sycl_fixed_targets [[META6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA11]] +// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA7]], !noalias [[META28:![0-9]+]] +// CHECK-NEXT: [[MUL_I_I:%.*]] = shl i32 [[TMP2]], 2 +// CHECK-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK: for.cond.i: +// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[I_0_I]], 4 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM4ENS3_10PROPERTIESIST5TUPLEIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSA_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSA_INS3_14FULL_GROUP_KEYEJEEENSA_INS3_6DETAIL9NAIVE_KEYEJEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT:%.*]] +// CHECK: for.body.i: +// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 +// CHECK-NEXT: [[ADD_I_I:%.*]] = or disjoint i32 [[MUL_I_I]], [[I_0_I]] +// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP31:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm4ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEENSA_INS3_6detail9naive_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: ret void + +// Similar to four elements case but more complex to optimize. +template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< + sycl::sub_group, plain_global_ptr, int, 7, opt_blocked>( + sycl::sub_group, plain_global_ptr, span, opt_blocked); +// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm7ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESO_SM_NS0_4spanISN_XT2_EEET3_( +// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.8") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.0") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META16]] !sycl_fixed_targets [[META6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA11]] +// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA7]], !noalias [[META32:![0-9]+]] +// CHECK-NEXT: [[MUL_I_I:%.*]] = mul i32 [[TMP2]], 7 +// CHECK-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK: for.cond.i: +// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[I_0_I]], 7 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM7ENS3_10PROPERTIESIST5TUPLEIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSA_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSA_INS3_14FULL_GROUP_KEYEJEEENSA_INS3_6DETAIL9NAIVE_KEYEJEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT:%.*]] +// CHECK: for.body.i: +// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 +// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[MUL_I_I]], [[I_0_I]] +// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP35:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm7ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEENSA_INS3_6detail9naive_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: ret void + +// Striped data layout with one element per work item isn't different from +// blocked data layout, so use span version only in the checks below. + +// Ensure `detail::naive` always results in no block loads/stores. +template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< + sycl::sub_group, plain_global_ptr, int, 2, naive_striped>( + sycl::sub_group, plain_global_ptr, span, naive_striped); +// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm2ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSA_INS3_6detail9naive_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESN_SL_NS0_4spanISM_XT2_EEET3_( +// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.9") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.10") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META16]] !sycl_fixed_targets [[META6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA7]], !noalias [[META36:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA7]], !noalias [[META39:![0-9]+]] +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(4), ptr [[OUT]], align 8, !tbaa [[TBAA42:![0-9]+]] +// CHECK-NEXT: br label [[FOR_COND:%.*]] +// CHECK: for.cond: +// CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +// CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[I_0]], 2 +// CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] +// CHECK: for.cond.cleanup: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: ret void +// CHECK: for.body: +// CHECK-NEXT: [[CONV:%.*]] = zext nneg i32 [[I_0]] to i64 +// CHECK-NEXT: [[MUL_I:%.*]] = mul nuw nsw i32 [[TMP1]], [[I_0]] +// CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[TMP0]], [[MUL_I]] +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[ADD_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP2]], i64 [[CONV]] +// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_0]], 1 +// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP44:![0-9]+]] + +// Check that optimized implementation is selected. +template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< + sycl::sub_group, plain_global_ptr, int, 2, opt_striped>( + sycl::sub_group, plain_global_ptr, span, opt_striped); +// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm2ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESO_SM_NS0_4spanISN_XT2_EEET3_( +// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.9") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.11") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META16]] !sycl_fixed_targets [[META6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA11]] +// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA7]], !noalias [[META45:![0-9]+]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA7]], !noalias [[META48:![0-9]+]] +// CHECK-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK: for.cond.i: +// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[I_0_I]], 2 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM2ENS3_10PROPERTIESIST5TUPLEIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSA_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSA_INS3_14FULL_GROUP_KEYEJEEENSA_INS3_6DETAIL9NAIVE_KEYEJEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT:%.*]] +// CHECK: for.body.i: +// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 +// CHECK-NEXT: [[MUL_I_I:%.*]] = mul nuw nsw i32 [[TMP3]], [[I_0_I]] +// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I]] +// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP51:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm2ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEENSA_INS3_6detail9naive_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: ret void + +// Check that contiguous_memory can be auto-detected. +template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< + sycl::sub_group, plain_global_ptr, int, 2, full_group_striped>( + sycl::sub_group, plain_global_ptr, span, full_group_striped); +// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm2ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSA_INS3_14full_group_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESM_SK_NS0_4spanISL_XT2_EEET3_( +// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.9") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.13") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META16]] !sycl_fixed_targets [[META6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA11]] +// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA7]], !noalias [[META52:![0-9]+]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA7]], !noalias [[META55:![0-9]+]] +// CHECK-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK: for.cond.i: +// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[I_0_I]], 2 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM2ENS3_10PROPERTIESIST5TUPLEIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSA_INS3_14FULL_GROUP_KEYEJEEENSA_INS3_6DETAIL9NAIVE_KEYEJEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEET3__EXIT:%.*]] +// CHECK: for.body.i: +// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 +// CHECK-NEXT: [[MUL_I_I:%.*]] = mul nuw nsw i32 [[TMP3]], [[I_0_I]] +// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I]] +// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP58:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm2ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSA_INS3_14full_group_keyEJEEENSA_INS3_6detail9naive_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEET3_.exit: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: ret void + +// SYCL 2020's accessor can't be statically known to be contiguous. +using accessor_iter_t = accessor::iterator; +// Can't be optimized. +template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< + sycl::sub_group, accessor_iter_t, int, 2, full_group_striped>( + sycl::sub_group, accessor_iter_t, span, full_group_striped); +// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupENS0_6detail17accessor_iteratorIKiLi1EEEiLm2ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSC_INS3_14full_group_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESO_SM_NS0_4spanISN_XT2_EEET3_( +// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr noundef byval(%"class.sycl::_V1::detail::accessor_iterator") align 8 [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.9") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.13") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META16]] !sycl_fixed_targets [[META6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[AGG_TMP1_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr [[IN_PTR]], align 8, !tbaa [[TBAA11]] +// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_IN_PTR_ASCAST_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[IN_PTR]], i64 8 +// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr [[AGG_TMP1_SROA_2_0_IN_PTR_ASCAST_SROA_IDX]], align 8, !tbaa [[TBAA13]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA11]] +// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA7]], !noalias [[META59:![0-9]+]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA7]], !noalias [[META62:![0-9]+]] +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr addrspace(4) [[AGG_TMP1_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP1_SROA_2_0_COPYLOAD]] +// CHECK-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK: for.cond.i: +// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[I_0_I]], 2 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPENS0_6DETAIL17ACCESSOR_ITERATORIKILI1EEEILM2ENS3_10PROPERTIESIST5TUPLEIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSC_INS3_14FULL_GROUP_KEYEJEEENSC_INS3_6DETAIL9NAIVE_KEYEJEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT:%.*]] +// CHECK: for.body.i: +// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 +// CHECK-NEXT: [[MUL_I_I:%.*]] = mul nuw nsw i32 [[TMP3]], [[I_0_I]] +// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I]] +// CHECK-NEXT: [[CONV3_I:%.*]] = sext i32 [[ADD_I_I]] to i64 +// CHECK-NEXT: [[ADD_PTR_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[TMP4]], i64 [[CONV3_I]] +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[ADD_PTR_I_I_I]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP65:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupENS0_6detail17accessor_iteratorIKiLi1EEEiLm2ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSC_INS3_14full_group_keyEJEEENSC_INS3_6detail9naive_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: ret void + +// Explicit property - optimize. +template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< + sycl::sub_group, accessor_iter_t, int, 2, opt_striped>(sycl::sub_group, + accessor_iter_t, + span, + opt_striped); +// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupENS0_6detail17accessor_iteratorIKiLi1EEEiLm2ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSC_INS3_21contiguous_memory_keyEJEEENSC_INS3_14full_group_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESQ_SO_NS0_4spanISP_XT2_EEET3_( +// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr noundef byval(%"class.sycl::_V1::detail::accessor_iterator") align 8 [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.9") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.11") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META16]] !sycl_fixed_targets [[META6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[AGG_TMP1_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr [[IN_PTR]], align 8, !tbaa [[TBAA11]] +// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_IN_PTR_ASCAST_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[IN_PTR]], i64 8 +// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr [[AGG_TMP1_SROA_2_0_IN_PTR_ASCAST_SROA_IDX]], align 8, !tbaa [[TBAA13]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA11]] +// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA7]], !noalias [[META66:![0-9]+]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA7]], !noalias [[META69:![0-9]+]] +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr addrspace(4) [[AGG_TMP1_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP1_SROA_2_0_COPYLOAD]] +// CHECK-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK: for.cond.i: +// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[I_0_I]], 2 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPENS0_6DETAIL17ACCESSOR_ITERATORIKILI1EEEILM2ENS3_10PROPERTIESIST5TUPLEIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSC_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSC_INS3_14FULL_GROUP_KEYEJEEENSC_INS3_6DETAIL9NAIVE_KEYEJEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEEST_SR_NS0_4SPANISS_XT2_EEET3__EXIT:%.*]] +// CHECK: for.body.i: +// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 +// CHECK-NEXT: [[MUL_I_I:%.*]] = mul nuw nsw i32 [[TMP3]], [[I_0_I]] +// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I]] +// CHECK-NEXT: [[CONV3_I:%.*]] = sext i32 [[ADD_I_I]] to i64 +// CHECK-NEXT: [[ADD_PTR_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[TMP4]], i64 [[CONV3_I]] +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[ADD_PTR_I_I_I]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP72:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupENS0_6detail17accessor_iteratorIKiLi1EEEiLm2ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSC_INS3_21contiguous_memory_keyEJEEENSC_INS3_14full_group_keyEJEEENSC_INS3_6detail9naive_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeEST_SR_NS0_4spanISS_XT2_EEET3_.exit: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: ret void + +// Run-time alignment check is needed if type's alignment is less than BlockRead +// requirements. +template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< + sycl::sub_group, plain_global_ptr, char, 2, opt_striped>( + sycl::sub_group, plain_global_ptr, span, opt_striped); +// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ccLm2ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESO_SM_NS0_4spanISN_XT2_EEET3_( +// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.15") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.11") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META16]] !sycl_fixed_targets [[META6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA11]] +// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA7]], !noalias [[META73:![0-9]+]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA7]], !noalias [[META76:![0-9]+]] +// CHECK-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK: for.cond.i: +// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[I_0_I]], 2 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1CCLM2ENS3_10PROPERTIESIST5TUPLEIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSA_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSA_INS3_14FULL_GROUP_KEYEJEEENSA_INS3_6DETAIL9NAIVE_KEYEJEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT:%.*]] +// CHECK: for.body.i: +// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 +// CHECK-NEXT: [[MUL_I_I:%.*]] = mul nuw nsw i32 [[TMP3]], [[I_0_I]] +// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I]] +// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I]] +// CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr addrspace(1) [[ARRAYIDX_I]], align 1, !tbaa [[TBAA15]] +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-NEXT: store i8 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 1, !tbaa [[TBAA15]] +// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP79:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ccLm2ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEENSA_INS3_6detail9naive_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: ret void + +// Just because there is a blocked data layout testcase, nothing inherently +// useful here. +template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< + sycl::sub_group, plain_global_ptr, short, 4, opt_striped>( + sycl::sub_group, plain_global_ptr, span, opt_striped); +// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ssLm4ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESO_SM_NS0_4spanISN_XT2_EEET3_( +// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.5") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.11") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META16]] !sycl_fixed_targets [[META6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA11]] +// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA7]], !noalias [[META80:![0-9]+]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA7]], !noalias [[META83:![0-9]+]] +// CHECK-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK: for.cond.i: +// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[I_0_I]], 4 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM4ENS3_10PROPERTIESIST5TUPLEIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSA_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSA_INS3_14FULL_GROUP_KEYEJEEENSA_INS3_6DETAIL9NAIVE_KEYEJEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT:%.*]] +// CHECK: for.body.i: +// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 +// CHECK-NEXT: [[MUL_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I]] +// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I]] +// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I]] +// CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr addrspace(1) [[ARRAYIDX_I]], align 2, !tbaa [[TBAA20]] +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-NEXT: store i16 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA20]] +// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP86:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ssLm4ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEENSA_INS3_6detail9naive_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: ret void + +// Check for non-power-of-two size. +template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< + sycl::sub_group, plain_global_ptr, int, 3, opt_striped>( + sycl::sub_group, plain_global_ptr, span, opt_striped); +// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm3ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESO_SM_NS0_4spanISN_XT2_EEET3_( +// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.6") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.11") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META16]] !sycl_fixed_targets [[META6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA11]] +// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA7]], !noalias [[META87:![0-9]+]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA7]], !noalias [[META90:![0-9]+]] +// CHECK-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK: for.cond.i: +// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[I_0_I]], 3 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM3ENS3_10PROPERTIESIST5TUPLEIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSA_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSA_INS3_14FULL_GROUP_KEYEJEEENSA_INS3_6DETAIL9NAIVE_KEYEJEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT:%.*]] +// CHECK: for.body.i: +// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 +// CHECK-NEXT: [[MUL_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I]] +// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I]] +// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP93:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm3ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEENSA_INS3_6detail9naive_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: ret void + +// Even though power of two, still too many to map directly onto BloadRead API. +template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< + sycl::sub_group, plain_global_ptr, int, 16, opt_striped>( + sycl::sub_group, plain_global_ptr, span, opt_striped); +// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm16ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESO_SM_NS0_4spanISN_XT2_EEET3_( +// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.16") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.11") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META16]] !sycl_fixed_targets [[META6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA11]] +// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA7]], !noalias [[META94:![0-9]+]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA7]], !noalias [[META97:![0-9]+]] +// CHECK-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK: for.cond.i: +// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[I_0_I]], 16 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM16ENS3_10PROPERTIESIST5TUPLEIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSA_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSA_INS3_14FULL_GROUP_KEYEJEEENSA_INS3_6DETAIL9NAIVE_KEYEJEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT:%.*]] +// CHECK: for.body.i: +// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 +// CHECK-NEXT: [[MUL_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I]] +// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I]] +// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP100:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm16ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEENSA_INS3_6detail9naive_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: ret void + +// Non-power of two case bigger than max natively supported power of two case. +template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< + sycl::sub_group, plain_global_ptr, int, 11, opt_striped>( + sycl::sub_group, plain_global_ptr, span, opt_striped); +// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm11ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESO_SM_NS0_4spanISN_XT2_EEET3_( +// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.17") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.11") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META16]] !sycl_fixed_targets [[META6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA11]] +// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA7]], !noalias [[META101:![0-9]+]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA7]], !noalias [[META104:![0-9]+]] +// CHECK-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK: for.cond.i: +// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[I_0_I]], 11 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM11ENS3_10PROPERTIESIST5TUPLEIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSA_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSA_INS3_14FULL_GROUP_KEYEJEEENSA_INS3_6DETAIL9NAIVE_KEYEJEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT:%.*]] +// CHECK: for.body.i: +// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 +// CHECK-NEXT: [[MUL_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I]] +// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I]] +// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP107:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm11ENS3_10propertiesISt5tupleIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEENSA_INS3_6detail9naive_keyEJEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR2]] +// CHECK-NEXT: ret void From 4a19dbc1e1b4210feedb97a3f8b7686bb7ac56d1 Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Tue, 7 May 2024 10:19:30 -0700 Subject: [PATCH 2/2] Limit to linux only --- sycl/test/check_device_code/group_load.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sycl/test/check_device_code/group_load.cpp b/sycl/test/check_device_code/group_load.cpp index 854664507c32c..75b8130568e72 100644 --- a/sycl/test/check_device_code/group_load.cpp +++ b/sycl/test/check_device_code/group_load.cpp @@ -2,6 +2,12 @@ // NOTE: and manually adjusted to follow the related explicit instantiation. // RUN: %clangxx -O3 -fsycl -fsycl-device-only -S -emit-llvm -fno-sycl-instrument-device-code -o - %s | FileCheck %s +// Windows/linux have some slight differences in IR generation (function +// arguments passing and long/long long differences/mangling) that could +// complicate test updates while not improving test coverage. Limiting to linux +// should be fine. +// REQUIRES: linux + #include using namespace sycl;