diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index 5a62538792f30..906643dda649d 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -339,6 +339,7 @@ set(llvm_offload_wrapper_files llvm_offload_wrappers/__llvm_offload.h llvm_offload_wrappers/__llvm_offload_host.h llvm_offload_wrappers/__llvm_offload_device.h + llvm_offload_wrappers/cuda_runtime.h ) set(llvm_libc_wrapper_files diff --git a/clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h b/clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h new file mode 100644 index 0000000000000..2d698e1c14e49 --- /dev/null +++ b/clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h @@ -0,0 +1,137 @@ +/*===- __cuda_runtime.h - LLVM/Offload wrappers for CUDA runtime API -------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __CUDA_RUNTIME_API__ +#define __CUDA_RUNTIME_API__ + +#include <cstddef> +#include <cstdint> +#include <utility> + +extern "C" { +int omp_get_initial_device(void); +void omp_target_free(void *Ptr, int Device); +void *omp_target_alloc(size_t Size, int Device); +int omp_target_memcpy(void *Dst, const void *Src, size_t Length, + size_t DstOffset, size_t SrcOffset, int DstDevice, + int SrcDevice); +void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum); +int __tgt_target_synchronize_async_info_queue(void *Loc, int64_t DeviceNum, + void *AsyncInfoQueue); +} + +// TODO: There are many fields missing in this enumeration. 
+typedef enum cudaError { + cudaSuccess = 0, + cudaErrorInvalidValue = 1, + cudaErrorMemoryAllocation = 2, + cudaErrorNoDevice = 100, + cudaErrorInvalidDevice = 101, + cudaErrorOTHER = -1, +} cudaError_t; + +enum cudaMemcpyKind { + cudaMemcpyHostToHost = 0, + cudaMemcpyHostToDevice = 1, + cudaMemcpyDeviceToHost = 2, + cudaMemcpyDeviceToDevice = 3, + cudaMemcpyDefault = 4 +}; + +typedef void *cudaStream_t; + +static thread_local cudaError_t __cudaomp_last_error = cudaSuccess; + +// Returns the last error that has been produced and resets it to cudaSuccess. +inline cudaError_t cudaGetLastError() { + cudaError_t TempError = __cudaomp_last_error; + __cudaomp_last_error = cudaSuccess; + return TempError; +} + +// Returns the last error that has been produced without resetting it. +inline cudaError_t cudaPeekAtLastError() { return __cudaomp_last_error; } + +inline cudaError_t cudaDeviceSynchronize() { + int DeviceNum = 0; + return __cudaomp_last_error = + (cudaError_t)__tgt_target_synchronize_async_info_queue( + /*Loc=*/nullptr, DeviceNum, /*AsyncInfoQueue=*/nullptr); +} + +inline cudaError_t __cudaMalloc(void **devPtr, size_t size) { + int DeviceNum = 0; + *devPtr = omp_target_alloc(size, DeviceNum); + if (*devPtr == NULL) + return __cudaomp_last_error = cudaErrorMemoryAllocation; + + return __cudaomp_last_error = cudaSuccess; +} + +template <class T> cudaError_t cudaMalloc(T **devPtr, size_t size) { + return __cudaMalloc((void **)devPtr, size); +} + +inline cudaError_t __cudaFree(void *devPtr) { + int DeviceNum = 0; + omp_target_free(devPtr, DeviceNum); + return __cudaomp_last_error = cudaSuccess; +} + +template <class T> inline cudaError_t cudaFree(T *ptr) { + return __cudaFree((void *)ptr); +} + +inline cudaError_t __cudaMemcpy(void *dst, const void *src, size_t count, + cudaMemcpyKind kind) { + // get the host device number (which is the initial device) + int HostDeviceNum = omp_get_initial_device(); + + // use the default device for gpu + int GPUDeviceNum = 0; + + // default to copy 
from host to device + int DstDeviceNum = GPUDeviceNum; + int SrcDeviceNum = HostDeviceNum; + + if (kind == cudaMemcpyDeviceToHost) + std::swap(DstDeviceNum, SrcDeviceNum); + + // omp_target_memcpy returns 0 on success and non-zero on failure + if (omp_target_memcpy(dst, src, count, 0, 0, DstDeviceNum, SrcDeviceNum)) + return __cudaomp_last_error = cudaErrorInvalidValue; + return __cudaomp_last_error = cudaSuccess; +} + +template <class T> +inline cudaError_t cudaMemcpy(T *dst, const T *src, size_t count, + cudaMemcpyKind kind) { + return __cudaMemcpy((void *)dst, (const void *)src, count, kind); +} + +inline cudaError_t __cudaMemset(void *devPtr, int value, size_t count, + cudaStream_t stream = 0) { + int DeviceNum = 0; + if (!omp_target_memset(devPtr, value, count, DeviceNum)) + return __cudaomp_last_error = cudaErrorInvalidValue; + return __cudaomp_last_error = cudaSuccess; +} + +template <class T> +inline cudaError_t cudaMemset(T *devPtr, int value, size_t count) { + return __cudaMemset((void *)devPtr, value, count); +} + +inline cudaError_t cudaDeviceReset(void) { + cudaDeviceSynchronize(); + // TODO: not implemented. + return __cudaomp_last_error = cudaSuccess; +} + +#endif diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h index 338b56226f204..a7be3f51fac7d 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h @@ -72,7 +72,7 @@ enum class IdentFlag { #include "llvm/Frontend/OpenMP/OMPKinds.def" // Version of the kernel argument format used by the omp runtime. -#define OMP_KERNEL_ARG_VERSION 3 +#define OMP_KERNEL_ARG_VERSION 4 // Minimum version of the compiler that generates a kernel dynamic pointer. 
#define OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR 3 diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def index d8f3c8fa06b74..0424d10175de1 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -90,7 +90,7 @@ __OMP_ARRAY_TYPE(Int32Arr3, Int32, 3) __OMP_STRUCT_TYPE(Ident, ident_t, false, Int32, Int32, Int32, Int32, Int8Ptr) __OMP_STRUCT_TYPE(KernelArgs, __tgt_kernel_arguments, false, Int32, Int32, VoidPtrPtr, VoidPtrPtr, Int64Ptr, Int64Ptr, VoidPtrPtr, VoidPtrPtr, - Int64, Int64, Int32Arr3Ty, Int32Arr3Ty, Int32) + Int64, Int64, Int32Arr3Ty, Int32Arr3Ty, Int32, VoidPtr) __OMP_STRUCT_TYPE(AsyncInfo, __tgt_async_info, false, Int8Ptr) __OMP_STRUCT_TYPE(DependInfo, kmp_dep_info, false, SizeTy, SizeTy, Int8) __OMP_STRUCT_TYPE(Task, kmp_task_ompbuilder_t, false, VoidPtr, VoidPtr, Int32, VoidPtr, VoidPtr) diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 532313a31fc13..695f6e51d50a5 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -501,6 +501,7 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs, constexpr const size_t MaxDim = 3; Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim)); Value *Flags = Builder.getInt64(KernelArgs.HasNoWait); + Value *AsyncInfoQueue = Constant::getNullValue(Builder.getPtrTy()); assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty()); @@ -529,7 +530,8 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs, Flags, NumTeams3D, NumThreads3D, - KernelArgs.DynCGGroupMem}; + KernelArgs.DynCGGroupMem, + AsyncInfoQueue}; } void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) { diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h index 4c1f7712249a3..f96b2f9ca259d 100644 --- 
a/offload/include/Shared/APITypes.h +++ b/offload/include/Shared/APITypes.h @@ -85,6 +85,9 @@ struct __tgt_async_info { /// ensure it is a valid location while the transfer to the device is /// happening. KernelLaunchEnvironmentTy KernelLaunchEnvironment; + + /// Flag to indicate the Queue should be persistent. + bool PersistentQueue = false; }; /// This struct contains all of the arguments to a target kernel region launch. @@ -110,12 +113,16 @@ struct KernelArgsTy { // The number of threads (for x,y,z dimension). uint32_t ThreadLimit[3] = {0, 0, 0}; uint32_t DynCGroupMem = 0; // Amount of dynamic cgroup memory requested. + // A __tgt_async_info queue pointer to be used for the kernel and all + // associated device interactions. The operations are implicitly made + // non-blocking. + void *AsyncInfoQueue = nullptr; }; static_assert(sizeof(KernelArgsTy().Flags) == sizeof(uint64_t), "Invalid struct size"); static_assert(sizeof(KernelArgsTy) == (8 * sizeof(int32_t) + 3 * sizeof(int64_t) + - 4 * sizeof(void **) + 2 * sizeof(int64_t *)), + 5 * sizeof(void **) + 2 * sizeof(int64_t *)), "Invalid struct size"); /// Flat array of kernel launch parameters and their total size. diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h index 2b6445e9fbe55..8730879905984 100644 --- a/offload/include/omptarget.h +++ b/offload/include/omptarget.h @@ -136,8 +136,19 @@ class AsyncInfoTy { /// Synchronization method to be used. SyncTy SyncType; - AsyncInfoTy(DeviceTy &Device, SyncTy SyncType = SyncTy::BLOCKING) + AsyncInfoTy(DeviceTy &Device, SyncTy SyncType = SyncTy::BLOCKING) : Device(Device), SyncType(SyncType) {} + AsyncInfoTy(DeviceTy &Device, void *AsyncInfoQueue) + : Device(Device), SyncType(AsyncInfoQueue ? 
SyncTy::NON_BLOCKING : SyncTy::BLOCKING) { + AsyncInfo.Queue = AsyncInfoQueue; + AsyncInfo.PersistentQueue = !!AsyncInfoQueue; + } + AsyncInfoTy(DeviceTy &Device, void *AsyncInfoQueue, SyncTy SyncType) + : Device(Device), SyncType(SyncType) { + AsyncInfo.Queue = AsyncInfoQueue; + AsyncInfo.PersistentQueue = !!AsyncInfoQueue; + } + ~AsyncInfoTy() { synchronize(); } /// Implicit conversion to the __tgt_async_info which is used in the @@ -207,8 +218,9 @@ class TaskAsyncInfoWrapperTy { void **TaskAsyncInfoPtr = nullptr; public: - TaskAsyncInfoWrapperTy(DeviceTy &Device) + TaskAsyncInfoWrapperTy(DeviceTy &Device, void *AsyncInfoQueue= nullptr) : ExecThreadID(__kmpc_global_thread_num(NULL)), LocalAsyncInfo(Device) { + assert(!AsyncInfoQueue && "Async tasks do not support predefined async queue pointers!"); // If we failed to acquired the current global thread id, we cannot // re-enqueue the current task. Thus we should use the local blocking async // info. @@ -425,6 +437,8 @@ int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize, void *VAddr, bool IsRecord, bool SaveOutput, uint64_t &ReqPtrArgOffset); +void *__tgt_target_get_default_queue(void *Loc, int64_t DeviceId); + #ifdef __cplusplus } #endif diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 86df4584db091..f0e04896201a4 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2215,8 +2215,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { return Err; // Once the stream is synchronized, return it to stream pool and reset - // AsyncInfo. This is to make sure the synchronization only works for its - // own tasks. + // AsyncInfo if the queue is not persistent. This is to make sure the + // synchronization only works for its own tasks. 
+ if (AsyncInfo.PersistentQueue) + return Plugin::success(); + AsyncInfo.Queue = nullptr; return AMDGPUStreamManager.returnResource(Stream); } @@ -2235,9 +2238,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (!(*CompletedOrErr)) return Plugin::success(); - // Once the stream is completed, return it to stream pool and reset - // AsyncInfo. This is to make sure the synchronization only works for its - // own tasks. + // Once the stream is synchronized, return it to stream pool and reset + // AsyncInfo if the queue is not persistent. This is to make sure the + // synchronization only works for its own tasks. + if (AsyncInfo.PersistentQueue) + return Plugin::success(); + AsyncInfo.Queue = nullptr; return AMDGPUStreamManager.returnResource(Stream); } @@ -2450,7 +2456,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { /// Initialize the async info for interoperability purposes. Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override { - // TODO: Implement this function. 
+ AMDGPUStreamTy *Stream; + if (auto Err = getStream(AsyncInfoWrapper, Stream)) + return Err; + return Plugin::success(); } diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index 60f7c918d7adb..64568cf701a8a 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -1518,8 +1518,10 @@ Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs, Error GenericDeviceTy::initAsyncInfo(__tgt_async_info **AsyncInfoPtr) { assert(AsyncInfoPtr && "Invalid async info"); + assert(!(*AsyncInfoPtr) && "Already initialized async info"); *AsyncInfoPtr = new __tgt_async_info(); + (*AsyncInfoPtr)->PersistentQueue = true; AsyncInfoWrapperTy AsyncInfoWrapper(*this, *AsyncInfoPtr); diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index b6465d61bd033..bfbc101529e18 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -643,8 +643,11 @@ struct CUDADeviceTy : public GenericDeviceTy { } // Once the stream is synchronized, return it to stream pool and reset - // AsyncInfo. This is to make sure the synchronization only works for its - // own tasks. + // AsyncInfo if the queue is not persistent. This is to make sure the + // synchronization only works for its own tasks. + if (AsyncInfo.PersistentQueue) + return Plugin::success(); + AsyncInfo.Queue = nullptr; if (auto Err = CUDAStreamManager.returnResource(Stream)) return Err; @@ -777,9 +780,12 @@ struct CUDADeviceTy : public GenericDeviceTy { if (Res == CUDA_ERROR_NOT_READY) return Plugin::success(); - // Once the stream is synchronized and the operations completed (or an error - // occurs), return it to stream pool and reset AsyncInfo. This is to make - // sure the synchronization only works for its own tasks. 
+ // Once the stream is synchronized, return it to stream pool and reset + AsyncInfo if the queue is not persistent. This is to make sure the + synchronization only works for its own tasks. + if (AsyncInfo.PersistentQueue) + return Plugin::success(); + + AsyncInfo.Queue = nullptr; if (auto Err = CUDAStreamManager.returnResource(Stream)) return Err; diff --git a/offload/src/KernelLanguage/API.cpp b/offload/src/KernelLanguage/API.cpp index ef1aad829e7bd..95dfa034465d0 100644 --- a/offload/src/KernelLanguage/API.cpp +++ b/offload/src/KernelLanguage/API.cpp @@ -10,6 +10,9 @@ #include "Shared/APITypes.h" +#include "llvm/Frontend/OpenMP/OMPConstants.h" + +#include #include struct dim3 { @@ -55,10 +58,13 @@ unsigned __llvmPopCallConfiguration(dim3 *__grid_size, dim3 *__block_size, int __tgt_target_kernel(void *Loc, int64_t DeviceId, int32_t NumTeams, int32_t ThreadLimit, const void *HostPtr, KernelArgsTy *Args); +void *__tgt_target_get_default_async_info_queue(void *Loc, int64_t DeviceId); unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void *args, size_t sharedMem, void *stream) { + int64_t DeviceNo = 0; KernelArgsTy Args = {}; + Args.Version = OMP_KERNEL_ARG_VERSION; Args.DynCGroupMem = sharedMem; Args.NumTeams[0] = gridDim.x; Args.NumTeams[1] = gridDim.y; @@ -68,6 +74,13 @@ unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, Args.ThreadLimit[2] = blockDim.z; Args.ArgPtrs = reinterpret_cast<void **>(args); Args.Flags.IsCUDA = true; - return __tgt_target_kernel(nullptr, 0, gridDim.x, blockDim.x, func, &Args); + if (stream) + Args.AsyncInfoQueue = stream; + else + Args.AsyncInfoQueue = + __tgt_target_get_default_async_info_queue(nullptr, DeviceNo); + int rv = __tgt_target_kernel(nullptr, DeviceNo, gridDim.x, blockDim.x, func, + &Args); + return rv; } } diff --git a/offload/src/exports b/offload/src/exports index 7bdc7d2a531bb..11830f62af388 100644 --- a/offload/src/exports +++ b/offload/src/exports @@ -29,6 +29,8 @@ VERS1.0 { 
__tgt_target_kernel; __tgt_target_kernel_nowait; __tgt_target_nowait_query; + __tgt_target_get_default_async_info_queue; + __tgt_target_synchronize_async_info_queue; __tgt_target_kernel_replay; __tgt_activate_record_replay; __tgt_mapper_num_components; diff --git a/offload/src/interface.cpp b/offload/src/interface.cpp index 21f9114ac2b08..c425957a8a85b 100644 --- a/offload/src/interface.cpp +++ b/offload/src/interface.cpp @@ -16,6 +16,7 @@ #include "OpenMP/OMPT/Callback.h" #include "OpenMP/omp.h" #include "PluginManager.h" +#include "Shared/APITypes.h" #include "omptarget.h" #include "private.h" @@ -352,7 +353,7 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, if (!DeviceOrErr) FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str()); - TargetAsyncInfoTy TargetAsyncInfo(*DeviceOrErr); + TargetAsyncInfoTy TargetAsyncInfo(*DeviceOrErr, KernelArgs->AsyncInfoQueue); AsyncInfoTy &AsyncInfo = TargetAsyncInfo; /// RAII to establish tool anchors before and after target region OMPT_IF_BUILT(InterfaceRAII TargetRAII( @@ -550,3 +551,48 @@ EXTERN void __tgt_target_nowait_query(void **AsyncHandle) { delete AsyncInfo; *AsyncHandle = nullptr; } + +EXTERN void *__tgt_target_get_default_async_info_queue(void *Loc, + int64_t DeviceId) { + assert(PM && "Runtime not initialized"); + + static thread_local void **AsyncInfoQueue = nullptr; + + if (!AsyncInfoQueue) + AsyncInfoQueue = reinterpret_cast<void **>( + calloc(PM->getNumDevices(), sizeof(AsyncInfoQueue[0]))); + + if (!AsyncInfoQueue[DeviceId]) { + auto DeviceOrErr = PM->getDevice(DeviceId); + if (!DeviceOrErr) + FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str()); + + __tgt_async_info *AsyncInfo = nullptr; + DeviceOrErr->RTL->init_async_info(DeviceId, &AsyncInfo); + AsyncInfoQueue[DeviceId] = AsyncInfo->Queue; + } + + return AsyncInfoQueue[DeviceId]; + } + +EXTERN int __tgt_target_synchronize_async_info_queue(void *Loc, + int64_t DeviceId, + void *AsyncInfoQueue) { + 
assert(PM && "Runtime not initialized"); + + auto DeviceOrErr = PM->getDevice(DeviceId); + if (!DeviceOrErr) + FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str()); + if (!AsyncInfoQueue) + AsyncInfoQueue = __tgt_target_get_default_async_info_queue(Loc, DeviceId); + AsyncInfoTy AsyncInfo(*DeviceOrErr, AsyncInfoQueue, + AsyncInfoTy::SyncTy::BLOCKING); + + if (AsyncInfo.synchronize()) + FATAL_MESSAGE0(1, "Error while querying the async queue for completion.\n"); + [[maybe_unused]] __tgt_async_info *ASI = AsyncInfo; + assert(ASI->Queue); + assert(ASI->Queue && ASI->PersistentQueue); + + return 0; +} diff --git a/offload/src/omptarget.cpp b/offload/src/omptarget.cpp index 7a2ee1303d68c..14dcd59d2e71b 100644 --- a/offload/src/omptarget.cpp +++ b/offload/src/omptarget.cpp @@ -49,7 +49,7 @@ int AsyncInfoTy::synchronize() { case SyncTy::BLOCKING: // If we have a queue we need to synchronize it now. Result = Device.synchronize(*this); - assert(AsyncInfo.Queue == nullptr && + assert((AsyncInfo.PersistentQueue || !AsyncInfo.Queue) && "The device plugin should have nulled the queue to indicate there " "are no outstanding actions!"); break; diff --git a/offload/test/offloading/CUDA/basic_api_malloc_free.cu b/offload/test/offloading/CUDA/basic_api_malloc_free.cu new file mode 100644 index 0000000000000..60a51e33a5af9 --- /dev/null +++ b/offload/test/offloading/CUDA/basic_api_malloc_free.cu @@ -0,0 +1,42 @@ +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t +// RUN: %t | %fcheck-generic + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO + +#include +#include + +extern "C" { +void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum); +void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum); +} + +__global__ void kernel(int *A, int *DevPtr, int N) { + for (int i = 0; i < N; ++i) + DevPtr[i] = 1; + for (int i = 0; i 
< N; ++i) + *A += DevPtr[i]; +} + +int main(int argc, char **argv) { + int DevNo = 0; + int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo)); + int *DevPtr; + auto Err = cudaMalloc(&DevPtr, 42 * sizeof(int)); + if (Err != cudaSuccess) + return -1; + *Ptr = 0; + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 0 + kernel<<<1, 1>>>(Ptr, DevPtr, 42); + cudaDeviceSynchronize(); + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr]], *Ptr: 42 + Err = cudaFree(DevPtr); + if (Err != cudaSuccess) + return -1; + llvm_omp_target_free_shared(Ptr, DevNo); +} diff --git a/offload/test/offloading/CUDA/basic_api_memcpy.cu b/offload/test/offloading/CUDA/basic_api_memcpy.cu new file mode 100644 index 0000000000000..088e20ffa9e2b --- /dev/null +++ b/offload/test/offloading/CUDA/basic_api_memcpy.cu @@ -0,0 +1,47 @@ +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t +// RUN: %t | %fcheck-generic + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO + +#include +#include + +__global__ void kernel(int *DevPtr, int N) { + for (int i = 0; i < N; ++i) + DevPtr[i]--; +} + +int main(int argc, char **argv) { + int DevNo = 0; + int Res = 0; + int *DevPtr; + auto Err = cudaMalloc(&DevPtr, 42 * sizeof(int)); + if (Err != cudaSuccess) + return -1; + int HstPtr[42]; + for (int i = 0; i < 42; ++i) { + HstPtr[i] = 2; + } + Err = cudaMemcpy(DevPtr, HstPtr, 42 * sizeof(int), cudaMemcpyHostToDevice); + if (Err != cudaSuccess) + return -1; + printf("Res: %i\n", Res); + // CHECK: Res: 0 + kernel<<<1, 1>>>(DevPtr, 42); + cudaDeviceSynchronize(); + Err = cudaMemcpy(HstPtr, DevPtr, 42 * sizeof(int), cudaMemcpyDeviceToHost); + if (Err != cudaSuccess) + return -1; + for (int i = 0; i < 42; ++i) { + printf("%i : %i\n", i, HstPtr[i]); + Res += HstPtr[i]; + } + printf("Res: %i\n", Res); + // CHECK: Res: 42 + Err = 
cudaFree(DevPtr); + if (Err != cudaSuccess) + return -1; +} diff --git a/offload/test/offloading/CUDA/basic_api_memset.cu b/offload/test/offloading/CUDA/basic_api_memset.cu new file mode 100644 index 0000000000000..474eb2a46f0a2 --- /dev/null +++ b/offload/test/offloading/CUDA/basic_api_memset.cu @@ -0,0 +1,44 @@ +// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t +// RUN: %t | %fcheck-generic + +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO + +#include +#include + +extern "C" { +void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum); +void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum); +} + +__global__ void kernel(int *A, int *DevPtr, int N) { + for (int i = 0; i < N; ++i) + *A += DevPtr[i]; + *A *= -1; +} + +int main(int argc, char **argv) { + int DevNo = 0; + int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo)); + int *DevPtr; + auto Err = cudaMalloc(&DevPtr, 42 * sizeof(int)); + if (Err != cudaSuccess) + return -1; + Err = cudaMemset(DevPtr, -1, 42 * sizeof(int)); + if (Err != cudaSuccess) + return -1; + *Ptr = 0; + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 0 + kernel<<<1, 1>>>(Ptr, DevPtr, 42); + cudaDeviceSynchronize(); + printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); + // CHECK: Ptr [[Ptr]], *Ptr: 42 + Err = cudaFree(DevPtr); + if (Err != cudaSuccess) + return -1; + llvm_omp_target_free_shared(Ptr, DevNo); +} diff --git a/offload/test/offloading/CUDA/basic_launch.cu b/offload/test/offloading/CUDA/basic_launch.cu index 79f01f48b6c2a..0048b06dcbf72 100644 --- a/offload/test/offloading/CUDA/basic_launch.cu +++ b/offload/test/offloading/CUDA/basic_launch.cu @@ -10,6 +10,7 @@ // UNSUPPORTED: x86_64-pc-linux-gnu // UNSUPPORTED: x86_64-pc-linux-gnu-LTO +#include #include extern "C" { @@ -26,6 +27,7 @@ int main(int argc, char **argv) { printf("Ptr %p, *Ptr: %i\n", 
Ptr, *Ptr); // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 7 square<<<1, 1>>>(Ptr); + cudaDeviceSynchronize(); printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); // CHECK: Ptr [[Ptr]], *Ptr: 42 llvm_omp_target_free_shared(Ptr, DevNo); diff --git a/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu b/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu index d4a6bc9ddfb3f..8458c8ca7a6cb 100644 --- a/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu +++ b/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu @@ -10,6 +10,7 @@ // UNSUPPORTED: x86_64-pc-linux-gnu // UNSUPPORTED: x86_64-pc-linux-gnu-LTO +#include #include extern "C" { @@ -28,6 +29,7 @@ int main(int argc, char **argv) { printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 0 square<<<7, 6>>>(Ptr); + cudaDeviceSynchronize(); printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); // CHECK: Ptr [[Ptr]], *Ptr: 42 llvm_omp_target_free_shared(Ptr, DevNo); diff --git a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu index c11c194b5e061..ce3d4015daee2 100644 --- a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu +++ b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu @@ -10,6 +10,7 @@ // UNSUPPORTED: x86_64-pc-linux-gnu // UNSUPPORTED: x86_64-pc-linux-gnu-LTO +#include #include extern "C" { @@ -35,6 +36,7 @@ int main(int argc, char **argv) { printf("Src: %i : %i\n", Src[0], Src[1]); // CHECK: Src: -2 : 8 square<<<1, 1>>>(Ptr, 3, Src, 4); + cudaDeviceSynchronize(); printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); // CHECK: Ptr [[Ptr]], *Ptr: 42 printf("Src: %i : %i\n", Src[0], Src[1]); diff --git a/offload/test/offloading/CUDA/launch_tu.cu b/offload/test/offloading/CUDA/launch_tu.cu index aad3d50975237..3c127a3368e11 100644 --- a/offload/test/offloading/CUDA/launch_tu.cu +++ b/offload/test/offloading/CUDA/launch_tu.cu @@ -10,6 +10,7 @@ // UNSUPPORTED: x86_64-pc-linux-gnu // UNSUPPORTED: x86_64-pc-linux-gnu-LTO 
+#include #include extern "C" { @@ -26,6 +27,7 @@ int main(int argc, char **argv) { printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 7 square<<<1, 1>>>(Ptr); + cudaDeviceSynchronize(); printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr); // CHECK: Ptr [[Ptr]], *Ptr: 42 llvm_omp_target_free_shared(Ptr, DevNo);