Skip to content
This repository was archived by the owner on Jul 1, 2025. It is now read-only.

Commit bd69664

Browse files
nickggfacebook-github-bot
authored and committed
Add dynamic buffer support to OCL Backend (#3765)
Summary: The OpenCL Backend uses a static memory allocation strategy of allocating a single large buffer and then using offsets into it, which is good for the general case, but doesn't allow us to get the most benefit out of Device Resident Tensors (when we'd like to leave an output on the device to be used as the input to another network). This PR adds a more dynamic mapping of device buffers to the OCL backend via OpenCL SubBuffers, which are similar to Glow TensorViews in that they provide access to a region without additional allocations. There is no behavioural change in this PR, but it provides infrastructure to reference buffers outside of the range of the DeviceBuffer in the future, which we need to get DRT perf wins. The immediate benefit is that I was able to simplify the OCL kernel code, deleting about 25% of kernels.cl. Documentation: NFC Pull Request resolved: #3765 Test Plan: tests in release and ASAN Differential Revision: D18465407 Pulled By: nickgg fbshipit-source-id: 1b5416c4f389885bae4d5e1533a65bef8ab60122
1 parent 334a712 commit bd69664

File tree

8 files changed

+232
-490
lines changed

8 files changed

+232
-490
lines changed

.circleci/build.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ elif [[ "$CIRCLE_JOB" == "PYTORCH" ]]; then
140140
cd build
141141
elif [[ "$CIRCLE_JOB" == "OPENCL" ]]; then
142142
install_pocl
143-
CMAKE_ARGS+=("-DGLOW_WITH_OPENCL=ON")
143+
CMAKE_ARGS+=("-DGLOW_WITH_OPENCL=ON" "-DGLOW_OPENCL_ALIGN=128")
144144
else
145145
CMAKE_ARGS+=("-DCMAKE_BUILD_TYPE=Debug")
146146
if [[ "${CIRCLE_JOB}" == "SHARED" ]]; then

lib/Backends/OpenCL/OpenCL.cpp

Lines changed: 127 additions & 93 deletions
Large diffs are not rendered by default.

lib/Backends/OpenCL/OpenCL.h

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -145,8 +145,7 @@ class OpenCLFunction final : public CompiledFunction {
145145
/// Fill the device \p buffer with a given \p value.
146146
/// \param len number of buffer elements to be filled by the \p value.
147147
/// Elements are considered to be of the type described by \p elemKind.
148-
void fillBuffer(cl_mem buffer, uint64_t start, uint64_t len, float value,
149-
ElemKind elemKind,
148+
void fillBuffer(cl_mem buffer, uint64_t len, float value, ElemKind elemKind,
150149
runtime::OpenCLDeviceBindings *devBindings);
151150

152151
/// Execute a convolution instruction which uses NCHW format.
@@ -242,10 +241,13 @@ namespace runtime {
242241
/// device specific information used to run a compiled function on a specific
243242
/// device.
244243
struct OpenCLDeviceBindings : DeviceBindings {
245-
OpenCLDeviceBindings(cl_mem buffer, cl_command_queue commands,
246-
cl_device_id device, cl_context ctx, cl_program prog)
244+
OpenCLDeviceBindings(
245+
cl_mem buffer, cl_command_queue commands, cl_device_id device,
246+
cl_context ctx, cl_program prog,
247+
const std::unordered_map<std::string, cl_mem> &subBuffers)
247248
: DeviceBindings(OCLBackend::getName()), deviceBuffer{buffer},
248-
commandQueue{commands}, deviceId{device}, context{ctx}, program{prog} {}
249+
commandQueue{commands}, deviceId{device}, context{ctx}, program{prog},
250+
weightBuffers(subBuffers) {}
249251

250252
/// CL memory buffer. Currently this contains both mutable and immutable
251253
/// weights, the buffer is allocated once when the network is added.
@@ -269,6 +271,12 @@ struct OpenCLDeviceBindings : DeviceBindings {
269271

270272
/// A list of kernels and their associated events.
271273
std::vector<KernelLaunch> kernelLaunches;
274+
275+
/// Buffers or subBuffers associated with symbols.
276+
std::unordered_map<std::string, cl_mem> weightBuffers;
277+
278+
/// \returns the subBuffer associated with a Value.
279+
cl_mem getBuffer(glow::Value *v);
272280
};
273281
} // namespace runtime
274282
} // namespace glow

lib/Backends/OpenCL/OpenCLDeviceManager.cpp

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,29 @@ DeviceManager *createOCLDeviceManager(const DeviceConfig &config) {
5555
return new OpenCLDeviceManager(config);
5656
}
5757

58-
OpenCLBuffer::~OpenCLBuffer() { clReleaseMemObject(buffer_); }
58+
/// Release every sub-buffer registered in subBuffers_, then release buffer_.
OpenCLBuffer::~OpenCLBuffer() {
  // Bind by const reference: iterating by value would copy each
  // (name, cl_mem) pair, including its std::string key, only to read the
  // handle.
  for (const auto &buf : subBuffers_) {
    clReleaseMemObject(buf.second);
  }
  subBuffers_.clear();

  clReleaseMemObject(buffer_);
}
66+
67+
/// Add a mapping from a Symbol name to an offset into buffer_;
68+
bool OpenCLBuffer::addSubBuffer(std::string name, size_t offset, size_t size) {
69+
cl_buffer_region region({offset, size});
70+
cl_int err;
71+
auto buf = clCreateSubBuffer(buffer_, CL_MEM_READ_WRITE,
72+
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
73+
auto res = subBuffers_.emplace(name, buf);
74+
if (!res.second) {
75+
llvm::dbgs() << "OpenCLBuffer: failed to add subBuffer for symbol " << name
76+
<< "\n";
77+
return false;
78+
}
79+
return true;
80+
}
5981
} // namespace runtime
6082
} // namespace glow
6183

@@ -356,6 +378,15 @@ void OpenCLDeviceManager::addNetworkImpl(const Module *module,
356378
clFinish(commands);
357379
}
358380
usedMemoryBytes_ += sizeInBytes;
381+
382+
// Add a sub-buffer for each symbol in the symbol table. OpenCL sub-buffers
383+
// are essentially TensorViews in Glow.
384+
for (auto &pair : bundle.getSymbolTable()) {
385+
bool success = buffer->addSubBuffer(pair.first, pair.second.offset,
386+
pair.second.size);
387+
DCHECK(success);
388+
}
389+
359390
// Compile the CL program.
360391
// Add to the function name lookup map.
361392
// Add shared pointer to the buffer to buffers. This way the buffer will
@@ -376,6 +407,7 @@ void OpenCLDeviceManager::addNetworkImpl(const Module *module,
376407
programs_.emplace(func.first, program);
377408
functions_.emplace(func.first, func.second);
378409
buffers_.emplace(func.first, buffer);
410+
379411
buffer->incrementUsers();
380412

381413
DCHECK_LE(usedMemoryBytes_, maxMemoryBytes_);
@@ -666,7 +698,7 @@ void OpenCLDeviceManager::runFunctionImpl(
666698
auto program = programs_[function];
667699
auto clBindings = glow::make_unique<runtime::OpenCLDeviceBindings>(
668700
buffers_[function]->getBuffer(), queue.backingQueue, deviceId_, context_,
669-
program);
701+
program, buffers_[function]->getSubBuffers());
670702

671703
// Copy inputs to the device.
672704
copyInputsToDevice(func->getRuntimeBundle(), context.get(), clBindings.get());

lib/Backends/OpenCL/OpenCLDeviceManager.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,9 @@ class OpenCLBuffer {
9898
/// The OpenCL buffer being stored.
9999
cl_mem buffer_;
100100

101+
/// Subbuffers for symbols.
102+
std::unordered_map<std::string, cl_mem> subBuffers_;
103+
101104
/// Count of functions using this buffer.
102105
unsigned int users_{0};
103106

@@ -120,6 +123,14 @@ class OpenCLBuffer {
120123

121124
/// Get size of buffer in bytes.
122125
size_t getSize() { return size_; }
126+
127+
/// Return the mapping from Symbol name to subBuffer for this Buffer.
128+
const std::unordered_map<std::string, cl_mem> &getSubBuffers() {
129+
return subBuffers_;
130+
}
131+
132+
/// Add a mapping from a Symbol name to an offset into buffer_.
133+
bool addSubBuffer(std::string name, size_t offset, size_t size);
123134
};
124135

125136
/// A class controlling a single OpenCL device. Many OpenCLFunctions may be

0 commit comments

Comments
 (0)