This repository was archived by the owner on Jul 1, 2025. It is now read-only.

Commit d67cfad

Merge glow-h changes in internal repo to github master (#2832)

*Description*: This commit brings over changes that were made internally at FB for glow-h.
*Testing*: ninja all && ninja test. Habana-specific testing will be done internally.

1 parent: d299b98

8 files changed (+136, -98 lines)

lib/Backends/Habana/Habana.cpp

Lines changed: 62 additions & 43 deletions
@@ -54,7 +54,7 @@ static synDataType getSynType(ElemKind kind) {
   case ElemKind::FloatTy:
     return syn_type_single;
   case ElemKind::Float16Ty:
-    return syn_type_half;
+    GLOW_UNREACHABLE("Unhandled ElemKind: Float16Ty");
   case ElemKind::Int8QTy:
     return syn_type_fixed;
   case ElemKind::Int16QTy:
@@ -92,18 +92,6 @@ static std::string getKernelName(llvm::StringRef kernelBase, ElemKind kind) {
   return std::string(kernelBase) + getKernelSuffix(kind);
 }
 
-/// If \p PH is an output placeholder, \returns the SaveNode.
-static SaveNode *getOutputSave(Function *F, Placeholder *PH) {
-  for (auto &use : PH->getUsers()) {
-    if (auto *save = llvm::dyn_cast<SaveNode>(use.getUser())) {
-      if (save->getParent() == F && save->getPlaceholder() == PH) {
-        return save;
-      }
-    }
-  }
-  return nullptr;
-}
-
 namespace {
 /// Parameters for pooling operation.
 struct synPoolParams {
@@ -217,23 +205,6 @@ class TensorHandle final {
     // Model params need to be floats, even if the tensor is integral or
     // quantized.
     if (ioType == IOType::Static) {
-      // Quantized types: dequantize into float buffer.
-      if (V->isQuantizedType()) {
-        // Check that a weight buffer was passed in; these are model params.
-        assert(!allocated_);
-        Type type = *V;
-        if (V->getElementType() == ElemKind::UInt8FusedQTy) {
-          // Fused quantized values just need to be passed through in raw form.
-          type = Type(ElemKind::Int8QTy, V->dims(), 1.0, 0);
-        }
-        Tensor DT = quantization::dequantizeTensor(Tensor(buffer_, &type),
-                                                   ElemKind::FloatTy);
-        auto bytes = DT.getSizeInBytes();
-        buffer_ = malloc(bytes);
-        memcpy(buffer_, DT.getUnsafePtr(), bytes);
-        allocated_ = true;
-      }
-
       // Int32ITy: Cast to floats.
       if (V->getElementType() == ElemKind::Int32ITy) {
         float *floats_ = (float *)malloc(V->size() * sizeof(float));
@@ -258,11 +229,19 @@ class TensorHandle final {
     // Create tensor descriptor, with quantization params if needed.
     synTensorDescriptor desc(elemType, rdims.size(), rdims.data(), buffer_,
                              synMemoryHost, false, name_.data());
-    if (V->isQuantizedType() &&
-        V->getElementType() != ElemKind::UInt8FusedQTy) {
-      desc.m_quantizationParams[0].m_zp = V->getOffset();
-      desc.m_quantizationParams[0].m_scale = V->getScale();
+    if (V->isQuantizedType()) {
+      if (V->getElementType() == ElemKind::UInt8FusedQTy) {
+        desc.m_quantizationParams[0].m_zp = 0;
+        desc.m_quantizationParams[0].m_scale = 1;
+      } else {
+        desc.m_quantizationParams[0].m_zp = V->getOffset();
+        desc.m_quantizationParams[0].m_scale = V->getScale();
+      }
+
       desc.m_quantizationParams[0].m_qDataType = elemType;
+      if (ioType == IOType::Static) {
+        desc.m_isQuantized = true;
+      }
     }
 
     chk(synCreateTensor(&desc, &tensor_, ioType == IOType::Output, false,
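
The descriptor logic above now covers all three quantization cases. Below is a minimal standalone sketch of that branch; `QuantParams`, `TensorInfo`, and `makeDescParams` are hypothetical stand-in names, not Glow or Synapse APIs. Fused-quantized tensors carry their scales and offsets inline in the payload, so the descriptor gets identity parameters; ordinary quantized tensors get their real scale and offset; and static (model-parameter) tensors are additionally flagged as already quantized.

```cpp
#include <cstdint>
#include <iostream>

struct QuantParams {
  int32_t zp = 0;
  float scale = 1.0f;
};

struct TensorInfo {
  bool quantized = false;
  bool fusedQuantized = false; // UInt8FusedQTy in Glow
  int32_t offset = 0;
  float scale = 1.0f;
};

enum class IOType { Static, Output, Default };

// Fused-quantized tensors keep their scales/offsets inline in the payload,
// so the descriptor gets identity params; ordinary quantized tensors get
// their real params; static tensors are flagged as already quantized.
QuantParams makeDescParams(const TensorInfo &t, IOType io, bool &isQuantized) {
  QuantParams p;
  if (t.quantized) {
    if (t.fusedQuantized) {
      p.zp = 0;
      p.scale = 1.0f;
    } else {
      p.zp = t.offset;
      p.scale = t.scale;
    }
    isQuantized = (io == IOType::Static);
  }
  return p;
}

int main() {
  TensorInfo t;
  t.quantized = true;
  t.offset = 3;
  t.scale = 0.5f;
  bool isQuantized = false;
  QuantParams p = makeDescParams(t, IOType::Static, isQuantized);
  std::cout << "zp=" << p.zp << " scale=" << p.scale
            << " isQuantized=" << isQuantized << "\n"; // zp=3 scale=0.5 ...=1
}
```
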
@@ -307,6 +286,9 @@ class TensorHandle final {
   /// Get the underlying data buffer.
   void *getData() const { return buffer_; }
 
+  /// Get the name of the managed tensor
+  const std::string &getName() const { return name_; }
+
   /// Get the dimensions of the stored tensor.
   llvm::ArrayRef<unsigned> dims() const { return dims_; }
 
@@ -665,15 +647,18 @@ allocateGraphTensors(Function *F) {
       continue;
     }
     if (auto *save = getOutputSave(F, V)) {
-      // We want to avoid emitting copies for save nodes by simply marking the
-      // save input as an "output" tensor. The exceptions are when the input
-      // is itself a placeholder/constant, or a reshape. (The reshape case is
-      // likely a Synapse bug.)
+      // Naively, we'd generate a memcpy for any SaveNode, but that's a waste
+      // so we want to avoid it. We can optimize it away by mapping the
+      // SaveNode's input node (N, below) to the output tensor, and then simply
+      // not generating a memcpy if the SaveNode itself has no associated
+      // tensor.
       auto *N = save->getInput().getNode();
-      Node *proxy =
-          (llvm::isa<Storage>(N) || llvm::isa<HabanaReshapeNode>(N)) ? save : N;
-      tensors.emplace(proxy, TensorHandle(V->getType(), V->getName(), nullptr,
-                                          IOType::Output));
+      if (llvm::isa<Storage>(N) || llvm::isa<HabanaReshapeNode>(N) ||
+          N->getNumUsers() > 1) {
+        N = save;
+      }
+      tensors.emplace(
+          N, TensorHandle(V->getType(), V->getName(), nullptr, IOType::Output));
     } else {
       tensors.emplace(V, TensorHandle(V->getType(), V->getName(), nullptr,
                                       IOType::Default));
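
The rewritten comment describes a copy-elision choice: map the SaveNode's input to the output tensor so the producer writes the output directly, unless the input is storage, a reshape, or has multiple users (in which case its value is still needed elsewhere and the save must copy). A toy sketch of just that decision, using a hypothetical `Node` struct rather than Glow's node classes:

```cpp
#include <cassert>
#include <string>
#include <vector>

struct Node {
  std::string kind;          // "storage", "reshape", "relu", "save", ...
  std::vector<Node *> users; // nodes consuming this node's result
  Node *input = nullptr;     // single input, for simplicity
};

// Returns the node that should own the output tensor for a given save.
Node *chooseOutputProxy(Node *save) {
  Node *n = save->input;
  if (n->kind == "storage" || n->kind == "reshape" || n->users.size() > 1) {
    return save; // fall back to copying: the save itself owns the output
  }
  return n; // optimization: the producer writes straight into the output
}

int main() {
  Node relu{"relu"}, save{"save"};
  save.input = &relu;
  relu.users = {&save};
  assert(chooseOutputProxy(&save) == &relu); // direct write, no memcpy

  Node other{"other"};
  relu.users = {&save, &other};              // the value is needed elsewhere
  assert(chooseOutputProxy(&save) == &save); // must copy
}
```
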
@@ -747,6 +732,7 @@ HabanaBackend::compile(Function *F, const BackendOptions &opts) const {
   std::vector<std::unique_ptr<ns_ConstantKernel::Params>> constantParams;
   std::vector<std::unique_ptr<ns_TileKernel::Params>> tileParams;
   std::vector<std::unique_ptr<unsigned>> concatParams;
+  std::vector<std::unique_ptr<ns_TakeKernel::Params>> takeParams;
 
   // Keep references to tensor pointer arrays passed into multi-input nodes
   // until the compilation is done.
@@ -755,6 +741,10 @@ HabanaBackend::compile(Function *F, const BackendOptions &opts) const {
   std::vector<TensorHandle> tempTensors;
 
   for (const auto &I : F->getNodes()) {
+    if (!isOpSupported(I)) {
+      llvm::errs() << "Unsupported operator: " << I.getDebugDesc() << "\n";
+      GLOW_UNREACHABLE("Unsupported operator");
+    }
     switch (I.getKind()) {
     case Kinded::Kind::HabanaFullyConnectedNodeKind: {
       auto *NI = llvm::cast<HabanaFullyConnectedNode>(&I);
@@ -1116,7 +1106,17 @@ HabanaBackend::compile(Function *F, const BackendOptions &opts) const {
           makeConcatParams(CI->getDim(), tensors[CI].dims().size());
       std::vector<synTensor> inputs;
       for (auto const &N : CI->getInputs()) {
-        inputs.push_back(tensors[N].get());
+        std::string memcpyNodeName =
+            llvm::formatv("{0}_memcpy_{1}", N.getNode()->getName(),
+                          inputs.size())
+                .str();
+        TensorHandle memcpy(N.getType(), memcpyNodeName);
+        chk(synCreateGenericNode(
+            &tensors[N].get(), &memcpy.get(), 1, 1, nullptr,
+            getKernelName("memcpy", N.getType()->getElementType()).c_str(),
+            memcpy.getName().c_str(), nullptr, nullptr));
+        inputs.push_back(memcpy.get());
+        tempTensors.emplace_back(std::move(memcpy));
       }
 
       chk(synCreateGenericNode(inputs.data(), &tensors[CI].get(), inputs.size(),
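
Each concat input is now routed through a dedicated `memcpy` kernel, so the concat consumes fresh intermediate tensors named `<producer>_memcpy_<index>`. A small sketch of the naming scheme only, using `std::string` in place of `llvm::formatv` and a stand-in `Tensor` struct:

```cpp
#include <iostream>
#include <string>
#include <vector>

struct Tensor {
  std::string name; // stand-in; a real handle would also carry type/dims
};

// Each input gets a fresh staging tensor named "<producer>_memcpy_<i>";
// the concat then reads the copies instead of the original tensors.
std::vector<Tensor> stageConcatInputs(const std::vector<Tensor> &inputs) {
  std::vector<Tensor> staged;
  for (const Tensor &in : inputs) {
    staged.push_back({in.name + "_memcpy_" + std::to_string(staged.size())});
  }
  return staged;
}

int main() {
  for (const Tensor &t : stageConcatInputs({{"lhs"}, {"rhs"}})) {
    std::cout << t.name << "\n"; // lhs_memcpy_0, rhs_memcpy_1
  }
}
```
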
@@ -1165,6 +1165,25 @@ HabanaBackend::compile(Function *F, const BackendOptions &opts) const {
       multiInputs.emplace_back(std::move(inputs));
       break;
     }
+    case Kinded::Kind::GatherNodeKind: {
+      auto *gather = llvm::cast<GatherNode>(&I);
+      std::vector<synTensor> inputs = {tensors[gather->getData()].get(),
+                                       tensors[gather->getIndices()].get()};
+
+      auto params = llvm::make_unique<ns_TakeKernel::Params>();
+      params->axis =
+          gather->getData().dims().size() - gather->getBatchDims() - 1;
+      params->mode = 0;
+
+      chk(synCreateGenericNode(
+          inputs.data(), &tensors[gather].get(), inputs.size(), 1, params.get(),
+          getKernelName("take", gather->getResult().getElementType()).c_str(),
+          gather->getName().data(), nullptr, nullptr));
+
+      multiInputs.emplace_back(std::move(inputs));
+      takeParams.emplace_back(std::move(params));
+      break;
+    }
     default: {
       llvm::errs() << "Unhandled node: " << I.getDebugDesc() << "\n";
       GLOW_UNREACHABLE("Unhandled node");

lib/Backends/Habana/Habana.h

Lines changed: 4 additions & 4 deletions
@@ -99,12 +99,12 @@ class HabanaIOBufferPool {
   /// construction. This is the effective size of one HabanaIOBuffer in this
   /// pool.
   size_t perBufferSize_;
-  /// The combined size of all HabanaIOBuffers in this pool (i.e. size_ *
-  /// numBuffers_).
+  /// The combined size of all HabanaIOBuffers in this pool (i.e. perBufferSize_
+  /// * numBuffers_).
   size_t allBuffersSize_;
   /// Buffer that backs all of the HabanaIOBuffers in this pool. The first buffer
-  /// starts at buffer_, the second at buffer_ + size_, etc. The last *ends* at
-  /// buffer_ + totalSize_.
+  /// starts at buffer_, the second at buffer_ + perBufferSize_, etc. The last
+  /// *ends* at buffer_ + allBuffersSize_.
   uint8_t *buffer_;
   /// The number of buffers in the pool.
   unsigned numBuffers_{kDefaultNumBuffers};
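
The corrected comments pin down the pool layout: one backing allocation of `perBufferSize_ * numBuffers_` bytes, with buffer `i` starting at `buffer_ + i * perBufferSize_`. A simplified, self-contained model of that layout (not the real `HabanaIOBufferPool` API, which hands out `HabanaIOBuffer` objects):

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

class IOBufferPool {
public:
  IOBufferPool(size_t perBufferSize, unsigned numBuffers)
      : perBufferSize_(perBufferSize),
        allBuffersSize_(perBufferSize * numBuffers),
        storage_(allBuffersSize_) {}

  // Buffer i starts at base + i * perBufferSize_; the last one ends exactly
  // at base + allBuffersSize_.
  uint8_t *buffer(unsigned i) { return storage_.data() + i * perBufferSize_; }
  size_t totalSize() const { return allBuffersSize_; }

private:
  size_t perBufferSize_;
  size_t allBuffersSize_;
  std::vector<uint8_t> storage_;
};

int main() {
  IOBufferPool pool(/*perBufferSize=*/4096, /*numBuffers=*/3);
  assert(pool.buffer(2) == pool.buffer(0) + 2 * 4096);
  assert(pool.totalSize() == 3 * 4096);
}
```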

lib/Backends/Habana/HabanaDeviceManager.cpp

Lines changed: 9 additions & 0 deletions
@@ -20,6 +20,9 @@
 
 #include "synapse.h"
 
+#include <glog/logging.h>
+#include <limits>
+
 using namespace glow;
 using namespace glow::runtime;
 
@@ -52,13 +55,18 @@ HabanaDeviceManager::HabanaDeviceManager(std::unique_ptr<DeviceConfig> config,
   // If this is the first HabanaDeviceManager to be created, initialize the
   // Synapse API.
   if (numActiveDevices_ == 0) {
+    LOG(INFO) << "Using version " << synGetVersion();
     chk(synInitialize());
   }
 
   numActiveDevices_++;
 }
 
 HabanaDeviceManager::~HabanaDeviceManager() {
+  // If a device was never successfully acquired, there's nothing to clean up.
+  if (deviceId_ == INVALID_DEVICE) {
+    return;
+  }
   std::lock_guard<std::mutex> lock(synapseMtx_);
   numActiveDevices_--;
 
@@ -282,6 +290,7 @@ void HabanaDeviceManager::runFunctionImpl(RunIdentifierTy runId,
     }
     inflightRequests_++;
   }
+
   // Execute the function.
   auto deviceBindings =
       llvm::make_unique<HabanaBindings>(deviceId_, topologyId);
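
The constructor/destructor pair above uses an init-once refcount: the first live manager initializes the shared Synapse API, and the count is maintained under `synapseMtx_` so teardown can be ordered. A toy version of the pattern built from standard library pieces only (`Manager` is a stand-in; the real code calls `chk(synInitialize())`):

```cpp
#include <cassert>
#include <mutex>

class Manager {
public:
  Manager() {
    std::lock_guard<std::mutex> lock(mtx_);
    if (numActive_ == 0) {
      apiInitialized_ = true; // stands in for chk(synInitialize())
    }
    numActive_++;
  }
  ~Manager() {
    std::lock_guard<std::mutex> lock(mtx_);
    numActive_--; // the full diff also releases per-device state here
  }
  static bool apiInitialized() { return apiInitialized_; }

private:
  static std::mutex mtx_;
  static unsigned numActive_;
  static bool apiInitialized_;
};

std::mutex Manager::mtx_;
unsigned Manager::numActive_ = 0;
bool Manager::apiInitialized_ = false;

int main() {
  Manager a, b; // only the first construction initializes the shared API
  assert(Manager::apiInitialized());
}
```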

lib/Backends/Habana/HabanaDeviceManager.h

Lines changed: 10 additions & 3 deletions
@@ -34,8 +34,15 @@ namespace runtime {
 /// This class implements the DeviceManager interface for
 /// Habana devices.
 class HabanaDeviceManager : public DeviceManager {
+  using DeviceId = uint32_t;
+  using TopologyId = uint64_t;
+
+  static constexpr auto INVALID_DEVICE = std::numeric_limits<DeviceId>::max();
+  static constexpr auto INVALID_TOPOLOGY =
+      std::numeric_limits<TopologyId>::max();
+
   /// The ID of the device managed by this instance.
-  uint32_t deviceId_{0};
+  DeviceId deviceId_{INVALID_DEVICE};
   /// The available memory on the device.
   uint64_t freeMemory_{0};
   /// The total memory on the device.
@@ -57,8 +64,8 @@ class HabanaDeviceManager : public DeviceManager {
   /// The number of workers in wait pool.
   unsigned numWaiters_{kNumWaiters};
 
-  /// Track active topology on this device. -1 is invalid.
-  uint64_t activeTopo_{(uint64_t)-1};
+  /// Track active topology on this device.
+  TopologyId activeTopo_{INVALID_TOPOLOGY};
   /// Number of requests in flight. Used to block topo switching.
   unsigned inflightRequests_{0};
   /// Condition variable for signaling queue drain.
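
The header now replaces the magic `(uint64_t)-1` with named sentinels from `std::numeric_limits`, and initializes `deviceId_` to `INVALID_DEVICE` instead of `0`, so device 0 is no longer conflated with "no device"; the destructor guard in HabanaDeviceManager.cpp relies on this. A compact sketch of the idiom (toy `Device` class, not the real manager):

```cpp
#include <cstdint>
#include <iostream>
#include <limits>

class Device {
  using DeviceId = uint32_t;
  // Named sentinel instead of a magic (uint32_t)-1 or a default of 0.
  static constexpr DeviceId INVALID_DEVICE =
      std::numeric_limits<DeviceId>::max();

public:
  void acquire(DeviceId id) { deviceId_ = id; }

  ~Device() {
    if (deviceId_ == INVALID_DEVICE) {
      return; // acquisition never happened; nothing to release
    }
    std::cout << "releasing device " << deviceId_ << "\n";
  }

private:
  DeviceId deviceId_{INVALID_DEVICE};
};

int main() {
  Device never; // destructor is a no-op
  Device acquired;
  acquired.acquire(0); // id 0 is now a legal device id
}
```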

lib/Base/Tensor.cpp

Lines changed: 4 additions & 0 deletions
@@ -148,6 +148,8 @@ static void dumpGenericImpl(Handle<ElemTy> handle, llvm::raw_ostream &os,
   }
 
   os << "]\n";
+
+  os.flush();
 }
 
 template <class ElemTy>
@@ -188,6 +190,8 @@ static void dumpAsciiGenericImpl(Handle<ElemTy> handle, llvm::raw_ostream &os) {
   } else {
     llvm_unreachable("Invalid tensor size");
   }
+
+  os.flush();
 }
 
 /// This is a slow generic transpose. This method performs a single for loop
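
Both dump helpers now end with `os.flush()`. The motivation, as far as the diff shows, is the usual one for buffered streams: without a flush, a dump can appear truncated after a crash or interleave badly with writes to other streams. Illustrated with standard iostreams (`llvm::raw_ostream` buffers similarly):

```cpp
#include <iostream>

int main() {
  // Buffered stream: without flush(), this line may still sit in the buffer
  // when the next write (or a crash) happens.
  std::cout << "tensor dump: [1.0, 2.0, 3.0]\n";
  std::cout.flush(); // make the dump visible immediately, as the diff does

  std::cerr << "unbuffered diagnostic\n"; // stderr writes through right away
}
```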

lib/Onnxifi/onnxifiGlow.cpp

Lines changed: 11 additions & 3 deletions
@@ -59,6 +59,11 @@ GLOW_ONNXIFI_LIBRARY_FUNCTION_WRAPPER(onnxGetBackendIDs)(
 #else
   constexpr bool withCPU = false;
 #endif
+#ifdef GLOW_WITH_HABANA
+  constexpr bool withHabana = true;
+#else
+  constexpr bool withHabana = false;
+#endif
 
   // Only return quantization backend if GLOW_DUMP_PROFILE.
   if (getenv("GLOW_DUMP_PROFILE")) {
@@ -79,21 +84,24 @@ GLOW_ONNXIFI_LIBRARY_FUNCTION_WRAPPER(onnxGetBackendIDs)(
 
     backendIDs[0] = quantizationBackendOnnx;
     backendIDs[1] = quantizationBackendC2;
-  } else if (withCPU) {
+  } else if (withCPU || withHabana) {
     *numBackends = 4;
 
+    auto backendKind =
+        withHabana ? glow::BackendKind::Habana : glow::BackendKind::CPU;
+
     // In case backendIDs is nullptr or does not have enough capacity just
     // return the total number of supported backends.
     if (numBackendsCapacity < *numBackends || !backendIDs) {
      return ONNXIFI_STATUS_FALLBACK;
    }
 
-    auto *cpuBackendOnnx = manager.createBackendId(glow::BackendKind::CPU,
+    auto *cpuBackendOnnx = manager.createBackendId(backendKind,
                                                    /*useOnnx*/ true);
     auto *interpreterBackendOnnx =
         manager.createBackendId(glow::BackendKind::Interpreter,
                                 /*useOnnx*/ true);
-    auto *cpuBackendC2 = manager.createBackendId(glow::BackendKind::CPU,
+    auto *cpuBackendC2 = manager.createBackendId(backendKind,
                                                  /*useOnnx*/ false);
     auto *interpreterBackendC2 =
         manager.createBackendId(glow::BackendKind::Interpreter,
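
With this change, the two non-interpreter backend IDs are created with a single `backendKind` that prefers Habana over CPU when both are compiled in. A condensed sketch of the selection (the `GLOW_WITH_*` macro names match the diff; `BackendKind` here is a stand-in enum, and real builds define the macros via the build system):

```cpp
#include <iostream>

enum class BackendKind { Interpreter, CPU, Habana }; // stand-in enum

#ifdef GLOW_WITH_CPU
constexpr bool withCPU = true;
#else
constexpr bool withCPU = false;
#endif
#ifdef GLOW_WITH_HABANA
constexpr bool withHabana = true;
#else
constexpr bool withHabana = false;
#endif

int main() {
  if (withCPU || withHabana) {
    // Habana takes precedence when both backends are compiled in.
    auto kind = withHabana ? BackendKind::Habana : BackendKind::CPU;
    std::cout << "accelerated backend: "
              << (kind == BackendKind::Habana ? "Habana" : "CPU") << "\n";
  } else {
    std::cout << "interpreter only\n";
  }
}
```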

tests/unittests/BackendTestUtils.h

Lines changed: 4 additions & 0 deletions
@@ -58,6 +58,7 @@ class BackendStatelessTest : public ::testing::TestWithParam<BackendKind> {
   const BackendKind Interpreter = BackendKind::Interpreter;
   const BackendKind CPU = BackendKind::CPU;
   const BackendKind OpenCL = BackendKind::OpenCL;
+  const BackendKind Habana = BackendKind::Habana;
 };
 
 class BackendTest : public BackendStatelessTest {
@@ -79,6 +80,9 @@ static const auto all_backends = ::testing::Values(
 #ifdef GLOW_WITH_OPENCL
     BackendKind::OpenCL,
 #endif // GLOW_WITH_OPENCL
+#ifdef ENABLE_HABANA_IN_TESTS
+    BackendKind::Habana,
+#endif // ENABLE_HABANA_IN_TESTS
     BackendKind::Interpreter);
 
 // Instantiate parameterized test suite with all available backends.
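
The `all_backends` list feeds googletest's value-parameterized machinery, so every `TEST_P` in the suite runs once per compiled-in backend; the `#ifdef` guards simply drop entries whose backend was not built. A self-contained sketch (toy `BackendKind` enum; link against `gtest_main`; `INSTANTIATE_TEST_CASE_P` matches the googletest vintage Glow used at the time):

```cpp
#include <gtest/gtest.h>

enum class BackendKind { Interpreter, CPU, Habana }; // toy enum

class BackendTest : public ::testing::TestWithParam<BackendKind> {};

// Runs once per entry in the Values(...) list below.
TEST_P(BackendTest, RunsOnEveryBackend) {
  BackendKind kind = GetParam();
  EXPECT_TRUE(kind == BackendKind::Interpreter ||
              kind == BackendKind::Habana);
}

// Guarded entries drop out when the corresponding build flag is absent,
// exactly like the #ifdef blocks in the diff above.
static const auto allBackends = ::testing::Values(
#ifdef ENABLE_HABANA_IN_TESTS
    BackendKind::Habana,
#endif
    BackendKind::Interpreter);

INSTANTIATE_TEST_CASE_P(AllBackends, BackendTest, allBackends);
```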
