This repository was archived by the owner on Jul 1, 2025. It is now read-only.

Commit d67cfad

Merge glow-h changes in internal repo to github master (#2832)

*Description*: This commit brings over changes that were made internally at FB for glow-h.
*Testing*: ninja all && ninja test. Habana-specific testing will be done internally.

1 parent: d299b98

8 files changed (+136, -98 lines)

lib/Backends/Habana/Habana.cpp

Lines changed: 62 additions & 43 deletions
@@ -54,7 +54,7 @@ static synDataType getSynType(ElemKind kind) {
   case ElemKind::FloatTy:
     return syn_type_single;
   case ElemKind::Float16Ty:
-    return syn_type_half;
+    GLOW_UNREACHABLE("Unhandled ElemKind: Float16Ty");
   case ElemKind::Int8QTy:
     return syn_type_fixed;
   case ElemKind::Int16QTy:
@@ -92,18 +92,6 @@ static std::string getKernelName(llvm::StringRef kernelBase, ElemKind kind) {
   return std::string(kernelBase) + getKernelSuffix(kind);
 }
 
-/// If \p PH is an output placeholder, \returns the SaveNode.
-static SaveNode *getOutputSave(Function *F, Placeholder *PH) {
-  for (auto &use : PH->getUsers()) {
-    if (auto *save = llvm::dyn_cast<SaveNode>(use.getUser())) {
-      if (save->getParent() == F && save->getPlaceholder() == PH) {
-        return save;
-      }
-    }
-  }
-  return nullptr;
-}
-
 namespace {
 /// Parameters for pooling operation.
 struct synPoolParams {
@@ -217,23 +205,6 @@ class TensorHandle final {
     // Model params need to be floats, even if the tensor is integral or
     // quantized.
     if (ioType == IOType::Static) {
-      // Quantized types: dequantize into float buffer.
-      if (V->isQuantizedType()) {
-        // Check that a weight buffer was passed in; these are model params.
-        assert(!allocated_);
-        Type type = *V;
-        if (V->getElementType() == ElemKind::UInt8FusedQTy) {
-          // Fused quantized values just need to be passed through in raw form.
-          type = Type(ElemKind::Int8QTy, V->dims(), 1.0, 0);
-        }
-        Tensor DT = quantization::dequantizeTensor(Tensor(buffer_, &type),
-                                                   ElemKind::FloatTy);
-        auto bytes = DT.getSizeInBytes();
-        buffer_ = malloc(bytes);
-        memcpy(buffer_, DT.getUnsafePtr(), bytes);
-        allocated_ = true;
-      }
-
       // Int32ITy: Cast to floats.
       if (V->getElementType() == ElemKind::Int32ITy) {
         float *floats_ = (float *)malloc(V->size() * sizeof(float));
@@ -258,11 +229,19 @@ class TensorHandle final {
     // Create tensor descriptor, with quantization params if needed.
     synTensorDescriptor desc(elemType, rdims.size(), rdims.data(), buffer_,
                              synMemoryHost, false, name_.data());
-    if (V->isQuantizedType() &&
-        V->getElementType() != ElemKind::UInt8FusedQTy) {
-      desc.m_quantizationParams[0].m_zp = V->getOffset();
-      desc.m_quantizationParams[0].m_scale = V->getScale();
+    if (V->isQuantizedType()) {
+      if (V->getElementType() == ElemKind::UInt8FusedQTy) {
+        desc.m_quantizationParams[0].m_zp = 0;
+        desc.m_quantizationParams[0].m_scale = 1;
+      } else {
+        desc.m_quantizationParams[0].m_zp = V->getOffset();
+        desc.m_quantizationParams[0].m_scale = V->getScale();
+      }
+
       desc.m_quantizationParams[0].m_qDataType = elemType;
+      if (ioType == IOType::Static) {
+        desc.m_isQuantized = true;
+      }
     }
 
     chk(synCreateTensor(&desc, &tensor_, ioType == IOType::Output, false,
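
The descriptor logic above now covers all three quantization cases. Below is a minimal standalone sketch of that branch; `QuantParams`, `TensorInfo`, and `makeDescParams` are hypothetical stand-in names, not Glow or Synapse APIs. Fused-quantized tensors carry their scales and offsets inline in the payload, so the descriptor gets identity parameters; ordinary quantized tensors get their real scale and offset; and static (model-parameter) tensors are additionally flagged as already quantized.

```cpp
#include <cstdint>
#include <iostream>

struct QuantParams {
  int32_t zp = 0;
  float scale = 1.0f;
};

struct TensorInfo {
  bool quantized = false;
  bool fusedQuantized = false; // UInt8FusedQTy in Glow
  int32_t offset = 0;
  float scale = 1.0f;
};

enum class IOType { Static, Output, Default };

// Fused-quantized tensors keep their scales/offsets inline in the payload,
// so the descriptor gets identity params; ordinary quantized tensors get
// their real params; static tensors are flagged as already quantized.
QuantParams makeDescParams(const TensorInfo &t, IOType io, bool &isQuantized) {
  QuantParams p;
  if (t.quantized) {
    if (t.fusedQuantized) {
      p.zp = 0;
      p.scale = 1.0f;
    } else {
      p.zp = t.offset;
      p.scale = t.scale;
    }
    isQuantized = (io == IOType::Static);
  }
  return p;
}

int main() {
  TensorInfo t;
  t.quantized = true;
  t.offset = 3;
  t.scale = 0.5f;
  bool isQuantized = false;
  QuantParams p = makeDescParams(t, IOType::Static, isQuantized);
  std::cout << "zp=" << p.zp << " scale=" << p.scale
            << " isQuantized=" << isQuantized << "\n"; // zp=3 scale=0.5 ...=1
}
```
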
@@ -307,6 +286,9 @@ class TensorHandle final {
   /// Get the underlying data buffer.
   void *getData() const { return buffer_; }
 
+  /// Get the name of the managed tensor
+  const std::string &getName() const { return name_; }
+
   /// Get the dimensions of the stored tensor.
   llvm::ArrayRef<unsigned> dims() const { return dims_; }
 
@@ -665,15 +647,18 @@ allocateGraphTensors(Function *F) {
       continue;
     }
     if (auto *save = getOutputSave(F, V)) {
-      // We want to avoid emitting copies for save nodes by simply marking the
-      // save input as an "output" tensor. The exceptions are when the input
-      // is itself a placeholder/constant, or a reshape. (The reshape case is
-      // likely a Synapse bug.)
+      // Naively, we'd generate a memcpy for any SaveNode, but that's a waste
+      // so we want to avoid it. We can optimize it away by mapping the
+      // SaveNode's input node (N, below) to the output tensor, and then simply
+      // not generating a memcpy if the SaveNode itself has no associated
+      // tensor.
       auto *N = save->getInput().getNode();
-      Node *proxy =
-          (llvm::isa<Storage>(N) || llvm::isa<HabanaReshapeNode>(N)) ? save : N;
-      tensors.emplace(proxy, TensorHandle(V->getType(), V->getName(), nullptr,
-                                          IOType::Output));
+      if (llvm::isa<Storage>(N) || llvm::isa<HabanaReshapeNode>(N) ||
+          N->getNumUsers() > 1) {
+        N = save;
+      }
+      tensors.emplace(
+          N, TensorHandle(V->getType(), V->getName(), nullptr, IOType::Output));
     } else {
       tensors.emplace(V, TensorHandle(V->getType(), V->getName(), nullptr,
                                       IOType::Default));
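
The rewritten comment describes a copy-elision choice: map the SaveNode's input to the output tensor so the producer writes the output directly, unless the input is storage, a reshape, or has multiple users (in which case its value is still needed elsewhere and the save must copy). A toy sketch of just that decision, using a hypothetical `Node` struct rather than Glow's node classes:

```cpp
#include <cassert>
#include <string>
#include <vector>

struct Node {
  std::string kind;          // "storage", "reshape", "relu", "save", ...
  std::vector<Node *> users; // nodes consuming this node's result
  Node *input = nullptr;     // single input, for simplicity
};

// Returns the node that should own the output tensor for a given save.
Node *chooseOutputProxy(Node *save) {
  Node *n = save->input;
  if (n->kind == "storage" || n->kind == "reshape" || n->users.size() > 1) {
    return save; // fall back to copying: the save itself owns the output
  }
  return n; // optimization: the producer writes straight into the output
}

int main() {
  Node relu{"relu"}, save{"save"};
  save.input = &relu;
  relu.users = {&save};
  assert(chooseOutputProxy(&save) == &relu); // direct write, no memcpy

  Node other{"other"};
  relu.users = {&save, &other};              // the value is needed elsewhere
  assert(chooseOutputProxy(&save) == &save); // must copy
}
```
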
@@ -747,6 +732,7 @@ HabanaBackend::compile(Function *F, const BackendOptions &opts) const {
   std::vector<std::unique_ptr<ns_ConstantKernel::Params>> constantParams;
   std::vector<std::unique_ptr<ns_TileKernel::Params>> tileParams;
   std::vector<std::unique_ptr<unsigned>> concatParams;
+  std::vector<std::unique_ptr<ns_TakeKernel::Params>> takeParams;
 
   // Keep references to tensor pointer arrays passed into multi-input nodes
   // until the compilation is done.
@@ -755,6 +741,10 @@ HabanaBackend::compile(Function *F, const BackendOptions &opts) const {
   std::vector<TensorHandle> tempTensors;
 
   for (const auto &I : F->getNodes()) {
+    if (!isOpSupported(I)) {
+      llvm::errs() << "Unsupported operator: " << I.getDebugDesc() << "\n";
+      GLOW_UNREACHABLE("Unsupported operator");
+    }
     switch (I.getKind()) {
     case Kinded::Kind::HabanaFullyConnectedNodeKind: {
       auto *NI = llvm::cast<HabanaFullyConnectedNode>(&I);
@@ -1116,7 +1106,17 @@ HabanaBackend::compile(Function *F, const BackendOptions &opts) const {
           makeConcatParams(CI->getDim(), tensors[CI].dims().size());
       std::vector<synTensor> inputs;
       for (auto const &N : CI->getInputs()) {
-        inputs.push_back(tensors[N].get());
+        std::string memcpyNodeName =
+            llvm::formatv("{0}_memcpy_{1}", N.getNode()->getName(),
+                          inputs.size())
+                .str();
+        TensorHandle memcpy(N.getType(), memcpyNodeName);
+        chk(synCreateGenericNode(
+            &tensors[N].get(), &memcpy.get(), 1, 1, nullptr,
+            getKernelName("memcpy", N.getType()->getElementType()).c_str(),
+            memcpy.getName().c_str(), nullptr, nullptr));
+        inputs.push_back(memcpy.get());
+        tempTensors.emplace_back(std::move(memcpy));
       }
 
       chk(synCreateGenericNode(inputs.data(), &tensors[CI].get(), inputs.size(),
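
Each concat input is now routed through a dedicated `memcpy` kernel, so the concat consumes fresh intermediate tensors named `<producer>_memcpy_<index>`. A small sketch of the naming scheme only, using `std::string` in place of `llvm::formatv` and a stand-in `Tensor` struct:

```cpp
#include <iostream>
#include <string>
#include <vector>

struct Tensor {
  std::string name; // stand-in; a real handle would also carry type/dims
};

// Each input gets a fresh staging tensor named "<producer>_memcpy_<i>";
// the concat then reads the copies instead of the original tensors.
std::vector<Tensor> stageConcatInputs(const std::vector<Tensor> &inputs) {
  std::vector<Tensor> staged;
  for (const Tensor &in : inputs) {
    staged.push_back({in.name + "_memcpy_" + std::to_string(staged.size())});
  }
  return staged;
}

int main() {
  for (const Tensor &t : stageConcatInputs({{"lhs"}, {"rhs"}})) {
    std::cout << t.name << "\n"; // lhs_memcpy_0, rhs_memcpy_1
  }
}
```
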
@@ -1165,6 +1165,25 @@ HabanaBackend::compile(Function *F, const BackendOptions &opts) const {
       multiInputs.emplace_back(std::move(inputs));
       break;
     }
+    case Kinded::Kind::GatherNodeKind: {
+      auto *gather = llvm::cast<GatherNode>(&I);
+      std::vector<synTensor> inputs = {tensors[gather->getData()].get(),
+                                       tensors[gather->getIndices()].get()};
+
+      auto params = llvm::make_unique<ns_TakeKernel::Params>();
+      params->axis =
+          gather->getData().dims().size() - gather->getBatchDims() - 1;
+      params->mode = 0;
+
+      chk(synCreateGenericNode(
+          inputs.data(), &tensors[gather].get(), inputs.size(), 1, params.get(),
+          getKernelName("take", gather->getResult().getElementType()).c_str(),
+          gather->getName().data(), nullptr, nullptr));
+
+      multiInputs.emplace_back(std::move(inputs));
+      takeParams.emplace_back(std::move(params));
+      break;
+    }
     default: {
       llvm::errs() << "Unhandled node: " << I.getDebugDesc() << "\n";
       GLOW_UNREACHABLE("Unhandled node");

lib/Backends/Habana/Habana.h

Lines changed: 4 additions & 4 deletions
@@ -99,12 +99,12 @@ class HabanaIOBufferPool {
   /// construction. This is the effective size of one HabanaIOBuffer in this
   /// pool.
   size_t perBufferSize_;
-  /// The combined size of all HabanaIOBuffers in this pool (i.e. size_ *
-  /// numBuffers_).
+  /// The combined size of all HabanaIOBuffers in this pool (i.e. perBufferSize_
+  /// * numBuffers_).
   size_t allBuffersSize_;
   /// Buffer that backs all of the HabanaIOBuffers in this pool. The first buffer
-  /// starts at buffer_, the second at buffer_ + size_, etc. The last *ends* at
-  /// buffer_ + totalSize_.
+  /// starts at buffer_, the second at buffer_ + perBufferSize_, etc. The last
+  /// *ends* at buffer_ + allBuffersSize_.
   uint8_t *buffer_;
   /// The number of buffers in the pool.
   unsigned numBuffers_{kDefaultNumBuffers};
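
The corrected comments pin down the pool layout: one backing allocation of `perBufferSize_ * numBuffers_` bytes, with buffer `i` starting at `buffer_ + i * perBufferSize_`. A simplified, self-contained model of that layout (not the real `HabanaIOBufferPool` API, which hands out `HabanaIOBuffer` objects):

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

class IOBufferPool {
public:
  IOBufferPool(size_t perBufferSize, unsigned numBuffers)
      : perBufferSize_(perBufferSize),
        allBuffersSize_(perBufferSize * numBuffers),
        storage_(allBuffersSize_) {}

  // Buffer i starts at base + i * perBufferSize_; the last one ends exactly
  // at base + allBuffersSize_.
  uint8_t *buffer(unsigned i) { return storage_.data() + i * perBufferSize_; }
  size_t totalSize() const { return allBuffersSize_; }

private:
  size_t perBufferSize_;
  size_t allBuffersSize_;
  std::vector<uint8_t> storage_;
};

int main() {
  IOBufferPool pool(/*perBufferSize=*/4096, /*numBuffers=*/3);
  assert(pool.buffer(2) == pool.buffer(0) + 2 * 4096);
  assert(pool.totalSize() == 3 * 4096);
}
```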

lib/Backends/Habana/HabanaDeviceManager.cpp

Lines changed: 9 additions & 0 deletions
@@ -20,6 +20,9 @@
 
 #include "synapse.h"
 
+#include <glog/logging.h>
+#include <limits>
+
 using namespace glow;
 using namespace glow::runtime;
 
@@ -52,13 +55,18 @@ HabanaDeviceManager::HabanaDeviceManager(std::unique_ptr<DeviceConfig> config,
   // If this is the first HabanaDeviceManager to be created, initialize the
   // Synapse API.
   if (numActiveDevices_ == 0) {
+    LOG(INFO) << "Using version " << synGetVersion();
     chk(synInitialize());
   }
 
   numActiveDevices_++;
 }
 
 HabanaDeviceManager::~HabanaDeviceManager() {
+  // If a device was never successfully acquired, there's nothing to clean up.
+  if (deviceId_ == INVALID_DEVICE) {
+    return;
+  }
   std::lock_guard<std::mutex> lock(synapseMtx_);
   numActiveDevices_--;
 
@@ -282,6 +290,7 @@ void HabanaDeviceManager::runFunctionImpl(RunIdentifierTy runId,
     }
     inflightRequests_++;
   }
+
   // Execute the function.
   auto deviceBindings =
       llvm::make_unique<HabanaBindings>(deviceId_, topologyId);
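
The constructor/destructor pair above uses an init-once refcount: the first live manager initializes the shared Synapse API, and the count is maintained under `synapseMtx_` so teardown can be ordered. A toy version of the pattern built from standard library pieces only (`Manager` is a stand-in; the real code calls `chk(synInitialize())`):

```cpp
#include <cassert>
#include <mutex>

class Manager {
public:
  Manager() {
    std::lock_guard<std::mutex> lock(mtx_);
    if (numActive_ == 0) {
      apiInitialized_ = true; // stands in for chk(synInitialize())
    }
    numActive_++;
  }
  ~Manager() {
    std::lock_guard<std::mutex> lock(mtx_);
    numActive_--; // the full diff also releases per-device state here
  }
  static bool apiInitialized() { return apiInitialized_; }

private:
  static std::mutex mtx_;
  static unsigned numActive_;
  static bool apiInitialized_;
};

std::mutex Manager::mtx_;
unsigned Manager::numActive_ = 0;
bool Manager::apiInitialized_ = false;

int main() {
  Manager a, b; // only the first construction initializes the shared API
  assert(Manager::apiInitialized());
}
```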

lib/Backends/Habana/HabanaDeviceManager.h

Lines changed: 10 additions & 3 deletions
@@ -34,8 +34,15 @@ namespace runtime {
 /// This class implements the DeviceManager interface for
 /// Habana devices.
 class HabanaDeviceManager : public DeviceManager {
+  using DeviceId = uint32_t;
+  using TopologyId = uint64_t;
+
+  static constexpr auto INVALID_DEVICE = std::numeric_limits<DeviceId>::max();
+  static constexpr auto INVALID_TOPOLOGY =
+      std::numeric_limits<TopologyId>::max();
+
   /// The ID of the device managed by this instance.
-  uint32_t deviceId_{0};
+  DeviceId deviceId_{INVALID_DEVICE};
   /// The available memory on the device.
   uint64_t freeMemory_{0};
   /// The total memory on the device.
@@ -57,8 +64,8 @@ class HabanaDeviceManager : public DeviceManager {
   /// The number of workers in wait pool.
   unsigned numWaiters_{kNumWaiters};
 
-  /// Track active topology on this device. -1 is invalid.
-  uint64_t activeTopo_{(uint64_t)-1};
+  /// Track active topology on this device.
+  TopologyId activeTopo_{INVALID_TOPOLOGY};
   /// Number of requests in flight. Used to block topo switching.
   unsigned inflightRequests_{0};
   /// Condition variable for signaling queue drain.
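
The header now replaces the magic `(uint64_t)-1` with named sentinels from `std::numeric_limits`, and initializes `deviceId_` to `INVALID_DEVICE` instead of `0`, so device 0 is no longer conflated with "no device"; the destructor guard in HabanaDeviceManager.cpp relies on this. A compact sketch of the idiom (toy `Device` class, not the real manager):

```cpp
#include <cstdint>
#include <iostream>
#include <limits>

class Device {
  using DeviceId = uint32_t;
  // Named sentinel instead of a magic (uint32_t)-1 or a default of 0.
  static constexpr DeviceId INVALID_DEVICE =
      std::numeric_limits<DeviceId>::max();

public:
  void acquire(DeviceId id) { deviceId_ = id; }

  ~Device() {
    if (deviceId_ == INVALID_DEVICE) {
      return; // acquisition never happened; nothing to release
    }
    std::cout << "releasing device " << deviceId_ << "\n";
  }

private:
  DeviceId deviceId_{INVALID_DEVICE};
};

int main() {
  Device never; // destructor is a no-op
  Device acquired;
  acquired.acquire(0); // id 0 is now a legal device id
}
```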

lib/Base/Tensor.cpp

Lines changed: 4 additions & 0 deletions
@@ -148,6 +148,8 @@ static void dumpGenericImpl(Handle<ElemTy> handle, llvm::raw_ostream &os,
   }
 
   os << "]\n";
+
+  os.flush();
 }
 
 template <class ElemTy>
@@ -188,6 +190,8 @@ static void dumpAsciiGenericImpl(Handle<ElemTy> handle, llvm::raw_ostream &os) {
   } else {
     llvm_unreachable("Invalid tensor size");
   }
+
+  os.flush();
 }
 
 /// This is a slow generic transpose. This method performs a single for loop
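
Both dump helpers now end with `os.flush()`. The motivation, as far as the diff shows, is the usual one for buffered streams: without a flush, a dump can appear truncated after a crash or interleave badly with writes to other streams. Illustrated with standard iostreams (`llvm::raw_ostream` buffers similarly):

```cpp
#include <iostream>

int main() {
  // Buffered stream: without flush(), this line may still sit in the buffer
  // when the next write (or a crash) happens.
  std::cout << "tensor dump: [1.0, 2.0, 3.0]\n";
  std::cout.flush(); // make the dump visible immediately, as the diff does

  std::cerr << "unbuffered diagnostic\n"; // stderr writes through right away
}
```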

lib/Onnxifi/onnxifiGlow.cpp

Lines changed: 11 additions & 3 deletions
@@ -59,6 +59,11 @@ GLOW_ONNXIFI_LIBRARY_FUNCTION_WRAPPER(onnxGetBackendIDs)(
 #else
   constexpr bool withCPU = false;
 #endif
+#ifdef GLOW_WITH_HABANA
+  constexpr bool withHabana = true;
+#else
+  constexpr bool withHabana = false;
+#endif
 
   // Only return quantization backend if GLOW_DUMP_PROFILE.
   if (getenv("GLOW_DUMP_PROFILE")) {
@@ -79,21 +84,24 @@ GLOW_ONNXIFI_LIBRARY_FUNCTION_WRAPPER(onnxGetBackendIDs)(
 
     backendIDs[0] = quantizationBackendOnnx;
     backendIDs[1] = quantizationBackendC2;
-  } else if (withCPU) {
+  } else if (withCPU || withHabana) {
     *numBackends = 4;
 
+    auto backendKind =
+        withHabana ? glow::BackendKind::Habana : glow::BackendKind::CPU;
+
     // In case backendIDs is nullptr or does not have enough capacity just
     // return the total number of supported backends.
     if (numBackendsCapacity < *numBackends || !backendIDs) {
      return ONNXIFI_STATUS_FALLBACK;
    }
 
-    auto *cpuBackendOnnx = manager.createBackendId(glow::BackendKind::CPU,
+    auto *cpuBackendOnnx = manager.createBackendId(backendKind,
                                                    /*useOnnx*/ true);
     auto *interpreterBackendOnnx =
         manager.createBackendId(glow::BackendKind::Interpreter,
                                 /*useOnnx*/ true);
-    auto *cpuBackendC2 = manager.createBackendId(glow::BackendKind::CPU,
+    auto *cpuBackendC2 = manager.createBackendId(backendKind,
                                                  /*useOnnx*/ false);
     auto *interpreterBackendC2 =
         manager.createBackendId(glow::BackendKind::Interpreter,
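
With this change, the two non-interpreter backend IDs are created with a single `backendKind` that prefers Habana over CPU when both are compiled in. A condensed sketch of the selection (the `GLOW_WITH_*` macro names match the diff; `BackendKind` here is a stand-in enum, and real builds define the macros via the build system):

```cpp
#include <iostream>

enum class BackendKind { Interpreter, CPU, Habana }; // stand-in enum

#ifdef GLOW_WITH_CPU
constexpr bool withCPU = true;
#else
constexpr bool withCPU = false;
#endif
#ifdef GLOW_WITH_HABANA
constexpr bool withHabana = true;
#else
constexpr bool withHabana = false;
#endif

int main() {
  if (withCPU || withHabana) {
    // Habana takes precedence when both backends are compiled in.
    auto kind = withHabana ? BackendKind::Habana : BackendKind::CPU;
    std::cout << "accelerated backend: "
              << (kind == BackendKind::Habana ? "Habana" : "CPU") << "\n";
  } else {
    std::cout << "interpreter only\n";
  }
}
```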

tests/unittests/BackendTestUtils.h

Lines changed: 4 additions & 0 deletions
@@ -58,6 +58,7 @@ class BackendStatelessTest : public ::testing::TestWithParam<BackendKind> {
   const BackendKind Interpreter = BackendKind::Interpreter;
   const BackendKind CPU = BackendKind::CPU;
   const BackendKind OpenCL = BackendKind::OpenCL;
+  const BackendKind Habana = BackendKind::Habana;
 };
 
 class BackendTest : public BackendStatelessTest {
@@ -79,6 +80,9 @@ static const auto all_backends = ::testing::Values(
 #ifdef GLOW_WITH_OPENCL
     BackendKind::OpenCL,
 #endif // GLOW_WITH_OPENCL
+#ifdef ENABLE_HABANA_IN_TESTS
+    BackendKind::Habana,
+#endif // ENABLE_HABANA_IN_TESTS
     BackendKind::Interpreter);
 
 // Instantiate parameterized test suite with all available backends.
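
The `all_backends` list feeds googletest's value-parameterized machinery, so every `TEST_P` in the suite runs once per compiled-in backend; the `#ifdef` guards simply drop entries whose backend was not built. A self-contained sketch (toy `BackendKind` enum; link against `gtest_main`; `INSTANTIATE_TEST_CASE_P` matches the googletest vintage Glow used at the time):

```cpp
#include <gtest/gtest.h>

enum class BackendKind { Interpreter, CPU, Habana }; // toy enum

class BackendTest : public ::testing::TestWithParam<BackendKind> {};

// Runs once per entry in the Values(...) list below.
TEST_P(BackendTest, RunsOnEveryBackend) {
  BackendKind kind = GetParam();
  EXPECT_TRUE(kind == BackendKind::Interpreter ||
              kind == BackendKind::Habana);
}

// Guarded entries drop out when the corresponding build flag is absent,
// exactly like the #ifdef blocks in the diff above.
static const auto allBackends = ::testing::Values(
#ifdef ENABLE_HABANA_IN_TESTS
    BackendKind::Habana,
#endif
    BackendKind::Interpreter);

INSTANTIATE_TEST_CASE_P(AllBackends, BackendTest, allBackends);
```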
