From ccbe5fcf73c50e6bd6f8cd1994bdd2fcd8f9ccf6 Mon Sep 17 00:00:00 2001
From: John McCall
Date: Fri, 27 Jan 2017 17:52:24 -0500
Subject: [PATCH] Switch MetadataCache to use a global slab allocator.

This seems to more than fix a performance regression that we detected
on a metadata-allocation microbenchmark.

A few months ago, I improved the metadata cache representation and
changed the metadata allocation scheme to primarily use malloc.
Previously, we'd been using malloc in the concurrent tree data
structure but a per-cache slab allocator for the metadata itself.
At the time, I was concerned about the overhead of per-cache
allocators, since many metadata patterns see only a small number of
instantiations.  That's still an important factor, so in the new
scheme we're using a global allocator; but instead of using malloc
for individual allocations, we're using a slab allocator, which
should have better peak, single-thread performance, at the cost of
not easily supporting deallocation.  Deallocation is only used for
metadata when there's contention on the cache, and specifically only
when there's contention for the same key, so leaking a little isn't
the worst thing in the world.

The initial slab is a 64K globally-allocated buffer.  Successive
slabs are 16K and allocated with malloc.

rdar://28189496
---
 cmake/modules/AddSwift.cmake          |  2 +-
 cmake/modules/AddSwiftUnittests.cmake |  3 +
 include/swift/Runtime/Concurrent.h    |  5 +-
 stdlib/public/runtime/Metadata.cpp    | 96 +++++++++++++++++++++++++++
 stdlib/public/runtime/MetadataCache.h | 17 +++--
 utils/gen-static-stdlib-link-args     |  1 +
 utils/static-executable-args.lnk      |  1 +
 7 files changed, 117 insertions(+), 8 deletions(-)

diff --git a/cmake/modules/AddSwift.cmake b/cmake/modules/AddSwift.cmake
index 60b5b573cf327..8a13232c2e1ec 100644
--- a/cmake/modules/AddSwift.cmake
+++ b/cmake/modules/AddSwift.cmake
@@ -329,7 +329,7 @@ function(_add_variant_link_flags)
     RESULT_VAR_NAME result)
 
   if("${LFLAGS_SDK}" STREQUAL "LINUX")
-    list(APPEND result "-lpthread" "-ldl")
+    list(APPEND result "-lpthread" "-latomic" "-ldl")
   elseif("${LFLAGS_SDK}" STREQUAL "FREEBSD")
     list(APPEND result "-lpthread")
   elseif("${LFLAGS_SDK}" STREQUAL "CYGWIN")
diff --git a/cmake/modules/AddSwiftUnittests.cmake b/cmake/modules/AddSwiftUnittests.cmake
index 27b57d44bd8bb..4de5361e22c21 100644
--- a/cmake/modules/AddSwiftUnittests.cmake
+++ b/cmake/modules/AddSwiftUnittests.cmake
@@ -42,6 +42,9 @@ function(add_swift_unittest test_dirname)
   if("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin")
     set_property(TARGET "${test_dirname}" APPEND_STRING PROPERTY LINK_FLAGS
                  " -Xlinker -rpath -Xlinker ${SWIFT_LIBRARY_OUTPUT_INTDIR}/swift/macosx")
+  elseif("${CMAKE_SYSTEM_NAME}" STREQUAL "Linux")
+    set_property(TARGET "${test_dirname}" APPEND_STRING PROPERTY
+                 LINK_FLAGS " -latomic")
   endif()
 
   if(SWIFT_ENABLE_GOLD_LINKER AND
diff --git a/include/swift/Runtime/Concurrent.h b/include/swift/Runtime/Concurrent.h
index 48e37ab40fd59..1fe1bca7bccd0 100644
--- a/include/swift/Runtime/Concurrent.h
+++ b/include/swift/Runtime/Concurrent.h
@@ -189,8 +189,9 @@ class ConcurrentMapBase : protected Allocator {
     // Destroy the node's payload.
     node->~Node();
 
-    // Deallocate the node.
-    this->Deallocate(node, allocSize);
+    // Deallocate the node.  The static_cast here is required
+    // because LLVM's allocator API is insane.
+    this->Deallocate(static_cast<void*>(node), allocSize);
   }
 };
 
diff --git a/stdlib/public/runtime/Metadata.cpp b/stdlib/public/runtime/Metadata.cpp
index 59c317ef183d7..57e80fa123758 100644
--- a/stdlib/public/runtime/Metadata.cpp
+++ b/stdlib/public/runtime/Metadata.cpp
@@ -2809,3 +2809,99 @@ swift::swift_getGenericWitnessTable(GenericWitnessTable *genericTable,
 }
 
 uint64_t swift::RelativeDirectPointerNullPtr = 0;
+
+/***************************************************************************/
+/*** Allocator implementation **********************************************/
+/***************************************************************************/
+
+namespace {
+  struct PoolRange {
+    static constexpr uintptr_t PageSize = 16 * 1024;
+    static constexpr uintptr_t MaxPoolAllocationSize = PageSize / 2;
+
+    /// The start of the allocation.
+    char *Begin;
+
+    /// The number of bytes remaining.
+    size_t Remaining;
+  };
+}
+
+// A statically-allocated pool.  It's zero-initialized, so this
+// doesn't cost us anything in binary size.
+LLVM_ALIGNAS(alignof(void*)) static char InitialAllocationPool[64*1024];
+static std::atomic<PoolRange>
+AllocationPool{PoolRange{InitialAllocationPool,
+                         sizeof(InitialAllocationPool)}};
+
+void *MetadataAllocator::Allocate(size_t size, size_t alignment) {
+  assert(alignment <= alignof(void*));
+  assert(size % alignof(void*) == 0);
+
+  // If the size is larger than the maximum, just use malloc.
+  if (size > PoolRange::MaxPoolAllocationSize)
+    return malloc(size);
+
+  // Allocate out of the pool.
+  PoolRange curState = AllocationPool.load(std::memory_order_relaxed);
+  while (true) {
+    char *allocation;
+    PoolRange newState;
+    bool allocatedNewPage;
+
+    // Try to allocate out of the current page.
+    if (size <= curState.Remaining) {
+      allocatedNewPage = false;
+      allocation = curState.Begin;
+      newState = PoolRange{curState.Begin + size, curState.Remaining - size};
+    } else {
+      allocatedNewPage = true;
+      allocation = new char[PoolRange::PageSize];
+      newState = PoolRange{allocation + size, PoolRange::PageSize - size};
+      __asan_poison_memory_region(allocation, PoolRange::PageSize);
+    }
+
+    // Swap in the new state.
+    if (std::atomic_compare_exchange_weak_explicit(&AllocationPool,
+                                                   &curState, newState,
+                                                   std::memory_order_relaxed,
+                                                   std::memory_order_relaxed)) {
+      // If that succeeded, we've successfully allocated.
+      __msan_allocated_memory(allocation, size);
+      __asan_unpoison_memory_region(allocation, size);
+      return allocation;
+    }
+
+    // If it failed, go back to a neutral state and try again.
+    if (allocatedNewPage) {
+      delete[] allocation;
+    }
+  }
+}
+
+void MetadataAllocator::Deallocate(const void *allocation, size_t size) {
+  __asan_poison_memory_region(allocation, size);
+
+  if (size > PoolRange::MaxPoolAllocationSize) {
+    free(const_cast<void*>(allocation));
+    return;
+  }
+
+  // Check whether the allocation pool is still in the state it was in
+  // immediately after the given allocation.
+  PoolRange curState = AllocationPool.load(std::memory_order_relaxed);
+  if (reinterpret_cast<const char*>(allocation) + size != curState.Begin) {
+    return;
+  }
+
+  // Try to swap back to the pre-allocation state.  If this fails,
+  // don't bother trying again; we'll just leak the allocation.
+  PoolRange newState = { reinterpret_cast<char*>(const_cast<void*>(allocation)),
+                         curState.Remaining + size };
+  (void)
+    std::atomic_compare_exchange_strong_explicit(&AllocationPool,
+                                                 &curState, newState,
+                                                 std::memory_order_relaxed,
+                                                 std::memory_order_relaxed);
+}
+
diff --git a/stdlib/public/runtime/MetadataCache.h b/stdlib/public/runtime/MetadataCache.h
index aca46968e63e0..2d4e6eb381c78 100644
--- a/stdlib/public/runtime/MetadataCache.h
+++ b/stdlib/public/runtime/MetadataCache.h
@@ -26,11 +26,18 @@
 
 namespace swift {
 
-// For now, use malloc and free as our standard allocator for
-// metadata caches.  It might make sense in the future to take
-// advantage of the fact that we know that most allocations here
-// won't ever be deallocated.
-using MetadataAllocator = llvm::MallocAllocator;
+class MetadataAllocator : public llvm::AllocatorBase<MetadataAllocator> {
+public:
+  void Reset() {}
+
+  LLVM_ATTRIBUTE_RETURNS_NONNULL void *Allocate(size_t size, size_t alignment);
+  using AllocatorBase<MetadataAllocator>::Allocate;
+
+  void Deallocate(const void *Ptr, size_t size);
+  using AllocatorBase<MetadataAllocator>::Deallocate;
+
+  void PrintStats() const {}
+};
 
 /// A typedef for simple global caches.
 template <class EntryTy>
diff --git a/utils/gen-static-stdlib-link-args b/utils/gen-static-stdlib-link-args
index 39e518d22b655..ef1423b11c0a3 100755
--- a/utils/gen-static-stdlib-link-args
+++ b/utils/gen-static-stdlib-link-args
@@ -62,6 +62,7 @@ function write_linkfile {
 -ldl
 -lpthread
 -lswiftCore
+-latomic
 -lswiftImageInspectionShared
 $ICU_LIBS
 -Xlinker
diff --git a/utils/static-executable-args.lnk b/utils/static-executable-args.lnk
index 6987433fc6185..ff408601185c4 100644
--- a/utils/static-executable-args.lnk
+++ b/utils/static-executable-args.lnk
@@ -8,6 +8,7 @@
 -Xlinker
 --defsym=__import_pthread_key_create=pthread_key_create
 -lpthread
+-latomic
 -licui18n
 -licuuc
 -licudata
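
For anyone who wants to experiment with the allocation scheme described in
the commit message without building the runtime, here is a minimal standalone
sketch of the same idea: a statically-allocated initial pool, a lock-free bump
pointer advanced by compare-and-swap, and malloc'd 16K slabs once the current
slab runs out.  The names poolAllocate, InitialPool, and Pool are illustrative
and do not appear in the patch; the malloc fallback for oversized requests,
the deallocation path, and the sanitizer hooks from Metadata.cpp are omitted.

#include <atomic>
#include <cassert>
#include <cstddef>
#include <cstdlib>

// Illustrative sketch only; mirrors the PoolRange/CAS loop added to
// stdlib/public/runtime/Metadata.cpp in this patch, with details omitted.
struct PoolRange {
  static constexpr size_t PageSize = 16 * 1024;
  char *Begin;       // start of the unallocated tail of the current slab
  size_t Remaining;  // bytes left in the current slab
};

// The initial slab lives in the binary's zero-fill segment, so it costs
// nothing on disk; later slabs come from malloc.
alignas(void *) static char InitialPool[64 * 1024];
static std::atomic<PoolRange> Pool{PoolRange{InitialPool, sizeof(InitialPool)}};

void *poolAllocate(size_t size) {
  assert(size % alignof(void *) == 0 && "size must be pointer-aligned");
  assert(size <= PoolRange::PageSize &&
         "oversized requests go straight to malloc in the real allocator");
  PoolRange cur = Pool.load(std::memory_order_relaxed);
  while (true) {
    char *allocation;
    PoolRange next;
    bool allocatedNewSlab = false;

    if (size <= cur.Remaining) {
      // Bump-allocate out of the current slab.
      allocation = cur.Begin;
      next = PoolRange{cur.Begin + size, cur.Remaining - size};
    } else {
      // Start a fresh slab; whatever was left of the old one is abandoned.
      allocatedNewSlab = true;
      allocation = static_cast<char *>(malloc(PoolRange::PageSize));
      next = PoolRange{allocation + size, PoolRange::PageSize - size};
    }

    // Publish the new pool state.  On failure, cur is reloaded with whatever
    // state another thread installed and we simply retry.
    if (Pool.compare_exchange_weak(cur, next, std::memory_order_relaxed,
                                   std::memory_order_relaxed))
      return allocation;

    if (allocatedNewSlab)
      free(allocation);
  }
}

Note that PoolRange is two pointer-sized words, so std::atomic<PoolRange>
needs a double-word compare-and-swap; on Linux, GCC typically lowers that to
libatomic calls, which appears to be why the patch also adds -latomic to the
stdlib link flags, the static-link argument files, and the unit-test link
lines.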