improve speed of the Gamma loss function

paulbkoch · paulbkoch · commit 66fa2aea3bff · 2024-09-27T18:05:00.000-07:00
diff --git a/shared/libebm/compute/avx2_ebm/avx2_32.cpp b/shared/libebm/compute/avx2_ebm/avx2_32.cpp
@@ -34,12 +34,24 @@ namespace DEFINED_ZONE_NAME {
 #error DEFINED_ZONE_NAME must be defined
 #endif // DEFINED_ZONE_NAME
 
-// this is super-special and included inside the zone namespace
-#include "objective_registrations.hpp"
-
 static constexpr size_t k_cAlignment = 32;
-
 struct alignas(k_cAlignment) Avx2_32_Float;
+struct alignas(k_cAlignment) Avx2_32_Int;
+
+template<bool bNegateInput = false,
+      bool bNaNPossible = true,
+      bool bUnderflowPossible = true,
+      bool bOverflowPossible = true>
+inline Avx2_32_Float Exp(const Avx2_32_Float& val) noexcept;
+template<bool bNegateOutput = false,
+      bool bNaNPossible = true,
+      bool bNegativePossible = true,
+      bool bZeroPossible = true,
+      bool bPositiveInfinityPossible = true>
+inline Avx2_32_Float Log(const Avx2_32_Float& val) noexcept;
+
+// this is super-special and included inside the zone namespace
+#include "objective_registrations.hpp"
 
 struct alignas(k_cAlignment) Avx2_32_Int final {
    friend Avx2_32_Float;
@@ -138,18 +150,6 @@ struct alignas(k_cAlignment) Avx2_32_Int final {
 static_assert(std::is_standard_layout<Avx2_32_Int>::value && std::is_trivially_copyable<Avx2_32_Int>::value,
       "This allows offsetof, memcpy, memset, inter-language, GPU and cross-machine use where needed");
 
-template<bool bNegateInput = false,
-      bool bNaNPossible = true,
-      bool bUnderflowPossible = true,
-      bool bOverflowPossible = true>
-inline Avx2_32_Float Exp(const Avx2_32_Float& val) noexcept;
-template<bool bNegateOutput = false,
-      bool bNaNPossible = true,
-      bool bNegativePossible = true,
-      bool bZeroPossible = true,
-      bool bPositiveInfinityPossible = true>
-inline Avx2_32_Float Log(const Avx2_32_Float& val) noexcept;
-
 struct alignas(k_cAlignment) Avx2_32_Float final {
    template<bool bNegateInput, bool bNaNPossible, bool bUnderflowPossible, bool bOverflowPossible>
    friend Avx2_32_Float Exp(const Avx2_32_Float& val) noexcept;
diff --git a/shared/libebm/compute/avx512f_ebm/avx512f_32.cpp b/shared/libebm/compute/avx512f_ebm/avx512f_32.cpp
@@ -34,12 +34,24 @@ namespace DEFINED_ZONE_NAME {
 #error DEFINED_ZONE_NAME must be defined
 #endif // DEFINED_ZONE_NAME
 
-// this is super-special and included inside the zone namespace
-#include "objective_registrations.hpp"
-
 static constexpr size_t k_cAlignment = 64;
-
 struct alignas(k_cAlignment) Avx512f_32_Float;
+struct alignas(k_cAlignment) Avx512f_32_Int;
+
+template<bool bNegateInput = false,
+      bool bNaNPossible = true,
+      bool bUnderflowPossible = true,
+      bool bOverflowPossible = true>
+inline Avx512f_32_Float Exp(const Avx512f_32_Float& val) noexcept;
+template<bool bNegateOutput = false,
+      bool bNaNPossible = true,
+      bool bNegativePossible = true,
+      bool bZeroPossible = true,
+      bool bPositiveInfinityPossible = true>
+inline Avx512f_32_Float Log(const Avx512f_32_Float& val) noexcept;
+
+// this is super-special and included inside the zone namespace
+#include "objective_registrations.hpp"
 
 struct alignas(k_cAlignment) Avx512f_32_Int final {
    friend Avx512f_32_Float;
@@ -152,18 +164,6 @@ struct alignas(k_cAlignment) Avx512f_32_Int final {
 static_assert(std::is_standard_layout<Avx512f_32_Int>::value && std::is_trivially_copyable<Avx512f_32_Int>::value,
       "This allows offsetof, memcpy, memset, inter-language, GPU and cross-machine use where needed");
 
-template<bool bNegateInput = false,
-      bool bNaNPossible = true,
-      bool bUnderflowPossible = true,
-      bool bOverflowPossible = true>
-inline Avx512f_32_Float Exp(const Avx512f_32_Float& val) noexcept;
-template<bool bNegateOutput = false,
-      bool bNaNPossible = true,
-      bool bNegativePossible = true,
-      bool bZeroPossible = true,
-      bool bPositiveInfinityPossible = true>
-inline Avx512f_32_Float Log(const Avx512f_32_Float& val) noexcept;
-
 struct alignas(k_cAlignment) Avx512f_32_Float final {
    template<bool bNegateInput, bool bNaNPossible, bool bUnderflowPossible, bool bOverflowPossible>
    friend Avx512f_32_Float Exp(const Avx512f_32_Float& val) noexcept;
diff --git a/shared/libebm/compute/cpu_ebm/cpu_64.cpp b/shared/libebm/compute/cpu_ebm/cpu_64.cpp
@@ -31,11 +31,24 @@ namespace DEFINED_ZONE_NAME {
 #error DEFINED_ZONE_NAME must be defined
 #endif // DEFINED_ZONE_NAME
 
+struct Cpu_64_Float;
+struct Cpu_64_Int;
+
+template<bool bNegateInput = false,
+      bool bNaNPossible = true,
+      bool bUnderflowPossible = true,
+      bool bOverflowPossible = true>
+inline Cpu_64_Float Exp(const Cpu_64_Float& val) noexcept;
+template<bool bNegateOutput = false,
+      bool bNaNPossible = true,
+      bool bNegativePossible = true,
+      bool bZeroPossible = true,
+      bool bPositiveInfinityPossible = true>
+inline Cpu_64_Float Log(const Cpu_64_Float& val) noexcept;
+
 // this is super-special and included inside the zone namespace
 #include "objective_registrations.hpp"
 
-struct Cpu_64_Float;
-
 struct Cpu_64_Int final {
    friend Cpu_64_Float;
    friend inline Cpu_64_Float IfEqual(const Cpu_64_Int& cmp1,
@@ -96,18 +109,6 @@ struct Cpu_64_Int final {
 static_assert(std::is_standard_layout<Cpu_64_Int>::value && std::is_trivially_copyable<Cpu_64_Int>::value,
       "This allows offsetof, memcpy, memset, inter-language, GPU and cross-machine use where needed");
 
-template<bool bNegateInput = false,
-      bool bNaNPossible = true,
-      bool bUnderflowPossible = true,
-      bool bOverflowPossible = true>
-inline Cpu_64_Float Exp(const Cpu_64_Float& val) noexcept;
-template<bool bNegateOutput = false,
-      bool bNaNPossible = true,
-      bool bNegativePossible = true,
-      bool bZeroPossible = true,
-      bool bPositiveInfinityPossible = true>
-inline Cpu_64_Float Log(const Cpu_64_Float& val) noexcept;
-
 struct Cpu_64_Float final {
    template<bool bNegateInput, bool bNaNPossible, bool bUnderflowPossible, bool bOverflowPossible>
    friend Cpu_64_Float Exp(const Cpu_64_Float& val) noexcept;
diff --git a/shared/libebm/compute/objectives/GammaDevianceRegressionObjective.hpp b/shared/libebm/compute/objectives/GammaDevianceRegressionObjective.hpp
@@ -52,25 +52,23 @@ template<typename TFloat> struct GammaDevianceRegressionObjective : RegressionOb
    inline double FinishMetric(const double metricSum) const noexcept { return 2.0 * metricSum; }
 
    GPU_DEVICE inline TFloat CalcMetric(const TFloat& score, const TFloat& target) const noexcept {
-      const TFloat prediction = Exp(score); // log link function
-      const TFloat frac = target / prediction;
+      const TFloat invPrediction = Exp<true>(score); // bNegateInput=true: exp(-score), the reciprocal of the log-link prediction
+      const TFloat frac = target * invPrediction; // target / prediction, computed without a division
       const TFloat metric = frac - 1.0 - Log(frac);
       return metric;
    }
 
    GPU_DEVICE inline TFloat CalcGradient(const TFloat& score, const TFloat& target) const noexcept {
-      const TFloat prediction = Exp(score); // log link function
-      const TFloat frac = target / prediction;
-      const TFloat gradient = 1.0 - frac;
+      const TFloat invPrediction = Exp<true>(score); // bNegateInput=true: exp(-score), the reciprocal of the log-link prediction
+      const TFloat gradient = FusedNegateMultiplyAdd(target, invPrediction, 1.0); // intended as 1.0 - target * invPrediction (helper defined elsewhere; confirm operand order)
       return gradient;
    }
 
    GPU_DEVICE inline GradientHessian<TFloat> CalcGradientHessian(
          const TFloat& score, const TFloat& target) const noexcept {
-      const TFloat prediction = Exp(score); // log link function
-      const TFloat frac = target / prediction;
-      const TFloat gradient = 1.0 - frac;
-      const TFloat hessian = frac;
+      const TFloat invPrediction = Exp<true>(score); // bNegateInput=true: exp(-score), the reciprocal of the log-link prediction
+      const TFloat gradient = FusedNegateMultiplyAdd(target, invPrediction, 1.0); // intended as 1.0 - target * invPrediction (helper defined elsewhere; confirm operand order)
+      const TFloat hessian = target * invPrediction; // target / prediction, computed without a division
       return MakeGradientHessian(gradient, hessian);
    }
 };