diff --git a/include/blas_helper.cuh b/include/blas_helper.cuh
index 647290ee98..4dad754964 100644
--- a/include/blas_helper.cuh
+++ b/include/blas_helper.cuh
@@ -79,6 +79,18 @@ namespace quda
   template <> struct VectorType<int8_t, 24> {
     using type = array<int8_t, 24>;
   };
+  template <> struct VectorType<double, 12> {
+    using type = array<double, 12>;
+  };
+  template <> struct VectorType<float, 12> {
+    using type = array<float, 12>;
+  };
+  template <> struct VectorType<short, 12> {
+    using type = array<short, 12>;
+  };
+  template <> struct VectorType<int8_t, 12> {
+    using type = array<int8_t, 12>;
+  };
   template <> struct VectorType<double, 6> {
     using type = array<double, 6>;
   };
@@ -343,37 +355,49 @@ namespace quda
 
     // native ordering
     template <> constexpr int n_vector<double, true, 4, false>() { return 2; }
+    template <> constexpr int n_vector<double, true, 2, false>() { return 2; }
     template <> constexpr int n_vector<double, true, 1, false>() { return 2; }
 
     template <> constexpr int n_vector<double, true, 4, true>() { return 2; }
+    template <> constexpr int n_vector<double, true, 2, true>() { return 2; }
     template <> constexpr int n_vector<double, true, 1, true>() { return 2; }
 
     template <> constexpr int n_vector<float, true, 4, false>() { return 4; }
-    template <> constexpr int n_vector<float, true, 1, false>() { return 4; }
+    template <> constexpr int n_vector<float, true, 2, false>() { return 4; }
+    template <> constexpr int n_vector<float, true, 1, false>() { return 4; } // TODO: correct?
 
     template <> constexpr int n_vector<float, true, 4, true>() { return 4; }
+    template <> constexpr int n_vector<float, true, 2, true>() { return QUDA_ORDER_SP_MG; }
     template <> constexpr int n_vector<float, true, 1, true>() { return 2; }
 
     template <> constexpr int n_vector<short, true, 4, true>() { return QUDA_ORDER_FP; }
+    template <> constexpr int n_vector<short, true, 2, true>() { return QUDA_ORDER_FP_MG; }
     template <> constexpr int n_vector<short, true, 1, true>() { return 2; }
 
     template <> constexpr int n_vector<int8_t, true, 4, true>() { return QUDA_ORDER_FP; }
+    template <> constexpr int n_vector<int8_t, true, 2, true>() { return QUDA_ORDER_FP_MG; }
     template <> constexpr int n_vector<int8_t, true, 1, true>() { return 2; }
 
     // Just use float-2/float-4 ordering on CPU when not site unrolling
     template <> constexpr int n_vector<double, false, 4, false>() { return 2; }
+    template <> constexpr int n_vector<double, false, 2, false>() { return 2; }
     template <> constexpr int n_vector<double, false, 1, false>() { return 2; }
     template <> constexpr int n_vector<float, false, 4, false>() { return 4; }
+    template <> constexpr int n_vector<float, false, 2, false>() { return 4; }
     template <> constexpr int n_vector<float, false, 1, false>() { return 4; }
 
     // AoS ordering is used on CPU uses when we are site unrolling
     template <> constexpr int n_vector<double, false, 4, true>() { return 24; }
+    template <> constexpr int n_vector<double, false, 2, true>() { return 12; }
     template <> constexpr int n_vector<double, false, 1, true>() { return 6; }
     template <> constexpr int n_vector<float, false, 4, true>() { return 24; }
+    template <> constexpr int n_vector<float, false, 2, true>() { return 12; }
     template <> constexpr int n_vector<float, false, 1, true>() { return 6; }
     template <> constexpr int n_vector<short, false, 4, true>() { return 24; }
+    template <> constexpr int n_vector<short, false, 2, true>() { return 12; }
     template <> constexpr int n_vector<short, false, 1, true>() { return 6; }
     template <> constexpr int n_vector<int8_t, false, 4, true>() { return 24; }
+    template <> constexpr int n_vector<int8_t, false, 2, true>() { return 12; }
     template <> constexpr int n_vector<int8_t, false, 1, true>() { return 6; }
 
     template <template <typename...> class Functor,
@@ -382,13 +406,18 @@ namespace quda
     constexpr void instantiate(const T &a, const T &b, const T &c, V &x_, Args &&... args)
     {
       unwrap_t<V> &x(x_);
-      if (x.Nspin() == 4 || x.Nspin() == 2) {
-        if constexpr (is_enabled_spin(2) || is_enabled_spin(4)) {
-          // Nspin-2 takes Nspin-4 path here, and we check for this later
+      if (x.Nspin() == 4) {
+        if constexpr (is_enabled_spin(4)) {
           Blas<Functor, store_t, y_store_t, 4, T>(a, b, c, x, args...);
         } else {
           errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
         }
+      } else if (x.Nspin() == 2) {
+        if constexpr (is_enabled_spin(2)) {
+          Blas<Functor, store_t, y_store_t, 2, T>(a, b, c, x, args...);
+        } else {
+          errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
+        }
       } else {
         if constexpr (is_enabled_spin(1)) {
           Blas<Functor, store_t, y_store_t, 1, T>(a, b, c, x, args...);
diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h
index 32ad6aa925..6125217de7 100644
--- a/include/color_spinor_field.h
+++ b/include/color_spinor_field.h
@@ -123,7 +123,7 @@ namespace quda
 
   struct ColorSpinorParam : public LatticeFieldParam {
     int nColor = 0; // Number of colors of the field
-    int nSpin = 0;  // =1 for staggered, =2 for coarse Dslash, =4 for 4d spinor
+    int nSpin = 0;  // =1 for staggered, =2 for coarse Dslash and chiral overlap Dslash, =4 for 4d spinor
     int nVec = 1;   // number of packed vectors (for multigrid transfer operator)
     int nVec_actual = 1; // The actual number of packed vectors (that are not zero padded)
 
@@ -1103,6 +1103,39 @@ namespace quda
   */
   void spinorDistanceReweight(ColorSpinorField &src, double alpha0, int t0);
 
+  /**
+     @brief Reconstruct a chiral spinor into a full spinor
+     @param[out] dst The reconstructed full spinor nSpin = 4
+     @param[in] src The chiral spinor nSpin = 2
+     @param[in] chirality The chirality of the reconstruction
+  */
+  void spinorChiralReconstruct(ColorSpinorField &dst, const ColorSpinorField &src, QudaChirality chirality);
+
+  /**
+     @brief Reconstruct two chiral spinors into a full spinor
+     @param[out] dst The reconstructed full spinor nSpin = 4
+     @param[in] src_left The left chirality part nSpin = 2
+     @param[in] src_right The right chirality part nSpin = 2
+  */
+  void spinorChiralReconstruct(ColorSpinorField &dst, const ColorSpinorField &src_left,
+                               const ColorSpinorField &src_right);
+
+  /**
+     @brief Project a full spinor to a chiral spinor
+     @param[out] dst The projected chiral spinor nSpin = 2
+     @param[in] src The full spinor nSpin = 4
+     @param[in] chirality The chirality of the projection
+  */
+  void spinorChiralProject(ColorSpinorField &dst, const ColorSpinorField &src, QudaChirality chirality);
+
+  /**
+     @brief Project a full spinor to two chiral spinors
+     @param[out] dst_left The projected left chirality part nSpin = 2
+     @param[out] dst_right The projected right chirality part nSpin = 2
+     @param[in] src The full spinor nSpin = 4
+  */
+  void spinorChiralProject(ColorSpinorField &dst_left, ColorSpinorField &dst_right, const ColorSpinorField &src);
+
   /**
      @brief Helper function for determining if the spin of the fields is the same.
      @param[in] a Input field
diff --git a/include/dirac_quda.h b/include/dirac_quda.h
index 2ceb4f5b70..c95b4f37e3 100644
--- a/include/dirac_quda.h
+++ b/include/dirac_quda.h
@@ -9,6 +9,7 @@
 #include <blas_quda.h>
 #include <field_cache.h>
 #include <memory>
+#include <overlap_kernel.h>
 
 namespace quda {
 
@@ -67,6 +68,8 @@ namespace quda {
 
     bool use_mobius_fused_kernel; // Whether or not use fused kernels for Mobius
 
+    OverlapKernel *overlap_kernel;
+
     double distance_pc_alpha0; // used by distance preconditioning
     int distance_pc_t0;        // used by distance preconditioning
 
@@ -149,6 +152,7 @@ namespace quda {
   class DiracMMdag;
   class DiracMdag;
   class DiracG5M;
+  class DiracMdagMChiral;
   //Forward declaration of multigrid Transfer class
   class Transfer;
 
@@ -162,6 +166,7 @@ namespace quda {
     friend class DiracMMdag;
     friend class DiracMdag;
     friend class DiracG5M;
+    friend class DiracMdagMChiral;
 
   protected:
     GaugeField *gauge;
@@ -350,6 +355,14 @@ namespace quda {
     */
     virtual void MMdag(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in) const;
 
+    /**
+       @brief Apply MdagM on a single chirality. This default implementation only raises an error.
+    */
+    virtual void MdagMChiral(cvector_ref<ColorSpinorField> &, cvector_ref<const ColorSpinorField> &, QudaChirality) const
+    {
+      errorQuda("Not implemented!"); // DiracOverlap declares an override of this method
+    }
+
     /**
        @brief Prepare the source and solution vectors for solving given the solution type
        @param[out] out Prepared solution vectors
@@ -1406,6 +1419,47 @@ namespace quda {
     virtual void prefetch(QudaFieldLocation mem_space, qudaStream_t stream = device::get_default_stream()) const override;
   };
 
+  // Full overlap operator (reports its type as QUDA_OVERLAP_DIRAC)
+  class DiracOverlap : public Dirac
+  {
+
+  protected:
+    OverlapKernel *overlap_kernel; // read by getStencilSteps(); NOTE(review): ownership is not visible in this header
+
+  public:
+    DiracOverlap(const DiracParam &param);
+    DiracOverlap(const DiracOverlap &dirac);
+    virtual ~DiracOverlap();
+    DiracOverlap &operator=(const DiracOverlap &dirac);
+
+    virtual void Dslash(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                        QudaParity parity) const override;
+    virtual void DslashXpay(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                            QudaParity parity, cvector_ref<const ColorSpinorField> &x, double k) const override;
+    virtual void M(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in) const override;
+    virtual void MdagM(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in) const override;
+    virtual void MdagMChiral(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                             QudaChirality chirality) const override;
+
+    virtual void prepare(cvector_ref<ColorSpinorField> &out, cvector_ref<ColorSpinorField> &in,
+                         cvector_ref<ColorSpinorField> &x, cvector_ref<const ColorSpinorField> &b,
+                         const QudaSolutionType solType) const override;
+    virtual void reconstruct(cvector_ref<ColorSpinorField> &x, cvector_ref<const ColorSpinorField> &b,
+                             const QudaSolutionType solType) const override;
+
+    virtual int getStencilSteps() const override { return 2 * (overlap_kernel->remez_order[0] + 1) + 1; }
+    virtual QudaDiracType getDiracType() const override { return QUDA_OVERLAP_DIRAC; }
+
+    /**
+      @brief If managed memory and prefetch is enabled, prefetch
+      all relevant memory fields (gauge, clover, temporary spinors)
+      to the CPU or GPU as requested
+      @param[in] mem_space Memory space we are prefetching to
+      @param[in] stream Which stream to run the prefetch in (default 0)
+    */
+    virtual void prefetch(QudaFieldLocation mem_space, qudaStream_t stream = device::get_default_stream()) const override;
+  };
+
   // Full staggered
   class DiracStaggered : public Dirac
   {
@@ -2499,6 +2553,7 @@ namespace quda {
       case QUDA_CLOVER_HASENBUSCH_TWIST_DIRAC:
       case QUDA_TWISTED_MASS_DIRAC:
       case QUDA_TWISTED_CLOVER_DIRAC:
+      case QUDA_OVERLAP_DIRAC:
         // while the twisted ops don't have a Hermitian indefinite spectrum, they
         // do have a spectrum of the form (real) + i mu
         gamma5(vec, vec);
@@ -2584,6 +2639,8 @@ namespace quda {
           || dirac_type == QUDA_GAUGE_COVDEV_DIRAC)
         return true;
 
+      if (dirac_type == QUDA_WILSON_DIRAC || dirac_type == QUDA_CLOVER_DIRAC) return true;
+
       // subtle: odd operator gets a minus sign
       if ((dirac_type == QUDA_STAGGEREDPC_DIRAC || dirac_type == QUDA_ASQTADPC_DIRAC)
           && (pc_type == QUDA_MATPC_EVEN_EVEN || pc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC))
@@ -2593,6 +2650,46 @@ namespace quda {
     }
   };
 
+  /**
+     Gloms onto a DiracMatrix and provides an operator() for its MdagMChiral method (chirality set via setChirality)
+  */
+  class DiracMdagMChiral : public DiracMatrix
+  {
+  protected:
+    QudaChirality chirality = QUDA_INVALID_CHIRALITY; // never indeterminate; select a chirality with setChirality()
+
+  public:
+    DiracMdagMChiral(const Dirac &d) : DiracMatrix(d) { }
+    DiracMdagMChiral(const Dirac *d) : DiracMatrix(d) { }
+
+    /**
+       @brief Multi-RHS operator application.
+       @param[out] out The vector of output fields
+       @param[in] in The vector of input fields
+     */
+    void operator()(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in) const override
+    {
+      dirac->MdagMChiral(out, in, chirality);
+      if (shift != 0.0) blas::axpy(shift, in, out);
+    }
+
+    int getStencilSteps() const override
+    {
+      if (dirac->getDiracType() == QUDA_OVERLAP_DIRAC) {
+        return dirac->getStencilSteps();
+      } else {
+        return dirac->getStencilSteps() * 2; // 2 for M and M dagger
+      }
+    }
+
+    /**
+       @brief return if the operator is HPD
+    */
+    virtual bool hermitian() const override { return true; }
+
+    void setChirality(QudaChirality chirality_in) { chirality = chirality_in; }
+  };
+
   /**
    * Create the Dirac operator. By default, we also create operators with possibly different
    * precisions: Sloppy, and Preconditioner.
diff --git a/include/enum_quda.h b/include/enum_quda.h
index 2d0430ffd5..488533f46d 100644
--- a/include/enum_quda.h
+++ b/include/enum_quda.h
@@ -99,6 +99,7 @@ typedef enum QudaDslashType_s {
   QUDA_DOMAIN_WALL_4D_DSLASH,
   QUDA_MOBIUS_DWF_DSLASH,
   QUDA_MOBIUS_DWF_EOFA_DSLASH,
+  QUDA_OVERLAP_DSLASH,
   QUDA_STAGGERED_DSLASH,
   QUDA_ASQTAD_DSLASH,
   QUDA_TWISTED_MASS_DSLASH,
@@ -173,6 +174,8 @@ typedef enum QudaSolveType_s {
   QUDA_NORMOP_PC_SOLVE,
   QUDA_NORMERR_SOLVE,
   QUDA_NORMERR_PC_SOLVE,
+  QUDA_NORMOP_CHIRAL_SOLVE,
+  QUDA_NORMERR_CHIRAL_SOLVE = QUDA_NORMOP_CHIRAL_SOLVE,
   QUDA_NORMEQ_SOLVE = QUDA_NORMOP_SOLVE,       // deprecated
   QUDA_NORMEQ_PC_SOLVE = QUDA_NORMOP_PC_SOLVE, // deprecated
   QUDA_INVALID_SOLVE = QUDA_INVALID_ENUM
@@ -309,6 +312,7 @@ typedef enum QudaDiracType_s {
   QUDA_MOBIUS_DOMAIN_WALLPC_DIRAC,
   QUDA_MOBIUS_DOMAIN_WALL_EOFA_DIRAC,
   QUDA_MOBIUS_DOMAIN_WALLPC_EOFA_DIRAC,
+  QUDA_OVERLAP_DIRAC,
   QUDA_STAGGERED_DIRAC,
   QUDA_STAGGEREDPC_DIRAC,
   QUDA_STAGGEREDKD_DIRAC,
@@ -637,6 +641,12 @@ typedef enum QudaExtLibType_s {
   QUDA_EXTLIB_INVALID = QUDA_INVALID_ENUM
 } QudaExtLibType;
 
+typedef enum QudaChirality_s {
+  QUDA_LEFT_CHIRALITY = -1,  // (1 - \gamma_5) / 2
+  QUDA_RIGHT_CHIRALITY = +1, // (1 + \gamma_5) / 2
+  QUDA_INVALID_CHIRALITY = QUDA_INVALID_ENUM // chiral project/reconstruct kernels treat this as "both chiralities"
+} QudaChirality;
+
 typedef enum QudaDDType_s { QUDA_DD_NO, QUDA_DD_RED_BLACK, QUDA_DD_INVALID = QUDA_INVALID_ENUM } QudaDDType;
 
 typedef enum QudaWFlowStepType_s {
diff --git a/include/enum_quda_fortran.h b/include/enum_quda_fortran.h
index faf68bf914..242cd31eff 100644
--- a/include/enum_quda_fortran.h
+++ b/include/enum_quda_fortran.h
@@ -85,12 +85,13 @@
 #define QUDA_DOMAIN_WALL_4D_DSLASH 4
 #define QUDA_MOBIUS_DWF_DSLASH 5
 #define QUDA_MOBIUS_DWF_EOFA_DSLASH 6
-#define QUDA_STAGGERED_DSLASH 7
-#define QUDA_ASQTAD_DSLASH 8
-#define QUDA_TWISTED_MASS_DSLASH 9
-#define QUDA_TWISTED_CLOVER_DSLASH 10
-#define QUDA_LAPLACE_DSLASH 11
-#define QUDA_COVDEV_DSLASH 12
+#define QUDA_OVERLAP_DSLASH 7
+#define QUDA_STAGGERED_DSLASH 8
+#define QUDA_ASQTAD_DSLASH 9
+#define QUDA_TWISTED_MASS_DSLASH 10
+#define QUDA_TWISTED_CLOVER_DSLASH 11
+#define QUDA_LAPLACE_DSLASH 12
+#define QUDA_COVDEV_DSLASH 13
 #define QUDA_INVALID_DSLASH QUDA_INVALID_ENUM
 
 #define QudaInverterType integer(4)
@@ -152,6 +153,8 @@
 #define QUDA_NORMOP_PC_SOLVE 3
 #define QUDA_NORMERR_SOLVE 4
 #define QUDA_NORMERR_PC_SOLVE 5
+#define QUDA_NORMOP_CHIRAL_SOLVE 6
+#define QUDA_NORMERR_CHIRAL_SOLVE QUDA_NORMOP_CHIRAL_SOLVE
 #define QUDA_NORMEQ_SOLVE QUDA_NORMOP_SOLVE // deprecated
 #define QUDA_NORMEQ_PC_SOLVE QUDA_NORMOP_PC_SOLVE // deprecated
 #define QUDA_INVALID_SOLVE QUDA_INVALID_ENUM
@@ -277,21 +280,22 @@
 #define QUDA_MOBIUS_DOMAIN_WALLPC_DIRAC 11
 #define QUDA_MOBIUS_DOMAIN_WALL_EOFA_DIRAC 12
 #define QUDA_MOBIUS_DOMAIN_WALLPC_EOFA_DIRAC 13
-#define QUDA_STAGGERED_DIRAC 14
-#define QUDA_STAGGEREDPC_DIRAC 15
-#define QUDA_STAGGEREDKD_DIRAC 16
-#define QUDA_ASQTAD_DIRAC 17
-#define QUDA_ASQTADPC_DIRAC 18
-#define QUDA_ASQTADKD_DIRAC 19
-#define QUDA_TWISTED_MASS_DIRAC 20
-#define QUDA_TWISTED_MASSPC_DIRAC 21
-#define QUDA_TWISTED_CLOVER_DIRAC 22
-#define QUDA_TWISTED_CLOVERPC_DIRAC 23
-#define QUDA_COARSE_DIRAC 24
-#define QUDA_COARSEPC_DIRAC 25
-#define QUDA_GAUGE_LAPLACE_DIRAC 26
-#define QUDA_GAUGE_LAPLACEPC_DIRAC 27
-#define QUDA_GAUGE_COVDEV_DIRAC 28
+#define QUDA_OVERLAP_DIRAC 14
+#define QUDA_STAGGERED_DIRAC 15
+#define QUDA_STAGGEREDPC_DIRAC 16
+#define QUDA_STAGGEREDKD_DIRAC 17
+#define QUDA_ASQTAD_DIRAC 18
+#define QUDA_ASQTADPC_DIRAC 19
+#define QUDA_ASQTADKD_DIRAC 20
+#define QUDA_TWISTED_MASS_DIRAC 21
+#define QUDA_TWISTED_MASSPC_DIRAC 22
+#define QUDA_TWISTED_CLOVER_DIRAC 23
+#define QUDA_TWISTED_CLOVERPC_DIRAC 24
+#define QUDA_COARSE_DIRAC 25
+#define QUDA_COARSEPC_DIRAC 26
+#define QUDA_GAUGE_LAPLACE_DIRAC 27
+#define QUDA_GAUGE_LAPLACEPC_DIRAC 28
+#define QUDA_GAUGE_COVDEV_DIRAC 29
 #define QUDA_INVALID_DIRAC QUDA_INVALID_ENUM
 
 ! Where the field is stored
diff --git a/include/kernels/spinor_chiral_project.cuh b/include/kernels/spinor_chiral_project.cuh
new file mode 100644
index 0000000000..e72736355c
--- /dev/null
+++ b/include/kernels/spinor_chiral_project.cuh
@@ -0,0 +1,100 @@
+#include <math_helper.cuh>
+#include <color_spinor_field_order.h>
+#include <index_helper.cuh>
+#include <kernel.h>
+
+namespace quda
+{
+  using namespace colorspinor;
+
+  template <typename store_t, int nColor_, QudaChirality Chirality_>
+  struct ChiralReconstructSpinorArg : kernel_param<> { // argument struct consumed by ChiralReconstructSpinor
+    using real = typename mapper<store_t>::type;
+    static constexpr int nSpin = 4; // spin of the reconstructed output; the inputs use nSpin / 2
+    static constexpr int nColor = nColor_;
+    static constexpr QudaChirality Chirality = Chirality_; // selects which input(s) the functor reads
+    using Vout = typename colorspinor_mapper<store_t, nSpin, nColor>::type;
+    using Vin = typename colorspinor_mapper<store_t, nSpin / 2, nColor>::type;
+
+    Vout out;           // full-spin destination accessor
+    const Vin in_left;  // half-spin source read for QUDA_LEFT_CHIRALITY and QUDA_INVALID_CHIRALITY
+    const Vin in_right; // half-spin source read for QUDA_RIGHT_CHIRALITY and QUDA_INVALID_CHIRALITY
+    ChiralReconstructSpinorArg(ColorSpinorField &out, const ColorSpinorField &in_left, const ColorSpinorField &in_right) :
+      kernel_param(dim3(out.VolumeCB(), out.SiteSubset(), 1)), out(out), in_left(in_left), in_right(in_right)
+    {
+    }
+  };
+
+  template <typename Arg> struct ChiralReconstructSpinor { // per-site functor: half-spin input(s) -> full spinor
+    const Arg &arg;
+    constexpr ChiralReconstructSpinor(const Arg &arg) : arg(arg) { }
+    static constexpr const char *filename() { return KERNEL_FILE; }
+
+    __device__ __host__ void operator()(int x_cb, int parity)
+    {
+      using real = typename Arg::real;
+      using Vector = ColorSpinor<real, Arg::nColor, Arg::nSpin>;
+      using HalfVector = ColorSpinor<real, Arg::nColor, Arg::nSpin / 2>;
+      const real invsqrt2 = (real)(1.0 / sqrt(2.0)); // scale applied to the result after toNonRel()
+
+      Vector out; // only ever accumulated into; NOTE(review): relies on ColorSpinor() zero-initialising - confirm
+      HalfVector in;
+      if constexpr (Arg::Chirality == QUDA_LEFT_CHIRALITY || Arg::Chirality == QUDA_INVALID_CHIRALITY) {
+        in = arg.in_left(x_cb, parity); // left input is used for LEFT and for INVALID (= both inputs)
+        out += in.chiral_reconstruct(1);
+      }
+      if constexpr (Arg::Chirality == QUDA_RIGHT_CHIRALITY || Arg::Chirality == QUDA_INVALID_CHIRALITY) {
+        in = arg.in_right(x_cb, parity); // right input is used for RIGHT and for INVALID (= both inputs)
+        out += in.chiral_reconstruct(0);
+      }
+      out.toNonRel(); // NOTE(review): presumably rotates out of the chiral basis - confirm against ColorSpinor
+      out *= invsqrt2;
+      arg.out(x_cb, parity) = out;
+    }
+  };
+
+  template <typename store_t, int nColor_, QudaChirality Chirality_> struct ChiralProjectSpinorArg : kernel_param<> {
+    using real = typename mapper<store_t>::type;
+    static constexpr int nSpin = 4; // spin of the full-spinor input; the outputs use nSpin / 2
+    static constexpr int nColor = nColor_;
+    static constexpr QudaChirality Chirality = Chirality_; // selects which output(s) the functor writes
+    using Vout = typename colorspinor_mapper<store_t, nSpin / 2, nColor>::type;
+    using Vin = typename colorspinor_mapper<store_t, nSpin, nColor>::type;
+
+    Vout out_left;  // written for QUDA_LEFT_CHIRALITY and QUDA_INVALID_CHIRALITY
+    Vout out_right; // written for QUDA_RIGHT_CHIRALITY and QUDA_INVALID_CHIRALITY
+    const Vin in;   // full-spin source; its VolumeCB/SiteSubset set the launch dimensions
+    ChiralProjectSpinorArg(ColorSpinorField &out_left, ColorSpinorField &out_right, const ColorSpinorField &in) :
+      kernel_param(dim3(in.VolumeCB(), in.SiteSubset(), 1)), out_left(out_left), out_right(out_right), in(in)
+    {
+    }
+  };
+
+  template <typename Arg> struct ChiralProjectSpinor { // per-site functor: full spinor -> half-spin output(s)
+    const Arg &arg;
+    constexpr ChiralProjectSpinor(const Arg &arg) : arg(arg) { }
+    static constexpr const char *filename() { return KERNEL_FILE; }
+
+    __device__ __host__ void operator()(int x_cb, int parity)
+    {
+      using real = typename Arg::real;
+      using HalfVector = ColorSpinor<real, Arg::nColor, Arg::nSpin / 2>;
+      using Vector = ColorSpinor<real, Arg::nColor, Arg::nSpin>;
+      const real invsqrt2 = (real)(1.0 / sqrt(2.0)); // scale applied to the input after toRel()
+
+      HalfVector out;
+      Vector in = arg.in(x_cb, parity);
+      in.toRel(); // NOTE(review): presumably rotates into the chiral basis before splitting - confirm
+      in *= invsqrt2;
+      if constexpr (Arg::Chirality == QUDA_LEFT_CHIRALITY || Arg::Chirality == QUDA_INVALID_CHIRALITY) {
+        out = in.chiral_project(1); // left output is taken with index 1 (mirrors chiral_reconstruct(1))
+        arg.out_left(x_cb, parity) = out;
+      }
+      if constexpr (Arg::Chirality == QUDA_RIGHT_CHIRALITY || Arg::Chirality == QUDA_INVALID_CHIRALITY) {
+        out = in.chiral_project(0); // right output is taken with index 0 (mirrors chiral_reconstruct(0))
+        arg.out_right(x_cb, parity) = out;
+      }
+    }
+  };
+
+} // namespace quda
diff --git a/include/overlap_kernel.h b/include/overlap_kernel.h
new file mode 100644
index 0000000000..9ecff20986
--- /dev/null
+++ b/include/overlap_kernel.h
@@ -0,0 +1,31 @@
+/**
+   @file overlap_kernel.h
+
+   @section DESCRIPTION
+*/
+
+#pragma once
+
+#include <quda_internal.h>
+#include <color_spinor_field.h>
+
+namespace quda
+{
+  struct OverlapKernel { // data bundle read by ApplyOverlap (lib/dirac_overlap.cpp)
+    std::vector<ColorSpinorField> evecs; // eigenvectors v_i of \gamma_5 M (small-eigenvalue part of the sign function)
+    std::vector<double> evals;           // matching eigenvalues \lambda_i; ApplyOverlap uses \lambda_i / |\lambda_i|
+    double kappa;                        // ApplyOverlap forms lambda_max = 1 + 8 * kappa from this
+    double epsilon;
+    std::vector<double> remez_tol;
+    std::vector<std::vector<double>> remez_coeff; // ApplyOverlap reads entry [0]
+    std::vector<int> remez_order;                 // entry [0] is read by ApplyOverlap and DiracOverlap::getStencilSteps()
+
+    OverlapKernel(std::vector<ColorSpinorField> &evecs, const std::vector<Complex> &evals, double kappa,
+                  const std::vector<double> remez_tol);
+    OverlapKernel(const OverlapKernel *overlap_kernel, QudaPrecision precision);
+    ~OverlapKernel() = default;
+
+    inline QudaPrecision Precision() const { return evecs[0].Precision(); } // NOTE(review): requires non-empty evecs
+    inline double Kappa() const { return kappa; }
+  };
+} // namespace quda
diff --git a/include/quda.h b/include/quda.h
index 6cbfee5f04..5d20d86617 100644
--- a/include/quda.h
+++ b/include/quda.h
@@ -457,6 +457,13 @@ extern "C" {
     /** Whether to use fused kernels for mobius */
     QudaBoolean use_mobius_fused_kernel;
 
+    /** Parameters for overlap fermion */
+    double overlap_invsqrt_tol;
+    int ov_n_ev;
+    double_complex *ov_eigvals;
+    double_complex **ov_eigvecs;
+    double *ov_masses;
+
     /**
      * Parameters for distance preconditioning algorithm proposed in arXiv:1006.4028,
      * which is useful to solve a precise heavy quark propagator.
@@ -523,6 +530,9 @@ extern "C" {
     QudaBoolean use_norm_op;
     QudaBoolean use_pc;
 
+    /** Use chiral version of MdagM */
+    QudaChirality chirality;
+
     /** Use Eigen routines to eigensolve the upper Hessenberg via QR **/
     QudaBoolean use_eigen_qr;
 
@@ -1211,6 +1221,10 @@ extern "C" {
    */
   void eigensolveQuda(void **h_evecs, double_complex *h_evals, QudaEigParam *param);
 
+  void loadOverlapQuda(QudaInvertParam *inv_param, QudaEigParam *eig_param);
+  void freeOverlapQuda();
+  void invertOverlapMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param);
+
   /**
    * Perform the solve, according to the parameters set in param.  It
    * is assumed that the gauge field has already been loaded via
diff --git a/include/quda_internal.h b/include/quda_internal.h
index 36f53cd987..01564b8b95 100644
--- a/include/quda_internal.h
+++ b/include/quda_internal.h
@@ -22,15 +22,15 @@
 // these are helper macros used to enable spin-1, spin-2 and spin-4 building blocks as needed
 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) || defined(GPU_CLOVER_DIRAC)                           \
   || defined(GPU_TWISTED_MASS_DIRAC) || defined(GPU_TWISTED_CLOVER_DIRAC) || defined(GPU_CLOVER_HASENBUSCH_TWIST)      \
-  || defined(GPU_COVDEV) || defined(GPU_CONTRACT)
+  || defined(GPU_LAPLACE) || defined(GPU_COVDEV) || defined(GPU_CONTRACT)
 #define NSPIN4
 #endif
 
-#if defined(GPU_MULTIGRID)
+#if defined(GPU_WILSON_DIRAC) || defined(GPU_MULTIGRID)
 #define NSPIN2
 #endif
 
-#if defined(GPU_STAGGERED_DIRAC) || defined(GPU_LAPLACE)
+#if defined(GPU_STAGGERED_DIRAC) || defined(GPU_LAPLACE) || defined(GPU_COVDEV)
 #define NSPIN1
 #endif
 
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 233de9598f..6c5e5c19da 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -45,7 +45,7 @@ set (QUDA_OBJS
   dirac_staggered_kd.cpp dirac_clover_hasenbusch_twist.cpp
   dirac_improved_staggered.cpp dirac_improved_staggered_kd.cpp dirac_domain_wall.cpp
   dirac_domain_wall_4d.cpp dirac_mobius.cpp dirac_twisted_clover.cpp
-  dirac_twisted_mass.cpp 
+  dirac_twisted_mass.cpp dirac_overlap.cpp overlap_kernel.cpp
   llfat_quda.cu staggered_two_link_quda.cu gauge_force.cu gauge_loop_trace.cu gauge_polyakov_loop.cu
   gauge_random.cu gauge_noise.cu
   gauge_field_strength_tensor.cu clover_quda.cu 
@@ -81,6 +81,7 @@ set (QUDA_OBJS
   extract_gauge_ghost_extended.cu copy_color_spinor.cpp
   spin_duplicate.cu
   spinor_noise.cu spinor_dilute.cu spinor_reweight.cu
+  spinor_chiral_project.cu
   copy_color_spinor_dd.cu copy_color_spinor_ds.cu
   copy_color_spinor_dh.cu copy_color_spinor_dq.cu
   copy_color_spinor_ss.cu copy_color_spinor_sd.cu
diff --git a/lib/blas_quda.cu b/lib/blas_quda.cu
index 1b5f552bca..163de14839 100644
--- a/lib/blas_quda.cu
+++ b/lib/blas_quda.cu
@@ -64,7 +64,7 @@ namespace quda {
       void apply(const qudaStream_t &stream) override
       {
         constexpr bool site_unroll_check = !std::is_same<store_t, y_store_t>::value || isFixed<store_t>::value;
-        if (site_unroll_check && (x.Ncolor() != 3 || x.Nspin() == 2))
+        if (site_unroll_check && (x.Ncolor() != 3 && x.Nspin() == 2))
           errorQuda("site unroll not supported for nSpin = %d nColor = %d", x.Nspin(), x.Ncolor());
 
         if (location == QUDA_CUDA_FIELD_LOCATION) {
@@ -78,7 +78,7 @@ namespace quda {
           constexpr bool site_unroll = !std::is_same<device_store_t, device_y_store_t>::value || isFixed<device_store_t>::value;
           constexpr int N = n_vector<device_store_t, true, nSpin, site_unroll>();
           constexpr int Ny = n_vector<device_y_store_t, true, nSpin, site_unroll>();
-          constexpr int M = site_unroll ? (nSpin == 4 ? 24 : 6) : N; // real numbers per thread
+          constexpr int M = site_unroll ? n_vector<device_store_t, false, nSpin, true>() : N; // real numbers per thread
           const int threads = x.Length() / (nParity * M);
 
           TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
diff --git a/lib/clover_field.cpp b/lib/clover_field.cpp
index 6bfe26399b..36788d3236 100644
--- a/lib/clover_field.cpp
+++ b/lib/clover_field.cpp
@@ -185,7 +185,7 @@ namespace quda {
   {
     LatticeField::setTuningString();
     std::stringstream aux_ss;
-    aux_ss << "vol=" << volume << "precision=" << precision << "Nc=" << nColor;
+    aux_ss << "vol=" << volume << ",precision=" << precision << ",Nc=" << nColor << ",memory=" << mem_type;
     aux_string = aux_ss.str();
     if (aux_string.size() >= TuneKey::aux_n / 2) errorQuda("Aux string too large %lu", aux_string.size());
   }
diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp
index 532aac6f73..725fc16c52 100644
--- a/lib/color_spinor_field.cpp
+++ b/lib/color_spinor_field.cpp
@@ -296,7 +296,7 @@ namespace quda
     if (init) {
       std::stringstream aux_ss;
       aux_ss << "vol=" << volume << ",parity=" << siteSubset << ",precision=" << precision << ",order=" << fieldOrder
-             << ",Ns=" << nSpin << ",Nc=" << nColor;
+             << ",Ns=" << nSpin << ",Nc=" << nColor << ",memory=" << mem_type;
       if (nVec > 1) aux_ss << ",nVec=" << nVec;
       if (twistFlavor != QUDA_TWIST_NO && twistFlavor != QUDA_TWIST_INVALID) aux_ss << ",TwistFlavor=" << twistFlavor;
       aux_string = aux_ss.str();
diff --git a/lib/dirac.cpp b/lib/dirac.cpp
index a5ec216943..2a3a572019 100644
--- a/lib/dirac.cpp
+++ b/lib/dirac.cpp
@@ -48,6 +48,7 @@ namespace quda {
     type(dirac.type),
     halo_precision(dirac.halo_precision),
     commDim(dirac.commDim),
+    use_mobius_fused_kernel(dirac.use_mobius_fused_kernel),
     distance_pc_alpha0(dirac.distance_pc_alpha0),
     distance_pc_t0(dirac.distance_pc_t0),
     profile("Dirac", false)
@@ -72,6 +73,7 @@ namespace quda {
       symmetric = dirac.symmetric;
       dagger = dirac.dagger;
       commDim = dirac.commDim;
+      use_mobius_fused_kernel = dirac.use_mobius_fused_kernel;
       distance_pc_alpha0 = dirac.distance_pc_alpha0;
       distance_pc_t0 = dirac.distance_pc_t0;
       profile = dirac.profile;
@@ -186,6 +188,9 @@ namespace quda {
     } else if (param.type == QUDA_MOBIUS_DOMAIN_WALLPC_EOFA_DIRAC) {
       if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Creating a DiracMobiusEofaPC operator\n");
       return new DiracMobiusEofaPC(param);
+    } else if (param.type == QUDA_OVERLAP_DIRAC) {
+      if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Creating a DiracOverlap operator\n");
+      return new DiracOverlap(param);
     } else if (param.type == QUDA_STAGGERED_DIRAC) {
       if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Creating a DiracStaggered operator\n");
       return new DiracStaggered(param);
diff --git a/lib/dirac_overlap.cpp b/lib/dirac_overlap.cpp
new file mode 100644
index 0000000000..6f695fb75f
--- /dev/null
+++ b/lib/dirac_overlap.cpp
@@ -0,0 +1,178 @@
+#include <util_quda.h>
+#include <dirac_quda.h>
+#include <dslash_quda.h>
+#include <blas_quda.h>
+
+namespace quda
+{
+  /**
+   * Apply the overlap operator
+   * out = a * x + D * in = a * x + 0.5 * (1 + \gamma_5 sign(\gamma_5 M)) * in
+   * where M is the Wilson operator
+   */
+  void ApplyOverlap(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, const GaugeField &U,
+                    OverlapKernel &O, double a, cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
+                    const int *comm_override, TimeProfile &profile)
+  {
+    auto in_def = getFieldTmp(out);
+    auto b1 = getFieldTmp(out);
+    auto b2 = getFieldTmp(out);
+    auto Mb1 = getFieldTmp(out);
+    auto Ab1 = getFieldTmp(out);
+
+    cvector_ref<ColorSpinorField> &evecs = O.evecs;
+    cvector<double> &evals = O.evals;
+    const double remez_order = O.remez_order[0];
+    cvector<double> &remez_coeff = O.remez_coeff[0];
+    const double lambda_max = (1.0 + 8.0 * O.kappa);
+    const double epsilon = O.epsilon;
+
+    /**
+     * Apply 0.5 directly to the input
+     */
+    if (dagger) {
+      blas::axy(0.5, in, out);
+      gamma5(in_def, out);
+    } else {
+      blas::axy(0.5, in, in_def);
+      gamma5(out, in_def);
+    }
+
+    /**
+     * \gamma_5 sign(\gamma_5 M) for small eigenvalues
+     * Define the eigenvalues and eigenvectors \gamma_5 M v_i = \lambda_i v_i
+     * ==> \gamma_5 \sum_i sign(\lambda_i) |v_i><v_i|
+     */
+    std::vector<quda::Complex> alpha(evecs.size() * in_def.size());
+    blas::block::cDotProduct(alpha, evecs, in_def);
+    for (auto &v : alpha) { v *= -1; }
+    blas::block::caxpy(alpha, evecs, in_def);
+    for (size_t i = 0; i < evecs.size(); i++) {
+      for (size_t j = 0; j < in_def.size(); ++j) { alpha[i * in_def.size() + j] *= -evals[i] / abs(evals[i]); }
+    }
+    blas::block::caxpy(alpha, evecs, out);
+    if (!dagger) { gamma5(out, out); }
+
+    /**
+     * \gamma_5 sign(\gamma_5 M) for large eigenvalues
+     * Define the Chebyshev polynomial approximation P(x) ~ x^{-1/2}
+     * ==> M P(M^\dagger M)
+     * Here M is the normalized Wilson operator which has the maximum eigenvalue 1
+     */
+    blas::zero(b1);
+    blas::zero(b2);
+    for (int k = remez_order; k >= 0; --k) {
+      ApplyWilson(Mb1, b1, U, -O.kappa, b1, parity, false, comm_override, profile);
+      ApplyWilson(Ab1, Mb1, U, -O.kappa, Mb1, parity, true, comm_override, profile);
+      blas::axpby(-(1.0 + epsilon) / (1.0 - epsilon), b1, 2.0 / (1.0 - epsilon) / (lambda_max * lambda_max), Ab1);
+      if (k > 0) {
+        blas::axpbypczw(remez_coeff[k], in_def, 2.0, Ab1, -1.0, b2, b2);
+      } else {
+        blas::axpbypczw(remez_coeff[0], in_def, 1.0, Ab1, -1.0, b2, b2);
+      }
+      std::swap(b1, b2);
+    }
+    ApplyWilson(Mb1, b1, U, -O.kappa, b1, parity, false, comm_override, profile);
+    if (dagger) { gamma5(Mb1, Mb1); }
+    if (a == 0.0) {
+      blas::axpbyz(1.0 / lambda_max, Mb1, 1.0, out, out);
+    } else {
+      blas::axpbypczw(a, x, 1.0 / lambda_max, Mb1, 1.0, out, out);
+    }
+  }
+
+  DiracOverlap::DiracOverlap(const DiracParam &param) : Dirac(param), overlap_kernel(param.overlap_kernel) { }
+
+  DiracOverlap::DiracOverlap(const DiracOverlap &dirac) : Dirac(dirac), overlap_kernel(dirac.overlap_kernel) { }
+
+  DiracOverlap::~DiracOverlap() { }
+
+  DiracOverlap &DiracOverlap::operator=(const DiracOverlap &dirac)
+  {
+    if (&dirac != this) {
+      Dirac::operator=(dirac);
+      overlap_kernel = dirac.overlap_kernel;
+    }
+    return *this;
+  }
+
+  void DiracOverlap::Dslash(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                            QudaParity parity) const
+  {
+    ApplyOverlap(out, in, *gauge, *overlap_kernel, 0.0, in, parity, dagger, commDim.data, profile);
+  }
+
+  void DiracOverlap::DslashXpay(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                                QudaParity parity, cvector_ref<const ColorSpinorField> &x, double k) const
+  {
+    ApplyOverlap(out, in, *gauge, *overlap_kernel, k, x, parity, dagger, commDim.data, profile);
+  }
+
+  // Defined as m / (1 - m) + D, and then multiplied by sqrt((1 - m) / (1 + m))
+  void DiracOverlap::M(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in) const
+  {
+    DslashXpay(out, in, QUDA_INVALID_PARITY, in, mass / (1.0 - mass));
+    if (mass != 0.0) { blas::ax(sqrt((1.0 - mass) / (1.0 + mass)), out); }
+  }
+
+  // Defined as m^2 / (1 - m^2) + DdagD
+  void DiracOverlap::MdagM(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in) const
+  {
+    auto tmp = getFieldTmp(out);
+    Dslash(tmp, in, QUDA_INVALID_PARITY);
+    flipDagger();
+    DslashXpay(out, tmp, QUDA_INVALID_PARITY, in, (mass * mass) / (1.0 - mass * mass));
+    flipDagger();
+  }
+
+  // Defined as m^2 / (1 - m^2) + D
+  // (1\pm\gamma_5)/2 DdagD (1\pm\gamma_5)/2 = (1\pm\gamma_5)/2 D (1\pm\gamma_5)/2
+  void DiracOverlap::MdagMChiral(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                                 QudaChirality chirality) const
+  {
+    ColorSpinorParam param(in[0]);
+    param.nSpin = 4;
+    param.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
+    param.mem_type = QUDA_MEMORY_DEVICE; // TODO: Hack for eigensolver in the host memory
+    param.setPrecision(param.Precision(), param.Precision(), true);
+    auto in_tmp = getFieldTmp<ColorSpinorField>(in.size(), param);
+    auto out_tmp = getFieldTmp<ColorSpinorField>(out.size(), param);
+
+    for (size_t i = 0; i < in.size(); i++) { spinorChiralReconstruct(in_tmp[i], in[i], chirality); }
+    DslashXpay(out_tmp, in_tmp, QUDA_INVALID_PARITY, in_tmp, (mass * mass) / (1.0 - mass * mass));
+    for (size_t i = 0; i < out.size(); i++) { spinorChiralProject(out[i], out_tmp[i], chirality); }
+  }
+
+  void DiracOverlap::prepare(cvector_ref<ColorSpinorField> &out, cvector_ref<ColorSpinorField> &in,
+                             cvector_ref<ColorSpinorField> &x, cvector_ref<const ColorSpinorField> &b,
+                             const QudaSolutionType solType) const
+  {
+    if (solType == QUDA_MATPC_SOLUTION || solType == QUDA_MATPCDAG_MATPC_SOLUTION) {
+      errorQuda("Preconditioned solution requires a preconditioned solve_type");
+    }
+
+    create_alias(in, b);
+    create_alias(out, x);
+  }
+
+  void DiracOverlap::reconstruct(cvector_ref<ColorSpinorField> &x, cvector_ref<const ColorSpinorField> &b,
+                                 const QudaSolutionType solType) const
+  {
+    if (solType == QUDA_MATPC_SOLUTION || solType == QUDA_MATPCDAG_MATPC_SOLUTION) { return; }
+
+    if (solType == QUDA_MAT_SOLUTION) {
+      // x = -1 / (1 - m) * b + 1 / (1 - m) * 1 / sqrt(1 - m^2) * x'
+      // x' = M^{-1} * b = (sqrt((1 - m) / (1 + m)) * (m / (1 - m) + D))^{-1} * b
+      blas::axpby(-1.0 / (1.0 - mass), b, 1.0 / (1.0 - mass) / sqrt(1.0 - mass * mass), x);
+    } else if (solType == QUDA_MATDAG_MAT_SOLUTION) {
+      // x = -1 / (1 - m^2) * b + 1 / (1 - m^2) * 1 / (1 - m^2) * x'
+      // x' = (MdagM)^{-1} * b = (m^2 / (1 - m^2) + DdagD)^{-1} * b
+      blas::axpby(-1.0 / (1.0 - mass * mass), b, 1.0 / (1.0 - mass * mass) / (1.0 - mass * mass), x);
+    }
+  }
+
+  void DiracOverlap::prefetch(QudaFieldLocation mem_space, qudaStream_t stream) const
+  {
+    Dirac::prefetch(mem_space, stream);
+  }
+} // namespace quda
\ No newline at end of file
diff --git a/lib/eigensolve_quda.cpp b/lib/eigensolve_quda.cpp
index 156969a191..dfc4c102f0 100644
--- a/lib/eigensolve_quda.cpp
+++ b/lib/eigensolve_quda.cpp
@@ -276,33 +276,32 @@ namespace quda
     double b = eig_param->a_max;
     double delta = (b - a) / 2.0;
     double theta = (b + a) / 2.0;
-    double sigma1 = -delta / theta;
+    double lambda1 = eig_param->spectrum == QUDA_SPECTRUM_SR_EIG ? a : b;
+    double sigma1 = delta / (lambda1 - theta);
+    double sigma_old = sigma1;
     double sigma;
     double d1 = sigma1 / delta;
-    double d2 = 1.0;
+    double d2 = -d1 * theta;
     double d3;
 
+    ColorSpinorParam param(in[0]);
+    param.mem_type = QUDA_MEMORY_DEVICE; // FIXME: Hack for eigensolver in the host memory
+    auto z_old = getFieldTmp<ColorSpinorField>(in.size(), param);
+    auto z = getFieldTmp<ColorSpinorField>(in.size(), param);
+    auto Az = getFieldTmp<ColorSpinorField>(in.size(), param);
+
     // out = d2 * in + d1 * out
     // C_1(x) = x
-    mat({out.begin(), out.end()}, {in.begin(), in.end()});
-    blas::caxpby(d2, in, d1, out);
-
-    if (eig_param->poly_deg == 1) return;
-
-    // C_0 is the current 'in'  vector.
-    // C_1 is the current 'out' vector.
-
-    // Clone 'in' to two temporary vectors.
-    std::vector<ColorSpinorField> tmp1{in.begin(), in.end()};
-    std::vector<ColorSpinorField> tmp2{out.begin(), out.end()};
+    blas::copy(z, in);
+    mat(Az, z);
+    blas::axpbyz(d2, z, d1, Az, z_old);
+    std::swap(z_old, z);
 
     // Using Chebyshev polynomial recursion relation,
     // C_{m+1}(x) = 2*x*C_{m} - C_{m-1}
 
-    double sigma_old = sigma1;
-
     // construct C_{m+1}(x)
-    for (int i = 2; i < eig_param->poly_deg; i++) {
+    for (int i = 1; i < eig_param->poly_deg; i++) {
       sigma = 1.0 / (2.0 / sigma1 - sigma_old);
 
       d1 = 2.0 * sigma / delta;
@@ -311,15 +310,14 @@ namespace quda
 
       // FIXME - we could introduce a fused mat + blas kernel here, eliminating one temporary
       // mat*C_{m}(x)
-      mat(out, tmp2);
-
-      blas::axpbypczw(d3, tmp1, d2, tmp2, d1, out, tmp1);
-      std::swap(tmp1, tmp2);
+      mat(Az, z);
+      blas::axpbypczw(d3, z_old, d2, z, d1, Az, z_old);
+      std::swap(z_old, z);
 
       sigma_old = sigma;
     }
 
-    for (auto i = 0u; i < in.size(); i++) std::swap(out[i], tmp2[i]);
+    blas::copy(out, z);
   }
 
   double EigenSolver::estimateChebyOpMax(ColorSpinorField &out, ColorSpinorField &in)
diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp
index 0b3657102b..6d5435c48c 100644
--- a/lib/gauge_field.cpp
+++ b/lib/gauge_field.cpp
@@ -301,7 +301,7 @@ namespace quda {
     LatticeField::setTuningString();
     std::stringstream aux_ss;
     aux_ss << "vol=" << volume << ",stride=" << stride << ",precision=" << precision << ",geometry=" << geometry
-           << ",Nc=" << nColor;
+           << ",Nc=" << nColor << ",memory=" << mem_type;
     if (ghostExchange == QUDA_GHOST_EXCHANGE_EXTENDED) aux_ss << ",r=" << r[0] << r[1] << r[2] << r[3];
     aux_string = aux_ss.str();
     if (aux_string.size() >= TuneKey::aux_n / 2) errorQuda("Aux string too large %lu", aux_string.size());
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index 47290a14e8..6d15b416c0 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -37,6 +37,8 @@
 #include <clover_backup.h>
 #include <split_grid.h>
 
+#include <overlap_kernel.h>
+
 #include <ks_force_quda.h>
 #include <ks_qsmear.h>
 
@@ -106,6 +108,12 @@ CloverField *cloverPrecondition = nullptr;
 CloverField *cloverRefinement = nullptr;
 CloverField *cloverEigensolver = nullptr;
 
+OverlapKernel *overlapPrecise = nullptr;
+OverlapKernel *overlapSloppy = nullptr;
+OverlapKernel *overlapPrecondition = nullptr;
+OverlapKernel *overlapRefinement = nullptr;
+OverlapKernel *overlapEigensolver = nullptr;
+
 GaugeField momResident;
 GaugeField *extendedGaugeResident = nullptr;
 
@@ -156,6 +164,9 @@ static TimeProfile profileGauge("loadGaugeQuda");
 //!< Profile for loadCloverQuda
 static TimeProfile profileClover("loadCloverQuda");
 
+//!< Profiler for loadOverlapQuda
+static TimeProfile profileOverlap("loadOverlapQuda");
+
 //!< Profiler for dslashQuda
 static TimeProfile profileDslash("dslashQuda");
 
@@ -1081,6 +1092,155 @@ void loadSloppyCloverQuda(const QudaPrecision *prec)
 
 }
 
+void freeSloppyOverlapQuda()
+{
+  if (!initialized) errorQuda("QUDA not initialized");
+
+  // Delete overlapRefinement if it does not alias overlapSloppy.
+  if (overlapRefinement != overlapSloppy && overlapRefinement) delete overlapRefinement;
+
+  // Delete overlapPrecondition if it does not alias overlapPrecise, overlapSloppy, or overlapEigensolver.
+  if (overlapPrecondition != overlapSloppy && overlapPrecondition != overlapPrecise
+      && overlapPrecondition != overlapEigensolver && overlapPrecondition)
+    delete overlapPrecondition;
+
+  // Delete overlapEigensolver if it does not alias overlapPrecise or overlapSloppy.
+  if (overlapEigensolver != overlapSloppy && overlapEigensolver != overlapPrecise && overlapEigensolver)
+    delete overlapEigensolver;
+
+  // Delete overlapSloppy if it does not alias overlapPrecise.
+  if (overlapSloppy != overlapPrecise && overlapSloppy) delete overlapSloppy;
+
+  overlapEigensolver = nullptr;
+  overlapRefinement = nullptr;
+  overlapPrecondition = nullptr;
+  overlapSloppy = nullptr;
+}
+
+void freeOverlapQuda(void)
+{
+  if (!initialized) errorQuda("QUDA not initialized");
+  freeSloppyOverlapQuda();
+  if (overlapPrecise) { delete overlapPrecise; }
+  overlapPrecise = nullptr;
+}
+
+void loadSloppyOverlapQuda(const QudaPrecision prec[])
+{
+  freeSloppyOverlapQuda();
+
+  if (overlapPrecise) {
+
+    if (prec[0] == overlapPrecise->Precision()) {
+      overlapSloppy = overlapPrecise;
+    } else {
+      overlapSloppy = new OverlapKernel(overlapPrecise, prec[0]);
+    }
+
+    // create the mirror preconditioner overlap field
+    if (prec[1] == overlapPrecise->Precision()) {
+      overlapPrecondition = overlapPrecise;
+    } else if (prec[1] == overlapSloppy->Precision()) {
+      overlapPrecondition = overlapSloppy;
+    } else {
+      overlapPrecondition = new OverlapKernel(overlapPrecise, prec[1]);
+    }
+
+    // create the mirror refinement overlap field
+    if (prec[2] == overlapSloppy->Precision()) {
+      overlapRefinement = overlapSloppy;
+    } else {
+      overlapRefinement = new OverlapKernel(overlapPrecise, prec[2]);
+    }
+
+    // create the mirror eigensolver overlap field
+    if (prec[3] == overlapPrecise->Precision()) {
+      overlapEigensolver = overlapPrecise;
+    } else if (prec[3] == overlapSloppy->Precision()) {
+      overlapEigensolver = overlapSloppy;
+    } else if (prec[3] == overlapPrecondition->Precision()) {
+      overlapEigensolver = overlapPrecondition;
+    } else {
+      overlapEigensolver = new OverlapKernel(overlapPrecise, prec[3]);
+    }
+  }
+}
+
+void loadOverlapQuda(QudaInvertParam *inv_param, QudaEigParam *eig_param)
+{
+  auto profile = pushProfile(profileOverlap);
+  pushVerbosity(inv_param->verbosity);
+
+  checkInvertParam(inv_param);
+  checkEigParam(eig_param);
+
+  if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded before the overlap kernel");
+  // Template for the eigenvector fields: lattice dimensions are taken from gaugePrecise, fields start zeroed
+  ColorSpinorParam cpuParam(nullptr, *inv_param, gaugePrecise->X(), false, inv_param->input_location);
+  ColorSpinorParam cudaParam(cpuParam, *inv_param, QUDA_CUDA_FIELD_LOCATION);
+  cudaParam.setPrecision(inv_param->cuda_prec, inv_param->cuda_prec, true);
+  cudaParam.create = QUDA_ZERO_FIELD_CREATE;
+  cudaParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
+
+  const int n_eig = eig_param->n_conv;
+  std::vector<Complex> evals(n_eig, 0.0);
+  std::vector<ColorSpinorField> evecs(n_eig, ColorSpinorField(cudaParam));
+  // Fixed eigensolver settings: TR Lanczos on the unpreconditioned normal operator, SR part of the spectrum
+  QudaEigParam eig_param_g5w = newQudaEigParam();
+  eig_param_g5w.eig_type = QUDA_EIG_TR_LANCZOS;
+  eig_param_g5w.spectrum = QUDA_SPECTRUM_SR_EIG;
+  eig_param_g5w.use_dagger = QUDA_BOOLEAN_FALSE;
+  eig_param_g5w.use_norm_op = QUDA_BOOLEAN_TRUE;
+  eig_param_g5w.use_pc = QUDA_BOOLEAN_FALSE;
+  eig_param_g5w.compute_gamma5 = QUDA_BOOLEAN_FALSE;
+  eig_param_g5w.batched_rotate = 1; // Save device memory
+  eig_param_g5w.compute_evals_batch_size = 1;
+  // Remaining settings are forwarded from eig_param; a_min is squared and a_max is set to (1 + 8 * kappa)^2
+  eig_param_g5w.use_poly_acc = eig_param->use_poly_acc;
+  eig_param_g5w.poly_deg = eig_param->poly_deg;
+  eig_param_g5w.a_min = eig_param->a_min * eig_param->a_min;
+  eig_param_g5w.a_max = (1 + 8 * inv_param->kappa) * (1 + 8 * inv_param->kappa);
+  eig_param_g5w.n_ev = eig_param->n_ev;
+  eig_param_g5w.n_kr = eig_param->n_kr;
+  eig_param_g5w.n_conv = eig_param->n_conv;
+  eig_param_g5w.tol = eig_param->tol;
+  eig_param_g5w.max_restarts = eig_param->max_restarts;
+  strcpy(eig_param_g5w.vec_infile, eig_param->vec_infile);
+  strcpy(eig_param_g5w.vec_outfile, eig_param->vec_outfile);
+  // Run the eigensolver on MdagM built from a DiracWilson operator
+  DiracParam diracParam;
+  setDiracParam(diracParam, inv_param, false);
+  Dirac *d = new DiracWilson(diracParam);
+
+  DiracMatrix *m = new DiracMdagM(*d);
+  auto *eig_solve = quda::EigenSolver::create(&eig_param_g5w, *m);
+  (*eig_solve)(evecs, evals);
+  delete eig_solve;
+
+  // Recalculate eigenvalues: apply DiracG5M to each eigenvector and take cDotProduct with that eigenvector
+  delete m;
+  m = new DiracG5M(*d);
+  ColorSpinorField tmp(cudaParam);
+  for (int i = 0; i < n_eig; ++i) {
+    (*m)(tmp, evecs[i]);
+    evals[i] = blas::cDotProduct(tmp, evecs[i]);
+  }
+
+  delete m;
+  delete d;
+  // Drop any previously loaded overlap kernels, then build the precise one and its reduced-precision mirrors
+  freeOverlapQuda();
+  std::vector<double> remez_tol(1, inv_param->overlap_invsqrt_tol);
+  overlapPrecise = new OverlapKernel(evecs, evals, inv_param->kappa, remez_tol);
+  QudaPrecision prec[] = {inv_param->cuda_prec_sloppy, inv_param->cuda_prec_precondition,
+                          inv_param->cuda_prec_refinement_sloppy, inv_param->cuda_prec_eigensolver};
+  loadSloppyOverlapQuda(prec);
+
+  flushPoolQuda(QUDA_MEMORY_DEVICE);
+
+  popVerbosity();
+}
+
 // just free the sloppy fields used in mixed-precision solvers
 void freeSloppyGaugeQuda()
 {
@@ -1463,6 +1623,7 @@ void endQuda(void)
 
     freeGaugeQuda();
     freeCloverQuda();
+    freeOverlapQuda();
 
     flushChrono();
 
@@ -1508,6 +1669,7 @@ void endQuda(void)
     profileInit.Print();
     profileGauge.Print();
     profileClover.Print();
+    profileOverlap.Print();
     profileDslash.Print();
     profileInvert.Print();
     profileInvertMultiSrc.Print();
@@ -1606,6 +1768,7 @@ namespace quda {
       memcpy(diracParam.b_5, inv_param->b_5, sizeof(Complex) * inv_param->Ls);
       memcpy(diracParam.c_5, inv_param->c_5, sizeof(Complex) * inv_param->Ls);
       break;
+    case QUDA_OVERLAP_DSLASH: diracParam.type = QUDA_OVERLAP_DIRAC; break;
     case QUDA_STAGGERED_DSLASH:
       diracParam.type = pc ? QUDA_STAGGEREDPC_DIRAC : QUDA_STAGGERED_DIRAC;
       break;
@@ -1650,6 +1813,7 @@ namespace quda {
     diracParam.fatGauge = gaugeFatPrecise;
     diracParam.longGauge = gaugeLongPrecise;
     diracParam.clover = cloverPrecise;
+    diracParam.overlap_kernel = overlapPrecise;
     diracParam.kappa = kappa;
     diracParam.mass = inv_param->mass;
     diracParam.m5 = inv_param->m5;
@@ -1675,6 +1839,7 @@ namespace quda {
     diracParam.fatGauge = gaugeFatSloppy;
     diracParam.longGauge = gaugeLongSloppy;
     diracParam.clover = cloverSloppy;
+    diracParam.overlap_kernel = overlapSloppy;
 
     for (int i=0; i<4; i++) {
       diracParam.commDim[i] = 1;   // comms are always on
@@ -1693,6 +1858,7 @@ namespace quda {
     diracParam.fatGauge = gaugeFatRefinement;
     diracParam.longGauge = gaugeLongRefinement;
     diracParam.clover = cloverRefinement;
+    diracParam.overlap_kernel = overlapRefinement;
 
     for (int i=0; i<4; i++) {
       diracParam.commDim[i] = 1;   // comms are always on
@@ -1718,6 +1884,7 @@ namespace quda {
       diracParam.longGauge = gaugeLongPrecondition;
     }
     diracParam.clover = cloverPrecondition;
+    diracParam.overlap_kernel = overlapPrecondition;
 
     for (int i=0; i<4; i++) {
       diracParam.commDim[i] = comms ? 1 : 0;
@@ -1763,6 +1930,7 @@ namespace quda {
       diracParam.longGauge = gaugeLongEigensolver;
     }
     diracParam.clover = cloverEigensolver;
+    diracParam.overlap_kernel = overlapEigensolver;
 
     for (int i = 0; i < 4; i++) { diracParam.commDim[i] = 1; }
 
@@ -1847,6 +2015,8 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity
     errorQuda("Gauge field not allocated");
   if (cloverPrecise == nullptr && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)))
     errorQuda("Clover field not allocated");
+  if (overlapPrecise == nullptr && (inv_param->dslash_type == QUDA_OVERLAP_DSLASH))
+    errorQuda("Overlap kernel not allocated");
 
   pushVerbosity(inv_param->verbosity);
   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);
@@ -2342,6 +2512,8 @@ void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
     errorQuda("Gauge field not allocated");
   if (cloverPrecise == nullptr && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)))
     errorQuda("Clover field not allocated");
+  if (overlapPrecise == nullptr && (inv_param->dslash_type == QUDA_OVERLAP_DSLASH))
+    errorQuda("Overlap kernel not allocated");
   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);
 
   bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||
@@ -2406,6 +2578,8 @@ void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
     errorQuda("Gauge field not allocated");
   if (cloverPrecise == nullptr && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH)))
     errorQuda("Clover field not allocated");
+  if (overlapPrecise == nullptr && (inv_param->dslash_type == QUDA_OVERLAP_DSLASH))
+    errorQuda("Overlap kernel not allocated");
   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);
 
   bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||
@@ -2502,6 +2676,34 @@ void checkClover(QudaInvertParam *param) {
   if (cloverEigensolver == nullptr) errorQuda("Eigensolver clover field doesn't exist");
 }
 
+void checkOverlap(QudaInvertParam *param)
+{
+  if (param->dslash_type != QUDA_OVERLAP_DSLASH) { return; }
+  // The precise kernel is dereferenced below, so its existence must be verified before anything else
+  if (overlapPrecise == nullptr) errorQuda("Precise overlap kernel doesn't exist");
+  if (param->cuda_prec != overlapPrecise->Precision()) {
+    errorQuda("Solve precision %d doesn't match overlap precision %d", param->cuda_prec, overlapPrecise->Precision());
+  }
+  if (param->kappa != overlapPrecise->Kappa()) {
+    errorQuda("Solve kappa %f doesn't match overlap kappa %f", param->kappa, overlapPrecise->Kappa());
+  }
+  // Rebuild the reduced-precision kernels when any of them is missing or has a different precision than requested
+  if ((!overlapSloppy || param->cuda_prec_sloppy != overlapSloppy->Precision())
+      || (!overlapPrecondition || param->cuda_prec_precondition != overlapPrecondition->Precision())
+      || (!overlapRefinement || param->cuda_prec_refinement_sloppy != overlapRefinement->Precision())
+      || (!overlapEigensolver || param->cuda_prec_eigensolver != overlapEigensolver->Precision())) {
+    freeSloppyOverlapQuda();
+    QudaPrecision prec[4] = {param->cuda_prec_sloppy, param->cuda_prec_precondition, param->cuda_prec_refinement_sloppy,
+                             param->cuda_prec_eigensolver};
+    loadSloppyOverlapQuda(prec);
+  }
+  // Every reduced-precision kernel must exist once the (re)load step above has run
+  if (overlapSloppy == nullptr) errorQuda("Sloppy overlap kernel doesn't exist");
+  if (overlapPrecondition == nullptr) errorQuda("Precondition overlap kernel doesn't exist");
+  if (overlapRefinement == nullptr) errorQuda("Refinement overlap kernel doesn't exist");
+  if (overlapEigensolver == nullptr) errorQuda("Eigensolver overlap kernel doesn't exist");
+}
+
 quda::GaugeField *checkGauge(QudaInvertParam *param)
 {
   quda::GaugeField *U = param->dslash_type == QUDA_ASQTAD_DSLASH ? gaugeFatPrecise :
@@ -2567,6 +2769,7 @@ quda::GaugeField *checkGauge(QudaInvertParam *param)
   }
 
   checkClover(param);
+  checkOverlap(param);
 
   return U;
 }
@@ -2651,7 +2854,10 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam
     inv_param->solution_type = QUDA_MATPC_SOLUTION;
   } else {
     if (eig_param->use_norm_op)
-      inv_param->solve_type = QUDA_NORMOP_SOLVE;
+      if (eig_param->chirality == QUDA_INVALID_CHIRALITY)
+        inv_param->solve_type = QUDA_NORMOP_SOLVE;
+      else
+        inv_param->solve_type = QUDA_NORMOP_CHIRAL_SOLVE;
     else
       inv_param->solve_type = QUDA_DIRECT_SOLVE;
     inv_param->solution_type = QUDA_MAT_SOLUTION;
@@ -2691,7 +2897,8 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam
   // Construct vectors
   //------------------------------------------------------
   // Create host wrappers around application vector set
-  ColorSpinorParam cpuParam(nullptr, *inv_param, cudaGauge->X(), inv_param->solution_type, inv_param->input_location);
+  ColorSpinorParam cpuParam(nullptr, *inv_param, cudaGauge->X(), pc_solve, inv_param->input_location);
+  if (eig_param->chirality != QUDA_INVALID_CHIRALITY) { cpuParam.nSpin = 2; }
 
   int n_eig = eig_param->n_conv;
   if (eig_param->compute_svd) n_eig *= 2;
@@ -2713,8 +2920,11 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam
   ColorSpinorParam cudaParam(cpuParam, *inv_param, QUDA_CUDA_FIELD_LOCATION);
   cudaParam.create = QUDA_ZERO_FIELD_CREATE;
   cudaParam.setPrecision(inv_param->cuda_prec_eigensolver, inv_param->cuda_prec_eigensolver, true);
+  // Overlap fermions will use almost all device memory to construct the operator
+  // and so we need to ensure that the eigenvectors are stored in pinned memory.
+  if (inv_param->dslash_type == QUDA_OVERLAP_DSLASH) { cudaParam.mem_type = QUDA_MEMORY_HOST_PINNED; }
   // Ensure device vectors qre in UKQCD basis for Wilson type fermions
-  if (cudaParam.nSpin != 1) cudaParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
+  if (cudaParam.nSpin == 4) cudaParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
 
   std::vector<ColorSpinorField> kSpace(n_eig);
   for (int i = 0; i < n_eig; i++) {
@@ -2760,7 +2970,12 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam
   } else if (!eig_param->use_norm_op && eig_param->use_dagger) {
     m = new DiracMdag(dirac);
   } else if (eig_param->use_norm_op && !eig_param->use_dagger) {
-    m = new DiracMdagM(dirac);
+    if (eig_param->chirality == QUDA_INVALID_CHIRALITY) {
+      m = new DiracMdagM(dirac);
+    } else {
+      m = new DiracMdagMChiral(dirac);
+      ((DiracMdagMChiral *)m)->setChirality(eig_param->chirality);
+    }
   } else if (eig_param->use_norm_op && eig_param->use_dagger) {
     m = new DiracMMdag(dirac);
   } else {
@@ -3045,7 +3260,7 @@ deflated_solver::deflated_solver(QudaEigParam &eig_param, TimeProfile &profile)
 
   if (ritzParam.location==QUDA_CUDA_FIELD_LOCATION) {
     ritzParam.setPrecision(param->cuda_prec_ritz, param->cuda_prec_ritz, true); // set native field order
-    if (ritzParam.nSpin != 1) ritzParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
+    if (ritzParam.nSpin == 4) ritzParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
 
     //select memory location here, by default ritz vectors will be allocated on the device
     //but if not sufficient device memory, then the user may choose mapped type of memory
@@ -3577,6 +3792,40 @@ void dslashMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, Quda
   callMultiSrcQuda(_hp_x, _hp_b, param, op, parity);
 }
 
+namespace quda
+{
+  void separateChiral(std::vector<ColorSpinorField> &b_left, std::vector<ColorSpinorField> &b_right,
+                      const ColorSpinorField &b, double nb)
+  {
+    ColorSpinorParam chiralParam(b);
+    chiralParam.nSpin = 2;
+    chiralParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
+    chiralParam.setPrecision(chiralParam.Precision(), chiralParam.Precision(), true);
+    b_left.resize(0);
+    b_right.resize(0);
+    {
+      ColorSpinorField tmp_left(chiralParam), tmp_right(chiralParam);
+      spinorChiralProject(tmp_left, tmp_right, b);
+      if (blas::norm2(tmp_left) / nb > 1e-16) { b_left.push_back(std::move(tmp_left)); }
+      if (blas::norm2(tmp_right) / nb > 1e-16) { b_right.push_back(std::move(tmp_right)); }
+    }
+  }
+
+  void combineChiral(cvector_ref<ColorSpinorField> &x_left, cvector_ref<ColorSpinorField> &x_right,
+                     cvector_ref<ColorSpinorField> &x)
+  {
+    auto tmp = getFieldTmp(x[0]);
+    for (size_t i = 0; i < x_left.size(); i++) {
+      spinorChiralReconstruct(tmp, x_left[i], QUDA_LEFT_CHIRALITY);
+      blas::xpy(tmp, x[i]);
+    }
+    for (size_t i = 0; i < x_right.size(); i++) {
+      spinorChiralReconstruct(tmp, x_right[i], QUDA_RIGHT_CHIRALITY);
+      blas::xpy(tmp, x[i]);
+    }
+  }
+} // namespace quda
+
 /*!
  * Generic version of the multi-shift solver. Should work for
  * most fermions. Note that offset[0] is not folded into the mass parameter.
@@ -3611,6 +3860,7 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
   bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE);
   bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) || (param->solution_type ==  QUDA_MATPC_SOLUTION);
   bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) || (param->solve_type == QUDA_DIRECT_PC_SOLVE);
+  bool chiral_solve = (param->solve_type == QUDA_NORMOP_CHIRAL_SOLVE);
 
   if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
       param->dslash_type == QUDA_STAGGERED_DSLASH) {
@@ -3625,7 +3875,7 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
 
   } else { // Wilson type
 
-    if (mat_solution) {
+    if (mat_solution && !chiral_solve) {
       errorQuda("For Wilson-type fermions, multi-shift solver does not support MAT or MATPC solution types");
     }
     if (direct_solve) {
@@ -3748,13 +3998,98 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
       param->dslash_type == QUDA_STAGGERED_DSLASH) {
     m = new DiracM(dirac);
     mSloppy = new DiracM(diracSloppy);
+  } else if (chiral_solve) {
+    m = new DiracMdagMChiral(dirac);
+    mSloppy = new DiracMdagMChiral(diracSloppy);
   } else {
     m = new DiracMdagM(dirac);
     mSloppy = new DiracMdagM(diracSloppy);
   }
 
+  std::vector<ColorSpinorField> b_left, b_right;
+  if (chiral_solve) {
+    cudaParam.create = QUDA_NULL_FIELD_CREATE;
+    cudaParam.nSpin = 2;
+    cudaParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
+    cudaParam.setPrecision(cudaParam.Precision(), cudaParam.Precision(), true);
+    separateChiral(b_left, b_right, b, nb);
+    blas::zero(x);
+  }
+  std::vector<ColorSpinorField> x_left(b_left.size() * param->num_offset, cudaParam);
+  std::vector<ColorSpinorField> x_right(b_right.size() * param->num_offset, cudaParam);
+
+  if (chiral_solve) {
+    // load the overlap low-mode eigensystem if it exists
+    Complex *evals_ov;
+    Complex **evecs_ov;
+    int n_low = 0;
+
+    if (param->ov_n_ev > 0 && param->ov_eigvals != NULL && param->ov_eigvecs != NULL) {
+      n_low = param->ov_n_ev;
+      evals_ov = reinterpret_cast<Complex *>(param->ov_eigvals);
+      evecs_ov = reinterpret_cast<Complex **>(param->ov_eigvecs);
+    } else {
+      errorQuda("No overlap eigensystem loaded.\n");
+    }
+
+    ColorSpinorParam gpuParam(b);
+    gpuParam.create = QUDA_COPY_FIELD_CREATE;
+
+    std::vector<ColorSpinorField> gpu_evecs(n_low);
+    {
+      ColorSpinorParam tmpParam(nullptr, *param, gpuParam.x, false, QUDA_CPU_FIELD_LOCATION);
+      tmpParam.create = QUDA_REFERENCE_FIELD_CREATE;
+
+      for (int i = 0; i < n_low; i++) {
+        tmpParam.v = evecs_ov[i];
+
+        ColorSpinorField cpu_ref(tmpParam);
+        gpuParam.field = &cpu_ref;
+        gpuParam.create = QUDA_COPY_FIELD_CREATE;
+        gpu_evecs[i] = ColorSpinorField(gpuParam);
+      }
+    }
+
+    // low-mode propagator & deflation for chiral overlap
+    for (QudaChirality chirality : {QUDA_LEFT_CHIRALITY, QUDA_RIGHT_CHIRALITY}) {
+      auto &b_chiral = (chirality == QUDA_LEFT_CHIRALITY) ? b_left : b_right;
+      if (b_chiral.size() > 0) {
+        auto tmp = getFieldTmp(x[0]);
+        for (int i = 0; i < n_low; i++) {
+          auto tmp_chiral = getFieldTmp<ColorSpinorField>(cudaParam);
+          spinorChiralProject(tmp_chiral, gpu_evecs[i], chirality);
+          Complex alpha = blas::cDotProduct(tmp_chiral, b_chiral[0]);
+          Complex lambda = evals_ov[i];
+          if (sqrt(std::fabs(lambda.real())) <= 100 * std::fabs(lambda.imag())) {
+            alpha *= -2.0;
+          } else {
+            alpha *= -1.0;
+          }
+          blas::caxpy(alpha, tmp_chiral, b_chiral[0]);
+          for (int j = 0; j < param->num_offset; j++) {
+            const double inv_m = 1.0 / (param->offset[j] + lambda.real() * lambda.real() + lambda.imag() * lambda.imag());
+            spinorChiralReconstruct(tmp, tmp_chiral, chirality);
+            blas::caxpy(-inv_m * alpha, tmp, x[j]);
+          }
+        }
+      }
+    }
+  }
+
   SolverParam solverParam(*param);
-  {
+  if (chiral_solve) {
+    // high-mode propagator for chiral overlap
+    for (QudaChirality chirality : {QUDA_LEFT_CHIRALITY, QUDA_RIGHT_CHIRALITY}) {
+      auto &b_chiral = (chirality == QUDA_LEFT_CHIRALITY) ? b_left : b_right;
+      auto &x_chiral = (chirality == QUDA_LEFT_CHIRALITY) ? x_left : x_right;
+      ((DiracMdagMChiral *)m)->setChirality(chirality);
+      ((DiracMdagMChiral *)mSloppy)->setChirality(chirality);
+      if (b_chiral.size() > 0) {
+        MultiShiftCG cg_m(*m, *mSloppy, solverParam);
+        cg_m(x_chiral, b_chiral[0], p, r2_old);
+      }
+    }
+  } else {
     MultiShiftCG cg_m(*m, *mSloppy, solverParam);
     cg_m(x, b, p, r2_old);
   }
@@ -3806,19 +4141,28 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
           diracSloppy.setMass(sqrt(param->offset[i]/4));
         }
 
+        if (param->dslash_type == QUDA_OVERLAP_DSLASH) {
+          dirac.setMass(sqrt(param->offset[i] / (param->offset[i] + 1.0)));
+          diracSloppy.setMass(sqrt(param->offset[i] / (param->offset[i] + 1.0)));
+        }
+
         DiracMatrix *m, *mSloppy;
 
         if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
             param->dslash_type == QUDA_STAGGERED_DSLASH) {
           m = new DiracM(dirac);
           mSloppy = new DiracM(diracSloppy);
+        } else if (chiral_solve) {
+          m = new DiracMdagMChiral(dirac);
+          mSloppy = new DiracMdagMChiral(diracSloppy);
         } else {
           m = new DiracMdagM(dirac);
           mSloppy = new DiracMdagM(diracSloppy);
         }
 
         // need to curry in the shift if we are not doing staggered
-        if (param->dslash_type != QUDA_ASQTAD_DSLASH && param->dslash_type != QUDA_STAGGERED_DSLASH) {
+        if (param->dslash_type != QUDA_ASQTAD_DSLASH && param->dslash_type != QUDA_STAGGERED_DSLASH
+            && param->dslash_type != QUDA_OVERLAP_DSLASH) {
           m->shift = param->offset[i];
           mSloppy->shift = param->offset[i];
         }
@@ -3856,7 +4200,21 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
         solverParam.tol_hq = param->tol_hq_offset[i];                                     // set heavy quark tolerance
         solverParam.delta = param->reliable_delta_refinement;
 
-        {
+        if (chiral_solve) {
+          for (QudaChirality chirality : {QUDA_LEFT_CHIRALITY, QUDA_RIGHT_CHIRALITY}) {
+            auto &b_chiral = (chirality == QUDA_LEFT_CHIRALITY) ? b_left : b_right;
+            auto &x_chiral = (chirality == QUDA_LEFT_CHIRALITY) ? x_left : x_right;
+            ((DiracMdagMChiral *)m)->setChirality(chirality);
+            ((DiracMdagMChiral *)mSloppy)->setChirality(chirality);
+            if (b_chiral.size() > 0) {
+              CG cg(*m, *mSloppy, *mSloppy, *mSloppy, solverParam);
+              if (i == 0)
+                cg(x_chiral[i], b_chiral[0], p[i], r2_old[i]);
+              else
+                cg(x_chiral[i], b_chiral[0]);
+            }
+          }
+        } else {
           CG cg(*m, *mSloppy, *mSloppy, *mSloppy, solverParam);
           if (i == 0)
             cg(x[i], b, p[i], r2_old[i]);
@@ -3874,12 +4232,29 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
           diracSloppy.setMass(sqrt(param->offset[0]/4)); // restore just in case
         }
 
+        if (param->dslash_type == QUDA_OVERLAP_DSLASH) {
+          dirac.setMass(sqrt(param->offset[0] / (param->offset[0] + 1.0)));
+          diracSloppy.setMass(sqrt(param->offset[0] / (param->offset[0] + 1.0)));
+        }
+
         delete m;
         delete mSloppy;
       }
     }
   }
 
+  if (chiral_solve) { combineChiral(x_left, x_right, x); }
+
+  if (chiral_solve) {
+    auto tmp = getFieldTmp(x[0]);
+    for (int i = 0; i < param->num_offset; i++) {
+      d->setMass(sqrt(param->offset[i] / (param->offset[i] + 1.0)));
+      blas::copy(tmp, x[i]);
+      d->Mdag(x[i], tmp);
+      d->reconstruct(x[i], b, param->solution_type);
+    }
+  }
+
   // restore shifts
   for (int i = 0; i < param->num_offset; i++) param->offset[i] = unscaled_shifts[i];
 
@@ -5218,7 +5593,7 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par
 
   inv_param->dslash_type = QUDA_ASQTAD_DSLASH;
 
-  ColorSpinorParam cpuParam(h_in, *inv_param, X, QUDA_MAT_SOLUTION, QUDA_CPU_FIELD_LOCATION);
+  ColorSpinorParam cpuParam(h_in, *inv_param, X, false, QUDA_CPU_FIELD_LOCATION);
   cpuParam.nSpin = 1;
   // QUDA style pointer for host data.
   ColorSpinorField in_h(cpuParam);
@@ -5243,6 +5618,7 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par
   diracParam.fatGauge  = gaugeFatPrecise;
   diracParam.longGauge = gaugeLongPrecise;
   diracParam.clover = cloverPrecise;
+  diracParam.overlap_kernel = overlapPrecise;
   diracParam.kappa  = inv_param->kappa;
   diracParam.mass   = inv_param->mass;
   diracParam.m5     = inv_param->m5;
diff --git a/lib/inv_multi_cg_quda.cpp b/lib/inv_multi_cg_quda.cpp
index dba0e11f4e..dfc40b59c8 100644
--- a/lib/inv_multi_cg_quda.cpp
+++ b/lib/inv_multi_cg_quda.cpp
@@ -71,7 +71,7 @@ namespace quda {
   public:
     ShiftUpdate(ColorSpinorField &r, std::vector<ColorSpinorField> &p, std::vector<ColorSpinorField> &x,
                 std::vector<double> &alpha, std::vector<double> &beta, std::vector<double> &zeta,
-                std::vector<double> &zeta_old, int j_low, int n_shift) :
+                std::vector<double> &zeta_old, int j_low, int n_shift, int n_update) :
       r(r),
       p(p),
       x(x),
@@ -81,7 +81,7 @@ namespace quda {
       zeta_old(zeta_old),
       j_low(j_low),
       n_shift(n_shift),
-      n_update((r.Nspin() == 4) ? 4 : 2)
+      n_update(n_update)
     {
     }
 
@@ -265,7 +265,8 @@ namespace quda {
 
     // now create the worker class for updating the shifted solutions and gradient vectors
     bool aux_update = false;
-    ShiftUpdate shift_update(r_sloppy, p, x_sloppy, alpha, beta, zeta, zeta_old, j_low, num_offset_now);
+    ShiftUpdate shift_update(r_sloppy, p, x_sloppy, alpha, beta, zeta, zeta_old, j_low, num_offset_now,
+                             matSloppy.getStencilSteps());
 
     getProfile().TPSTOP(QUDA_PROFILE_PREAMBLE);
     getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
diff --git a/lib/multi_blas_quda.cu b/lib/multi_blas_quda.cu
index e52a63b434..5738ec4506 100644
--- a/lib/multi_blas_quda.cu
+++ b/lib/multi_blas_quda.cu
@@ -99,7 +99,7 @@ namespace quda {
         staticCheck<NXZ, store_t, y_store_t, decltype(f)>(f, x, y);
 
         constexpr bool site_unroll_check = !std::is_same<store_t, y_store_t>::value || isFixed<store_t>::value;
-        if (site_unroll_check && (x[0].Ncolor() != 3 || x[0].Nspin() == 2))
+        if (site_unroll_check && (x[0].Ncolor() != 3 && x[0].Nspin() == 2))
           errorQuda("site unroll not supported for nSpin = %d nColor = %d", x[0].Nspin(), x[0].Ncolor());
 
         TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
@@ -115,7 +115,7 @@ namespace quda {
           constexpr bool site_unroll = !std::is_same<device_store_t, device_y_store_t>::value || isFixed<device_store_t>::value;
           constexpr int N = n_vector<device_store_t, true, nSpin, site_unroll>();
           constexpr int Ny = n_vector<device_y_store_t, true, nSpin, site_unroll>();
-          constexpr int M = site_unroll ? (nSpin == 4 ? 24 : 6) : N; // real numbers per thread
+          constexpr int M = site_unroll ? n_vector<device_store_t, false, nSpin, true>() : N; // real numbers per thread
           const int length = x[0].Length() / (nParity * M);
 
           if (tp.aux.x > 1 && (length * tp.aux.x) % device::warp_size() != 0) {
diff --git a/lib/multi_reduce_quda.cu b/lib/multi_reduce_quda.cu
index 2e6a564df5..c1c387744c 100644
--- a/lib/multi_reduce_quda.cu
+++ b/lib/multi_reduce_quda.cu
@@ -107,7 +107,7 @@ namespace quda {
 
         auto &x0 = x[0];
         constexpr bool site_unroll_check = !std::is_same<store_t, y_store_t>::value || isFixed<store_t>::value;
-        if (site_unroll_check && (x0.Ncolor() != 3 || x0.Nspin() == 2))
+        if (site_unroll_check && (x0.Ncolor() != 3 && x0.Nspin() == 2))
           errorQuda("site unroll not supported for nSpin = %d nColor = %d", x0.Nspin(), x0.Ncolor());
 
         TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
@@ -123,7 +123,7 @@ namespace quda {
           constexpr bool site_unroll = !std::is_same<device_store_t, device_y_store_t>::value || isFixed<device_store_t>::value;
           constexpr int N = n_vector<device_store_t, true, nSpin, site_unroll>();
           constexpr int Ny = n_vector<device_y_store_t, true, nSpin, site_unroll>();
-          constexpr int M = site_unroll ? (nSpin == 4 ? 24 : 6) : N; // real numbers per thread
+          constexpr int M = site_unroll ? n_vector<device_store_t, false, nSpin, true>() : N; // real numbers per thread
           const int length = x0.Length() / M;
 
           MultiReduceArg<device_real_t, M, NXZ, device_store_t, N, device_y_store_t, Ny, decltype(r_)> arg(x, y, z, w, r_, NYW, length, nParity);
@@ -660,7 +660,9 @@ namespace quda {
             max_YW_size<multiCdot<device_reduce_t, float>>(x.size(), x0.Precision(), y0.Precision());
 
           // if fine-grid then we set max tile size to 32 to avoid unnecessary tuning
-          uint2 max_tile_size = make_uint2(1, std::min({NYW_max, (int)y.size(), x0.Ncolor() == 3 ? 32 : NYW_max}));
+          // if not on device memory we set max tile size to 8 to avoid extremely long tuning
+          const int x0_max_tile_size = x0.Ncolor() == 3 ? (x0.MemType() == QUDA_MEMORY_DEVICE ? 32 : 8) : NYW_max;
+          uint2 max_tile_size = make_uint2(1, std::min({NYW_max, (int)y.size(), x0_max_tile_size}));
           multiReduce_recurse<multiCdot, multiCdot>(result_tmp, x, y, x, x, 0, 0, false, max_tile_size);
         } else if (y.size() == 1 && x0.Precision() == y0.Precision()) {
 
@@ -672,7 +674,9 @@ namespace quda {
             max_YW_size<multiCdot<device_reduce_t, float>>(y.size(), y0.Precision(), x0.Precision());
 
           // if fine-grid then we set max tile size to 32 to avoid unnecessary tuning
-          uint2 max_tile_size = make_uint2(1, std::min({NXZ_max, (int)x.size(), x0.Ncolor() == 3 ? 32 : NXZ_max}));
+          // if not on device memory we set max tile size to 8 to avoid extremely long tuning
+          const int x0_max_tile_size = x0.Ncolor() == 3 ? (x0.MemType() == QUDA_MEMORY_DEVICE ? 32 : 8) : NXZ_max;
+          uint2 max_tile_size = make_uint2(1, std::min({NXZ_max, (int)x.size(), x0_max_tile_size}));
           multiReduce_recurse<multiCdot, multiCdot>(result_trans, y, x, y, y, 0, 0, false, max_tile_size);
 
           // transpose the result if we are doing the transpose calculation
diff --git a/lib/overlap_kernel.cpp b/lib/overlap_kernel.cpp
new file mode 100644
index 0000000000..8af0609122
--- /dev/null
+++ b/lib/overlap_kernel.cpp
@@ -0,0 +1,154 @@
+#include <eigen_helper.h>
+#include <overlap_kernel.h>
+
+namespace quda
+{
+  // Chebyshev polynomial of the first kind, T_n(x).
+  // For |x| <= 1 the closed form cos(n acos(x)) is evaluated; otherwise the
+  // recurrence T_{k+1}(x) = 2 x T_k(x) - T_{k-1}(x) is iterated upwards from T_1 and T_2.
+  double Tn(double x, int n)
+  {
+    // std::abs (not unqualified abs) so that the floating-point overload is always the one selected
+    if (std::abs(x) <= 1.0) { return cos(n * std::acos(x)); }
+    if (n == 0) { return 1.0; }
+    if (n == 1) { return x; }
+
+    // t_prev / t_curr start as T_1 / T_2; the loop is skipped entirely when n <= 2
+    double t_prev = x, t_curr = 2 * x * x - 1;
+    for (int k = 3; k <= n; ++k) {
+      const double t_next = 2 * x * t_curr - t_prev;
+      t_prev = t_curr;
+      t_curr = t_next;
+    }
+    return t_curr;
+  }
+
+  // Evaluate the Chebyshev series \sum_{i=0}^n c_i T_i(x) with Clenshaw's backward
+  // recurrence, which follows from T_{k+1}(x) = 2 x T_k(x) - T_{k-1}(x).
+  // Reads c[0..n], so c is expected to hold at least n + 1 entries.
+  double ciTi(double x, std::vector<double> c, int n)
+  {
+    double next = 0.0, after_next = 0.0; // b_{k+1} and b_{k+2} of the recurrence
+    for (int k = n; k > 0; --k) {
+      const double here = c[k] + 2 * x * next - after_next;
+      after_next = next;
+      next = here;
+    }
+    return c[0] + x * next - after_next;
+  }
+
+  // Derivative of a Chebyshev series: (\sum_{i=0}^n c_i T_i)' = \sum_{i=1}^n i c_i U_{i-1},
+  // summed with Clenshaw's backward recurrence using U_{k+1}(x) = 2 x U_k(x) - U_{k-1}(x).
+  // Reads c[1..n].
+  double iciUim1(double x, std::vector<double> c, int n)
+  {
+    double next = 0.0, after_next = 0.0; // b_{k+1} and b_{k+2} of the recurrence
+    for (int k = n - 1; k > 0; --k) {
+      const double here = (k + 1) * c[k + 1] + 2 * x * next - after_next;
+      after_next = next;
+      next = here;
+    }
+    return c[1] + 2 * x * next - after_next;
+  }
+
+  // Error 1 - sqrt(x) P(z) of the approximation, or its x-derivative, where P = \sum_{i=0}^n c_i T_i
+  // and z is x mapped linearly so that x = epsilon gives z = -1 and x = 1 gives z = +1.
+  double residual(double x, std::vector<double> c, int n, double epsilon, bool derivative)
+  {
+    const double z = (x * 2 - (1 + epsilon)) / (1 - epsilon);
+    if (!derivative) { return 1 - sqrt(x) * ciTi(z, c, n); }
+    // d/dx [1 - sqrt(x) P(z)], with dz/dx = 2 / (1 - epsilon)
+    return -1 / (2 * sqrt(x)) * ciTi(z, c, n) - sqrt(x) * iciUim1(z, c, n) * (2 / (1 - epsilon));
+  }
+
+  // Find a zero of residual() (or of its derivative) between x_l and x_r with ten false-position steps.
+  // Returns an end point directly if it is already a zero; aborts if the ends do not bracket a sign change.
+  double findRoot(double x_l, double x_r, std::vector<double> c, int n, double epsilon, bool derivative)
+  {
+    double res_l = residual(x_l, c, n, epsilon, derivative);
+    double res_r = residual(x_r, c, n, epsilon, derivative);
+    if (std::abs(res_l) < 1e-15) return x_l; // std::abs: always the floating-point overload
+    if (std::abs(res_r) < 1e-15) return x_r;
+    if (res_r * res_l > 0)
+      errorQuda("ERROR: findRoot with derivative=%d called with wrong ends: (%e %e)->(%e %e)\n", derivative, x_l, x_r,
+                res_l, res_r);
+    for (int i = 0; i < 10; i++) {
+      const double x_m = (res_l * x_r - res_r * x_l) / (res_l - res_r);
+      const double res_m = residual(x_m, c, n, epsilon, derivative);
+      if (res_m * res_l > 0) { // same sign as the left end: the sign change is between x_m and x_r
+        x_l = x_m;
+        res_l = res_m;
+      } else { // otherwise it is between x_l and x_m
+        x_r = x_m;
+        res_r = res_m;
+      }
+    }
+    return (res_l * x_r - res_r * x_l) / (res_l - res_r);
+  }
+
+  // Remez-style exchange iteration for Chebyshev coefficients c_0..c_{n-1} such that
+  // sqrt(y) \sum_j c_j T_j(z) stays within delta of 1 at the reference points y_i,
+  // with z = (2 y - (1 + epsilon)) / (1 - epsilon). Aborts if max_iter iterations do not reach delta.
+  std::vector<double> minimaxApproximationRemez(double delta, double epsilon)
+  {
+    const int n = ceil(-log(delta / 0.41) / (2.083 * sqrt(epsilon))) + 1;
+    constexpr int max_iter = 5;
+    std::vector<double> y(n + 1), z(n + 1), c(n + 1), b(n + 1);
+    Eigen::Map<Eigen::VectorXd> b_eigen(b.data(), b.size()), c_eigen(c.data(), c.size());
+    Eigen::MatrixXd M_eigen(n + 1, n + 1);
+
+    for (int i = 0; i < n + 1; ++i) {
+      z[i] = cos(M_PI * i / n);
+      y[i] = (z[i] * (1 - epsilon) + (1 + epsilon)) / 2;
+    }
+    int iter = 0;
+    for (; iter < max_iter; ++iter) {
+      // Construct matrix M_ij=\sqrt{y_i}T_j(z_i)
+      for (int i = 0; i < n + 1; ++i) {
+        for (int j = 0; j < n; ++j) { M_eigen(i, j) = sqrt(y[i]) * Tn(z[i], j); }
+        M_eigen(i, n) = i % 2 == 0 ? 1 : -1; // last column is the alternating +/-1 term, not an actual T_n
+        b_eigen(i) = 1.0;
+      }
+      c_eigen = M_eigen.lu().solve(b_eigen);
+      // Drop T_n: from here on only c[0..n-1] are used and b is reused as scratch storage
+      for (int i = 0; i < n; ++i) { b[i] = findRoot(y[i], y[i + 1], c, n - 1, epsilon, false); }
+      for (int i = n - 1; i > 0; --i) { y[i] = findRoot(b[i], b[i - 1], c, n - 1, epsilon, true); }
+      for (int i = 1; i < n; ++i) { z[i] = (2 * y[i] - (1 + epsilon)) / (1 - epsilon); }
+      for (int i = 0; i < n + 1; ++i) { b[i] = std::abs(1 - sqrt(y[i]) * ciTi(z[i], c, n - 1)); }
+      if (*std::max_element(b.begin(), b.end()) <= delta) { break; } // leaves iter < max_iter on success
+    }
+    if (iter == max_iter) { errorQuda("minimaxApproximationRemez can not converge"); }
+    return {c.begin(), c.begin() + n};
+  }
+
+  OverlapKernel::OverlapKernel(std::vector<ColorSpinorField> &evecs, const std::vector<Complex> &evals, double kappa,
+                               const std::vector<double> remez_tol) :
+    evals(evals.size()), // sized only; the real parts are filled in below
+    kappa(kappa),
+    epsilon(pow(evals.back().real() / (1.0 + 8.0 * kappa), 2)), // reads the last eigenvalue: evals must be non-empty
+    remez_tol(remez_tol),
+    remez_coeff(remez_tol.size()),
+    remez_order(remez_tol.size())
+  {
+    this->evecs = std::move(evecs); // move-assigned: the caller should treat its vector as moved-from
+    for (size_t i = 0; i < evals.size(); i++) { this->evals[i] = evals[i].real(); } // imaginary parts are dropped
+    for (size_t i = 0; i < remez_tol.size(); i++) { // one coefficient set per requested tolerance
+      remez_coeff[i] = minimaxApproximationRemez(remez_tol[i], epsilon);
+      remez_order[i] = remez_coeff[i].size() - 1; // index of the highest coefficient that was kept
+    }
+  }
+
+  OverlapKernel::OverlapKernel(const OverlapKernel *overlap_kernel, QudaPrecision precision) :
+    evals(overlap_kernel->evals), // scalar data and Remez tables are copied unchanged from the source kernel
+    kappa(overlap_kernel->kappa),
+    epsilon(overlap_kernel->epsilon),
+    remez_tol(overlap_kernel->remez_tol),
+    remez_coeff(overlap_kernel->remez_coeff),
+    remez_order(overlap_kernel->remez_order)
+  {
+    ColorSpinorParam param(overlap_kernel->evecs[0]); // reads evecs[0]: the source must hold at least one vector
+    param.setPrecision(precision, precision, true);
+    evecs.resize(overlap_kernel->evecs.size(), ColorSpinorField(param)); // fields built from param; filled below
+    for (size_t i = 0; i < overlap_kernel->evecs.size(); i++) { evecs[i].copy(overlap_kernel->evecs[i]); }
+  }
+} // namespace quda
diff --git a/lib/quda_ptr.cpp b/lib/quda_ptr.cpp
index ac1a9bdd8b..088e570142 100644
--- a/lib/quda_ptr.cpp
+++ b/lib/quda_ptr.cpp
@@ -46,6 +46,10 @@ namespace quda
       device = nullptr;
       host = ptr;
       break;
+    case QUDA_MEMORY_MAPPED:
+      host = ptr;
+      device = get_mapped_device_pointer(ptr);
+      break;
     case QUDA_MEMORY_MANAGED:
       device = ptr;
       host = ptr;
@@ -78,6 +82,7 @@ namespace quda
       case QUDA_MEMORY_HOST: host_free(host); break;
       case QUDA_MEMORY_HOST_PINNED: pool ? pool_pinned_free(host) : host_free(host); break;
       case QUDA_MEMORY_MAPPED: host_free(host); break;
+      case QUDA_MEMORY_MANAGED: managed_free(host); break;
       default: errorQuda("Unknown memory type %d", type);
       }
       getProfile().TPSTOP(QUDA_PROFILE_FREE);
@@ -105,6 +110,7 @@ namespace quda
     switch (type) {
     case QUDA_MEMORY_DEVICE:
     case QUDA_MEMORY_DEVICE_PINNED:
+    case QUDA_MEMORY_HOST_PINNED: // Host pinned memory is visible to the device
     case QUDA_MEMORY_MAPPED:
     case QUDA_MEMORY_MANAGED: return true;
     default: return false;
@@ -116,6 +122,7 @@ namespace quda
     switch (type) {
     case QUDA_MEMORY_HOST:
     case QUDA_MEMORY_HOST_PINNED:
+    case QUDA_MEMORY_MAPPED:
     case QUDA_MEMORY_MANAGED: return true;
     default: return false;
     }
diff --git a/lib/reduce_quda.cu b/lib/reduce_quda.cu
index 58e2e9b199..d5b612365f 100644
--- a/lib/reduce_quda.cu
+++ b/lib/reduce_quda.cu
@@ -74,7 +74,7 @@ namespace quda {
       void apply(const qudaStream_t &stream) override
       {
         constexpr bool site_unroll_check = !std::is_same<store_t, y_store_t>::value || isFixed<store_t>::value || decltype(r)::site_unroll;
-        if (site_unroll_check && (x.Ncolor() != 3 || x.Nspin() == 2))
+        if (site_unroll_check && (x.Ncolor() != 3 && x.Nspin() == 2))
           errorQuda("site unroll not supported for nSpin = %d nColor = %d", x.Nspin(), x.Ncolor());
 
         TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
@@ -89,7 +89,7 @@ namespace quda {
           constexpr bool site_unroll = !std::is_same<device_store_t, device_y_store_t>::value || isFixed<device_store_t>::value || decltype(r)::site_unroll;
           constexpr int N = n_vector<device_store_t, true, nSpin, site_unroll>();
           constexpr int Ny = n_vector<device_y_store_t, true, nSpin, site_unroll>();
-          constexpr int M = site_unroll ? (nSpin == 4 ? 24 : 6) : N; // real numbers per thread
+          constexpr int M = site_unroll ? n_vector<device_store_t, false, nSpin, true>() : N; // real numbers per thread
           const int length = x.Length() / M;
 
           ReductionArg<device_real_t, M, device_store_t, N, device_y_store_t, Ny, decltype(r_)> arg(x, y, z, w, v, r_, length, nParity);
diff --git a/lib/solve.cpp b/lib/solve.cpp
index 79fa868633..7af518e400 100644
--- a/lib/solve.cpp
+++ b/lib/solve.cpp
@@ -33,6 +33,21 @@ namespace quda
       for (auto &b2i : b2) printfQuda("Mass rescale: norm of source in = %g\n", b2i);
     }
 
+    // overlap dslash uses mass normalization internally
+    if (param.dslash_type == QUDA_OVERLAP_DSLASH) {
+      const double two_rho = 8.0 - 1.0 / kappa;
+      switch (param.solution_type) {
+      case QUDA_MAT_SOLUTION:
+        if (param.mass_normalization == QUDA_KAPPA_NORMALIZATION) blas::ax(1.0 / two_rho, b);
+        break;
+      case QUDA_MATDAG_MAT_SOLUTION:
+        if (param.mass_normalization == QUDA_KAPPA_NORMALIZATION) blas::ax(1.0 / (two_rho * two_rho), b);
+        break;
+      default: errorQuda("Not implemented");
+      }
+      return;
+    }
+
     // staggered dslash uses mass normalization internally
     if (param.dslash_type == QUDA_ASQTAD_DSLASH || param.dslash_type == QUDA_STAGGERED_DSLASH) {
       switch (param.solution_type) {
@@ -125,6 +140,46 @@ namespace quda
     }
   }
 
+  // Project each in[i] onto both chiralities; a half is kept only if its norm2 exceeds 1e-6 * nb[i]
+  void separateChiral(std::vector<size_t> &idx_left, std::vector<ColorSpinorField> &in_left,
+                      std::vector<size_t> &idx_right, std::vector<ColorSpinorField> &in_right,
+                      cvector_ref<const ColorSpinorField> &in, std::vector<double> &nb)
+  {
+    ColorSpinorParam chiralParam(in[0]);
+    chiralParam.nSpin = 2;
+    chiralParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
+    chiralParam.setPrecision(chiralParam.Precision(), chiralParam.Precision(), true);
+    for (auto *idx : {&idx_left, &idx_right}) { idx->clear(); } // reset with the fields so idx_*[k] pairs with in_*[k]
+    for (auto *fields : {&in_left, &in_right}) { fields->resize(0); }
+    for (size_t i = 0; i < in.size(); i++) {
+      ColorSpinorField tmp_left(chiralParam), tmp_right(chiralParam);
+      spinorChiralProject(tmp_left, tmp_right, in[i]);
+      if (blas::norm2(tmp_left) / nb[i] > 1e-6) {
+        idx_left.push_back(i); // remember which source this left half belongs to
+        in_left.push_back(std::move(tmp_left));
+      }
+      if (blas::norm2(tmp_right) / nb[i] > 1e-6) {
+        idx_right.push_back(i); // remember which source this right half belongs to
+        in_right.push_back(std::move(tmp_right));
+      }
+    }
+  }
+
+  void combineChiral(std::vector<size_t> &idx_left, cvector_ref<ColorSpinorField> &out_left,
+                     std::vector<size_t> &idx_right, cvector_ref<ColorSpinorField> &out_right,
+                     cvector_ref<ColorSpinorField> &out)
+  {
+    auto full = getFieldTmp(out[0]); // temporary obtained from out[0], reused for every reconstruction
+    for (QudaChirality chirality : {QUDA_LEFT_CHIRALITY, QUDA_RIGHT_CHIRALITY}) { // all left halves, then all right
+      auto &half = (chirality == QUDA_LEFT_CHIRALITY) ? out_left : out_right;
+      auto &idx = (chirality == QUDA_LEFT_CHIRALITY) ? idx_left : idx_right;
+      for (size_t k = 0; k < half.size(); k++) {
+        spinorChiralReconstruct(full, half[k], chirality);
+        blas::xpy(full, out[idx[k]]); // idx[k] names the full spinor that the k-th half is added to
+      }
+    }
+  }
+
   void solve(cvector_ref<ColorSpinorField> &x, cvector_ref<ColorSpinorField> &b, Dirac &dirac, Dirac &diracSloppy,
              Dirac &diracPre, Dirac &diracEig, QudaInvertParam &param)
   {
@@ -132,7 +187,9 @@ namespace quda
 
     bool mat_solution = (param.solution_type == QUDA_MAT_SOLUTION) || (param.solution_type == QUDA_MATPC_SOLUTION);
     bool direct_solve = (param.solve_type == QUDA_DIRECT_SOLVE) || (param.solve_type == QUDA_DIRECT_PC_SOLVE);
-    bool norm_error_solve = (param.solve_type == QUDA_NORMERR_SOLVE) || (param.solve_type == QUDA_NORMERR_PC_SOLVE);
+    bool norm_error_solve = (param.solve_type == QUDA_NORMERR_SOLVE) || (param.solve_type == QUDA_NORMERR_PC_SOLVE)
+      || (param.solve_type == QUDA_NORMERR_CHIRAL_SOLVE);
+    bool chiral_solve = (param.solve_type == QUDA_NORMOP_CHIRAL_SOLVE);
 
     auto nb = blas::norm2(b);
     for (auto &bi : nb) {
@@ -212,7 +269,101 @@ namespace quda
       solverParam.updateInvertParam(param);
     }
 
-    if (direct_solve) {
+    if (chiral_solve && !direct_solve) {
+      DiracMdagMChiral m(dirac), mSloppy(diracSloppy), mPre(diracPre), mEig(diracEig);
+      SolverParam solverParam(param);
+
+      std::vector<size_t> idx_left, idx_right;
+      std::vector<ColorSpinorField> in_left, in_right;
+      separateChiral(idx_left, in_left, idx_right, in_right, in, nb);
+      auto out_left = getFieldTmp<ColorSpinorField>(in_left);
+      auto out_right = getFieldTmp<ColorSpinorField>(in_right);
+
+      // load the overlap low-mode eigensystem if it exists
+      Complex *evals_ov;
+      Complex **evecs_ov;
+      int n_low = 0;
+
+      if (param.ov_n_ev > 0 && param.ov_eigvals != NULL && param.ov_eigvecs != NULL) {
+        n_low = param.ov_n_ev;
+        evals_ov = reinterpret_cast<Complex *>(param.ov_eigvals);
+        evecs_ov = reinterpret_cast<Complex **>(param.ov_eigvecs);
+      } else {
+        errorQuda("No overlap eigensystem loaded.\n");
+      }
+
+      ColorSpinorParam gpuParam(in[0]);
+      gpuParam.create = QUDA_COPY_FIELD_CREATE;
+
+      std::vector<ColorSpinorField> gpu_evecs(n_low);
+      {
+        ColorSpinorParam tmpParam(nullptr, param, gpuParam.x, false, QUDA_CPU_FIELD_LOCATION);
+        tmpParam.create = QUDA_REFERENCE_FIELD_CREATE;
+
+        for (int i = 0; i < n_low; i++) {
+          tmpParam.v = evecs_ov[i];
+
+          ColorSpinorField cpu_ref(tmpParam);
+          gpuParam.field = &cpu_ref;
+          gpuParam.create = QUDA_COPY_FIELD_CREATE;
+          gpu_evecs[i] = ColorSpinorField(gpuParam);
+        }
+      }
+
+      ColorSpinorParam chiralParam(in[0]);
+      chiralParam.nSpin = 2;
+      chiralParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
+      chiralParam.setPrecision(chiralParam.Precision(), chiralParam.Precision(), true);
+
+      // low-mode propagator & deflation for chiral overlap
+      for (QudaChirality chirality : {QUDA_LEFT_CHIRALITY, QUDA_RIGHT_CHIRALITY}) {
+        auto &in_chiral = (chirality == QUDA_LEFT_CHIRALITY) ? in_left : in_right;
+        if (in_chiral.size() > 0) {
+          auto tmp = getFieldTmp(out[0]);
+          for (int i = 0; i < n_low; i++) {
+            auto tmp_chiral = getFieldTmp<ColorSpinorField>(chiralParam);
+            spinorChiralProject(tmp_chiral, gpu_evecs[i], chirality);
+            std::vector<Complex> alpha;
+            blas::block::cDotProduct(alpha, tmp_chiral, in_chiral);
+            Complex lambda = evals_ov[i];
+            if (sqrt(std::fabs(lambda.real())) <= 100 * std::fabs(lambda.imag())) {
+              for (auto &v : alpha) { v *= -2.0; }
+            } else {
+              for (auto &v : alpha) { v *= -1.0; }
+            }
+            blas::block::caxpy(alpha, tmp_chiral, in_chiral);
+            const double mass = param.mass;
+            const double offset = (mass * mass) / (1.0 - mass * mass);
+            const double inv_m = 1.0 / (offset + lambda.real() * lambda.real() + lambda.imag() * lambda.imag());
+            for (auto &v : alpha) { v *= -inv_m; }
+            spinorChiralReconstruct(tmp, tmp_chiral, chirality);
+            blas::block::caxpy(alpha, tmp, out);
+          }
+        }
+      }
+
+      // high-mode propagator for chiral overlap
+      for (QudaChirality chirality : {QUDA_LEFT_CHIRALITY, QUDA_RIGHT_CHIRALITY}) {
+        auto &in_chiral = (chirality == QUDA_LEFT_CHIRALITY) ? in_left : in_right;
+        auto &out_chiral = (chirality == QUDA_LEFT_CHIRALITY) ? out_left : out_right;
+        m.setChirality(chirality);
+        mSloppy.setChirality(chirality);
+        mPre.setChirality(chirality);
+        mEig.setChirality(chirality);
+        if (in_chiral.size() > 0) {
+          Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, mEig);
+          (*solve)(out_chiral, in_chiral);
+          delete solve;
+          solverParam.updateInvertParam(param);
+        }
+      }
+      combineChiral(idx_left, out_left, idx_right, out_right, out);
+      if (mat_solution) {
+        auto tmp = getFieldTmp<ColorSpinorField>(out);
+        blas::copy(tmp, out);
+        dirac.Mdag(out, tmp);
+      }
+    } else if (direct_solve) {
       DiracM m(dirac), mSloppy(diracSloppy), mPre(diracPre), mEig(diracEig);
       SolverParam solverParam(param);
 
diff --git a/lib/spinor_chiral_project.cu b/lib/spinor_chiral_project.cu
new file mode 100644
index 0000000000..c83d781180
--- /dev/null
+++ b/lib/spinor_chiral_project.cu
@@ -0,0 +1,167 @@
+#include <color_spinor_field.h>
+#include <tunable_nd.h>
+#include <kernels/spinor_chiral_project.cuh>
+#include <instantiate.h>
+
+namespace quda
+{
+
+  template <typename Float, int Nc> class SpinorChiralReconstruct : TunableKernel2D
+  {
+    ColorSpinorField &out;            // field written by the kernel
+    const ColorSpinorField &in_left;  // input counted in bytes() unless chirality is RIGHT
+    const ColorSpinorField &in_right; // input counted in bytes() unless chirality is LEFT
+    const QudaChirality chirality;    // run-time selector mapped onto a template parameter in apply()
+    unsigned int minThreads() const { return out.VolumeCB(); }
+
+    // Instantiate the kernel argument for a compile-time chirality and launch it
+    template <QudaChirality chiral> void launchChiral(TuneParam &tp, const qudaStream_t &stream)
+    {
+      ChiralReconstructSpinorArg<Float, Nc, chiral> arg(out, in_left, in_right);
+      launch<ChiralReconstructSpinor>(tp, stream, arg);
+    }
+  public:
+    SpinorChiralReconstruct(ColorSpinorField &out, const ColorSpinorField &in_left, const ColorSpinorField &in_right,
+                            QudaChirality chirality) :
+      TunableKernel2D(out, out.SiteSubset()), out(out), in_left(in_left), in_right(in_right), chirality(chirality)
+    {
+      apply(device::get_default_stream()); // the kernel is launched as part of construction
+    }
+
+    void apply(const qudaStream_t &stream)
+    {
+      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
+      switch (chirality) {
+      case QUDA_INVALID_CHIRALITY: launchChiral<QUDA_INVALID_CHIRALITY>(tp, stream); break;
+      case QUDA_LEFT_CHIRALITY: launchChiral<QUDA_LEFT_CHIRALITY>(tp, stream); break;
+      case QUDA_RIGHT_CHIRALITY: launchChiral<QUDA_RIGHT_CHIRALITY>(tp, stream); break;
+      default: errorQuda("Unsupported chirality %d", chirality);
+      }
+    }
+
+    long long bytes() const
+    {
+      return out.Bytes() + ((chirality != QUDA_RIGHT_CHIRALITY) ? in_left.Bytes() : 0)
+        + ((chirality != QUDA_LEFT_CHIRALITY) ? in_right.Bytes() : 0);
+    }
+  };
+
+  // Validate the field metadata and dispatch the chiral reconstruction kernel: dst must be nSpin=4 in the
+  // UKQCD basis, both sources nSpin=2 in the DeGrand-Rossi basis; only nColor=3 double/single is instantiated.
+  void spinorChiralReconstruct(ColorSpinorField &dst, const ColorSpinorField &src_left,
+                               const ColorSpinorField &src_right, QudaChirality chirality)
+  {
+    checkPrecision(dst, src_left, src_right);
+    checkColor(dst, src_left, src_right);
+
+    if (dst.Nspin() != 4 || src_left.Nspin() != 2 || src_right.Nspin() != 2) {
+      errorQuda("Unsupported nspin combination: dst=%d, src_left=%d, src_right=%d\n", dst.Nspin(), src_left.Nspin(),
+                src_right.Nspin());
+    }
+    if (dst.GammaBasis() != QUDA_UKQCD_GAMMA_BASIS || src_left.GammaBasis() != QUDA_DEGRAND_ROSSI_GAMMA_BASIS
+        || src_right.GammaBasis() != QUDA_DEGRAND_ROSSI_GAMMA_BASIS) {
+      errorQuda("Unsupported gamma basis combination: dst %d, src_left %d, src_right %d\n", dst.GammaBasis(),
+                src_left.GammaBasis(), src_right.GammaBasis());
+    }
+
+    if (dst.Ncolor() != 3) {
+      errorQuda("nColor=%d not implemented", dst.Ncolor());
+    } else if (dst.Precision() == QUDA_DOUBLE_PRECISION) {
+      SpinorChiralReconstruct<double, 3>(dst, src_left, src_right, chirality);
+    } else if (dst.Precision() == QUDA_SINGLE_PRECISION) {
+      SpinorChiralReconstruct<float, 3>(dst, src_left, src_right, chirality);
+    } else {
+      errorQuda("Precision %d not implemented", dst.Precision());
+    }
+  }
+
+  void spinorChiralReconstruct(ColorSpinorField &dst, const ColorSpinorField &src, QudaChirality chirality)
+  {
+    spinorChiralReconstruct(dst, src, src, chirality); // src serves as both src_left and src_right
+  }
+
+  void spinorChiralReconstruct(ColorSpinorField &dst, const ColorSpinorField &src_left, const ColorSpinorField &src_right)
+  {
+    spinorChiralReconstruct(dst, src_left, src_right, QUDA_INVALID_CHIRALITY); // presumably selects both chiralities
+  }
+
+  template <typename Float, int Nc> class SpinorChiralProject : TunableKernel2D
+  {
+    ColorSpinorField &out_left;  // output counted in bytes() unless chirality is RIGHT
+    ColorSpinorField &out_right; // output counted in bytes() unless chirality is LEFT
+    const ColorSpinorField &in;  // field read by the kernel
+    const QudaChirality chirality; // run-time selector mapped onto a template parameter in apply()
+    unsigned int minThreads() const { return in.VolumeCB(); }
+
+    // Instantiate the kernel argument for a compile-time chirality and launch it
+    template <QudaChirality chiral> void launchChiral(TuneParam &tp, const qudaStream_t &stream)
+    {
+      ChiralProjectSpinorArg<Float, Nc, chiral> arg(out_left, out_right, in);
+      launch<ChiralProjectSpinor>(tp, stream, arg);
+    }
+  public:
+    SpinorChiralProject(ColorSpinorField &out_left, ColorSpinorField &out_right, const ColorSpinorField &in,
+                        QudaChirality chirality) :
+      TunableKernel2D(in, in.SiteSubset()), out_left(out_left), out_right(out_right), in(in), chirality(chirality)
+    {
+      apply(device::get_default_stream()); // the kernel is launched as part of construction
+    }
+
+    void apply(const qudaStream_t &stream)
+    {
+      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
+      switch (chirality) {
+      case QUDA_INVALID_CHIRALITY: launchChiral<QUDA_INVALID_CHIRALITY>(tp, stream); break;
+      case QUDA_LEFT_CHIRALITY: launchChiral<QUDA_LEFT_CHIRALITY>(tp, stream); break;
+      case QUDA_RIGHT_CHIRALITY: launchChiral<QUDA_RIGHT_CHIRALITY>(tp, stream); break;
+      default: errorQuda("Unsupported chirality %d", chirality);
+      }
+    }
+
+    long long bytes() const
+    {
+      return ((chirality != QUDA_RIGHT_CHIRALITY) ? out_left.Bytes() : 0)
+        + ((chirality != QUDA_LEFT_CHIRALITY) ? out_right.Bytes() : 0) + in.Bytes();
+    }
+  };
+
+  // Check that the fields are compatible (dst_left/dst_right nSpin=2 in the DeGrand-Rossi basis, src nSpin=4
+  // in the UKQCD basis) and launch the chiral projection; only nColor=3 double/single is instantiated.
+  void spinorChiralProject(ColorSpinorField &dst_left, ColorSpinorField &dst_right, const ColorSpinorField &src,
+                           QudaChirality chirality)
+  {
+    checkPrecision(dst_left, dst_right, src);
+    checkColor(dst_left, dst_right, src);
+
+    if (!(dst_left.Nspin() == 2 && dst_right.Nspin() == 2 && src.Nspin() == 4)) {
+      errorQuda("Unsupported nspin combination: dst_left=%d, dst_right=%d, src=%d\n", dst_left.Nspin(),
+                dst_right.Nspin(), src.Nspin());
+    }
+    if (src.GammaBasis() != QUDA_UKQCD_GAMMA_BASIS || dst_left.GammaBasis() != QUDA_DEGRAND_ROSSI_GAMMA_BASIS
+        || dst_right.GammaBasis() != QUDA_DEGRAND_ROSSI_GAMMA_BASIS) {
+      errorQuda("Unsupported gamma basis combination: dst_left %d, dst_right %d, src %d\n", dst_left.GammaBasis(),
+                dst_right.GammaBasis(), src.GammaBasis());
+    }
+
+    if (src.Ncolor() != 3) {
+      errorQuda("nColor=%d not implemented", src.Ncolor());
+    } else {
+      switch (src.Precision()) {
+      case QUDA_DOUBLE_PRECISION: SpinorChiralProject<double, 3>(dst_left, dst_right, src, chirality); break;
+      case QUDA_SINGLE_PRECISION: SpinorChiralProject<float, 3>(dst_left, dst_right, src, chirality); break;
+      default: errorQuda("Precision %d not implemented", src.Precision());
+      }
+    }
+  }
+
+  void spinorChiralProject(ColorSpinorField &dst, const ColorSpinorField &src, QudaChirality chirality)
+  { // NOTE(review): dst is aliased as both outputs; presumably only meaningful for a single chirality - confirm
+    spinorChiralProject(dst, dst, src, chirality); // dst serves as both dst_left and dst_right
+  }
+
+  void spinorChiralProject(ColorSpinorField &dst_left, ColorSpinorField &dst_right, const ColorSpinorField &src)
+  {
+    spinorChiralProject(dst_left, dst_right, src, QUDA_INVALID_CHIRALITY); // presumably selects both chiralities
+  }
+
+} // namespace quda