diff --git a/ggml-quants.c b/ggml-quants.c
index a15a240487084..a21815dad7d18 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -424,6 +424,128 @@ static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4
 static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
 #endif
 
+static void lstsq_q_1(const uint8_t * restrict q, const float * restrict x, int qk, float *min, float *d) {
+    // Least squares fits `d * q + m = x` for d and m.
+    float qs = 0.0f;
+    float qs2 = 0.0f;
+    float xs = 0.0f;
+    float xq = 0.0f;
+    float minx = x[0];
+    float maxx = x[0];
+    float q1 = 0.0f;
+    for (int i = 0; i < qk; i++) {
+        float qf = q[i];
+        float w = fabsf(x[i]);
+        q1 += w;
+        qs += w*qf;
+        qs2 += w*qf*qf;
+        xs += w*x[i];
+        xq += w*x[i]*qf;
+        if (x[i] < minx) minx = x[i];
+        if (x[i] > maxx) maxx = x[i];
+    }
+    float denom = qs*qs - qs2*q1;
+    if (minx == maxx) {
+        *min = x[0];
+        *d = 0.0f;
+    } else if (denom == 0.0f) {
+        *min = 0.0f;
+        *d = 0.0f;
+    } else {
+        *min = (qs*xq - qs2*xs) / denom;
+        *d = (qs*xs - q1*xq) / denom;
+    }
+}
+
+static float lstsq_q_0_u8(const uint8_t * restrict q, const float * restrict x, int qk) {
+    // Least squares fits `d * q = x` for d.
+    float qs2 = 0.0f;
+    float xq = 0.0f;
+    for (int i = 0; i < qk; i++) {
+        qs2 += q[i]*q[i];
+        xq += x[i]*q[i];
+    }
+    if (qs2 == 0.0f) {
+        return 0.0f;
+    }
+    return xq / qs2;
+}
+
+static void lstsq_q_k(const float * restrict q, const float * restrict x, const float * restrict s, int bs, float *min, float *d) {
+    // Least squares fits `d * q - s * m = x` for d and m.
+    float s2 = 0.0f;
+    float qs = 0.0f;
+    float q2 = 0.0f;
+    float sx = 0.0f;
+    float qx = 0.0f;
+    for (int i = 0; i < QK_K; i++) {
+        float w = fabsf(x[i]);
+        s2 += w*s[i/bs]*s[i/bs];
+        qs += w*q[i]*s[i/bs];
+        q2 += w*q[i]*q[i];
+        sx += w*s[i/bs]*x[i];
+        qx += w*q[i]*x[i];
+    }
+    float denom = qs*qs - q2*s2;
+    if (s2 == 0.0f && q2 != 0.0f) {
+        // All s are zero.
+        *min = 0.0f;
+        *d = qx / q2;
+    } else if (denom == 0.0f) {
+        *min = 0.0f;
+        *d = 0.0f;
+    } else {
+        *min = (q2*sx - qs*qx) / denom;
+        *d = (qs*sx - qx*s2) / denom;
+    }
+}
+
+static void quantize_1_0min(const float * restrict x, int n, int bits, uint8_t * restrict q, float * scale) {
+    // Least squares fits `d * q = x` for d with unsigned q.
+    float max = -FLT_MAX;
+    for (int l = 0; l < n; l++) {
+        const float v = fabsf(x[l]);
+        if (v > max) max = v;
+    }
+
+    float d  = max / ((1 << bits) - 1);
+    *scale = d;
+    const float id = d ? 1.0f/d : 0.0f;
+
+    for (int l = 0; l < n; l++) {
+        const float x0 = x[l]*id;
+
+        const uint8_t xi0 = MAX(0, MIN((1 << bits) - 1, (int8_t)(x0 + 0.5f)));
+
+        q[l] = xi0;
+    }
+    *scale = lstsq_q_0_u8(q, x, n);
+}
+
+static void quantize_1(const float * restrict x, int n, int bits, uint8_t * restrict q, float * m, float * scale) {
+    float min = FLT_MAX;
+    float max = -FLT_MAX;
+    for (int l = 0; l < n; l++) {
+        const float v = x[l];
+
+        if (v < min) min = v;
+        if (v > max) max = v;
+    }
+
+    float d  = (max - min) / ((1 << bits) - 1);
+    const float id = d ? 1.0f/d : 0.0f;
+
+    for (int l = 0; l < n; l++) {
+        const float x0 = (x[l] - min)*id;
+
+        const uint8_t xi0 = MIN((1 << bits) - 1, (int8_t)(x0 + 0.5f));
+
+        q[l] = xi0;
+    }
+
+    lstsq_q_1(q, x, n, m, scale);
+}
+
 // reference implementation for deterministic creation of model files
 void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
     static const int qk = QK4_0;
@@ -1195,6 +1317,135 @@ static inline int nearest_int(float fval) {
     return (i & 0x007fffff) - 0x00400000;
 }
 
+static void quantize_q_k_1(const float * x, int bits, int scale_bits, int block_size, int * block_scales, int * block_mins, uint8_t * L, ggml_fp16_t * block_scale, ggml_fp16_t * block_min) {
+    float max_scale = 0.0f;
+    float max_min = 0.0f;
+
+    // If all the weight are positive we can invert the sign of min.
+    // Otherwise blocks with all positive weights need to be quantized with zero
+    // min, because min scale is unsigned.
+    bool all_positive = true;
+    for (int j = 0; j < QK_K; j++) {
+        if (x[j] < 0.0f) {
+            all_positive = false;
+            break;
+        }
+    }
+
+    float scales[QK_K];
+    float mins[QK_K];
+
+    for (int j = 0; j < QK_K/block_size; j++) {
+        uint8_t q[QK_K];
+        // First find least squares solution for min and scale for each block.
+        quantize_1(&x[block_size*j], block_size, bits, q, &mins[j], &scales[j]);
+        // Flip the sign because quantize_1 assumes that min is added, but min
+        // is subtracted in k-quants.
+        mins[j] = -mins[j];
+        if ((!all_positive && mins[j] < 0) || (all_positive && mins[j] > 0)) {
+            // All weights are positive in this block, but some blocks have
+            // negative weights. Find new least squares scale with zero min.
+            mins[j] = 0.0f;
+            quantize_1_0min(&x[block_size*j], block_size, bits, q, &scales[j]);
+        }
+        if (scales[j] > max_scale) {
+            max_scale = scales[j];
+        }
+        if (j == 0) {
+            max_min = mins[j];
+        } else if (!all_positive && mins[j] > max_min) {
+            max_min = mins[j];
+        } else if (all_positive && mins[j] < max_min) {
+            max_min = mins[j];
+        }
+    }
+
+    int max_group = (1 << scale_bits) - 1;
+
+    // Increasing passes would decrease RMS error by miniscule amount with
+    // drawback of taking more time.
+    for(int pass = 0; pass < 2; pass++) {
+        float inv_scale  = max_scale == 0.0f ? 0.0f : max_group/max_scale;
+        float inv_min    = max_min == 0.0f ? 0.0f : max_group/max_min;
+        float block_d    = max_scale/max_group;
+        float block_dmin = max_min/max_group;
+        for (int j = 0; j < QK_K/block_size; ++j) {
+            uint8_t ls = MAX(0, nearest_int(inv_scale*scales[j]));
+            uint8_t lm = MAX(0, nearest_int(inv_min*mins[j]));
+            ls = MIN(max_group, ls);
+            lm = MIN(max_group, lm);
+            uint8_t best_lm = lm;
+            uint8_t best_ls = ls;
+            float best_rms = FLT_MAX;
+            int limit = 1;
+            // Increase limit for minor RMS error decrease while increasing the
+            // quantization run time.
+            if (pass > 0) limit = 8;
+            // Due to quantization the best ls and lm might not be the nearest
+            // to the ones obtained by the round to nearest.
+            // Loop through few nearby choices and choose lm and ls that
+            // minimize RMS error.
+            for (int lst = MAX(0, ls-limit); lst <= MIN(max_group, ls+limit); lst++) {
+                for (int lmt = MAX(0, lm-limit); lmt <= MIN(max_group, lm+limit); lmt++) {
+                    float rms = 0.0f;
+                    for (int ii = 0; ii < block_size; ii++) {
+                        const float d = block_d * lst;
+                        const float dm = block_dmin * lmt;
+                        int l1 = 0;
+                        if (d) {
+                            l1 = nearest_int((x[block_size*j + ii] + dm)/d);
+                            l1 = MAX(0, MIN((1 << bits) - 1, l1));
+                        }
+                        const float e = (d*l1 - dm) - x[block_size*j + ii];
+                        rms += e*e;
+                    }
+                    if (rms < best_rms) {
+                        best_lm = lmt;
+                        best_ls = lst;
+                        best_rms = rms;
+                    }
+                }
+            }
+            block_scales[j] = best_ls;
+            block_mins[j] = best_lm;
+        }
+
+        float q_fit[QK_K];
+        float q_m[QK_K];
+
+        // Quantize elements and populate arrays needed for least squares fit.
+        for (int j = 0; j < QK_K/block_size; ++j) {
+            const float d = block_d * block_scales[j];
+            const float dm = block_dmin * block_mins[j];
+            q_m[j] = block_mins[j];
+            for (int ii = 0; ii < block_size; ++ii) {
+                int l = 0;
+                if (d) {
+                    l = nearest_int((x[block_size*j + ii] + dm)/d);
+                    l = MAX(0, MIN((1 << bits) - 1, l));
+                }
+                L[block_size*j + ii] = l;
+                q_fit[block_size*j + ii] = block_scales[j] * l;
+            }
+        }
+
+        if (pass == 0) {
+            // Least squares fit min and scale.
+            float min, scale;
+            lstsq_q_k(q_fit, x, q_m, block_size, &min, &scale);
+            // Check for nans.
+            assert(min == min);
+            assert(scale == scale);
+            // Quantize to fp16 for the next pass.
+            max_scale = GGML_FP16_TO_FP32(GGML_FP32_TO_FP16(scale)) * max_group;
+            max_min = GGML_FP16_TO_FP32(GGML_FP32_TO_FP16(min)) * max_group;
+
+            *block_scale = GGML_FP32_TO_FP16(scale);
+            *block_min = GGML_FP32_TO_FP16(min);
+        }
+    }
+}
+
 static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type) {
     float max = 0;
     float amax = 0;
@@ -1318,49 +1569,6 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
     return 1/iscale;
 }
 
-static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min,
-        int ntry, float alpha) {
-    float min = x[0];
-    float max = x[0];
-    for (int i = 1; i < n; ++i) {
-        if (x[i] < min) min = x[i];
-        if (x[i] > max) max = x[i];
-    }
-    if (max == min) {
-        for (int i = 0; i < n; ++i) L[i] = 0;
-        *the_min = 0;
-        return 0.f;
-    }
-    if (min > 0) min = 0;
-    float iscale = nmax/(max - min);
-    float scale = 1/iscale;
-    for (int itry = 0; itry < ntry; ++itry) {
-        float sumlx = 0; int suml2 = 0;
-        bool did_change = false;
-        for (int i = 0; i < n; ++i) {
-            int l = nearest_int(iscale*(x[i] - min));
-            l = MAX(0, MIN(nmax, l));
-            if (l != L[i]) {
-                L[i] = l;
-                did_change = true;
-            }
-            sumlx += (x[i] - min)*l;
-            suml2 += l*l;
-        }
-        scale = sumlx/suml2;
-        float sum = 0;
-        for (int i = 0; i < n; ++i) {
-            sum += x[i] - scale*L[i];
-        }
-        min = alpha*min + (1 - alpha)*sum/n;
-        if (min > 0) min = 0;
-        iscale = 1/scale;
-        if (!did_change) break;
-    }
-    *the_min = -min;
-    return scale;
-}
-
 static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
         uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
         float rmin, float rdelta, int nstep, bool use_mad) {
@@ -1812,13 +2020,30 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
     const int nb = k / QK_K;
 
     uint8_t L[QK_K];
-    uint8_t Laux[32];
-    float   weights[32];
-    float mins[QK_K/32];
-    float scales[QK_K/32];
 
     for (int i = 0; i < nb; i++) {
+#if QK_K == 256
+        int block_scales[QK_K/32];
+        int block_mins[QK_K/32];
+        quantize_q_k_1(x, 4, 6, 32, block_scales, block_mins, L, &y[i].d, &y[i].dmin);
 
+        for (int j = 0; j < QK_K/32; ++j) {
+            int ls = block_scales[j];
+            int lm = block_mins[j];
+            if (j < 4) {
+                y[i].scales[j] = ls;
+                y[i].scales[j+4] = lm;
+            } else {
+                y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4);
+                y[i].scales[j-4] |= ((ls >> 4) << 6);
+                y[i].scales[j-0] |= ((lm >> 4) << 6);
+            }
+        }
+#else
+        float weights[32];
+        float mins[QK_K/32];
+        float scales[QK_K/32];
+        uint8_t Laux[32];
         float max_scale = 0; // as we are deducting the min, scales are always positive
         float max_min = 0;
         for (int j = 0; j < QK_K/32; ++j) {
@@ -1838,39 +2063,6 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
             }
         }
 
-#if QK_K == 256
-        float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
-        float inv_min   = max_min   > 0 ? 63.f/max_min   : 0.f;
-        for (int j = 0; j < QK_K/32; ++j) {
-            uint8_t ls = nearest_int(inv_scale*scales[j]);
-            uint8_t lm = nearest_int(inv_min*mins[j]);
-            ls = MIN(63, ls);
-            lm = MIN(63, lm);
-            if (j < 4) {
-                y[i].scales[j] = ls;
-                y[i].scales[j+4] = lm;
-            } else {
-                y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4);
-                y[i].scales[j-4] |= ((ls >> 4) << 6);
-                y[i].scales[j-0] |= ((lm >> 4) << 6);
-            }
-        }
-        y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
-        y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
-
-        uint8_t sc, m;
-        for (int j = 0; j < QK_K/32; ++j) {
-            get_scale_min_k4(j, y[i].scales, &sc, &m);
-            const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
-            if (!d) continue;
-            const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
-            for (int ii = 0; ii < 32; ++ii) {
-                int l = nearest_int((x[32*j + ii] + dm)/d);
-                l = MAX(0, MIN(15, l));
-                L[32*j + ii] = l;
-            }
-        }
-#else
         const float s_factor = 15.f;
         float inv_scale = max_scale > 0 ? s_factor/max_scale : 0.f;
         float inv_min   = max_min   > 0 ? s_factor/max_min   : 0.f;
@@ -1978,10 +2170,6 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
 
 #if QK_K == 256
     uint8_t L[QK_K];
-    float mins[QK_K/32];
-    float scales[QK_K/32];
-    float weights[32];
-    uint8_t Laux[32];
 #else
     int8_t L[QK_K];
     float scales[QK_K/16];
@@ -1990,33 +2178,13 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
     for (int i = 0; i < nb; i++) {
 
 #if QK_K == 256
+        int block_scales[QK_K/32];
+        int block_mins[QK_K/32];
+        quantize_q_k_1(x, 5, 6, 32, block_scales, block_mins, L, &y[i].d, &y[i].dmin);
 
-        float max_scale = 0; // as we are deducting the min, scales are always positive
-        float max_min = 0;
         for (int j = 0; j < QK_K/32; ++j) {
-            //scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
-            float sum_x2 = 0;
-            for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
-            float av_x = sqrtf(sum_x2/32);
-            for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
-            scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false);
-            float scale = scales[j];
-            if (scale > max_scale) {
-                max_scale = scale;
-            }
-            float min = mins[j];
-            if (min > max_min) {
-                max_min = min;
-            }
-        }
-
-        float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
-        float inv_min   = max_min   > 0 ? 63.f/max_min   : 0.f;
-        for (int j = 0; j < QK_K/32; ++j) {
-            uint8_t ls = nearest_int(inv_scale*scales[j]);
-            uint8_t lm = nearest_int(inv_min*mins[j]);
-            ls = MIN(63, ls);
-            lm = MIN(63, lm);
+            int ls = block_scales[j];
+            int lm = block_mins[j];
             if (j < 4) {
                 y[i].scales[j] = ls;
                 y[i].scales[j+4] = lm;
@@ -2026,21 +2194,6 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
                 y[i].scales[j-0] |= ((lm >> 4) << 6);
             }
         }
-        y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
-        y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
-
-        uint8_t sc, m;
-        for (int j = 0; j < QK_K/32; ++j) {
-            get_scale_min_k4(j, y[i].scales, &sc, &m);
-            const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
-            if (!d) continue;
-            const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
-            for (int ii = 0; ii < 32; ++ii) {
-                int l = nearest_int((x[32*j + ii] + dm)/d);
-                l = MAX(0, MIN(31, l));
-                L[32*j + ii] = l;
-            }
-        }
 
         uint8_t * restrict qh = y[i].qh;
         uint8_t * restrict ql = y[i].qs;