diff --git a/ggml-quants.c b/ggml-quants.c index a15a240487084..a21815dad7d18 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -424,6 +424,128 @@ static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4 static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 #endif +static void lstsq_q_1(const uint8_t * restrict q, const float * restrict x, int qk, float *min, float *d) { + // Least squares fits `d * q + m = x` for d and m. + float qs = 0.0f; + float qs2 = 0.0f; + float xs = 0.0f; + float xq = 0.0f; + float minx = x[0]; + float maxx = x[0]; + float q1 = 0.0f; + for (int i = 0; i < qk; i++) { + float qf = q[i]; + float w = fabsf(x[i]); + q1 += w; + qs += w*qf; + qs2 += w*qf*qf; + xs += w*x[i]; + xq += w*x[i]*qf; + if (x[i] < minx) minx = x[i]; + if (x[i] > maxx) maxx = x[i]; + } + float denom = qs*qs - qs2*q1; + if (minx == maxx) { + *min = x[0]; + *d = 0.0f; + } else if (denom == 0.0f) { + *min = 0.0f; + *d = 0.0f; + } else { + *min = (qs*xq - qs2*xs) / denom; + *d = (qs*xs - q1*xq) / denom; + } +} + +static float lstsq_q_0_u8(const uint8_t * restrict q, const float * restrict x, int qk) { + // Least squares fits `d * q = x` for d. + float qs2 = 0.0f; + float xq = 0.0f; + for (int i = 0; i < qk; i++) { + qs2 += q[i]*q[i]; + xq += x[i]*q[i]; + } + if (qs2 == 0.0f) { + return 0.0f; + } + return xq / qs2; +} + +static void lstsq_q_k(const float * restrict q, const float * restrict x, const float * restrict s, int bs, float *min, float *d) { + // Least squares fits `d * q - s * m = x` for d and m. + float s2 = 0.0f; + float qs = 0.0f; + float q2 = 0.0f; + float sx = 0.0f; + float qx = 0.0f; + for (int i = 0; i < QK_K; i++) { + float w = fabsf(x[i]); + s2 += w*s[i/bs]*s[i/bs]; + qs += w*q[i]*s[i/bs]; + q2 += w*q[i]*q[i]; + sx += w*s[i/bs]*x[i]; + qx += w*q[i]*x[i]; + } + float denom = qs*qs - q2*s2; + if (s2 == 0.0f && q2 != 0.0f) { + // All s are zero. + *min = 0.0f; + *d = qx / q2; + } else if (denom == 0.0f) { + *min = 0.0f; + *d = 0.0f; + } else { + *min = (q2*sx - qs*qx) / denom; + *d = (qs*sx - qx*s2) / denom; + } +} + +static void quantize_1_0min(const float * restrict x, int n, int bits, uint8_t * restrict q, float * scale) { + // Least squares fits `d * q = x` for d with unsigned q. + float max = -FLT_MAX; + for (int l = 0; l < n; l++) { + const float v = fabsf(x[l]); + if (v > max) max = v; + } + + float d = max / ((1 << bits) - 1); + *scale = d; + const float id = d ? 1.0f/d : 0.0f; + + for (int l = 0; l < n; l++) { + const float x0 = x[l]*id; + + const uint8_t xi0 = MAX(0, MIN((1 << bits) - 1, (int8_t)(x0 + 0.5f))); + + q[l] = xi0; + } + *scale = lstsq_q_0_u8(q, x, n); +} + +static void quantize_1(const float * restrict x, int n, int bits, uint8_t * restrict q, float * m, float * scale) { + float min = FLT_MAX; + float max = -FLT_MAX; + for (int l = 0; l < n; l++) { + const float v = x[l]; + + if (v < min) min = v; + if (v > max) max = v; + } + + float d = (max - min) / ((1 << bits) - 1); + const float id = d ? 1.0f/d : 0.0f; + + for (int l = 0; l < n; l++) { + const float x0 = (x[l] - min)*id; + + const uint8_t xi0 = MIN((1 << bits) - 1, (int8_t)(x0 + 0.5f)); + + q[l] = xi0; + } + + lstsq_q_1(q, x, n, m, scale); +} + // reference implementation for deterministic creation of model files void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { static const int qk = QK4_0; @@ -1195,6 +1317,135 @@ static inline int nearest_int(float fval) { return (i & 0x007fffff) - 0x00400000; } +static void quantize_q_k_1(const float * x, int bits, int scale_bits, int block_size, int * block_scales, int * block_mins, uint8_t * L, ggml_fp16_t * block_scale, ggml_fp16_t * block_min) { + float max_scale = 0.0f; + float max_min = 0.0f; + + // If all the weight are positive we can invert the sign of min. + // Otherwise blocks with all positive weights need to be quantized with zero + // min, because min scale is unsigned. + bool all_positive = true; + for (int j = 0; j < QK_K; j++) { + if (x[j] < 0.0f) { + all_positive = false; + break; + } + } + + float scales[QK_K]; + float mins[QK_K]; + + for (int j = 0; j < QK_K/block_size; j++) { + uint8_t q[QK_K]; + // First find least squares solution for min and scale for each block. + quantize_1(&x[block_size*j], block_size, bits, q, &mins[j], &scales[j]); + // Flip the sign because quantize_1 assumes that min is added, but min + // is subtracted in k-quants. + mins[j] = -mins[j]; + if ((!all_positive && mins[j] < 0) || (all_positive && mins[j] > 0)) { + // All weights are positive in this block, but some blocks have + // negative weights. Find new least squares scale with zero min. + mins[j] = 0.0f; + quantize_1_0min(&x[block_size*j], block_size, bits, q, &scales[j]); + } + if (scales[j] > max_scale) { + max_scale = scales[j]; + } + if (j == 0) { + max_min = mins[j]; + } else if (!all_positive && mins[j] > max_min) { + max_min = mins[j]; + } else if (all_positive && mins[j] < max_min) { + max_min = mins[j]; + } + } + + int max_group = (1 << scale_bits) - 1; + + // Increasing passes would decrease RMS error by miniscule amount with + // drawback of taking more time. + for(int pass = 0; pass < 2; pass++) { + float inv_scale = max_scale == 0.0f ? 0.0f : max_group/max_scale; + float inv_min = max_min == 0.0f ? 0.0f : max_group/max_min; + float block_d = max_scale/max_group; + float block_dmin = max_min/max_group; + for (int j = 0; j < QK_K/block_size; ++j) { + uint8_t ls = MAX(0, nearest_int(inv_scale*scales[j])); + uint8_t lm = MAX(0, nearest_int(inv_min*mins[j])); + ls = MIN(max_group, ls); + lm = MIN(max_group, lm); + uint8_t best_lm = lm; + uint8_t best_ls = ls; + float best_rms = FLT_MAX; + int limit = 1; + // Increase limit for minor RMS error decrease while increasing the + // quantization run time. + if (pass > 0) limit = 8; + // Due to quantization the best ls and lm might not be the nearest + // to the ones obtained by the round to nearest. + // Loop through few nearby choices and choose lm and ls that + // minimize RMS error. + for (int lst = MAX(0, ls-limit); lst <= MIN(max_group, ls+limit); lst++) { + for (int lmt = MAX(0, lm-limit); lmt <= MIN(max_group, lm+limit); lmt++) { + float rms = 0.0f; + for (int ii = 0; ii < block_size; ii++) { + const float d = block_d * lst; + const float dm = block_dmin * lmt; + int l1 = 0; + if (d) { + l1 = nearest_int((x[block_size*j + ii] + dm)/d); + l1 = MAX(0, MIN((1 << bits) - 1, l1)); + } + const float e = (d*l1 - dm) - x[block_size*j + ii]; + rms += e*e; + } + if (rms < best_rms) { + best_lm = lmt; + best_ls = lst; + best_rms = rms; + } + } + } + block_scales[j] = best_ls; + block_mins[j] = best_lm; + } + + float q_fit[QK_K]; + float q_m[QK_K]; + + // Quantize elements and populate arrays needed for least squares fit. + for (int j = 0; j < QK_K/block_size; ++j) { + const float d = block_d * block_scales[j]; + const float dm = block_dmin * block_mins[j]; + q_m[j] = block_mins[j]; + for (int ii = 0; ii < block_size; ++ii) { + int l = 0; + if (d) { + l = nearest_int((x[block_size*j + ii] + dm)/d); + l = MAX(0, MIN((1 << bits) - 1, l)); + } + L[block_size*j + ii] = l; + q_fit[block_size*j + ii] = block_scales[j] * l; + } + } + + if (pass == 0) { + // Least squares fit min and scale. + float min, scale; + lstsq_q_k(q_fit, x, q_m, block_size, &min, &scale); + // Check for nans. + assert(min == min); + assert(scale == scale); + // Quantize to fp16 for the next pass. + max_scale = GGML_FP16_TO_FP32(GGML_FP32_TO_FP16(scale)) * max_group; + max_min = GGML_FP16_TO_FP32(GGML_FP32_TO_FP16(min)) * max_group; + + *block_scale = GGML_FP32_TO_FP16(scale); + *block_min = GGML_FP32_TO_FP16(min); + } + } +} + static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type) { float max = 0; float amax = 0; @@ -1318,49 +1569,6 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * return 1/iscale; } -static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, - int ntry, float alpha) { - float min = x[0]; - float max = x[0]; - for (int i = 1; i < n; ++i) { - if (x[i] < min) min = x[i]; - if (x[i] > max) max = x[i]; - } - if (max == min) { - for (int i = 0; i < n; ++i) L[i] = 0; - *the_min = 0; - return 0.f; - } - if (min > 0) min = 0; - float iscale = nmax/(max - min); - float scale = 1/iscale; - for (int itry = 0; itry < ntry; ++itry) { - float sumlx = 0; int suml2 = 0; - bool did_change = false; - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale*(x[i] - min)); - l = MAX(0, MIN(nmax, l)); - if (l != L[i]) { - L[i] = l; - did_change = true; - } - sumlx += (x[i] - min)*l; - suml2 += l*l; - } - scale = sumlx/suml2; - float sum = 0; - for (int i = 0; i < n; ++i) { - sum += x[i] - scale*L[i]; - } - min = alpha*min + (1 - alpha)*sum/n; - if (min > 0) min = 0; - iscale = 1/scale; - if (!did_change) break; - } - *the_min = -min; - return scale; -} - static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights, uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux, float rmin, float rdelta, int nstep, bool use_mad) { @@ -1812,13 +2020,30 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict const int nb = k / QK_K; uint8_t L[QK_K]; - uint8_t Laux[32]; - float weights[32]; - float mins[QK_K/32]; - float scales[QK_K/32]; for (int i = 0; i < nb; i++) { +#if QK_K == 256 + int block_scales[QK_K/32]; + int block_mins[QK_K/32]; + quantize_q_k_1(x, 4, 6, 32, block_scales, block_mins, L, &y[i].d, &y[i].dmin); + for (int j = 0; j < QK_K/32; ++j) { + int ls = block_scales[j]; + int lm = block_mins[j]; + if (j < 4) { + y[i].scales[j] = ls; + y[i].scales[j+4] = lm; + } else { + y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4); + y[i].scales[j-4] |= ((ls >> 4) << 6); + y[i].scales[j-0] |= ((lm >> 4) << 6); + } + } +#else + float weights[32]; + float mins[QK_K/32]; + float scales[QK_K/32]; + uint8_t Laux[32]; float max_scale = 0; // as we are deducting the min, scales are always positive float max_min = 0; for (int j = 0; j < QK_K/32; ++j) { @@ -1838,39 +2063,6 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict } } -#if QK_K == 256 - float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f; - float inv_min = max_min > 0 ? 63.f/max_min : 0.f; - for (int j = 0; j < QK_K/32; ++j) { - uint8_t ls = nearest_int(inv_scale*scales[j]); - uint8_t lm = nearest_int(inv_min*mins[j]); - ls = MIN(63, ls); - lm = MIN(63, lm); - if (j < 4) { - y[i].scales[j] = ls; - y[i].scales[j+4] = lm; - } else { - y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4); - y[i].scales[j-4] |= ((ls >> 4) << 6); - y[i].scales[j-0] |= ((lm >> 4) << 6); - } - } - y[i].d = GGML_FP32_TO_FP16(max_scale/63.f); - y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f); - - uint8_t sc, m; - for (int j = 0; j < QK_K/32; ++j) { - get_scale_min_k4(j, y[i].scales, &sc, &m); - const float d = GGML_FP16_TO_FP32(y[i].d) * sc; - if (!d) continue; - const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m; - for (int ii = 0; ii < 32; ++ii) { - int l = nearest_int((x[32*j + ii] + dm)/d); - l = MAX(0, MIN(15, l)); - L[32*j + ii] = l; - } - } -#else const float s_factor = 15.f; float inv_scale = max_scale > 0 ? s_factor/max_scale : 0.f; float inv_min = max_min > 0 ? s_factor/max_min : 0.f; @@ -1978,10 +2170,6 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict #if QK_K == 256 uint8_t L[QK_K]; - float mins[QK_K/32]; - float scales[QK_K/32]; - float weights[32]; - uint8_t Laux[32]; #else int8_t L[QK_K]; float scales[QK_K/16]; @@ -1990,33 +2178,13 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict for (int i = 0; i < nb; i++) { #if QK_K == 256 + int block_scales[QK_K/32]; + int block_mins[QK_K/32]; + quantize_q_k_1(x, 5, 6, 32, block_scales, block_mins, L, &y[i].d, &y[i].dmin); - float max_scale = 0; // as we are deducting the min, scales are always positive - float max_min = 0; for (int j = 0; j < QK_K/32; ++j) { - //scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); - float sum_x2 = 0; - for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l]; - float av_x = sqrtf(sum_x2/32); - for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]); - scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false); - float scale = scales[j]; - if (scale > max_scale) { - max_scale = scale; - } - float min = mins[j]; - if (min > max_min) { - max_min = min; - } - } - - float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f; - float inv_min = max_min > 0 ? 63.f/max_min : 0.f; - for (int j = 0; j < QK_K/32; ++j) { - uint8_t ls = nearest_int(inv_scale*scales[j]); - uint8_t lm = nearest_int(inv_min*mins[j]); - ls = MIN(63, ls); - lm = MIN(63, lm); + int ls = block_scales[j]; + int lm = block_mins[j]; if (j < 4) { y[i].scales[j] = ls; y[i].scales[j+4] = lm; @@ -2026,21 +2194,6 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y[i].scales[j-0] |= ((lm >> 4) << 6); } } - y[i].d = GGML_FP32_TO_FP16(max_scale/63.f); - y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f); - - uint8_t sc, m; - for (int j = 0; j < QK_K/32; ++j) { - get_scale_min_k4(j, y[i].scales, &sc, &m); - const float d = GGML_FP16_TO_FP32(y[i].d) * sc; - if (!d) continue; - const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m; - for (int ii = 0; ii < 32; ++ii) { - int l = nearest_int((x[32*j + ii] + dm)/d); - l = MAX(0, MIN(31, l)); - L[32*j + ii] = l; - } - } uint8_t * restrict qh = y[i].qh; uint8_t * restrict ql = y[i].qs;