@@ -5288,6 +5288,9 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
5288
5288
case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
5289
5289
case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
5290
5290
case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
5291
+ case LLAMA_FTYPE_MOSTLY_Q5_K_XSR: return "Q5_K - Xtra-Small Reloaded";
5292
+ case LLAMA_FTYPE_MOSTLY_Q5_K_SR: return "Q5_K - Small Reloaded";
5293
+ case LLAMA_FTYPE_MOSTLY_Q5_K_ML: return "Q5_K - Medium-Large";
5291
5294
case LLAMA_FTYPE_MOSTLY_Q5_K_XL: return "Q5_K - Xtra-Large";
5292
5295
case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
5293
5296
case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary";
@@ -18266,7 +18269,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
18266
18269
else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
18267
18270
new_type = GGML_TYPE_Q4_K;
18268
18271
}
18269
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XL) {
18272
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XSR || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_SR || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_ML ||
18273
+ ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XL) {
18270
18274
new_type = GGML_TYPE_Q6_K;
18271
18275
}
18272
18276
}
@@ -18390,7 +18394,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
18390
18394
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) new_type = GGML_TYPE_Q5_K;
18391
18395
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
18392
18396
(difquant_half_tensors(qs.i_attention_wv, qs.n_attention_wv))) new_type = GGML_TYPE_Q6_K;
18393
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XL) new_type = GGML_TYPE_Q6_K;
18397
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XSR || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_SR || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_ML ||
18398
+ ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XL) {
18399
+ new_type = GGML_TYPE_Q6_K;
18400
+ }
18394
18401
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
18395
18402
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
18396
18403
ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
@@ -18641,7 +18648,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
18641
18648
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
18642
18649
else new_type = GGML_TYPE_Q4_K;
18643
18650
}
18644
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XL) new_type = GGML_TYPE_Q6_K;
18651
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XSR || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_SR || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_ML ||
18652
+ ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XL) {
18653
+ if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q6_K;
18654
+ }
18645
18655
else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M)
18646
18656
&& (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
18647
18657
new_type = GGML_TYPE_Q5_K;
@@ -18910,8 +18920,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
18910
18920
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) {
18911
18921
if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
18912
18922
}
18923
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XSR) {
18924
+ if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
18925
+ else new_type = GGML_TYPE_Q4_K;
18926
+ }
18913
18927
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XL)
18914
- new_type = difquant_fl_more_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
18928
+ if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q6_K;
18929
+ else new_type = difquant_first_last_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
18915
18930
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
18916
18931
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
18917
18932
ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) {
@@ -19059,9 +19074,30 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
19059
19074
new_type = GGML_TYPE_Q5_K;
19060
19075
}
19061
19076
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
19062
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XL) {
19077
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XSR) {
19078
+ if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
19079
+ new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19080
+ else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
19081
+ new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19082
+ else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19083
+ }
19084
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_SR) {
19063
19085
if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
19064
19086
new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19087
+ else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
19088
+ new_type = (difquant_half_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19089
+ else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19090
+ }
19091
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M3L) {
19092
+ if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
19093
+ new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19094
+ else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
19095
+ new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19096
+ else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19097
+ }
19098
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_X4L) {
19099
+ if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
19100
+ new_type = (difquant_half_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19065
19101
else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
19066
19102
new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19067
19103
else new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
@@ -19429,8 +19465,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
19429
19465
}
19430
19466
else new_type = difquant_three_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
19431
19467
}
19468
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_SR)
19469
+ new_type = GGML_TYPE_Q4_K;
19432
19470
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XL)
19433
- new_type = difquant_fl_more_tensors (qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19471
+ new_type = difquant_first_last_tensors (qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19434
19472
} else {
19435
19473
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q3_K;
19436
19474
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL)
@@ -19566,7 +19604,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
19566
19604
new_type = GGML_TYPE_Q4_K;
19567
19605
}
19568
19606
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
19569
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XL) new_type = GGML_TYPE_Q6_K;
19607
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M ||
19608
+ ftype == LLAMA_FTYPE_MOSTLY_Q5_K_SR || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_ML || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XL)
19609
+ new_type = GGML_TYPE_Q6_K;
19570
19610
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
19571
19611
new_type = GGML_TYPE_IQ2_XS;
19572
19612
}
@@ -19692,12 +19732,24 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
19692
19732
new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
19693
19733
else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
19694
19734
}
19735
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_SR) {
19736
+ if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
19737
+ new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19738
+ else new_type = GGML_TYPE_Q5_K;
19739
+ }
19740
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_ML) {
19741
+ if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
19742
+ new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19743
+ else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
19744
+ new_type = (difquant_first_last_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19745
+ else new_type = GGML_TYPE_Q5_K;
19746
+ }
19695
19747
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XL) {
19696
- // if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
19697
- // new_type = (difquant_half_tensors (i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19698
- // else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
19699
- // new_type = (difquant_three_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19700
- new_type = (difquant_fl_more_tensors (i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19748
+ if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
19749
+ new_type = (difquant_three_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19750
+ else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
19751
+ new_type = (difquant_fl_more_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19752
+ else new_type = (difquant_first_last_tensors (i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19701
19753
}
19702
19754
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
19703
19755
if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
@@ -19863,12 +19915,24 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
19863
19915
new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
19864
19916
else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
19865
19917
}
19918
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_SR) {
19919
+ if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
19920
+ new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19921
+ else new_type = GGML_TYPE_Q5_K;
19922
+ }
19923
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_ML) {
19924
+ if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
19925
+ new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19926
+ else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
19927
+ new_type = (difquant_first_last_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19928
+ else new_type = GGML_TYPE_Q5_K;
19929
+ }
19866
19930
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XL) {
19867
- // if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
19868
- // new_type = (difquant_half_tensors (i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19869
- // else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
19870
- // new_type = (difquant_three_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19871
- new_type = (difquant_fl_more_tensors (i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19931
+ if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
19932
+ new_type = (difquant_three_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19933
+ else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
19934
+ new_type = (difquant_fl_more_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19935
+ else new_type = (difquant_first_last_tensors (i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19872
19936
}
19873
19937
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
19874
19938
if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
@@ -20136,6 +20200,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
20136
20200
case LLAMA_FTYPE_MOSTLY_Q3_K_L: default_type = GGML_TYPE_Q3_K; break;
20137
20201
case LLAMA_FTYPE_MOSTLY_Q4_K_S:
20138
20202
case LLAMA_FTYPE_MOSTLY_Q4_K_M: default_type = GGML_TYPE_Q4_K; break;
20203
+ case LLAMA_FTYPE_MOSTLY_Q5_K_XSR:
20204
+ case LLAMA_FTYPE_MOSTLY_Q5_K_SR:
20205
+ case LLAMA_FTYPE_MOSTLY_Q5_K_ML:
20139
20206
case LLAMA_FTYPE_MOSTLY_Q5_K_XL:
20140
20207
case LLAMA_FTYPE_MOSTLY_Q5_K_S:
20141
20208
case LLAMA_FTYPE_MOSTLY_Q5_K_M: default_type = GGML_TYPE_Q5_K; break;
0 commit comments