Skip to content

Commit 29cecae

Browse files
committed
Q5_K_XSR, SR, ML, and XL revamp
1 parent 412b56f commit 29cecae

File tree

4 files changed

+95
-19
lines changed

4 files changed

+95
-19
lines changed

examples/quantize/quantize.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,9 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
6262
{ "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
6363
{ "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 5.21G, +0.1049 ppl @ Llama-3-8B", },
6464
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
65+
{ "Q5_K_XSR", LLAMA_FTYPE_MOSTLY_Q5_K_XSR, " 5.4 bpw quantization mix", },
66+
{ "Q5_K_SR", LLAMA_FTYPE_MOSTLY_Q5_K_SR, " 5.6 bpw quantization mix", },
67+
{ "Q5_K_ML", LLAMA_FTYPE_MOSTLY_Q5_K_ML, " 5.8 bpw quantization mix", },
6568
{ "Q5_K_XL", LLAMA_FTYPE_MOSTLY_Q5_K_XL, " 6 bpw quantization mix", },
6669
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
6770
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },

gguf-py/gguf/constants.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1466,7 +1466,10 @@ class LlamaFileType(IntEnum):
14661466
MOSTLY_IQ4_XSR = 113 # except 1d tensors
14671467
MOSTLY_IQ4_MR = 114 # except 1d tensors
14681468
MOSTLY_IQ4_LR = 115 # except 1d tensors
1469-
MOSTLY_Q5_K_XL = 116 # except 1d tensors
1469+
MOSTLY_Q5_K_XSR = 116 # except 1d tensors
1470+
MOSTLY_Q5_K_SR = 117 # except 1d tensors
1471+
MOSTLY_Q5_K_ML = 118 # except 1d tensors
1472+
MOSTLY_Q5_K_XL = 119 # except 1d tensors
14701473
MOSTLY_CQS = 199 # except 1d tensors
14711474

14721475
GUESSED = 1024 # not specified in the model file

include/llama.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,10 @@ extern "C" {
191191
LLAMA_FTYPE_MOSTLY_IQ4_XSR = 113, // except 1d tensors
192192
LLAMA_FTYPE_MOSTLY_IQ4_MR = 114, // except 1d tensors
193193
LLAMA_FTYPE_MOSTLY_IQ4_LR = 115, // except 1d tensors
194-
LLAMA_FTYPE_MOSTLY_Q5_K_XL = 116, // except 1d tensors
194+
LLAMA_FTYPE_MOSTLY_Q5_K_XSR = 116, // except 1d tensors
195+
LLAMA_FTYPE_MOSTLY_Q5_K_SR = 117, // except 1d tensors
196+
LLAMA_FTYPE_MOSTLY_Q5_K_ML = 118, // except 1d tensors
197+
LLAMA_FTYPE_MOSTLY_Q5_K_XL = 119, // except 1d tensors
195198
LLAMA_FTYPE_CQS = 199, // except 1d tensors
196199

197200
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file

src/llama.cpp

Lines changed: 84 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -5288,6 +5288,9 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
52885288
case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
52895289
case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
52905290
case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
5291+
case LLAMA_FTYPE_MOSTLY_Q5_K_XSR: return "Q5_K - Xtra-Small Reloaded";
5292+
case LLAMA_FTYPE_MOSTLY_Q5_K_SR: return "Q5_K - Small Reloaded";
5293+
case LLAMA_FTYPE_MOSTLY_Q5_K_ML: return "Q5_K - Medium-Large";
52915294
case LLAMA_FTYPE_MOSTLY_Q5_K_XL: return "Q5_K - Xtra-Large";
52925295
case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
52935296
case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary";
@@ -18266,7 +18269,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1826618269
else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
1826718270
new_type = GGML_TYPE_Q4_K;
1826818271
}
18269-
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XL) {
18272+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XSR || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_SR || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_ML ||
18273+
ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XL) {
1827018274
new_type = GGML_TYPE_Q6_K;
1827118275
}
1827218276
}
@@ -18390,7 +18394,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1839018394
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) new_type = GGML_TYPE_Q5_K;
1839118395
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
1839218396
(difquant_half_tensors(qs.i_attention_wv, qs.n_attention_wv))) new_type = GGML_TYPE_Q6_K;
18393-
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XL) new_type = GGML_TYPE_Q6_K;
18397+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XSR || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_SR || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_ML ||
18398+
ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XL) {
18399+
new_type = GGML_TYPE_Q6_K;
18400+
}
1839418401
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
1839518402
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
1839618403
ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
@@ -18641,7 +18648,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1864118648
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
1864218649
else new_type = GGML_TYPE_Q4_K;
1864318650
}
18644-
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XL) new_type = GGML_TYPE_Q6_K;
18651+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XSR || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_SR || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_ML ||
18652+
ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XL) {
18653+
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q6_K;
18654+
}
1864518655
else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M)
1864618656
&& (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
1864718657
new_type = GGML_TYPE_Q5_K;
@@ -18910,8 +18920,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1891018920
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) {
1891118921
if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
1891218922
}
18923+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XSR) {
18924+
if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
18925+
else new_type = GGML_TYPE_Q4_K;
18926+
}
1891318927
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XL)
18914-
new_type = difquant_fl_more_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
18928+
if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q6_K;
18929+
else new_type = difquant_first_last_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
1891518930
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
1891618931
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
1891718932
ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) {
@@ -19059,9 +19074,30 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1905919074
new_type = GGML_TYPE_Q5_K;
1906019075
}
1906119076
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
19062-
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XL) {
19077+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XSR) {
19078+
if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
19079+
new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19080+
else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
19081+
new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19082+
else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19083+
}
19084+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_SR) {
1906319085
if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
1906419086
new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19087+
else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
19088+
new_type = (difquant_half_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19089+
else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19090+
}
19091+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_ML) {
19092+
if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
19093+
new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19094+
else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
19095+
new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19096+
else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19097+
}
19098+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XL) {
19099+
if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
19100+
new_type = (difquant_half_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
1906519101
else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
1906619102
new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
1906719103
else new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
@@ -19429,8 +19465,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1942919465
}
1943019466
else new_type = difquant_three_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
1943119467
}
19468+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_SR)
19469+
new_type = GGML_TYPE_Q4_K;
1943219470
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XL)
19433-
new_type = difquant_fl_more_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19471+
new_type = difquant_first_last_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
1943419472
} else {
1943519473
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q3_K;
1943619474
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL)
@@ -19566,7 +19604,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1956619604
new_type = GGML_TYPE_Q4_K;
1956719605
}
1956819606
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
19569-
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XL) new_type = GGML_TYPE_Q6_K;
19607+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M ||
19608+
ftype == LLAMA_FTYPE_MOSTLY_Q5_K_SR || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_ML || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XL)
19609+
new_type = GGML_TYPE_Q6_K;
1957019610
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
1957119611
new_type = GGML_TYPE_IQ2_XS;
1957219612
}
@@ -19692,12 +19732,24 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1969219732
new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
1969319733
else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
1969419734
}
19735+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_SR) {
19736+
if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
19737+
new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19738+
else new_type = GGML_TYPE_Q5_K;
19739+
}
19740+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_ML) {
19741+
if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
19742+
new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19743+
else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
19744+
new_type = (difquant_first_last_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19745+
else new_type = GGML_TYPE_Q5_K;
19746+
}
1969519747
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XL) {
19696-
// if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
19697-
// new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19698-
// else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
19699-
// new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19700-
new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19748+
if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
19749+
new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19750+
else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
19751+
new_type = (difquant_fl_more_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19752+
else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
1970119753
}
1970219754
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
1970319755
if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
@@ -19863,12 +19915,24 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1986319915
new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
1986419916
else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
1986519917
}
19918+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_SR) {
19919+
if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
19920+
new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19921+
else new_type = GGML_TYPE_Q5_K;
19922+
}
19923+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_ML) {
19924+
if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
19925+
new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19926+
else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
19927+
new_type = (difquant_first_last_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19928+
else new_type = GGML_TYPE_Q5_K;
19929+
}
1986619930
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_XL) {
19867-
// if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
19868-
// new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19869-
// else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
19870-
// new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19871-
new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19931+
if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
19932+
new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19933+
else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
19934+
new_type = (difquant_fl_more_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
19935+
else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
1987219936
}
1987319937
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
1987419938
if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
@@ -20136,6 +20200,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2013620200
case LLAMA_FTYPE_MOSTLY_Q3_K_L: default_type = GGML_TYPE_Q3_K; break;
2013720201
case LLAMA_FTYPE_MOSTLY_Q4_K_S:
2013820202
case LLAMA_FTYPE_MOSTLY_Q4_K_M: default_type = GGML_TYPE_Q4_K; break;
20203+
case LLAMA_FTYPE_MOSTLY_Q5_K_XSR:
20204+
case LLAMA_FTYPE_MOSTLY_Q5_K_SR:
20205+
case LLAMA_FTYPE_MOSTLY_Q5_K_ML:
2013920206
case LLAMA_FTYPE_MOSTLY_Q5_K_XL:
2014020207
case LLAMA_FTYPE_MOSTLY_Q5_K_S:
2014120208
case LLAMA_FTYPE_MOSTLY_Q5_K_M: default_type = GGML_TYPE_Q5_K; break;

0 commit comments

Comments
 (0)