Commit c667f2e

Temporary settings for IQ3 attn_k and attn_v

1 parent: 294aeec


src/llama.cpp (1 file changed: 41 additions, 21 deletions)
@@ -17768,14 +17768,14 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ4_XS;
             else new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
-        else if (qs.model.hparams.n_gqa() >= 7) {
+        // else if (qs.model.hparams.n_gqa() >= 7) {
             // The Llama 70B models have 8 heads sharing the same attn_v weights (-> GQA 8). As a result, the attn_v.weight tensor is
             // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
             // nearly negligible increase in model size by quantizing this tensor with more bits.
             // That logic applies also to models like Yi 34B (-> GQA 7) and Mistral Large 123B (-> GQA 12).
-            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_IQ3_S ||
-                new_type == GGML_TYPE_IQ4_XS) new_type = GGML_TYPE_Q5_K;
-        }
+            // if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_IQ3_S ||
+            //     new_type == GGML_TYPE_IQ4_XS) new_type = GGML_TYPE_Q5_K;
+        // }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
             new_type = (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         }
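
Note: the rationale in the comment block above is a size argument. A minimal
standalone sketch of that arithmetic, assuming Llama 2 70B's head counts
(n_head = 64, n_head_kv = 8); illustrative C++, not llama.cpp's actual API:

#include <cstdio>

int main() {
    const int n_head    = 64;                 // query heads (Llama 2 70B)
    const int n_head_kv = 8;                  // key/value heads shared via GQA
    const int n_gqa     = n_head / n_head_kv; // -> 8

    // attn_v.weight has n_head_kv * head_dim output rows versus n_head * head_dim
    // for attn_q.weight, so it is n_gqa times smaller; upgrading its quant type
    // costs roughly 1/n_gqa of what the same upgrade would cost on attn_q.
    std::printf("GQA %d: attn_v is %dx smaller than attn_q\n", n_gqa, n_gqa);
    return 0;
}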
@@ -17797,30 +17797,43 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                  ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
             new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
             new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
+            else new_type = GGML_TYPE_IQ3_S;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             else new_type = difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_five_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             else new_type = GGML_TYPE_Q4_K;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
+            else new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
             else new_type = GGML_TYPE_Q4_K;
         }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) &&
                  (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
             new_type = GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
                 new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q6_K :
                            difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             }
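
The pattern this hunk introduces for attn_v is a three-step ladder keyed on the
GQA factor. A sketch of the new IQ2_XL branch as a standalone function, with a
stand-in enum and hypothetical names in place of ggml's types (not llama.cpp API):

#include <cstdio>

enum class QType { IQ3_S, Q4_K, Q5_K };

static const char * qtype_name(QType t) {
    switch (t) {
        case QType::IQ3_S: return "IQ3_S";
        case QType::Q4_K:  return "Q4_K";
        default:           return "Q5_K";
    }
}

// Mirrors the new IQ2_XL attn_v ladder: GQA >= 8 -> Q5_K, GQA >= 4 or MoE -> Q4_K,
// everything else stays on the small IQ3_S type.
static QType attn_v_type_iq2_xl(int n_gqa, int n_expert) {
    if (n_gqa >= 8)                  return QType::Q5_K;
    if (n_gqa >= 4 || n_expert >= 2) return QType::Q4_K;
    return QType::IQ3_S;
}

int main() {
    std::printf("GQA 8, dense: %s\n", qtype_name(attn_v_type_iq2_xl(8, 1))); // Q5_K
    std::printf("GQA 4, dense: %s\n", qtype_name(attn_v_type_iq2_xl(4, 1))); // Q4_K
    std::printf("GQA 1, dense: %s\n", qtype_name(attn_v_type_iq2_xl(1, 1))); // IQ3_S
    return 0;
}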
@@ -17993,41 +18006,49 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
-            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S;
+            if (qs.model.hparams.n_gqa() >= 8 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
+            else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S;
             else new_type = GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             else new_type = difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_five_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             else new_type = GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             else new_type = difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             else new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             else new_type = difquant_six_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_six_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             else new_type = GGML_TYPE_IQ4_XS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
                 new_type = qs.i_attention_wk < qs.n_attention_wk/8 ? GGML_TYPE_Q6_K :
                            difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             }
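
The attn_k branches reuse the same GQA ladder, with the difquant_*_tensors
predicates spreading the higher type over a fixed fraction of the layers. A
hypothetical sketch of that i-of-n bucketing shape (the fork's real helpers may
distribute layers differently; this function name is illustrative only):

#include <cstdio>

// Hypothetical stand-in for a difquant_*-style predicate: send the first 3/8
// of the attention layers to the higher quant type.
static bool difquant_three_eights_sketch(int i_layer, int n_layers) {
    return i_layer < 3 * n_layers / 8;
}

int main() {
    const int n_layers = 32;
    int bumped = 0;
    for (int i = 0; i < n_layers; ++i) {
        if (difquant_three_eights_sketch(i, n_layers)) ++bumped;
    }
    std::printf("%d of %d layers bumped (~3/8)\n", bumped, n_layers); // 12 of 32
    return 0;
}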
@@ -18139,8 +18160,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ2_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
             if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ4_XS;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ2_S;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_IQ3_XXS;
