@@ -17768,14 +17768,14 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ4_XS;
             else new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
-        else if (qs.model.hparams.n_gqa() >= 7) {
+        // else if (qs.model.hparams.n_gqa() >= 7) {
             // The Llama 70B models have 8 heads sharing the same attn_v weights (-> GQA 8). As a result, the attn_v.weight tensor is
             // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
             // nearly negligible increase in model size by quantizing this tensor with more bits.
             // That logic applies also to models like Yi 34B (-> GQA 7) and Mistral Large 123B (-> GQA 12).
-            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_IQ3_S ||
-                new_type == GGML_TYPE_IQ4_XS) new_type = GGML_TYPE_Q5_K;
-        }
+            // if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_IQ3_S ||
+            //     new_type == GGML_TYPE_IQ4_XS) new_type = GGML_TYPE_Q5_K;
+        // }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
             new_type = (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         }
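To make the size argument in the comment above concrete (a sketch with illustrative numbers, not part of the patch): for Llama 2 70B, n_embd = 8192, n_head = 64, n_head_kv = 8 and head_dim = 128, so attn_v.weight carries one eighth the elements of attn_q.weight.

// Illustrative arithmetic only; values are hardcoded here for clarity,
// whereas the real code reads them from qs.model.hparams.
const int n_embd    = 8192;  // hidden size of Llama 2 70B
const int n_head    = 64;    // query heads
const int n_head_kv = 8;     // shared K/V heads -> n_gqa() == 8
const int head_dim  = 128;
const int64_t q_elems = (int64_t) n_embd * n_head    * head_dim; // 8192 x 8192
const int64_t v_elems = (int64_t) n_embd * n_head_kv * head_dim; // 8192 x 1024
// q_elems / v_elems == 8: bumping attn_v.weight by a couple of bits per weight
// costs roughly 1/8 of what the same bump on attn_q.weight would cost.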
@@ -17797,30 +17797,43 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                  ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
             new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL ) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
             new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
+            else new_type = GGML_TYPE_IQ3_S;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             else new_type = difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_five_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             else new_type = GGML_TYPE_Q4_K;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
+            else new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
             else new_type = GGML_TYPE_Q4_K;
         }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) &&
                  (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
             new_type = GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
                 new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q6_K :
                            difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             }
@@ -17993,41 +18006,49 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
-            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S;
+            if (qs.model.hparams.n_gqa() >= 8 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
+            else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S;
             else new_type = GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             else new_type = difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_five_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             else new_type = GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             else new_type = difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             else new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             else new_type = difquant_six_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_six_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             else new_type = GGML_TYPE_IQ4_XS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
                 new_type = qs.i_attention_wk < qs.n_attention_wk/8 ? GGML_TYPE_Q6_K :
                            difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             }
@@ -18139,8 +18160,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ2_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
             if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ4_XS;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ2_S;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_IQ3_XXS;
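The recurring change across these hunks is the insertion of an n_gqa() >= 8 tier ahead of the existing n_gqa() >= 2 / n_expert >= 2 checks, so highly grouped models (Llama 70B class) get one quantization level more for attn_v.weight and attn_k.weight than moderately grouped ones. A minimal standalone sketch of that selection pattern, using a hypothetical helper that is not code from this patch:

// Hypothetical distillation of the tier pattern used throughout the patch;
// the actual per-ftype top/mid/base types vary from branch to branch.
static ggml_type bump_attn_type(int n_gqa, int n_expert, ggml_type top,
                                ggml_type mid, ggml_type base) {
    if (n_gqa >= 8)                  return top;  // e.g. GGML_TYPE_Q5_K or Q6_K
    if (n_gqa >= 2 || n_expert >= 2) return mid;  // moderate GQA or MoE models
    return base;                                  // plain MHA keeps the base type
}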