
Commit 252164c

Merge pull request #77 from l3utterfly/master: merge from upstream
2 parents: 2a9564d + 0d92267

39 files changed (+1288 / -172 lines)

CMakePresets.json
Lines changed: 11 additions & 0 deletions

@@ -55,6 +55,17 @@
         "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-apple-clang.cmake"
       }
     },
+    {
+      "name": "x64-linux-gcc", "hidden": true,
+      "cacheVariables": {
+        "CMAKE_C_COMPILER": "gcc",
+        "CMAKE_CXX_COMPILER": "g++"
+      }
+    },
+    { "name": "x64-linux-gcc-debug",          "inherits": [ "base", "x64-linux-gcc", "debug" ] },
+    { "name": "x64-linux-gcc-release",        "inherits": [ "base", "x64-linux-gcc", "release" ] },
+    { "name": "x64-linux-gcc-reldbg",         "inherits": [ "base", "x64-linux-gcc", "reldbg" ] },
+    { "name": "x64-linux-gcc+static-release", "inherits": [ "base", "x64-linux-gcc", "release", "static" ] },

     { "name": "arm64-windows-llvm-debug",   "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
     { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },

README.md
Lines changed: 1 addition & 0 deletions

@@ -133,6 +133,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
 - [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
 - [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
+- [x] [LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38)

 #### Multimodal

convert_hf_to_gguf.py
Lines changed: 56 additions & 1 deletion

@@ -300,6 +300,7 @@ def prepare_tensors(self):
                 gguf.MODEL_TENSOR.POS_EMBD,
                 gguf.MODEL_TENSOR.TOKEN_TYPES,
                 gguf.MODEL_TENSOR.SSM_CONV1D,
+                gguf.MODEL_TENSOR.SHORTCONV_CONV,
                 gguf.MODEL_TENSOR.TIME_MIX_FIRST,
                 gguf.MODEL_TENSOR.TIME_MIX_W1,
                 gguf.MODEL_TENSOR.TIME_MIX_W2,
@@ -836,6 +837,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "f6791d196f87ce6b56a7d234be618e0d58f8cda3549416635b2bebcd22cd95c4":
             # ref: https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct
             res = "midm-2.0"
+        if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
+            # ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
+            res = "lfm2"

         if res is None:
             logger.warning("\n")
@@ -1078,7 +1082,14 @@ def _set_vocab_rwkv_world(self):
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
         special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
-        special_vocab.chat_template = "rwkv-world"
+        if special_vocab.chat_template is None:
+            template_path = Path(__file__).parent / "models" / "templates" / "llama-cpp-rwkv-world.jinja"
+            if template_path.is_file():
+                with open(template_path, "r", encoding="utf-8") as f:
+                    template = f.read()
+            else:
+                template = "rwkv-world"
+            special_vocab.chat_template = template
         # hack: Add '\n\n' as the EOT token to make it chat normally
         special_vocab._set_special_token("eot", 261)
         # hack: Override these as they have already been set (incorrectly)
@@ -7073,6 +7084,50 @@ def set_vocab(self):
             chat_template = tokenizer.chat_template.replace("[:]", "")
         self.gguf_writer.add_chat_template(chat_template)

+
+@ModelBase.register("Lfm2ForCausalLM")
+@ModelBase.register("LFM2ForCausalLM")
+class LFM2Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.LFM2
+
+    def _add_feed_forward_length(self):
+        ff_dim = self.hparams["block_ff_dim"]
+
+        auto_adjust_ff_dim = self.hparams["block_auto_adjust_ff_dim"]
+        ff_dim = self.hparams["block_ff_dim"]
+        ffn_dim_multiplier = self.hparams["block_ffn_dim_multiplier"]
+        multiple_of = self.hparams["block_multiple_of"]
+
+        if auto_adjust_ff_dim:
+            ff_dim = int(2 * ff_dim / 3)
+            # custom dim factor multiplier
+            if ffn_dim_multiplier is not None:
+                ff_dim = int(ffn_dim_multiplier * ff_dim)
+            ff_dim = multiple_of * ((ff_dim + multiple_of - 1) // multiple_of)
+
+        self.gguf_writer.add_feed_forward_length(ff_dim)
+
+    def set_gguf_parameters(self):
+        # set num_key_value_heads only for attention layers
+        self.hparams["num_key_value_heads"] = [
+            self.hparams["num_key_value_heads"] if layer_type == "full_attention" else 0
+            for layer_type in self.hparams["layer_types"]
+        ]
+
+        super().set_gguf_parameters()
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+        self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["norm_eps"])
+        self._add_feed_forward_length()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # conv op requires 2d tensor
+        if 'conv.conv' in name:
+            data_torch = data_torch.squeeze(1)
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 ###### CONVERSION LOGIC ######

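The new LFM2Model class registers both spellings of the HF architecture name, zeroes out num_key_value_heads for the non-attention (shortconv) layers, and derives the FFN width with the familiar LLaMA-style rule: take 2/3 of block_ff_dim, apply the optional multiplier, then round up to a multiple of block_multiple_of. A small standalone sketch of that arithmetic follows; the example hparam values are made up for illustration, not taken from a real LFM2 config.

def lfm2_ff_dim(block_ff_dim: int, auto_adjust: bool,
                ffn_dim_multiplier: float | None, multiple_of: int) -> int:
    # same arithmetic as LFM2Model._add_feed_forward_length above
    ff_dim = block_ff_dim
    if auto_adjust:
        ff_dim = int(2 * ff_dim / 3)                   # shrink to 2/3 of the configured width
        if ffn_dim_multiplier is not None:
            ff_dim = int(ffn_dim_multiplier * ff_dim)  # optional custom multiplier
        ff_dim = multiple_of * ((ff_dim + multiple_of - 1) // multiple_of)  # round up
    return ff_dim

# hypothetical values, for illustration only
print(lfm2_ff_dim(12288, True, None, 256))  # 2/3 of 12288 is 8192, already a multiple of 256
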
convert_hf_to_gguf_update.py
Lines changed: 1 addition & 0 deletions

@@ -130,6 +130,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
     {"name": "a.x-4.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", },
     {"name": "midm-2.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
+    {"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions

ggml/src/ggml-cann/ggml-cann.cpp
Lines changed: 1 addition & 0 deletions

@@ -2090,6 +2090,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
             {
                 // TODO: add support
                 // ref: https://github.com/ggml-org/llama.cpp/pull/14274
+#pragma message("TODO: implement F32, F16, BF16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
                 return false;
             } break;
         case GGML_OP_CPY: {

ggml/src/ggml-cuda/ggml-cuda.cu
Lines changed: 15 additions & 0 deletions

@@ -43,6 +43,7 @@
 #include "ggml-cuda/upscale.cuh"
 #include "ggml-cuda/wkv.cuh"
 #include "ggml-cuda/gla.cuh"
+#include "ggml-cuda/set-rows.cuh"
 #include "ggml.h"

 #include <algorithm>
@@ -2230,6 +2231,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_GET_ROWS_BACK:
             ggml_cuda_op_get_rows_back(ctx, dst);
             break;
+        case GGML_OP_SET_ROWS:
+            ggml_cuda_op_set_rows(ctx, dst);
+            break;
         case GGML_OP_DUP:
             ggml_cuda_dup(ctx, dst);
             break;
@@ -2299,6 +2303,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
             case GGML_UNARY_OP_EXP:
                 ggml_cuda_op_exp(ctx, dst);
                 break;
+            case GGML_UNARY_OP_ELU:
+                ggml_cuda_op_elu(ctx, dst);
+                break;
             default:
                 return false;
         }
@@ -3112,6 +3119,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
             case GGML_UNARY_OP_GELU_QUICK:
             case GGML_UNARY_OP_TANH:
             case GGML_UNARY_OP_EXP:
+            case GGML_UNARY_OP_ELU:
                 return ggml_is_contiguous(op->src[0]);
             default:
                 return false;
@@ -3216,6 +3224,13 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
            {
                return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
            } break;
+        case GGML_OP_SET_ROWS:
+            {
+#pragma message("TODO: implement Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
+                return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16) &&
+                       op->src[0]->type == GGML_TYPE_F32 &&
+                       op->src[1]->type == GGML_TYPE_I64;
+            } break;
         case GGML_OP_CPY:
             {
                 ggml_type src0_type = op->src[0]->type;

ggml/src/ggml-cuda/set-rows.cu
Lines changed: 145 additions & 0 deletions

@@ -0,0 +1,145 @@
+#include "set-rows.cuh"
+
+typedef void (*set_rows_kernel_t)(const char * src, char * dst);
+
+template<typename src_t, typename dst_t>
+__device__ void set_rows_1(const src_t * src_f, dst_t * dst_f) {}
+
+template<>
+__device__ __forceinline__ void set_rows_1<float, half>(const float * src_f, half * dst_h) {
+    *dst_h = __float2half(*src_f);
+}
+
+template<>
+__device__ __forceinline__ void set_rows_1<float, nv_bfloat16>(const float * src_f, nv_bfloat16 * dst_b) {
+    *dst_b = *src_f;
+}
+
+template<>
+__device__ __forceinline__ void set_rows_1<float, float>(const float * src_f, float * dst_f) {
+    *dst_f = *src_f;
+}
+
+template<typename src_t, typename dst_t>
+static __global__ void k_set_rows(
+        const src_t * __restrict__ src0, const int64_t * __restrict__ src1, dst_t * __restrict__ dst,
+        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
+        const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
+        const int64_t s01, const int64_t s02, const int64_t s03,
+        const int64_t s10, const int64_t s11, const int64_t s12,
+        const int64_t s1, const int64_t s2, const int64_t s3) {
+
+    const int64_t i = int64_t(blockDim.x) * blockIdx.x + threadIdx.x;
+    const int64_t ne_total = ne00 * ne01 * ne02 * ne03;
+
+    if (i >= ne_total) {
+        return;
+    }
+
+    const int64_t i03 = i / (ne00 * ne01 * ne02);
+    const int64_t i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
+    const int64_t i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01) / ne00;
+    const int64_t i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01 - i01 * ne00;
+
+    const int64_t i12 = i03 % ne12;
+    const int64_t i11 = i02 % ne11;
+    const int64_t i10 = i01;
+
+    const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12);
+
+    const src_t * src0_row = src0 + i01*s01 + i02*s02 + i03*s03;
+    dst_t * dst_row_ptr = dst + dst_row*s1 + i02*s2 + i03*s3;
+
+    const src_t * src_elem = src0_row + i00;
+    dst_t * dst_elem = dst_row_ptr + i00;
+    set_rows_1(src_elem, dst_elem);
+}
+
+template<typename src_t, typename dst_t>
+static void set_rows_cuda(
+        const src_t * src0_d, const int64_t * src1_d, dst_t * dst_d,
+        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
+        const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
+        const size_t nb01, const size_t nb02, const size_t nb03,
+        const size_t nb10, const size_t nb11, const size_t nb12,
+        const size_t nb1, const size_t nb2, const size_t nb3,
+        cudaStream_t stream) {

+    const int64_t ne_total = ne00 * ne01 * ne02 * ne03;
+    const int num_blocks = (ne_total + CUDA_SET_ROWS_BLOCK_SIZE - 1) / CUDA_SET_ROWS_BLOCK_SIZE;
+    const dim3 block_size(CUDA_SET_ROWS_BLOCK_SIZE);
+    const dim3 grid_size(num_blocks);
+
+    const int64_t s01 = nb01/sizeof(src_t);
+    const int64_t s02 = nb02/sizeof(src_t);
+    const int64_t s03 = nb03/sizeof(src_t);
+    const int64_t s10 = nb10/sizeof(int64_t);
+    const int64_t s11 = nb11/sizeof(int64_t);
+    const int64_t s12 = nb12/sizeof(int64_t);
+    const int64_t s1 = nb1/sizeof(dst_t);
+    const int64_t s2 = nb2/sizeof(dst_t);
+    const int64_t s3 = nb3/sizeof(dst_t);
+
+    if (ne_total > 0) {
+        k_set_rows<<<grid_size, block_size, 0, stream>>>(
+            src0_d, src1_d, dst_d,
+            ne00, ne01, ne02, ne03,
+            ne10, ne11, ne12, ne13,
+            s01, s02, s03,
+            s10, s11, s12,
+            s1, s2, s3);
+    }
+}
+
+void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_I64);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const float * src0_d = (const float *)src0->data;
+    const int64_t * src1_d = (const int64_t *)src1->data;
+
+    cudaStream_t stream = ctx.stream();
+
+    if (dst->type == GGML_TYPE_F32) {
+        set_rows_cuda(
+            src0_d, src1_d, (float*)dst->data,
+            ne00, ne01, ne02, ne03,
+            ne10, ne11, ne12, ne13,
+            nb01, nb02, nb03,
+            nb10, nb11, nb12,
+            nb1, nb2, nb3,
+            stream
+        );
+    } else if (dst->type == GGML_TYPE_F16) {
+        set_rows_cuda(
+            src0_d, src1_d, (half*)dst->data,
+            ne00, ne01, ne02, ne03,
+            ne10, ne11, ne12, ne13,
+            nb01, nb02, nb03,
+            nb10, nb11, nb12,
+            nb1, nb2, nb3,
+            stream
+        );
+    } else if (dst->type == GGML_TYPE_BF16) {
+        set_rows_cuda(
+            src0_d, src1_d, (nv_bfloat16*)dst->data,
+            ne00, ne01, ne02, ne03,
+            ne10, ne11, ne12, ne13,
+            nb01, nb02, nb03,
+            nb10, nb11, nb12,
+            nb1, nb2, nb3,
+            stream
+        );
+    } else {
+        GGML_ABORT("unsupported type");
+    }
+}
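The new kernel launches one thread per element of src0: each thread unravels its flat index into (i03, i02, i01, i00), reads the destination row id from the I64 tensor src1 (broadcast over the two outer dimensions via the modulo), and writes a single converted element. A rough Python reference of the same indexing, assuming contiguous nested-list tensors and ignoring the byte strides and F16/BF16 conversion that the real kernel handles:

def set_rows_ref(src0, src1, dst):
    # src0: shape [ne03][ne02][ne01][ne00], rows to scatter
    # src1: shape [ne12][ne11][ne10], destination row ids (int)
    # dst : shape [ne3][ne2][ne1][ne0], modified in place
    ne03, ne02, ne01, ne00 = len(src0), len(src0[0]), len(src0[0][0]), len(src0[0][0][0])
    ne12, ne11 = len(src1), len(src1[0])
    for i03 in range(ne03):
        for i02 in range(ne02):
            for i01 in range(ne01):
                # broadcast the row-index tensor over the outer dims, as in k_set_rows
                dst_row = src1[i03 % ne12][i02 % ne11][i01]
                for i00 in range(ne00):
                    dst[i03][i02][dst_row][i00] = src0[i03][i02][i01][i00]
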

ggml/src/ggml-cuda/set-rows.cuh
Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+#pragma once
+
+#include "common.cuh"
+
+#define CUDA_SET_ROWS_BLOCK_SIZE 256
+
+void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

ggml/src/ggml-cuda/ssm-conv.cu
Lines changed: 10 additions & 2 deletions

@@ -107,17 +107,25 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int
         if (nc == 4) {
             ssm_conv_f32<threads, 4><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
                                                                      dst, dst_nb0, dst_nb1, dst_nb2, n_t);
+        } else if (nc == 3) {
+            ssm_conv_f32<threads, 3><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
+                                                                     dst, dst_nb0, dst_nb1, dst_nb2, n_t);
         } else {
-            GGML_ABORT("Only support kernel size = 4 now.");
+            GGML_ABORT("Only support kernel size = 3 or size = 4 right now.");
         }
     } else {
         if (nc == 4) {
            const int64_t split_n_t = 32;
            dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
            ssm_conv_long_token_f32<threads, 4, split_n_t><<<blocks, threads, 0, stream>>>(
                src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t);
+        } else if (nc == 3) {
+            const int64_t split_n_t = 32;
+            dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
+            ssm_conv_long_token_f32<threads, 3, split_n_t><<<blocks, threads, 0, stream>>>(
+                src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t);
         } else {
-            GGML_ABORT("Only support kernel size = 4 right now.");
+            GGML_ABORT("Only support kernel size = 3 or size = 4 right now.");
         }
     }
 }
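The nc == 3 branches simply instantiate the existing kernel templates with a width-3 filter; presumably this is what LFM2's short convolution (conv_L_cache = 3 in the converter above) requires. Conceptually, each channel is convolved independently over time with its own small filter, with the input already containing the previous nc - 1 samples of state. A rough Python sketch of that per-channel sliding window, illustrative only and not the actual memory layout used by the kernels:

def depthwise_conv1d(x, w):
    # x: one channel's input including (nc - 1) past samples, length n_t + nc - 1
    # w: that channel's nc filter taps
    nc = len(w)
    return [sum(x[t + k] * w[k] for k in range(nc)) for t in range(len(x) - nc + 1)]

# width-3 example: 4 output timesteps from 6 input samples
print(depthwise_conv1d([0.0, 0.0, 1.0, 2.0, 3.0, 4.0], [0.25, 0.5, 1.0]))  # [1.0, 2.5, 4.25, 6.0]
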

ggml/src/ggml-cuda/unary.cu
Lines changed: 7 additions & 0 deletions

@@ -83,6 +83,10 @@ static __device__ __forceinline__ float op_log(float x) {
     return logf(x);
 }

+static __device__ __forceinline__ float op_elu(float x) {
+    return (x > 0.f) ? x : expm1f(x);
+}
+
 template <float (*op)(float), typename T>
 static __global__ void unary_op_kernel(const T * x, T * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
@@ -196,6 +200,9 @@ void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     ggml_cuda_op_unary<op_log>(ctx, dst);
 }

+void ggml_cuda_op_elu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_elu>(ctx, dst);
+}
 /* gated ops */

 template <float (*op)(float), typename T>
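op_elu is the standard ELU with alpha = 1: identity for positive inputs, exp(x) - 1 otherwise, with expm1f used for better accuracy near zero. A quick reference check in Python of the same formula:

import math

def elu(x: float) -> float:
    # same formula as op_elu above, with alpha = 1
    return x if x > 0.0 else math.expm1(x)

print(elu(2.0))   # 2.0
print(elu(0.0))   # 0.0
print(elu(-1.0))  # exp(-1) - 1, about -0.632
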
