From 58333237546096f2ec6d0370e1301e1f82dccc25 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Sun, 26 May 2024 22:40:28 -0400
Subject: [PATCH 1/5] Ignore second mlp layer if weights are null

---
 examples/llava/clip.cpp | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 95fbe3d0216c4..58c5b86e6f271 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -705,10 +705,12 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
             embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
 
-            embeddings = ggml_gelu(ctx0, embeddings);
-            embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
-            embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
-
+            // paligemma missing second linear layer
+            if (model.mm_2_w) {
+                embeddings = ggml_gelu(ctx0, embeddings);
+                embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
+                embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
+            }
         } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
             embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
             embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
@@ -2067,6 +2069,10 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         return ctx->vision_model.mm_model_peg_0_b->ne[0];
     }
     if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
+        // paligemma missing second linear layer
+        if (ctx->vision_model.mm_2_b == nullptr) {
+            return ctx->vision_model.mm_0_b->ne[0];
+        }
         return ctx->vision_model.mm_2_b->ne[0];
     }
     if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {

From 9bed1aebbe14e0bd7b89baaa6fd796be23a26f1f Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Tue, 11 Jun 2024 21:12:43 -0400
Subject: [PATCH 2/5] Reserve logits when causal attention is disabled on context

---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 8b675ea993a38..d9f30a26da8b4 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -11957,7 +11957,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
     const auto n_embd  = hparams.n_embd;
 
     // TODO: use a per-batch flag for logits presence instead
-    const bool has_logits = cparams.causal_attn;
+    const bool has_logits = hparams.causal_attn;
     const bool has_embd   = cparams.embeddings && (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
 
     const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
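
Caller-side sketch (not part of the commits above): the point of PATCH 2 is that the logits buffer is now sized from the model's hparams.causal_attn rather than the context's cparams.causal_attn, so logits remain available even while causal attention is temporarily switched off on the context. The sketch only uses functions that already exist in llama.h (llama_set_causal_attn, llama_decode, llama_get_logits_ith); creating the context, model, and batch is assumed to have been done by the caller, with the batch requesting logits for its last token.

    // minimal sketch: ctx and batch are prepared by the caller
    #include "llama.h"

    static float first_logit_of_last_token(llama_context * ctx, llama_batch batch) {
        // e.g. a non-causal prefix such as projected image tokens; before
        // PATCH 2, llama_output_reserve() saw cparams.causal_attn == false
        // here and reserved no logits storage at all
        llama_set_causal_attn(ctx, false);
        if (llama_decode(ctx, batch) != 0) {
            return 0.0f;
        }
        llama_set_causal_attn(ctx, true);

        // valid only because the output buffer was sized from hparams.causal_attn
        const float * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
        return logits == nullptr ? 0.0f : logits[0];
    }
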
From dd34db26366d350e5cfd4a0abc3fc343e5769030 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Sun, 29 Sep 2024 16:15:30 -0400
Subject: [PATCH 3/5] Update GGML_ASSERT

---
 ggml/src/ggml.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 81b651c6a438d..3736076e57a44 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -13994,7 +13994,7 @@ static void ggml_compute_forward_get_rows_f32(
         const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
         const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
 
-        GGML_ASSERT(i01 >= 0 && i01 < ne01);
+        GGML_ASSERT(i01 >= 0 && i01 <= ne01);
 
         ggml_vec_cpy_f32(nc,
                 (float *) ((char *) dst->data + i10*nb1  + i11*nb2  + i12*nb3),

From 9aecd38a8d8c831f017cb81ba7bef7346246262a Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Tue, 1 Oct 2024 06:12:31 -0400
Subject: [PATCH 4/5] Add embeddings scale to clip_ctx to rescale final image embeddings

---
 examples/llava/clip.cpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index b3de2d73bcab4..9539b0c759961 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -104,6 +104,7 @@ static std::string format(const char * fmt, ...) {
 #define KEY_IMAGE_MEAN "clip.vision.image_mean"
 #define KEY_IMAGE_STD  "clip.vision.image_std"
 #define KEY_PROJ_TYPE  "clip.projector_type"
+#define KEY_EMBD_SCALE "clip.embeddings_scale"
 
 #define KEY_MM_PATCH_MERGE_TYPE   "clip.vision.mm_patch_merge_type"
 #define KEY_IMAGE_GRID_PINPOINTS  "clip.vision.image_grid_pinpoints"
@@ -548,6 +549,7 @@ struct clip_ctx {
 
     float image_mean[3];
     float image_std[3];
+    float embeddings_scale = 1.0f;
     bool use_gelu = false;
     int32_t ftype = 1;
 
@@ -1021,6 +1023,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         }
     }
 
+    if (ctx->embeddings_scale != 1.0f) {
+        embeddings = ggml_scale(ctx0, embeddings, ctx->embeddings_scale);
+    }
+
     // build the graph
     ggml_build_forward_expand(gf, embeddings);
 
@@ -1322,6 +1328,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         new_clip->image_std[i] = std_data[i];
     }
 
+    try {
+        new_clip->embeddings_scale = get_f32(ctx, KEY_EMBD_SCALE);
+    } catch (const std::exception& /*e*/) {
+        new_clip->embeddings_scale = 1.0f;
+    }
+
     if (verbosity >= 2) {
         LOG_INF("\n%s: vision model hparams\n", __func__);
         LOG_INF("image_size %d\n", hparams.image_size);
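
For reference, a rough caller-side equivalent of what the new clip.embeddings_scale key does (again, not part of the commit): without PATCH 4, a front end would have to rescale the projector output returned by clip_image_encode() itself. clip_n_patches() and clip_n_mmproj_embd() are existing clip.h helpers; the scale value shown is only a placeholder, since whatever number the conversion script writes under clip.embeddings_scale is what gets applied inside the graph.

    // minimal sketch: img_embd holds the output of clip_image_encode() for one image
    #include <cmath>
    #include "clip.h"

    static void scale_image_embd(const clip_ctx * ctx_clip, float * img_embd) {
        const int n_pos  = clip_n_patches(ctx_clip);      // image tokens produced
        const int n_embd = clip_n_mmproj_embd(ctx_clip);  // floats per image token
        const float scale = 1.0f / std::sqrt((float) n_embd);  // placeholder value only

        for (int i = 0; i < n_pos * n_embd; i++) {
            img_embd[i] *= scale;  // PATCH 4 moves this multiply into the graph via ggml_scale()
        }
    }
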
From c702e5593086c42b3fb52ad68e04e37ffe29f61f Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Tue, 1 Oct 2024 23:57:13 -0400
Subject: [PATCH 5/5] Add llama_token_inp_embd function to embed input tokens

---
 include/llama.h | 13 +++++++++++++
 src/llama.cpp   | 57 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+)

diff --git a/include/llama.h b/include/llama.h
index 7cae1bbe2e5b8..1a6f1d74d0827 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -960,6 +960,19 @@
                             bool   remove_special,
                             bool   unparse_special);
 
+
+    // @details Get the input embeddings for a sequence of tokens
+    // @param tokens The tokens to embed
+    // @param n_tokens The number of tokens
+    // @param embeddings The embeddings pointer must be large enough to hold the resulting embeddings.
+    // @param n_embd The number of embeddings per token
+    // @return Returns a negative number on failure
+    LLAMA_API int32_t llama_token_inp_embd(
+            struct llama_context * ctx,
+                     llama_token * tokens,
+                         int32_t   n_tokens,
+                           float * embeddings);
+
     //
     // Chat templates
     //

diff --git a/src/llama.cpp b/src/llama.cpp
index c466cd88b7c14..b9cde30b35633 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -21334,6 +21334,63 @@ int32_t llama_detokenize(
     return llama_detokenize_impl(model->vocab, tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
 }
 
+int32_t llama_token_inp_embd(struct llama_context * ctx, llama_token * tokens, int32_t n_tokens, float * embeddings) {
+    int32_t n_embd = llama_n_embd(&ctx->model);
+    const struct llama_hparams & hparams = ctx->model.hparams;
+    llama_ubatch batch = {};
+    batch.token = tokens;
+    batch.n_tokens = n_tokens;
+    llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
+    ggml_backend_cpu_set_n_threads(ctx->backend_cpu, ctx->cparams.n_threads);
+    if (ctx->threadpool) {
+        ggml_backend_cpu_set_threadpool(ctx->backend_cpu, ctx->threadpool);
+    }
+
+    ggml_init_params params = ggml_init_params{
+        GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead(),
+        nullptr,
+        true
+    };
+
+    ggml_context * ctx0 = ggml_init(params);
+    if (!ctx0) {
+        return -1;
+    }
+
+    ggml_tensor * output = llm_build_inp_embd(
+        ctx0,
+        *ctx,
+        hparams,
+        batch,
+        ctx->model.tok_embd,
+        cb
+    );
+
+    ggml_backend_buffer_type_t buffer_type = ggml_backend_get_default_buffer_type(ctx->backend_cpu);
+    ggml_gallocr_t graph_allocator = ggml_gallocr_new(buffer_type);
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    ggml_set_output(output);
+    ggml_build_forward_expand(gf, output);
+
+    if (!ggml_gallocr_reserve(graph_allocator, gf) || !ggml_gallocr_alloc_graph(graph_allocator, gf)) {
+        ggml_gallocr_free(graph_allocator);
+        ggml_free(ctx0);
+        return -1;
+    }
+
+    ggml_backend_tensor_set(ctx->inp_tokens, tokens, 0, n_tokens * sizeof(int32_t));
+
+    ggml_backend_graph_compute(ctx->backend_cpu, gf);
+
+    ggml_backend_tensor_get(output, embeddings, 0, n_tokens * n_embd * sizeof(float));
+
+    ggml_gallocr_free(graph_allocator);
+    ggml_free(ctx0);
+
+    return 0;
+}
+
 //
 // chat templates
 //
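
Usage sketch for the new llama_token_inp_embd() (not part of the patch): the caller sizes the output buffer at n_tokens * n_embd floats, as the llama.h comment above describes. llama_n_embd() is existing API; producing the tokens with llama_tokenize() and creating the model and context are assumed to have happened beforehand.

    // minimal sketch: ctx/model come from the usual llama.h setup calls,
    // tokens was filled by llama_tokenize() beforehand
    #include <vector>
    #include "llama.h"

    static std::vector<float> embed_tokens(llama_context * ctx, const llama_model * model,
                                           std::vector<llama_token> & tokens) {
        const int32_t n_embd = llama_n_embd(model);

        // one n_embd-wide row per token, as required by the header comment
        std::vector<float> embd(tokens.size() * (size_t) n_embd);

        // the new call returns a negative number on failure
        if (llama_token_inp_embd(ctx, tokens.data(), (int32_t) tokens.size(), embd.data()) < 0) {
            embd.clear();
        }
        return embd;  // empty on failure, n_tokens * n_embd floats otherwise
    }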