Commit d18a79e

llama_batch_ext_init with ctx

1 parent 1434c2c

File tree: 41 files changed, +124 -113 lines. (Large commits have some content hidden by default; only a subset of the changed files appears below.)

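The pattern repeated across every file below: llama_batch_ext_init drops its explicit capacity arguments (max token count and max sequence count) and instead takes the llama_context, presumably deriving both limits from the context's own parameters. A minimal before/after sketch; that the sizing comes from llama_n_batch(ctx) and llama_n_seq_max(ctx) internally is an assumption, not something this diff shows:

    // before: the caller picked the capacity explicitly
    //     llama_batch_ext * batch = llama_batch_ext_init(n_batch, 1);
    // after: the capacity is derived from the context (assumed to use
    // llama_n_batch(ctx) / llama_n_seq_max(ctx) internally)
    llama_batch_ext * batch = llama_batch_ext_init(ctx);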

common/common.cpp (2 additions, 2 deletions)

@@ -1016,7 +1016,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         }
 
         if (llama_model_has_encoder(model)) {
-            auto batch = llama_batch_ext_ptr::init_from_text(tmp.data(), tmp.size(), 0, 0, true);
+            auto batch = llama_batch_ext_ptr::init_from_text(lctx, tmp.data(), tmp.size(), 0, 0, true);
             llama_encode_ext(lctx, batch.get());
             llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
             if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
@@ -1026,7 +1026,7 @@ struct common_init_result common_init_from_params(common_params & params) {
             tmp.push_back(decoder_start_token_id);
         }
         if (llama_model_has_decoder(model)) {
-            auto batch = llama_batch_ext_ptr::init_from_text(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0, true);
+            auto batch = llama_batch_ext_ptr::init_from_text(lctx, tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0, true);
             llama_decode_ext(lctx, batch.get());
         }
         llama_kv_self_clear(lctx);
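In both hunks, llama_batch_ext_ptr::init_from_text gains the context as a new leading argument while the rest of the parameter list (token pointer, token count, starting position, sequence id, output-last flag) is unchanged. A hedged sketch of the updated call shape; the names of the trailing parameters are guesses inferred from these call sites:

    // sketch: same call shape as the encoder hunk above (parameter names
    // pos_0, seq_id, output_last are assumptions, not confirmed by the diff)
    auto batch = llama_batch_ext_ptr::init_from_text(
            lctx, tmp.data(), tmp.size(),
            /*pos_0=*/0, /*seq_id=*/0, /*output_last=*/true);
    llama_encode_ext(lctx, batch.get());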

common/speculative.cpp (1 addition, 1 deletion)

@@ -23,7 +23,7 @@ struct common_speculative * common_speculative_init(
     auto * result = new common_speculative {
         /* .ctx    = */ ctx_dft,
         /* .smpl   = */ nullptr,
-        /* .batch  = */ llama_batch_ext_ptr(llama_batch_ext_init(llama_n_batch(ctx_dft), 1)),
+        /* .batch  = */ llama_batch_ext_ptr(ctx_dft),
         /* .prompt = */ {},
     };
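The single-argument llama_batch_ext_ptr(ctx_dft) form implies the RAII wrapper gained a constructor that performs the llama_batch_ext_init call itself. A plausible shape for that constructor, purely an assumption since the wrapper's definition (presumably in llama-cpp.h) is not part of this diff:

    #include <llama.h>
    #include <memory>

    // hypothetical sketch of the wrapper the new call site implies
    struct llama_batch_ext_deleter {
        // assuming a matching llama_batch_ext_free exists in the C API
        void operator()(llama_batch_ext * b) const { llama_batch_ext_free(b); }
    };

    struct llama_batch_ext_ptr : std::unique_ptr<llama_batch_ext, llama_batch_ext_deleter> {
        llama_batch_ext_ptr() = default;
        explicit llama_batch_ext_ptr(llama_context * ctx)
            : std::unique_ptr<llama_batch_ext, llama_batch_ext_deleter>(llama_batch_ext_init(ctx)) {}
    };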

examples/batched-bench/batched-bench.cpp (1 addition, 1 deletion)

@@ -59,7 +59,7 @@ int main(int argc, char ** argv) {
 
     const int32_t n_kv_max = llama_n_ctx(ctx);
 
-    llama_batch_ext * batch = llama_batch_ext_init(n_kv_max, 1);
+    llama_batch_ext * batch = llama_batch_ext_init(ctx);
 
     // decode in batches of ctx_params.n_batch tokens
     auto decode_helper = [](llama_context * ctx, llama_batch_ext * batch, int32_t n_batch) {
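A subtle consequence here: the old code sized the batch to n_kv_max (the full KV cache), whereas the context-based init presumably caps capacity at the context's batch size. Since the decode_helper below already splits work into ctx_params.n_batch-sized chunks, the visible behavior should be unchanged and only the peak batch allocation differs; that reading of the new sizing is an assumption, not something the diff states.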

examples/batched/batched.cpp (1 addition, 1 deletion)

@@ -102,7 +102,7 @@ int main(int argc, char ** argv) {
 
     // create a llama_batch
     // we use this object to submit token data for decoding
-    llama_batch_ext * batch = llama_batch_ext_init(std::max(tokens_list.size(), (size_t) n_parallel), n_parallel);
+    llama_batch_ext * batch = llama_batch_ext_init(ctx);
 
     std::vector<llama_seq_id> seq_ids(n_parallel, 0);
     for (int32_t i = 0; i < n_parallel; ++i) {
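The old call sized the batch as std::max(tokens_list.size(), (size_t) n_parallel); with capacity now decided by the context, the burden moves to context creation. A sketch of what the setup earlier in this example would need to guarantee (llama_context_params and its n_batch/n_seq_max fields exist in llama.h; wiring them up this way here is an assumption):

    // the context must be created large enough for the prompt plus the
    // parallel sequences, since the batch capacity now comes from it (sketch)
    llama_context_params ctx_params = llama_context_default_params();
    ctx_params.n_batch   = std::max(tokens_list.size(), (size_t) n_parallel);
    ctx_params.n_seq_max = n_parallel;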

examples/cvector-generator/cvector-generator.cpp (1 addition, 1 deletion)

@@ -343,7 +343,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
 
 static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
     llama_kv_self_clear(ctx);
-    auto batch = llama_batch_ext_ptr::init_from_text(tokens.data(), tokens.size(), 0, 0, true);
+    auto batch = llama_batch_ext_ptr::init_from_text(ctx, tokens.data(), tokens.size(), 0, 0, true);
     if (llama_decode_ext(ctx, batch.get())) {
         fprintf(stderr, "%s : failed to eval\n", __func__);
         return false;

examples/embedding/embedding.cpp (1 addition, 1 deletion)

@@ -167,7 +167,7 @@ int main(int argc, char ** argv) {
 
     // initialize batch
     const int n_prompts = prompts.size();
-    llama_batch_ext * batch = llama_batch_ext_init(n_batch, 1);
+    llama_batch_ext * batch = llama_batch_ext_init(ctx);
 
     // count number of embeddings
     int n_embd_count = 0;

examples/eval-callback/eval-callback.cpp (1 addition, 1 deletion)

@@ -134,7 +134,7 @@ static bool run(llama_context * ctx, const common_params & params) {
 
     std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
 
-    auto batch = llama_batch_ext_ptr::init_from_text(tokens.data(), tokens.size(), 0, 0, true);
+    auto batch = llama_batch_ext_ptr::init_from_text(ctx, tokens.data(), tokens.size(), 0, 0, true);
     if (llama_decode_ext(ctx, batch.get())) {
         LOG_ERR("%s : failed to eval\n", __func__);
         return false;

examples/gritlm/gritlm.cpp (2 additions, 2 deletions)

@@ -14,7 +14,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
     const llama_model * model = llama_get_model(ctx);
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
-    llama_batch_ext_ptr batch(llama_batch_ext_init(llama_n_batch(ctx), 1));
+    llama_batch_ext_ptr batch(ctx);
 
     for (uint64_t i = 0; i < sentences.size(); i++) {
         batch.clear();
@@ -105,7 +105,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
     llama_set_embeddings(ctx, false);
     llama_set_causal_attn(ctx, true);
 
-    llama_batch_ext_ptr batch(llama_batch_ext_init(llama_n_batch(ctx), 1));
+    llama_batch_ext_ptr batch(ctx);
 
     std::vector<llama_token> inputs = common_tokenize(vocab, prompt, false, true);
     int32_t i_current_token = 0;
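Both hunks swap the wrapped raw init for the context-based constructor; the surrounding context lines show the idiom the wrapper supports, one batch reused across all sentences via clear(). A condensed sketch of that loop:

    // one wrapper-owned batch, reused per sentence (fill/decode elided)
    llama_batch_ext_ptr batch(ctx);
    for (uint64_t i = 0; i < sentences.size(); i++) {
        batch.clear();  // reset before refilling with the next sentence
        // ... add the sentence's tokens, then decode and read embeddings ...
    }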

examples/imatrix/imatrix.cpp (1 addition, 1 deletion)

@@ -497,7 +497,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
         // clear the KV cache
         llama_kv_self_clear(ctx);
 
-        llama_batch_ext * batch = llama_batch_ext_init(n_batch, 1);
+        llama_batch_ext * batch = llama_batch_ext_init(ctx);
 
         for (int j = 0; j < num_batches; ++j) {
             const int batch_start = start + j * n_batch;

examples/infill/infill.cpp (1 addition, 1 deletion)

@@ -353,7 +353,7 @@ int main(int argc, char ** argv) {
 
                 LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
 
-                auto batch = llama_batch_ext_ptr::init_from_text(&embd[i], n_eval, n_past, 0, true);
+                auto batch = llama_batch_ext_ptr::init_from_text(ctx, &embd[i], n_eval, n_past, 0, true);
                 if (llama_decode_ext(ctx, batch.get())) {
                     LOG_ERR("%s : failed to eval\n", __func__);
                     return 1;
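Unlike most call sites in this commit, infill passes n_past rather than 0 as the position argument, which is what keeps successive chunks contiguous in the KV cache during chunked evaluation. A hedged sketch of that loop shape, with the roles of n_eval and n_past inferred from the hunk:

    // chunked evaluation: each batch starts at position n_past, so successive
    // chunks line up in the KV cache (sketch; loop bounds inferred from the hunk)
    for (int i = 0; i < (int) embd.size(); i += n_eval) {
        n_eval = (int) embd.size() - i;
        if (n_eval > params.n_batch) {
            n_eval = params.n_batch;  // clamp each chunk to the batch size
        }
        auto batch = llama_batch_ext_ptr::init_from_text(ctx, &embd[i], n_eval, n_past, 0, true);
        if (llama_decode_ext(ctx, batch.get())) {
            LOG_ERR("%s : failed to eval\n", __func__);
            return 1;
        }
        n_past += n_eval;
    }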
