Skip to content

Commit 367be2f

Browse files
authored
v2.3.2 (#179)
1 parent e4bd5e7 commit 367be2f

File tree

9 files changed

+17
-15
lines changed

9 files changed

+17
-15
lines changed

cpp/actions.hpp

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -649,13 +649,14 @@ glue_msg_get_kv_remove_res action_kv_remove(app_t &app, const char *req_raw)
649649
PARSE_REQ(glue_msg_get_kv_remove_req);
650650
const int n_keep = req.n_keep.value;
651651
const int n_discard = req.n_discard.value;
652+
auto * mem = llama_get_memory(app.ctx);
652653

653654
if (n_discard > 0)
654655
{
655656
// TODO: this code branch is kinda broken, to be fixed later
656657
const int n_past = app.tokens.size();
657-
llama_kv_self_seq_rm(app.ctx, 0, n_keep, n_keep + n_discard);
658-
llama_kv_self_seq_add(app.ctx, 0, n_keep + n_discard, n_past, -n_discard);
658+
llama_memory_seq_rm(mem, 0, n_keep, n_keep + n_discard);
659+
llama_memory_seq_add(mem, 0, n_keep + n_discard, n_past, -n_discard);
659660
app.tokens.erase(
660661
app.tokens.begin() + n_keep,
661662
app.tokens.begin() + n_keep + n_discard);
@@ -664,11 +665,11 @@ glue_msg_get_kv_remove_res action_kv_remove(app_t &app, const char *req_raw)
664665
{
665666
if (n_keep == 0)
666667
{
667-
llama_kv_self_clear(app.ctx);
668+
llama_memory_clear(mem, true);
668669
}
669670
else
670671
{
671-
llama_kv_self_seq_rm(app.ctx, 0, n_keep, -1);
672+
llama_memory_seq_rm(mem, 0, n_keep, -1);
672673
app.tokens.erase(
673674
app.tokens.begin() + n_keep,
674675
app.tokens.end());
@@ -685,7 +686,8 @@ glue_msg_get_kv_remove_res action_kv_remove(app_t &app, const char *req_raw)
685686
glue_msg_get_kv_clear_res action_kv_clear(app_t &app, const char *req_raw)
686687
{
687688
PARSE_REQ(glue_msg_get_kv_clear_req);
688-
llama_kv_self_clear(app.ctx);
689+
auto * mem = llama_get_memory(app.ctx);
690+
llama_memory_clear(mem, true);
689691
app.tokens.clear();
690692

691693
glue_msg_get_kv_clear_res res;
@@ -766,7 +768,7 @@ glue_msg_test_benchmark_res action_test_benchmark(app_t &app, const char *req_ra
766768
std::string type = req.type.value; // "pp" (prompt proc) or "tg" (tok gen)
767769
int n_samples = req.n_samples.value; // n_batch in pp and n_predict in tg
768770

769-
llama_kv_self_clear(app.ctx);
771+
llama_memory_clear(llama_get_memory(app.ctx), true);
770772
int n_vocab = llama_vocab_n_tokens(app.vocab);
771773
int64_t t_start = ggml_time_ms();
772774

@@ -837,7 +839,7 @@ glue_msg_test_perplexity_res action_test_perplexity(app_t &app, const char *req_
837839
}
838840

839841
// Clear existing context to start fresh
840-
llama_kv_self_clear(app.ctx);
842+
llama_memory_clear(llama_get_memory(app.ctx), true);
841843
app.tokens.clear();
842844

843845
const int32_t n_vocab = llama_vocab_n_tokens(app.vocab);

llama.cpp

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@wllama/wllama",
3-
"version": "2.3.1",
3+
"version": "2.3.2",
44
"description": "WebAssembly binding for llama.cpp - Enabling on-browser LLM inference",
55
"main": "index.js",
66
"type": "module",

src/multi-thread/wllama.js

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/multi-thread/wllama.wasm

60.5 KB
Binary file not shown.

src/single-thread/wllama.js

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/single-thread/wllama.wasm

60.3 KB
Binary file not shown.

src/wasm-from-cdn.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
// Do not edit this file directly
33

44
const WasmFromCDN = {
5-
'single-thread/wllama.wasm': 'https://cdn.jsdelivr.net/npm/@wllama/[email protected].1/src/single-thread/wllama.wasm',
6-
'multi-thread/wllama.wasm': 'https://cdn.jsdelivr.net/npm/@wllama/[email protected].1/src/multi-thread/wllama.wasm',
5+
'single-thread/wllama.wasm': 'https://cdn.jsdelivr.net/npm/@wllama/[email protected].2/src/single-thread/wllama.wasm',
6+
'multi-thread/wllama.wasm': 'https://cdn.jsdelivr.net/npm/@wllama/[email protected].2/src/multi-thread/wllama.wasm',
77
};
88

99
export default WasmFromCDN;

src/workers-code/generated.ts

Lines changed: 2 additions & 2 deletions
Large diffs are not rendered by default.

0 commit comments

Comments (0)