From 0a46d73056a911853764abb6a8ee6d6afa5be121 Mon Sep 17 00:00:00 2001 From: ngxson Date: Fri, 24 May 2024 11:11:55 +0200 Subject: [PATCH 01/56] add control-vector-generator --- .gitignore | 1 + Makefile | 4 + .../control-vector-generator/CMakeLists.txt | 5 + .../control-vector-generator.cpp | 160 ++++++++++++++++++ 4 files changed, 170 insertions(+) create mode 100644 examples/control-vector-generator/CMakeLists.txt create mode 100644 examples/control-vector-generator/control-vector-generator.cpp diff --git a/.gitignore b/.gitignore index 50ae0973ae3b3..79a160bb5e22c 100644 --- a/.gitignore +++ b/.gitignore @@ -86,6 +86,7 @@ models-mnt /train-text-from-scratch /tokenize /vdot +/control-vector-generator /common/build-info.cpp arm_neon.h compile_commands.json diff --git a/Makefile b/Makefile index fe63cbd6063aa..c12a3e382435f 100644 --- a/Makefile +++ b/Makefile @@ -838,6 +838,10 @@ eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +control-vector-generator: examples/control-vector-generator/control-vector-generator.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/examples/control-vector-generator/CMakeLists.txt b/examples/control-vector-generator/CMakeLists.txt new file mode 100644 index 0000000000000..2515d20116749 --- /dev/null +++ b/examples/control-vector-generator/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET control-vector-generator) +add_executable(${TARGET} control-vector-generator.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp new file mode 100644 index 0000000000000..5c64c3b747973 --- /dev/null +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -0,0 +1,160 @@ +#include "common.h" +#include "llama.h" +#include "ggml.h" + +#include +#include +#include +#include +#include + +struct callback_data { + std::vector data; + int n_tokens = 0; + int n_embd = 0; + bool is_eval_pos = true; + std::vector v_pos; + std::vector v_neg; + std::vector v_diff; +}; + +static std::string ggml_ne_string(const ggml_tensor * t) { + std::string str; + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + str += std::to_string(t->ne[i]); + if (i + 1 < GGML_MAX_DIMS) { + str += ", "; + } + } + return str; +} + +static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { + auto * cb_data = (callback_data *) user_data; + + static const char * l_out_name = "l_out"; + const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0; + const struct ggml_tensor * src0 = t->src[0]; + const struct ggml_tensor * src1 = t->src[1]; + + if (ask) { + return is_l_out; + } + + if (!is_l_out || t->ne[1] != cb_data->n_tokens) { + return true; + } + + char src1_str[128] = {0}; + if (src1) { + sprintf(src1_str, "%s{%s}", src1->name, 
ggml_ne_string(src1).c_str()); + } + + printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, + t->name, ggml_type_name(t->type), ggml_op_desc(t), + src0->name, ggml_ne_string(src0).c_str(), + src1 ? src1_str : "", + ggml_ne_string(t).c_str()); + + + // copy the data from the GPU memory if needed + const bool is_host = ggml_backend_buffer_is_host(t->buffer); + + if (!is_host) { + auto n_bytes = ggml_nbytes(t); + cb_data->data.resize(n_bytes); + ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes); + } + + if (t->type == GGML_TYPE_F32) { + float * data = (float *) (is_host ? t->data : cb_data->data.data()); + float * dest = (float *) malloc(ggml_nbytes(t)); + memcpy(dest, data, ggml_nbytes(t)); + if (cb_data->is_eval_pos) { + cb_data->v_pos.push_back(dest); + } else { + cb_data->v_neg.push_back(dest); + } + } + + return true; +} + +static bool get_hidden_layers(llama_context * ctx, std::vector & tokens) { + if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return false; + } + return true; +} + +static void padding_seq(llama_context * ctx, std::vector & tokens, size_t len) { + // TODO: customize padding token + std::vector pad_tokens = ::llama_tokenize(ctx, " ", false); + llama_token pad_tok = pad_tokens.back(); + while (tokens.size() < len) { + tokens.push_back(pad_tok); + } +} + +int main(int argc, char ** argv) { + callback_data cb_data; + std::string prompt_pos = "happy"; + std::string prompt_neg = "sad"; + + gpt_params params; + if (!gpt_params_parse(argc, argv, params)) { + return 1; + } + + print_build_info(); + llama_backend_init(); + llama_numa_init(params.numa); + + // pass the callback to the backend scheduler + // it will be executed for each node during the graph computation + params.cb_eval = cb_eval; + params.cb_eval_user_data = &cb_data; + params.warmup = false; + + // init + llama_model * model; + llama_context * ctx; + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (model == nullptr || ctx == nullptr) { + fprintf(stderr, "%s : failed to init\n", __func__); + return 1; + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); + } + + const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); + std::vector tokens_pos = ::llama_tokenize(ctx, prompt_pos, add_bos); + std::vector tokens_neg = ::llama_tokenize(ctx, prompt_neg, add_bos); + size_t max_seq_len = std::max(tokens_pos.size(), tokens_neg.size()); + padding_seq(ctx, tokens_pos, max_seq_len); + padding_seq(ctx, tokens_neg, max_seq_len); + cb_data.n_tokens = max_seq_len; + cb_data.n_embd = llama_n_embd(model); + + cb_data.is_eval_pos = true; + get_hidden_layers(ctx, tokens_pos); + cb_data.is_eval_pos = false; + get_hidden_layers(ctx, tokens_neg); + + printf("%f %f \n", cb_data.v_pos[0][4096], cb_data.v_pos[0][4096]); + printf("%f %f \n", cb_data.v_neg[0][4096], cb_data.v_neg[0][4096]); + + //llama_print_timings(ctx); + + llama_free(ctx); + llama_free_model(model); + + llama_backend_free(); + + return 0; +} From c31c118d86d0725448933b37349db8304867fc59 Mon Sep 17 00:00:00 2001 From: ngxson Date: Fri, 24 May 2024 11:46:47 +0200 Subject: [PATCH 02/56] calc diff --- .../control-vector-generator.cpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index 
5c64c3b747973..2195e28fa0c56 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -97,6 +97,20 @@ static void padding_seq(llama_context * ctx, std::vector & tokens, } } +static void calc_diff(callback_data & cb_data) { + // TODO: assert cb_data.v_pos.size() == cb_data.v_neg.size() + const size_t n_elems = cb_data.n_embd * cb_data.n_tokens; + for (size_t il = 0; il < cb_data.v_pos.size(); il++) { + auto & inp_pos = cb_data.v_pos[il]; + auto & inp_neg = cb_data.v_neg[il]; + float * dest = (float *) malloc(n_elems * sizeof(float *)); + for (size_t i = 0; i < n_elems; i++) { + dest[i] = inp_pos[i] - inp_neg[i]; + } + cb_data.v_diff.push_back(dest); + } +} + int main(int argc, char ** argv) { callback_data cb_data; std::string prompt_pos = "happy"; @@ -149,6 +163,9 @@ int main(int argc, char ** argv) { printf("%f %f \n", cb_data.v_pos[0][4096], cb_data.v_pos[0][4096]); printf("%f %f \n", cb_data.v_neg[0][4096], cb_data.v_neg[0][4096]); + calc_diff(cb_data); + printf("%f %f \n", cb_data.v_diff[0][4096], cb_data.v_diff[0][4096]); + //llama_print_timings(ctx); llama_free(ctx); From b30bea325738065411660bea37fcb91c03b6a594 Mon Sep 17 00:00:00 2001 From: ngxson Date: Fri, 24 May 2024 22:50:03 +0200 Subject: [PATCH 03/56] add comments --- .../control-vector-generator/control-vector-generator.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index 2195e28fa0c56..e729606363254 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -13,9 +13,10 @@ struct callback_data { int n_tokens = 0; int n_embd = 0; bool is_eval_pos = true; - std::vector v_pos; - std::vector v_neg; - std::vector v_diff; + // each element of the vector correspond to one layer + std::vector v_pos; // vector of matrices of size [n_embd, n_tokens] + std::vector v_neg; // vector of matrices of size [n_embd, n_tokens] + std::vector v_diff; // vector of matrices of size [n_embd, n_tokens] }; static std::string ggml_ne_string(const ggml_tensor * t) { From 73747fe8eb2ffe2a71879a76996ad040a4837e86 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Thu, 30 May 2024 00:31:29 -0400 Subject: [PATCH 04/56] proof-of-concept stdlib implementation Implements PCA and file writing using mostly standard libraries. The output is recognized as a functional control vector, but outputs gibberish. 
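For reference, the core of this approach is standard PCA by power iteration:
form A = D * D^T from the per-layer diff matrix D of shape [n_embd, n_tokens],
then iterate b <- normalize(A * b) until b converges to the dominant
eigenvector, which is the control direction. Below is a minimal,
self-contained sketch of that computation; the toy dimensions and values are
illustrative only (not taken from the patch), and the steps loosely mirror
the square_diff() / power_iteration() functions added in this commit:

    #include <cmath>
    #include <cstdio>
    #include <random>
    #include <vector>

    int main() {
        const int n_embd = 4, n_tokens = 3;
        // D stored row-major as [n_embd x n_tokens]; toy values
        std::vector<float> D = {
             1.0f,  0.5f, -0.2f,
             0.3f,  1.1f,  0.4f,
            -0.7f,  0.2f,  0.9f,
             0.1f, -0.4f,  0.6f,
        };

        // A = D * D^T: an [n_embd x n_embd] covariance-like matrix
        std::vector<float> A(n_embd * n_embd, 0.0f);
        for (int i = 0; i < n_embd; ++i)
            for (int j = 0; j < n_embd; ++j)
                for (int k = 0; k < n_tokens; ++k)
                    A[i * n_embd + j] += D[i * n_tokens + k] * D[j * n_tokens + k];

        // random starting vector, normalized
        std::mt19937 rng(42);
        std::uniform_real_distribution<float> dist(0.0f, 1.0f);
        std::vector<float> b(n_embd);
        for (auto & x : b) x = dist(rng);

        auto normalize = [](std::vector<float> & v) {
            float norm = 0.0f;
            for (float x : v) norm += x * x;
            norm = std::sqrt(norm);
            for (float & x : v) x /= norm;
        };
        normalize(b);

        // power iteration: b <- normalize(A * b) until convergence
        for (int iter = 0; iter < 1000; ++iter) {
            std::vector<float> b_prev = b;
            std::vector<float> Ab(n_embd, 0.0f);
            for (int i = 0; i < n_embd; ++i)
                for (int j = 0; j < n_embd; ++j)
                    Ab[i] += A[i * n_embd + j] * b[j];
            b = Ab;
            normalize(b);
            float diff = 0.0f;
            for (int i = 0; i < n_embd; ++i)
                diff += (b[i] - b_prev[i]) * (b[i] - b_prev[i]);
            if (std::sqrt(diff) < 1e-8f) break;
        }

        // b now approximates the dominant eigenvector (sign is arbitrary)
        for (float x : b) printf("%f ", x);
        printf("\n");
        return 0;
    }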
--- examples/CMakeLists.txt | 1 + .../control-vector-generator.cpp | 162 ++++++++++++++++++ 2 files changed, 163 insertions(+) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index b40ee4ccb2ec1..8a5a2b9e5af49 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -12,6 +12,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) if (EMSCRIPTEN) else() + add_subdirectory(control-vector-generator) add_subdirectory(baby-llama) add_subdirectory(batched) add_subdirectory(batched-bench) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index e729606363254..eb7f05038bae9 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -17,6 +17,7 @@ struct callback_data { std::vector v_pos; // vector of matrices of size [n_embd, n_tokens] std::vector v_neg; // vector of matrices of size [n_embd, n_tokens] std::vector v_diff; // vector of matrices of size [n_embd, n_tokens] + std::vector v_final; // vector of finished vectors of size [n_embd] }; static std::string ggml_ne_string(const ggml_tensor * t) { @@ -112,6 +113,162 @@ static void calc_diff(callback_data & cb_data) { } } +// BEGIN NON-GGML IMPLEMENTATION + +// TODO translate to ggml +// this probably doesn't want to be here - put it into the compute graph as a step in processing each layer +static float* square_diff(callback_data & cb_data, size_t idx) { + float* result = new float[cb_data.n_embd * cb_data.n_embd]; + std::memset(result, 0, cb_data.n_embd * cb_data.n_embd * sizeof(float)); + for (size_t i = 0; i < cb_data.n_embd; i++) { + for (size_t j = 0; j < cb_data.n_embd; j++) { + float sum = 0.0f; + for (size_t k = 0; k < cb_data.n_tokens; k++) { + sum += cb_data.v_diff[idx][i * cb_data.n_tokens + k] * cb_data.v_diff[idx][j * cb_data.n_tokens + k]; + } + result[i * cb_data.n_embd + j] = sum; + } + } + return result; +} + +// TODO translate to ggml +static void normalize_inplace(std::vector & vec) { + // inefficient(?) 
norm computation + float norm = 0.0f; + for (const float& val : vec) { + norm += val * val; + } + norm = std::sqrt(norm); + for (float& val : vec) { + val /= norm; + } +} + +// TODO translate to ggml +static std::vector mul_mat(const float * mat, const std::vector & vec, size_t dim) { + std::vector result(dim, 0.0f); + for (size_t i = 0; i < dim; ++i) { + for (size_t j = 0; j < dim; ++j) { + result[i] += mat[i * dim + j] * vec[j]; + } + } + return result; +} + +// TODO translate to ggml +static std::vector power_iteration(callback_data & cb_data, const float * matrix, int maxIterations = 1000, float tolerance = 1e-8) { + std::vector b_tensor = std::vector(); + + // random vector gen/norm + std::default_random_engine generator(static_cast(std::time(0))); + std::uniform_real_distribution distribution(0.0, 1.0); + for (int i = 0; i < cb_data.n_embd; ++i) { + b_tensor.push_back(distribution(generator)); + } + normalize_inplace(b_tensor); + + for (int iter = 0; iter < maxIterations; ++iter) { + + // store the previous one so we can check for convergence + std::vector b_prev_tensor = b_tensor; + + // matrix multiplication and renormalize + b_tensor = mul_mat(matrix, b_tensor, cb_data.n_embd); + normalize_inplace(b_tensor); + + // convergence check + float diff = 0.0; + for (int i = 0; i < cb_data.n_embd; ++i) { + diff += std::pow(b_tensor[i] - b_prev_tensor[i], 2); + } + if (std::sqrt(diff) < tolerance) { + break; + } + } + + return b_tensor; +} + +// TODO translate to ggml +static void pca(callback_data & cb_data) { + for (size_t i = 0; i < cb_data.v_diff.size(); i++) { + float* matrix = square_diff(cb_data, i); + std::vector eigenvector = power_iteration(cb_data, matrix); + cb_data.v_final.push_back(&eigenvector[0]); + delete[] matrix; + // TODO make your print outputs nicer + std::cout << "Done with layer " << i << "\n"; + } +} + +template +static std::string to_string(const T & val) { + std::stringstream ss; + ss << val; + return ss.str(); +} + +static void export_gguf(callback_data & cb_data, const std::string fname) { + struct gguf_context * ctx = gguf_init_empty(); + + gguf_set_val_str(ctx, "general.architecture", "controlvector"); + gguf_set_val_str(ctx, "controlvector.model_hint", "mistral"); // TODO steal this from the model somehow (arch) + gguf_set_val_i32(ctx, "controlvector.layer_count", cb_data.v_final.size()); + + //size_t buf_size = 3u*cb_data.n_embd*sizeof(float); // TODO how much size do i need??? + size_t buf_size = 128u*1024u*4096u; + std::vector buf(buf_size); + + // TODO customize mem size - I have no idea + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf.data(), + /*.no_alloc =*/ false, + }; + + struct ggml_context * ctx_data = ggml_init(params); + + // TODO direction tensor invalid??? probably because you start at 0. see below + for (int i = 0; i < cb_data.v_final.size(); i++) { + const std::string name = "direction." 
+ to_string(i+1); // TODO figure out how to get the number for direction - dl repeng locally and debug + // clone the repo and use importlib + // git clone https://github.com/vgel/repeng.git + + struct ggml_tensor * cur = ggml_new_tensor_1d(ctx_data, GGML_TYPE_F32, cb_data.n_embd); + + std::cout << "Made it past tensor creation"; + + ggml_set_name(cur, name.c_str()); + std::cout << "Made it past tensor name set"; + + // whining about buf != NULL + // TODO figure out how to set data + //ggml_backend_tensor_set(cur, cb_data.v_final[i], 0, cb_data.n_embd * sizeof(float)); // if this doesn't work refer to gguf.cpp example + { + float * data = (float *) cur->data; + for(int j = 0; j < ggml_nelements(cur); j++) { + data[j] = cb_data.v_final[i][j]; + } + } + std::cout << "Made it past tensor backend set"; + + gguf_add_tensor(ctx, cur); + std::cout << "Added tensor " << i << "\n"; + } + + std::cout << "Writing file\n"; + + gguf_write_to_file(ctx, fname.c_str(), false); + + printf("%s: wrote file '%s;\n", __func__, fname.c_str()); + + ggml_free(ctx_data); + gguf_free(ctx); +} + +// END NON-GGML IMPLEMENTATION + int main(int argc, char ** argv) { callback_data cb_data; std::string prompt_pos = "happy"; @@ -167,6 +324,11 @@ int main(int argc, char ** argv) { calc_diff(cb_data); printf("%f %f \n", cb_data.v_diff[0][4096], cb_data.v_diff[0][4096]); + pca(cb_data); + // TODO --outfile + std::cout << "Done with PCA" << "\n"; + export_gguf(cb_data, "controlvector.gguf"); + //llama_print_timings(ctx); llama_free(ctx); From f58f6af1336831e33fb627b7aa1e58289158de08 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Thu, 30 May 2024 11:31:45 -0400 Subject: [PATCH 05/56] param parsing, refactor, comments Added basic command-line parameters for outfile and one each positive/negative prompt. Refactored some messy code in PCA computation and GGUF exporting. Left a bunch of comments regarding further work needed. 
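The parsing works in two passes: the example consumes its own flags first,
counts how many argv slots they occupied ("skipme"), then shifts argc/argv so
the common gpt_params_parse() never sees them. A stripped-down sketch of that
pattern follows; parse_own_args() is a hypothetical stand-in for the
ctrlvec_params_parse() added in this commit, and only --outfile is shown:

    #include <cstdio>
    #include <cstring>
    #include <string>

    // consume flags owned by this example; return how many argv slots they used
    static int parse_own_args(int argc, char ** argv, std::string & outfile) {
        int skipme = 0;
        for (int i = 1; i + 1 < argc; ++i) {
            if (strcmp(argv[i], "--outfile") == 0) {
                outfile = argv[i + 1];
                skipme += 2; // flag + value both belong to us
            }
        }
        return skipme;
    }

    int main(int argc, char ** argv) {
        std::string outfile = "control_vector.gguf";
        int skipme = parse_own_args(argc, argv, outfile);

        // FIXME-style hack kept as in the patch: this assumes our flags come
        // first, and after the shift argv[0] is no longer the program name
        argc -= skipme;
        argv += skipme;

        // the real code hands the remaining args to gpt_params_parse() here
        printf("outfile = %s, %d args left for the common parser\n",
               outfile.c_str(), argc - 1);
        return 0;
    }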
--- .../control-vector-generator.cpp | 172 ++++++++++++++---- 1 file changed, 137 insertions(+), 35 deletions(-) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index eb7f05038bae9..d4f619e9349dc 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -20,6 +20,98 @@ struct callback_data { std::vector v_final; // vector of finished vectors of size [n_embd] }; +struct ctrl_params { + std::string outfile = "control_vector.gguf"; + std::string positive = "happy"; // TODO support multiple positive prompts + std::string negative = "sad"; // TODO support multiple negative prompts +}; + +static void print_usage(const char * executable) { + printf("\n"); + printf("usage: %s [options] -m [gpt-opts]", executable); + printf("\n"); + printf("Creates a GGUF control vector for a given model."); + printf("\n"); + printf("options:\n"); + printf(" -h, --help show this help message and exit\n"); + printf(" --outfile output file (default: 'control_vector.gguf')\n"); + printf(" --positive positive prompt (default: 'happy')\n"); + printf(" --negative negative prompt (default: 'sad')\n"); + printf("\n"); + printf("gpt-opts: other options from main\n"); + printf("\n"); +} + +static int ctrlvec_params_parse_ex(int argc, char ** argv, ctrl_params & params) { + std::string arg; + const std::string arg_prefix = "--"; + int skipme = 0; + + int arg_idx = 1; + for(; arg_idx < argc && strncmp(argv[arg_idx], arg_prefix.c_str(), 2) == 0; ++arg_idx) { + arg = argv[arg_idx]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + + if (arg == "-h" || arg == "--help") { + print_usage(argv[0]); + exit(0); + } + if (arg == "--version") { + fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); + fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); + exit(0); + } + if (arg == "--outfile") { + if (++arg_idx < argc && strncmp(argv[arg_idx], arg_prefix.c_str(), 2) != 0) { + params.outfile = argv[arg_idx]; + // FIXME hack to skip these args in gpt_parse_params + skipme += 2; + } + else { + throw std::invalid_argument("error: missing argument for " + arg); + } + } + if (arg == "--positive") { + if (++arg_idx < argc && strncmp(argv[arg_idx], arg_prefix.c_str(), 2) != 0) { + params.positive = argv[arg_idx]; + // FIXME hack to skip these args in gpt_parse_params + skipme += 2; + } + else { + throw std::invalid_argument("error: missing argument for " + arg); + } + } + if (arg == "--negative") { + if (++arg_idx < argc && strncmp(argv[arg_idx], arg_prefix.c_str(), 2) != 0) { + params.negative = argv[arg_idx]; + // FIXME hack to skip these args in gpt_parse_params + skipme += 2; + } + else { + throw std::invalid_argument("error: missing argument for " + arg); + } + } + + // we do not handle any other unknown arguments here because they will be handled by gpt_parse_params + } + return skipme; +} + +static int ctrlvec_params_parse(int argc, char ** argv, ctrl_params & params) { + int skipme = 0; + try { + skipme = ctrlvec_params_parse_ex(argc, argv, params); + } + catch (const std::invalid_argument & ex) { + fprintf(stderr, "%s\n", ex.what()); + print_usage(argv[0]); + exit(EXIT_FAILURE); + } + return skipme; +} + static std::string ggml_ne_string(const ggml_tensor * t) { std::string str; for (int i = 0; i < GGML_MAX_DIMS; ++i) { @@ -192,14 +284,14 @@ static 
std::vector power_iteration(callback_data & cb_data, const float * // TODO translate to ggml static void pca(callback_data & cb_data) { - for (size_t i = 0; i < cb_data.v_diff.size(); i++) { + for (int i = 0; i < cb_data.v_diff.size(); i++) { float* matrix = square_diff(cb_data, i); std::vector eigenvector = power_iteration(cb_data, matrix); cb_data.v_final.push_back(&eigenvector[0]); delete[] matrix; - // TODO make your print outputs nicer - std::cout << "Done with layer " << i << "\n"; + printf("Done with layer %d\n", i); } + printf("Done with PCA."); } template @@ -209,59 +301,53 @@ static std::string to_string(const T & val) { return ss.str(); } -static void export_gguf(callback_data & cb_data, const std::string fname) { +static void export_gguf(callback_data & cb_data, const std::string fname, const std::string model_hint) { struct gguf_context * ctx = gguf_init_empty(); - gguf_set_val_str(ctx, "general.architecture", "controlvector"); - gguf_set_val_str(ctx, "controlvector.model_hint", "mistral"); // TODO steal this from the model somehow (arch) - gguf_set_val_i32(ctx, "controlvector.layer_count", cb_data.v_final.size()); + const std::string arch = "controlvector"; + gguf_set_val_str(ctx, "general.architecture", arch.c_str()); + gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str()); + gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), cb_data.v_final.size()); - //size_t buf_size = 3u*cb_data.n_embd*sizeof(float); // TODO how much size do i need??? - size_t buf_size = 128u*1024u*4096u; - std::vector buf(buf_size); + //size_t buf_size = 3u*cb_data.n_embd*sizeof(float); // TODO how much size do i need? + size_t buf_size = 128u*1024u*4096u; // FIXME placehokder - // TODO customize mem size - I have no idea + // TODO customize mem size - I have no idea what this is supposed to be struct ggml_init_params params = { /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf.data(), + /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, }; struct ggml_context * ctx_data = ggml_init(params); - // TODO direction tensor invalid??? probably because you start at 0. see below - for (int i = 0; i < cb_data.v_final.size(); i++) { - const std::string name = "direction." + to_string(i+1); // TODO figure out how to get the number for direction - dl repeng locally and debug - // clone the repo and use importlib - // git clone https://github.com/vgel/repeng.git + for (int i = 0; i < cb_data.v_final.size(); ++i) { + // TODO this number is probably not right - figure out which layer is which + // the python implementation uses a dict to handle this, we don't know if it's 1, 2, 3, 4... or other + const std::string name = "direction." 
+ to_string(i+1); struct ggml_tensor * cur = ggml_new_tensor_1d(ctx_data, GGML_TYPE_F32, cb_data.n_embd); - std::cout << "Made it past tensor creation"; - ggml_set_name(cur, name.c_str()); - std::cout << "Made it past tensor name set"; - // whining about buf != NULL - // TODO figure out how to set data - //ggml_backend_tensor_set(cur, cb_data.v_final[i], 0, cb_data.n_embd * sizeof(float)); // if this doesn't work refer to gguf.cpp example + // TODO figure out how to set data - it's whining about buf != NULL when using the below commented line + //ggml_backend_tensor_set(cur, cb_data.v_final[i], 0, cb_data.n_embd * sizeof(float)); { float * data = (float *) cur->data; for(int j = 0; j < ggml_nelements(cur); j++) { data[j] = cb_data.v_final[i][j]; } } - std::cout << "Made it past tensor backend set"; gguf_add_tensor(ctx, cur); - std::cout << "Added tensor " << i << "\n"; + printf("Added tensor %d\n", i); } - std::cout << "Writing file\n"; + printf("Writing file...\n"); gguf_write_to_file(ctx, fname.c_str(), false); - printf("%s: wrote file '%s;\n", __func__, fname.c_str()); + printf("%s: wrote file '%s'\n", __func__, fname.c_str()); ggml_free(ctx_data); gguf_free(ctx); @@ -270,10 +356,14 @@ static void export_gguf(callback_data & cb_data, const std::string fname) { // END NON-GGML IMPLEMENTATION int main(int argc, char ** argv) { - callback_data cb_data; - std::string prompt_pos = "happy"; - std::string prompt_neg = "sad"; + ctrl_params cparams; + int skipme = ctrlvec_params_parse(argc, argv, cparams); + // FIXME hack to skip the ctrlvec args in parsing gpt params + argc -= skipme; + argv += skipme; + + callback_data cb_data; gpt_params params; if (!gpt_params_parse(argc, argv, params)) { return 1; @@ -305,8 +395,17 @@ int main(int argc, char ** argv) { } const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); - std::vector tokens_pos = ::llama_tokenize(ctx, prompt_pos, add_bos); - std::vector tokens_neg = ::llama_tokenize(ctx, prompt_neg, add_bos); + + /* TODO this just tokenizes the exact pos/neg strings, correct? + * instead we want to create a bunch of starter prompts for it to work off + * we need to run get_hidden_layers many many times and then figure out how to combine the resulting vectors + * see the blogpost + python implementation for reference + * + * https://vgel.me/posts/representation-engineering/ + * https://github.com/vgel/repeng/blob/main/repeng/extract.py + */ + std::vector tokens_pos = ::llama_tokenize(ctx, cparams.positive, add_bos); + std::vector tokens_neg = ::llama_tokenize(ctx, cparams.negative, add_bos); size_t max_seq_len = std::max(tokens_pos.size(), tokens_neg.size()); padding_seq(ctx, tokens_pos, max_seq_len); padding_seq(ctx, tokens_neg, max_seq_len); @@ -325,9 +424,12 @@ int main(int argc, char ** argv) { printf("%f %f \n", cb_data.v_diff[0][4096], cb_data.v_diff[0][4096]); pca(cb_data); - // TODO --outfile - std::cout << "Done with PCA" << "\n"; - export_gguf(cb_data, "controlvector.gguf"); + + // TODO figure out how to extract this from model - there's no API exposed to get model arch string + // we need get_arch_name() from llama.cpp + // TODO also has support been implemeneted for arches other than llama yet? 
see #5970 + std::string model_hint = "llama"; + export_gguf(cb_data, cparams.outfile, model_hint); //llama_print_timings(ctx); From dc46264ff068b87779632943eb43b530f7443719 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Thu, 30 May 2024 13:12:54 -0400 Subject: [PATCH 06/56] example template completions Implements an example template set built from the positive/negative prompts like the control vector Python implementation. --- .../control-vector-generator/completions.txt | 582 ++++++++++++++++++ .../control-vector-generator.cpp | 40 ++ 2 files changed, 622 insertions(+) create mode 100644 examples/control-vector-generator/completions.txt diff --git a/examples/control-vector-generator/completions.txt b/examples/control-vector-generator/completions.txt new file mode 100644 index 0000000000000..abc45ffd87269 --- /dev/null +++ b/examples/control-vector-generator/completions.txt @@ -0,0 +1,582 @@ + +That game +I can see +Hmm, this +I can relate to +Who is +I understand the +Ugh, +What the hell was +Hey, did anyone +Although +Thank you for choosing +What are you +Oh w +How dare you open +It was my pleasure +I'm hon +I appreciate that you +Are you k +Whoever left this +It's always +Ew, +Hey, I l +Hello? Is someone +I understand that +That poem +Aww, poor +Hey, it +Alright, who +I didn't +Well, life +The document +Oh no, this +I'm concerned +Hello, this is +This art +Hmm, this drink +Hi there! +It seems +Is +Good +I can't +Ex +Who are +I can see that +Wow, +Today is a +Hey friend +Sometimes friends +Oh, this old +The weather outside +This place is sur +I appreciate your input +Thank you for the +Look at +I'm disappoint +To my +How dare you +That's an +This piece of art +Eww +This park is +This is incredible +Oh no, someone +Exc +Well, it' +I warned +Hey, I understand +Hey, I saw +How dare you go +What the he +Hey +It's +Hello? Hello? +It +Oh no! +This is the perfect +Good morning, +Oh no, there +It's so +Yeah +Uh, +Hello everyone +Who turned off +The weather +Who' +Hey, this +Wait, +Eww, gross +Excuse +It seems like you +Thank you so +What happened? +Oh my g +I am deeply sad +I war +Okay, let' +Hey, that +That was a beautiful +Oh no! That +What happened +Hey there +The artist' +What?! +Hey, it' +I am disappoint +It seems like +Oh no! The +This park is a +If you +Yes! I did +It sounds +What +Who is it +Hmm, that +That's strange +Yeah, that was +That's interesting +This park +What the hell +Who is that +I feel like my +Oh well +What the hell is +Hello? Hello +To my dearest +Bless you!\" +Thank you for +Oh, looks like +Can you please +This place is +Eww, what +Bless you +Is everything +Hey, I just +Whoever left these +Well, that' +I feel +Hey, do you +It's sad +Oh no, it +Hey, that' +Oh my god, +Thank you, +Hello little one, +I apolog +Hey team, I +How dare you read +Who is this and +Whoever left +Hi there! W +A +If you have +I was +U +Bless +Well, this +Oh, I' +It's a +Eww, +Is everything okay? +Oh, I +Hello, can you +Al +That was a great +What are +I understand that not +Oh no, not +Who is it?\" +Hey, can we +Whoever is taking +I would love to +Hey, I noticed +Hey, could +I understand that there +Hello? +D +Oh man, I +Thank you so much +Oh no, my +Dear [Name +Uh +I remember +Hey, who +Well, it +Are you +I understand that it +Hey, is +I would +Who is this +Excuse me +Alright +I am thrilled +Sometimes friends have +Who the +It's interesting +I would love +E +Hello? 
Is anyone +Well, this is +This place +Well, +I warned you +Hey, watch where +Oh my +That' +Sometimes friends have different +I understand that everyone +What? +What do these notes +I can relate +I'm not +I understand +To my dear +Guys +Well +Hey, I appreciate +Wow, what +Dear +That melody +Who the hell +Today is +Hello little +Wow, look +That's great +Love is never wrong +I'm having +Whoa, did +Ugh +Can you please provide +I miss you, +I feel uncom +I know +Ugh, this +Hey, watch +Oh great, a +I didn +Okay +That game of char +Oh +I appreciate +Who's there +I am so +Oh great, someone +Hey, could you +I remember wondering +Wait, what? +What do +Hello? Can +Hey there, +That game of +This is incred +Oh my gosh +Oh great, f +I appreciate your +It sounds like +What the heck +Okay, I understand +Ew +I understand that this +Uh, hi +Hi everyone! +What the hell? +Thank you for your +Oh no, the +Wow, I +Who turned +Dear [ +Whoever +This is a +Whoa, he +What in the world +Although the physical +Hello, who is +That's amaz +Hey, I know +Okay, that +Hi everyone +Hey, is everything +I understand your fr +Oh no, poor +Oh, look +Good morning +Ew, gross +Oh no, did +Look at the family +Hey team +Yes! +Hey, can I +Okay, that' +It's great +Love is +Hey, what +Good morning, world +Who is it? +That poem really reson +I +That's +I understand the task +Gu +Hello? Who' +This postcard is +Whoa, +Oh, that +I understand that I +Whoever is +Hello? Who is +I'm really +Wow, this +Can +This artwork really +This is a shame +I miss you too +Who are you? +Today is a difficult +Hey, just +Are you okay +I am +Hi, +Wow, that +Hey there! Can +Okay, stay +Oh great, just +Yeah, +Hello? Can you +Oh, looks +Thank you for sharing +I'm glad +Hey, is that +Hmm +It was my +It sounds like you +Wow, your +I was promised certain +That was such a +Thank +Excuse you +That was +Hey team, +I feel un +It was +What' +Hey friend, I +How +Saying goodbye +That +It's heart +How dare +Oh, +Hello, may +What's this +Thank you for recogn +Aww, that +Oh, I remember +Hmm, that' +I miss +I know this +Wait +Is everything okay +Who is that person +Wow, you +Oh great +I'm sad +Wow, the +I am very disappoint +Who turned off the +I understand that things +I'm very +Hi +That's very +Okay, I +Oh no, +Wow, there +What's wrong +I apologize for +Hey, I +Can I help you +Oh, I didn +Alright, +Oh wow, +Oh my goodness +I know this event +What in the +Saying +Yeah, that +Guys, I +Hey, this v +This post +Are +Hey, can +Hello? Is +I can only imagine +Oh, that sounds +Hey, is anyone +I am disappointed +Hello, +Hey everyone, I +That was such +It's okay +The artist +Whoa +I understand that mistakes +Can I help +Who +Hi everyone! I +Hey, can you +Wow, how +Today +Oh no, I +Oh well, I +Well, that +This is the +Yes! I finally +Hey there little +Hello everyone! +Love is never +Look at the +This postcard +Oh great, +Can I +Hmm, this is +I understand your +Oh, look at +B +I'm so +Whoa, this +W +Oh, this +Sometimes +This piece of +What the +That was a +Hey, do +Oh no +Whoa, what +I feel like I +The documentary +Hello +Hello little one +I understand that my +Eww, that +Wow, an +Yes! Finally, +Although the physical location +Whoever is watching +That movie +I remember wondering about +Hey there, little +Who's +Hello, who +Hello everyone! Thank +Hello, can +That's too +Hey, just wanted +Hey there, I +Saying good +Hey there! +Who is there? 
+Oh my good +I am very +Oh no, what +Wow, thank +I was promised +Hi, is +Hey, I' +Guys, the +Oh no, that +Who is there +Hello, this +That movie really touched +If you have something +The documentary was +I'm starting +Are you kidd +That movie really +Hey everyone, +Thank you for considering +I didn' +Yes! I +Can you +Oh my god +Hey, whoever +That melody really +Thank you, little +Hello, may I +Look +Wow, we +It looks +What do these +Oh wow +I apologize +What are you all +It's such +It's clear +Hey, I was +Hey friend, +I can only +The weather outside is +Eww, this +I miss you +Wow +Aww, +Hi, is there +This artwork +Okay, +Oh well, +This +I' +Say +Hey there little gu +Hmm, +Whoa, who +I am thr +Oh man +Okay, stay calm +I'm happy +Oh, this cur +Oh man, +I'm sorry +Hello? Who +What?! That +This piece +Hey everyone +That's so +Are you okay? +What happened? Where +Hi there +The +Who the hell entered +I can +Guys, +What's +What in +It's important +I'm +I'm coming +It' +Yes! Finally +Wait, what +Wow, reading +I'm surprised +Hey, did +Hey, +Okay, let +I understand that you +Who the hell threw +Eww, who +Thank you for thinking +Who is this?\" +I am deeply +Thank you for including +Oh no, an +It looks like you +Aww +I'm confused +Wow, it +That poem really +Yes +Hey there, is +Hey, what' +Thank you for remember +To +This is +Thank you for making +I can' +That mel +Wow, they +I feel like +Although the +Who are you +Love +If +What the hell are +I am so sad +Oh, I found +Thank you +It looks like +Well, life is +I appreciate that +The artist's +Whoa, that +It's never \ No newline at end of file diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index d4f619e9349dc..32c200238e00d 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -7,6 +7,8 @@ #include #include #include +#include +#include struct callback_data { std::vector data; @@ -22,8 +24,11 @@ struct callback_data { struct ctrl_params { std::string outfile = "control_vector.gguf"; + std::string completions_file = "examples/control-vector-generator/completions.txt"; std::string positive = "happy"; // TODO support multiple positive prompts std::string negative = "sad"; // TODO support multiple negative prompts + std::vector positive_entries; + std::vector negative_entries; }; static void print_usage(const char * executable) { @@ -35,6 +40,7 @@ static void print_usage(const char * executable) { printf("options:\n"); printf(" -h, --help show this help message and exit\n"); printf(" --outfile output file (default: 'control_vector.gguf')\n"); + printf(" --completions-file completions file (default: 'examples/control-vector-generator/completions.txt')\n"); printf(" --positive positive prompt (default: 'happy')\n"); printf(" --negative negative prompt (default: 'sad')\n"); printf("\n"); @@ -73,6 +79,16 @@ static int ctrlvec_params_parse_ex(int argc, char ** argv, ctrl_params & params) throw std::invalid_argument("error: missing argument for " + arg); } } + if (arg == "--completions-file") { + if (++arg_idx < argc && strncmp(argv[arg_idx], arg_prefix.c_str(), 2) != 0) { + params.completions_file = argv[arg_idx]; + // FIXME hack to skip these args in gpt_parse_params + skipme += 2; + } + else { + throw std::invalid_argument("error: missing argument for " + arg); + } + } if (arg == "--positive") { if (++arg_idx < argc && strncmp(argv[arg_idx], arg_prefix.c_str(), 2) != 0) { 
params.positive = argv[arg_idx]; @@ -112,6 +128,29 @@ static int ctrlvec_params_parse(int argc, char ** argv, ctrl_params & params) { return skipme; } +static std::string format_template(std::string persona, std::string suffix) { + const std::string user_tag = "[INST]"; + const std::string asst_tag = "[/INST]"; + // TODO make this dynamic - allow the user to change it somehow + return user_tag + " Act as if you're extremely " + persona + ". " + asst_tag + " " + suffix; +} + +static void populate_entries(ctrl_params & cparams) { + std::string line; + std::ifstream completions_file(cparams.completions_file); + if (completions_file.is_open()) { + while (std::getline(completions_file, line)) { + // TODO replicate the truncations done by the python implementation + cparams.positive_entries.push_back(format_template(cparams.positive, line)); + cparams.negative_entries.push_back(format_template(cparams.negative, line)); + } + completions_file.close(); + } + else { + throw std::invalid_argument("error: invalid completions file or file could not be opened"); + } +} // TODO actually do something with this + static std::string ggml_ne_string(const ggml_tensor * t) { std::string str; for (int i = 0; i < GGML_MAX_DIMS; ++i) { @@ -358,6 +397,7 @@ static void export_gguf(callback_data & cb_data, const std::string fname, const int main(int argc, char ** argv) { ctrl_params cparams; int skipme = ctrlvec_params_parse(argc, argv, cparams); + //populate_entries(cparams); // FIXME hack to skip the ctrlvec args in parsing gpt params argc -= skipme; From 447023fc43ddb6c86c10035ec3442b960fbc5bbe Mon Sep 17 00:00:00 2001 From: ngxson Date: Thu, 30 May 2024 23:58:32 +0200 Subject: [PATCH 07/56] add multi prompts, multi-thread for PCA --- .../control-vector-generator.cpp | 246 +++++++++++------- 1 file changed, 156 insertions(+), 90 deletions(-) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index 32c200238e00d..f36c1f6a2d835 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -20,13 +20,23 @@ struct callback_data { std::vector v_neg; // vector of matrices of size [n_embd, n_tokens] std::vector v_diff; // vector of matrices of size [n_embd, n_tokens] std::vector v_final; // vector of finished vectors of size [n_embd] + ~callback_data() { + for (auto ptr : v_pos) free(ptr); + for (auto ptr : v_neg) free(ptr); + for (auto ptr : v_diff) free(ptr); + for (auto ptr : v_final) free(ptr); + } }; struct ctrl_params { std::string outfile = "control_vector.gguf"; std::string completions_file = "examples/control-vector-generator/completions.txt"; - std::string positive = "happy"; // TODO support multiple positive prompts - std::string negative = "sad"; // TODO support multiple negative prompts + /* pair of prompts to be used for generating the vectors */ + std::string positive_prompts_file = "positive.txt"; + std::string negative_prompts_file = "negative.txt"; + std::vector positive_prompts; + std::vector negative_prompts; + /* pair of prompts to be used for testing */ std::vector positive_entries; std::vector negative_entries; }; @@ -38,11 +48,11 @@ static void print_usage(const char * executable) { printf("Creates a GGUF control vector for a given model."); printf("\n"); printf("options:\n"); - printf(" -h, --help show this help message and exit\n"); - printf(" --outfile output file (default: 'control_vector.gguf')\n"); - printf(" 
--completions-file completions file (default: 'examples/control-vector-generator/completions.txt')\n"); - printf(" --positive positive prompt (default: 'happy')\n"); - printf(" --negative negative prompt (default: 'sad')\n"); + printf(" -h, --help show this help message and exit\n"); + printf(" --outfile output file (default: 'control_vector.gguf')\n"); + printf(" --completions-file completions file (default: 'examples/control-vector-generator/completions.txt')\n"); + printf(" -pf, --positive-file positive prompts file, one prompt per line (default: 'positive.txt')\n"); + printf(" -nf, --negative-file negative prompts file, one prompt per line (default: 'negative.txt')\n"); printf("\n"); printf("gpt-opts: other options from main\n"); printf("\n"); @@ -74,8 +84,7 @@ static int ctrlvec_params_parse_ex(int argc, char ** argv, ctrl_params & params) params.outfile = argv[arg_idx]; // FIXME hack to skip these args in gpt_parse_params skipme += 2; - } - else { + } else { throw std::invalid_argument("error: missing argument for " + arg); } } @@ -84,28 +93,25 @@ static int ctrlvec_params_parse_ex(int argc, char ** argv, ctrl_params & params) params.completions_file = argv[arg_idx]; // FIXME hack to skip these args in gpt_parse_params skipme += 2; - } - else { + } else { throw std::invalid_argument("error: missing argument for " + arg); } } - if (arg == "--positive") { + if (arg == "--positive-file" || arg == "-pf") { if (++arg_idx < argc && strncmp(argv[arg_idx], arg_prefix.c_str(), 2) != 0) { - params.positive = argv[arg_idx]; + params.positive_prompts_file = argv[arg_idx]; // FIXME hack to skip these args in gpt_parse_params skipme += 2; - } - else { + } else { throw std::invalid_argument("error: missing argument for " + arg); } } - if (arg == "--negative") { + if (arg == "--negative-file" || arg == "-nf") { if (++arg_idx < argc && strncmp(argv[arg_idx], arg_prefix.c_str(), 2) != 0) { - params.negative = argv[arg_idx]; + params.negative_prompts_file = argv[arg_idx]; // FIXME hack to skip these args in gpt_parse_params skipme += 2; - } - else { + } else { throw std::invalid_argument("error: missing argument for " + arg); } } @@ -128,6 +134,22 @@ static int ctrlvec_params_parse(int argc, char ** argv, ctrl_params & params) { return skipme; } +static std::vector ctrlvec_load_prompt_file(std::string path) { + std::vector output; + std::ifstream file(path); + if (!file.is_open()) { + throw std::runtime_error("Unable to open file " + path); + } + std::string line; + while (std::getline(file, line)) { + if (!line.empty()) { // skip empty lines + output.push_back(line); + } + } + file.close(); + return output; +} + static std::string format_template(std::string persona, std::string suffix) { const std::string user_tag = "[INST]"; const std::string asst_tag = "[/INST]"; @@ -135,7 +157,7 @@ static std::string format_template(std::string persona, std::string suffix) { return user_tag + " Act as if you're extremely " + persona + ". 
" + asst_tag + " " + suffix; } -static void populate_entries(ctrl_params & cparams) { +/*static void populate_entries(ctrl_params & cparams) { std::string line; std::ifstream completions_file(cparams.completions_file); if (completions_file.is_open()) { @@ -145,11 +167,10 @@ static void populate_entries(ctrl_params & cparams) { cparams.negative_entries.push_back(format_template(cparams.negative, line)); } completions_file.close(); - } - else { + } else { throw std::invalid_argument("error: invalid completions file or file could not be opened"); } -} // TODO actually do something with this +}*/ // TODO actually do something with this static std::string ggml_ne_string(const ggml_tensor * t) { std::string str; @@ -236,7 +257,7 @@ static void calc_diff(callback_data & cb_data) { for (size_t il = 0; il < cb_data.v_pos.size(); il++) { auto & inp_pos = cb_data.v_pos[il]; auto & inp_neg = cb_data.v_neg[il]; - float * dest = (float *) malloc(n_elems * sizeof(float *)); + float * dest = (float *) malloc(n_elems * sizeof(float)); for (size_t i = 0; i < n_elems; i++) { dest[i] = inp_pos[i] - inp_neg[i]; } @@ -323,13 +344,23 @@ static std::vector power_iteration(callback_data & cb_data, const float * // TODO translate to ggml static void pca(callback_data & cb_data) { - for (int i = 0; i < cb_data.v_diff.size(); i++) { - float* matrix = square_diff(cb_data, i); - std::vector eigenvector = power_iteration(cb_data, matrix); - cb_data.v_final.push_back(&eigenvector[0]); - delete[] matrix; - printf("Done with layer %d\n", i); + size_t n_threads = 8; + int n_layers = cb_data.v_diff.size(); + std::vector threads; + cb_data.v_final.reserve(n_layers); + auto worker_function = [&](int worker_id) { + for (int il = worker_id; il < n_layers; il += n_threads) { + float * matrix = square_diff(cb_data, il); + std::vector eigenvector = power_iteration(cb_data, matrix); + cb_data.v_final[il] = &eigenvector[0]; + delete[] matrix; + printf("Done with layer %d\n", il); + } + }; + for (int i = 0; i < n_threads; ++i) { + threads.emplace_back(worker_function, i); } + for (auto & th : threads) th.join(); printf("Done with PCA."); } @@ -340,32 +371,29 @@ static std::string to_string(const T & val) { return ss.str(); } -static void export_gguf(callback_data & cb_data, const std::string fname, const std::string model_hint) { +static void export_gguf(std::vector v_final, int n_embd, const std::string fname, const std::string model_hint) { struct gguf_context * ctx = gguf_init_empty(); const std::string arch = "controlvector"; gguf_set_val_str(ctx, "general.architecture", arch.c_str()); gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str()); - gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), cb_data.v_final.size()); - - //size_t buf_size = 3u*cb_data.n_embd*sizeof(float); // TODO how much size do i need? - size_t buf_size = 128u*1024u*4096u; // FIXME placehokder + gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_final.size()); // TODO customize mem size - I have no idea what this is supposed to be struct ggml_init_params params = { - /*.mem_size =*/ buf_size, + /*.mem_size =*/ ggml_tensor_overhead() * v_final.size(), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, }; struct ggml_context * ctx_data = ggml_init(params); - for (int i = 0; i < cb_data.v_final.size(); ++i) { + for (size_t i = 0; i < v_final.size(); ++i) { // TODO this number is probably not right - figure out which layer is which // the python implementation uses a dict to handle this, we don't know if it's 1, 2, 3, 4... 
or other const std::string name = "direction." + to_string(i+1); - struct ggml_tensor * cur = ggml_new_tensor_1d(ctx_data, GGML_TYPE_F32, cb_data.n_embd); + struct ggml_tensor * cur = ggml_new_tensor_1d(ctx_data, GGML_TYPE_F32, n_embd); ggml_set_name(cur, name.c_str()); @@ -374,7 +402,7 @@ static void export_gguf(callback_data & cb_data, const std::string fname, const { float * data = (float *) cur->data; for(int j = 0; j < ggml_nelements(cur); j++) { - data[j] = cb_data.v_final[i][j]; + data[j] = v_final[i][j]; } } @@ -403,78 +431,116 @@ int main(int argc, char ** argv) { argc -= skipme; argv += skipme; - callback_data cb_data; gpt_params params; if (!gpt_params_parse(argc, argv, params)) { return 1; } + // load prompts + cparams.positive_prompts = ctrlvec_load_prompt_file(cparams.positive_prompts_file); + cparams.negative_prompts = ctrlvec_load_prompt_file(cparams.negative_prompts_file); + if (cparams.positive_prompts.size() != cparams.negative_prompts.size()) { + fprintf(stderr, "number of positive and negative prompts must be equal"); + return 1; + } + print_build_info(); llama_backend_init(); llama_numa_init(params.numa); - // pass the callback to the backend scheduler - // it will be executed for each node during the graph computation - params.cb_eval = cb_eval; - params.cb_eval_user_data = &cb_data; - params.warmup = false; - - // init + // load the model to get hparams llama_model * model; llama_context * ctx; std::tie(model, ctx) = llama_init_from_gpt_params(params); - if (model == nullptr || ctx == nullptr) { - fprintf(stderr, "%s : failed to init\n", __func__); - return 1; + int n_layers = llama_n_layer(model); + int n_embd = llama_n_embd(model); + int n_prompts = cparams.positive_prompts.size(); + // vector of finished vectors of size [n_embd], we have (n_layers - 1) vectors in total + std::vector v_final(n_layers - 1, NULL); + for (size_t i = 0; i < v_final.size(); ++i) { + v_final[i] = (float *) calloc(n_embd, sizeof(float)); } + llama_free(ctx); + llama_free_model(model); + + for (size_t i = 0; i < n_prompts; ++i) { + callback_data cb_data; + + // pass the callback to the backend scheduler + // it will be executed for each node during the graph computation + params.cb_eval = cb_eval; + params.cb_eval_user_data = &cb_data; + params.warmup = false; + + // load model + llama_model * model; + llama_context * ctx; + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (model == nullptr || ctx == nullptr) { + fprintf(stderr, "%s : failed to init\n", __func__); + return 1; + } + + const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); + + /* TODO this just tokenizes the exact pos/neg strings, correct? 
+ * instead we want to create a bunch of starter prompts for it to work off + * we need to run get_hidden_layers many many times and then figure out how to combine the resulting vectors + * see the blogpost + python implementation for reference + * + * https://vgel.me/posts/representation-engineering/ + * https://github.com/vgel/repeng/blob/main/repeng/extract.py + */ + std::string positive_prompt = cparams.positive_prompts[i]; + std::string negative_prompt = cparams.negative_prompts[i]; + std::vector tokens_pos = ::llama_tokenize(ctx, positive_prompt, add_bos); + std::vector tokens_neg = ::llama_tokenize(ctx, negative_prompt, add_bos); + size_t max_seq_len = std::max(tokens_pos.size(), tokens_neg.size()); + padding_seq(ctx, tokens_pos, max_seq_len); + padding_seq(ctx, tokens_neg, max_seq_len); + cb_data.n_tokens = max_seq_len; + cb_data.n_embd = n_embd; + + printf("Evaluating prompt: \"%s\" - \"%s\" (%ld tokens)\n", positive_prompt.c_str(), negative_prompt.c_str(), max_seq_len); + + cb_data.is_eval_pos = true; + get_hidden_layers(ctx, tokens_pos); + cb_data.is_eval_pos = false; + get_hidden_layers(ctx, tokens_neg); + + printf("%f %f \n", cb_data.v_pos[0][4096], cb_data.v_pos[0][4096]); + printf("%f %f \n", cb_data.v_neg[0][4096], cb_data.v_neg[0][4096]); + + calc_diff(cb_data); + printf("%f %f \n", cb_data.v_diff[0][4096], cb_data.v_diff[0][4096]); + + printf("Running PCA...\n"); + pca(cb_data); + + // add the output vector to v_final + for (size_t j = 0; j < cb_data.v_final.size(); ++j) { + for (size_t k = 0; k < n_embd; ++k) { + v_final[j][k] += cb_data.v_final[j][k]; + } + } - // print system information - { - fprintf(stderr, "\n"); - fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); + llama_free(ctx); + llama_free_model(model); } - const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); - - /* TODO this just tokenizes the exact pos/neg strings, correct? - * instead we want to create a bunch of starter prompts for it to work off - * we need to run get_hidden_layers many many times and then figure out how to combine the resulting vectors - * see the blogpost + python implementation for reference - * - * https://vgel.me/posts/representation-engineering/ - * https://github.com/vgel/repeng/blob/main/repeng/extract.py - */ - std::vector tokens_pos = ::llama_tokenize(ctx, cparams.positive, add_bos); - std::vector tokens_neg = ::llama_tokenize(ctx, cparams.negative, add_bos); - size_t max_seq_len = std::max(tokens_pos.size(), tokens_neg.size()); - padding_seq(ctx, tokens_pos, max_seq_len); - padding_seq(ctx, tokens_neg, max_seq_len); - cb_data.n_tokens = max_seq_len; - cb_data.n_embd = llama_n_embd(model); - - cb_data.is_eval_pos = true; - get_hidden_layers(ctx, tokens_pos); - cb_data.is_eval_pos = false; - get_hidden_layers(ctx, tokens_neg); - - printf("%f %f \n", cb_data.v_pos[0][4096], cb_data.v_pos[0][4096]); - printf("%f %f \n", cb_data.v_neg[0][4096], cb_data.v_neg[0][4096]); - - calc_diff(cb_data); - printf("%f %f \n", cb_data.v_diff[0][4096], cb_data.v_diff[0][4096]); - - pca(cb_data); + // calculate the mean value of v_final + // TODO: maybe using LERP here + for (size_t j = 0; j < v_final.size(); ++j) { + for (size_t k = 0; k < n_embd; ++k) { + v_final[j][k] /= n_prompts; + } + } // TODO figure out how to extract this from model - there's no API exposed to get model arch string // we need get_arch_name() from llama.cpp // TODO also has support been implemeneted for arches other than llama yet? 
see #5970 std::string model_hint = "llama"; - export_gguf(cb_data, cparams.outfile, model_hint); - - //llama_print_timings(ctx); - - llama_free(ctx); - llama_free_model(model); + export_gguf(v_final, n_embd, cparams.outfile, model_hint); llama_backend_free(); From 287da25f482e4c6f65ac83d1a33fa1f534d4b152 Mon Sep 17 00:00:00 2001 From: ngxson Date: Fri, 31 May 2024 00:06:45 +0200 Subject: [PATCH 08/56] fix mem error --- examples/control-vector-generator/control-vector-generator.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index f36c1f6a2d835..2c09809dd763f 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -381,7 +381,8 @@ static void export_gguf(std::vector v_final, int n_embd, const std::str // TODO customize mem size - I have no idea what this is supposed to be struct ggml_init_params params = { - /*.mem_size =*/ ggml_tensor_overhead() * v_final.size(), + /*.mem_size =*/ (ggml_tensor_overhead() * v_final.size()) + + (n_embd * v_final.size() * sizeof(float)), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, }; From d446c6d88730ee79beb933643dd3d9d095839390 Mon Sep 17 00:00:00 2001 From: ngxson Date: Fri, 31 May 2024 00:41:12 +0200 Subject: [PATCH 09/56] add debugs --- .../control-vector-generator.cpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index 2c09809dd763f..03e7fa5e3456e 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -352,16 +352,19 @@ static void pca(callback_data & cb_data) { for (int il = worker_id; il < n_layers; il += n_threads) { float * matrix = square_diff(cb_data, il); std::vector eigenvector = power_iteration(cb_data, matrix); - cb_data.v_final[il] = &eigenvector[0]; + cb_data.v_final[il] = (float *) malloc(eigenvector.size() * sizeof(float)); + memcpy(cb_data.v_final[il], eigenvector.data(), eigenvector.size() * sizeof(float)); delete[] matrix; printf("Done with layer %d\n", il); + printf("il = %d | %f %f \n", il, cb_data.v_final[il][0], cb_data.v_final[il][1]); } }; + printf("Running PCA...\n"); for (int i = 0; i < n_threads; ++i) { threads.emplace_back(worker_function, i); } for (auto & th : threads) th.join(); - printf("Done with PCA."); + printf("Done with PCA.\n"); } template @@ -509,13 +512,12 @@ int main(int argc, char ** argv) { cb_data.is_eval_pos = false; get_hidden_layers(ctx, tokens_neg); - printf("%f %f \n", cb_data.v_pos[0][4096], cb_data.v_pos[0][4096]); - printf("%f %f \n", cb_data.v_neg[0][4096], cb_data.v_neg[0][4096]); + printf("%f %f \n", cb_data.v_pos[0][4096], cb_data.v_pos[0][4097]); + printf("%f %f \n", cb_data.v_neg[0][4096], cb_data.v_neg[0][4097]); calc_diff(cb_data); - printf("%f %f \n", cb_data.v_diff[0][4096], cb_data.v_diff[0][4096]); + printf("%f %f \n", cb_data.v_diff[0][4096], cb_data.v_diff[0][4097]); - printf("Running PCA...\n"); pca(cb_data); // add the output vector to v_final @@ -524,6 +526,7 @@ int main(int argc, char ** argv) { v_final[j][k] += cb_data.v_final[j][k]; } } + printf("v_final %f %f \n", cb_data.v_final[0][0], cb_data.v_final[0][1]); llama_free(ctx); llama_free_model(model); From 
31f153fe9caf1ba48a7337055f3c3d9eb3f86d57 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Thu, 30 May 2024 21:36:17 -0400 Subject: [PATCH 10/56] fix matrix transpose multiplication you have got to be kidding me --- examples/control-vector-generator/control-vector-generator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index 03e7fa5e3456e..57e43dcf7dc15 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -276,7 +276,7 @@ static float* square_diff(callback_data & cb_data, size_t idx) { for (size_t j = 0; j < cb_data.n_embd; j++) { float sum = 0.0f; for (size_t k = 0; k < cb_data.n_tokens; k++) { - sum += cb_data.v_diff[idx][i * cb_data.n_tokens + k] * cb_data.v_diff[idx][j * cb_data.n_tokens + k]; + sum += cb_data.v_diff[idx][i + cb_data.n_embd * k] * cb_data.v_diff[idx][j + cb_data.n_embd * k]; } result[i * cb_data.n_embd + j] = sum; } From fa85ba6ae39ace8a5045f0ff25de71393f05d4e8 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Thu, 30 May 2024 23:39:59 -0400 Subject: [PATCH 11/56] preliminary template/multiprompt support model is running out of context and that ought to be fixed (segfaulting) but other than that it looks goodish --- .../control-vector-generator.cpp | 153 ++++++++++++------ .../control-vector-generator/negative.txt | 1 + .../control-vector-generator/positive.txt | 1 + 3 files changed, 103 insertions(+), 52 deletions(-) create mode 100644 examples/control-vector-generator/negative.txt create mode 100644 examples/control-vector-generator/positive.txt diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index 57e43dcf7dc15..cee9a016ac447 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -10,6 +10,11 @@ #include #include +struct diff_wrapper { + float * diff; + size_t n_rows; +}; + struct callback_data { std::vector data; int n_tokens = 0; @@ -20,11 +25,14 @@ struct callback_data { std::vector v_neg; // vector of matrices of size [n_embd, n_tokens] std::vector v_diff; // vector of matrices of size [n_embd, n_tokens] std::vector v_final; // vector of finished vectors of size [n_embd] + // each element of the outer vector correspond to one layer, each element of the inner vector correspond to one prompt pass + std::vector> v_diffs_wrapped; // vector of compiled diff matrices to be concatenated ~callback_data() { for (auto ptr : v_pos) free(ptr); for (auto ptr : v_neg) free(ptr); for (auto ptr : v_diff) free(ptr); for (auto ptr : v_final) free(ptr); + for (auto & vec : v_diffs_wrapped) for (auto ptr : vec) free(ptr.diff); } }; @@ -32,8 +40,8 @@ struct ctrl_params { std::string outfile = "control_vector.gguf"; std::string completions_file = "examples/control-vector-generator/completions.txt"; /* pair of prompts to be used for generating the vectors */ - std::string positive_prompts_file = "positive.txt"; - std::string negative_prompts_file = "negative.txt"; + std::string positive_prompts_file = "examples/control-vector-generator/positive.txt"; + std::string negative_prompts_file = "examples/control-vector-generator/negative.txt"; std::vector positive_prompts; std::vector negative_prompts; /* pair of prompts to be used for testing */ @@ 
-157,20 +165,20 @@ static std::string format_template(std::string persona, std::string suffix) { return user_tag + " Act as if you're extremely " + persona + ". " + asst_tag + " " + suffix; } -/*static void populate_entries(ctrl_params & cparams) { +static void populate_entries(ctrl_params & cparams, std::string positive, std::string negative) { std::string line; std::ifstream completions_file(cparams.completions_file); if (completions_file.is_open()) { while (std::getline(completions_file, line)) { // TODO replicate the truncations done by the python implementation - cparams.positive_entries.push_back(format_template(cparams.positive, line)); - cparams.negative_entries.push_back(format_template(cparams.negative, line)); + cparams.positive_entries.push_back(format_template(positive, line)); + cparams.negative_entries.push_back(format_template(negative, line)); } completions_file.close(); } else { throw std::invalid_argument("error: invalid completions file or file could not be opened"); } -}*/ // TODO actually do something with this +} static std::string ggml_ne_string(const ggml_tensor * t) { std::string str; @@ -251,6 +259,11 @@ static void padding_seq(llama_context * ctx, std::vector & tokens, } } +static bool is_row_all_zeros(float * diff, size_t row, size_t cols) { + for (size_t i = 0; i < cols; ++i) if (diff[row * cols + i] != 0.0) return false; + return true; +} + static void calc_diff(callback_data & cb_data) { // TODO: assert cb_data.v_pos.size() == cb_data.v_neg.size() const size_t n_elems = cb_data.n_embd * cb_data.n_tokens; @@ -261,7 +274,47 @@ static void calc_diff(callback_data & cb_data) { for (size_t i = 0; i < n_elems; i++) { dest[i] = inp_pos[i] - inp_neg[i]; } - cb_data.v_diff.push_back(dest); + + // strip zero rows + std::vector nonzero_rows; + for (size_t i = 0; i < cb_data.n_tokens; ++i) { + if (!is_row_all_zeros(dest, i, cb_data.n_embd)) { + nonzero_rows.push_back(i); + } + } + + diff_wrapper dw; + dw.n_rows = nonzero_rows.size(); + dw.diff = (float *) malloc(dw.n_rows * cb_data.n_embd * sizeof(float)); + + size_t offset = 0; + for (size_t i = 0; i < dw.n_rows; ++i) { + float * origin = dest + nonzero_rows[i] * cb_data.n_embd; + memcpy(dw.diff + offset, origin, cb_data.n_embd * sizeof(float)); + offset += cb_data.n_embd; + } + + cb_data.v_diffs_wrapped[il].push_back(dw); + delete dest; + } +} + +static void concatenate_diffs(callback_data & cb_data) { + for (size_t i = 0; i < cb_data.v_diffs_wrapped.size(); ++i) { + std::vector & vec = cb_data.v_diffs_wrapped[i]; + size_t n_rows_total = 0; + for (size_t j = 0; i < vec.size(); ++j) { + n_rows_total += vec[j].n_rows; + } + float * diff = (float *) malloc(n_rows_total * cb_data.n_embd * sizeof(float)); + size_t offset = 0; + for (size_t j = 0; j < vec.size(); ++j) { + float * origin = vec[j].diff; + memcpy(diff + offset, origin, vec[j].n_rows * cb_data.n_embd * sizeof(float)); + offset += vec[j].n_rows * cb_data.n_embd; + delete vec[j].diff; + } + cb_data.v_diff.push_back(diff); } } @@ -382,7 +435,6 @@ static void export_gguf(std::vector v_final, int n_embd, const std::str gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str()); gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_final.size()); - // TODO customize mem size - I have no idea what this is supposed to be struct ggml_init_params params = { /*.mem_size =*/ (ggml_tensor_overhead() * v_final.size()) + (n_embd * v_final.size() * sizeof(float)), @@ -464,39 +516,25 @@ int main(int argc, char ** argv) { for (size_t i = 0; i < v_final.size(); 
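The zero-row stripping introduced above compacts a row-major [n_tokens, n_embd] matrix down to its nonzero rows before PCA. A simplified sketch of that compaction with std::vector ownership (hypothetical helper, not the patch's code):

    #include <vector>

    // Sketch: keep only the rows of a row-major [n_rows, n_cols] matrix that
    // contain at least one nonzero entry; returns the compacted matrix.
    static std::vector<float> strip_zero_rows(const float * mat, size_t n_rows, size_t n_cols) {
        std::vector<float> out;
        for (size_t r = 0; r < n_rows; ++r) {
            const float * row = mat + r * n_cols;
            bool all_zero = true;
            for (size_t c = 0; c < n_cols; ++c) {
                if (row[c] != 0.0f) { all_zero = false; break; }
            }
            if (!all_zero) {
                out.insert(out.end(), row, row + n_cols);
            }
        }
        return out; // out.size() / n_cols gives the number of surviving rows
    }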
++i) { v_final[i] = (float *) calloc(n_embd, sizeof(float)); } - llama_free(ctx); - llama_free_model(model); + // create templated prompts for (size_t i = 0; i < n_prompts; ++i) { - callback_data cb_data; - - // pass the callback to the backend scheduler - // it will be executed for each node during the graph computation - params.cb_eval = cb_eval; - params.cb_eval_user_data = &cb_data; - params.warmup = false; - - // load model - llama_model * model; - llama_context * ctx; - std::tie(model, ctx) = llama_init_from_gpt_params(params); - if (model == nullptr || ctx == nullptr) { - fprintf(stderr, "%s : failed to init\n", __func__); - return 1; - } + populate_entries(cparams, cparams.positive_prompts[i], cparams.negative_prompts[i]); + } + + callback_data cb_data; + + // pass the callback to the backend scheduler + // it will be executed for each node during the graph computation + params.cb_eval = cb_eval; + params.cb_eval_user_data = &cb_data; + params.warmup = false; - const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); - - /* TODO this just tokenizes the exact pos/neg strings, correct? - * instead we want to create a bunch of starter prompts for it to work off - * we need to run get_hidden_layers many many times and then figure out how to combine the resulting vectors - * see the blogpost + python implementation for reference - * - * https://vgel.me/posts/representation-engineering/ - * https://github.com/vgel/repeng/blob/main/repeng/extract.py - */ - std::string positive_prompt = cparams.positive_prompts[i]; - std::string negative_prompt = cparams.negative_prompts[i]; + const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); + + for(size_t i = 0; i < cparams.positive_entries.size(); ++i) { + std::string positive_prompt = cparams.positive_entries[i]; + std::string negative_prompt = cparams.negative_entries[i]; std::vector tokens_pos = ::llama_tokenize(ctx, positive_prompt, add_bos); std::vector tokens_neg = ::llama_tokenize(ctx, negative_prompt, add_bos); size_t max_seq_len = std::max(tokens_pos.size(), tokens_neg.size()); @@ -511,26 +549,37 @@ int main(int argc, char ** argv) { get_hidden_layers(ctx, tokens_pos); cb_data.is_eval_pos = false; get_hidden_layers(ctx, tokens_neg); + // FIXME because you don't reload the model you actually run out of context lmao + // fix that... or do we want to reload for every new prompt? 
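padding_seq exists so the positive and negative passes produce equally shaped hidden-state matrices. A stripped-down sketch of the same idea (plain int tokens for brevity; a real version would use llama_token and a deliberately chosen pad token, as the TODO in the code notes):

    #include <algorithm>
    #include <vector>

    // Sketch: right-pad two token sequences to a common length with pad_tok so
    // that both evaluations yield hidden states with the same n_tokens.
    static void pad_to_common_len(std::vector<int> & a, std::vector<int> & b, int pad_tok) {
        const size_t len = std::max(a.size(), b.size());
        a.resize(len, pad_tok);
        b.resize(len, pad_tok);
    }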
but that would take forever + // perhaps add that as a flag to the program - printf("%f %f \n", cb_data.v_pos[0][4096], cb_data.v_pos[0][4097]); - printf("%f %f \n", cb_data.v_neg[0][4096], cb_data.v_neg[0][4097]); - + // TODO actually check that this works calc_diff(cb_data); - printf("%f %f \n", cb_data.v_diff[0][4096], cb_data.v_diff[0][4097]); - pca(cb_data); + // reset for next iteration + cb_data.v_pos.clear(); + cb_data.v_neg.clear(); + } - // add the output vector to v_final - for (size_t j = 0; j < cb_data.v_final.size(); ++j) { - for (size_t k = 0; k < n_embd; ++k) { - v_final[j][k] += cb_data.v_final[j][k]; - } - } - printf("v_final %f %f \n", cb_data.v_final[0][0], cb_data.v_final[0][1]); + // TODO actually check that this works + concatenate_diffs(cb_data); + + // TODO diffs should be the same from here but still check that this works + pca(cb_data); - llama_free(ctx); - llama_free_model(model); + // TODO get rid of this + // add the output vector to v_final + for (size_t j = 0; j < cb_data.v_final.size(); ++j) { + for (size_t k = 0; k < n_embd; ++k) { + v_final[j][k] += cb_data.v_final[j][k]; + } } + printf("v_final %f %f \n", cb_data.v_final[0][0], cb_data.v_final[0][1]); + + llama_free(ctx); + llama_free_model(model); + + // calculate the mean value of v_final // TODO: maybe using LERP here diff --git a/examples/control-vector-generator/negative.txt b/examples/control-vector-generator/negative.txt new file mode 100644 index 0000000000000..4f84a22ba9f9a --- /dev/null +++ b/examples/control-vector-generator/negative.txt @@ -0,0 +1 @@ +sad \ No newline at end of file diff --git a/examples/control-vector-generator/positive.txt b/examples/control-vector-generator/positive.txt new file mode 100644 index 0000000000000..adaa78b2175c6 --- /dev/null +++ b/examples/control-vector-generator/positive.txt @@ -0,0 +1 @@ +happy \ No newline at end of file From 4d88cd1af1a7975220573642c5509f7de927a818 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Fri, 31 May 2024 12:40:35 -0400 Subject: [PATCH 12/56] fix zero output & param parsing, functional templating fixed a bug where the output file had no tensor data/was all zero fixed a bug where single hyphen flags were not being correctly parsed implements creation of templated prompts from input (still need to adapt based on model) --- .../control-vector-generator.cpp | 154 +++++++++--------- .../control-vector-generator/negative.txt | 2 +- .../control-vector-generator/positive.txt | 2 +- 3 files changed, 80 insertions(+), 78 deletions(-) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index cee9a016ac447..8d4983c88a66c 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -11,8 +11,8 @@ #include struct diff_wrapper { - float * diff; - size_t n_rows; + float * diff; // matrix of size [n_rows, cb_data.n_embd] with zero rows stripped + size_t n_rows; // number of rows in the matrix for size calculation }; struct callback_data { @@ -56,23 +56,24 @@ static void print_usage(const char * executable) { printf("Creates a GGUF control vector for a given model."); printf("\n"); printf("options:\n"); - printf(" -h, --help show this help message and exit\n"); - printf(" --outfile output file (default: 'control_vector.gguf')\n"); - printf(" --completions-file completions file (default: 'examples/control-vector-generator/completions.txt')\n"); - printf(" -pf, 
--positive-file positive prompts file, one prompt per line (default: 'positive.txt')\n"); - printf(" -nf, --negative-file negative prompts file, one prompt per line (default: 'negative.txt')\n"); + printf(" -h, --help show this help message and exit\n"); + printf(" -o, --outfile output file (default: 'control_vector.gguf')\n"); + printf(" -cf, --completions-file completions file (default: 'examples/control-vector-generator/completions.txt')\n"); + printf(" -pf, --positive-file positive prompts file, one prompt per line (default: 'examples/control-vector-generator/positive.txt')\n"); + printf(" -nf, --negative-file negative prompts file, one prompt per line (default: 'examples/control-vector-generator/negative.txt')\n"); printf("\n"); - printf("gpt-opts: other options from main\n"); + printf("gpt-opts:\n"); + printf(" other options from main\n"); printf("\n"); } static int ctrlvec_params_parse_ex(int argc, char ** argv, ctrl_params & params) { std::string arg; const std::string arg_prefix = "--"; + // hack to skip ctrlvec args in gpt_parse_params but we'll leave it as is int skipme = 0; - int arg_idx = 1; - for(; arg_idx < argc && strncmp(argv[arg_idx], arg_prefix.c_str(), 2) == 0; ++arg_idx) { + for(int arg_idx = 1; arg_idx < argc; ++arg_idx) { arg = argv[arg_idx]; if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { std::replace(arg.begin(), arg.end(), '_', '-'); @@ -87,19 +88,17 @@ static int ctrlvec_params_parse_ex(int argc, char ** argv, ctrl_params & params) fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); exit(0); } - if (arg == "--outfile") { + if (arg == "--outfile" || arg == "-o") { if (++arg_idx < argc && strncmp(argv[arg_idx], arg_prefix.c_str(), 2) != 0) { params.outfile = argv[arg_idx]; - // FIXME hack to skip these args in gpt_parse_params skipme += 2; } else { throw std::invalid_argument("error: missing argument for " + arg); } } - if (arg == "--completions-file") { + if (arg == "--completions-file" || arg == "-cf") { if (++arg_idx < argc && strncmp(argv[arg_idx], arg_prefix.c_str(), 2) != 0) { params.completions_file = argv[arg_idx]; - // FIXME hack to skip these args in gpt_parse_params skipme += 2; } else { throw std::invalid_argument("error: missing argument for " + arg); @@ -108,7 +107,6 @@ static int ctrlvec_params_parse_ex(int argc, char ** argv, ctrl_params & params) if (arg == "--positive-file" || arg == "-pf") { if (++arg_idx < argc && strncmp(argv[arg_idx], arg_prefix.c_str(), 2) != 0) { params.positive_prompts_file = argv[arg_idx]; - // FIXME hack to skip these args in gpt_parse_params skipme += 2; } else { throw std::invalid_argument("error: missing argument for " + arg); @@ -117,13 +115,12 @@ static int ctrlvec_params_parse_ex(int argc, char ** argv, ctrl_params & params) if (arg == "--negative-file" || arg == "-nf") { if (++arg_idx < argc && strncmp(argv[arg_idx], arg_prefix.c_str(), 2) != 0) { params.negative_prompts_file = argv[arg_idx]; - // FIXME hack to skip these args in gpt_parse_params skipme += 2; } else { throw std::invalid_argument("error: missing argument for " + arg); } } - + // TODO it might be nice QoL to have single positive/negative args // we do not handle any other unknown arguments here because they will be handled by gpt_parse_params } return skipme; @@ -161,8 +158,9 @@ static std::vector ctrlvec_load_prompt_file(std::string path) { static std::string format_template(std::string persona, std::string suffix) { const std::string user_tag = "[INST]"; const std::string asst_tag = "[/INST]"; - // TODO make 
this dynamic - allow the user to change it somehow - return user_tag + " Act as if you're extremely " + persona + ". " + asst_tag + " " + suffix; + // TODO make this dynamic - allow the user to change it somehow - and adapt based on model + //return user_tag + " Act as if you're extremely " + persona + ". " + asst_tag + " " + suffix; + return persona + " " + suffix; // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST]" } static void populate_entries(ctrl_params & cparams, std::string positive, std::string negative) { @@ -259,14 +257,15 @@ static void padding_seq(llama_context * ctx, std::vector & tokens, } } -static bool is_row_all_zeros(float * diff, size_t row, size_t cols) { - for (size_t i = 0; i < cols; ++i) if (diff[row * cols + i] != 0.0) return false; +static bool is_row_all_zeros(float * diff, size_t row, size_t cols, float eps = 1e-6) { + for (size_t i = 0; i < cols; ++i) if (diff[row * cols + i] > eps) return false; return true; } static void calc_diff(callback_data & cb_data) { // TODO: assert cb_data.v_pos.size() == cb_data.v_neg.size() const size_t n_elems = cb_data.n_embd * cb_data.n_tokens; + cb_data.v_diffs_wrapped.resize(cb_data.v_pos.size()); for (size_t il = 0; il < cb_data.v_pos.size(); il++) { auto & inp_pos = cb_data.v_pos[il]; auto & inp_neg = cb_data.v_neg[il]; @@ -275,6 +274,8 @@ static void calc_diff(callback_data & cb_data) { dest[i] = inp_pos[i] - inp_neg[i]; } + // TODO can we make this faster? like check during the above operation rather than on a second pass? + // strip zero rows std::vector nonzero_rows; for (size_t i = 0; i < cb_data.n_tokens; ++i) { @@ -283,7 +284,13 @@ static void calc_diff(callback_data & cb_data) { } } - diff_wrapper dw; + /* debug + if(cb_data.n_tokens != nonzero_rows.size()) { + std::cout << "original n_tokens: " << cb_data.n_tokens << std::endl; + std::cout << "zero rows in layer " << il << ": " << cb_data.n_tokens - nonzero_rows.size() << std::endl; + } */ + + struct diff_wrapper dw; dw.n_rows = nonzero_rows.size(); dw.diff = (float *) malloc(dw.n_rows * cb_data.n_embd * sizeof(float)); @@ -303,16 +310,16 @@ static void concatenate_diffs(callback_data & cb_data) { for (size_t i = 0; i < cb_data.v_diffs_wrapped.size(); ++i) { std::vector & vec = cb_data.v_diffs_wrapped[i]; size_t n_rows_total = 0; - for (size_t j = 0; i < vec.size(); ++j) { + for (size_t j = 0; j < vec.size(); ++j) { n_rows_total += vec[j].n_rows; } + // std::cout << "n_rows_total: " << n_rows_total << std::endl; float * diff = (float *) malloc(n_rows_total * cb_data.n_embd * sizeof(float)); size_t offset = 0; for (size_t j = 0; j < vec.size(); ++j) { float * origin = vec[j].diff; memcpy(diff + offset, origin, vec[j].n_rows * cb_data.n_embd * sizeof(float)); offset += vec[j].n_rows * cb_data.n_embd; - delete vec[j].diff; } cb_data.v_diff.push_back(diff); } @@ -344,6 +351,7 @@ static void normalize_inplace(std::vector & vec) { for (const float& val : vec) { norm += val * val; } + if(norm == 0) throw std::runtime_error("norm is zero"); norm = std::sqrt(norm); for (float& val : vec) { val /= norm; @@ -407,7 +415,6 @@ static void pca(callback_data & cb_data) { std::vector eigenvector = power_iteration(cb_data, matrix); cb_data.v_final[il] = (float *) malloc(eigenvector.size() * sizeof(float)); memcpy(cb_data.v_final[il], eigenvector.data(), eigenvector.size() * sizeof(float)); - delete[] matrix; printf("Done with layer %d\n", il); printf("il = %d | %f %f \n", il, cb_data.v_final[il][0], 
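One detail worth flagging in the eps version of is_row_all_zeros above: it only tests whether entries exceed +eps, so a row of large negative values would still be classified as all zero. If magnitude is the intent, comparing the absolute value is the safer test; a suggested variant (an assumption about the intent, not what the patch currently does):

    #include <cmath>
    #include <cstddef>

    // Sketch: treat a row as zero only if every entry is near zero in magnitude.
    static bool is_row_near_zero(const float * diff, size_t row, size_t cols, float eps = 1e-6f) {
        for (size_t i = 0; i < cols; ++i) {
            if (std::fabs(diff[row * cols + i]) > eps) return false;
        }
        return true;
    }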
cb_data.v_final[il][1]); } @@ -427,39 +434,39 @@ static std::string to_string(const T & val) { return ss.str(); } -static void export_gguf(std::vector v_final, int n_embd, const std::string fname, const std::string model_hint) { +static void export_gguf(callback_data & cb_data, int n_layers, const std::string fname, const std::string model_hint) { struct gguf_context * ctx = gguf_init_empty(); + //int test = cb_data.v_final.size(); + int test = n_layers - 1; + // replaced cb_data.v_final.size() with n_layers - 1 + const std::string arch = "controlvector"; gguf_set_val_str(ctx, "general.architecture", arch.c_str()); gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str()); - gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_final.size()); + gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), test); struct ggml_init_params params = { - /*.mem_size =*/ (ggml_tensor_overhead() * v_final.size()) - + (n_embd * v_final.size() * sizeof(float)), + /*.mem_size =*/ (ggml_tensor_overhead() * test) + + (cb_data.n_embd * test * sizeof(float)), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, }; struct ggml_context * ctx_data = ggml_init(params); - for (size_t i = 0; i < v_final.size(); ++i) { + for (size_t i = 0; i < test; ++i) { // TODO this number is probably not right - figure out which layer is which // the python implementation uses a dict to handle this, we don't know if it's 1, 2, 3, 4... or other const std::string name = "direction." + to_string(i+1); - struct ggml_tensor * cur = ggml_new_tensor_1d(ctx_data, GGML_TYPE_F32, n_embd); + struct ggml_tensor * cur = ggml_new_tensor_1d(ctx_data, GGML_TYPE_F32, cb_data.n_embd); ggml_set_name(cur, name.c_str()); - // TODO figure out how to set data - it's whining about buf != NULL when using the below commented line - //ggml_backend_tensor_set(cur, cb_data.v_final[i], 0, cb_data.n_embd * sizeof(float)); - { - float * data = (float *) cur->data; - for(int j = 0; j < ggml_nelements(cur); j++) { - data[j] = v_final[i][j]; - } + float * data = (float *) cur->data; + for(int j = 0; j < cb_data.n_embd; j++) { + data[j] = cb_data.v_final[i][j]; } gguf_add_tensor(ctx, cur); @@ -480,10 +487,8 @@ static void export_gguf(std::vector v_final, int n_embd, const std::str int main(int argc, char ** argv) { ctrl_params cparams; - int skipme = ctrlvec_params_parse(argc, argv, cparams); - //populate_entries(cparams); - // FIXME hack to skip the ctrlvec args in parsing gpt params + int skipme = ctrlvec_params_parse(argc, argv, cparams); argc -= skipme; argv += skipme; @@ -500,6 +505,14 @@ int main(int argc, char ** argv) { return 1; } + callback_data cb_data; + + // pass the callback to the backend scheduler + // it will be executed for each node during the graph computation + params.cb_eval = cb_eval; + params.cb_eval_user_data = &cb_data; + params.warmup = false; + print_build_info(); llama_backend_init(); llama_numa_init(params.numa); @@ -508,9 +521,11 @@ int main(int argc, char ** argv) { llama_model * model; llama_context * ctx; std::tie(model, ctx) = llama_init_from_gpt_params(params); + int n_layers = llama_n_layer(model); int n_embd = llama_n_embd(model); int n_prompts = cparams.positive_prompts.size(); + // vector of finished vectors of size [n_embd], we have (n_layers - 1) vectors in total std::vector v_final(n_layers - 1, NULL); for (size_t i = 0; i < v_final.size(); ++i) { @@ -520,18 +535,14 @@ int main(int argc, char ** argv) { // create templated prompts for (size_t i = 0; i < n_prompts; ++i) { populate_entries(cparams, 
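For orientation, the whole GGUF export flow can be sketched end to end. This is a deliberately simplified single-tensor version of the same sequence of gguf calls (hypothetical helper name and one layer only), not a replacement for export_gguf:

    #include "ggml.h"
    #include <cstring>

    // Sketch: write one n_embd-long F32 "direction" tensor to a GGUF file.
    static void write_one_direction(const float * v, int n_embd, const char * fname) {
        struct gguf_context * ctx = gguf_init_empty();
        gguf_set_val_str(ctx, "general.architecture", "controlvector");
        gguf_set_val_i32(ctx, "controlvector.layer_count", 1);

        struct ggml_init_params params = {
            /*.mem_size   =*/ ggml_tensor_overhead() + n_embd * sizeof(float),
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx_data = ggml_init(params);

        struct ggml_tensor * cur = ggml_new_tensor_1d(ctx_data, GGML_TYPE_F32, n_embd);
        ggml_set_name(cur, "direction.1");
        memcpy(cur->data, v, n_embd * sizeof(float));

        gguf_add_tensor(ctx, cur);
        gguf_write_to_file(ctx, fname, false); // false = write tensor data, not just metadata

        ggml_free(ctx_data);
        gguf_free(ctx);
    }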
cparams.positive_prompts[i], cparams.negative_prompts[i]); - } - - callback_data cb_data; - - // pass the callback to the backend scheduler - // it will be executed for each node during the graph computation - params.cb_eval = cb_eval; - params.cb_eval_user_data = &cb_data; - params.warmup = false; + } const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); + int token_ct = 0; + int n_ctx = llama_n_ctx(ctx); + + // TODO multithread this for(size_t i = 0; i < cparams.positive_entries.size(); ++i) { std::string positive_prompt = cparams.positive_entries[i]; std::string negative_prompt = cparams.negative_entries[i]; @@ -543,17 +554,29 @@ int main(int argc, char ** argv) { cb_data.n_tokens = max_seq_len; cb_data.n_embd = n_embd; + // need to reload the model so it doesn't run out of context + // this should scale with -c option passed by main + // TODO maybe we want to add an option to reload for every new prompt + token_ct += 2 * max_seq_len; + if (token_ct >= n_ctx) { + //break; + llama_free(ctx); + llama_free_model(model); + std::tie(model, ctx) = llama_init_from_gpt_params(params); + token_ct = 2 * max_seq_len; + } + printf("Evaluating prompt: \"%s\" - \"%s\" (%ld tokens)\n", positive_prompt.c_str(), negative_prompt.c_str(), max_seq_len); cb_data.is_eval_pos = true; get_hidden_layers(ctx, tokens_pos); cb_data.is_eval_pos = false; get_hidden_layers(ctx, tokens_neg); - // FIXME because you don't reload the model you actually run out of context lmao - // fix that... or do we want to reload for every new prompt? but that would take forever - // perhaps add that as a flag to the program - // TODO actually check that this works + // TODO check whether the same tokens correspond to zero rows because we don't seem to be getting many zero rows anymore + // we get a lot of zero rows for the first few prompts and then they drop off + // likewise most of the zero rows are in the first few layers for each prompt + calc_diff(cb_data); // reset for next iteration @@ -561,39 +584,18 @@ int main(int argc, char ** argv) { cb_data.v_neg.clear(); } - // TODO actually check that this works concatenate_diffs(cb_data); - - // TODO diffs should be the same from here but still check that this works pca(cb_data); - - // TODO get rid of this - // add the output vector to v_final - for (size_t j = 0; j < cb_data.v_final.size(); ++j) { - for (size_t k = 0; k < n_embd; ++k) { - v_final[j][k] += cb_data.v_final[j][k]; - } - } printf("v_final %f %f \n", cb_data.v_final[0][0], cb_data.v_final[0][1]); llama_free(ctx); llama_free_model(model); - - - // calculate the mean value of v_final - // TODO: maybe using LERP here - for (size_t j = 0; j < v_final.size(); ++j) { - for (size_t k = 0; k < n_embd; ++k) { - v_final[j][k] /= n_prompts; - } - } - // TODO figure out how to extract this from model - there's no API exposed to get model arch string // we need get_arch_name() from llama.cpp // TODO also has support been implemeneted for arches other than llama yet? see #5970 std::string model_hint = "llama"; - export_gguf(v_final, n_embd, cparams.outfile, model_hint); + export_gguf(cb_data, n_layers, cparams.outfile, model_hint); llama_backend_free(); diff --git a/examples/control-vector-generator/negative.txt b/examples/control-vector-generator/negative.txt index 4f84a22ba9f9a..2ac3387f184b0 100644 --- a/examples/control-vector-generator/negative.txt +++ b/examples/control-vector-generator/negative.txt @@ -1 +1 @@ -sad \ No newline at end of file +[INST] Act like a person who is extremely sad. 
[/INST] \ No newline at end of file diff --git a/examples/control-vector-generator/positive.txt b/examples/control-vector-generator/positive.txt index adaa78b2175c6..f28e9aa1aeb72 100644 --- a/examples/control-vector-generator/positive.txt +++ b/examples/control-vector-generator/positive.txt @@ -1 +1 @@ -happy \ No newline at end of file +[INST] Act like a person who is extremely happy. [/INST] \ No newline at end of file From 4d7d71bc43efec6362a1f1d440786cf9b805f046 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Fri, 31 May 2024 21:08:25 -0400 Subject: [PATCH 13/56] fix square_diff matmul index range and CRLF->LF line endings fixed a logic error where square_diff would not multiply all rows fixed a formatting error where the provided completions.txt had CRLF line endings --- .../control-vector-generator.cpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index 8d4983c88a66c..2541fcb2709ad 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -10,6 +10,8 @@ #include #include +// TODO read everything over and make sure it makes sense because you're dropping logic errors left and right + struct diff_wrapper { float * diff; // matrix of size [n_rows, cb_data.n_embd] with zero rows stripped size_t n_rows; // number of rows in the matrix for size calculation @@ -23,14 +25,14 @@ struct callback_data { // each element of the vector correspond to one layer std::vector v_pos; // vector of matrices of size [n_embd, n_tokens] std::vector v_neg; // vector of matrices of size [n_embd, n_tokens] - std::vector v_diff; // vector of matrices of size [n_embd, n_tokens] std::vector v_final; // vector of finished vectors of size [n_embd] + std::vector v_diff; // vector of matrices of size [n_embd, m] where m is some some sum of concatenated matrices // each element of the outer vector correspond to one layer, each element of the inner vector correspond to one prompt pass std::vector> v_diffs_wrapped; // vector of compiled diff matrices to be concatenated ~callback_data() { for (auto ptr : v_pos) free(ptr); for (auto ptr : v_neg) free(ptr); - for (auto ptr : v_diff) free(ptr); + for (auto ptr : v_diff) free(ptr.diff); for (auto ptr : v_final) free(ptr); for (auto & vec : v_diffs_wrapped) for (auto ptr : vec) free(ptr.diff); } @@ -321,7 +323,10 @@ static void concatenate_diffs(callback_data & cb_data) { memcpy(diff + offset, origin, vec[j].n_rows * cb_data.n_embd * sizeof(float)); offset += vec[j].n_rows * cb_data.n_embd; } - cb_data.v_diff.push_back(diff); + struct diff_wrapper dw; + dw.n_rows = n_rows_total; + dw.diff = diff; + cb_data.v_diff.push_back(dw); } } @@ -335,8 +340,8 @@ static float* square_diff(callback_data & cb_data, size_t idx) { for (size_t i = 0; i < cb_data.n_embd; i++) { for (size_t j = 0; j < cb_data.n_embd; j++) { float sum = 0.0f; - for (size_t k = 0; k < cb_data.n_tokens; k++) { - sum += cb_data.v_diff[idx][i + cb_data.n_embd * k] * cb_data.v_diff[idx][j + cb_data.n_embd * k]; + for (size_t k = 0; k < cb_data.v_diff[idx].n_rows; k++) { + sum += cb_data.v_diff[idx].diff[i + cb_data.n_embd * k] * cb_data.v_diff[idx].diff[j + cb_data.n_embd * k]; } result[i * cb_data.n_embd + j] = sum; } From 62560367aa1ecf1d75df3baffee6e8dbff62fd7c Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Fri, 31 May 2024 21:27:14 -0400 
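After the two index fixes, square_diff computes A^T A for the row-major [n_rows, n_embd] diff matrix, the unnormalized covariance whose dominant eigenvector the PCA step extracts. The same product written standalone (hypothetical helper):

    #include <vector>

    // Sketch: result[i*n_cols + j] = sum_k A[k][i] * A[k][j], i.e. A^T A for a
    // row-major [n_rows, n_cols] matrix A; the result is [n_cols, n_cols].
    static std::vector<float> mat_t_mat(const float * a, size_t n_rows, size_t n_cols) {
        std::vector<float> result(n_cols * n_cols, 0.0f);
        for (size_t i = 0; i < n_cols; ++i) {
            for (size_t j = 0; j < n_cols; ++j) {
                float sum = 0.0f;
                for (size_t k = 0; k < n_rows; ++k) {
                    sum += a[k * n_cols + i] * a[k * n_cols + j];
                }
                result[i * n_cols + j] = sum;
            }
        }
        return result;
    }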
Subject: [PATCH 14/56] add command-line args for num threads, num completions file lines, always reload model refactored a few things and did what the commit message says on the tin --- .../control-vector-generator.cpp | 73 ++++++++++++++++--- 1 file changed, 62 insertions(+), 11 deletions(-) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index 2541fcb2709ad..33da54ec781c0 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -39,13 +39,21 @@ struct callback_data { }; struct ctrl_params { + /* default meta parameters */ + bool always_reload = false; + int n_completions = 64; + int n_threads = 8; + + /* default filepaths */ std::string outfile = "control_vector.gguf"; std::string completions_file = "examples/control-vector-generator/completions.txt"; - /* pair of prompts to be used for generating the vectors */ std::string positive_prompts_file = "examples/control-vector-generator/positive.txt"; std::string negative_prompts_file = "examples/control-vector-generator/negative.txt"; + + /* pair of prompts to be used for generating the vectors */ std::vector positive_prompts; std::vector negative_prompts; + /* pair of prompts to be used for testing */ std::vector positive_entries; std::vector negative_entries; @@ -59,10 +67,19 @@ static void print_usage(const char * executable) { printf("\n"); printf("options:\n"); printf(" -h, --help show this help message and exit\n"); - printf(" -o, --outfile output file (default: 'control_vector.gguf')\n"); - printf(" -cf, --completions-file completions file (default: 'examples/control-vector-generator/completions.txt')\n"); - printf(" -pf, --positive-file positive prompts file, one prompt per line (default: 'examples/control-vector-generator/positive.txt')\n"); - printf(" -nf, --negative-file negative prompts file, one prompt per line (default: 'examples/control-vector-generator/negative.txt')\n"); + printf(" -t, --num-threads number of threads to use (do not confuse with gpt-opts -t)\n"); + printf(" default: 8\n"); + printf(" -o, --outfile output file\n"); + printf(" default: 'control_vector.gguf'\n"); + printf(" -pf, --positive-file positive prompts file, one prompt per line\n"); + printf(" default: 'examples/control-vector-generator/positive.txt'\n"); + printf(" -nf, --negative-file negative prompts file, one prompt per line\n"); + printf(" default: 'examples/control-vector-generator/negative.txt'\n"); + printf(" -cf, --completions-file completions file\n"); + printf(" default: 'examples/control-vector-generator/completions.txt'\n"); + printf(" -nc, --num-completions number of lines of completions file to use\n"); + printf(" default: 64\n"); + printf(" --always-reload reload the model for every new template to parse\n"); printf("\n"); printf("gpt-opts:\n"); printf(" other options from main\n"); @@ -122,6 +139,36 @@ static int ctrlvec_params_parse_ex(int argc, char ** argv, ctrl_params & params) throw std::invalid_argument("error: missing argument for " + arg); } } + if (arg == "--num-completions" || arg == "-nc") { + if (++arg_idx < argc && strncmp(argv[arg_idx], arg_prefix.c_str(), 2) != 0) { + try { + params.n_completions = std::stoi(argv[arg_idx]); + } + catch (const std::invalid_argument & ex) { + throw std::invalid_argument("error: invalid argument for " + arg); + } + skipme += 2; + } else { + throw std::invalid_argument("error: missing argument for " + arg); + } 
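The std::stoi try/catch pattern used for the new numeric flags makes them fail loudly instead of silently parsing to garbage. The same pattern as a compact standalone helper (hypothetical; it omits the example's extra check that the next token is not another flag):

    #include <stdexcept>
    #include <string>

    // Sketch: parse a required integer argument for a flag, turning missing,
    // non-numeric, and out-of-range values into a single descriptive error.
    static int parse_int_arg(int argc, char ** argv, int & idx, const std::string & flag) {
        if (++idx >= argc) {
            throw std::invalid_argument("error: missing argument for " + flag);
        }
        try {
            return std::stoi(argv[idx]);
        } catch (const std::invalid_argument &) {
            throw std::invalid_argument("error: invalid argument for " + flag);
        } catch (const std::out_of_range &) {
            throw std::invalid_argument("error: argument out of range for " + flag);
        }
    }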
+ } + if (arg == "--num-threads" || arg == "-t") { + if (++arg_idx < argc && strncmp(argv[arg_idx], arg_prefix.c_str(), 2) != 0) { + try { + params.n_threads = std::stoi(argv[arg_idx]); + } + catch (const std::invalid_argument & ex) { + throw std::invalid_argument("error: invalid argument for " + arg); + } + skipme += 2; + } else { + throw std::invalid_argument("error: missing argument for " + arg); + } + } + if (arg == "--always-reload") { + params.always_reload = true; + skipme += 1; + } // TODO it might be nice QoL to have single positive/negative args // we do not handle any other unknown arguments here because they will be handled by gpt_parse_params } @@ -168,11 +215,13 @@ static std::string format_template(std::string persona, std::string suffix) { static void populate_entries(ctrl_params & cparams, std::string positive, std::string negative) { std::string line; std::ifstream completions_file(cparams.completions_file); + int i = 0; if (completions_file.is_open()) { - while (std::getline(completions_file, line)) { + while (std::getline(completions_file, line) && i < cparams.n_completions) { // TODO replicate the truncations done by the python implementation cparams.positive_entries.push_back(format_template(positive, line)); cparams.negative_entries.push_back(format_template(negative, line)); + i++; } completions_file.close(); } else { @@ -409,8 +458,7 @@ static std::vector power_iteration(callback_data & cb_data, const float * } // TODO translate to ggml -static void pca(callback_data & cb_data) { - size_t n_threads = 8; +static void pca(callback_data & cb_data, size_t n_threads) { int n_layers = cb_data.v_diff.size(); std::vector threads; cb_data.v_final.reserve(n_layers); @@ -561,15 +609,18 @@ int main(int argc, char ** argv) { // need to reload the model so it doesn't run out of context // this should scale with -c option passed by main - // TODO maybe we want to add an option to reload for every new prompt token_ct += 2 * max_seq_len; - if (token_ct >= n_ctx) { + if (token_ct > n_ctx || cparams.always_reload) { //break; llama_free(ctx); llama_free_model(model); std::tie(model, ctx) = llama_init_from_gpt_params(params); token_ct = 2 * max_seq_len; } + if (token_ct > n_ctx) { + fprintf(stderr, "context size exceeded on iteration %d\n", i); + break; + } printf("Evaluating prompt: \"%s\" - \"%s\" (%ld tokens)\n", positive_prompt.c_str(), negative_prompt.c_str(), max_seq_len); @@ -590,7 +641,7 @@ int main(int argc, char ** argv) { } concatenate_diffs(cb_data); - pca(cb_data); + pca(cb_data, cparams.n_threads); printf("v_final %f %f \n", cb_data.v_final[0][0], cb_data.v_final[0][1]); llama_free(ctx); From db3ba108e79f819ec8a1a9c01d14faa9d4951490 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Fri, 31 May 2024 21:38:02 -0400 Subject: [PATCH 15/56] code aestheticization --- .../control-vector-generator.cpp | 35 +++++++++++-------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index 33da54ec781c0..70f3668e938a0 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -19,16 +19,20 @@ struct diff_wrapper { struct callback_data { std::vector data; + int n_tokens = 0; int n_embd = 0; bool is_eval_pos = true; + // each element of the vector correspond to one layer - std::vector v_pos; // vector of matrices of size [n_embd, n_tokens] - std::vector 
v_neg; // vector of matrices of size [n_embd, n_tokens] - std::vector v_final; // vector of finished vectors of size [n_embd] - std::vector v_diff; // vector of matrices of size [n_embd, m] where m is some some sum of concatenated matrices + std::vector v_pos; // vector of matrices of size [n_embd, n_tokens] + std::vector v_neg; // vector of matrices of size [n_embd, n_tokens] + std::vector v_final; // vector of finished vectors of size [n_embd] + std::vector v_diff; // vector of matrices of size [n_embd, m] where m ~ n_tokens * n_completions + // each element of the outer vector correspond to one layer, each element of the inner vector correspond to one prompt pass std::vector> v_diffs_wrapped; // vector of compiled diff matrices to be concatenated + ~callback_data() { for (auto ptr : v_pos) free(ptr); for (auto ptr : v_neg) free(ptr); @@ -66,10 +70,8 @@ static void print_usage(const char * executable) { printf("Creates a GGUF control vector for a given model."); printf("\n"); printf("options:\n"); - printf(" -h, --help show this help message and exit\n"); - printf(" -t, --num-threads number of threads to use (do not confuse with gpt-opts -t)\n"); - printf(" default: 8\n"); - printf(" -o, --outfile output file\n"); + printf(" -h, --help show this help message and exit\n"); + printf(" -o, --outfile output file\n"); printf(" default: 'control_vector.gguf'\n"); printf(" -pf, --positive-file positive prompts file, one prompt per line\n"); printf(" default: 'examples/control-vector-generator/positive.txt'\n"); @@ -77,9 +79,11 @@ static void print_usage(const char * executable) { printf(" default: 'examples/control-vector-generator/negative.txt'\n"); printf(" -cf, --completions-file completions file\n"); printf(" default: 'examples/control-vector-generator/completions.txt'\n"); - printf(" -nc, --num-completions number of lines of completions file to use\n"); + printf(" -nc, --num-completions N number of lines of completions file to use\n"); printf(" default: 64\n"); - printf(" --always-reload reload the model for every new template to parse\n"); + printf(" -t, --num-threads N number of threads to use (do not confuse with gpt-opts -t)\n"); + printf(" default: 8\n"); + printf(" --always-reload reload the model for every new template to parse\n"); printf("\n"); printf("gpt-opts:\n"); printf(" other options from main\n"); @@ -88,7 +92,7 @@ static void print_usage(const char * executable) { static int ctrlvec_params_parse_ex(int argc, char ** argv, ctrl_params & params) { std::string arg; - const std::string arg_prefix = "--"; + const std::string arg_prefix = "-"; // hack to skip ctrlvec args in gpt_parse_params but we'll leave it as is int skipme = 0; @@ -357,6 +361,7 @@ static void calc_diff(callback_data & cb_data) { } } +// TODO do we want to multithread this? it takes very little time as it is static void concatenate_diffs(callback_data & cb_data) { for (size_t i = 0; i < cb_data.v_diffs_wrapped.size(); ++i) { std::vector & vec = cb_data.v_diffs_wrapped[i]; @@ -398,7 +403,7 @@ static float* square_diff(callback_data & cb_data, size_t idx) { return result; } -// TODO translate to ggml +// TODO translate to ggml (this is a built-in function in ggml) static void normalize_inplace(std::vector & vec) { // inefficient(?) 
norm computation float norm = 0.0f; @@ -412,7 +417,7 @@ static void normalize_inplace(std::vector & vec) { } } -// TODO translate to ggml +// TODO translate to ggml (this is a built-in function in ggml) static std::vector mul_mat(const float * mat, const std::vector & vec, size_t dim) { std::vector result(dim, 0.0f); for (size_t i = 0; i < dim; ++i) { @@ -459,7 +464,7 @@ static std::vector power_iteration(callback_data & cb_data, const float * // TODO translate to ggml static void pca(callback_data & cb_data, size_t n_threads) { - int n_layers = cb_data.v_diff.size(); + int n_layers = cb_data.v_diff.size(); std::vector threads; cb_data.v_final.reserve(n_layers); auto worker_function = [&](int worker_id) { @@ -577,6 +582,7 @@ int main(int argc, char ** argv) { int n_layers = llama_n_layer(model); int n_embd = llama_n_embd(model); + cb_data.n_embd = n_embd; int n_prompts = cparams.positive_prompts.size(); // vector of finished vectors of size [n_embd], we have (n_layers - 1) vectors in total @@ -605,7 +611,6 @@ int main(int argc, char ** argv) { padding_seq(ctx, tokens_pos, max_seq_len); padding_seq(ctx, tokens_neg, max_seq_len); cb_data.n_tokens = max_seq_len; - cb_data.n_embd = n_embd; // need to reload the model so it doesn't run out of context // this should scale with -c option passed by main From 86842b20e545b38c45846b1c21ccb7ac3d92ea0a Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Fri, 31 May 2024 22:25:46 -0400 Subject: [PATCH 16/56] fix compiler warnings --- .../control-vector-generator.cpp | 28 +++++++++---------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index 70f3668e938a0..1e6d6b5e09b4f 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -333,7 +333,7 @@ static void calc_diff(callback_data & cb_data) { // strip zero rows std::vector nonzero_rows; - for (size_t i = 0; i < cb_data.n_tokens; ++i) { + for (int i = 0; i < cb_data.n_tokens; ++i) { if (!is_row_all_zeros(dest, i, cb_data.n_embd)) { nonzero_rows.push_back(i); } @@ -357,7 +357,7 @@ static void calc_diff(callback_data & cb_data) { } cb_data.v_diffs_wrapped[il].push_back(dw); - delete dest; + free(dest); } } @@ -391,8 +391,8 @@ static void concatenate_diffs(callback_data & cb_data) { static float* square_diff(callback_data & cb_data, size_t idx) { float* result = new float[cb_data.n_embd * cb_data.n_embd]; std::memset(result, 0, cb_data.n_embd * cb_data.n_embd * sizeof(float)); - for (size_t i = 0; i < cb_data.n_embd; i++) { - for (size_t j = 0; j < cb_data.n_embd; j++) { + for (size_t i = 0; i < (size_t) cb_data.n_embd; i++) { + for (size_t j = 0; j < (size_t) cb_data.n_embd; j++) { float sum = 0.0f; for (size_t k = 0; k < cb_data.v_diff[idx].n_rows; k++) { sum += cb_data.v_diff[idx].diff[i + cb_data.n_embd * k] * cb_data.v_diff[idx].diff[j + cb_data.n_embd * k]; @@ -463,7 +463,7 @@ static std::vector power_iteration(callback_data & cb_data, const float * } // TODO translate to ggml -static void pca(callback_data & cb_data, size_t n_threads) { +static void pca(callback_data & cb_data, int n_threads) { int n_layers = cb_data.v_diff.size(); std::vector threads; cb_data.v_final.reserve(n_layers); @@ -495,25 +495,23 @@ static std::string to_string(const T & val) { static void export_gguf(callback_data & cb_data, int n_layers, const std::string fname, const std::string 
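power_iteration itself is the textbook loop: repeatedly multiply the current vector by the square matrix and renormalize until the vector stops moving, which converges to the dominant eigenvector. A self-contained sketch with hypothetical iteration and tolerance knobs:

    #include <cmath>
    #include <vector>

    // Sketch: dominant eigenvector of a symmetric [dim, dim] matrix by power
    // iteration: v <- normalize(M v), stopping when v barely changes.
    static std::vector<float> dominant_eigenvector(const float * m, size_t dim,
                                                   int max_iters = 1000, float tol = 1e-7f) {
        std::vector<float> v(dim, 1.0f / std::sqrt((float) dim)); // arbitrary unit start
        std::vector<float> mv(dim);
        for (int iter = 0; iter < max_iters; ++iter) {
            // mv = M * v
            for (size_t i = 0; i < dim; ++i) {
                mv[i] = 0.0f;
                for (size_t j = 0; j < dim; ++j) mv[i] += m[i * dim + j] * v[j];
            }
            // normalize mv (degenerate M v == 0 would divide by zero; the
            // patch's normalize_inplace throws in that case)
            float norm = 0.0f;
            for (float x : mv) norm += x * x;
            norm = std::sqrt(norm);
            if (norm == 0.0f) break;
            for (float & x : mv) x /= norm;
            // measure movement and advance
            float diff = 0.0f;
            for (size_t i = 0; i < dim; ++i) diff += (mv[i] - v[i]) * (mv[i] - v[i]);
            v = mv;
            if (diff < tol) break;
        }
        return v;
    }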
model_hint) { struct gguf_context * ctx = gguf_init_empty(); - //int test = cb_data.v_final.size(); - int test = n_layers - 1; - // replaced cb_data.v_final.size() with n_layers - 1 + size_t v_final_size_eff = n_layers - 1; const std::string arch = "controlvector"; gguf_set_val_str(ctx, "general.architecture", arch.c_str()); gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str()); - gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), test); + gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_final_size_eff); struct ggml_init_params params = { - /*.mem_size =*/ (ggml_tensor_overhead() * test) - + (cb_data.n_embd * test * sizeof(float)), + /*.mem_size =*/ (ggml_tensor_overhead() * v_final_size_eff) + + (cb_data.n_embd * v_final_size_eff * sizeof(float)), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, }; struct ggml_context * ctx_data = ggml_init(params); - for (size_t i = 0; i < test; ++i) { + for (size_t i = 0; i < v_final_size_eff; ++i) { // TODO this number is probably not right - figure out which layer is which // the python implementation uses a dict to handle this, we don't know if it's 1, 2, 3, 4... or other const std::string name = "direction." + to_string(i+1); @@ -528,7 +526,7 @@ static void export_gguf(callback_data & cb_data, int n_layers, const std::string } gguf_add_tensor(ctx, cur); - printf("Added tensor %d\n", i); + printf("Added tensor %zu\n", i); } printf("Writing file...\n"); @@ -592,7 +590,7 @@ int main(int argc, char ** argv) { } // create templated prompts - for (size_t i = 0; i < n_prompts; ++i) { + for (int i = 0; i < n_prompts; ++i) { populate_entries(cparams, cparams.positive_prompts[i], cparams.negative_prompts[i]); } @@ -623,7 +621,7 @@ int main(int argc, char ** argv) { token_ct = 2 * max_seq_len; } if (token_ct > n_ctx) { - fprintf(stderr, "context size exceeded on iteration %d\n", i); + fprintf(stderr, "context size exceeded on iteration %zu\n", i); break; } From 544268888bc2136e3e7a165d6ef6fea5bb857774 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Sat, 1 Jun 2024 17:25:21 -0400 Subject: [PATCH 17/56] in-series multithreading for prompt embedding? 
added commented-out code to attempt to start implementing mutlithreading for embedding in main --- .../control-vector-generator.cpp | 113 +++++++++++++++++- 1 file changed, 107 insertions(+), 6 deletions(-) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index 1e6d6b5e09b4f..1f55ba5fa0d26 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -17,6 +17,15 @@ struct diff_wrapper { size_t n_rows; // number of rows in the matrix for size calculation }; +/* TODO part of multithreading +struct tokens_pair { + size_t max_seq_len; + std::string positive; + std::string negative; + std::vector tokens_pos; + std::vector tokens_neg; +}; */ + struct callback_data { std::vector data; @@ -45,6 +54,8 @@ struct callback_data { struct ctrl_params { /* default meta parameters */ bool always_reload = false; + // TODO part of multithreading + // bool max_batch = false; int n_completions = 64; int n_threads = 8; @@ -84,6 +95,8 @@ static void print_usage(const char * executable) { printf(" -t, --num-threads N number of threads to use (do not confuse with gpt-opts -t)\n"); printf(" default: 8\n"); printf(" --always-reload reload the model for every new template to parse\n"); + // TODO part of multithreading + //printf(" --max-batch maximize batch sizes, rather than optimizing for multithreading\n"); printf("\n"); printf("gpt-opts:\n"); printf(" other options from main\n"); @@ -173,6 +186,11 @@ static int ctrlvec_params_parse_ex(int argc, char ** argv, ctrl_params & params) params.always_reload = true; skipme += 1; } + /* TODO part of multithreading + if (arg == "--max-batch") { + params.max_batch = true; + skipme += 1; + } */ // TODO it might be nice QoL to have single positive/negative args // we do not handle any other unknown arguments here because they will be handled by gpt_parse_params } @@ -209,10 +227,10 @@ static std::vector ctrlvec_load_prompt_file(std::string path) { } static std::string format_template(std::string persona, std::string suffix) { - const std::string user_tag = "[INST]"; - const std::string asst_tag = "[/INST]"; - // TODO make this dynamic - allow the user to change it somehow - and adapt based on model + //const std::string user_tag = "[INST]"; + //const std::string asst_tag = "[/INST]"; //return user_tag + " Act as if you're extremely " + persona + ". " + asst_tag + " " + suffix; + // TODO make this dynamic - allow the user to change it somehow - and adapt based on model return persona + " " + suffix; // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. 
[/INST]" } @@ -233,6 +251,61 @@ static void populate_entries(ctrl_params & cparams, std::string positive, std::s } } +/* TODO part of multithreading +static size_t tokenize_pair(tokens_pair & tp, llama_context * ctx, const std::string & pos, const std::string & neg, const bool add_bos) { + tp.positive = pos; + tp.negative = neg; + tp.tokens_pos = ::llama_tokenize(ctx, pos, add_bos); + tp.tokens_neg = ::llama_tokenize(ctx, neg, add_bos); + tp.max_seq_len = std::max(tp.tokens_pos.size(), tp.tokens_neg.size()); + padding_seq(ctx, tp.tokens_pos, tp.max_seq_len); + padding_seq(ctx, tp.tokens_neg, tp.max_seq_len); + return 2 * max_seq_len; +} + +// current batching strategy works as follows: +// each batch runs on one model load, since we reload the model after every batch to clear context +// therefore each batch must be small enough to fit in the context size +// we try to make the batches multiples of thread count so threads are used most efficiently +static std::vector> batch_prompts(llama_context * ctx, ctrl_params & cparams, int n_ctx, const bool add_bos) { + std::vector> batched_prompts; + std::vector thread_batch; + std::vector batch; + size_t n_batch_tokens = 0; + + for (size_t i = 0; i < cparams.positive_entries.size(); ++i) { + tokens_pair tp; + size_t n_tokens = tokenize_pair(tp, ctx, cparams.positive_entries[i], cparams.negative_entries[i], add_bos); + n_batch_tokens += n_tokens; + + if (n_batch_tokens > n_ctx) { + if (cparams.max_batch) { + batch.insert(batch.end(), thread_batch.begin(), thread_batch.end()); + thread_batch.clear(); + } + batched_prompts.push_back(batch); + batch.clear(); + n_batch_tokens = n_tokens; + } + + thread_batch.push_back(tp); + + if (thread_batch.size() >= cparams.n_threads) { + batch.insert(batch.end(), thread_batch.begin(), thread_batch.end()); + thread_batch.clear();; + } + } + + if (!thread_batch.empty()) { + batch.insert(batch.end(), thread_batch.begin(), thread_batch.end()); + } + if (!batch.empty()) { + batched_prompts.push_back(batch); + } + + return batched_prompts; +} */ + static std::string ggml_ne_string(const ggml_tensor * t) { std::string str; for (int i = 0; i < GGML_MAX_DIMS; ++i) { @@ -387,13 +460,14 @@ static void concatenate_diffs(callback_data & cb_data) { // BEGIN NON-GGML IMPLEMENTATION // TODO translate to ggml -// this probably doesn't want to be here - put it into the compute graph as a step in processing each layer +// this probably doesn't want to be a separate function - put it into the compute graph as a step in processing each layer static float* square_diff(callback_data & cb_data, size_t idx) { float* result = new float[cb_data.n_embd * cb_data.n_embd]; std::memset(result, 0, cb_data.n_embd * cb_data.n_embd * sizeof(float)); for (size_t i = 0; i < (size_t) cb_data.n_embd; i++) { for (size_t j = 0; j < (size_t) cb_data.n_embd; j++) { float sum = 0.0f; + // watch out for indexing - can't just use cb_data.n_tokens for (size_t k = 0; k < cb_data.v_diff[idx].n_rows; k++) { sum += cb_data.v_diff[idx].diff[i + cb_data.n_embd * k] * cb_data.v_diff[idx].diff[j + cb_data.n_embd * k]; } @@ -560,6 +634,10 @@ int main(int argc, char ** argv) { fprintf(stderr, "number of positive and negative prompts must be equal"); return 1; } + if (cparams.positive_prompts.empty()) { + fprintf(stderr, "must provide at least one prompt pair"); + return 1; + } callback_data cb_data; @@ -578,6 +656,7 @@ int main(int argc, char ** argv) { llama_context * ctx; std::tie(model, ctx) = llama_init_from_gpt_params(params); + int n_ctx = llama_n_ctx(ctx); int 
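The commented-out batching strategy above groups prompt pairs so that each batch fits in the context window while filling whole thread-groups. Its core bookkeeping, minus the tokenizer plumbing, can be sketched like this (all names hypothetical, mirroring the commented code rather than anything that ships):

    #include <cstddef>
    #include <vector>

    // Sketch: split per-pair token costs into batches that each stay within
    // the n_ctx budget; a new batch starts whenever the budget would overflow.
    // A single oversized pair still gets its own batch.
    static std::vector<std::vector<size_t>> batch_by_context(const std::vector<size_t> & costs, size_t n_ctx) {
        std::vector<std::vector<size_t>> batches;
        std::vector<size_t> batch;
        size_t used = 0;
        for (size_t i = 0; i < costs.size(); ++i) {
            if (used + costs[i] > n_ctx && !batch.empty()) {
                batches.push_back(batch);
                batch.clear();
                used = 0;
            }
            batch.push_back(i); // store the pair's index
            used += costs[i];
        }
        if (!batch.empty()) batches.push_back(batch);
        return batches;
    }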
n_layers = llama_n_layer(model); int n_embd = llama_n_embd(model); cb_data.n_embd = n_embd; @@ -596,10 +675,32 @@ int main(int argc, char ** argv) { const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); + /* TODO part of multithreading + std::vector> & batched_prompts = batch_prompts(ctx, cparams, n_ctx, add_bos); + std::vector threads; + auto worker_function = [&](tokens_pair & tp) { + printf("Evaluating prompt: \"%s\" - \"%s\" (%ld tokens)\n", tp.positive.c_str(), tp.negative.c_str(), tp.max_seq_len); + // TODO so how do we deal with this? + // TODO we only have one cb_data object that everything gets passed to. so we need to be able to write to a different object per thread + // TODO but there's only one cb_eval function used as callback by the model... help wanted + }; + printf("Batching prompts...\n"); + for (int i = 0; i < batched_prompts.size(); ++i) { + for (int j = 0; j < batched_prompts[i].size(); ++j) { + threads.emplace_back(worker_function, batched_prompts[i][j]); + } + for (auto & th : threads) th.join(); + + // reload model for next batch + llama_free(ctx); + llama_free_model(model); + std::tie(model, ctx) = llama_init_from_gpt_params(params); + } + printf("Done with batching prompts.\n"); + */ + int token_ct = 0; - int n_ctx = llama_n_ctx(ctx); - // TODO multithread this for(size_t i = 0; i < cparams.positive_entries.size(); ++i) { std::string positive_prompt = cparams.positive_entries[i]; std::string negative_prompt = cparams.negative_entries[i]; From 3090c485b63ec80d5158862846a082b94795a974 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Sat, 1 Jun 2024 18:32:14 -0400 Subject: [PATCH 18/56] remove unnecessary multithreading --- .../control-vector-generator.cpp | 97 ------------------- 1 file changed, 97 deletions(-) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index 1f55ba5fa0d26..3cd42fa82db9f 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -17,15 +17,6 @@ struct diff_wrapper { size_t n_rows; // number of rows in the matrix for size calculation }; -/* TODO part of multithreading -struct tokens_pair { - size_t max_seq_len; - std::string positive; - std::string negative; - std::vector tokens_pos; - std::vector tokens_neg; -}; */ - struct callback_data { std::vector data; @@ -54,8 +45,6 @@ struct callback_data { struct ctrl_params { /* default meta parameters */ bool always_reload = false; - // TODO part of multithreading - // bool max_batch = false; int n_completions = 64; int n_threads = 8; @@ -95,8 +84,6 @@ static void print_usage(const char * executable) { printf(" -t, --num-threads N number of threads to use (do not confuse with gpt-opts -t)\n"); printf(" default: 8\n"); printf(" --always-reload reload the model for every new template to parse\n"); - // TODO part of multithreading - //printf(" --max-batch maximize batch sizes, rather than optimizing for multithreading\n"); printf("\n"); printf("gpt-opts:\n"); printf(" other options from main\n"); @@ -186,11 +173,6 @@ static int ctrlvec_params_parse_ex(int argc, char ** argv, ctrl_params & params) params.always_reload = true; skipme += 1; } - /* TODO part of multithreading - if (arg == "--max-batch") { - params.max_batch = true; - skipme += 1; - } */ // TODO it might be nice QoL to have single positive/negative args // we do not handle any other unknown arguments here because they will be handled by 
gpt_parse_params } @@ -251,61 +233,6 @@ static void populate_entries(ctrl_params & cparams, std::string positive, std::s } } -/* TODO part of multithreading -static size_t tokenize_pair(tokens_pair & tp, llama_context * ctx, const std::string & pos, const std::string & neg, const bool add_bos) { - tp.positive = pos; - tp.negative = neg; - tp.tokens_pos = ::llama_tokenize(ctx, pos, add_bos); - tp.tokens_neg = ::llama_tokenize(ctx, neg, add_bos); - tp.max_seq_len = std::max(tp.tokens_pos.size(), tp.tokens_neg.size()); - padding_seq(ctx, tp.tokens_pos, tp.max_seq_len); - padding_seq(ctx, tp.tokens_neg, tp.max_seq_len); - return 2 * max_seq_len; -} - -// current batching strategy works as follows: -// each batch runs on one model load, since we reload the model after every batch to clear context -// therefore each batch must be small enough to fit in the context size -// we try to make the batches multiples of thread count so threads are used most efficiently -static std::vector> batch_prompts(llama_context * ctx, ctrl_params & cparams, int n_ctx, const bool add_bos) { - std::vector> batched_prompts; - std::vector thread_batch; - std::vector batch; - size_t n_batch_tokens = 0; - - for (size_t i = 0; i < cparams.positive_entries.size(); ++i) { - tokens_pair tp; - size_t n_tokens = tokenize_pair(tp, ctx, cparams.positive_entries[i], cparams.negative_entries[i], add_bos); - n_batch_tokens += n_tokens; - - if (n_batch_tokens > n_ctx) { - if (cparams.max_batch) { - batch.insert(batch.end(), thread_batch.begin(), thread_batch.end()); - thread_batch.clear(); - } - batched_prompts.push_back(batch); - batch.clear(); - n_batch_tokens = n_tokens; - } - - thread_batch.push_back(tp); - - if (thread_batch.size() >= cparams.n_threads) { - batch.insert(batch.end(), thread_batch.begin(), thread_batch.end()); - thread_batch.clear();; - } - } - - if (!thread_batch.empty()) { - batch.insert(batch.end(), thread_batch.begin(), thread_batch.end()); - } - if (!batch.empty()) { - batched_prompts.push_back(batch); - } - - return batched_prompts; -} */ - static std::string ggml_ne_string(const ggml_tensor * t) { std::string str; for (int i = 0; i < GGML_MAX_DIMS; ++i) { @@ -675,30 +602,6 @@ int main(int argc, char ** argv) { const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); - /* TODO part of multithreading - std::vector> & batched_prompts = batch_prompts(ctx, cparams, n_ctx, add_bos); - std::vector threads; - auto worker_function = [&](tokens_pair & tp) { - printf("Evaluating prompt: \"%s\" - \"%s\" (%ld tokens)\n", tp.positive.c_str(), tp.negative.c_str(), tp.max_seq_len); - // TODO so how do we deal with this? - // TODO we only have one cb_data object that everything gets passed to. so we need to be able to write to a different object per thread - // TODO but there's only one cb_eval function used as callback by the model... 
help wanted - }; - printf("Batching prompts...\n"); - for (int i = 0; i < batched_prompts.size(); ++i) { - for (int j = 0; j < batched_prompts[i].size(); ++j) { - threads.emplace_back(worker_function, batched_prompts[i][j]); - } - for (auto & th : threads) th.join(); - - // reload model for next batch - llama_free(ctx); - llama_free_model(model); - std::tie(model, ctx) = llama_init_from_gpt_params(params); - } - printf("Done with batching prompts.\n"); - */ - int token_ct = 0; for(size_t i = 0; i < cparams.positive_entries.size(); ++i) { From df623fffe808b98e59443bf2feed6d12695c670b Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Sat, 1 Jun 2024 18:36:54 -0400 Subject: [PATCH 19/56] interim fix memory leak --- examples/control-vector-generator/control-vector-generator.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index 3cd42fa82db9f..c8c2dee95ebca 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -643,6 +643,8 @@ int main(int argc, char ** argv) { calc_diff(cb_data); // reset for next iteration + for (auto ptr : cb_data.v_pos) free(ptr); + for (auto ptr : cb_data.v_neg) free(ptr); cb_data.v_pos.clear(); cb_data.v_neg.clear(); } From 0e1f9734dec389ece03413a0e6c89c007af55d2e Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Sat, 1 Jun 2024 19:50:46 -0400 Subject: [PATCH 20/56] translated everything but PCA (I think) --- .../control-vector-generator.cpp | 148 +++++++++--------- 1 file changed, 75 insertions(+), 73 deletions(-) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index c8c2dee95ebca..f94382d0a2263 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -10,12 +10,7 @@ #include #include -// TODO read everything over and make sure it makes sense because you're dropping logic errors left and right - -struct diff_wrapper { - float * diff; // matrix of size [n_rows, cb_data.n_embd] with zero rows stripped - size_t n_rows; // number of rows in the matrix for size calculation -}; +// TODO read everything over and make sure it makes sense because I'm dropping logic errors left and right - Christian struct callback_data { std::vector data; @@ -25,20 +20,21 @@ struct callback_data { bool is_eval_pos = true; // each element of the vector correspond to one layer - std::vector v_pos; // vector of matrices of size [n_embd, n_tokens] - std::vector v_neg; // vector of matrices of size [n_embd, n_tokens] - std::vector v_final; // vector of finished vectors of size [n_embd] - std::vector v_diff; // vector of matrices of size [n_embd, m] where m ~ n_tokens * n_completions + std::vector v_pos; // vector of matrices of size [n_embd, n_tokens] + std::vector v_neg; // vector of matrices of size [n_embd, n_tokens] + std::vector v_final; // vector of finished vectors of size [n_embd] + std::vector v_diff; // vector of matrices of size [n_embd, m] where m ~ n_tokens * n_completions // each element of the outer vector correspond to one layer, each element of the inner vector correspond to one prompt pass - std::vector> v_diffs_wrapped; // vector of compiled diff matrices to be concatenated + std::vector> v_diffs_wrapped; // vector of compiled diff matrices to be concatenated + 
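The fix in PATCH 19 reflects the general rule behind the leak: clearing a vector of raw pointers drops the handles without releasing the allocations, so each iteration must free before clearing, or hand ownership to a container that frees itself. Both options sketched (the vector-of-vectors variant is an alternative design, not what the code does):

    #include <cstdlib>
    #include <vector>

    // Sketch: release malloc'd buffers before dropping the pointers...
    static void free_and_clear(std::vector<float *> & v) {
        for (float * ptr : v) free(ptr);
        v.clear();
    }

    // ...or avoid raw ownership entirely, so that clear() itself releases
    // the per-layer buffers:
    //     std::vector<std::vector<float>> v_pos;
    //     v_pos.clear();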
// TODO ggml destructor? ~callback_data() { for (auto ptr : v_pos) free(ptr); for (auto ptr : v_neg) free(ptr); - for (auto ptr : v_diff) free(ptr.diff); + for (auto ptr : v_diff) free(ptr); for (auto ptr : v_final) free(ptr); - for (auto & vec : v_diffs_wrapped) for (auto ptr : vec) free(ptr.diff); + for (auto & vec : v_diffs_wrapped) for (auto ptr : vec) free(ptr); } }; @@ -63,6 +59,13 @@ struct ctrl_params { std::vector negative_entries; }; +template +static std::string to_string(const T & val) { + std::stringstream ss; + ss << val; + return ss.str(); +} + static void print_usage(const char * executable) { printf("\n"); printf("usage: %s [options] -m [gpt-opts]", executable); @@ -83,7 +86,7 @@ static void print_usage(const char * executable) { printf(" default: 64\n"); printf(" -t, --num-threads N number of threads to use (do not confuse with gpt-opts -t)\n"); printf(" default: 8\n"); - printf(" --always-reload reload the model for every new template to parse\n"); + printf(" --always-reload reload the model for every new template to parse (not recommended)\n"); printf("\n"); printf("gpt-opts:\n"); printf(" other options from main\n"); @@ -275,20 +278,26 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { // copy the data from the GPU memory if needed const bool is_host = ggml_backend_buffer_is_host(t->buffer); + // TODO does this look right? + struct ggml_tensor * t_host; if (!is_host) { auto n_bytes = ggml_nbytes(t); - cb_data->data.resize(n_bytes); - ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes); + struct ggml_init_params params = { + /*.mem_size =*/ n_bytes, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + struct ggml_context * ctx_data = ggml_init(params); + t_host = ggml_new_tensor_2d(ctx_data, t->type, t->ne[0], t->ne[1]); + ggml_backend_tensor_get(t, t_host->data, 0, n_bytes); } + else t_host = t; - if (t->type == GGML_TYPE_F32) { - float * data = (float *) (is_host ? t->data : cb_data->data.data()); - float * dest = (float *) malloc(ggml_nbytes(t)); - memcpy(dest, data, ggml_nbytes(t)); + if (t_host->type == GGML_TYPE_F32) { if (cb_data->is_eval_pos) { - cb_data->v_pos.push_back(dest); + cb_data->v_pos.push_back(t_host); } else { - cb_data->v_neg.push_back(dest); + cb_data->v_neg.push_back(t_host); } } @@ -312,21 +321,33 @@ static void padding_seq(llama_context * ctx, std::vector & tokens, } } -static bool is_row_all_zeros(float * diff, size_t row, size_t cols, float eps = 1e-6) { - for (size_t i = 0; i < cols; ++i) if (diff[row * cols + i] > eps) return false; +static bool is_row_all_zeros(struct ggml_tensor * diff, size_t row, size_t cols, float eps = 1e-6) { + for (size_t i = 0; i < cols; ++i) if (ggml_get_f32_nd(diff, row, i, 0, 0) > eps) return false; return true; } static void calc_diff(callback_data & cb_data) { // TODO: assert cb_data.v_pos.size() == cb_data.v_neg.size() - const size_t n_elems = cb_data.n_embd * cb_data.n_tokens; cb_data.v_diffs_wrapped.resize(cb_data.v_pos.size()); for (size_t il = 0; il < cb_data.v_pos.size(); il++) { auto & inp_pos = cb_data.v_pos[il]; auto & inp_neg = cb_data.v_neg[il]; - float * dest = (float *) malloc(n_elems * sizeof(float)); - for (size_t i = 0; i < n_elems; i++) { - dest[i] = inp_pos[i] - inp_neg[i]; + auto n_bytes = ggml_nbytes(inp_pos); + + struct ggml_init_params params = { + /*.mem_size =*/ n_bytes, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + struct ggml_context * ctx_data = ggml_init(params); + + // TODO is this the best way to get dimension? 
i don't know which way n_embd/n_tokens go + // for that matter can we get rid of n_embd/n_tokens fields in favor of ne[0]/ne[1]? + struct ggml_tensor * dest = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, inp_pos->ne[0], inp_pos->ne[1]); + for (size_t i = 0; i < cb_data.n_embd; i++) { + for (size_t j = 0; j < cb_data.n_tokens; j++) { + ggml_set_f32_nd(dest, i, j, 0, 0, ggml_get_f32_nd(inp_pos, i, j, 0, 0) - ggml_get_f32_nd(inp_neg, i, j, 0, 0)); + } } // TODO can we make this faster? like check during the above operation rather than on a second pass? @@ -345,42 +366,47 @@ static void calc_diff(callback_data & cb_data) { std::cout << "zero rows in layer " << il << ": " << cb_data.n_tokens - nonzero_rows.size() << std::endl; } */ - struct diff_wrapper dw; - dw.n_rows = nonzero_rows.size(); - dw.diff = (float *) malloc(dw.n_rows * cb_data.n_embd * sizeof(float)); + // TODO I don't know if this is the right dimension but presumably it is + struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, nonzero_rows.size(), inp_pos->ne[1]); size_t offset = 0; - for (size_t i = 0; i < dw.n_rows; ++i) { - float * origin = dest + nonzero_rows[i] * cb_data.n_embd; - memcpy(dw.diff + offset, origin, cb_data.n_embd * sizeof(float)); + for (size_t i = 0; i < nonzero_rows.size(); ++i) { + float * origin = (float *)(dest->data) + nonzero_rows[i] * cb_data.n_embd; + memcpy((float *)(diff->data) + offset, origin, cb_data.n_embd * sizeof(float)); offset += cb_data.n_embd; } - cb_data.v_diffs_wrapped[il].push_back(dw); + cb_data.v_diffs_wrapped[il].push_back(diff); free(dest); } } -// TODO do we want to multithread this? it takes very little time as it is static void concatenate_diffs(callback_data & cb_data) { for (size_t i = 0; i < cb_data.v_diffs_wrapped.size(); ++i) { - std::vector & vec = cb_data.v_diffs_wrapped[i]; + std::vector & vec = cb_data.v_diffs_wrapped[i]; size_t n_rows_total = 0; for (size_t j = 0; j < vec.size(); ++j) { - n_rows_total += vec[j].n_rows; + // TODO likewise no clue if this is right + n_rows_total += vec[j]->ne[0]; } + + struct ggml_init_params params = { + /*.mem_size =*/ cb_data.n_embd * n_rows_total * sizeof(float), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + struct ggml_context * ctx_data = ggml_init(params); + // std::cout << "n_rows_total: " << n_rows_total << std::endl; - float * diff = (float *) malloc(n_rows_total * cb_data.n_embd * sizeof(float)); + struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, cb_data.n_embd, n_rows_total); size_t offset = 0; for (size_t j = 0; j < vec.size(); ++j) { - float * origin = vec[j].diff; - memcpy(diff + offset, origin, vec[j].n_rows * cb_data.n_embd * sizeof(float)); - offset += vec[j].n_rows * cb_data.n_embd; + float * origin = (float *)(vec[j]->data); + // TODO again not sure about dimension + memcpy((float *)(diff->data) + offset, origin, vec[j]->ne[0] * cb_data.n_embd * sizeof(float)); + offset += vec[j]->ne[0] * cb_data.n_embd; } - struct diff_wrapper dw; - dw.n_rows = n_rows_total; - dw.diff = diff; - cb_data.v_diff.push_back(dw); + cb_data.v_diff.push_back(diff); } } @@ -486,13 +512,6 @@ static void pca(callback_data & cb_data, int n_threads) { printf("Done with PCA.\n"); } -template -static std::string to_string(const T & val) { - std::stringstream ss; - ss << val; - return ss.str(); -} - static void export_gguf(callback_data & cb_data, int n_layers, const std::string fname, const std::string model_hint) { struct gguf_context * ctx = gguf_init_empty(); @@ -503,30 +522,14 @@ static void 
export_gguf(callback_data & cb_data, int n_layers, const std::string gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str()); gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_final_size_eff); - struct ggml_init_params params = { - /*.mem_size =*/ (ggml_tensor_overhead() * v_final_size_eff) - + (cb_data.n_embd * v_final_size_eff * sizeof(float)), - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - - struct ggml_context * ctx_data = ggml_init(params); - for (size_t i = 0; i < v_final_size_eff; ++i) { // TODO this number is probably not right - figure out which layer is which - // the python implementation uses a dict to handle this, we don't know if it's 1, 2, 3, 4... or other + // i'm pretty sure it's right now const std::string name = "direction." + to_string(i+1); - struct ggml_tensor * cur = ggml_new_tensor_1d(ctx_data, GGML_TYPE_F32, cb_data.n_embd); - - ggml_set_name(cur, name.c_str()); - - float * data = (float *) cur->data; - for(int j = 0; j < cb_data.n_embd; j++) { - data[j] = cb_data.v_final[i][j]; - } + ggml_set_name(cb_data.v_final[i], name.c_str()); - gguf_add_tensor(ctx, cur); + gguf_add_tensor(ctx, cb_data.v_final[i]); printf("Added tensor %zu\n", i); } @@ -536,7 +539,6 @@ static void export_gguf(callback_data & cb_data, int n_layers, const std::string printf("%s: wrote file '%s'\n", __func__, fname.c_str()); - ggml_free(ctx_data); gguf_free(ctx); } From b67ea65983ee3bc6ffef90cea4d9b61dd0fac7d2 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Sat, 1 Jun 2024 20:47:28 -0400 Subject: [PATCH 21/56] tentatively translate the rest --- .../control-vector-generator.cpp | 275 +++++++++++++----- 1 file changed, 198 insertions(+), 77 deletions(-) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index f94382d0a2263..a117ff291e94f 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -2,6 +2,14 @@ #include "llama.h" #include "ggml.h" +#ifdef GGML_USE_CUDA +#include "ggml-cuda.h" +#endif + +#ifdef GGML_USE_METAL +#include "ggml-metal.h" +#endif + #include #include #include @@ -410,105 +418,224 @@ static void concatenate_diffs(callback_data & cb_data) { } } -// BEGIN NON-GGML IMPLEMENTATION - -// TODO translate to ggml -// this probably doesn't want to be a separate function - put it into the compute graph as a step in processing each layer -static float* square_diff(callback_data & cb_data, size_t idx) { - float* result = new float[cb_data.n_embd * cb_data.n_embd]; - std::memset(result, 0, cb_data.n_embd * cb_data.n_embd * sizeof(float)); - for (size_t i = 0; i < (size_t) cb_data.n_embd; i++) { - for (size_t j = 0; j < (size_t) cb_data.n_embd; j++) { - float sum = 0.0f; - // watch out for indexing - can't just use cb_data.n_tokens - for (size_t k = 0; k < cb_data.v_diff[idx].n_rows; k++) { - sum += cb_data.v_diff[idx].diff[i + cb_data.n_embd * k] * cb_data.v_diff[idx].diff[j + cb_data.n_embd * k]; - } - result[i * cb_data.n_embd + j] = sum; - } +struct pca_model { + struct ggml_tensor * v_diff_original; + struct ggml_tensor * square; + struct ggml_tensor * eigenvector; + + ggml_backend_t backend = NULL; + ggml_backend_buffer_t buffer; + struct ggml_context * ctx; +}; + +void load_pca_model(pca_model & model, struct ggml_tensor * v_diff_original, const int n_embd) { +#ifdef GGML_USE_CUDA + fprintf(stderr, "%s: using CUDA backend\n", __func__); + model.backend = 
ggml_backend_cuda_init(0); // init device 0 + if (!model.backend) { + fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__); + } +#endif + +#ifdef GGML_USE_METAL + fprintf(stderr, "%s: using Metal backend\n", __func__); + ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr); + model.backend = ggml_backend_metal_init(); + if (!model.backend) { + fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); + } +#endif + + // if there aren't GPU Backends fallback to CPU backend + if (!model.backend) { + model.backend = ggml_backend_cpu_init(); + } + + const int num_tensors = 3; + + struct ggml_init_params params { + /*.mem_size =*/ ggml_tensor_overhead() * num_tensors, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + + model.ctx = ggml_init(params); + + model.v_diff_original = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, v_diff_original->ne[0], v_diff_original->ne[1]); + model.square = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_embd, n_embd); + model.eigenvector = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, n_embd); + + model.buffer = ggml_backend_alloc_ctx_tensors(model.ctx, model.backend); + + ggml_backend_tensor_set(model.v_diff_original, v_diff_original->data, 0, ggml_nbytes(v_diff_original)); + // no need to load anything into square yet + + // initialize model.eigenvector to random vector + std::vector random_vec = std::vector(); + std::default_random_engine generator(static_cast(std::time(0))); + std::uniform_real_distribution distribution(0.0, 1.0); + for (int i = 0; i < n_embd; ++i) { + random_vec.push_back(distribution(generator)); } - return result; + + // we don't normalize it at first but that shouldn't be a problem + ggml_backend_tensor_set(model.eigenvector, random_vec.data(), 0, ggml_nbytes(model.eigenvector)); +} + +struct ggml_cgraph * square_diff_graph(const pca_model & model) { + static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(); + static std::vector buf(buf_size); + + struct ggml_init_params params0 = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf.data(), + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() + }; + struct ggml_context * ctx0 = ggml_init(params0); + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * square = ggml_mul_mat(ctx0, model.v_diff_original, model.v_diff_original); + + ggml_build_forward_expand(gf, square); + + ggml_free(ctx0); + return gf; } -// TODO translate to ggml (this is a built-in function in ggml) -static void normalize_inplace(std::vector & vec) { - // inefficient(?) 
norm computation - float norm = 0.0f; - for (const float& val : vec) { - norm += val * val; +struct ggml_tensor * compute_square(const pca_model & model, ggml_gallocr_t allocr, int n_threads) { + struct ggml_cgraph * gf = square_diff_graph(model); + + ggml_gallocr_alloc_graph(allocr, gf); + + if (ggml_backend_is_cpu(model.backend)) { + ggml_backend_cpu_set_n_threads(model.backend, n_threads); } - if(norm == 0) throw std::runtime_error("norm is zero"); - norm = std::sqrt(norm); - for (float& val : vec) { - val /= norm; + +#ifdef GGML_USE_METAL + if (ggml_backend_is_metal(model.backend)) { + ggml_backend_metal_set_n_cb(model.backend, n_threads); } +#endif + + ggml_backend_graph_compute(model.backend, gf); + + return gf->nodes[gf->n_nodes - 1]; } -// TODO translate to ggml (this is a built-in function in ggml) -static std::vector mul_mat(const float * mat, const std::vector & vec, size_t dim) { - std::vector result(dim, 0.0f); - for (size_t i = 0; i < dim; ++i) { - for (size_t j = 0; j < dim; ++j) { - result[i] += mat[i * dim + j] * vec[j]; - } +struct ggml_cgraph * power_iteration_graph(const pca_model & model, float tolerance) { + static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(); + static std::vector buf(buf_size); + + struct ggml_init_params params0 = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf.data(), + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() + }; + struct ggml_context * ctx0 = ggml_init(params0); + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * b_tensor = ggml_mul_mat(ctx0, model.square, model.eigenvector); + // TODO difference between ggml_norm and ggml_norm_inplace? + // also is this the right way to do multi-step graphs? 
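// ----[ review note ]---- On the TODO above: ggml_norm/ggml_norm_inplace compute layer
// normalization (zero mean, unit variance along ne[0], with the last argument acting as
// eps — so `tolerance` ends up as eps here), not the L2 renormalization that power
// iteration calls for; the _inplace variant only differs in writing into a view of its
// input instead of allocating a fresh tensor. A hedged sketch of an L2 normalize composed
// from ggml primitives (assumes ggml_sqr, ggml_sum_rows, ggml_sqrt_inplace and
// ggml_div_inplace, which exist in this ggml vintage) that could replace the call below:
//
//     b_tensor = ggml_div_inplace(ctx0, b_tensor,
//         ggml_sqrt_inplace(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, b_tensor))));
//
// ----[ end review note ]----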
+ b_tensor = ggml_norm_inplace(ctx0, b_tensor, tolerance); + + ggml_build_forward_expand(gf, b_tensor); + + ggml_free(ctx0); + return gf; +} + +struct ggml_tensor * compute_piter(const pca_model & model, ggml_gallocr_t allocr, int n_threads, float tolerance) { + struct ggml_cgraph * gf = power_iteration_graph(model, tolerance); + + ggml_gallocr_alloc_graph(allocr, gf); + + if (ggml_backend_is_cpu(model.backend)) { + ggml_backend_cpu_set_n_threads(model.backend, n_threads); } - return result; + +#ifdef GGML_USE_METAL + if (ggml_backend_is_metal(model.backend)) { + ggml_backend_metal_set_n_cb(model.backend, n_threads); + } +#endif + + ggml_backend_graph_compute(model.backend, gf); + + return gf->nodes[gf->n_nodes - 1]; } -// TODO translate to ggml -static std::vector power_iteration(callback_data & cb_data, const float * matrix, int maxIterations = 1000, float tolerance = 1e-8) { - std::vector b_tensor = std::vector(); - - // random vector gen/norm - std::default_random_engine generator(static_cast(std::time(0))); - std::uniform_real_distribution distribution(0.0, 1.0); - for (int i = 0; i < cb_data.n_embd; ++i) { - b_tensor.push_back(distribution(generator)); +static void power_iteration(callback_data & cb_data, int idx, int n_threads, int maxIterations = 1000, float tolerance = 1e-8) { + pca_model model; + load_pca_model(model, cb_data.v_diff[idx], cb_data.n_embd); + + ggml_gallocr_t allocr = NULL; + + { + allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); + + // create the worst case graph for memory usage estimation + struct ggml_cgraph * gf = square_diff_graph(model); + ggml_gallocr_reserve(allocr, gf); + size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0); + + fprintf(stderr, "%s: square diff, compute buffer size: %.4f KB\n", __func__, mem_size/1024.0); } - normalize_inplace(b_tensor); + + struct ggml_tensor * square = compute_square(model, allocr, n_threads); + ggml_backend_tensor_get(square, model.square->data, 0, ggml_nbytes(square)); + + // yes? + ggml_gallocr_free(allocr); for (int iter = 0; iter < maxIterations; ++iter) { - // store the previous one so we can check for convergence - std::vector b_prev_tensor = b_tensor; + // TODO do I need to reset it like this every time? 
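// ----[ review note ]---- Probably not: the per-iteration graph has the same topology on
// every pass, so a single allocator can be hoisted out of the loop and reused — each
// ggml_gallocr_alloc_graph() call inside compute_piter() re-lays tensors out of the
// already reserved buffer. Untested sketch against this revision:
//
//     ggml_gallocr_t piter_allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
//     for (int iter = 0; iter < maxIterations; ++iter) {
//         struct ggml_tensor * b_tensor = compute_piter(model, piter_allocr, n_threads, tolerance);
//         // ... convergence check as below ...
//     }
//     ggml_gallocr_free(piter_allocr);
//
// ----[ end review note ]----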
+ allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - // matrix multiplication and renormalize - b_tensor = mul_mat(matrix, b_tensor, cb_data.n_embd); - normalize_inplace(b_tensor); + struct ggml_tensor * host_new_eigenvector = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, cb_data.n_embd); + struct ggml_tensor * b_tensor = compute_piter(model, allocr, n_threads, tolerance); + ggml_backend_tensor_get(b_tensor, host_new_eigenvector->data, 0, ggml_nbytes(b_tensor)); // convergence check float diff = 0.0; for (int i = 0; i < cb_data.n_embd; ++i) { - diff += std::pow(b_tensor[i] - b_prev_tensor[i], 2); + diff += std::pow(((float *)(host_new_eigenvector->data))[i] - ((float *)(model.eigenvector->data))[i], 2); } - if (std::sqrt(diff) < tolerance) { + + // update eigenvector + ggml_backend_tensor_set(model.eigenvector, host_new_eigenvector->data, 0, ggml_nbytes(model.eigenvector)); + + try { + if (std::sqrt(diff) < tolerance) { + break; + } + } + catch (std::exception & e) { + // catch division by zero I guess break; } } - return b_tensor; + // push back v_final with eigenvector + ggml_backend_tensor_get(model.eigenvector, cb_data.v_final[idx]->data, 0, ggml_nbytes(model.eigenvector)); } -// TODO translate to ggml static void pca(callback_data & cb_data, int n_threads) { - int n_layers = cb_data.v_diff.size(); - std::vector threads; - cb_data.v_final.reserve(n_layers); - auto worker_function = [&](int worker_id) { - for (int il = worker_id; il < n_layers; il += n_threads) { - float * matrix = square_diff(cb_data, il); - std::vector eigenvector = power_iteration(cb_data, matrix); - cb_data.v_final[il] = (float *) malloc(eigenvector.size() * sizeof(float)); - memcpy(cb_data.v_final[il], eigenvector.data(), eigenvector.size() * sizeof(float)); - printf("Done with layer %d\n", il); - printf("il = %d | %f %f \n", il, cb_data.v_final[il][0], cb_data.v_final[il][1]); - } - }; printf("Running PCA...\n"); - for (int i = 0; i < n_threads; ++i) { - threads.emplace_back(worker_function, i); + for (int il = 0; il < cb_data.v_diff.size(); ++il) { + struct ggml_init_params params = { + /*.mem_size =*/ cb_data.n_embd * sizeof(float), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + struct ggml_context * ctx_data = ggml_init(params); + cb_data.v_final.push_back(ggml_new_tensor_1d(ctx_data, GGML_TYPE_F32, cb_data.n_embd)); + power_iteration(cb_data, il, n_threads); + printf("Done with layer %d\n", il); + printf("il = %d | %f %f \n", il, ggml_get_f32_1d(cb_data.v_final[il], 0), ggml_get_f32_1d(cb_data.v_final[il], 1)); } - for (auto & th : threads) th.join(); + printf("Done with PCA.\n"); printf("Done with PCA.\n"); } @@ -590,12 +717,6 @@ int main(int argc, char ** argv) { int n_embd = llama_n_embd(model); cb_data.n_embd = n_embd; int n_prompts = cparams.positive_prompts.size(); - - // vector of finished vectors of size [n_embd], we have (n_layers - 1) vectors in total - std::vector v_final(n_layers - 1, NULL); - for (size_t i = 0; i < v_final.size(); ++i) { - v_final[i] = (float *) calloc(n_embd, sizeof(float)); - } // create templated prompts for (int i = 0; i < n_prompts; ++i) { @@ -653,7 +774,7 @@ int main(int argc, char ** argv) { concatenate_diffs(cb_data); pca(cb_data, cparams.n_threads); - printf("v_final %f %f \n", cb_data.v_final[0][0], cb_data.v_final[0][1]); + //printf("v_final %f %f \n", cb_data.v_final[0][0], cb_data.v_final[0][1]); llama_free(ctx); llama_free_model(model); From a23c72e4c057defeaa9e883f8724f09e029c073a Mon Sep 17 00:00:00 2001 From: Christian 
Zhou-Zheng Date: Sat, 1 Jun 2024 22:19:33 -0400 Subject: [PATCH 22/56] fix ggml errors and make new ones at least it compiles and runs --- .../control-vector-generator.cpp | 123 ++++++++++++------ 1 file changed, 81 insertions(+), 42 deletions(-) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index a117ff291e94f..f48e5ef3f9056 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -286,7 +286,9 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { // copy the data from the GPU memory if needed const bool is_host = ggml_backend_buffer_is_host(t->buffer); - // TODO does this look right? + // FIXME something is very wrong here + // v_pos and v_neg are being populated, but the values aren't correct - it writes the same values to all vectors, it looks like? + // this leads ultimately to an error in calc_diff where diff becomes entirely zeroes and eventually a segfault several iterations into pca struct ggml_tensor * t_host; if (!is_host) { auto n_bytes = ggml_nbytes(t); @@ -329,8 +331,12 @@ static void padding_seq(llama_context * ctx, std::vector & tokens, } } -static bool is_row_all_zeros(struct ggml_tensor * diff, size_t row, size_t cols, float eps = 1e-6) { - for (size_t i = 0; i < cols; ++i) if (ggml_get_f32_nd(diff, row, i, 0, 0) > eps) return false; +static bool is_row_all_zeros(struct ggml_tensor * diff, int row, int cols, float eps = 1e-6) { + for (int i = 0; i < cols; ++i) { + if (ggml_get_f32_nd(diff, i, row, 0, 0) > eps) { + return false; + } + } return true; } @@ -343,25 +349,30 @@ static void calc_diff(callback_data & cb_data) { auto n_bytes = ggml_nbytes(inp_pos); struct ggml_init_params params = { - /*.mem_size =*/ n_bytes, + /*.mem_size =*/ n_bytes + ggml_tensor_overhead(), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, }; struct ggml_context * ctx_data = ggml_init(params); + printf("inp_pos [0][0]: %f\n", ggml_get_f32_nd(inp_pos, 0, 0, 0, 0)); + // TODO is this the best way to get dimension? i don't know which way n_embd/n_tokens go // for that matter can we get rid of n_embd/n_tokens fields in favor of ne[0]/ne[1]? struct ggml_tensor * dest = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, inp_pos->ne[0], inp_pos->ne[1]); + for (size_t i = 0; i < cb_data.n_embd; i++) { for (size_t j = 0; j < cb_data.n_tokens; j++) { ggml_set_f32_nd(dest, i, j, 0, 0, ggml_get_f32_nd(inp_pos, i, j, 0, 0) - ggml_get_f32_nd(inp_neg, i, j, 0, 0)); } } + printf("dest [0][0]: %f\n", ggml_get_f32_nd(dest, 0, 0, 0, 0)); + // TODO can we make this faster? like check during the above operation rather than on a second pass? 
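// ----[ review note ]---- One way to answer the TODO above: fold the zero-row scan into
// the subtraction loop so dest is only traversed once. Separately, note that
// is_row_all_zeros() compares the raw value against eps, so a row of large *negative*
// diffs is classified as all-zero; the comparison likely wants std::fabs(). Hedged sketch
// (row_nonzero is a hypothetical local; <cmath> assumed):
//
//     std::vector<bool> row_nonzero(cb_data.n_tokens, false);
//     for (int j = 0; j < cb_data.n_tokens; j++) {
//         for (int i = 0; i < cb_data.n_embd; i++) {
//             float v = ggml_get_f32_nd(inp_pos, i, j, 0, 0) - ggml_get_f32_nd(inp_neg, i, j, 0, 0);
//             ggml_set_f32_nd(dest, i, j, 0, 0, v);
//             if (std::fabs(v) > 1e-6f) row_nonzero[j] = true;
//         }
//     }
//
// ----[ end review note ]----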
// strip zero rows - std::vector nonzero_rows; + std::vector nonzero_rows; for (int i = 0; i < cb_data.n_tokens; ++i) { if (!is_row_all_zeros(dest, i, cb_data.n_embd)) { nonzero_rows.push_back(i); @@ -374,18 +385,32 @@ static void calc_diff(callback_data & cb_data) { std::cout << "zero rows in layer " << il << ": " << cb_data.n_tokens - nonzero_rows.size() << std::endl; } */ - // TODO I don't know if this is the right dimension but presumably it is - struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, nonzero_rows.size(), inp_pos->ne[1]); + struct ggml_init_params params2 = { + /*.mem_size =*/ inp_pos->ne[0] * nonzero_rows.size() * sizeof(float) + ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + struct ggml_context * ctx_data2 = ggml_init(params); + + struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_data2, GGML_TYPE_F32, inp_pos->ne[0], nonzero_rows.size()); - size_t offset = 0; + //size_t offset = 0; for (size_t i = 0; i < nonzero_rows.size(); ++i) { - float * origin = (float *)(dest->data) + nonzero_rows[i] * cb_data.n_embd; - memcpy((float *)(diff->data) + offset, origin, cb_data.n_embd * sizeof(float)); - offset += cb_data.n_embd; + // probably eschew this in favor of the iterative method? + //float * origin = (float *)(dest->data) + nonzero_rows[i] * cb_data.n_embd; + //memcpy((float *)(diff->data) + offset, origin, cb_data.n_embd * sizeof(float)); + //offset += cb_data.n_embd; + + for (size_t j = 0; j < cb_data.n_embd; j++) { + ggml_set_f32_nd(diff, j, i, 0, 0, ggml_get_f32_nd(dest, j, nonzero_rows[i], 0, 0)); + } } + // FIXME ggml_nbytes(diff) is 0 + cb_data.v_diffs_wrapped[il].push_back(diff); - free(dest); + ggml_free(ctx_data); + ggml_free(ctx_data2); } } @@ -394,12 +419,11 @@ static void concatenate_diffs(callback_data & cb_data) { std::vector & vec = cb_data.v_diffs_wrapped[i]; size_t n_rows_total = 0; for (size_t j = 0; j < vec.size(); ++j) { - // TODO likewise no clue if this is right - n_rows_total += vec[j]->ne[0]; + n_rows_total += vec[j]->ne[1]; } struct ggml_init_params params = { - /*.mem_size =*/ cb_data.n_embd * n_rows_total * sizeof(float), + /*.mem_size =*/ cb_data.n_embd * n_rows_total * sizeof(float) + ggml_tensor_overhead(), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, }; @@ -407,13 +431,14 @@ static void concatenate_diffs(callback_data & cb_data) { // std::cout << "n_rows_total: " << n_rows_total << std::endl; struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, cb_data.n_embd, n_rows_total); + size_t offset = 0; for (size_t j = 0; j < vec.size(); ++j) { float * origin = (float *)(vec[j]->data); - // TODO again not sure about dimension - memcpy((float *)(diff->data) + offset, origin, vec[j]->ne[0] * cb_data.n_embd * sizeof(float)); + memcpy((float *)(diff->data) + offset, origin, vec[j]->ne[1] * cb_data.n_embd * sizeof(float)); offset += vec[j]->ne[0] * cb_data.n_embd; } + cb_data.v_diff.push_back(diff); } } @@ -483,7 +508,7 @@ void load_pca_model(pca_model & model, struct ggml_tensor * v_diff_original, con } struct ggml_cgraph * square_diff_graph(const pca_model & model) { - static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(); + static size_t buf_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(); static std::vector buf(buf_size); struct ggml_init_params params0 = { @@ -523,7 +548,7 @@ struct ggml_tensor * compute_square(const pca_model & model, ggml_gallocr_t allo } struct ggml_cgraph * power_iteration_graph(const 
pca_model & model, float tolerance) { - static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(); + static size_t buf_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(); static std::vector buf(buf_size); struct ggml_init_params params0 = { @@ -565,25 +590,16 @@ struct ggml_tensor * compute_piter(const pca_model & model, ggml_gallocr_t alloc return gf->nodes[gf->n_nodes - 1]; } -static void power_iteration(callback_data & cb_data, int idx, int n_threads, int maxIterations = 1000, float tolerance = 1e-8) { +static void power_iteration(callback_data & cb_data, int idx, int n_threads, int maxIterations = 1000, float tolerance = 1e-7) { + printf("in power iteration\n"); pca_model model; load_pca_model(model, cb_data.v_diff[idx], cb_data.n_embd); - ggml_gallocr_t allocr = NULL; - - { - allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - - // create the worst case graph for memory usage estimation - struct ggml_cgraph * gf = square_diff_graph(model); - ggml_gallocr_reserve(allocr, gf); - size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0); - - fprintf(stderr, "%s: square diff, compute buffer size: %.4f KB\n", __func__, mem_size/1024.0); - } + ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); + // FIXME ggml_nbytes(square) is 0 because everything going back to diff in calc_diff is 0 struct ggml_tensor * square = compute_square(model, allocr, n_threads); - ggml_backend_tensor_get(square, model.square->data, 0, ggml_nbytes(square)); + ggml_backend_tensor_set(model.square, square->data, 0, ggml_nbytes(square)); // yes? ggml_gallocr_free(allocr); @@ -593,14 +609,33 @@ static void power_iteration(callback_data & cb_data, int idx, int n_threads, int // TODO do I need to reset it like this every time? 
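// ----[ review note ]---- For cross-checking the backend path, this is the plain-CPU
// recipe the loop below implements, condensed from the pre-ggml implementation that
// patch 21 removed (assumes S is the n×n square matrix in row-major order; needs <cmath>
// and <vector>). One extra observation for the convergence check further down: std::sqrt
// never throws, and float division by zero yields inf/nan rather than a C++ exception, so
// a std::isfinite() guard does what the try/catch there is aiming for.
//
//     static void power_iteration_ref(const float * S, float * b, int n, int iters, float tol) {
//         std::vector<float> nb(n);
//         for (int it = 0; it < iters; ++it) {
//             for (int i = 0; i < n; ++i) {              // nb = S * b
//                 double acc = 0.0;
//                 for (int j = 0; j < n; ++j) acc += S[(size_t)i*n + j] * b[j];
//                 nb[i] = (float) acc;
//             }
//             double norm = 0.0;                         // L2 norm of nb
//             for (int i = 0; i < n; ++i) norm += (double)nb[i]*nb[i];
//             norm = std::sqrt(norm);
//             double dist = 0.0;                         // renormalize + distance to previous b
//             for (int i = 0; i < n; ++i) {
//                 float v = (float)(nb[i] / norm);
//                 dist += (v - b[i]) * (v - b[i]);
//                 b[i] = v;
//             }
//             if (!std::isfinite(dist) || std::sqrt(dist) < tol) break;
//         }
//     }
//
// ----[ end review note ]----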
allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - struct ggml_tensor * host_new_eigenvector = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, cb_data.n_embd); + // i have no idea how ggml_contexts work so i'm making a different one for the original and the old one + struct ggml_init_params hov_params = { + /*.mem_size =*/ cb_data.n_embd * sizeof(float) + ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + struct ggml_context * hov_ctx = ggml_init(hov_params); + + struct ggml_init_params hnv_params = { + /*.mem_size =*/ cb_data.n_embd * sizeof(float) + ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + struct ggml_context * hnv_ctx = ggml_init(hnv_params); + + struct ggml_tensor * host_old_eigenvector = ggml_new_tensor_1d(hov_ctx, GGML_TYPE_F32, cb_data.n_embd); + struct ggml_tensor * host_new_eigenvector = ggml_new_tensor_1d(hnv_ctx, GGML_TYPE_F32, cb_data.n_embd); + struct ggml_tensor * b_tensor = compute_piter(model, allocr, n_threads, tolerance); + ggml_backend_tensor_get(b_tensor, host_new_eigenvector->data, 0, ggml_nbytes(b_tensor)); + ggml_backend_tensor_get(model.eigenvector, host_old_eigenvector->data, 0, ggml_nbytes(model.eigenvector)); // convergence check float diff = 0.0; for (int i = 0; i < cb_data.n_embd; ++i) { - diff += std::pow(((float *)(host_new_eigenvector->data))[i] - ((float *)(model.eigenvector->data))[i], 2); + diff += std::pow((ggml_get_f32_1d(host_new_eigenvector, i) - ggml_get_f32_1d(host_old_eigenvector, i)), 2); } // update eigenvector @@ -615,17 +650,23 @@ static void power_iteration(callback_data & cb_data, int idx, int n_threads, int // catch division by zero I guess break; } + + ggml_free(hnv_ctx); } - // push back v_final with eigenvector ggml_backend_tensor_get(model.eigenvector, cb_data.v_final[idx]->data, 0, ggml_nbytes(model.eigenvector)); + + ggml_gallocr_free(allocr); + ggml_free(model.ctx); + ggml_backend_buffer_free(model.buffer); + ggml_backend_free(model.backend); } static void pca(callback_data & cb_data, int n_threads) { printf("Running PCA...\n"); for (int il = 0; il < cb_data.v_diff.size(); ++il) { struct ggml_init_params params = { - /*.mem_size =*/ cb_data.n_embd * sizeof(float), + /*.mem_size =*/ cb_data.n_embd * sizeof(float) + ggml_tensor_overhead(), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, }; @@ -636,7 +677,6 @@ static void pca(callback_data & cb_data, int n_threads) { printf("il = %d | %f %f \n", il, ggml_get_f32_1d(cb_data.v_final[il], 0), ggml_get_f32_1d(cb_data.v_final[il], 1)); } printf("Done with PCA.\n"); - printf("Done with PCA.\n"); } static void export_gguf(callback_data & cb_data, int n_layers, const std::string fname, const std::string model_hint) { @@ -669,8 +709,6 @@ static void export_gguf(callback_data & cb_data, int n_layers, const std::string gguf_free(ctx); } -// END NON-GGML IMPLEMENTATION - int main(int argc, char ** argv) { ctrl_params cparams; @@ -766,8 +804,9 @@ int main(int argc, char ** argv) { calc_diff(cb_data); // reset for next iteration - for (auto ptr : cb_data.v_pos) free(ptr); - for (auto ptr : cb_data.v_neg) free(ptr); + // TODO there's no good way to do this is there? 
because you need to ggml_free the underlying ggml_context + //for (auto ptr : cb_data.v_pos) free(ptr->data); + //for (auto ptr : cb_data.v_neg) free(ptr->data); cb_data.v_pos.clear(); cb_data.v_neg.clear(); } From 15d5c257a04ff7a1cfb6e32fb9f5dcd71bd2e83b Mon Sep 17 00:00:00 2001 From: ngxson Date: Sun, 2 Jun 2024 10:58:11 +0200 Subject: [PATCH 23/56] fix cb_eval --- .../control-vector-generator.cpp | 43 +++++++++++-------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index f48e5ef3f9056..e2ee6208dfdcf 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -18,10 +18,13 @@ #include #include +#define DEBUG_POS 2 + // TODO read everything over and make sure it makes sense because I'm dropping logic errors left and right - Christian struct callback_data { std::vector data; + ggml_context * ctx_ggml; int n_tokens = 0; int n_embd = 0; @@ -290,18 +293,11 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { // v_pos and v_neg are being populated, but the values aren't correct - it writes the same values to all vectors, it looks like? // this leads ultimately to an error in calc_diff where diff becomes entirely zeroes and eventually a segfault several iterations into pca struct ggml_tensor * t_host; - if (!is_host) { - auto n_bytes = ggml_nbytes(t); - struct ggml_init_params params = { - /*.mem_size =*/ n_bytes, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - struct ggml_context * ctx_data = ggml_init(params); - t_host = ggml_new_tensor_2d(ctx_data, t->type, t->ne[0], t->ne[1]); - ggml_backend_tensor_get(t, t_host->data, 0, n_bytes); - } - else t_host = t; + auto n_bytes = ggml_nbytes(t); + t_host = ggml_new_tensor_2d(cb_data->ctx_ggml, t->type, t->ne[0], t->ne[1]); + t_host->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow + ggml_backend_tensor_get(t, t_host->data, 0, n_bytes); + printf("t_host [0][%d]: %f\n", DEBUG_POS, ggml_get_f32_nd(t_host, 0, DEBUG_POS, 0, 0)); if (t_host->type == GGML_TYPE_F32) { if (cb_data->is_eval_pos) { @@ -315,6 +311,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { } static bool get_hidden_layers(llama_context * ctx, std::vector & tokens) { + llama_kv_cache_clear(ctx); if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) { fprintf(stderr, "%s : failed to eval\n", __func__); return false; @@ -355,7 +352,8 @@ static void calc_diff(callback_data & cb_data) { }; struct ggml_context * ctx_data = ggml_init(params); - printf("inp_pos [0][0]: %f\n", ggml_get_f32_nd(inp_pos, 0, 0, 0, 0)); + printf("inp_pos [0][%d]: %f\n", DEBUG_POS, ggml_get_f32_nd(inp_pos, 0, DEBUG_POS, 0, 0)); + printf("inp_neg [0][%d]: %f\n", DEBUG_POS, ggml_get_f32_nd(inp_neg, 0, DEBUG_POS, 0, 0)); // TODO is this the best way to get dimension? i don't know which way n_embd/n_tokens go // for that matter can we get rid of n_embd/n_tokens fields in favor of ne[0]/ne[1]? @@ -367,7 +365,7 @@ static void calc_diff(callback_data & cb_data) { } } - printf("dest [0][0]: %f\n", ggml_get_f32_nd(dest, 0, 0, 0, 0)); + printf("dest [0][%d]: %f\n", DEBUG_POS, ggml_get_f32_nd(dest, 0, DEBUG_POS, 0, 0)); // TODO can we make this faster? like check during the above operation rather than on a second pass? 
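// ----[ review note ]---- The two fixes in the hunk above are worth spelling out: tensors
// handed to the eval callback live in scheduler-owned backend buffers that get recycled
// between graph evaluations, so anything worth keeping must be copied off synchronously,
// and the KV cache has to be cleared so the second (negative) prompt is decoded at the
// same positions as the first. The resulting callback shape, condensed as a sketch
// (keep_l_out is a hypothetical compression of cb_eval above, not a drop-in rename):
//
//     static bool keep_l_out(struct ggml_tensor * t, bool ask, void * user_data) {
//         auto * cb_data = (callback_data *) user_data;
//         if (ask) return strncmp(t->name, "l_out", 5) == 0;     // claim only the layer outputs
//         auto n_bytes = ggml_nbytes(t);
//         struct ggml_tensor * t_host = ggml_new_tensor_2d(cb_data->ctx_ggml, t->type, t->ne[0], t->ne[1]);
//         t_host->data = malloc(n_bytes);                        // ctx_ggml was created with no_alloc = true
//         ggml_backend_tensor_get(t, t_host->data, 0, n_bytes);  // synchronous device -> host copy
//         (cb_data->is_eval_pos ? cb_data->v_pos : cb_data->v_neg).push_back(t_host);
//         return true;
//     }
//
// ----[ end review note ]----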
@@ -415,6 +413,7 @@ static void calc_diff(callback_data & cb_data) {
 }

 static void concatenate_diffs(callback_data & cb_data) {
+    printf("concatenate_diffs\n");
     for (size_t i = 0; i < cb_data.v_diffs_wrapped.size(); ++i) {
         std::vector<struct ggml_tensor *> & vec = cb_data.v_diffs_wrapped[i];
         size_t n_rows_total = 0;
@@ -756,6 +755,14 @@ int main(int argc, char ** argv) {
     cb_data.n_embd = n_embd;
     int n_prompts = cparams.positive_prompts.size();

+    // init ctx_ggml
+    struct ggml_init_params params_ggml = {
+        /*.mem_size   =*/ ggml_tensor_overhead() * n_prompts * n_layers * 4u,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+    cb_data.ctx_ggml = ggml_init(params_ggml);
+
     // create templated prompts
     for (int i = 0; i < n_prompts; ++i) {
         populate_entries(cparams, cparams.positive_prompts[i], cparams.negative_prompts[i]);
@@ -804,13 +811,15 @@ int main(int argc, char ** argv) {
         calc_diff(cb_data);

         // reset for next iteration
-        // TODO there's no good way to do this is there? because you need to ggml_free the underlying ggml_context
-        //for (auto ptr : cb_data.v_pos) free(ptr->data);
-        //for (auto ptr : cb_data.v_neg) free(ptr->data);
+        // TODO @ngxson : find a more proper way to alloc / free tensors
+        for (auto ptr : cb_data.v_pos) free(ptr->data);
+        for (auto ptr : cb_data.v_neg) free(ptr->data);
         cb_data.v_pos.clear();
         cb_data.v_neg.clear();
     }

+    printf("Done evaluate prompts\n");
+
     concatenate_diffs(cb_data);
     pca(cb_data, cparams.n_threads);

     //printf("v_final %f %f \n", cb_data.v_final[0][0], cb_data.v_final[0][1]);

From 07dba13ab6653bbbe0c5eded502b58379d12915d Mon Sep 17 00:00:00 2001
From: Christian Zhou-Zheng
Date: Mon, 3 Jun 2024 17:40:19 -0400
Subject: [PATCH 24/56] temporary commit while I move dev environments

it finally outputs a functioning control vector - "functioning" in the
sense that it can be loaded and it clearly has the right idea, but
makes the model incoherent
---
 .../control-vector-generator.cpp              | 342 ++++++++++--------
 1 file changed, 189 insertions(+), 153 deletions(-)

diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp
index e2ee6208dfdcf..94c1939f8a07a 100644
--- a/examples/control-vector-generator/control-vector-generator.cpp
+++ b/examples/control-vector-generator/control-vector-generator.cpp
@@ -22,30 +22,50 @@

 // TODO read everything over and make sure it makes sense because I'm dropping logic errors left and right - Christian

+// to reduce the amount of stuff that gets sent to cb_eval I separated it somewhat - Christian
 struct callback_data {
     std::vector<uint8_t> data;
-    ggml_context * ctx_ggml;
+    ggml_context * ctx_ggml; // holds v_pos, v_neg

     int n_tokens = 0;
-    int n_embd = 0;
     bool is_eval_pos = true;

     // each element of the vector corresponds to one layer
-    std::vector<struct ggml_tensor *> v_pos;   // vector of matrices of size [n_embd, n_tokens]
-    std::vector<struct ggml_tensor *> v_neg;   // vector of matrices of size [n_embd, n_tokens]
-    std::vector<struct ggml_tensor *> v_final; // vector of finished vectors of size [n_embd]
-    std::vector<struct ggml_tensor *> v_diff;  // vector of matrices of size [n_embd, m] where m ~ n_tokens * n_completions
-
-    // each element of the outer vector corresponds to one layer, each element of the inner vector corresponds to one prompt pass
-    std::vector<std::vector<struct ggml_tensor *>> v_diffs_wrapped; // vector of compiled diff matrices to be concatenated
+    std::vector<struct ggml_tensor *> v_pos; // vector of matrices of size [n_embd, n_tokens]
+    std::vector<struct ggml_tensor *> v_neg; // vector of matrices of size [n_embd, n_tokens]

-    // TODO ggml destructor?
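// ----[ review note ]---- On the alloc/free question raised in the previous patch: with
// no_alloc = true, ctx_ggml owns only tensor *metadata*; the data pointers patched in via
// malloc() are separate heap allocations. The pairing therefore has to free the data
// first, then the metadata pool that points at it — and note that calling free() on the
// tensor structs themselves (as the destructor just below does) is undefined, since they
// live inside the context's memory pool. Sketch of a per-iteration reset under that
// scheme:
//
//     for (auto * t : cb_data.v_pos) free(t->data);  // heap data first...
//     for (auto * t : cb_data.v_neg) free(t->data);
//     cb_data.v_pos.clear();
//     cb_data.v_neg.clear();
//     ggml_free(cb_data.ctx_ggml);                   // ...then the tensor metadata pool
//     cb_data.ctx_ggml = ggml_init(params_ggml);     // fresh pool for the next prompt pair
//
// ----[ end review note ]----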
     ~callback_data() {
         for (auto ptr : v_pos) free(ptr);
         for (auto ptr : v_neg) free(ptr);
+        ggml_free(ctx_ggml);
+    }
+};
+
+// I prefer having the different contexts so we can free each immediately after we're done using it
+// e.g. we don't need the diffs_wrapped once we strip zero rows + concatenate them, so we can ggml_free it then, etc.
+// @ngxson let me know what you think - @christianazinn
+struct diff_ctx {
+    int n_embd    = 0;
+    int n_threads = 8;
+
+    ggml_context * ctx_diffs_wrapped; // holds v_diffs_wrapped
+    ggml_context * ctx_diff;          // holds v_diff
+    ggml_context * ctx_final;         // holds v_final
+
+    // each element of the vector corresponds to one layer
+    std::vector<struct ggml_tensor *> v_diff;  // vector of matrices of size [n_embd, m] where m ~ n_tokens * n_completions
+    std::vector<struct ggml_tensor *> v_final; // vector of vectors of size [n_embd] to be written to file
+
+    // each element of the outer vector corresponds to one layer, each element of the inner vector corresponds to one prompt pass
+    std::vector<std::vector<struct ggml_tensor *>> v_diffs_wrapped; // vector of compiled diff matrices of size [n_embd, n_tokens] to be concatenated
+
+    ~diff_ctx() {
+        for (auto ptr : v_diff)  free(ptr);
+        for (auto ptr : v_final) free(ptr);
+        for (auto & vec : v_diffs_wrapped) for (auto ptr : vec) free(ptr);
+        ggml_free(ctx_diffs_wrapped);
+        ggml_free(ctx_diff);
+        ggml_free(ctx_final);
+    }
+};

@@ -289,9 +309,6 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
     // copy the data from the GPU memory if needed
     const bool is_host = ggml_backend_buffer_is_host(t->buffer);

-    // FIXME something is very wrong here
-    // v_pos and v_neg are being populated, but the values aren't correct - it writes the same values to all vectors, it looks like?
-    // this leads ultimately to an error in calc_diff where diff becomes entirely zeroes and eventually a segfault several iterations into pca
     struct ggml_tensor * t_host;
     auto n_bytes = ggml_nbytes(t);
     t_host = ggml_new_tensor_2d(cb_data->ctx_ggml, t->type, t->ne[0], t->ne[1]);
@@ -328,123 +345,104 @@ static void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens,
     }
 }

-static bool is_row_all_zeros(struct ggml_tensor * diff, int row, int cols, float eps = 1e-6) {
-    for (int i = 0; i < cols; ++i) {
-        if (ggml_get_f32_nd(diff, i, row, 0, 0) > eps) {
-            return false;
-        }
-    }
-    return true;
-}
-
-static void calc_diff(callback_data & cb_data) {
+static void calc_diff(callback_data & cb_data, diff_ctx & dctx) {
     // TODO: assert cb_data.v_pos.size() == cb_data.v_neg.size()
-    cb_data.v_diffs_wrapped.resize(cb_data.v_pos.size());
+    dctx.v_diffs_wrapped.resize(cb_data.v_pos.size());
     for (size_t il = 0; il < cb_data.v_pos.size(); il++) {
+        std::cout << "il: " << il << " of " << cb_data.v_pos.size()-1 << std::endl;
+
         auto & inp_pos = cb_data.v_pos[il];
         auto & inp_neg = cb_data.v_neg[il];
         auto n_bytes = ggml_nbytes(inp_pos);

-        struct ggml_init_params params = {
-            /*.mem_size   =*/ n_bytes + ggml_tensor_overhead(),
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ false,
-        };
-        struct ggml_context * ctx_data = ggml_init(params);
-
         printf("inp_pos [0][%d]: %f\n", DEBUG_POS, ggml_get_f32_nd(inp_pos, 0, DEBUG_POS, 0, 0));
         printf("inp_neg [0][%d]: %f\n", DEBUG_POS, ggml_get_f32_nd(inp_neg, 0, DEBUG_POS, 0, 0));

         // TODO is this the best way to get the dimensions? I don't know which way n_embd/n_tokens go
         // for that matter, can we get rid of the n_embd/n_tokens fields in favor of ne[0]/ne[1]?
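// ----[ review note ]---- Answering the dimension TODO: ggml lays tensors out with ne[0]
// as the contiguous axis, and ggml_get_f32_nd/ggml_set_f32_nd take indices in
// (i0, i1, i2, i3) order. For these activations ne[0] == n_embd and ne[1] == n_tokens, so
// the struct fields are indeed redundant with the tensor shape. The asserts the TODOs
// keep asking for, sketched:
//
//     GGML_ASSERT(inp_pos->ne[0] == inp_neg->ne[0] && inp_pos->ne[1] == inp_neg->ne[1]);
//     const int64_t n_embd   = inp_pos->ne[0]; // contiguous axis
//     const int64_t n_tokens = inp_pos->ne[1];
//
// ----[ end review note ]----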
-        struct ggml_tensor * dest = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, inp_pos->ne[0], inp_pos->ne[1]);
+        // TODO assert inp_pos->ne[0] == inp_neg->ne[0] && inp_pos->ne[1] == inp_neg->ne[1]
+        struct ggml_tensor * dest = ggml_new_tensor_2d(dctx.ctx_diffs_wrapped, GGML_TYPE_F32, inp_pos->ne[0], inp_pos->ne[1]);
+        dest->data = malloc(n_bytes); // TODO @ngxson get rid of this malloc somehow

-        for (size_t i = 0; i < cb_data.n_embd; i++) {
-            for (size_t j = 0; j < cb_data.n_tokens; j++) {
+        for (size_t i = 0; i < inp_pos->ne[0]; i++) {
+            for (size_t j = 0; j < inp_pos->ne[1]; j++) {
                 ggml_set_f32_nd(dest, i, j, 0, 0, ggml_get_f32_nd(inp_pos, i, j, 0, 0) - ggml_get_f32_nd(inp_neg, i, j, 0, 0));
             }
         }

         printf("dest [0][%d]: %f\n", DEBUG_POS, ggml_get_f32_nd(dest, 0, DEBUG_POS, 0, 0));

-        // TODO can we make this faster? like check during the above operation rather than on a second pass?
+        dctx.v_diffs_wrapped[il].push_back(dest);
+    }
+}
+
+// TODO nomenclature is probably wrong! this should be cols
+// row/col mixup has been giving me a headache this entire time because apparently ggml accesses 2d as [col][row] - @christianazinn
+// TODO check row/col because that's probably where the logic error is
+static bool is_row_all_zeros(struct ggml_tensor * diff, int row, int cols, float eps = 1e-6) {
+    for (int i = 0; i < cols; ++i) {
+        if (ggml_get_f32_nd(diff, i, row, 0, 0) > eps) {
+            return false;
+        }
+    }
+    return true;
+}
+
+static void concatenate_diffs(diff_ctx & dctx) {
+    // TODO can you do this inplace?
+    // TODO assert each tensor has the same ->ne[0] and it equals dctx.n_embd
+    printf("concatenate_diffs\n");
+    for (size_t il = 0; il < dctx.v_diffs_wrapped.size(); ++il) {
+        printf("il: %zu of %zu\n", il, dctx.v_diffs_wrapped.size()-1);
+        std::vector<struct ggml_tensor *> & vec = dctx.v_diffs_wrapped[il];
+
+        std::cout << "vec size: " << vec.size() << std::endl;

         // strip zero rows
-        std::vector<int> nonzero_rows;
+        int n_nonzero_rows = 0;
+        std::vector<std::vector<int>> nonzero_rows; // outer vector is tensor idx, inner vector is row in tensor
+        nonzero_rows.resize(vec.size());
+        for (int i = 0; i < vec.size(); ++i) {
+            for (int j = 0; j < vec[i]->ne[1]; ++j) {
+                if (!is_row_all_zeros(vec[i], j, vec[i]->ne[0])) {
+                    nonzero_rows[i].push_back(j);
+                    n_nonzero_rows++;
+                }
+            }
+        }
+
+        std::cout << "n_nonzero_rows: " << n_nonzero_rows << std::endl;
+
+        // we transpose it here because ggml mul_mat is really weird
+        struct ggml_tensor * diff = ggml_new_tensor_2d(dctx.ctx_diff, GGML_TYPE_F32, n_nonzero_rows, dctx.n_embd);
+
+        diff->data = malloc(dctx.n_embd * n_nonzero_rows * sizeof(float) + ggml_tensor_overhead()); // @ngxson get rid of this malloc somehow
+
+        for (size_t i = 0; i < nonzero_rows.size(); ++i) {
-            // probably eschew this in favor of the iterative method?
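// ----[ review note ]---- The "really weird" mul_mat convention, spelled out:
// ggml_mul_mat(ctx, a, b) requires a->ne[0] == b->ne[0] (the contracted axis) and returns
// a tensor of shape [a->ne[1], b->ne[1]] whose (i, j) entry is the dot product, along
// ne[0], of a's slice at ne[1] == i with b's slice at ne[1] == j. Laying diff out as
// [n_nonzero_rows, n_embd] therefore makes ggml_mul_mat(diff, diff) produce exactly the
// [n_embd, n_embd] matrix diff^T * diff that PCA needs. One hazard in the write loop
// below: the output index is the tensor index i, not a running count of copied rows, so a
// tensor contributing more than one nonzero row overwrites its own data. A counter as in
// this sketch would satisfy the "assert row == n_nonzero_rows" TODO:
//
//     int row = 0;
//     for (size_t i = 0; i < nonzero_rows.size(); ++i) {
//         for (int j : nonzero_rows[i]) {
//             for (int64_t k = 0; k < vec[i]->ne[0]; k++) {
//                 ggml_set_f32_nd(diff, row, k, 0, 0, ggml_get_f32_nd(vec[i], k, j, 0, 0));
//             }
//             row++;
//         }
//     }
//     GGML_ASSERT(row == n_nonzero_rows);
//
// ----[ end review note ]----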
- //float * origin = (float *)(dest->data) + nonzero_rows[i] * cb_data.n_embd; - //memcpy((float *)(diff->data) + offset, origin, cb_data.n_embd * sizeof(float)); - //offset += cb_data.n_embd; - - for (size_t j = 0; j < cb_data.n_embd; j++) { - ggml_set_f32_nd(diff, j, i, 0, 0, ggml_get_f32_nd(dest, j, nonzero_rows[i], 0, 0)); + for (size_t j : nonzero_rows[i]) { + for (size_t k = 0; k < vec[i]->ne[0]; k++) { + //std::cout << ggml_get_f32_nd(vec[i], k, j, 0, 0) << std::endl; + ggml_set_f32_nd(diff, i, k, 0, 0, ggml_get_f32_nd(vec[i], k, j, 0, 0)); + } } } - // FIXME ggml_nbytes(diff) is 0 - - cb_data.v_diffs_wrapped[il].push_back(diff); - ggml_free(ctx_data); - ggml_free(ctx_data2); - } -} + printf("diff[0][1]: %f\n", ggml_get_f32_nd(diff, 0, 1, 0, 0)); -static void concatenate_diffs(callback_data & cb_data) { - printf("concatenate_diffs\n"); - for (size_t i = 0; i < cb_data.v_diffs_wrapped.size(); ++i) { - std::vector & vec = cb_data.v_diffs_wrapped[i]; - size_t n_rows_total = 0; - for (size_t j = 0; j < vec.size(); ++j) { - n_rows_total += vec[j]->ne[1]; - } + // TODO assert row == n_nonzero_rows - struct ggml_init_params params = { - /*.mem_size =*/ cb_data.n_embd * n_rows_total * sizeof(float) + ggml_tensor_overhead(), - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - struct ggml_context * ctx_data = ggml_init(params); - - // std::cout << "n_rows_total: " << n_rows_total << std::endl; - struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, cb_data.n_embd, n_rows_total); - - size_t offset = 0; - for (size_t j = 0; j < vec.size(); ++j) { - float * origin = (float *)(vec[j]->data); - memcpy((float *)(diff->data) + offset, origin, vec[j]->ne[1] * cb_data.n_embd * sizeof(float)); - offset += vec[j]->ne[0] * cb_data.n_embd; - } - - cb_data.v_diff.push_back(diff); + dctx.v_diff.push_back(diff); } + ggml_free(dctx.ctx_diffs_wrapped); } +// TODO translate everything below this +// TODO make sure to free everything in a timely manner + struct pca_model { struct ggml_tensor * v_diff_original; struct ggml_tensor * square; + struct ggml_tensor * square_transpose; struct ggml_tensor * eigenvector; ggml_backend_t backend = NULL; @@ -452,7 +450,7 @@ struct pca_model { struct ggml_context * ctx; }; -void load_pca_model(pca_model & model, struct ggml_tensor * v_diff_original, const int n_embd) { +void load_pca_model(pca_model & model, struct ggml_tensor * v_diff_original) { #ifdef GGML_USE_CUDA fprintf(stderr, "%s: using CUDA backend\n", __func__); model.backend = ggml_backend_cuda_init(0); // init device 0 @@ -474,8 +472,10 @@ void load_pca_model(pca_model & model, struct ggml_tensor * v_diff_original, con if (!model.backend) { model.backend = ggml_backend_cpu_init(); } + + printf("v_diff_original[0][1]: %f\n", ggml_get_f32_nd(v_diff_original, 0, 1, 0, 0)); - const int num_tensors = 3; + const int num_tensors = 4; struct ggml_init_params params { /*.mem_size =*/ ggml_tensor_overhead() * num_tensors, @@ -486,19 +486,21 @@ void load_pca_model(pca_model & model, struct ggml_tensor * v_diff_original, con model.ctx = ggml_init(params); model.v_diff_original = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, v_diff_original->ne[0], v_diff_original->ne[1]); - model.square = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_embd, n_embd); - model.eigenvector = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, n_embd); + model.square = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, v_diff_original->ne[1], v_diff_original->ne[1]); + model.square_transpose = ggml_new_tensor_2d(model.ctx, 
GGML_TYPE_F32, v_diff_original->ne[1], v_diff_original->ne[1]); + model.eigenvector = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, v_diff_original->ne[1]); model.buffer = ggml_backend_alloc_ctx_tensors(model.ctx, model.backend); ggml_backend_tensor_set(model.v_diff_original, v_diff_original->data, 0, ggml_nbytes(v_diff_original)); - // no need to load anything into square yet + + // no need to load anything into square or square_transpose yet // initialize model.eigenvector to random vector - std::vector random_vec = std::vector(); + std::vector random_vec; std::default_random_engine generator(static_cast(std::time(0))); std::uniform_real_distribution distribution(0.0, 1.0); - for (int i = 0; i < n_embd; ++i) { + for (int i = 0; i < v_diff_original->ne[1]; ++i) { random_vec.push_back(distribution(generator)); } @@ -519,6 +521,7 @@ struct ggml_cgraph * square_diff_graph(const pca_model & model) { struct ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_tensor * square = ggml_mul_mat(ctx0, model.v_diff_original, model.v_diff_original); + //struct ggml_tensor * square_transpose = ggml_transpose(ctx0, square); ggml_build_forward_expand(gf, square); @@ -526,6 +529,7 @@ struct ggml_cgraph * square_diff_graph(const pca_model & model) { return gf; } +// TODO do this before pca so the pca_model is easier to malloc? struct ggml_tensor * compute_square(const pca_model & model, ggml_gallocr_t allocr, int n_threads) { struct ggml_cgraph * gf = square_diff_graph(model); @@ -589,51 +593,44 @@ struct ggml_tensor * compute_piter(const pca_model & model, ggml_gallocr_t alloc return gf->nodes[gf->n_nodes - 1]; } -static void power_iteration(callback_data & cb_data, int idx, int n_threads, int maxIterations = 1000, float tolerance = 1e-7) { +static void power_iteration(diff_ctx & dctx, int idx, int maxIterations = 1000, float tolerance = 1e-7) { printf("in power iteration\n"); + pca_model model; - load_pca_model(model, cb_data.v_diff[idx], cb_data.n_embd); + load_pca_model(model, dctx.v_diff[idx]); + std::cout << "model.v_diff_original->ne[0]: " << model.v_diff_original->ne[0] << std::endl; ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - // FIXME ggml_nbytes(square) is 0 because everything going back to diff in calc_diff is 0 - struct ggml_tensor * square = compute_square(model, allocr, n_threads); - ggml_backend_tensor_set(model.square, square->data, 0, ggml_nbytes(square)); + struct ggml_tensor * square = compute_square(model, allocr, dctx.n_threads); + ggml_backend_tensor_set(model.square, square->data, 0, ggml_nbytes(model.square)); // yes? ggml_gallocr_free(allocr); + struct ggml_init_params host_params = { + /*.mem_size =*/ (dctx.n_embd * sizeof(float) + ggml_tensor_overhead()) * 2u, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + struct ggml_context * host_ctx = ggml_init(host_params); + + struct ggml_tensor * host_old_eigenvector = ggml_new_tensor_1d(host_ctx, GGML_TYPE_F32, dctx.n_embd); + struct ggml_tensor * host_new_eigenvector = ggml_new_tensor_1d(host_ctx, GGML_TYPE_F32, dctx.n_embd); + for (int iter = 0; iter < maxIterations; ++iter) { // TODO do I need to reset it like this every time? 
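// ----[ review note ]---- A caution on the square hand-off a few lines up:
// ggml_backend_tensor_set(model.square, square->data, ...) treats square->data as a host
// pointer, but when the CUDA/Metal backend is active the graph result lives in device
// memory. A device-aware copy — assumed available in this ggml vintage as
// ggml_backend_tensor_copy(src, dst) — or a bounce through a host buffer is safer:
//
//     std::vector<uint8_t> tmp(ggml_nbytes(square));           // host bounce buffer
//     ggml_backend_tensor_get(square, tmp.data(), 0, tmp.size());       // device -> host
//     ggml_backend_tensor_set(model.square, tmp.data(), 0, tmp.size()); // host -> device
//
// ----[ end review note ]----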
allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - // i have no idea how ggml_contexts work so i'm making a different one for the original and the old one - struct ggml_init_params hov_params = { - /*.mem_size =*/ cb_data.n_embd * sizeof(float) + ggml_tensor_overhead(), - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - struct ggml_context * hov_ctx = ggml_init(hov_params); - - struct ggml_init_params hnv_params = { - /*.mem_size =*/ cb_data.n_embd * sizeof(float) + ggml_tensor_overhead(), - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - struct ggml_context * hnv_ctx = ggml_init(hnv_params); - - struct ggml_tensor * host_old_eigenvector = ggml_new_tensor_1d(hov_ctx, GGML_TYPE_F32, cb_data.n_embd); - struct ggml_tensor * host_new_eigenvector = ggml_new_tensor_1d(hnv_ctx, GGML_TYPE_F32, cb_data.n_embd); - - struct ggml_tensor * b_tensor = compute_piter(model, allocr, n_threads, tolerance); + struct ggml_tensor * b_tensor = compute_piter(model, allocr, dctx.n_threads, tolerance); ggml_backend_tensor_get(b_tensor, host_new_eigenvector->data, 0, ggml_nbytes(b_tensor)); ggml_backend_tensor_get(model.eigenvector, host_old_eigenvector->data, 0, ggml_nbytes(model.eigenvector)); // convergence check float diff = 0.0; - for (int i = 0; i < cb_data.n_embd; ++i) { + for (int i = 0; i < dctx.n_embd; ++i) { diff += std::pow((ggml_get_f32_1d(host_new_eigenvector, i) - ggml_get_f32_1d(host_old_eigenvector, i)), 2); } @@ -649,36 +646,29 @@ static void power_iteration(callback_data & cb_data, int idx, int n_threads, int // catch division by zero I guess break; } - - ggml_free(hnv_ctx); } - ggml_backend_tensor_get(model.eigenvector, cb_data.v_final[idx]->data, 0, ggml_nbytes(model.eigenvector)); + ggml_backend_tensor_get(model.eigenvector, dctx.v_final[idx]->data, 0, ggml_nbytes(model.eigenvector)); ggml_gallocr_free(allocr); + ggml_free(host_ctx); ggml_free(model.ctx); ggml_backend_buffer_free(model.buffer); ggml_backend_free(model.backend); } -static void pca(callback_data & cb_data, int n_threads) { +static void pca(diff_ctx & dctx) { printf("Running PCA...\n"); - for (int il = 0; il < cb_data.v_diff.size(); ++il) { - struct ggml_init_params params = { - /*.mem_size =*/ cb_data.n_embd * sizeof(float) + ggml_tensor_overhead(), - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - struct ggml_context * ctx_data = ggml_init(params); - cb_data.v_final.push_back(ggml_new_tensor_1d(ctx_data, GGML_TYPE_F32, cb_data.n_embd)); - power_iteration(cb_data, il, n_threads); + for (int il = 0; il < dctx.v_diff.size(); ++il) { + dctx.v_final.push_back(ggml_new_tensor_1d(dctx.ctx_final, GGML_TYPE_F32, dctx.n_embd)); + power_iteration(dctx, il); printf("Done with layer %d\n", il); - printf("il = %d | %f %f \n", il, ggml_get_f32_1d(cb_data.v_final[il], 0), ggml_get_f32_1d(cb_data.v_final[il], 1)); + printf("il = %d | %f %f \n", il, ggml_get_f32_1d(dctx.v_final[il], 0), ggml_get_f32_1d(dctx.v_final[il], 1)); } printf("Done with PCA.\n"); } -static void export_gguf(callback_data & cb_data, int n_layers, const std::string fname, const std::string model_hint) { +static void export_gguf(diff_ctx & dctx, int n_layers, const std::string fname, const std::string model_hint) { struct gguf_context * ctx = gguf_init_empty(); size_t v_final_size_eff = n_layers - 1; @@ -693,9 +683,11 @@ static void export_gguf(callback_data & cb_data, int n_layers, const std::string // i'm pretty sure it's right now const std::string name = "direction." 
+ to_string(i+1); - ggml_set_name(cb_data.v_final[i], name.c_str()); + std::cout << "dctx.v_final[i][0][1]: " << ggml_get_f32_nd(dctx.v_final[i], 0, 1, 0, 0) << std::endl; + + ggml_set_name(dctx.v_final[i], name.c_str()); - gguf_add_tensor(ctx, cb_data.v_final[i]); + gguf_add_tensor(ctx, dctx.v_final[i]); printf("Added tensor %zu\n", i); } @@ -752,17 +744,47 @@ int main(int argc, char ** argv) { int n_ctx = llama_n_ctx(ctx); int n_layers = llama_n_layer(model); int n_embd = llama_n_embd(model); - cb_data.n_embd = n_embd; int n_prompts = cparams.positive_prompts.size(); // init ctx_ggml struct ggml_init_params params_ggml = { - /*.mem_size =*/ ggml_tensor_overhead() * n_prompts * n_layers * 4u, + /*.mem_size =*/ ggml_tensor_overhead() * n_layers * 2u, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; cb_data.ctx_ggml = ggml_init(params_ggml); + // init diff_ctx + diff_ctx dctx; + + // FIXME FIXME FIXME we are running out of memory here + // n_prompts should really be n_tokens damnit - remove the 2u and adapt + // we will either have to pretokenize everything so we know how much memory to allocate + // or allocate the tensor overhead as we go + struct ggml_init_params params_diffs_wrapped = { + /*.mem_size =*/ ggml_tensor_overhead() * n_prompts * n_layers * 16u, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + // this we know how much overhead to allocate in advance + struct ggml_init_params params_diff = { + /*.mem_size =*/ ggml_tensor_overhead() * n_layers, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + // and this we know exactly how much memory to allocate in advance without malloc() hacks + struct ggml_init_params params_final = { + /*.mem_size =*/ n_embd * sizeof(float) * n_layers + + ggml_tensor_overhead() * n_layers, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + dctx.n_embd = n_embd; + dctx.n_threads = cparams.n_threads; + dctx.ctx_diffs_wrapped = ggml_init(params_diffs_wrapped); + dctx.ctx_diff = ggml_init(params_diff); + dctx.ctx_final = ggml_init(params_final); + // create templated prompts for (int i = 0; i < n_prompts; ++i) { populate_entries(cparams, cparams.positive_prompts[i], cparams.negative_prompts[i]); @@ -804,24 +826,33 @@ int main(int argc, char ** argv) { cb_data.is_eval_pos = false; get_hidden_layers(ctx, tokens_neg); - // TODO check whether the same tokens correspond to zero rows because we don't seem to be getting many zero rows anymore - // we get a lot of zero rows for the first few prompts and then they drop off - // likewise most of the zero rows are in the first few layers for each prompt - - calc_diff(cb_data); + calc_diff(cb_data, dctx); // reset for next iteration // TODO @ngxson : find a more proper way to alloc / free tensors - for (auto ptr : cb_data.v_pos) free(ptr->data); - for (auto ptr : cb_data.v_neg) free(ptr->data); + ggml_free(cb_data.ctx_ggml); + // TODO move this to the top of the loop and remove the ggml_free() outside + cb_data.ctx_ggml = ggml_init(params_ggml); cb_data.v_pos.clear(); cb_data.v_neg.clear(); } + // TODO we can actually delete cb_data here + //ggml_free(cb_data.ctx_ggml); + + printf("dctx.v_diffs_wrapped[0][0][2]: %f\n", ggml_get_f32_nd(dctx.v_diffs_wrapped[0][0], 0, 2, 0, 0)); + printf("Done evaluate prompts\n"); - concatenate_diffs(cb_data); - pca(cb_data, cparams.n_threads); + concatenate_diffs(dctx); + + printf("dctx.v_diff[0][0][1]: %f\n", ggml_get_f32_nd(dctx.v_diff[0], 0, 1, 0, 0)); + + printf("Done concatenate diffs\n"); + + // code is known to work up to here + + pca(dctx); //printf("v_final 
%f %f \n", cb_data.v_final[0][0], cb_data.v_final[0][1]); llama_free(ctx); @@ -831,9 +862,14 @@ int main(int argc, char ** argv) { // we need get_arch_name() from llama.cpp // TODO also has support been implemeneted for arches other than llama yet? see #5970 std::string model_hint = "llama"; - export_gguf(cb_data, n_layers, cparams.outfile, model_hint); + export_gguf(dctx, n_layers, cparams.outfile, model_hint); llama_backend_free(); + std::cout << "okay which of you is failing" << std::endl; + + // TODO free(): invalid pointer after the entire program is done???????? + // probably because destructors free after you've already manually freed + // TODO fix destructor/ggml_free positioning return 0; } From 23fd1b587cb91af06a6e5800282f20ebbf6f7e72 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Mon, 3 Jun 2024 21:14:43 -0400 Subject: [PATCH 25/56] update debug statements --- .../control-vector-generator.cpp | 32 ++++++++----------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index 94c1939f8a07a..59335f826b5cb 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -22,7 +22,7 @@ // TODO read everything over and make sure it makes sense because I'm dropping logic errors left and right - Christian -// to reduce the amount of stuff that gets sent to cb_eval I separated it somewhat - Christian +// to reduce the amount of stuff that gets sent to cb_eval this is only what cb_eval actually needs struct callback_data { std::vector data; ggml_context * ctx_ggml; // holds v_pos, v_neg @@ -34,11 +34,13 @@ struct callback_data { std::vector v_pos; // vector of matrices of size [n_embd, n_tokens] std::vector v_neg; // vector of matrices of size [n_embd, n_tokens] + // TODO I free everything as soon as it's unnecessary, rather than letting this live until the end of main() - is this undesirable? + /* ~callback_data() { for (auto ptr : v_pos) free(ptr); for (auto ptr : v_neg) free(ptr); ggml_free(ctx_ggml); - } + }*/ }; // I prefer having the different contexts so we can free each immediately after we're done using it @@ -51,7 +53,7 @@ struct diff_ctx { ggml_context * ctx_diffs_wrapped; // holds v_diffs_wrapped ggml_context * ctx_diff; // holds v_diff ggml_context * ctx_final; // holds v_final - + // each element of the vector correspond to one layer std::vector v_diff; // vector of matrices of size [n_embd, m] where m ~ n_tokens * n_completions std::vector v_final; // vector of vectors of size [n_embd] to be written to file @@ -62,10 +64,9 @@ struct diff_ctx { ~diff_ctx() { for (auto ptr : v_diff) free(ptr); for (auto ptr : v_final) free(ptr); - for (auto & vec : v_diffs_wrapped) for (auto ptr : vec) free(ptr); - ggml_free(ctx_diffs_wrapped); ggml_free(ctx_diff); ggml_free(ctx_final); + // ctx_diffs_wrapped is freed in concatenate_diffs as soon as we're done with it - see above. undesirable? 
} }; @@ -396,8 +397,6 @@ static void concatenate_diffs(diff_ctx & dctx) { printf("il: %zu of %zu\n", il, dctx.v_diffs_wrapped.size()-1); std::vector & vec = dctx.v_diffs_wrapped[il]; - std::cout << "vec size: " << vec.size() << std::endl; - // strip zero rows int n_nonzero_rows = 0; std::vector> nonzero_rows; // outer vector is tensor idx, inner vector is row in tensor @@ -411,7 +410,7 @@ static void concatenate_diffs(diff_ctx & dctx) { } } - std::cout << "n_nonzero_rows: " << n_nonzero_rows << std::endl; + printf("n_nonzero_rows: %d\n", n_nonzero_rows); // we transpose it here because ggml mul_mat is really weird struct ggml_tensor * diff = ggml_new_tensor_2d(dctx.ctx_diff, GGML_TYPE_F32, n_nonzero_rows, dctx.n_embd); @@ -427,13 +426,14 @@ static void concatenate_diffs(diff_ctx & dctx) { } } - printf("diff[0][1]: %f\n", ggml_get_f32_nd(diff, 0, 1, 0, 0)); + printf("diff[0][%d]: %f\n", DEBUG_POS, ggml_get_f32_nd(diff, 0, DEBUG_POS, 0, 0)); // TODO assert row == n_nonzero_rows dctx.v_diff.push_back(diff); } - ggml_free(dctx.ctx_diffs_wrapped); + //for (auto & vec : dctx.v_diffs_wrapped) for (auto ptr : vec) free(ptr); + ggml_free(dctx.ctx_diffs_wrapped); } // TODO translate everything below this @@ -473,7 +473,7 @@ void load_pca_model(pca_model & model, struct ggml_tensor * v_diff_original) { model.backend = ggml_backend_cpu_init(); } - printf("v_diff_original[0][1]: %f\n", ggml_get_f32_nd(v_diff_original, 0, 1, 0, 0)); + printf("v_diff_original[0][%d]: %f\n", DEBUG_POS, ggml_get_f32_nd(v_diff_original, 0, DEBUG_POS, 0, 0)); const int num_tensors = 4; @@ -529,7 +529,6 @@ struct ggml_cgraph * square_diff_graph(const pca_model & model) { return gf; } -// TODO do this before pca so the pca_model is easier to malloc? struct ggml_tensor * compute_square(const pca_model & model, ggml_gallocr_t allocr, int n_threads) { struct ggml_cgraph * gf = square_diff_graph(model); @@ -598,14 +597,12 @@ static void power_iteration(diff_ctx & dctx, int idx, int maxIterations = 1000, pca_model model; load_pca_model(model, dctx.v_diff[idx]); - std::cout << "model.v_diff_original->ne[0]: " << model.v_diff_original->ne[0] << std::endl; ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); struct ggml_tensor * square = compute_square(model, allocr, dctx.n_threads); ggml_backend_tensor_set(model.square, square->data, 0, ggml_nbytes(model.square)); - // yes? ggml_gallocr_free(allocr); struct ggml_init_params host_params = { @@ -683,7 +680,7 @@ static void export_gguf(diff_ctx & dctx, int n_layers, const std::string fname, // i'm pretty sure it's right now const std::string name = "direction." 
+ to_string(i+1); - std::cout << "dctx.v_final[i][0][1]: " << ggml_get_f32_nd(dctx.v_final[i], 0, 1, 0, 0) << std::endl; + printf("dctx.v_final[i][%d]: %f\n", DEBUG_POS, ggml_get_f32_1d(dctx.v_final[i], DEBUG_POS)); ggml_set_name(dctx.v_final[i], name.c_str()); @@ -838,15 +835,14 @@ int main(int argc, char ** argv) { } // TODO we can actually delete cb_data here - //ggml_free(cb_data.ctx_ggml); - printf("dctx.v_diffs_wrapped[0][0][2]: %f\n", ggml_get_f32_nd(dctx.v_diffs_wrapped[0][0], 0, 2, 0, 0)); + printf("dctx.v_diffs_wrapped[0][0][%d]: %f\n", DEBUG_POS, ggml_get_f32_nd(dctx.v_diffs_wrapped[0][0], 0, DEBUG_POS, 0, 0)); printf("Done evaluate prompts\n"); concatenate_diffs(dctx); - printf("dctx.v_diff[0][0][1]: %f\n", ggml_get_f32_nd(dctx.v_diff[0], 0, 1, 0, 0)); + printf("dctx.v_diff[0][0][%d]: %f\n", DEBUG_POS, ggml_get_f32_nd(dctx.v_diff[0], 0, DEBUG_POS, 0, 0)); printf("Done concatenate diffs\n"); From 3815a0c3069fb8bb80c7d5913fb213c2ac62c98d Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Mon, 3 Jun 2024 21:26:13 -0400 Subject: [PATCH 26/56] pre-tokenize so we can allocate correct memory to ctx_diffs_wrapped --- .../control-vector-generator.cpp | 65 ++++++++++++------- 1 file changed, 43 insertions(+), 22 deletions(-) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index 59335f826b5cb..0135dfb18c1fe 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -91,6 +91,14 @@ struct ctrl_params { std::vector negative_entries; }; +struct tokenized_prompt { + std::string positive; + std::string negative; + std::vector tokens_pos; + std::vector tokens_neg; + size_t max_seq_len; +}; + template static std::string to_string(const T & val) { std::stringstream ss; @@ -713,11 +721,11 @@ int main(int argc, char ** argv) { cparams.positive_prompts = ctrlvec_load_prompt_file(cparams.positive_prompts_file); cparams.negative_prompts = ctrlvec_load_prompt_file(cparams.negative_prompts_file); if (cparams.positive_prompts.size() != cparams.negative_prompts.size()) { - fprintf(stderr, "number of positive and negative prompts must be equal"); + fprintf(stderr, "number of positive and negative prompts must be equal\n"); return 1; } if (cparams.positive_prompts.empty()) { - fprintf(stderr, "must provide at least one prompt pair"); + fprintf(stderr, "must provide at least one prompt pair\n"); return 1; } @@ -751,6 +759,29 @@ int main(int argc, char ** argv) { }; cb_data.ctx_ggml = ggml_init(params_ggml); + // create templated prompts + for (int i = 0; i < n_prompts; ++i) { + populate_entries(cparams, cparams.positive_prompts[i], cparams.negative_prompts[i]); + } + + // we have to pretokenize everything because otherwise we don't know how much overhead to allocate ctx_diffs_wrapped + std::vector tokenized_prompts; + size_t n_total_tokens = 0; + for (size_t i = 0; i < cparams.positive_entries.size(); ++i) { + tokenized_prompt t; + t.positive = cparams.positive_entries[i]; + t.negative = cparams.negative_entries[i]; + t.tokens_pos = ::llama_tokenize(ctx, t.positive, false); + t.tokens_neg = ::llama_tokenize(ctx, t.negative, false); + t.max_seq_len = std::max(t.tokens_pos.size(), t.tokens_neg.size()); + padding_seq(ctx, t.tokens_pos, t.max_seq_len); + padding_seq(ctx, t.tokens_neg, t.max_seq_len); + n_total_tokens += 2 * t.max_seq_len; + tokenized_prompts.push_back(t); + } + + std::cout << "n_total_tokens: " << 
n_total_tokens << std::endl; + // init diff_ctx diff_ctx dctx; @@ -759,7 +790,7 @@ int main(int argc, char ** argv) { // we will either have to pretokenize everything so we know how much memory to allocate // or allocate the tensor overhead as we go struct ggml_init_params params_diffs_wrapped = { - /*.mem_size =*/ ggml_tensor_overhead() * n_prompts * n_layers * 16u, + /*.mem_size =*/ ggml_tensor_overhead() * n_total_tokens, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; @@ -782,46 +813,35 @@ int main(int argc, char ** argv) { dctx.ctx_diff = ggml_init(params_diff); dctx.ctx_final = ggml_init(params_final); - // create templated prompts - for (int i = 0; i < n_prompts; ++i) { - populate_entries(cparams, cparams.positive_prompts[i], cparams.negative_prompts[i]); - } - const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); int token_ct = 0; for(size_t i = 0; i < cparams.positive_entries.size(); ++i) { - std::string positive_prompt = cparams.positive_entries[i]; - std::string negative_prompt = cparams.negative_entries[i]; - std::vector tokens_pos = ::llama_tokenize(ctx, positive_prompt, add_bos); - std::vector tokens_neg = ::llama_tokenize(ctx, negative_prompt, add_bos); - size_t max_seq_len = std::max(tokens_pos.size(), tokens_neg.size()); - padding_seq(ctx, tokens_pos, max_seq_len); - padding_seq(ctx, tokens_neg, max_seq_len); - cb_data.n_tokens = max_seq_len; + tokenized_prompt t = tokenized_prompts[i]; + cb_data.n_tokens = t.max_seq_len; // need to reload the model so it doesn't run out of context // this should scale with -c option passed by main - token_ct += 2 * max_seq_len; + token_ct += 2 * t.max_seq_len; if (token_ct > n_ctx || cparams.always_reload) { //break; llama_free(ctx); llama_free_model(model); std::tie(model, ctx) = llama_init_from_gpt_params(params); - token_ct = 2 * max_seq_len; + token_ct = 2 * t.max_seq_len; } if (token_ct > n_ctx) { fprintf(stderr, "context size exceeded on iteration %zu\n", i); break; } - printf("Evaluating prompt: \"%s\" - \"%s\" (%ld tokens)\n", positive_prompt.c_str(), negative_prompt.c_str(), max_seq_len); + printf("Evaluating prompt: \"%s\" - \"%s\" (%ld tokens)\n", t.positive.c_str(), t.negative.c_str(), t.max_seq_len); cb_data.is_eval_pos = true; - get_hidden_layers(ctx, tokens_pos); + get_hidden_layers(ctx, t.tokens_pos); cb_data.is_eval_pos = false; - get_hidden_layers(ctx, tokens_neg); + get_hidden_layers(ctx, t.tokens_neg); calc_diff(cb_data, dctx); @@ -861,7 +881,8 @@ int main(int argc, char ** argv) { export_gguf(dctx, n_layers, cparams.outfile, model_hint); llama_backend_free(); - std::cout << "okay which of you is failing" << std::endl; + + printf("confirm we got here\n"); // TODO free(): invalid pointer after the entire program is done???????? 
// probably because destructors free after you've already manually freed From a42e783d7545d515bbb2583896d4b2a182bc82f6 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Mon, 3 Jun 2024 21:33:46 -0400 Subject: [PATCH 27/56] update comments --- .../control-vector-generator.cpp | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index 0135dfb18c1fe..4ca8559241db9 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -367,8 +367,6 @@ static void calc_diff(callback_data & cb_data, diff_ctx & dctx) { printf("inp_pos [0][%d]: %f\n", DEBUG_POS, ggml_get_f32_nd(inp_pos, 0, DEBUG_POS, 0, 0)); printf("inp_neg [0][%d]: %f\n", DEBUG_POS, ggml_get_f32_nd(inp_neg, 0, DEBUG_POS, 0, 0)); - // TODO is this the best way to get dimension? i don't know which way n_embd/n_tokens go - // for that matter can we get rid of n_embd/n_tokens fields in favor of ne[0]/ne[1]? // TODO assert inp_pos->ne[0] == inp_neg->ne[0] && inp_pos->ne[1] == inp_neg->ne[1] struct ggml_tensor * dest = ggml_new_tensor_2d(dctx.ctx_diffs_wrapped, GGML_TYPE_F32, inp_pos->ne[0], inp_pos->ne[1]); dest->data = malloc(n_bytes); // TODO @ngxson get rid of this malloc somehow @@ -385,9 +383,7 @@ static void calc_diff(callback_data & cb_data, diff_ctx & dctx) { } } -// TODO nomenclature is probably wrong! this should be cols -// row/col mixup has been giving me a headache this entire time because apparently ggml accesses 2d as [col][row] - @christianazinn -// TODO check row/col because that's probably where the logic error is +// 50/50 chance this should be cols but it works and I don't want to touch it - @christianazinn static bool is_row_all_zeros(struct ggml_tensor * diff, int row, int cols, float eps = 1e-6) { for (int i = 0; i < cols; ++i) { if (ggml_get_f32_nd(diff, i, row, 0, 0) > eps) { @@ -444,9 +440,6 @@ static void concatenate_diffs(diff_ctx & dctx) { ggml_free(dctx.ctx_diffs_wrapped); } -// TODO translate everything below this -// TODO make sure to free everything in a timely manner - struct pca_model { struct ggml_tensor * v_diff_original; struct ggml_tensor * square; @@ -785,10 +778,6 @@ int main(int argc, char ** argv) { // init diff_ctx diff_ctx dctx; - // FIXME FIXME FIXME we are running out of memory here - // n_prompts should really be n_tokens damnit - remove the 2u and adapt - // we will either have to pretokenize everything so we know how much memory to allocate - // or allocate the tensor overhead as we go struct ggml_init_params params_diffs_wrapped = { /*.mem_size =*/ ggml_tensor_overhead() * n_total_tokens, /*.mem_buffer =*/ NULL, @@ -854,7 +843,7 @@ int main(int argc, char ** argv) { cb_data.v_neg.clear(); } - // TODO we can actually delete cb_data here + // TODO we can actually delete cb_data here but do we want to? 
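+    // (at this point cb_data only holds the small context re-initialized after the last
+    // iteration, so deleting it would reclaim very little memory)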
printf("dctx.v_diffs_wrapped[0][0][%d]: %f\n", DEBUG_POS, ggml_get_f32_nd(dctx.v_diffs_wrapped[0][0], 0, DEBUG_POS, 0, 0)); From a710df749c793f1fd75349d722d23306d243f319 Mon Sep 17 00:00:00 2001 From: ngxson Date: Fri, 7 Jun 2024 15:37:58 +0200 Subject: [PATCH 28/56] (wip) refactor --- Makefile | 2 +- .../control-vector-generator/CMakeLists.txt | 2 +- .../control-vector-generator.cpp | 840 +++++++----------- examples/control-vector-generator/pca.hpp | 267 ++++++ 4 files changed, 566 insertions(+), 545 deletions(-) create mode 100644 examples/control-vector-generator/pca.hpp diff --git a/Makefile b/Makefile index c12a3e382435f..0e3850708ff46 100644 --- a/Makefile +++ b/Makefile @@ -838,7 +838,7 @@ eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -control-vector-generator: examples/control-vector-generator/control-vector-generator.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +control-vector-generator: examples/control-vector-generator/control-vector-generator.cpp examples/control-vector-generator/pca.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/examples/control-vector-generator/CMakeLists.txt b/examples/control-vector-generator/CMakeLists.txt index 2515d20116749..f3688e431d914 100644 --- a/examples/control-vector-generator/CMakeLists.txt +++ b/examples/control-vector-generator/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET control-vector-generator) -add_executable(${TARGET} control-vector-generator.cpp) +add_executable(${TARGET} control-vector-generator.cpp pca.hpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index 4ca8559241db9..35d607a59b9bc 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -1,6 +1,7 @@ #include "common.h" #include "llama.h" #include "ggml.h" +#include "pca.hpp" #ifdef GGML_USE_CUDA #include "ggml-cuda.h" @@ -18,55 +19,208 @@ #include #include -#define DEBUG_POS 2 -// TODO read everything over and make sure it makes sense because I'm dropping logic errors left and right - Christian +////////////////////////////////////////////////// -// to reduce the amount of stuff that gets sent to cb_eval this is only what cb_eval actually needs + +template +static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { + std::string ret; + for (; begin != end; ++begin) { + ret += llama_token_to_piece(ctx, *begin); + } + + return ret; +} + + +////////////////////////////////////////////////// + + +// cb_eval is reused for each pair of positive - negative prompt struct callback_data { - std::vector data; - ggml_context * ctx_ggml; // holds v_pos, v_neg + ggml_context * ctx_ggml = nullptr; // holds v_pos, v_neg, v_diff_filtered + int n_layers = 0; int n_tokens = 0; bool is_eval_pos = true; // each element of the vector correspond to one layer - std::vector v_pos; // vector of matrices of size [n_embd, n_tokens] - std::vector v_neg; // vector of matrices of size [n_embd, n_tokens] - - // TODO I free everything as 
soon as it's unnecessary, rather than letting this live until the end of main() - is this undesirable? - /* - ~callback_data() { - for (auto ptr : v_pos) free(ptr); - for (auto ptr : v_neg) free(ptr); - ggml_free(ctx_ggml); - }*/ -}; + std::vector v_pos; // vector of matrices of size [n_embd, n_tokens] + std::vector v_neg; // vector of matrices of size [n_embd, n_tokens] + std::vector v_diff_filtered; // vector of matrices of size [n_embd, n_nonzero_rows]. NOTE: n_nonzero_rows maybe different for each layer + + // save a tensor into either v_pos or v_neg (decided by is_eval_pos) + void save_tensor_for_layer(struct ggml_tensor * t) { + GGML_ASSERT(t->type == GGML_TYPE_F32); + + if (ctx_ggml == nullptr) { + // alloc a new ctx_ggml if needed + struct ggml_init_params params_ggml = { + /*.mem_size =*/ ggml_tensor_overhead() * n_layers * 3u, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ctx_ggml = ggml_init(params_ggml); + } -// I prefer having the different contexts so we can free each immediately after we're done using it -// e.g. we don't need the diffs_wrapped once we strip zero rows + concatenate them so we can ggml_free it, etc. -// @ngxson let me know what you think - @christianazinn -struct diff_ctx { - int n_embd = 0; - int n_threads = 8; + // copy tensor data + auto n_bytes = ggml_nbytes(t); + struct ggml_tensor * t_layer = ggml_new_tensor_2d(ctx_ggml, t->type, t->ne[0], t->ne[1]); + t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow + ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes); + ggml_set_name(t_layer, ggml_get_name(t)); + print_debug_tensor(t_layer); - ggml_context * ctx_diffs_wrapped; // holds v_diffs_wrapped - ggml_context * ctx_diff; // holds v_diff - ggml_context * ctx_final; // holds v_final + if (is_eval_pos) { + v_pos.push_back(t_layer); + } else { + v_neg.push_back(t_layer); + } + } + + // calculate diff (v_pos - v_neg) and place the result back to v_pos + // all zero rows in the diff tensor will also be removed + // NOTE: final layer is ignored. 
we only have (n_layers - 1) to process
+    std::vector<struct ggml_tensor *> calc_diff() {
+        for (size_t il = 0; il < v_pos.size(); il++) {
+            float * a = (float *) v_pos[il]->data;
+            float * b = (float *) v_neg[il]->data;
+            size_t n_elem = ggml_nelements(v_pos[il]);
+            for (size_t j = 0; j < n_elem; j++) {
+                a[j] -= b[j];
+            }
+            //print_debug_tensor(v_pos[il]);
+            auto diff_filtered = filter_nonzero_rows(v_pos[il]);
+            v_diff_filtered.push_back(diff_filtered);
+        }
+        return v_pos; // for convinient, we return the result std::vector
+    }
+
+    // delete zero rows from a given 2D tensor
+    struct ggml_tensor * filter_nonzero_rows(struct ggml_tensor * a) {
+        printf("filter_nonzero_rows\n");
+        auto is_row_all_zeros = [](struct ggml_tensor * t, int row, float eps) -> bool {
+            // check if the given row contains only zero elements
+            int n_cols = t->ne[0]; // hint: should be equal to n_embd
+            for (int col = 0; col < n_cols; ++col) {
+                if (ggml_get_f32_nd(t, col, row, 0, 0) > eps) {
+                    return false;
+                }
+            }
+            return true;
+        };
+        std::vector<int> rows_to_copy; // the indices of the non-zero rows (to be copied into diff_filtered)
+        for (int i_row = 0; i_row < a->ne[1]; i_row++) {
+            if (!is_row_all_zeros(a, i_row, 1e-6)) {
+                rows_to_copy.push_back(i_row);
+            }
+        }
+
+        // get "n_nonzero_rows" for the output "diff_filtered"
+        int n_nonzero_rows = rows_to_copy.size();
+        printf("n_nonzero_rows: %d\n", n_nonzero_rows);
+        int n_embd = a->ne[0];
+        GGML_ASSERT(n_nonzero_rows > 0);
+
+        // diff_filtered: [n_embd, n_nonzero_rows]
+        struct ggml_tensor * diff_filtered = ggml_new_tensor_2d(
+            ctx_ggml, GGML_TYPE_F32, n_embd, n_nonzero_rows);
+        ggml_set_name(diff_filtered, (std::string("diff_filtered_") + a->name).c_str());
+        diff_filtered->data = malloc(ggml_nbytes(diff_filtered));
+
+        // copy non-zero rows
+        for (int dest_row = 0; dest_row < n_nonzero_rows; dest_row++) {
+            int src_row = rows_to_copy[dest_row];
+            for (int i = 0; i < n_embd; i++) {
+                float src_elem = ggml_get_f32_nd(a, i, src_row, 0, 0);
+                ggml_set_f32_nd(diff_filtered, i, dest_row, 0, 0, src_elem);
+            }
+        }
+
+        print_debug_tensor(diff_filtered);
+
+        return diff_filtered;
+    }
+
+    // we don't implement a destructor, because we want to reuse callback_data; reset() just frees the tensors
+    void reset() {
+        for (auto ptr : v_pos) free(ptr->data);
+        for (auto ptr : v_neg) free(ptr->data);
+        for (auto ptr : v_diff_filtered) free(ptr->data);
+        v_pos.clear();
+        v_neg.clear();
+        v_diff_filtered.clear();
+        if (ctx_ggml) {
+            ggml_free(ctx_ggml);
+        }
+        ctx_ggml = nullptr;
+    }
 };
+
+/**
+ * train_context is used to store the ggml context for pre- and post-processing the diff vectors
+ * in short, input => v_diff and output => v_final
+ */
+struct train_context {
+    ggml_context * ctx_ggml;
+    int n_embd;
+    int n_layers;
 
     // each element of the vector correspond to one layer
-    std::vector v_diff; // vector of matrices of size [n_embd, m] where m ~ n_tokens * n_completions
+    // NOTE: the last layer is discarded; 
therefore, we will have (n_layers - 1) elements here + std::vector v_diff; // vector of matrices of size [n_embd, m] where m ~ n_tokens * n_completions (v_diff contains no zero-rows) std::vector v_final; // vector of vectors of size [n_embd] to be written to file - // each element of the outer vector correspond to one layer, each element of the inner vector correspond to one prompt pass - std::vector> v_diffs_wrapped; // vector of compiled diff matrices of size [n_embd, n_tokens] to be concatenated + // to easily re-alloc when concat v_diff, we temporary store v_diff in a vector instead of a tensor + // v_diff_tmp will get converted unto v_diff later on + std::vector> v_diff_tmp; + + train_context(int n_embd_, int n_layers_) { + n_embd = n_embd_; + n_layers = n_layers_; + struct ggml_init_params params_ggml = { + /*.mem_size =*/ ggml_tensor_overhead() * (n_layers - 1) * 2u, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ctx_ggml = ggml_init(params_ggml); + for (int il = 0; il < n_layers - 1; il++) { + std::vector empty; + v_diff_tmp.push_back(empty); + v_final.push_back(ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd)); + } + } - ~diff_ctx() { - for (auto ptr : v_diff) free(ptr); - for (auto ptr : v_final) free(ptr); - ggml_free(ctx_diff); - ggml_free(ctx_final); - // ctx_diffs_wrapped is freed in concatenate_diffs as soon as we're done with it - see above. undesirable? + // add new rows into existing tensor in v_diff_tmp + void concat_diff_tmp(const std::vector & diff_filtered) { + GGML_ASSERT(diff_filtered.size() == n_layers - 1); + for (int il = 0; il < n_layers - 1; il++) { + auto t = diff_filtered[il]; + auto & diff_tmp = v_diff_tmp[il]; + size_t curr_size = diff_tmp.size(); + diff_tmp.resize(curr_size + ggml_nbytes(t)); + memcpy(diff_tmp.data() + curr_size, t->data, ggml_nbytes(t)); + } + } + + // build the v_diff tensors from v_diff_tmp + void build_v_diff() { + for (int il = 0; il < n_layers - 1; il++) { + auto & diff_tmp = v_diff_tmp[il]; + int n_elem = diff_tmp.size() / sizeof(float); + int n_rows = n_elem / n_embd; + struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_embd, n_rows); + ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str()); + diff->data = diff_tmp.data(); + v_diff.push_back(diff); + } + } + + ~train_context() { + for (auto ptr : v_final) free(ptr->data); + // no need to free v_diff_tmp or v_diff, since we didn't use malloc + ggml_free(ctx_ggml); } }; @@ -82,23 +236,37 @@ struct ctrl_params { std::string positive_prompts_file = "examples/control-vector-generator/positive.txt"; std::string negative_prompts_file = "examples/control-vector-generator/negative.txt"; - /* pair of prompts to be used for generating the vectors */ - std::vector positive_prompts; - std::vector negative_prompts; - - /* pair of prompts to be used for testing */ + /* pair of prompts to be used for generating final vector */ std::vector positive_entries; std::vector negative_entries; }; struct tokenized_prompt { - std::string positive; - std::string negative; std::vector tokens_pos; std::vector tokens_neg; size_t max_seq_len; + + tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) { + const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); + tokens_pos = ::llama_tokenize(ctx, pos, add_bos); + tokens_neg = ::llama_tokenize(ctx, neg, add_bos); + max_seq_len = std::max(tokens_pos.size(), tokens_neg.size()); + padding_seq(ctx, tokens_pos, max_seq_len); + padding_seq(ctx, tokens_neg, max_seq_len); + } + + void 
padding_seq(llama_context * ctx, std::vector & tokens, size_t len) { + // TODO: customize padding token + std::vector pad_tokens = ::llama_tokenize(ctx, " ", false); + llama_token pad_tok = pad_tokens.back(); + while (tokens.size() < len) { + tokens.push_back(pad_tok); + } + } }; +////////////////////////////////////////////////// + template static std::string to_string(const T & val) { std::stringstream ss; @@ -235,7 +403,7 @@ static int ctrlvec_params_parse(int argc, char ** argv, ctrl_params & params) { return skipme; } -static std::vector ctrlvec_load_prompt_file(std::string path) { +static std::vector ctrlvec_load_prompt_file(std::string path, bool skip_empty_lines = false) { std::vector output; std::ifstream file(path); if (!file.is_open()) { @@ -243,7 +411,8 @@ static std::vector ctrlvec_load_prompt_file(std::string path) { } std::string line; while (std::getline(file, line)) { - if (!line.empty()) { // skip empty lines + bool is_skip = skip_empty_lines && line.empty(); + if (!is_skip) { output.push_back(line); } } @@ -251,49 +420,23 @@ static std::vector ctrlvec_load_prompt_file(std::string path) { return output; } -static std::string format_template(std::string persona, std::string suffix) { - //const std::string user_tag = "[INST]"; - //const std::string asst_tag = "[/INST]"; - //return user_tag + " Act as if you're extremely " + persona + ". " + asst_tag + " " + suffix; - // TODO make this dynamic - allow the user to change it somehow - and adapt based on model - return persona + " " + suffix; // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST]" -} - -static void populate_entries(ctrl_params & cparams, std::string positive, std::string negative) { - std::string line; - std::ifstream completions_file(cparams.completions_file); - int i = 0; - if (completions_file.is_open()) { - while (std::getline(completions_file, line) && i < cparams.n_completions) { - // TODO replicate the truncations done by the python implementation - cparams.positive_entries.push_back(format_template(positive, line)); - cparams.negative_entries.push_back(format_template(negative, line)); - i++; - } - completions_file.close(); - } else { - throw std::invalid_argument("error: invalid completions file or file could not be opened"); - } -} - -static std::string ggml_ne_string(const ggml_tensor * t) { - std::string str; - for (int i = 0; i < GGML_MAX_DIMS; ++i) { - str += std::to_string(t->ne[i]); - if (i + 1 < GGML_MAX_DIMS) { - str += ", "; - } - } - return str; -} +////////////////////////////////////////////////// static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { auto * cb_data = (callback_data *) user_data; + auto ggml_ne_string = [](const ggml_tensor * t) -> std::string { + std::string str; + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + str += std::to_string(t->ne[i]); + if (i + 1 < GGML_MAX_DIMS) { + str += ", "; + } + } + return str; + }; static const char * l_out_name = "l_out"; const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0; - const struct ggml_tensor * src0 = t->src[0]; - const struct ggml_tensor * src1 = t->src[1]; if (ask) { return is_l_out; @@ -303,36 +446,8 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { return true; } - char src1_str[128] = {0}; - if (src1) { - sprintf(src1_str, "%s{%s}", src1->name, ggml_ne_string(src1).c_str()); - } - - printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, - t->name, ggml_type_name(t->type), ggml_op_desc(t), - 
src0->name, ggml_ne_string(src0).c_str(), - src1 ? src1_str : "", - ggml_ne_string(t).c_str()); - - - // copy the data from the GPU memory if needed - const bool is_host = ggml_backend_buffer_is_host(t->buffer); - - struct ggml_tensor * t_host; - auto n_bytes = ggml_nbytes(t); - t_host = ggml_new_tensor_2d(cb_data->ctx_ggml, t->type, t->ne[0], t->ne[1]); - t_host->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow - ggml_backend_tensor_get(t, t_host->data, 0, n_bytes); - printf("t_host [0][%d]: %f\n", DEBUG_POS, ggml_get_f32_nd(t_host, 0, DEBUG_POS, 0, 0)); - - if (t_host->type == GGML_TYPE_F32) { - if (cb_data->is_eval_pos) { - cb_data->v_pos.push_back(t_host); - } else { - cb_data->v_neg.push_back(t_host); - } - } - + // save the tensor to current context + cb_data->save_tensor_for_layer(t); return true; } @@ -345,348 +460,17 @@ static bool get_hidden_layers(llama_context * ctx, std::vector & to return true; } -static void padding_seq(llama_context * ctx, std::vector & tokens, size_t len) { - // TODO: customize padding token - std::vector pad_tokens = ::llama_tokenize(ctx, " ", false); - llama_token pad_tok = pad_tokens.back(); - while (tokens.size() < len) { - tokens.push_back(pad_tok); - } -} - -static void calc_diff(callback_data & cb_data, diff_ctx & dctx) { - // TODO: assert cb_data.v_pos.size() == cb_data.v_neg.size() - dctx.v_diffs_wrapped.resize(cb_data.v_pos.size()); - for (size_t il = 0; il < cb_data.v_pos.size(); il++) { - std::cout << "il: " << il << " of " << cb_data.v_pos.size()-1 << std::endl; - - auto & inp_pos = cb_data.v_pos[il]; - auto & inp_neg = cb_data.v_neg[il]; - auto n_bytes = ggml_nbytes(inp_pos); - - printf("inp_pos [0][%d]: %f\n", DEBUG_POS, ggml_get_f32_nd(inp_pos, 0, DEBUG_POS, 0, 0)); - printf("inp_neg [0][%d]: %f\n", DEBUG_POS, ggml_get_f32_nd(inp_neg, 0, DEBUG_POS, 0, 0)); - - // TODO assert inp_pos->ne[0] == inp_neg->ne[0] && inp_pos->ne[1] == inp_neg->ne[1] - struct ggml_tensor * dest = ggml_new_tensor_2d(dctx.ctx_diffs_wrapped, GGML_TYPE_F32, inp_pos->ne[0], inp_pos->ne[1]); - dest->data = malloc(n_bytes); // TODO @ngxson get rid of this malloc somehow - - for (size_t i = 0; i < inp_pos->ne[0]; i++) { - for (size_t j = 0; j < inp_pos->ne[1]; j++) { - ggml_set_f32_nd(dest, i, j, 0, 0, ggml_get_f32_nd(inp_pos, i, j, 0, 0) - ggml_get_f32_nd(inp_neg, i, j, 0, 0)); - } - } - - printf("dest [0][%d]: %f\n", DEBUG_POS, ggml_get_f32_nd(dest, 0, DEBUG_POS, 0, 0)); - - dctx.v_diffs_wrapped[il].push_back(dest); - } -} - -// 50/50 chance this should be cols but it works and I don't want to touch it - @christianazinn -static bool is_row_all_zeros(struct ggml_tensor * diff, int row, int cols, float eps = 1e-6) { - for (int i = 0; i < cols; ++i) { - if (ggml_get_f32_nd(diff, i, row, 0, 0) > eps) { - return false; - } - } - return true; -} - -static void concatenate_diffs(diff_ctx & dctx) { - // TODO can you do this inplace? 
- // TODO assert each tensor has the same ->ne[0] and it equals dctx.n_embd - printf("concatenate_diffs\n"); - for (size_t il = 0; il < dctx.v_diffs_wrapped.size(); ++il) { - printf("il: %zu of %zu\n", il, dctx.v_diffs_wrapped.size()-1); - std::vector & vec = dctx.v_diffs_wrapped[il]; - - // strip zero rows - int n_nonzero_rows = 0; - std::vector> nonzero_rows; // outer vector is tensor idx, inner vector is row in tensor - nonzero_rows.resize(vec.size()); - for (int i = 0; i < vec.size(); ++i) { - for (int j = 0; j < vec[i]->ne[1]; ++j) { - if (!is_row_all_zeros(vec[i], j, vec[i]->ne[0])) { - nonzero_rows[i].push_back(j); - n_nonzero_rows++; - } - } - } - - printf("n_nonzero_rows: %d\n", n_nonzero_rows); - - // we transpose it here because ggml mul_mat is really weird - struct ggml_tensor * diff = ggml_new_tensor_2d(dctx.ctx_diff, GGML_TYPE_F32, n_nonzero_rows, dctx.n_embd); - - diff->data = malloc(dctx.n_embd * n_nonzero_rows * sizeof(float) + ggml_tensor_overhead()); // @ngxson get rid of this malloc somehow - - for (size_t i = 0; i < nonzero_rows.size(); ++i) { - for (size_t j : nonzero_rows[i]) { - for (size_t k = 0; k < vec[i]->ne[0]; k++) { - //std::cout << ggml_get_f32_nd(vec[i], k, j, 0, 0) << std::endl; - ggml_set_f32_nd(diff, i, k, 0, 0, ggml_get_f32_nd(vec[i], k, j, 0, 0)); - } - } - } - - printf("diff[0][%d]: %f\n", DEBUG_POS, ggml_get_f32_nd(diff, 0, DEBUG_POS, 0, 0)); - - // TODO assert row == n_nonzero_rows - - dctx.v_diff.push_back(diff); - } - //for (auto & vec : dctx.v_diffs_wrapped) for (auto ptr : vec) free(ptr); - ggml_free(dctx.ctx_diffs_wrapped); -} - -struct pca_model { - struct ggml_tensor * v_diff_original; - struct ggml_tensor * square; - struct ggml_tensor * square_transpose; - struct ggml_tensor * eigenvector; - - ggml_backend_t backend = NULL; - ggml_backend_buffer_t buffer; - struct ggml_context * ctx; -}; - -void load_pca_model(pca_model & model, struct ggml_tensor * v_diff_original) { -#ifdef GGML_USE_CUDA - fprintf(stderr, "%s: using CUDA backend\n", __func__); - model.backend = ggml_backend_cuda_init(0); // init device 0 - if (!model.backend) { - fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__); - } -#endif - -#ifdef GGML_USE_METAL - fprintf(stderr, "%s: using Metal backend\n", __func__); - ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr); - model.backend = ggml_backend_metal_init(); - if (!model.backend) { - fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); - } -#endif - - // if there aren't GPU Backends fallback to CPU backend - if (!model.backend) { - model.backend = ggml_backend_cpu_init(); - } - - printf("v_diff_original[0][%d]: %f\n", DEBUG_POS, ggml_get_f32_nd(v_diff_original, 0, DEBUG_POS, 0, 0)); - - const int num_tensors = 4; - - struct ggml_init_params params { - /*.mem_size =*/ ggml_tensor_overhead() * num_tensors, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ true, - }; - - model.ctx = ggml_init(params); - - model.v_diff_original = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, v_diff_original->ne[0], v_diff_original->ne[1]); - model.square = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, v_diff_original->ne[1], v_diff_original->ne[1]); - model.square_transpose = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, v_diff_original->ne[1], v_diff_original->ne[1]); - model.eigenvector = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, v_diff_original->ne[1]); - - model.buffer = ggml_backend_alloc_ctx_tensors(model.ctx, model.backend); - - ggml_backend_tensor_set(model.v_diff_original, 
v_diff_original->data, 0, ggml_nbytes(v_diff_original)); - - // no need to load anything into square or square_transpose yet - - // initialize model.eigenvector to random vector - std::vector random_vec; - std::default_random_engine generator(static_cast(std::time(0))); - std::uniform_real_distribution distribution(0.0, 1.0); - for (int i = 0; i < v_diff_original->ne[1]; ++i) { - random_vec.push_back(distribution(generator)); - } - - // we don't normalize it at first but that shouldn't be a problem - ggml_backend_tensor_set(model.eigenvector, random_vec.data(), 0, ggml_nbytes(model.eigenvector)); -} - -struct ggml_cgraph * square_diff_graph(const pca_model & model) { - static size_t buf_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(); - static std::vector buf(buf_size); - - struct ggml_init_params params0 = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf.data(), - /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() - }; - struct ggml_context * ctx0 = ggml_init(params0); - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * square = ggml_mul_mat(ctx0, model.v_diff_original, model.v_diff_original); - //struct ggml_tensor * square_transpose = ggml_transpose(ctx0, square); - - ggml_build_forward_expand(gf, square); - - ggml_free(ctx0); - return gf; -} - -struct ggml_tensor * compute_square(const pca_model & model, ggml_gallocr_t allocr, int n_threads) { - struct ggml_cgraph * gf = square_diff_graph(model); - - ggml_gallocr_alloc_graph(allocr, gf); - - if (ggml_backend_is_cpu(model.backend)) { - ggml_backend_cpu_set_n_threads(model.backend, n_threads); - } - -#ifdef GGML_USE_METAL - if (ggml_backend_is_metal(model.backend)) { - ggml_backend_metal_set_n_cb(model.backend, n_threads); - } -#endif - - ggml_backend_graph_compute(model.backend, gf); - - return gf->nodes[gf->n_nodes - 1]; -} - -struct ggml_cgraph * power_iteration_graph(const pca_model & model, float tolerance) { - static size_t buf_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(); - static std::vector buf(buf_size); - - struct ggml_init_params params0 = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf.data(), - /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() - }; - struct ggml_context * ctx0 = ggml_init(params0); - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * b_tensor = ggml_mul_mat(ctx0, model.square, model.eigenvector); - // TODO difference between ggml_norm and ggml_norm_inplace? - // also is this the right way to do multi-step graphs? 
- b_tensor = ggml_norm_inplace(ctx0, b_tensor, tolerance); - - ggml_build_forward_expand(gf, b_tensor); - - ggml_free(ctx0); - return gf; -} - -struct ggml_tensor * compute_piter(const pca_model & model, ggml_gallocr_t allocr, int n_threads, float tolerance) { - struct ggml_cgraph * gf = power_iteration_graph(model, tolerance); - - ggml_gallocr_alloc_graph(allocr, gf); - - if (ggml_backend_is_cpu(model.backend)) { - ggml_backend_cpu_set_n_threads(model.backend, n_threads); - } - -#ifdef GGML_USE_METAL - if (ggml_backend_is_metal(model.backend)) { - ggml_backend_metal_set_n_cb(model.backend, n_threads); - } -#endif - - ggml_backend_graph_compute(model.backend, gf); - - return gf->nodes[gf->n_nodes - 1]; -} - -static void power_iteration(diff_ctx & dctx, int idx, int maxIterations = 1000, float tolerance = 1e-7) { - printf("in power iteration\n"); - - pca_model model; - load_pca_model(model, dctx.v_diff[idx]); - - ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - - struct ggml_tensor * square = compute_square(model, allocr, dctx.n_threads); - ggml_backend_tensor_set(model.square, square->data, 0, ggml_nbytes(model.square)); - - ggml_gallocr_free(allocr); - - struct ggml_init_params host_params = { - /*.mem_size =*/ (dctx.n_embd * sizeof(float) + ggml_tensor_overhead()) * 2u, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - struct ggml_context * host_ctx = ggml_init(host_params); - - struct ggml_tensor * host_old_eigenvector = ggml_new_tensor_1d(host_ctx, GGML_TYPE_F32, dctx.n_embd); - struct ggml_tensor * host_new_eigenvector = ggml_new_tensor_1d(host_ctx, GGML_TYPE_F32, dctx.n_embd); - - for (int iter = 0; iter < maxIterations; ++iter) { - - // TODO do I need to reset it like this every time? - allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - - struct ggml_tensor * b_tensor = compute_piter(model, allocr, dctx.n_threads, tolerance); - - ggml_backend_tensor_get(b_tensor, host_new_eigenvector->data, 0, ggml_nbytes(b_tensor)); - ggml_backend_tensor_get(model.eigenvector, host_old_eigenvector->data, 0, ggml_nbytes(model.eigenvector)); - - // convergence check - float diff = 0.0; - for (int i = 0; i < dctx.n_embd; ++i) { - diff += std::pow((ggml_get_f32_1d(host_new_eigenvector, i) - ggml_get_f32_1d(host_old_eigenvector, i)), 2); - } - - // update eigenvector - ggml_backend_tensor_set(model.eigenvector, host_new_eigenvector->data, 0, ggml_nbytes(model.eigenvector)); - - try { - if (std::sqrt(diff) < tolerance) { - break; - } - } - catch (std::exception & e) { - // catch division by zero I guess - break; - } - } - - ggml_backend_tensor_get(model.eigenvector, dctx.v_final[idx]->data, 0, ggml_nbytes(model.eigenvector)); - - ggml_gallocr_free(allocr); - ggml_free(host_ctx); - ggml_free(model.ctx); - ggml_backend_buffer_free(model.buffer); - ggml_backend_free(model.backend); -} - -static void pca(diff_ctx & dctx) { - printf("Running PCA...\n"); - for (int il = 0; il < dctx.v_diff.size(); ++il) { - dctx.v_final.push_back(ggml_new_tensor_1d(dctx.ctx_final, GGML_TYPE_F32, dctx.n_embd)); - power_iteration(dctx, il); - printf("Done with layer %d\n", il); - printf("il = %d | %f %f \n", il, ggml_get_f32_1d(dctx.v_final[il], 0), ggml_get_f32_1d(dctx.v_final[il], 1)); - } - printf("Done with PCA.\n"); -} - -static void export_gguf(diff_ctx & dctx, int n_layers, const std::string fname, const std::string model_hint) { +static void export_gguf(const std::vector & v_ctrl, const std::string fname, const std::string model_hint) 
{ struct gguf_context * ctx = gguf_init_empty(); - size_t v_final_size_eff = n_layers - 1; - const std::string arch = "controlvector"; gguf_set_val_str(ctx, "general.architecture", arch.c_str()); gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str()); - gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_final_size_eff); - - for (size_t i = 0; i < v_final_size_eff; ++i) { - // TODO this number is probably not right - figure out which layer is which - // i'm pretty sure it's right now - const std::string name = "direction." + to_string(i+1); - - printf("dctx.v_final[i][%d]: %f\n", DEBUG_POS, ggml_get_f32_1d(dctx.v_final[i], DEBUG_POS)); + gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_ctrl.size()); - ggml_set_name(dctx.v_final[i], name.c_str()); - - gguf_add_tensor(ctx, dctx.v_final[i]); - printf("Added tensor %zu\n", i); + for (size_t i = 0; i < v_ctrl.size(); ++i) { + gguf_add_tensor(ctx, v_ctrl[i]); + printf("Added tensor: %s\n", v_ctrl[i]->name); } printf("Writing file...\n"); @@ -698,6 +482,42 @@ static void export_gguf(diff_ctx & dctx, int n_layers, const std::string fname, gguf_free(ctx); } +/** + * Load prompt files and completion file. + * Then format each pair of prompt + completion to make an entry. + */ +int prepare_entries(ctrl_params & cparams) { + // load prompts + std::vector positive_prompts = ctrlvec_load_prompt_file(cparams.positive_prompts_file); + std::vector negative_prompts = ctrlvec_load_prompt_file(cparams.negative_prompts_file); + if (positive_prompts.size() != negative_prompts.size()) { + fprintf(stderr, "number of positive and negative prompts must be equal\n"); + return 1; + } + if (positive_prompts.empty()) { + fprintf(stderr, "must provide at least one prompt pair\n"); + return 1; + } + + // create templated prompts + std::vector completions = ctrlvec_load_prompt_file(cparams.completions_file, false); + auto format_template = [](std::string persona, std::string suffix) { + //const std::string user_tag = "[INST]"; + //const std::string asst_tag = "[/INST]"; + //return user_tag + " Act as if you're extremely " + persona + ". " + asst_tag + " " + suffix; + // TODO make this dynamic - allow the user to change it somehow - and adapt based on model + return persona + " " + suffix; // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. 
[/INST]" + }; + for (int i = 0; i < positive_prompts.size(); ++i) { + for (auto & cmpl : completions) { + // TODO replicate the truncations done by the python implementation + cparams.positive_entries.push_back(format_template(positive_prompts[i], cmpl)); + cparams.negative_entries.push_back(format_template(negative_prompts[i], cmpl)); + } + } + return 0; +} + int main(int argc, char ** argv) { ctrl_params cparams; @@ -710,17 +530,8 @@ int main(int argc, char ** argv) { return 1; } - // load prompts - cparams.positive_prompts = ctrlvec_load_prompt_file(cparams.positive_prompts_file); - cparams.negative_prompts = ctrlvec_load_prompt_file(cparams.negative_prompts_file); - if (cparams.positive_prompts.size() != cparams.negative_prompts.size()) { - fprintf(stderr, "number of positive and negative prompts must be equal\n"); - return 1; - } - if (cparams.positive_prompts.empty()) { - fprintf(stderr, "must provide at least one prompt pair\n"); - return 1; - } + // load and prepare entries for training + prepare_entries(cparams); callback_data cb_data; @@ -742,72 +553,29 @@ int main(int argc, char ** argv) { int n_ctx = llama_n_ctx(ctx); int n_layers = llama_n_layer(model); int n_embd = llama_n_embd(model); - int n_prompts = cparams.positive_prompts.size(); - - // init ctx_ggml - struct ggml_init_params params_ggml = { - /*.mem_size =*/ ggml_tensor_overhead() * n_layers * 2u, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ true, - }; - cb_data.ctx_ggml = ggml_init(params_ggml); - - // create templated prompts - for (int i = 0; i < n_prompts; ++i) { - populate_entries(cparams, cparams.positive_prompts[i], cparams.negative_prompts[i]); - } + // get model hint param (a.k.a model arch name) + char model_hint[128]; + llama_model_meta_val_str(model, "general.architecture", model_hint, 128); // we have to pretokenize everything because otherwise we don't know how much overhead to allocate ctx_diffs_wrapped std::vector tokenized_prompts; size_t n_total_tokens = 0; for (size_t i = 0; i < cparams.positive_entries.size(); ++i) { - tokenized_prompt t; - t.positive = cparams.positive_entries[i]; - t.negative = cparams.negative_entries[i]; - t.tokens_pos = ::llama_tokenize(ctx, t.positive, false); - t.tokens_neg = ::llama_tokenize(ctx, t.negative, false); - t.max_seq_len = std::max(t.tokens_pos.size(), t.tokens_neg.size()); - padding_seq(ctx, t.tokens_pos, t.max_seq_len); - padding_seq(ctx, t.tokens_neg, t.max_seq_len); + tokenized_prompt t(ctx, cparams.positive_entries[i], cparams.negative_entries[i]); n_total_tokens += 2 * t.max_seq_len; - tokenized_prompts.push_back(t); + tokenized_prompts.push_back(std::move(t)); } std::cout << "n_total_tokens: " << n_total_tokens << std::endl; - // init diff_ctx - diff_ctx dctx; - - struct ggml_init_params params_diffs_wrapped = { - /*.mem_size =*/ ggml_tensor_overhead() * n_total_tokens, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ true, - }; - // this we know how much overhead to allocate in advance - struct ggml_init_params params_diff = { - /*.mem_size =*/ ggml_tensor_overhead() * n_layers, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ true, - }; - // and this we know exactly how much memory to allocate in advance without malloc() hacks - struct ggml_init_params params_final = { - /*.mem_size =*/ n_embd * sizeof(float) * n_layers - + ggml_tensor_overhead() * n_layers, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - dctx.n_embd = n_embd; - dctx.n_threads = cparams.n_threads; - dctx.ctx_diffs_wrapped = ggml_init(params_diffs_wrapped); - dctx.ctx_diff = 
ggml_init(params_diff); - dctx.ctx_final = ggml_init(params_final); - - const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); + // init train_context + train_context ctx_train(n_embd, n_layers); int token_ct = 0; for(size_t i = 0; i < cparams.positive_entries.size(); ++i) { tokenized_prompt t = tokenized_prompts[i]; + cb_data.n_layers = n_layers; cb_data.n_tokens = t.max_seq_len; // need to reload the model so it doesn't run out of context @@ -825,57 +593,43 @@ int main(int argc, char ** argv) { break; } - printf("Evaluating prompt: \"%s\" - \"%s\" (%ld tokens)\n", t.positive.c_str(), t.negative.c_str(), t.max_seq_len); + printf("Evaluating prompt: \"%s\" - \"%s\" (%ld tokens)\n", + tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(), + tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(), + t.max_seq_len); cb_data.is_eval_pos = true; get_hidden_layers(ctx, t.tokens_pos); cb_data.is_eval_pos = false; get_hidden_layers(ctx, t.tokens_neg); - calc_diff(cb_data, dctx); + // calculate diff and remove all zero rows + auto v_diff_filtered = cb_data.calc_diff(); + + // save & concat the filtered v_diff to ctx_train + printf("concat_diff_tmp\n"); + ctx_train.concat_diff_tmp(v_diff_filtered); // reset for next iteration - // TODO @ngxson : find a more proper way to alloc / free tensors - ggml_free(cb_data.ctx_ggml); - // TODO move this to the top of the loop and remove the ggml_free() outside - cb_data.ctx_ggml = ggml_init(params_ggml); - cb_data.v_pos.clear(); - cb_data.v_neg.clear(); + cb_data.reset(); + printf("reset\n"); } - // TODO we can actually delete cb_data here but do we want to? - - printf("dctx.v_diffs_wrapped[0][0][%d]: %f\n", DEBUG_POS, ggml_get_f32_nd(dctx.v_diffs_wrapped[0][0], 0, DEBUG_POS, 0, 0)); - - printf("Done evaluate prompts\n"); - - concatenate_diffs(dctx); - - printf("dctx.v_diff[0][0][%d]: %f\n", DEBUG_POS, ggml_get_f32_nd(dctx.v_diff[0], 0, DEBUG_POS, 0, 0)); - - printf("Done concatenate diffs\n"); - - // code is known to work up to here - - pca(dctx); - //printf("v_final %f %f \n", cb_data.v_final[0][0], cb_data.v_final[0][1]); - + // done with the model, we can now free it to make gain some memory + printf("Done evaluate prompts, unload model...\n"); llama_free(ctx); llama_free_model(model); - // TODO figure out how to extract this from model - there's no API exposed to get model arch string - // we need get_arch_name() from llama.cpp - // TODO also has support been implemeneted for arches other than llama yet? see #5970 - std::string model_hint = "llama"; - export_gguf(dctx, n_layers, cparams.outfile, model_hint); + // prepare ctx_train for PCA + ctx_train.build_v_diff(); - llama_backend_free(); + // run PCA + pca(ctx_train.v_diff, ctx_train.v_final); - printf("confirm we got here\n"); + // write output vectors to gguf + export_gguf(ctx_train.v_final, cparams.outfile, model_hint); - // TODO free(): invalid pointer after the entire program is done???????? 
- // probably because destructors free after you've already manually freed - // TODO fix destructor/ggml_free positioning + llama_backend_free(); return 0; } diff --git a/examples/control-vector-generator/pca.hpp b/examples/control-vector-generator/pca.hpp new file mode 100644 index 0000000000000..47c8981a204ec --- /dev/null +++ b/examples/control-vector-generator/pca.hpp @@ -0,0 +1,267 @@ +#include "common.h" +#include "llama.h" +#include "ggml.h" + +#ifdef GGML_USE_CUDA +#include "ggml-cuda.h" +#endif + +#ifdef GGML_USE_METAL +#include "ggml-metal.h" +#endif + +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_POS 5 + +static void print_debug_tensor(struct ggml_tensor * t) { + printf("%s: %s (%s): [%ld, %ld]\n", __func__, t->name, ggml_type_name(t->type), t->ne[0], t->ne[1]); + printf("%s: %s[0] = [", __func__, t->name); + for (size_t i = 0; i <= DEBUG_POS; i++) { + printf(" %f,", ggml_get_f32_nd(t, i, 0, 0, 0)); + } + printf(" ... ]\n"); +} + + + +struct pca_model { + struct ggml_tensor * v_diff_original; + struct ggml_tensor * square; + struct ggml_tensor * square_transpose; + struct ggml_tensor * eigenvector; + + ggml_backend_t backend = NULL; + ggml_backend_buffer_t buffer; + struct ggml_context * ctx; +}; + +void load_pca_model(pca_model & model, struct ggml_tensor * v_diff_original) { +#ifdef GGML_USE_CUDA + fprintf(stderr, "%s: using CUDA backend\n", __func__); + model.backend = ggml_backend_cuda_init(0); // init device 0 + if (!model.backend) { + fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__); + } +#endif + +#ifdef GGML_USE_METAL + fprintf(stderr, "%s: using Metal backend\n", __func__); + ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr); + model.backend = ggml_backend_metal_init(); + if (!model.backend) { + fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); + } +#endif + + // if there aren't GPU Backends fallback to CPU backend + if (!model.backend) { + model.backend = ggml_backend_cpu_init(); + } + + //printf("v_diff_original[0][%d]: %f\n", DEBUG_POS, ggml_get_f32_nd(v_diff_original, 0, DEBUG_POS, 0, 0)); + + const int num_tensors = 4; + + struct ggml_init_params params { + /*.mem_size =*/ ggml_tensor_overhead() * num_tensors, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + + model.ctx = ggml_init(params); + + model.v_diff_original = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, v_diff_original->ne[0], v_diff_original->ne[1]); + model.square = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, v_diff_original->ne[1], v_diff_original->ne[1]); + model.square_transpose = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, v_diff_original->ne[1], v_diff_original->ne[1]); + model.eigenvector = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, v_diff_original->ne[1]); + + model.buffer = ggml_backend_alloc_ctx_tensors(model.ctx, model.backend); + + ggml_backend_tensor_set(model.v_diff_original, v_diff_original->data, 0, ggml_nbytes(v_diff_original)); + + // no need to load anything into square or square_transpose yet + + // initialize model.eigenvector to random vector + std::vector random_vec; + std::default_random_engine generator(static_cast(std::time(0))); + std::uniform_real_distribution distribution(0.0, 1.0); + for (int i = 0; i < v_diff_original->ne[1]; ++i) { + random_vec.push_back(distribution(generator)); + } + + // we don't normalize it at first but that shouldn't be a problem + ggml_backend_tensor_set(model.eigenvector, random_vec.data(), 0, ggml_nbytes(model.eigenvector)); +} + +struct 
ggml_cgraph * square_diff_graph(const pca_model & model) { + static size_t buf_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(); + static std::vector buf(buf_size); + + struct ggml_init_params params0 = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf.data(), + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() + }; + struct ggml_context * ctx0 = ggml_init(params0); + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * square = ggml_mul_mat(ctx0, model.v_diff_original, model.v_diff_original); + //struct ggml_tensor * square_transpose = ggml_transpose(ctx0, square); + + ggml_build_forward_expand(gf, square); + + ggml_free(ctx0); + return gf; +} + +struct ggml_tensor * compute_square(const pca_model & model, ggml_gallocr_t allocr, int n_threads) { + struct ggml_cgraph * gf = square_diff_graph(model); + + ggml_gallocr_alloc_graph(allocr, gf); + + if (ggml_backend_is_cpu(model.backend)) { + ggml_backend_cpu_set_n_threads(model.backend, n_threads); + } + +#ifdef GGML_USE_METAL + if (ggml_backend_is_metal(model.backend)) { + ggml_backend_metal_set_n_cb(model.backend, n_threads); + } +#endif + + ggml_backend_graph_compute(model.backend, gf); + + return gf->nodes[gf->n_nodes - 1]; +} + +struct ggml_cgraph * power_iteration_graph(const pca_model & model, float tolerance) { + static size_t buf_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(); + static std::vector buf(buf_size); + + struct ggml_init_params params0 = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf.data(), + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() + }; + struct ggml_context * ctx0 = ggml_init(params0); + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * b_tensor = ggml_mul_mat(ctx0, model.square, model.eigenvector); + // TODO difference between ggml_norm and ggml_norm_inplace? + // also is this the right way to do multi-step graphs? 
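+    // (ggml_norm_inplace is just ggml_norm that writes into its input tensor. be aware that
+    // ggml_norm is a layernorm-style (x - mean) / sqrt(var + eps) over each row - here with
+    // `tolerance` passed as eps - not the L2 normalization classic power iteration uses.
+    // an explicit L2 normalization could look something like:
+    //   b_tensor = ggml_div_inplace(ctx0, b_tensor,
+    //       ggml_sqrt_inplace(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, b_tensor))));
+    // and chaining ops on the same ggml_context before ggml_build_forward_expand() is
+    // the usual way to express a multi-step graph)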
+ b_tensor = ggml_norm_inplace(ctx0, b_tensor, tolerance); + + ggml_build_forward_expand(gf, b_tensor); + + ggml_free(ctx0); + return gf; +} + +struct ggml_tensor * compute_piter(const pca_model & model, ggml_gallocr_t allocr, int n_threads, float tolerance) { + struct ggml_cgraph * gf = power_iteration_graph(model, tolerance); + + ggml_gallocr_alloc_graph(allocr, gf); + + if (ggml_backend_is_cpu(model.backend)) { + ggml_backend_cpu_set_n_threads(model.backend, n_threads); + } + +#ifdef GGML_USE_METAL + if (ggml_backend_is_metal(model.backend)) { + ggml_backend_metal_set_n_cb(model.backend, n_threads); + } +#endif + + ggml_backend_graph_compute(model.backend, gf); + + return gf->nodes[gf->n_nodes - 1]; +} + +static void power_iteration(struct ggml_tensor * input, struct ggml_tensor * output, int n_threads, int maxIterations = 1000, float tolerance = 1e-7) { + printf("in power iteration\n"); + int n_embd = input->ne[0];// shape of input: [n_embd, m] + + pca_model model; + load_pca_model(model, input); + + ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); + + struct ggml_tensor * square = compute_square(model, allocr, n_threads); + ggml_backend_tensor_set(model.square, square->data, 0, ggml_nbytes(model.square)); + + ggml_gallocr_free(allocr); + + struct ggml_init_params host_params = { + /*.mem_size =*/ (n_embd * sizeof(float) + ggml_tensor_overhead()) * 2u, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + struct ggml_context * host_ctx = ggml_init(host_params); + + struct ggml_tensor * host_old_eigenvector = ggml_new_tensor_1d(host_ctx, GGML_TYPE_F32, n_embd); + struct ggml_tensor * host_new_eigenvector = ggml_new_tensor_1d(host_ctx, GGML_TYPE_F32, n_embd); + + for (int iter = 0; iter < maxIterations; ++iter) { + + // TODO do I need to reset it like this every time? 
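+        // note: strictly speaking the allocator from the previous iteration
+        // should be released before re-creating it here, otherwise each
+        // iteration leaks one allocator, e.g.:
+        //   if (allocr) { ggml_gallocr_free(allocr); }
+        // rebuilding it every iteration is otherwise fine, since each
+        // iteration also rebuilds the graph whose tensors it allocates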
+        allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
+
+        struct ggml_tensor * b_tensor = compute_piter(model, allocr, n_threads, tolerance);
+
+        ggml_backend_tensor_get(b_tensor, host_new_eigenvector->data, 0, ggml_nbytes(b_tensor));
+        ggml_backend_tensor_get(model.eigenvector, host_old_eigenvector->data, 0, ggml_nbytes(model.eigenvector));
+
+        // convergence check
+        float diff = 0.0;
+        for (int i = 0; i < n_embd; ++i) {
+            diff += std::pow((ggml_get_f32_1d(host_new_eigenvector, i) - ggml_get_f32_1d(host_old_eigenvector, i)), 2);
+        }
+
+        // update eigenvector
+        ggml_backend_tensor_set(model.eigenvector, host_new_eigenvector->data, 0, ggml_nbytes(model.eigenvector));
+
+        try {
+            if (std::sqrt(diff) < tolerance) {
+                break;
+            }
+        }
+        catch (std::exception & e) {
+            // catch division by zero I guess
+            break;
+        }
+    }
+
+    ggml_backend_tensor_get(model.eigenvector, output->data, 0, ggml_nbytes(model.eigenvector));
+
+    ggml_gallocr_free(allocr);
+    ggml_free(host_ctx);
+    ggml_free(model.ctx);
+    ggml_backend_buffer_free(model.buffer);
+    ggml_backend_free(model.backend);
+}
+
+static void pca(
+    const std::vector<struct ggml_tensor *> & v_input,
+    const std::vector<struct ggml_tensor *> & v_output) {
+    printf("Running PCA...\n");
+    int n_embd = v_input[0]->ne[0]; // shape of v_input[0]: [n_embd, m]
+    int n_threads = 8; // TODO: change me
+    for (size_t il = 0; il < v_input.size(); ++il) {
+        // prepare output vector
+        struct ggml_tensor * ctrl_out = v_output[il];
+        auto name = std::string("direction.") + std::to_string(il + 1);
+        ggml_set_name(ctrl_out, name.c_str());
+        // run power_iteration
+        power_iteration(v_input[il], ctrl_out, n_threads);
+        printf("Done with layer %d\n", il);
+        print_debug_tensor(ctrl_out);
+    }
+    printf("Done with PCA.\n");
+}

From c241b500a11a9b0716ef147f8da991ac5e73de2c Mon Sep 17 00:00:00 2001
From: ngxson
Date: Tue, 11 Jun 2024 01:13:10 +0200
Subject: [PATCH 29/56] clean up PCA ggml implementation

---
 .../control-vector-generator.cpp          |   4 +-
 examples/control-vector-generator/pca.hpp | 190 ++++++++----------
 2 files changed, 89 insertions(+), 105 deletions(-)

diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp
index 35d607a59b9bc..ebff76edce220 100644
--- a/examples/control-vector-generator/control-vector-generator.cpp
+++ b/examples/control-vector-generator/control-vector-generator.cpp
@@ -94,7 +94,7 @@ struct callback_data {
             auto diff_filtered = filter_nonzero_rows(v_pos[il]);
             v_diff_filtered.push_back(diff_filtered);
         }
-        return v_pos; // for convinient, we return the result std::vector
+        return v_diff_filtered; // for convenience, we return the result std::vector
     }
 
     // delete zero rows from a given 2D tensor
@@ -624,7 +624,7 @@ int main(int argc, char ** argv) {
     ctx_train.build_v_diff();
 
     // run PCA
-    pca(ctx_train.v_diff, ctx_train.v_final);
+    PCA::run_pca(ctx_train.v_diff, ctx_train.v_final);
 
     // write output vectors to gguf
     export_gguf(ctx_train.v_final, cparams.outfile, model_hint);
diff --git a/examples/control-vector-generator/pca.hpp b/examples/control-vector-generator/pca.hpp
index 47c8981a204ec..f279268ce9e39 100644
--- a/examples/control-vector-generator/pca.hpp
+++ b/examples/control-vector-generator/pca.hpp
@@ -29,12 +29,11 @@ static void print_debug_tensor(struct ggml_tensor * t) {
     printf(" ...
]\n"); } - +namespace PCA { struct pca_model { struct ggml_tensor * v_diff_original; struct ggml_tensor * square; - struct ggml_tensor * square_transpose; struct ggml_tensor * eigenvector; ggml_backend_t backend = NULL; @@ -42,7 +41,7 @@ struct pca_model { struct ggml_context * ctx; }; -void load_pca_model(pca_model & model, struct ggml_tensor * v_diff_original) { +void load_pca_model(pca_model & model, struct ggml_tensor * input) { #ifdef GGML_USE_CUDA fprintf(stderr, "%s: using CUDA backend\n", __func__); model.backend = ggml_backend_cuda_init(0); // init device 0 @@ -64,35 +63,35 @@ void load_pca_model(pca_model & model, struct ggml_tensor * v_diff_original) { if (!model.backend) { model.backend = ggml_backend_cpu_init(); } - - //printf("v_diff_original[0][%d]: %f\n", DEBUG_POS, ggml_get_f32_nd(v_diff_original, 0, DEBUG_POS, 0, 0)); const int num_tensors = 4; - struct ggml_init_params params { - /*.mem_size =*/ ggml_tensor_overhead() * num_tensors, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ true, + /*.mem_size =*/ ggml_tensor_overhead() * num_tensors, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, }; - model.ctx = ggml_init(params); - model.v_diff_original = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, v_diff_original->ne[0], v_diff_original->ne[1]); - model.square = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, v_diff_original->ne[1], v_diff_original->ne[1]); - model.square_transpose = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, v_diff_original->ne[1], v_diff_original->ne[1]); - model.eigenvector = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, v_diff_original->ne[1]); + auto n_embd = input->ne[0]; + auto n_samples = input->ne[1]; - model.buffer = ggml_backend_alloc_ctx_tensors(model.ctx, model.backend); + model.v_diff_original = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_embd, n_samples); + model.square = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_embd, n_embd); + model.eigenvector = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, n_embd); + + ggml_set_name(model.v_diff_original, "v_diff_original"); + ggml_set_name(model.square, "square"); + ggml_set_name(model.eigenvector, "eigenvector"); - ggml_backend_tensor_set(model.v_diff_original, v_diff_original->data, 0, ggml_nbytes(v_diff_original)); + model.buffer = ggml_backend_alloc_ctx_tensors(model.ctx, model.backend); - // no need to load anything into square or square_transpose yet + ggml_backend_tensor_set(model.v_diff_original, input->data, 0, ggml_nbytes(input)); // initialize model.eigenvector to random vector std::vector random_vec; std::default_random_engine generator(static_cast(std::time(0))); std::uniform_real_distribution distribution(0.0, 1.0); - for (int i = 0; i < v_diff_original->ne[1]; ++i) { + for (int i = 0; i < ggml_nelements(model.eigenvector); ++i) { random_vec.push_back(distribution(generator)); } @@ -100,8 +99,12 @@ void load_pca_model(pca_model & model, struct ggml_tensor * v_diff_original) { ggml_backend_tensor_set(model.eigenvector, random_vec.data(), 0, ggml_nbytes(model.eigenvector)); } -struct ggml_cgraph * square_diff_graph(const pca_model & model) { - static size_t buf_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(); +static struct ggml_cgraph * build_graph_piter( + const pca_model & model, + bool calc_square = false, + int nb_iterations = 1) { + GGML_ASSERT(nb_iterations > 0); + static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(); static std::vector buf(buf_size); struct ggml_init_params params0 = { @@ -109,64 +112,55 @@ 
struct ggml_cgraph * square_diff_graph(const pca_model & model) { /*.mem_buffer =*/ buf.data(), /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() }; + // create a temporally context to build the graph struct ggml_context * ctx0 = ggml_init(params0); - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * square = ggml_mul_mat(ctx0, model.v_diff_original, model.v_diff_original); - //struct ggml_tensor * square_transpose = ggml_transpose(ctx0, square); - - ggml_build_forward_expand(gf, square); - - ggml_free(ctx0); - return gf; -} + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + // turn v_diff_original into square matrix if needed + if (calc_square) { + //struct ggml_tensor * v_diff_transposed = ggml_transpose(ctx0, model.v_diff_original); + struct ggml_tensor * square = ggml_mul_mat(ctx0, model.v_diff_original, model.v_diff_original); + ggml_set_name(square, "square"); + //model.square = ggml_scale_inplace(ctx0, model.square, 0.0); + } -struct ggml_tensor * compute_square(const pca_model & model, ggml_gallocr_t allocr, int n_threads) { - struct ggml_cgraph * gf = square_diff_graph(model); + struct ggml_tensor * b_tensor; - ggml_gallocr_alloc_graph(allocr, gf); + for (int i = 0; i < nb_iterations; ++i) { + // b_tensor = square * eigenvector^T + b_tensor = ggml_mul_mat(ctx0, model.square, model.eigenvector); + ggml_set_name(b_tensor, "b_tensor"); - if (ggml_backend_is_cpu(model.backend)) { - ggml_backend_cpu_set_n_threads(model.backend, n_threads); + // normalize + b_tensor = ggml_div_inplace(ctx0, + b_tensor, + ggml_sqrt_inplace(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, b_tensor))) + ); } -#ifdef GGML_USE_METAL - if (ggml_backend_is_metal(model.backend)) { - ggml_backend_metal_set_n_cb(model.backend, n_threads); + // calculate distance + struct ggml_tensor * distance; + { + distance = ggml_sub(ctx0, model.eigenvector, b_tensor); + ggml_set_name(distance, "distance"); + distance = ggml_sqrt_inplace(ctx0, + ggml_sum_rows(ctx0, ggml_sqr_inplace(ctx0, distance))); } -#endif - - ggml_backend_graph_compute(model.backend, gf); - - return gf->nodes[gf->n_nodes - 1]; -} -struct ggml_cgraph * power_iteration_graph(const pca_model & model, float tolerance) { - static size_t buf_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(); - static std::vector buf(buf_size); - - struct ggml_init_params params0 = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf.data(), - /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() - }; - struct ggml_context * ctx0 = ggml_init(params0); - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * b_tensor = ggml_mul_mat(ctx0, model.square, model.eigenvector); - // TODO difference between ggml_norm and ggml_norm_inplace? - // also is this the right way to do multi-step graphs? 
- b_tensor = ggml_norm_inplace(ctx0, b_tensor, tolerance); - - ggml_build_forward_expand(gf, b_tensor); + // build operations nodes + ggml_build_forward_expand(gf, distance); + // delete the temporally context used to build the graph ggml_free(ctx0); return gf; } -struct ggml_tensor * compute_piter(const pca_model & model, ggml_gallocr_t allocr, int n_threads, float tolerance) { - struct ggml_cgraph * gf = power_iteration_graph(model, tolerance); - +struct ggml_tensor * compute_piter( + const pca_model & model, + struct ggml_cgraph * gf, + ggml_gallocr_t allocr, + int n_threads) { + // allocate tensors ggml_gallocr_alloc_graph(allocr, gf); if (ggml_backend_is_cpu(model.backend)) { @@ -181,25 +175,26 @@ struct ggml_tensor * compute_piter(const pca_model & model, ggml_gallocr_t alloc ggml_backend_graph_compute(model.backend, gf); + // in this case, the output tensor is the last one in the graph return gf->nodes[gf->n_nodes - 1]; } -static void power_iteration(struct ggml_tensor * input, struct ggml_tensor * output, int n_threads, int maxIterations = 1000, float tolerance = 1e-7) { +static void power_iteration( + struct ggml_tensor * input, + struct ggml_tensor * output, + int n_threads, + int maxIterations = 1000, + float tolerance = 1e-7) { printf("in power iteration\n"); - int n_embd = input->ne[0];// shape of input: [n_embd, m] + int n_embd = input->ne[0]; // shape of input: [n_embd, m] pca_model model; load_pca_model(model, input); - ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - - struct ggml_tensor * square = compute_square(model, allocr, n_threads); - ggml_backend_tensor_set(model.square, square->data, 0, ggml_nbytes(model.square)); - - ggml_gallocr_free(allocr); + ggml_gallocr_t allocr = NULL; struct ggml_init_params host_params = { - /*.mem_size =*/ (n_embd * sizeof(float) + ggml_tensor_overhead()) * 2u, + /*.mem_size =*/ (n_embd * sizeof(float) + ggml_tensor_overhead()) * 4u, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false, }; @@ -209,33 +204,19 @@ static void power_iteration(struct ggml_tensor * input, struct ggml_tensor * out struct ggml_tensor * host_new_eigenvector = ggml_new_tensor_1d(host_ctx, GGML_TYPE_F32, n_embd); for (int iter = 0; iter < maxIterations; ++iter) { - - // TODO do I need to reset it like this every time? 
- allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - - struct ggml_tensor * b_tensor = compute_piter(model, allocr, n_threads, tolerance); - - ggml_backend_tensor_get(b_tensor, host_new_eigenvector->data, 0, ggml_nbytes(b_tensor)); - ggml_backend_tensor_get(model.eigenvector, host_old_eigenvector->data, 0, ggml_nbytes(model.eigenvector)); - - // convergence check - float diff = 0.0; - for (int i = 0; i < n_embd; ++i) { - diff += std::pow((ggml_get_f32_1d(host_new_eigenvector, i) - ggml_get_f32_1d(host_old_eigenvector, i)), 2); - } - - // update eigenvector - ggml_backend_tensor_set(model.eigenvector, host_new_eigenvector->data, 0, ggml_nbytes(model.eigenvector)); - - try { - if (std::sqrt(diff) < tolerance) { - break; - } - } - catch (std::exception & e) { - // catch division by zero I guess - break; + if (allocr) { + ggml_gallocr_free(allocr); } + allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); + struct ggml_cgraph * gf = build_graph_piter(model, iter == 0); + printf("kkk\n"); + ggml_graph_dump_dot(gf, nullptr, "/tmp/_cgraph.dot"); + struct ggml_tensor * distance = compute_piter(model, gf, allocr, n_threads); + + ggml_backend_tensor_get(distance, host_new_eigenvector->data, 0, ggml_nbytes(distance)); + print_debug_tensor(host_new_eigenvector); + + break; // FIXME } ggml_backend_tensor_get(model.eigenvector, output->data, 0, ggml_nbytes(model.eigenvector)); @@ -245,11 +226,12 @@ static void power_iteration(struct ggml_tensor * input, struct ggml_tensor * out ggml_free(model.ctx); ggml_backend_buffer_free(model.buffer); ggml_backend_free(model.backend); + exit(0); } -static void pca( - const std::vector & v_input, - const std::vector & v_output) { +static void run_pca( + const std::vector & v_input, + const std::vector & v_output) { printf("Running PCA...\n"); int n_embd = v_input[0]->ne[0]; // shape of v_input[0]: [n_embd, m] int n_threads = 8; // TODO: change me @@ -265,3 +247,5 @@ static void pca( } printf("Done with PCA.\n"); } + +} From 6a5adf3d7c6be27830cad8dff6f473d3faf38d41 Mon Sep 17 00:00:00 2001 From: ngxson Date: Tue, 11 Jun 2024 01:33:16 +0200 Subject: [PATCH 30/56] fix shape of v_diff_original --- .../control-vector-generator.cpp | 3 ++- examples/control-vector-generator/pca.hpp | 16 +++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index ebff76edce220..2e7f36635e5a3 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -210,8 +210,9 @@ struct train_context { auto & diff_tmp = v_diff_tmp[il]; int n_elem = diff_tmp.size() / sizeof(float); int n_rows = n_elem / n_embd; - struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_embd, n_rows); + struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd); ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str()); + // TODO: IMPORTANT!! 
transpose diff diff->data = diff_tmp.data(); v_diff.push_back(diff); } diff --git a/examples/control-vector-generator/pca.hpp b/examples/control-vector-generator/pca.hpp index f279268ce9e39..a7c76e561f6a8 100644 --- a/examples/control-vector-generator/pca.hpp +++ b/examples/control-vector-generator/pca.hpp @@ -72,11 +72,11 @@ void load_pca_model(pca_model & model, struct ggml_tensor * input) { }; model.ctx = ggml_init(params); - auto n_embd = input->ne[0]; - auto n_samples = input->ne[1]; + auto n_embd = input->ne[1]; + auto n_samples = input->ne[0]; - model.v_diff_original = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_embd, n_samples); - model.square = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_embd, n_embd); + model.v_diff_original = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_samples, n_embd); + model.square = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_embd, n_embd); model.eigenvector = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, n_embd); ggml_set_name(model.v_diff_original, "v_diff_original"); @@ -117,9 +117,11 @@ static struct ggml_cgraph * build_graph_piter( struct ggml_cgraph * gf = ggml_new_graph(ctx0); // turn v_diff_original into square matrix if needed + struct ggml_tensor * square; if (calc_square) { //struct ggml_tensor * v_diff_transposed = ggml_transpose(ctx0, model.v_diff_original); - struct ggml_tensor * square = ggml_mul_mat(ctx0, model.v_diff_original, model.v_diff_original); + print_debug_tensor(model.v_diff_original); + square = ggml_mul_mat(ctx0, model.v_diff_original, model.v_diff_original); ggml_set_name(square, "square"); //model.square = ggml_scale_inplace(ctx0, model.square, 0.0); } @@ -128,7 +130,7 @@ static struct ggml_cgraph * build_graph_piter( for (int i = 0; i < nb_iterations; ++i) { // b_tensor = square * eigenvector^T - b_tensor = ggml_mul_mat(ctx0, model.square, model.eigenvector); + b_tensor = ggml_mul_mat(ctx0, square, model.eigenvector); ggml_set_name(b_tensor, "b_tensor"); // normalize @@ -209,7 +211,6 @@ static void power_iteration( } allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); struct ggml_cgraph * gf = build_graph_piter(model, iter == 0); - printf("kkk\n"); ggml_graph_dump_dot(gf, nullptr, "/tmp/_cgraph.dot"); struct ggml_tensor * distance = compute_piter(model, gf, allocr, n_threads); @@ -236,6 +237,7 @@ static void run_pca( int n_embd = v_input[0]->ne[0]; // shape of v_input[0]: [n_embd, m] int n_threads = 8; // TODO: change me for (size_t il = 0; il < v_input.size(); ++il) { + print_debug_tensor(v_input[il]); // prepare output vector struct ggml_tensor * ctrl_out = v_output[il]; auto name = std::string("direction.") + std::to_string(il + 1); From 9e39571fc2ee5a76230215dcde85b216b0a3e086 Mon Sep 17 00:00:00 2001 From: ngxson Date: Tue, 11 Jun 2024 11:45:16 +0200 Subject: [PATCH 31/56] add n_batch for pca --- .../control-vector-generator.cpp | 8 +- examples/control-vector-generator/pca.hpp | 294 +++++++++++------- 2 files changed, 184 insertions(+), 118 deletions(-) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index 2e7f36635e5a3..a65ceba0e051e 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -188,7 +188,9 @@ struct train_context { for (int il = 0; il < n_layers - 1; il++) { std::vector empty; v_diff_tmp.push_back(empty); - v_final.push_back(ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd)); + auto t = 
ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd); + t->data = malloc(ggml_nbytes(t)); // TODO: get rid of malloc if possible + v_final.push_back(t); } } @@ -625,7 +627,9 @@ int main(int argc, char ** argv) { ctx_train.build_v_diff(); // run PCA - PCA::run_pca(ctx_train.v_diff, ctx_train.v_final); + PCA::pca_params pca_params; + PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final); + exit(0); // TODO: REMOVE ME !!!!!!!!!!!!!!!!!!!!!!!! // write output vectors to gguf export_gguf(ctx_train.v_final, cparams.outfile, model_hint); diff --git a/examples/control-vector-generator/pca.hpp b/examples/control-vector-generator/pca.hpp index a7c76e561f6a8..67b914a34294a 100644 --- a/examples/control-vector-generator/pca.hpp +++ b/examples/control-vector-generator/pca.hpp @@ -20,8 +20,9 @@ #define DEBUG_POS 5 -static void print_debug_tensor(struct ggml_tensor * t) { +static void print_debug_tensor(struct ggml_tensor * t, bool with_data = true) { printf("%s: %s (%s): [%ld, %ld]\n", __func__, t->name, ggml_type_name(t->type), t->ne[0], t->ne[1]); + if (!with_data) return; printf("%s: %s[0] = [", __func__, t->name); for (size_t i = 0; i <= DEBUG_POS; i++) { printf(" %f,", ggml_get_f32_nd(t, i, 0, 0, 0)); @@ -31,79 +32,121 @@ static void print_debug_tensor(struct ggml_tensor * t) { namespace PCA { -struct pca_model { - struct ggml_tensor * v_diff_original; - struct ggml_tensor * square; - struct ggml_tensor * eigenvector; +// input params for PCA computations +struct pca_params { + int n_threads = 1; + int n_batch = 5; // number of iterations do to in one batch. larger the batch, more memory is used + int n_iterations = 1000; + float tolerance = 1e-7; +}; +// result from each iteration +struct pca_result { + std::vector eigenvectors; + std::vector distances; +}; + +struct pca_model { ggml_backend_t backend = NULL; ggml_backend_buffer_t buffer; - struct ggml_context * ctx; -}; + struct ggml_context * ctx; // context to compute graph on target device + struct ggml_context * ctx_host; // host context to store results + + // tensors on target device + struct ggml_tensor * dev_input; + struct ggml_tensor * dev_square; + struct ggml_tensor * dev_eigenvector; + + // tensors to store output data on host + struct ggml_tensor * host_eigenvector; -void load_pca_model(pca_model & model, struct ggml_tensor * input) { + pca_model(struct ggml_tensor * t_input) { #ifdef GGML_USE_CUDA - fprintf(stderr, "%s: using CUDA backend\n", __func__); - model.backend = ggml_backend_cuda_init(0); // init device 0 - if (!model.backend) { - fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__); - } + fprintf(stderr, "%s: using CUDA backend\n", __func__); + backend = ggml_backend_cuda_init(0); // init device 0 + if (!backend) { + fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__); + } #endif #ifdef GGML_USE_METAL - fprintf(stderr, "%s: using Metal backend\n", __func__); - ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr); - model.backend = ggml_backend_metal_init(); - if (!model.backend) { - fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); - } + fprintf(stderr, "%s: using Metal backend\n", __func__); + ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr); + backend = ggml_backend_metal_init(); + if (!backend) { + fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); + } #endif - // if there aren't GPU Backends fallback to CPU backend - if (!model.backend) { - model.backend = ggml_backend_cpu_init(); - } - - const int 
num_tensors = 4; - struct ggml_init_params params { - /*.mem_size =*/ ggml_tensor_overhead() * num_tensors, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ true, - }; - model.ctx = ggml_init(params); - - auto n_embd = input->ne[1]; - auto n_samples = input->ne[0]; - - model.v_diff_original = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_samples, n_embd); - model.square = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_embd, n_embd); - model.eigenvector = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, n_embd); - - ggml_set_name(model.v_diff_original, "v_diff_original"); - ggml_set_name(model.square, "square"); - ggml_set_name(model.eigenvector, "eigenvector"); - - model.buffer = ggml_backend_alloc_ctx_tensors(model.ctx, model.backend); + // if there aren't GPU Backends fallback to CPU backend + if (!backend) { + backend = ggml_backend_cpu_init(); + } - ggml_backend_tensor_set(model.v_diff_original, input->data, 0, ggml_nbytes(input)); + const int num_tensors = 4; + struct ggml_init_params params { + /*.mem_size =*/ ggml_tensor_overhead() * num_tensors, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ctx = ggml_init(params); + + auto n_samples = t_input->ne[0]; + auto n_embd = t_input->ne[1]; + + dev_input = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_samples, n_embd); + dev_square = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); + dev_eigenvector = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + ggml_set_name(dev_input, "dev_input"); + ggml_set_name(dev_square, "dev_square"); + ggml_set_name(dev_eigenvector, "dev_eigenvector"); + buffer = ggml_backend_alloc_ctx_tensors(ctx, backend); + ggml_backend_tensor_set(dev_input, t_input->data, 0, ggml_nbytes(t_input)); + + // initialize eigenvector to random normalized vector + { + std::vector random_vec(ggml_nelements(dev_eigenvector), 0.0); + std::default_random_engine generator(static_cast(std::time(0))); + std::uniform_real_distribution distribution(0.0, 1.0); + float sum_sqr = 0.0; // for normalizing random_vec + for (size_t i = 0; i < random_vec.size(); ++i) { + float f = distribution(generator); + sum_sqr += f * f; + random_vec[i] = f; + } + // normalize it + float random_vec_norm = std::sqrt(sum_sqr); + for (size_t i = 0; i < random_vec.size(); ++i) { + random_vec[i] /= random_vec_norm; + } + ggml_backend_tensor_set(dev_eigenvector, random_vec.data(), 0, ggml_nbytes(dev_eigenvector)); + } - // initialize model.eigenvector to random vector - std::vector random_vec; - std::default_random_engine generator(static_cast(std::time(0))); - std::uniform_real_distribution distribution(0.0, 1.0); - for (int i = 0; i < ggml_nelements(model.eigenvector); ++i) { - random_vec.push_back(distribution(generator)); + // init host context + struct ggml_init_params host_params = { + /*.mem_size =*/ (n_embd * sizeof(float) + ggml_tensor_overhead()) * 2u, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + ctx_host = ggml_init(host_params); + host_eigenvector = ggml_new_tensor_1d(ctx_host, GGML_TYPE_F32, n_embd); } - // we don't normalize it at first but that shouldn't be a problem - ggml_backend_tensor_set(model.eigenvector, random_vec.data(), 0, ggml_nbytes(model.eigenvector)); -} + ~pca_model() { + ggml_free(ctx_host); + ggml_free(ctx); + ggml_backend_buffer_free(buffer); + ggml_backend_free(backend); + } +}; static struct ggml_cgraph * build_graph_piter( + const struct pca_params & params, const pca_model & model, - bool calc_square = false, - int nb_iterations = 1) { - GGML_ASSERT(nb_iterations > 0); + bool calc_square = false) { + 
GGML_ASSERT(params.n_batch > 0); + // TODO: buf_size must be able to scale with params.n_batch static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(); static std::vector buf(buf_size); @@ -117,20 +160,21 @@ static struct ggml_cgraph * build_graph_piter( struct ggml_cgraph * gf = ggml_new_graph(ctx0); // turn v_diff_original into square matrix if needed - struct ggml_tensor * square; + struct ggml_tensor * tmp_square; if (calc_square) { - //struct ggml_tensor * v_diff_transposed = ggml_transpose(ctx0, model.v_diff_original); - print_debug_tensor(model.v_diff_original); - square = ggml_mul_mat(ctx0, model.v_diff_original, model.v_diff_original); - ggml_set_name(square, "square"); - //model.square = ggml_scale_inplace(ctx0, model.square, 0.0); + print_debug_tensor(model.dev_input); + tmp_square = ggml_mul_mat(ctx0, model.dev_input, model.dev_input); + ggml_set_name(tmp_square, "tmp_square"); } struct ggml_tensor * b_tensor; + struct ggml_tensor * distance; + struct ggml_tensor * old_eigen = model.dev_eigenvector; + struct ggml_tensor * input_square = calc_square ? tmp_square : model.dev_square; - for (int i = 0; i < nb_iterations; ++i) { + for (int i = 0; i < params.n_batch; ++i) { // b_tensor = square * eigenvector^T - b_tensor = ggml_mul_mat(ctx0, square, model.eigenvector); + b_tensor = ggml_mul_mat(ctx0, input_square, old_eigen); ggml_set_name(b_tensor, "b_tensor"); // normalize @@ -138,104 +182,122 @@ static struct ggml_cgraph * build_graph_piter( b_tensor, ggml_sqrt_inplace(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, b_tensor))) ); - } + ggml_set_name(b_tensor, ("b_tensor_norm_" + std::to_string(i)).c_str()); - // calculate distance - struct ggml_tensor * distance; - { - distance = ggml_sub(ctx0, model.eigenvector, b_tensor); - ggml_set_name(distance, "distance"); + // calculate distance(new eigenvector - old eigenvector) + struct ggml_tensor * new_sub_old = ggml_sub(ctx0, old_eigen, b_tensor); distance = ggml_sqrt_inplace(ctx0, - ggml_sum_rows(ctx0, ggml_sqr_inplace(ctx0, distance))); - } + ggml_sum_rows(ctx0, ggml_sqr_inplace(ctx0, new_sub_old))); + ggml_set_name(distance, ("distance_" + std::to_string(i)).c_str()); - // build operations nodes - ggml_build_forward_expand(gf, distance); + old_eigen = b_tensor; + + // build operations nodes + ggml_build_forward_expand(gf, distance); + } // delete the temporally context used to build the graph ggml_free(ctx0); return gf; } -struct ggml_tensor * compute_piter( +static ggml_status compute_piter( + const struct pca_params & params, const pca_model & model, struct ggml_cgraph * gf, ggml_gallocr_t allocr, - int n_threads) { + struct pca_result & result) { // allocate tensors ggml_gallocr_alloc_graph(allocr, gf); if (ggml_backend_is_cpu(model.backend)) { - ggml_backend_cpu_set_n_threads(model.backend, n_threads); + ggml_backend_cpu_set_n_threads(model.backend, params.n_threads); } #ifdef GGML_USE_METAL if (ggml_backend_is_metal(model.backend)) { - ggml_backend_metal_set_n_cb(model.backend, n_threads); + ggml_backend_metal_set_n_cb(model.backend, params.n_threads); } #endif - ggml_backend_graph_compute(model.backend, gf); - - // in this case, the output tensor is the last one in the graph - return gf->nodes[gf->n_nodes - 1]; + ggml_status res = ggml_backend_graph_compute(model.backend, gf); + if (res == GGML_STATUS_SUCCESS) { + auto extract_i = [](std::string prefix, std::string str) -> int { + int i = -1; + if (str.rfind(prefix, 0) == 0) { + sscanf(str.c_str(), (prefix + "%d").c_str(), &i); + } + return i; 
+ }; + // get output nodes + result.eigenvectors.clear(); + result.distances.clear(); + result.eigenvectors.resize(params.n_batch); + result.distances.resize(params.n_batch); + for (int i = 0; i < gf->n_nodes; ++i) { + auto node = gf->nodes[i]; + int iter = -1; + // find b_tensor (without copying data from device) + if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) { + print_debug_tensor(node, false); + result.eigenvectors[iter] = node; + } + // find distances, then copy data from device + if ((iter = extract_i("distance_", node->name)) > -1) { + float d; + ggml_backend_tensor_get(node, &d, 0, sizeof(float)); + result.distances[iter] = d; + std::cout << node->name << " = " << d << "\n"; + } + } + } + return res; } static void power_iteration( - struct ggml_tensor * input, - struct ggml_tensor * output, - int n_threads, - int maxIterations = 1000, - float tolerance = 1e-7) { + const struct pca_params & params, + struct ggml_tensor * input, // shape of input: [n_samples, n_embd] + struct ggml_tensor * output) { printf("in power iteration\n"); - int n_embd = input->ne[0]; // shape of input: [n_embd, m] - - pca_model model; - load_pca_model(model, input); + //int n_embd = input->ne[1]; + struct pca_model model(input); ggml_gallocr_t allocr = NULL; + struct pca_result result; + struct ggml_tensor * last_eigenvector; - struct ggml_init_params host_params = { - /*.mem_size =*/ (n_embd * sizeof(float) + ggml_tensor_overhead()) * 4u, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - struct ggml_context * host_ctx = ggml_init(host_params); - - struct ggml_tensor * host_old_eigenvector = ggml_new_tensor_1d(host_ctx, GGML_TYPE_F32, n_embd); - struct ggml_tensor * host_new_eigenvector = ggml_new_tensor_1d(host_ctx, GGML_TYPE_F32, n_embd); - - for (int iter = 0; iter < maxIterations; ++iter) { + int n_iter = params.n_iterations / params.n_batch; // more batch, fewer iterations + for (int iter = 0; iter < n_iter; ++iter) { + bool calc_square = (iter == 0); // only need to calculate square for first iteration if (allocr) { ggml_gallocr_free(allocr); } allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); - struct ggml_cgraph * gf = build_graph_piter(model, iter == 0); + struct ggml_cgraph * gf = build_graph_piter(params, model, calc_square); ggml_graph_dump_dot(gf, nullptr, "/tmp/_cgraph.dot"); - struct ggml_tensor * distance = compute_piter(model, gf, allocr, n_threads); + compute_piter(params, model, gf, allocr, result); - ggml_backend_tensor_get(distance, host_new_eigenvector->data, 0, ggml_nbytes(distance)); - print_debug_tensor(host_new_eigenvector); + for (size_t k = 0; k < result.distances.size(); ++k) { + last_eigenvector = result.eigenvectors[k]; + if (result.distances[k] < params.tolerance) { + break; // done + } + } break; // FIXME } - ggml_backend_tensor_get(model.eigenvector, output->data, 0, ggml_nbytes(model.eigenvector)); - + ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector)); + print_debug_tensor(output); ggml_gallocr_free(allocr); - ggml_free(host_ctx); - ggml_free(model.ctx); - ggml_backend_buffer_free(model.buffer); - ggml_backend_free(model.backend); - exit(0); } static void run_pca( + const struct pca_params & params, const std::vector & v_input, const std::vector & v_output) { printf("Running PCA...\n"); int n_embd = v_input[0]->ne[0]; // shape of v_input[0]: [n_embd, m] - int n_threads = 8; // TODO: change me for (size_t il = 0; il < v_input.size(); ++il) { print_debug_tensor(v_input[il]); // prepare 
output vector @@ -243,7 +305,7 @@ static void run_pca( auto name = std::string("direction.") + std::to_string(il + 1); ggml_set_name(ctrl_out, name.c_str()); // run power_iteration - power_iteration(v_input[il], ctrl_out, n_threads); + power_iteration(params, v_input[il], ctrl_out); printf("Done with layer %d\n", il); print_debug_tensor(ctrl_out); } From 1a088fb0a572d97f8f1a9e02ddde78aad69e03d0 Mon Sep 17 00:00:00 2001 From: ngxson Date: Tue, 11 Jun 2024 12:37:05 +0200 Subject: [PATCH 32/56] working version --- .../control-vector-generator.cpp | 102 ++++++++++-------- examples/control-vector-generator/pca.hpp | 58 ++++++---- 2 files changed, 95 insertions(+), 65 deletions(-) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index a65ceba0e051e..07086a635b9f3 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -70,7 +70,7 @@ struct callback_data { t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes); ggml_set_name(t_layer, ggml_get_name(t)); - print_debug_tensor(t_layer); + //print_debug_tensor(t_layer); if (is_eval_pos) { v_pos.push_back(t_layer); @@ -99,7 +99,7 @@ struct callback_data { // delete zero rows from a given 2D tensor struct ggml_tensor * filter_nonzero_rows(struct ggml_tensor * a) { - printf("filter_nonzero_rows\n"); + //printf("filter_nonzero_rows\n"); auto is_row_all_zeros = [](struct ggml_tensor * t, int row, float eps) -> bool { // check if given row containing all zero elements int n_cols = t->ne[0]; // hint: should be equal to n_embd @@ -119,7 +119,7 @@ struct callback_data { // get "n_nonzero_rows" for the output "diff_filtered" int n_nonzero_rows = rows_to_copy.size(); - printf("n_nonzero_rows: %d\n", n_nonzero_rows); + //printf("n_nonzero_rows: %d\n", n_nonzero_rows); int n_embd = a->ne[0]; GGML_ASSERT(n_nonzero_rows > 0); @@ -138,7 +138,7 @@ struct callback_data { } } - print_debug_tensor(diff_filtered); + //print_debug_tensor(diff_filtered); return diff_filtered; } @@ -169,7 +169,8 @@ struct train_context { // each element of the vector correspond to one layer // NOTE: the last layer is discard. 
therefore, we will have (n_layers - 1) elements here - std::vector v_diff; // vector of matrices of size [n_embd, m] where m ~ n_tokens * n_completions (v_diff contains no zero-rows) + // NOTE (2): v_diff is transposed from v_diff_tmp + std::vector v_diff; // vector of matrices of size [m, n_embd] where m ~ n_tokens * n_completions (v_diff contains no zero-rows) std::vector v_final; // vector of vectors of size [n_embd] to be written to file // to easily re-alloc when concat v_diff, we temporary store v_diff in a vector instead of a tensor @@ -196,7 +197,7 @@ struct train_context { // add new rows into existing tensor in v_diff_tmp void concat_diff_tmp(const std::vector & diff_filtered) { - GGML_ASSERT(diff_filtered.size() == n_layers - 1); + GGML_ASSERT((int) diff_filtered.size() == n_layers - 1); for (int il = 0; il < n_layers - 1; il++) { auto t = diff_filtered[il]; auto & diff_tmp = v_diff_tmp[il]; @@ -206,32 +207,46 @@ struct train_context { } } - // build the v_diff tensors from v_diff_tmp + // build the v_diff tensors from v_diff_tmp (v_diff need to be transposed) void build_v_diff() { + printf("build_v_diff\n"); for (int il = 0; il < n_layers - 1; il++) { auto & diff_tmp = v_diff_tmp[il]; int n_elem = diff_tmp.size() / sizeof(float); + GGML_ASSERT(n_elem % n_embd == 0); int n_rows = n_elem / n_embd; struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd); ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str()); - // TODO: IMPORTANT!! transpose diff - diff->data = diff_tmp.data(); + // copy data & transpose + diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible + float * arr = (float *) diff_tmp.data(); + for (int ir = 0; ir < n_rows; ++ir) { + for (int ic = 0; ic < n_embd; ++ic) { + float f = arr[ir*n_embd + ic]; + //std::cout << ir << "," << ic << " = " << f << "\n"; + ggml_set_f32_nd(diff, ir, ic, 0, 0, f); + } + } v_diff.push_back(diff); + print_debug_tensor(diff); + // free memory of diff_tmp + diff_tmp.resize(0); } } ~train_context() { for (auto ptr : v_final) free(ptr->data); - // no need to free v_diff_tmp or v_diff, since we didn't use malloc + for (auto ptr : v_diff) free(ptr->data); + // no need to free v_diff_tmp, since we didn't use malloc ggml_free(ctx_ggml); } }; struct ctrl_params { /* default meta parameters */ - bool always_reload = false; int n_completions = 64; - int n_threads = 8; + int n_pca_batch = 5; + int n_pca_iterations = 1000; /* default filepaths */ std::string outfile = "control_vector.gguf"; @@ -295,9 +310,10 @@ static void print_usage(const char * executable) { printf(" default: 'examples/control-vector-generator/completions.txt'\n"); printf(" -nc, --num-completions N number of lines of completions file to use\n"); printf(" default: 64\n"); - printf(" -t, --num-threads N number of threads to use (do not confuse with gpt-opts -t)\n"); - printf(" default: 8\n"); - printf(" --always-reload reload the model for every new template to parse (not recommended)\n"); + printf(" --batch-pca N batch size used for PCA\n"); + printf(" default: 5\n"); + printf(" --iter-pca N number of iterations used for PCA\n"); + printf(" default: 1000\n"); printf("\n"); printf("gpt-opts:\n"); printf(" other options from main\n"); @@ -370,10 +386,10 @@ static int ctrlvec_params_parse_ex(int argc, char ** argv, ctrl_params & params) throw std::invalid_argument("error: missing argument for " + arg); } } - if (arg == "--num-threads" || arg == "-t") { + if (arg == "--pca-batch") { if (++arg_idx < argc && 
strncmp(argv[arg_idx], arg_prefix.c_str(), 2) != 0) { try { - params.n_threads = std::stoi(argv[arg_idx]); + params.n_pca_batch = std::stoi(argv[arg_idx]); } catch (const std::invalid_argument & ex) { throw std::invalid_argument("error: invalid argument for " + arg); @@ -383,9 +399,18 @@ static int ctrlvec_params_parse_ex(int argc, char ** argv, ctrl_params & params) throw std::invalid_argument("error: missing argument for " + arg); } } - if (arg == "--always-reload") { - params.always_reload = true; - skipme += 1; + if (arg == "--pca-iter") { + if (++arg_idx < argc && strncmp(argv[arg_idx], arg_prefix.c_str(), 2) != 0) { + try { + params.n_pca_iterations = std::stoi(argv[arg_idx]); + } + catch (const std::invalid_argument & ex) { + throw std::invalid_argument("error: invalid argument for " + arg); + } + skipme += 2; + } else { + throw std::invalid_argument("error: missing argument for " + arg); + } } // TODO it might be nice QoL to have single positive/negative args // we do not handle any other unknown arguments here because they will be handled by gpt_parse_params @@ -427,7 +452,7 @@ static std::vector ctrlvec_load_prompt_file(std::string path, bool static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { auto * cb_data = (callback_data *) user_data; - auto ggml_ne_string = [](const ggml_tensor * t) -> std::string { + /*auto ggml_ne_string = [](const ggml_tensor * t) -> std::string { std::string str; for (int i = 0; i < GGML_MAX_DIMS; ++i) { str += std::to_string(t->ne[i]); @@ -436,7 +461,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { } } return str; - }; + };*/ static const char * l_out_name = "l_out"; const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0; @@ -473,6 +498,7 @@ static void export_gguf(const std::vector & v_ctrl, const for (size_t i = 0; i < v_ctrl.size(); ++i) { gguf_add_tensor(ctx, v_ctrl[i]); + print_debug_tensor(v_ctrl[i]); printf("Added tensor: %s\n", v_ctrl[i]->name); } @@ -489,7 +515,7 @@ static void export_gguf(const std::vector & v_ctrl, const * Load prompt files and completion file. * Then format each pair of prompt + completion to make an entry. */ -int prepare_entries(ctrl_params & cparams) { +static int prepare_entries(ctrl_params & cparams) { // load prompts std::vector positive_prompts = ctrlvec_load_prompt_file(cparams.positive_prompts_file); std::vector negative_prompts = ctrlvec_load_prompt_file(cparams.negative_prompts_file); @@ -511,7 +537,7 @@ int prepare_entries(ctrl_params & cparams) { // TODO make this dynamic - allow the user to change it somehow - and adapt based on model return persona + " " + suffix; // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. 
[/INST]" }; - for (int i = 0; i < positive_prompts.size(); ++i) { + for (size_t i = 0; i < positive_prompts.size(); ++i) { for (auto & cmpl : completions) { // TODO replicate the truncations done by the python implementation cparams.positive_entries.push_back(format_template(positive_prompts[i], cmpl)); @@ -553,7 +579,7 @@ int main(int argc, char ** argv) { llama_context * ctx; std::tie(model, ctx) = llama_init_from_gpt_params(params); - int n_ctx = llama_n_ctx(ctx); + // int n_ctx = llama_n_ctx(ctx); int n_layers = llama_n_layer(model); int n_embd = llama_n_embd(model); // get model hint param (a.k.a model arch name) @@ -574,29 +600,13 @@ int main(int argc, char ** argv) { // init train_context train_context ctx_train(n_embd, n_layers); - int token_ct = 0; - for(size_t i = 0; i < cparams.positive_entries.size(); ++i) { tokenized_prompt t = tokenized_prompts[i]; cb_data.n_layers = n_layers; cb_data.n_tokens = t.max_seq_len; - // need to reload the model so it doesn't run out of context - // this should scale with -c option passed by main - token_ct += 2 * t.max_seq_len; - if (token_ct > n_ctx || cparams.always_reload) { - //break; - llama_free(ctx); - llama_free_model(model); - std::tie(model, ctx) = llama_init_from_gpt_params(params); - token_ct = 2 * t.max_seq_len; - } - if (token_ct > n_ctx) { - fprintf(stderr, "context size exceeded on iteration %zu\n", i); - break; - } - - printf("Evaluating prompt: \"%s\" - \"%s\" (%ld tokens)\n", + printf("Evaluating prompt[%ld/%ld]: \"%s\" - \"%s\" (%ld tokens)\n", + i+1, t.tokens_pos.size(), tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(), tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(), t.max_seq_len); @@ -610,12 +620,10 @@ int main(int argc, char ** argv) { auto v_diff_filtered = cb_data.calc_diff(); // save & concat the filtered v_diff to ctx_train - printf("concat_diff_tmp\n"); ctx_train.concat_diff_tmp(v_diff_filtered); // reset for next iteration cb_data.reset(); - printf("reset\n"); } // done with the model, we can now free it to make gain some memory @@ -628,8 +636,10 @@ int main(int argc, char ** argv) { // run PCA PCA::pca_params pca_params; + pca_params.n_threads = params.n_threads; + pca_params.n_batch = cparams.n_pca_batch; + pca_params.n_iterations = cparams.n_pca_iterations; PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final); - exit(0); // TODO: REMOVE ME !!!!!!!!!!!!!!!!!!!!!!!! // write output vectors to gguf export_gguf(ctx_train.v_final, cparams.outfile, model_hint); diff --git a/examples/control-vector-generator/pca.hpp b/examples/control-vector-generator/pca.hpp index 67b914a34294a..cd1760de93cc1 100644 --- a/examples/control-vector-generator/pca.hpp +++ b/examples/control-vector-generator/pca.hpp @@ -38,10 +38,15 @@ struct pca_params { int n_batch = 5; // number of iterations do to in one batch. 
larger the batch, more memory is used int n_iterations = 1000; float tolerance = 1e-7; + + // for debugging + int i_layer = 0; + int n_layers = 0; }; // result from each iteration struct pca_result { + struct ggml_tensor * calculated_square = NULL; std::vector eigenvectors; std::vector distances; }; @@ -162,7 +167,6 @@ static struct ggml_cgraph * build_graph_piter( // turn v_diff_original into square matrix if needed struct ggml_tensor * tmp_square; if (calc_square) { - print_debug_tensor(model.dev_input); tmp_square = ggml_mul_mat(ctx0, model.dev_input, model.dev_input); ggml_set_name(tmp_square, "tmp_square"); } @@ -229,17 +233,17 @@ static ggml_status compute_piter( } return i; }; - // get output nodes + result.calculated_square = NULL; result.eigenvectors.clear(); result.distances.clear(); result.eigenvectors.resize(params.n_batch); result.distances.resize(params.n_batch); + // get output nodes for (int i = 0; i < gf->n_nodes; ++i) { auto node = gf->nodes[i]; int iter = -1; // find b_tensor (without copying data from device) if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) { - print_debug_tensor(node, false); result.eigenvectors[iter] = node; } // find distances, then copy data from device @@ -247,7 +251,11 @@ static ggml_status compute_piter( float d; ggml_backend_tensor_get(node, &d, 0, sizeof(float)); result.distances[iter] = d; - std::cout << node->name << " = " << d << "\n"; + // std::cout << node->name << " = " << d << "\n"; + } + // find tmp_square if it exists (without copying data from device) + if (std::string(node->name) == "tmp_square") { + result.calculated_square = node; } } } @@ -258,23 +266,22 @@ static void power_iteration( const struct pca_params & params, struct ggml_tensor * input, // shape of input: [n_samples, n_embd] struct ggml_tensor * output) { - printf("in power iteration\n"); - //int n_embd = input->ne[1]; + //printf("in power iteration\n"); struct pca_model model(input); ggml_gallocr_t allocr = NULL; struct pca_result result; - struct ggml_tensor * last_eigenvector; + struct ggml_tensor * last_eigenvector = NULL; - int n_iter = params.n_iterations / params.n_batch; // more batch, fewer iterations - for (int iter = 0; iter < n_iter; ++iter) { + int n_iters = params.n_iterations / params.n_batch; // more batch, fewer iterations + for (int iter = 0; iter < n_iters; ++iter) { bool calc_square = (iter == 0); // only need to calculate square for first iteration if (allocr) { ggml_gallocr_free(allocr); } allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); struct ggml_cgraph * gf = build_graph_piter(params, model, calc_square); - ggml_graph_dump_dot(gf, nullptr, "/tmp/_cgraph.dot"); + // ggml_graph_dump_dot(gf, nullptr, "/tmp/_cgraph.dot"); compute_piter(params, model, gf, allocr, result); for (size_t k = 0; k < result.distances.size(); ++k) { @@ -283,31 +290,44 @@ static void power_iteration( break; // done } } - - break; // FIXME + + if (calc_square) { + // copy and store the square matrix if needed + GGML_ASSERT(result.calculated_square != NULL); + std::vector tmp_buf(ggml_nbytes(model.dev_square)); + ggml_backend_tensor_get(result.calculated_square, tmp_buf.data(), 0, tmp_buf.size()); + ggml_backend_tensor_set(model.dev_square, tmp_buf.data(), 0, tmp_buf.size()); + } + + printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n", + __func__, params.i_layer+1, params.n_layers, iter, n_iters, params.n_batch); } + // get output tensor + GGML_ASSERT(last_eigenvector); ggml_backend_tensor_get(last_eigenvector, 
output->data, 0, ggml_nbytes(last_eigenvector)); - print_debug_tensor(output); + //print_debug_tensor(output); ggml_gallocr_free(allocr); } static void run_pca( - const struct pca_params & params, - const std::vector & v_input, + struct pca_params & params, + const std::vector & v_input, // shape of v_input[0]: [n_samples, n_embd] const std::vector & v_output) { printf("Running PCA...\n"); - int n_embd = v_input[0]->ne[0]; // shape of v_input[0]: [n_embd, m] for (size_t il = 0; il < v_input.size(); ++il) { - print_debug_tensor(v_input[il]); + // prepare output vector struct ggml_tensor * ctrl_out = v_output[il]; auto name = std::string("direction.") + std::to_string(il + 1); ggml_set_name(ctrl_out, name.c_str()); + // run power_iteration + params.i_layer = il; + params.n_layers = v_input.size(); power_iteration(params, v_input[il], ctrl_out); - printf("Done with layer %d\n", il); - print_debug_tensor(ctrl_out); + printf("DONE layer %ld / %ld\n", il+1, v_input.size()); + //print_debug_tensor(ctrl_out); } printf("Done with PCA.\n"); } From 163916864ca7abd4eb0573389f27513997b8f178 Mon Sep 17 00:00:00 2001 From: ngxson Date: Tue, 11 Jun 2024 12:40:07 +0200 Subject: [PATCH 33/56] remember to copy back the last_eigenvector --- examples/control-vector-generator/pca.hpp | 26 +++++++++++++---------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/examples/control-vector-generator/pca.hpp b/examples/control-vector-generator/pca.hpp index cd1760de93cc1..e190e8815d6cd 100644 --- a/examples/control-vector-generator/pca.hpp +++ b/examples/control-vector-generator/pca.hpp @@ -62,9 +62,6 @@ struct pca_model { struct ggml_tensor * dev_square; struct ggml_tensor * dev_eigenvector; - // tensors to store output data on host - struct ggml_tensor * host_eigenvector; - pca_model(struct ggml_tensor * t_input) { #ifdef GGML_USE_CUDA fprintf(stderr, "%s: using CUDA backend\n", __func__); @@ -129,17 +126,16 @@ struct pca_model { } // init host context - struct ggml_init_params host_params = { - /*.mem_size =*/ (n_embd * sizeof(float) + ggml_tensor_overhead()) * 2u, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - ctx_host = ggml_init(host_params); - host_eigenvector = ggml_new_tensor_1d(ctx_host, GGML_TYPE_F32, n_embd); + //struct ggml_init_params host_params = { + // /*.mem_size =*/ (n_embd * sizeof(float) + ggml_tensor_overhead()) * 2u, + // /*.mem_buffer =*/ NULL, + // /*.no_alloc =*/ false, + //}; + //ctx_host = ggml_init(host_params); + //host_eigenvector = ggml_new_tensor_1d(ctx_host, GGML_TYPE_F32, n_embd); } ~pca_model() { - ggml_free(ctx_host); ggml_free(ctx); ggml_backend_buffer_free(buffer); ggml_backend_free(backend); @@ -299,6 +295,14 @@ static void power_iteration( ggml_backend_tensor_set(model.dev_square, tmp_buf.data(), 0, tmp_buf.size()); } + { + // copy last eigen vector and store as input for next iteration + GGML_ASSERT(last_eigenvector != NULL); + std::vector tmp_buf(ggml_nbytes(last_eigenvector)); + ggml_backend_tensor_get(last_eigenvector, tmp_buf.data(), 0, tmp_buf.size()); + ggml_backend_tensor_set(model.dev_eigenvector, tmp_buf.data(), 0, tmp_buf.size()); + } + printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n", __func__, params.i_layer+1, params.n_layers, iter, n_iters, params.n_batch); } From 446da906d90a02b3a1d312bebe075c8213f64f7f Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng Date: Tue, 11 Jun 2024 08:22:38 -0400 Subject: [PATCH 34/56] fix n_completions --- .../control-vector-generator/control-vector-generator.cpp | 6 +++--- 1 file 
changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index 07086a635b9f3..674a0ac416e92 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -538,10 +538,10 @@ static int prepare_entries(ctrl_params & cparams) { return persona + " " + suffix; // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST]" }; for (size_t i = 0; i < positive_prompts.size(); ++i) { - for (auto & cmpl : completions) { + for (size_t j = 0; j < completions.size() && j < cparams.n_completions; ++j) { // TODO replicate the truncations done by the python implementation - cparams.positive_entries.push_back(format_template(positive_prompts[i], cmpl)); - cparams.negative_entries.push_back(format_template(negative_prompts[i], cmpl)); + cparams.positive_entries.push_back(format_template(positive_prompts[i], completions[j])); + cparams.negative_entries.push_back(format_template(negative_prompts[i], completions[j])); } } return 0; From d41c71998084b5cdcc53db91607efa00d2a2d18a Mon Sep 17 00:00:00 2001 From: ngxson Date: Tue, 11 Jun 2024 14:31:45 +0200 Subject: [PATCH 35/56] bring back n_completions --- .../control-vector-generator/control-vector-generator.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index 674a0ac416e92..84fb745d5a689 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -18,6 +18,7 @@ #include #include #include +#include ////////////////////////////////////////////////// @@ -244,7 +245,7 @@ struct train_context { struct ctrl_params { /* default meta parameters */ - int n_completions = 64; + int n_completions = INT_MAX; int n_pca_batch = 5; int n_pca_iterations = 1000; @@ -538,7 +539,7 @@ static int prepare_entries(ctrl_params & cparams) { return persona + " " + suffix; // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. 
[/INST]" }; for (size_t i = 0; i < positive_prompts.size(); ++i) { - for (size_t j = 0; j < completions.size() && j < cparams.n_completions; ++j) { + for (int j = 0; j < std::min((int) completions.size(), cparams.n_completions); ++j) { // TODO replicate the truncations done by the python implementation cparams.positive_entries.push_back(format_template(positive_prompts[i], completions[j])); cparams.negative_entries.push_back(format_template(negative_prompts[i], completions[j])); @@ -606,7 +607,7 @@ int main(int argc, char ** argv) { cb_data.n_tokens = t.max_seq_len; printf("Evaluating prompt[%ld/%ld]: \"%s\" - \"%s\" (%ld tokens)\n", - i+1, t.tokens_pos.size(), + i+1, cparams.positive_entries.size(), tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(), tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(), t.max_seq_len); From 3223133cf56631d48591012ff82b95d2bc085648 Mon Sep 17 00:00:00 2001 From: ngxson Date: Tue, 11 Jun 2024 15:05:06 +0200 Subject: [PATCH 36/56] default n_pca_batch to 20 --- .../control-vector-generator.cpp | 19 ++++++++++--------- examples/control-vector-generator/pca.hpp | 2 +- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index 84fb745d5a689..f713fb3d7ef81 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -246,7 +246,7 @@ struct train_context { struct ctrl_params { /* default meta parameters */ int n_completions = INT_MAX; - int n_pca_batch = 5; + int n_pca_batch = 20; int n_pca_iterations = 1000; /* default filepaths */ @@ -294,6 +294,7 @@ static std::string to_string(const T & val) { } static void print_usage(const char * executable) { + struct ctrl_params defaults; printf("\n"); printf("usage: %s [options] -m [gpt-opts]", executable); printf("\n"); @@ -302,19 +303,19 @@ static void print_usage(const char * executable) { printf("options:\n"); printf(" -h, --help show this help message and exit\n"); printf(" -o, --outfile output file\n"); - printf(" default: 'control_vector.gguf'\n"); + printf(" default: %s\n", defaults.outfile.c_str()); printf(" -pf, --positive-file positive prompts file, one prompt per line\n"); - printf(" default: 'examples/control-vector-generator/positive.txt'\n"); + printf(" default: %s\n", defaults.positive_prompts_file.c_str()); printf(" -nf, --negative-file negative prompts file, one prompt per line\n"); - printf(" default: 'examples/control-vector-generator/negative.txt'\n"); + printf(" default: %s\n", defaults.negative_prompts_file.c_str()); printf(" -cf, --completions-file completions file\n"); - printf(" default: 'examples/control-vector-generator/completions.txt'\n"); + printf(" default: %s\n", defaults.completions_file.c_str()); printf(" -nc, --num-completions N number of lines of completions file to use\n"); - printf(" default: 64\n"); - printf(" --batch-pca N batch size used for PCA\n"); - printf(" default: 5\n"); + printf(" default: use all lines\n"); + printf(" --batch-pca N batch size used for PCA. 
Larger batch runs faster, but uses more memory\n"); + printf(" default: %d\n", defaults.n_pca_batch); printf(" --iter-pca N number of iterations used for PCA\n"); - printf(" default: 1000\n"); + printf(" default: %d\n", defaults.n_pca_iterations); printf("\n"); printf("gpt-opts:\n"); printf(" other options from main\n"); diff --git a/examples/control-vector-generator/pca.hpp b/examples/control-vector-generator/pca.hpp index e190e8815d6cd..9b1e7e4e6d05b 100644 --- a/examples/control-vector-generator/pca.hpp +++ b/examples/control-vector-generator/pca.hpp @@ -35,7 +35,7 @@ namespace PCA { // input params for PCA computations struct pca_params { int n_threads = 1; - int n_batch = 5; // number of iterations do to in one batch. larger the batch, more memory is used + int n_batch = 20; // number of iterations do to in one batch. larger the batch, more memory is used int n_iterations = 1000; float tolerance = 1e-7; From da6babdf0a0c0471bcc7c1d69753765db4c3a7d3 Mon Sep 17 00:00:00 2001 From: ngxson Date: Tue, 11 Jun 2024 15:47:35 +0200 Subject: [PATCH 37/56] fix macos build --- examples/control-vector-generator/pca.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/control-vector-generator/pca.hpp b/examples/control-vector-generator/pca.hpp index 9b1e7e4e6d05b..a227c17297829 100644 --- a/examples/control-vector-generator/pca.hpp +++ b/examples/control-vector-generator/pca.hpp @@ -21,7 +21,7 @@ #define DEBUG_POS 5 static void print_debug_tensor(struct ggml_tensor * t, bool with_data = true) { - printf("%s: %s (%s): [%ld, %ld]\n", __func__, t->name, ggml_type_name(t->type), t->ne[0], t->ne[1]); + printf("%s: %s (%s): [%ld, %ld]\n", __func__, t->name, ggml_type_name(t->type), (size_t) t->ne[0], (size_t) t->ne[1]); if (!with_data) return; printf("%s: %s[0] = [", __func__, t->name); for (size_t i = 0; i <= DEBUG_POS; i++) { @@ -73,7 +73,6 @@ struct pca_model { #ifdef GGML_USE_METAL fprintf(stderr, "%s: using Metal backend\n", __func__); - ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr); backend = ggml_backend_metal_init(); if (!backend) { fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); @@ -185,7 +184,7 @@ static struct ggml_cgraph * build_graph_piter( ggml_set_name(b_tensor, ("b_tensor_norm_" + std::to_string(i)).c_str()); // calculate distance(new eigenvector - old eigenvector) - struct ggml_tensor * new_sub_old = ggml_sub(ctx0, old_eigen, b_tensor); + struct ggml_tensor * new_sub_old = ggml_add(ctx0, old_eigen, ggml_scale(ctx0, b_tensor, -1)); distance = ggml_sqrt_inplace(ctx0, ggml_sum_rows(ctx0, ggml_sqr_inplace(ctx0, new_sub_old))); ggml_set_name(distance, ("distance_" + std::to_string(i)).c_str()); From 54f77e2467ca1a367f828efb7322f75c4702f7b4 Mon Sep 17 00:00:00 2001 From: ngxson Date: Tue, 11 Jun 2024 19:03:13 +0200 Subject: [PATCH 38/56] add to makefile all targets --- Makefile | 3 ++- .../control-vector-generator.cpp | 11 ----------- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/Makefile b/Makefile index 82d1b1f72ae8c..ef1f60b56a4a6 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,8 @@ BUILD_TARGETS = \ main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \ - retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o + retrieval 
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm \ + tests/test-c.o control-vector-generator # Binaries only useful for tests TEST_TARGETS = \ diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index f713fb3d7ef81..60d0c87b5f8a5 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -454,17 +454,6 @@ static std::vector ctrlvec_load_prompt_file(std::string path, bool static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { auto * cb_data = (callback_data *) user_data; - /*auto ggml_ne_string = [](const ggml_tensor * t) -> std::string { - std::string str; - for (int i = 0; i < GGML_MAX_DIMS; ++i) { - str += std::to_string(t->ne[i]); - if (i + 1 < GGML_MAX_DIMS) { - str += ", "; - } - } - return str; - };*/ - static const char * l_out_name = "l_out"; const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0; From 04c91d29ff430bfbbeac9ba0783669778660d3f6 Mon Sep 17 00:00:00 2001 From: ngxson Date: Tue, 11 Jun 2024 19:14:04 +0200 Subject: [PATCH 39/56] use ggml_format_name --- .../control-vector-generator.cpp | 11 ++++++++--- examples/control-vector-generator/pca.hpp | 13 ++++++------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index 60d0c87b5f8a5..486ade1e18364 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -127,7 +127,7 @@ struct callback_data { // diff_filtered: [n_embd, n_nonzero_rows] struct ggml_tensor * diff_filtered = ggml_new_tensor_2d( ctx_ggml, GGML_TYPE_F32, n_embd, n_nonzero_rows); - ggml_set_name(diff_filtered, (std::string("diff_filtered_") + a->name).c_str()); + ggml_format_name(diff_filtered, "diff_filtered_%s", a->name); diff_filtered->data = malloc(ggml_nbytes(diff_filtered)); // copy non-zero rows @@ -245,7 +245,7 @@ struct train_context { struct ctrl_params { /* default meta parameters */ - int n_completions = INT_MAX; + int n_completions = 64; int n_pca_batch = 20; int n_pca_iterations = 1000; @@ -311,7 +311,7 @@ static void print_usage(const char * executable) { printf(" -cf, --completions-file completions file\n"); printf(" default: %s\n", defaults.completions_file.c_str()); printf(" -nc, --num-completions N number of lines of completions file to use\n"); - printf(" default: use all lines\n"); + printf(" default: %d\n", defaults.n_completions); printf(" --batch-pca N batch size used for PCA. 
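Patch 39 swaps the `std::to_string` concatenation for `ggml_format_name`. A small, self-contained illustration of the difference (the buffer size and tensor shape here are arbitrary):

```cpp
#include "ggml.h"
#include <cstdio>

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    for (int i = 0; i < 3; ++i) {
        struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        // printf-style formatting straight into the tensor's name field,
        // replacing ggml_set_name(t, ("b_tensor_norm_" + std::to_string(i)).c_str())
        ggml_format_name(t, "b_tensor_norm_%d", i);
        printf("%s\n", t->name);
    }

    ggml_free(ctx);
    return 0;
}
```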
Larger batch runs faster, but uses more memory\n"); printf(" default: %d\n", defaults.n_pca_batch); printf(" --iter-pca N number of iterations used for PCA\n"); @@ -550,6 +550,11 @@ int main(int argc, char ** argv) { return 1; } + if (cparams.n_pca_iterations % cparams.n_pca_batch != 0) { + fprintf(stderr, "PCA iterations must by multiply of PCA batch size\n"); + return 1; + } + // load and prepare entries for training prepare_entries(cparams); diff --git a/examples/control-vector-generator/pca.hpp b/examples/control-vector-generator/pca.hpp index a227c17297829..8e13a53ca2d9f 100644 --- a/examples/control-vector-generator/pca.hpp +++ b/examples/control-vector-generator/pca.hpp @@ -181,13 +181,14 @@ static struct ggml_cgraph * build_graph_piter( b_tensor, ggml_sqrt_inplace(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, b_tensor))) ); - ggml_set_name(b_tensor, ("b_tensor_norm_" + std::to_string(i)).c_str()); + ggml_format_name(b_tensor, "b_tensor_norm_%d", i); // calculate distance(new eigenvector - old eigenvector) + // we don't use ggml_sub because it may not be implemented on GPU backend struct ggml_tensor * new_sub_old = ggml_add(ctx0, old_eigen, ggml_scale(ctx0, b_tensor, -1)); distance = ggml_sqrt_inplace(ctx0, ggml_sum_rows(ctx0, ggml_sqr_inplace(ctx0, new_sub_old))); - ggml_set_name(distance, ("distance_" + std::to_string(i)).c_str()); + ggml_format_name(distance, "distance_%d", i); old_eigen = b_tensor; @@ -317,22 +318,20 @@ static void run_pca( struct pca_params & params, const std::vector & v_input, // shape of v_input[0]: [n_samples, n_embd] const std::vector & v_output) { - printf("Running PCA...\n"); + printf("%s: Running PCA...\n", __func__); for (size_t il = 0; il < v_input.size(); ++il) { // prepare output vector struct ggml_tensor * ctrl_out = v_output[il]; - auto name = std::string("direction.") + std::to_string(il + 1); - ggml_set_name(ctrl_out, name.c_str()); + ggml_format_name(ctrl_out, "direction.%ld", il+1); // run power_iteration params.i_layer = il; params.n_layers = v_input.size(); power_iteration(params, v_input[il], ctrl_out); - printf("DONE layer %ld / %ld\n", il+1, v_input.size()); + printf("%s: Done layer %ld / %ld\n", __func__, il+1, v_input.size()); //print_debug_tensor(ctrl_out); } - printf("Done with PCA.\n"); } } From 5ffba9ecc366acee334d984e54591a6e83e63db7 Mon Sep 17 00:00:00 2001 From: ngxson Date: Tue, 11 Jun 2024 19:35:17 +0200 Subject: [PATCH 40/56] add readme --- examples/control-vector-generator/README.md | 26 ++++++++++++++ .../control-vector-generator.cpp | 34 ++++++++++--------- 2 files changed, 44 insertions(+), 16 deletions(-) create mode 100644 examples/control-vector-generator/README.md diff --git a/examples/control-vector-generator/README.md b/examples/control-vector-generator/README.md new file mode 100644 index 0000000000000..b064607789a91 --- /dev/null +++ b/examples/control-vector-generator/README.md @@ -0,0 +1,26 @@ +# control-vector-generator + +This example demonstrates how to generate a control vector using gguf models. 
+ +Related PRs: +- [Add support for control vectors](https://github.com/ggerganov/llama.cpp/pull/5970) +- (Issue) [Generate control vector using llama.cpp](https://github.com/ggerganov/llama.cpp/issues/6880) +- [Add control-vector-generator](https://github.com/ggerganov/llama.cpp/pull/7514) + +Example: + +```sh +# CPU only +./control-vector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf + +# With GPU +./control-vector-generator --num-completions 2 --pca-iter 40 -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 + +# With advanced options +# Please note that the ORDER of arguments does matter +# example-related options (i.e., --num-completions, --pca-iter) always come before model options (i.e., -m, -ngl) +./control-vector-generator --num-completions 128 --pca-iter 2000 --batch-pca 100 -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 + +# To see help message +./control-vector-generator -h +``` diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index 486ade1e18364..69cde8f045761 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -301,24 +301,26 @@ static void print_usage(const char * executable) { printf("Creates a GGUF control vector for a given model."); printf("\n"); printf("options:\n"); - printf(" -h, --help show this help message and exit\n"); - printf(" -o, --outfile output file\n"); - printf(" default: %s\n", defaults.outfile.c_str()); - printf(" -pf, --positive-file positive prompts file, one prompt per line\n"); - printf(" default: %s\n", defaults.positive_prompts_file.c_str()); - printf(" -nf, --negative-file negative prompts file, one prompt per line\n"); - printf(" default: %s\n", defaults.negative_prompts_file.c_str()); - printf(" -cf, --completions-file completions file\n"); - printf(" default: %s\n", defaults.completions_file.c_str()); - printf(" -nc, --num-completions N number of lines of completions file to use\n"); - printf(" default: %d\n", defaults.n_completions); - printf(" --batch-pca N batch size used for PCA. Larger batch runs faster, but uses more memory\n"); - printf(" default: %d\n", defaults.n_pca_batch); - printf(" --iter-pca N number of iterations used for PCA\n"); - printf(" default: %d\n", defaults.n_pca_iterations); + printf(" -h, --help show this help message and exit\n"); + printf(" -o, --outfile FNAME output file\n"); + printf(" default: %s\n", defaults.outfile.c_str()); + printf(" -pf, --positive-file FNAME positive prompts file, one prompt per line\n"); + printf(" default: %s\n", defaults.positive_prompts_file.c_str()); + printf(" -nf, --negative-file FNAME negative prompts file, one prompt per line\n"); + printf(" default: %s\n", defaults.negative_prompts_file.c_str()); + printf(" -cf, --completions-file completions file\n"); + printf(" default: %s\n", defaults.completions_file.c_str()); + printf(" -nc, --num-completions N number of lines of completions file to use\n"); + printf(" default: %d\n", defaults.n_completions); + printf(" --batch-pca N batch size used for PCA. 
Larger batch runs faster, but uses more memory\n"); + printf(" default: %d\n", defaults.n_pca_batch); + printf(" --iter-pca N number of iterations used for PCA\n"); + printf(" default: %d\n", defaults.n_pca_iterations); printf("\n"); printf("gpt-opts:\n"); - printf(" other options from main\n"); + printf(" -m, --model FNAME path to model file\n"); + printf(" -ngl, --gpu-layers N number of layers to offload to GPU\n"); + printf(" ...other options from main\n"); printf("\n"); } From e9cb3b336d155cf900f8860cff3b8d5974c83a4a Mon Sep 17 00:00:00 2001 From: ngxson Date: Tue, 11 Jun 2024 22:09:14 +0200 Subject: [PATCH 41/56] fix .editorconfig --- .editorconfig | 3 +++ .../control-vector-generator/control-vector-generator.cpp | 8 ++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/.editorconfig b/.editorconfig index 16d16b3b55bf5..ee6f60e798473 100644 --- a/.editorconfig +++ b/.editorconfig @@ -26,3 +26,6 @@ indent_size = 2 [examples/llama.swiftui/llama.swiftui.xcodeproj/*] indent_style = tab + +[examples/control-vector-generator/*.txt] +insert_final_newline = unset diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index 69cde8f045761..3ca2c530520f6 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -144,7 +144,7 @@ struct callback_data { return diff_filtered; } - // we don't implement destructor, because we want to reuse callback_data. we just want to free the tensors + // we don't implement destructor, because we want to reuse callback_data. we just want to free the tensors void reset() { for (auto ptr : v_pos) free(ptr->data); for (auto ptr : v_neg) free(ptr->data); @@ -430,7 +430,7 @@ static int ctrlvec_params_parse(int argc, char ** argv, ctrl_params & params) { catch (const std::invalid_argument & ex) { fprintf(stderr, "%s\n", ex.what()); print_usage(argv[0]); - exit(EXIT_FAILURE); + exit(EXIT_FAILURE); } return skipme; } @@ -495,7 +495,7 @@ static void export_gguf(const std::vector & v_ctrl, const printf("Added tensor: %s\n", v_ctrl[i]->name); } - printf("Writing file...\n"); + printf("Writing file...\n"); gguf_write_to_file(ctx, fname.c_str(), false); @@ -603,7 +603,7 @@ int main(int argc, char ** argv) { cb_data.n_layers = n_layers; cb_data.n_tokens = t.max_seq_len; - printf("Evaluating prompt[%ld/%ld]: \"%s\" - \"%s\" (%ld tokens)\n", + printf("Evaluating prompt[%ld/%ld]: \"%s\" - \"%s\" (%ld tokens)\n", i+1, cparams.positive_entries.size(), tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(), tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(), From 7297817d13916de0e78e83fa5617dce2f183d7df Mon Sep 17 00:00:00 2001 From: ngxson Date: Wed, 12 Jun 2024 11:41:37 +0200 Subject: [PATCH 42/56] use ggml_backend_tensor_copy --- examples/control-vector-generator/pca.hpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/examples/control-vector-generator/pca.hpp b/examples/control-vector-generator/pca.hpp index 8e13a53ca2d9f..86863d1a52b75 100644 --- a/examples/control-vector-generator/pca.hpp +++ b/examples/control-vector-generator/pca.hpp @@ -290,17 +290,13 @@ static void power_iteration( if (calc_square) { // copy and store the square matrix if needed GGML_ASSERT(result.calculated_square != NULL); - std::vector tmp_buf(ggml_nbytes(model.dev_square)); - ggml_backend_tensor_get(result.calculated_square, tmp_buf.data(), 0, 
tmp_buf.size()); - ggml_backend_tensor_set(model.dev_square, tmp_buf.data(), 0, tmp_buf.size()); + ggml_backend_tensor_copy(result.calculated_square, model.dev_square); } { // copy last eigen vector and store as input for next iteration GGML_ASSERT(last_eigenvector != NULL); - std::vector tmp_buf(ggml_nbytes(last_eigenvector)); - ggml_backend_tensor_get(last_eigenvector, tmp_buf.data(), 0, tmp_buf.size()); - ggml_backend_tensor_set(model.dev_eigenvector, tmp_buf.data(), 0, tmp_buf.size()); + ggml_backend_tensor_copy(last_eigenvector, model.dev_eigenvector); } printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n", From e683b9af608c692b904681019f122458b28859a8 Mon Sep 17 00:00:00 2001 From: ngxson Date: Wed, 12 Jun 2024 12:49:01 +0200 Subject: [PATCH 43/56] attempt to fix compile problem on mac --- Makefile | 4 ++-- .../control-vector-generator/control-vector-generator.cpp | 6 +++--- examples/control-vector-generator/pca.hpp | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index ef1f60b56a4a6..80537c766610f 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ BUILD_TARGETS = \ main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \ retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm \ - tests/test-c.o control-vector-generator + control-vector-generator tests/test-c.o # Binaries only useful for tests TEST_TARGETS = \ @@ -874,7 +874,7 @@ eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -control-vector-generator: examples/control-vector-generator/control-vector-generator.cpp examples/control-vector-generator/pca.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +control-vector-generator: examples/control-vector-generator/control-vector-generator.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index 3ca2c530520f6..477592bfd5204 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -603,11 +603,11 @@ int main(int argc, char ** argv) { cb_data.n_layers = n_layers; cb_data.n_tokens = t.max_seq_len; - printf("Evaluating prompt[%ld/%ld]: \"%s\" - \"%s\" (%ld tokens)\n", - i+1, cparams.positive_entries.size(), + printf("Evaluating prompt[%d/%d]: \"%s\" - \"%s\" (%d tokens)\n", + (int) i+1, (int) cparams.positive_entries.size(), tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(), tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(), - t.max_seq_len); + (int) t.max_seq_len); diff --git a/examples/control-vector-generator/pca.hpp b/examples/control-vector-generator/pca.hpp index 86863d1a52b75..00f2ca52cf722 100644 --- a/examples/control-vector-generator/pca.hpp +++ b/examples/control-vector-generator/pca.hpp @@ -21,7 +21,7 @@ #define DEBUG_POS 5 static void
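The device-to-device copy introduced in patch 42, shown side by side as an annotated fragment (this assumes `src` and `dst` are identically shaped tensors already allocated on a backend, as they are in `power_iteration`):

```cpp
// before: round-trip through host memory
std::vector<uint8_t> tmp_buf(ggml_nbytes(src));
ggml_backend_tensor_get(src, tmp_buf.data(), 0, tmp_buf.size());
ggml_backend_tensor_set(dst, tmp_buf.data(), 0, tmp_buf.size());

// after: a single call that can keep the data on the device
ggml_backend_tensor_copy(src, dst);
```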
print_debug_tensor(struct ggml_tensor * t, bool with_data = true) { - printf("%s: %s (%s): [%ld, %ld]\n", __func__, t->name, ggml_type_name(t->type), (size_t) t->ne[0], (size_t) t->ne[1]); + printf("%s: %s (%s): [%d, %d]\n", __func__, t->name, ggml_type_name(t->type), (int) t->ne[0], (int) t->ne[1]); if (!with_data) return; printf("%s: %s[0] = [", __func__, t->name); for (size_t i = 0; i <= DEBUG_POS; i++) { @@ -325,7 +325,7 @@ static void run_pca( params.i_layer = il; params.n_layers = v_input.size(); power_iteration(params, v_input[il], ctrl_out); - printf("%s: Done layer %ld / %ld\n", __func__, il+1, v_input.size()); + printf("%s: Done layer %d / %d\n", __func__, (int) il+1, v_input.size()); //print_debug_tensor(ctrl_out); } } From 8ee0c96688493f8db2c4237e354721e91c78ec94 Mon Sep 17 00:00:00 2001 From: ngxson Date: Wed, 12 Jun 2024 12:50:29 +0200 Subject: [PATCH 44/56] fix compile warn --- examples/control-vector-generator/pca.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/control-vector-generator/pca.hpp b/examples/control-vector-generator/pca.hpp index 00f2ca52cf722..3732eaf1a5d4c 100644 --- a/examples/control-vector-generator/pca.hpp +++ b/examples/control-vector-generator/pca.hpp @@ -325,7 +325,7 @@ static void run_pca( params.i_layer = il; params.n_layers = v_input.size(); power_iteration(params, v_input[il], ctrl_out); - printf("%s: Done layer %d / %d\n", __func__, (int) il+1, v_input.size()); + printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size()); //print_debug_tensor(ctrl_out); } } From f54cb8e3075832dcb0edfa4fcd7a2e01522bf0dd Mon Sep 17 00:00:00 2001 From: ngxson Date: Wed, 12 Jun 2024 12:53:17 +0200 Subject: [PATCH 45/56] reuse allocr --- examples/control-vector-generator/pca.hpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/examples/control-vector-generator/pca.hpp b/examples/control-vector-generator/pca.hpp index 3732eaf1a5d4c..6bc68185a2009 100644 --- a/examples/control-vector-generator/pca.hpp +++ b/examples/control-vector-generator/pca.hpp @@ -265,17 +265,13 @@ static void power_iteration( //printf("in power iteration\n"); struct pca_model model(input); - ggml_gallocr_t allocr = NULL; + ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); struct pca_result result; struct ggml_tensor * last_eigenvector = NULL; int n_iters = params.n_iterations / params.n_batch; // more batch, fewer iterations for (int iter = 0; iter < n_iters; ++iter) { bool calc_square = (iter == 0); // only need to calculate square for first iteration - if (allocr) { - ggml_gallocr_free(allocr); - } - allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); struct ggml_cgraph * gf = build_graph_piter(params, model, calc_square); // ggml_graph_dump_dot(gf, nullptr, "/tmp/_cgraph.dot"); compute_piter(params, model, gf, allocr, result); From 679f5137f88ebfbc1aa2e8160cfe0c9633982e21 Mon Sep 17 00:00:00 2001 From: ngxson Date: Wed, 12 Jun 2024 15:58:20 +0200 Subject: [PATCH 46/56] move param parser to common --- common/common.cpp | 59 +++++ common/common.h | 9 + examples/control-vector-generator/README.md | 9 +- .../control-vector-generator.cpp | 212 +++--------------- 4 files changed, 99 insertions(+), 190 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 1591790e6df4c..7dfd55dc0ce98 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1576,6 +1576,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa 
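Since patches 43 through 45 all touch `power_iteration`, a plain-C++ sketch of the underlying algorithm may help; pca.hpp runs the same loop as a ggml graph so that it can execute on the GPU:

```cpp
#include <cmath>
#include <vector>

// repeatedly multiply a unit vector by A (here, the covariance/"square"
// matrix) and renormalize; v converges to the dominant eigenvector,
// which is the principal component used as the control direction
std::vector<float> power_iteration_ref(
        const std::vector<std::vector<float>> & A, // [n, n]
        std::vector<float> v,                      // [n], initially random
        int n_iterations) {
    const size_t n = v.size();
    for (int it = 0; it < n_iterations; ++it) {
        std::vector<float> w(n, 0.0f);
        for (size_t r = 0; r < n; ++r) {
            for (size_t c = 0; c < n; ++c) {
                w[r] += A[r][c] * v[c];
            }
        }
        float norm = 0.0f;
        for (float x : w) norm += x * x;
        norm = std::sqrt(norm);
        for (float & x : w) x /= norm;
        v = w;
    }
    return v;
}
```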
return true; } params.out_file = argv[i]; + params.cvector_outfile = argv[i]; return true; } if (arg == "-ofreq" || arg == "--output-frequency") { @@ -1610,6 +1611,55 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.i_chunk = std::stoi(argv[i]); return true; } + // control-vector-generator params + if (arg == "--completions-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.cvector_completions_file = argv[i]; + return true; + } + if (arg == "--positive-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.cvector_positive_file = argv[i]; + return true; + } + if (arg == "--negative-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.cvector_negative_file = argv[i]; + return true; + } + if (arg == "--num-completions") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_completions = std::stoi(argv[i]); + return true; + } + if (arg == "--pca-batch") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_pca_batch = std::stoi(argv[i]); + return true; + } + if (arg == "--pca-iter") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_pca_iterations = std::stoi(argv[i]); + return true; + } #ifndef LOG_DISABLE_LOGS // Parse args for logging parameters if (log_param_single_parse(argv[i])) { @@ -1931,6 +1981,15 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "logging", " --log-append", "Don't truncate the old log file." }); #endif // LOG_DISABLE_LOGS + options.push_back({ "control-vector-generator" }); + options.push_back({ "control-vector-generator", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() }); + options.push_back({ "control-vector-generator", "--positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() }); + options.push_back({ "control-vector-generator", "--negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() }); + options.push_back({ "control-vector-generator", "--completions-file", "completions file (default: '%s')", params.cvector_completions_file.c_str() }); + options.push_back({ "control-vector-generator", "--num-completions N", "number of lines of completions file to use (default: %d)", params.n_completions }); + options.push_back({ "control-vector-generator", "--batch-pca N", "batch size used for PCA. 
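All six handlers added to `gpt_params_find_arg` above follow one shape; here is that shape again with editorial comments (the comments are not in the source):

```cpp
if (arg == "--pca-batch") {
    if (++i >= argc) {          // flag given without a value
        invalid_param = true;   // caller prints the error and usage
        return true;            // the flag itself was still recognized
    }
    params.n_pca_batch = std::stoi(argv[i]);
    return true;                // consumed: don't treat it as unknown
}
```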
Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch }); + options.push_back({ "control-vector-generator", "--iter-pca N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations }); + printf("usage: %s [options]\n", argv[0]); for (const auto & o : options) { diff --git a/common/common.h b/common/common.h index 2345d855eed3c..fa44859c6a63f 100644 --- a/common/common.h +++ b/common/common.h @@ -232,6 +232,15 @@ struct gpt_params { bool process_output = false; // collect data for the output tensor bool compute_ppl = true; // whether to compute perplexity + + // control-vector-generator params + int n_completions = 64; + int n_pca_batch = 20; + int n_pca_iterations = 1000; + std::string cvector_outfile = "control_vector.gguf"; + std::string cvector_completions_file = "examples/control-vector-generator/completions.txt"; + std::string cvector_positive_file = "examples/control-vector-generator/positive.txt"; + std::string cvector_negative_file = "examples/control-vector-generator/negative.txt"; }; void gpt_params_handle_model_default(gpt_params & params); diff --git a/examples/control-vector-generator/README.md b/examples/control-vector-generator/README.md index b064607789a91..1ccb05d7840cf 100644 --- a/examples/control-vector-generator/README.md +++ b/examples/control-vector-generator/README.md @@ -5,7 +5,7 @@ This example demonstrates how to generate a control vector using gguf models. Related PRs: - [Add support for control vectors](https://github.com/ggerganov/llama.cpp/pull/5970) - (Issue) [Generate control vector using llama.cpp](https://github.com/ggerganov/llama.cpp/issues/6880) -- [Add control-vector-generator](https://github.com/ggerganov/llama.cpp/pull/7514) +- [Add control-vector-generator example](https://github.com/ggerganov/llama.cpp/pull/7514) Example: @@ -14,13 +14,12 @@ Example: ./control-vector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf # With GPU -./control-vector-generator --num-completions 2 --pca-iter 40 -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 +./control-vector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 # With advanced options -# Please note that the ORDER of arguments does matter -# example-related options (i.e., --num-completions, --pca-iter) always come before model options (i.e., -m, -ngl) -./control-vector-generator --num-completions 128 --pca-iter 2000 --batch-pca 100 -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 +./control-vector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --num-completions 128 --pca-iter 2000 --batch-pca 100 # To see help message ./control-vector-generator -h +# Then, have a look at "control-vector-generator" section ``` diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index 477592bfd5204..0e4bb8a029a66 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -168,6 +168,10 @@ struct train_context { int n_embd; int n_layers; + /* pair of prompts to be used for generating final vector */ + std::vector positive_entries; + std::vector negative_entries; + // each element of the vector correspond to one layer // NOTE: the last layer is discard. 
therefore, we will have (n_layers - 1) elements here // NOTE (2): v_diff is transposed from v_diff_tmp @@ -243,23 +247,6 @@ struct train_context { } }; -struct ctrl_params { - /* default meta parameters */ - int n_completions = 64; - int n_pca_batch = 20; - int n_pca_iterations = 1000; - - /* default filepaths */ - std::string outfile = "control_vector.gguf"; - std::string completions_file = "examples/control-vector-generator/completions.txt"; - std::string positive_prompts_file = "examples/control-vector-generator/positive.txt"; - std::string negative_prompts_file = "examples/control-vector-generator/negative.txt"; - - /* pair of prompts to be used for generating final vector */ - std::vector positive_entries; - std::vector negative_entries; -}; - struct tokenized_prompt { std::vector tokens_pos; std::vector tokens_neg; @@ -293,148 +280,6 @@ static std::string to_string(const T & val) { return ss.str(); } -static void print_usage(const char * executable) { - struct ctrl_params defaults; - printf("\n"); - printf("usage: %s [options] -m [gpt-opts]", executable); - printf("\n"); - printf("Creates a GGUF control vector for a given model."); - printf("\n"); - printf("options:\n"); - printf(" -h, --help show this help message and exit\n"); - printf(" -o, --outfile FNAME output file\n"); - printf(" default: %s\n", defaults.outfile.c_str()); - printf(" -pf, --positive-file FNAME positive prompts file, one prompt per line\n"); - printf(" default: %s\n", defaults.positive_prompts_file.c_str()); - printf(" -nf, --negative-file FNAME negative prompts file, one prompt per line\n"); - printf(" default: %s\n", defaults.negative_prompts_file.c_str()); - printf(" -cf, --completions-file completions file\n"); - printf(" default: %s\n", defaults.completions_file.c_str()); - printf(" -nc, --num-completions N number of lines of completions file to use\n"); - printf(" default: %d\n", defaults.n_completions); - printf(" --batch-pca N batch size used for PCA. 
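`tokenized_prompt` (kept above while the old parser is deleted) pads the shorter of the two token sequences so that the positive and negative prompts decode with one shared sequence length. A standalone sketch of that invariant, with a placeholder pad token standing in for the one the example derives from the model's tokenizer:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// pad `tokens` up to `len`; pad_tok is a stand-in for a real token id
static void pad_to(std::vector<int> & tokens, size_t len, int pad_tok) {
    while (tokens.size() < len) {
        tokens.push_back(pad_tok);
    }
}

int main() {
    std::vector<int> tokens_pos = {1, 2, 3, 4};
    std::vector<int> tokens_neg = {1, 2};
    size_t max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
    pad_to(tokens_pos, max_seq_len, 0);
    pad_to(tokens_neg, max_seq_len, 0);
    printf("both padded to %zu tokens\n", max_seq_len);
    return 0;
}
```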
Larger batch runs faster, but uses more memory\n"); - printf(" default: %d\n", defaults.n_pca_batch); - printf(" --iter-pca N number of iterations used for PCA\n"); - printf(" default: %d\n", defaults.n_pca_iterations); - printf("\n"); - printf("gpt-opts:\n"); - printf(" -m, --model FNAME path to model file\n"); - printf(" -ngl, --gpu-layers N number of layers to offload to GPU\n"); - printf(" ...other options from main\n"); - printf("\n"); -} - -static int ctrlvec_params_parse_ex(int argc, char ** argv, ctrl_params & params) { - std::string arg; - const std::string arg_prefix = "-"; - // hack to skip ctrlvec args in gpt_parse_params but we'll leave it as is - int skipme = 0; - - for(int arg_idx = 1; arg_idx < argc; ++arg_idx) { - arg = argv[arg_idx]; - if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { - std::replace(arg.begin(), arg.end(), '_', '-'); - } - - if (arg == "-h" || arg == "--help") { - print_usage(argv[0]); - exit(0); - } - if (arg == "--version") { - fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); - fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); - exit(0); - } - if (arg == "--outfile" || arg == "-o") { - if (++arg_idx < argc && strncmp(argv[arg_idx], arg_prefix.c_str(), 2) != 0) { - params.outfile = argv[arg_idx]; - skipme += 2; - } else { - throw std::invalid_argument("error: missing argument for " + arg); - } - } - if (arg == "--completions-file" || arg == "-cf") { - if (++arg_idx < argc && strncmp(argv[arg_idx], arg_prefix.c_str(), 2) != 0) { - params.completions_file = argv[arg_idx]; - skipme += 2; - } else { - throw std::invalid_argument("error: missing argument for " + arg); - } - } - if (arg == "--positive-file" || arg == "-pf") { - if (++arg_idx < argc && strncmp(argv[arg_idx], arg_prefix.c_str(), 2) != 0) { - params.positive_prompts_file = argv[arg_idx]; - skipme += 2; - } else { - throw std::invalid_argument("error: missing argument for " + arg); - } - } - if (arg == "--negative-file" || arg == "-nf") { - if (++arg_idx < argc && strncmp(argv[arg_idx], arg_prefix.c_str(), 2) != 0) { - params.negative_prompts_file = argv[arg_idx]; - skipme += 2; - } else { - throw std::invalid_argument("error: missing argument for " + arg); - } - } - if (arg == "--num-completions" || arg == "-nc") { - if (++arg_idx < argc && strncmp(argv[arg_idx], arg_prefix.c_str(), 2) != 0) { - try { - params.n_completions = std::stoi(argv[arg_idx]); - } - catch (const std::invalid_argument & ex) { - throw std::invalid_argument("error: invalid argument for " + arg); - } - skipme += 2; - } else { - throw std::invalid_argument("error: missing argument for " + arg); - } - } - if (arg == "--pca-batch") { - if (++arg_idx < argc && strncmp(argv[arg_idx], arg_prefix.c_str(), 2) != 0) { - try { - params.n_pca_batch = std::stoi(argv[arg_idx]); - } - catch (const std::invalid_argument & ex) { - throw std::invalid_argument("error: invalid argument for " + arg); - } - skipme += 2; - } else { - throw std::invalid_argument("error: missing argument for " + arg); - } - } - if (arg == "--pca-iter") { - if (++arg_idx < argc && strncmp(argv[arg_idx], arg_prefix.c_str(), 2) != 0) { - try { - params.n_pca_iterations = std::stoi(argv[arg_idx]); - } - catch (const std::invalid_argument & ex) { - throw std::invalid_argument("error: invalid argument for " + arg); - } - skipme += 2; - } else { - throw std::invalid_argument("error: missing argument for " + arg); - } - } - // TODO it might be nice QoL to have single positive/negative args - // we do not handle 
any other unknown arguments here because they will be handled by gpt_parse_params - } - return skipme; -} - -static int ctrlvec_params_parse(int argc, char ** argv, ctrl_params & params) { - int skipme = 0; - try { - skipme = ctrlvec_params_parse_ex(argc, argv, params); - } - catch (const std::invalid_argument & ex) { - fprintf(stderr, "%s\n", ex.what()); - print_usage(argv[0]); - exit(EXIT_FAILURE); - } - return skipme; -} - static std::vector ctrlvec_load_prompt_file(std::string path, bool skip_empty_lines = false) { std::vector output; std::ifstream file(path); @@ -508,10 +353,10 @@ static void export_gguf(const std::vector & v_ctrl, const * Load prompt files and completion file. * Then format each pair of prompt + completion to make an entry. */ -static int prepare_entries(ctrl_params & cparams) { +static int prepare_entries(gpt_params & params, train_context & ctx_train) { // load prompts - std::vector positive_prompts = ctrlvec_load_prompt_file(cparams.positive_prompts_file); - std::vector negative_prompts = ctrlvec_load_prompt_file(cparams.negative_prompts_file); + std::vector positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file); + std::vector negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file); if (positive_prompts.size() != negative_prompts.size()) { fprintf(stderr, "number of positive and negative prompts must be equal\n"); return 1; @@ -522,7 +367,7 @@ static int prepare_entries(ctrl_params & cparams) { } // create templated prompts - std::vector completions = ctrlvec_load_prompt_file(cparams.completions_file, false); + std::vector completions = ctrlvec_load_prompt_file(params.cvector_completions_file, false); auto format_template = [](std::string persona, std::string suffix) { //const std::string user_tag = "[INST]"; //const std::string asst_tag = "[/INST]"; @@ -531,34 +376,28 @@ static int prepare_entries(ctrl_params & cparams) { return persona + " " + suffix; // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. 
[/INST]" }; for (size_t i = 0; i < positive_prompts.size(); ++i) { - for (int j = 0; j < std::min((int) completions.size(), cparams.n_completions); ++j) { + for (int j = 0; j < std::min((int) completions.size(), params.n_completions); ++j) { // TODO replicate the truncations done by the python implementation - cparams.positive_entries.push_back(format_template(positive_prompts[i], completions[j])); - cparams.negative_entries.push_back(format_template(negative_prompts[i], completions[j])); + ctx_train.positive_entries.push_back(format_template(positive_prompts[i], completions[j])); + ctx_train.negative_entries.push_back(format_template(negative_prompts[i], completions[j])); } } return 0; } int main(int argc, char ** argv) { - ctrl_params cparams; - - int skipme = ctrlvec_params_parse(argc, argv, cparams); - argc -= skipme; - argv += skipme; - gpt_params params; + if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } - if (cparams.n_pca_iterations % cparams.n_pca_batch != 0) { + if (params.n_pca_iterations % params.n_pca_batch != 0) { fprintf(stderr, "PCA iterations must by multiply of PCA batch size\n"); return 1; } - // load and prepare entries for training - prepare_entries(cparams); callback_data cb_data; @@ -584,27 +423,30 @@ int main(int argc, char ** argv) { char model_hint[128]; llama_model_meta_val_str(model, "general.architecture", model_hint, 128); + // init train_context + train_context ctx_train(n_embd, n_layers); + + // load and prepare entries for training + prepare_entries(params, ctx_train); + // we have to pretokenize everything because otherwise we don't know how much overhead to allocate ctx_diffs_wrapped std::vector tokenized_prompts; size_t n_total_tokens = 0; - for (size_t i = 0; i < cparams.positive_entries.size(); ++i) { - tokenized_prompt t(ctx, cparams.positive_entries[i], cparams.negative_entries[i]); + for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) { + tokenized_prompt t(ctx, ctx_train.positive_entries[i], ctx_train.negative_entries[i]); n_total_tokens += 2 * t.max_seq_len; tokenized_prompts.push_back(std::move(t)); } std::cout << "n_total_tokens: " << n_total_tokens << std::endl; - // init train_context - train_context ctx_train(n_embd, n_layers); - - for(size_t i = 0; i < cparams.positive_entries.size(); ++i) { + for(size_t i = 0; i < ctx_train.positive_entries.size(); ++i) { tokenized_prompt t = tokenized_prompts[i]; cb_data.n_layers = n_layers; cb_data.n_tokens = t.max_seq_len; printf("Evaluating prompt[%d/%d]: \"%s\" - \"%s\" (%d tokens)\n", - (int) i+1, (int) cparams.positive_entries.size(), + (int) i+1, (int) ctx_train.positive_entries.size(), tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(), tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(), (int) t.max_seq_len); @@ -635,12 +477,12 @@ int main(int argc, char ** argv) { // run PCA PCA::pca_params pca_params; pca_params.n_threads = params.n_threads; - pca_params.n_batch = cparams.n_pca_batch; - pca_params.n_iterations = cparams.n_pca_iterations; + pca_params.n_batch = params.n_pca_batch; + pca_params.n_iterations = params.n_pca_iterations; PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final); // write output vectors to gguf - export_gguf(ctx_train.v_final, cparams.outfile, model_hint); + export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint); llama_backend_free(); From a2a5f1bfbdf50ad174990d762da9e6efe3d6e437 Mon Sep 17 00:00:00 2001 From: ngxson Date: Wed, 12 Jun 2024 16:01:00 
+0200 Subject: [PATCH 47/56] better error handling --- .../control-vector-generator.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index 0e4bb8a029a66..920b542f1fa67 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -284,7 +284,8 @@ static std::vector ctrlvec_load_prompt_file(std::string path, bool std::vector output; std::ifstream file(path); if (!file.is_open()) { - throw std::runtime_error("Unable to open file " + path); + fprintf(stderr, "error: unable to open file: %s\n", path.c_str()); + exit(1); } std::string line; while (std::getline(file, line)) { @@ -441,6 +442,7 @@ int main(int argc, char ** argv) { std::cout << "n_total_tokens: " << n_total_tokens << std::endl; for(size_t i = 0; i < ctx_train.positive_entries.size(); ++i) { + bool success = false; tokenized_prompt t = tokenized_prompts[i]; cb_data.n_layers = n_layers; cb_data.n_tokens = t.max_seq_len; @@ -452,9 +454,12 @@ int main(int argc, char ** argv) { (int) t.max_seq_len); cb_data.is_eval_pos = true; - get_hidden_layers(ctx, t.tokens_pos); + success = get_hidden_layers(ctx, t.tokens_pos); + if (!success) break; + cb_data.is_eval_pos = false; - get_hidden_layers(ctx, t.tokens_neg); + success = get_hidden_layers(ctx, t.tokens_neg); + if (!success) break; // calculate diff and remove all zero rows auto v_diff_filtered = cb_data.calc_diff(); From b22c8459ffce33d20772cb3f9aec5a3d98de65fc Mon Sep 17 00:00:00 2001 From: ngxson Date: Wed, 12 Jun 2024 16:08:27 +0200 Subject: [PATCH 48/56] clean up a bit --- common/common.cpp | 14 +++++++------- .../control-vector-generator.cpp | 14 ++++---------- examples/control-vector-generator/pca.hpp | 10 ---------- 3 files changed, 11 insertions(+), 27 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 7dfd55dc0ce98..a7ef4565cdb12 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1982,13 +1982,13 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param #endif // LOG_DISABLE_LOGS options.push_back({ "control-vector-generator" }); - options.push_back({ "control-vector-generator", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() }); - options.push_back({ "control-vector-generator", "--positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() }); - options.push_back({ "control-vector-generator", "--negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() }); - options.push_back({ "control-vector-generator", "--completions-file", "completions file (default: '%s')", params.cvector_completions_file.c_str() }); - options.push_back({ "control-vector-generator", "--num-completions N", "number of lines of completions file to use (default: %d)", params.n_completions }); - options.push_back({ "control-vector-generator", "--batch-pca N", "batch size used for PCA. 
Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch }); - options.push_back({ "control-vector-generator", "--iter-pca N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations }); + options.push_back({ "control-vector-generator", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() }); + options.push_back({ "control-vector-generator", "--positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() }); + options.push_back({ "control-vector-generator", "--negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() }); + options.push_back({ "control-vector-generator", "--completions-file FNAME","completions file (default: '%s')", params.cvector_completions_file.c_str() }); + options.push_back({ "control-vector-generator", "--num-completions N", "number of lines of completions file to use (default: %d)", params.n_completions }); + options.push_back({ "control-vector-generator", "--batch-pca N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch }); + options.push_back({ "control-vector-generator", "--iter-pca N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations }); printf("usage: %s [options]\n", argv[0]); diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index 920b542f1fa67..96a0e7958c593 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -213,6 +213,7 @@ struct train_context { } // build the v_diff tensors from v_diff_tmp (v_diff need to be transposed) + // TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method void build_v_diff() { printf("build_v_diff\n"); for (int il = 0; il < n_layers - 1; il++) { @@ -228,7 +229,6 @@ struct train_context { for (int ir = 0; ir < n_rows; ++ir) { for (int ic = 0; ic < n_embd; ++ic) { float f = arr[ir*n_embd + ic]; - //std::cout << ir << "," << ic << " = " << f << "\n"; ggml_set_f32_nd(diff, ir, ic, 0, 0, f); } } @@ -341,12 +341,9 @@ static void export_gguf(const std::vector & v_ctrl, const printf("Added tensor: %s\n", v_ctrl[i]->name); } - printf("Writing file...\n"); - + printf("%s: writing file...\n", __func__); gguf_write_to_file(ctx, fname.c_str(), false); - printf("%s: wrote file '%s'\n", __func__, fname.c_str()); - gguf_free(ctx); } @@ -370,11 +367,8 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) { // create templated prompts std::vector completions = ctrlvec_load_prompt_file(params.cvector_completions_file, false); auto format_template = [](std::string persona, std::string suffix) { - //const std::string user_tag = "[INST]"; - //const std::string asst_tag = "[/INST]"; - //return user_tag + " Act as if you're extremely " + persona + ". " + asst_tag + " " + suffix; - // TODO make this dynamic - allow the user to change it somehow - and adapt based on model - return persona + " " + suffix; // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST]" + // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. 
[/INST]" + return persona + " " + suffix; }; for (size_t i = 0; i < positive_prompts.size(); ++i) { for (int j = 0; j < std::min((int) completions.size(), params.n_completions); ++j) { diff --git a/examples/control-vector-generator/pca.hpp b/examples/control-vector-generator/pca.hpp index 6bc68185a2009..903f6cc7c9ec7 100644 --- a/examples/control-vector-generator/pca.hpp +++ b/examples/control-vector-generator/pca.hpp @@ -123,15 +123,6 @@ struct pca_model { } ggml_backend_tensor_set(dev_eigenvector, random_vec.data(), 0, ggml_nbytes(dev_eigenvector)); } - - // init host context - //struct ggml_init_params host_params = { - // /*.mem_size =*/ (n_embd * sizeof(float) + ggml_tensor_overhead()) * 2u, - // /*.mem_buffer =*/ NULL, - // /*.no_alloc =*/ false, - //}; - //ctx_host = ggml_init(host_params); - //host_eigenvector = ggml_new_tensor_1d(ctx_host, GGML_TYPE_F32, n_embd); } ~pca_model() { @@ -322,7 +313,6 @@ static void run_pca( params.n_layers = v_input.size(); power_iteration(params, v_input[il], ctrl_out); printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size()); - //print_debug_tensor(ctrl_out); } } From c59bfa6368f6293addeab3353aa8268197ac3385 Mon Sep 17 00:00:00 2001 From: ngxson Date: Wed, 12 Jun 2024 17:12:02 +0200 Subject: [PATCH 49/56] add print_usage --- common/common.cpp | 4 ++-- examples/control-vector-generator/README.md | 2 +- .../control-vector-generator.cpp | 13 +++++++++++-- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index a7ef4565cdb12..0c04c0b9b3d3f 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1636,7 +1636,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.cvector_negative_file = argv[i]; return true; } - if (arg == "--num-completions") { + if (arg == "--completions") { if (++i >= argc) { invalid_param = true; return true; @@ -1986,7 +1986,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "control-vector-generator", "--positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() }); options.push_back({ "control-vector-generator", "--negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() }); options.push_back({ "control-vector-generator", "--completions-file FNAME","completions file (default: '%s')", params.cvector_completions_file.c_str() }); - options.push_back({ "control-vector-generator", "--num-completions N", "number of lines of completions file to use (default: %d)", params.n_completions }); + options.push_back({ "control-vector-generator", "--completions N", "number of lines of completions file to use (default: %d)", params.n_completions }); options.push_back({ "control-vector-generator", "--batch-pca N", "batch size used for PCA. 
Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch }); options.push_back({ "control-vector-generator", "--iter-pca N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations }); diff --git a/examples/control-vector-generator/README.md b/examples/control-vector-generator/README.md index 1ccb05d7840cf..fd89dfa3b7917 100644 --- a/examples/control-vector-generator/README.md +++ b/examples/control-vector-generator/README.md @@ -17,7 +17,7 @@ Example: ./control-vector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 # With advanced options -./control-vector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --num-completions 128 --pca-iter 2000 --batch-pca 100 +./control-vector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100 # To see help message ./control-vector-generator -h diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp index 96a0e7958c593..136f78974ca97 100644 --- a/examples/control-vector-generator/control-vector-generator.cpp +++ b/examples/control-vector-generator/control-vector-generator.cpp @@ -22,7 +22,7 @@ ////////////////////////////////////////////////// - +// utils template static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { @@ -34,6 +34,15 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { return ret; } +static void print_usage(int argc, char ** argv, const gpt_params & params) { + gpt_params_print_usage(argc, argv, params); + + printf("\nexample usage:\n"); + printf("\n CPU only: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf\n", argv[0]); + printf("\n with GPU: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99\n", argv[0]); + printf("\n advanced: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100\n", argv[0]); + printf("\n"); +} ////////////////////////////////////////////////// @@ -384,7 +393,7 @@ int main(int argc, char ** argv) { gpt_params params; if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + print_usage(argc, argv, params); return 1; } From 334dbaed3f43dda64626d61559e113ffcfb0a7e5 Mon Sep 17 00:00:00 2001 From: ngxson Date: Wed, 12 Jun 2024 17:13:19 +0200 Subject: [PATCH 50/56] shorten help msg --- common/common.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 0c04c0b9b3d3f..5f70b2a6da049 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1981,14 +1981,14 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "logging", " --log-append", "Don't truncate the old log file." 
});
 #endif // LOG_DISABLE_LOGS
 
-    options.push_back({ "control-vector-generator" });
-    options.push_back({ "control-vector-generator", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() });
-    options.push_back({ "control-vector-generator", "--positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() });
-    options.push_back({ "control-vector-generator", "--negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() });
-    options.push_back({ "control-vector-generator", "--completions-file FNAME","completions file (default: '%s')", params.cvector_completions_file.c_str() });
-    options.push_back({ "control-vector-generator", "--completions N", "number of lines of completions file to use (default: %d)", params.n_completions });
-    options.push_back({ "control-vector-generator", "--batch-pca N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch });
-    options.push_back({ "control-vector-generator", "--iter-pca N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
+    options.push_back({ "control-vector" });
+    options.push_back({ "cvector", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() });
+    options.push_back({ "cvector", "--positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() });
+    options.push_back({ "cvector", "--negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() });
+    options.push_back({ "cvector", "--completions-file FNAME","completions file (default: '%s')", params.cvector_completions_file.c_str() });
+    options.push_back({ "cvector", "--completions N", "number of lines of completions file to use (default: %d)", params.n_completions });
+    options.push_back({ "cvector", "--batch-pca N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch });
+    options.push_back({ "cvector", "--iter-pca N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
 
     printf("usage: %s [options]\n", argv[0]);

From 25fb0a6e61d01a6c5ffb69f34770fac433f16746 Mon Sep 17 00:00:00 2001
From: ngxson
Date: Thu, 13 Jun 2024 13:29:46 +0200
Subject: [PATCH 51/56] beautify help msg

---
 common/common.cpp | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 5f70b2a6da049..8cd3a02d83b84 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1982,13 +1982,14 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
 #endif // LOG_DISABLE_LOGS
 
     options.push_back({ "control-vector" });
-    options.push_back({ "cvector", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() });
-    options.push_back({ "cvector", "--positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() });
-    options.push_back({ "cvector", "--negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() });
-    options.push_back({ "cvector", "--completions-file FNAME","completions file (default: '%s')", params.cvector_completions_file.c_str() });
-    options.push_back({ "cvector", "--completions N", "number of lines of completions file to use (default: %d)", params.n_completions });
-    options.push_back({ "cvector", "--batch-pca N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch });
-    options.push_back({ "cvector", "--iter-pca N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
+    options.push_back({ "cvector", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() });
+    options.push_back({ "cvector", " --positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() });
+    options.push_back({ "cvector", " --negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() });
+    options.push_back({ "cvector", " --completions-file FNAME",
+                        "completions file (default: '%s')", params.cvector_completions_file.c_str() });
+    options.push_back({ "cvector", " --completions N", "number of lines of completions file to use (default: %d)", params.n_completions });
+    options.push_back({ "cvector", " --batch-pca N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch });
+    options.push_back({ "cvector", " --iter-pca N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
 
     printf("usage: %s [options]\n", argv[0]);

From ca86d4fd339cdd1534be2ab8d1a3137f0e66d978 Mon Sep 17 00:00:00 2001
From: ngxson
Date: Thu, 13 Jun 2024 13:29:58 +0200
Subject: [PATCH 52/56] escape prompt by default

---
 examples/control-vector-generator/README.md | 13 +++++++++++--
 .../control-vector-generator.cpp            |  7 ++++---
 examples/control-vector-generator/pca.hpp   |  1 +
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/examples/control-vector-generator/README.md b/examples/control-vector-generator/README.md
index fd89dfa3b7917..d00652d73512f 100644
--- a/examples/control-vector-generator/README.md
+++ b/examples/control-vector-generator/README.md
@@ -7,7 +7,7 @@ Related PRs:
 - (Issue) [Generate control vector using llama.cpp](https://github.com/ggerganov/llama.cpp/issues/6880)
 - [Add control-vector-generator example](https://github.com/ggerganov/llama.cpp/pull/7514)
 
-Example:
+## Examples
 
 ```sh
 # CPU only
@@ -21,5 +21,14 @@ Example:
 
 # To see help message
 ./control-vector-generator -h
-# Then, have a look at "control-vector-generator" section
+# Then, have a look at "control-vector" section
 ```
+
+## Tips and tricks
+
+If your prompt have multiple lines (per prompt), convert the newline to escape sequence `\n`. For example:
+
+```
+<|im_start|>system\nAct like a person who is extremely happy.<|im_end|>
+<|im_start|>system\nYou are in a very good mood today<|im_end|>
+```

diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp
index 136f78974ca97..9941683db677e 100644
--- a/examples/control-vector-generator/control-vector-generator.cpp
+++ b/examples/control-vector-generator/control-vector-generator.cpp
@@ -289,7 +289,7 @@ static std::string to_string(const T & val) {
     return ss.str();
 }
 
-static std::vector ctrlvec_load_prompt_file(std::string path, bool skip_empty_lines = false) {
+static std::vector ctrlvec_load_prompt_file(std::string path, bool skip_empty_lines) {
     std::vector output;
     std::ifstream file(path);
     if (!file.is_open()) {
@@ -300,6 +300,7 @@ static std::vector ctrlvec_load_prompt_file(std::string path, bool
     while (std::getline(file, line)) {
         bool is_skip = skip_empty_lines && line.empty();
         if (!is_skip) {
+            string_process_escapes(line);
             output.push_back(line);
         }
     }
@@ -362,8 +363,8 @@ static void export_gguf(const std::vector & v_ctrl, const
 */
 static int prepare_entries(gpt_params & params, train_context & ctx_train) {
     // load prompts
-    std::vector positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file);
-    std::vector negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file);
+    std::vector positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true);
+    std::vector negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true);
     if (positive_prompts.size() != negative_prompts.size()) {
         fprintf(stderr, "number of positive and negative prompts must be equal\n");
         return 1;

diff --git a/examples/control-vector-generator/pca.hpp b/examples/control-vector-generator/pca.hpp
index 903f6cc7c9ec7..28f8bd3e9f4ca 100644
--- a/examples/control-vector-generator/pca.hpp
+++ b/examples/control-vector-generator/pca.hpp
@@ -11,6 +11,7 @@
 #endif
 
 #include
+#include
 #include
 #include
 #include
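[Editor's note] PATCH 52 above makes the prompt loader run every line through common's string_process_escapes() before storing it, which is what lets a single line of positive.txt or negative.txt encode a multi-line chat-template prompt. The C++ below is a minimal stand-in for that pass, not the actual helper from common/common.cpp (which handles more escape sequences than `\n`); the name process_escapes_sketch is invented for illustration.

```cpp
#include <cstdio>
#include <string>

// Stand-in for the escape pass: rewrite the two-character sequence "\n"
// into a real newline so one file line can encode a multi-line prompt.
static std::string process_escapes_sketch(const std::string & in) {
    std::string out;
    out.reserve(in.size());
    for (std::string::size_type i = 0; i < in.size(); ++i) {
        if (in[i] == '\\' && i + 1 < in.size() && in[i + 1] == 'n') {
            out += '\n'; // '\' followed by 'n' becomes a line break
            ++i;         // skip the consumed 'n'
        } else {
            out += in[i];
        }
    }
    return out;
}

int main() {
    // The README's example prompt, as it would sit on one line of positive.txt:
    std::string line = "<|im_start|>system\\nAct like a person who is extremely happy.<|im_end|>";
    printf("%s\n", process_escapes_sketch(line).c_str());
    return 0;
}
```

Run against the README's example line, this prints the two physical lines of the chat template — the form the tokenizer should actually see.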
From 64cad20c2ead025d6b02eb4428abff321e4f0b46 Mon Sep 17 00:00:00 2001
From: ngxson
Date: Thu, 13 Jun 2024 14:51:11 +0200
Subject: [PATCH 53/56] change compile target to llama-cvector-generator

---
 .editorconfig              |  2 +-
 Makefile                   |  3 ++-
 common/common.cpp          |  4 ++--
 common/common.h            |  8 ++++----
 examples/CMakeLists.txt    |  2 +-
 .../CMakeLists.txt         |  4 ++--
 .../README.md              | 14 +++++++-------
 .../completions.txt        |  0
 .../cvector-generator.cpp} |  0
 .../negative.txt           |  0
 .../pca.hpp                |  0
 .../positive.txt           |  0
 12 files changed, 19 insertions(+), 18 deletions(-)
 rename examples/{control-vector-generator => cvector-generator}/CMakeLists.txt (62%)
 rename examples/{control-vector-generator => cvector-generator}/README.md (58%)
 rename examples/{control-vector-generator => cvector-generator}/completions.txt (100%)
 rename examples/{control-vector-generator/control-vector-generator.cpp => cvector-generator/cvector-generator.cpp} (100%)
 rename examples/{control-vector-generator => cvector-generator}/negative.txt (100%)
 rename examples/{control-vector-generator => cvector-generator}/pca.hpp (100%)
 rename examples/{control-vector-generator => cvector-generator}/positive.txt (100%)

diff --git a/.editorconfig b/.editorconfig
index ee6f60e798473..bd525e13f3ece 100644
--- a/.editorconfig
+++ b/.editorconfig
@@ -27,5 +27,5 @@ indent_size = 2
 [examples/llama.swiftui/llama.swiftui.xcodeproj/*]
 indent_style = tab
 
-[examples/control-vector-generator/*.txt]
+[examples/cvector-generator/*.txt]
 insert_final_newline = unset

diff --git a/Makefile b/Makefile
index 1b9affcd2abe9..5ab3481fb49ee 100644
--- a/Makefile
+++ b/Makefile
@@ -38,6 +38,7 @@ BUILD_TARGETS = \
 	llama-tokenize \
 	llama-train-text-from-scratch \
 	llama-vdot \
+	llama-cvector-generator \
 	tests/test-c.o
 
 # Binaries only useful for tests
@@ -922,7 +923,7 @@ llama-eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-control-vector-generator: examples/control-vector-generator/control-vector-generator.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

diff --git a/common/common.cpp b/common/common.cpp
index 8cd3a02d83b84..73ff0e85b7b4e 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1611,7 +1611,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.i_chunk = std::stoi(argv[i]);
         return true;
     }
-    // control-vector-generator params
+    // cvector params
     if (arg == "--completions-file") {
         if (++i >= argc) {
             invalid_param = true;
@@ -1981,7 +1981,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "logging", " --log-append", "Don't truncate the old log file." });
 #endif // LOG_DISABLE_LOGS
 
-    options.push_back({ "control-vector" });
+    options.push_back({ "cvector" });
     options.push_back({ "cvector", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() });
     options.push_back({ "cvector", " --positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() });
     options.push_back({ "cvector", " --negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() });

diff --git a/common/common.h b/common/common.h
index fa44859c6a63f..58ed72f433bdf 100644
--- a/common/common.h
+++ b/common/common.h
@@ -233,14 +233,14 @@ struct gpt_params {
     bool process_output = false; // collect data for the output tensor
     bool compute_ppl = true;     // whether to compute perplexity
 
-    // control-vector-generator params
+    // cvector-generator params
     int n_completions = 64;
     int n_pca_batch = 20;
     int n_pca_iterations = 1000;
     std::string cvector_outfile = "control_vector.gguf";
-    std::string cvector_completions_file = "examples/control-vector-generator/completions.txt";
-    std::string cvector_positive_file = "examples/control-vector-generator/positive.txt";
-    std::string cvector_negative_file = "examples/control-vector-generator/negative.txt";
+    std::string cvector_completions_file = "examples/cvector-generator/completions.txt";
+    std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
+    std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
 };
 
 void gpt_params_handle_model_default(gpt_params & params);

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 111fb68694d76..0b51c44c05e4e 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -12,7 +12,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 
 if (EMSCRIPTEN)
 else()
-    add_subdirectory(control-vector-generator)
+    add_subdirectory(cvector-generator)
     add_subdirectory(baby-llama)
     add_subdirectory(batched-bench)
     add_subdirectory(batched)

diff --git a/examples/control-vector-generator/CMakeLists.txt b/examples/cvector-generator/CMakeLists.txt
similarity index 62%
rename from examples/control-vector-generator/CMakeLists.txt
rename to examples/cvector-generator/CMakeLists.txt
index f3688e431d914..0a559d60c2a6d 100644
--- a/examples/control-vector-generator/CMakeLists.txt
+++ b/examples/cvector-generator/CMakeLists.txt
@@ -1,5 +1,5 @@
-set(TARGET control-vector-generator)
-add_executable(${TARGET} control-vector-generator.cpp pca.hpp)
+set(TARGET llama-cvector-generator)
+add_executable(${TARGET} cvector-generator.cpp pca.hpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)

diff --git a/examples/control-vector-generator/README.md b/examples/cvector-generator/README.md
similarity index 58%
rename from examples/control-vector-generator/README.md
rename to examples/cvector-generator/README.md
index d00652d73512f..46dc00cf8123e 100644
--- a/examples/control-vector-generator/README.md
+++ b/examples/cvector-generator/README.md
@@ -1,27 +1,27 @@
-# control-vector-generator
+# cvector-generator
 
 This example demonstrates how to generate a control vector using gguf models.
 
 Related PRs:
 - [Add support for control vectors](https://github.com/ggerganov/llama.cpp/pull/5970)
 - (Issue) [Generate control vector using llama.cpp](https://github.com/ggerganov/llama.cpp/issues/6880)
-- [Add control-vector-generator example](https://github.com/ggerganov/llama.cpp/pull/7514)
+- [Add cvector-generator example](https://github.com/ggerganov/llama.cpp/pull/7514)
 
 ## Examples
 
 ```sh
 # CPU only
-./control-vector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf
+./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf
 
 # With GPU
-./control-vector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99
+./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99
 
 # With advanced options
-./control-vector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100
+./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100
 
 # To see help message
-./control-vector-generator -h
-# Then, have a look at "control-vector" section
+./cvector-generator -h
+# Then, have a look at "cvector" section
 ```
 
 ## Tips and tricks

diff --git a/examples/control-vector-generator/completions.txt b/examples/cvector-generator/completions.txt
similarity index 100%
rename from examples/control-vector-generator/completions.txt
rename to examples/cvector-generator/completions.txt
diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp
similarity index 100%
rename from examples/control-vector-generator/control-vector-generator.cpp
rename to examples/cvector-generator/cvector-generator.cpp
diff --git a/examples/control-vector-generator/negative.txt b/examples/cvector-generator/negative.txt
similarity index 100%
rename from examples/control-vector-generator/negative.txt
rename to examples/cvector-generator/negative.txt
diff --git a/examples/control-vector-generator/pca.hpp b/examples/cvector-generator/pca.hpp
similarity index 100%
rename from examples/control-vector-generator/pca.hpp
rename to examples/cvector-generator/pca.hpp
diff --git a/examples/control-vector-generator/positive.txt b/examples/cvector-generator/positive.txt
similarity index 100%
rename from examples/control-vector-generator/positive.txt
rename to examples/cvector-generator/positive.txt

From 91f7dbfda2325da3652be60885c7071944b1a174 Mon Sep 17 00:00:00 2001
From: ngxson
Date: Thu, 13 Jun 2024 14:55:26 +0200
Subject: [PATCH 54/56] typo

---
 examples/cvector-generator/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cvector-generator/README.md b/examples/cvector-generator/README.md
index 46dc00cf8123e..7b0e79c1ffba8 100644
--- a/examples/cvector-generator/README.md
+++ b/examples/cvector-generator/README.md
@@ -26,7 +26,7 @@ Related PRs:
 
 ## Tips and tricks
 
-If your prompt have multiple lines (per prompt), convert the newline to escape sequence `\n`. For example:
+If you have multiple lines per prompt, you can escape the newline character (change it to `\n`). For example:
 
 ```
 <|im_start|>system\nAct like a person who is extremely happy.<|im_end|>

From f99be2c3ff45b847924b4a4f63be8e997cd7ffe5 Mon Sep 17 00:00:00 2001
From: ngxson
Date: Thu, 13 Jun 2024 15:21:49 +0200
Subject: [PATCH 55/56] disable GPU for PCA

---
 examples/cvector-generator/pca.hpp | 42 ++++++++++++++++--------------
 1 file changed, 22 insertions(+), 20 deletions(-)

diff --git a/examples/cvector-generator/pca.hpp b/examples/cvector-generator/pca.hpp
index 28f8bd3e9f4ca..329cb72bc7c5b 100644
--- a/examples/cvector-generator/pca.hpp
+++ b/examples/cvector-generator/pca.hpp
@@ -64,21 +64,22 @@ struct pca_model {
     struct ggml_tensor * dev_eigenvector;
 
     pca_model(struct ggml_tensor * t_input) {
-#ifdef GGML_USE_CUDA
-        fprintf(stderr, "%s: using CUDA backend\n", __func__);
-        backend = ggml_backend_cuda_init(0); // init device 0
-        if (!backend) {
-            fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
-        }
-#endif
-
-#ifdef GGML_USE_METAL
-        fprintf(stderr, "%s: using Metal backend\n", __func__);
-        backend = ggml_backend_metal_init();
-        if (!backend) {
-            fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
-        }
-#endif
+// TODO: enable GPU support when support for GGML_OP_SQRT is added
+// #ifdef GGML_USE_CUDA
+//         fprintf(stderr, "%s: using CUDA backend\n", __func__);
+//         backend = ggml_backend_cuda_init(0); // init device 0
+//         if (!backend) {
+//             fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
+//         }
+// #endif
+//
+// #ifdef GGML_USE_METAL
+//         fprintf(stderr, "%s: using Metal backend\n", __func__);
+//         backend = ggml_backend_metal_init();
+//         if (!backend) {
+//             fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+//         }
+// #endif
 
     // if there aren't GPU Backends fallback to CPU backend
     if (!backend) {
@@ -206,11 +207,12 @@ static ggml_status compute_piter(
         ggml_backend_cpu_set_n_threads(model.backend, params.n_threads);
     }
 
-#ifdef GGML_USE_METAL
-    if (ggml_backend_is_metal(model.backend)) {
-        ggml_backend_metal_set_n_cb(model.backend, params.n_threads);
-    }
-#endif
+// TODO: enable GPU support when support for GGML_OP_SQRT is added
+//#ifdef GGML_USE_METAL
+//    if (ggml_backend_is_metal(model.backend)) {
+//        ggml_backend_metal_set_n_cb(model.backend, params.n_threads);
+//    }
+//#endif
 
     ggml_status res = ggml_backend_graph_compute(model.backend, gf);
     if (res == GGML_STATUS_SUCCESS) {

From 6d2464aef50c7c705c87e4fcfacfa1d04bfdc6dc Mon Sep 17 00:00:00 2001
From: ngxson
Date: Thu, 13 Jun 2024 15:36:03 +0200
Subject: [PATCH 56/56] code style

---
 examples/cvector-generator/pca.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cvector-generator/pca.hpp b/examples/cvector-generator/pca.hpp
index 329cb72bc7c5b..8b95cec374c23 100644
--- a/examples/cvector-generator/pca.hpp
+++ b/examples/cvector-generator/pca.hpp
@@ -72,7 +72,7 @@ struct pca_model {
 //             fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
 //         }
 // #endif
-//
+
 // #ifdef GGML_USE_METAL
 //         fprintf(stderr, "%s: using Metal backend\n", __func__);
 //         backend = ggml_backend_metal_init();
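[Editor's note] PATCH 55 pins PCA to the CPU backend because, at the time, no GPU backend implemented GGML_OP_SQRT, which compute_piter needs to normalize the candidate eigenvector on every iteration. The standalone C++ below sketches the numerical scheme that pca.hpp expresses as a ggml graph — classic power iteration, whose step count is what --iter-pca bounds. The plain std::vector representation and the function names are illustrative, not the example's actual API.

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// One power-iteration update: v <- (A * v) / ||A * v||, with A an n x n
// row-major matrix. Repeating this drives v toward the dominant eigenvector.
static void power_iter_step(const std::vector<float> & A, std::vector<float> & v, int n) {
    std::vector<float> av(n, 0.0f);
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < n; ++j) {
            av[i] += A[i * n + j] * v[j];
        }
    }
    float norm = 0.0f;
    for (int i = 0; i < n; ++i) {
        norm += av[i] * av[i];
    }
    norm = std::sqrt(norm); // scalar analogue of the GGML_OP_SQRT node
    for (int i = 0; i < n; ++i) {
        v[i] = av[i] / norm;
    }
}

int main() {
    // Toy 2x2 covariance-like matrix; its dominant eigenvector is ~(0.96, 0.29).
    const int n = 2;
    std::vector<float> A = { 4.0f, 1.0f,
                             1.0f, 1.0f };
    std::vector<float> v = { 1.0f, 1.0f }; // any non-zero starting vector works
    for (int iter = 0; iter < 100; ++iter) { // --iter-pca caps this loop in the example
        power_iter_step(A, v, n);
    }
    printf("dominant eigenvector: (%.3f, %.3f)\n", v[0], v[1]);
    return 0;
}
```

The std::sqrt in the normalization is the scalar counterpart of the GGML_OP_SQRT operation that kept the graph off the CUDA and Metal backends, which is why the commented-out backend initialization above falls through to CPU.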