From 05cfbaa1b8d7b096fe48604dc7914c3a91355ad5 Mon Sep 17 00:00:00 2001 From: admin Date: Fri, 15 May 2026 13:45:21 +0700 Subject: [PATCH] first commit --- .gitignore | 4 + Makefile | 47 +++ README.md | 198 +++++++++++++ binding.cpp | 719 ++++++++++++++++++++++++++++++++++++++++++++++ binding.h | 63 ++++ build.conf | 3 + cgo_flags.go | 10 + examples/main.go | 48 ++++ go.mod | 3 + llama.go | 409 ++++++++++++++++++++++++++ llama_cublas.go | 9 + llama_openblas.go | 9 + options.go | 460 +++++++++++++++++++++++++++++ 13 files changed, 1982 insertions(+) create mode 100644 .gitignore create mode 100644 Makefile create mode 100644 README.md create mode 100644 binding.cpp create mode 100644 binding.h create mode 100644 build.conf create mode 100644 cgo_flags.go create mode 100644 examples/main.go create mode 100644 go.mod create mode 100644 llama.go create mode 100644 llama_cublas.go create mode 100644 llama_openblas.go create mode 100644 options.go diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7448f6f --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +*.o +*.a +binding.o +libbinding.a diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..0b95e11 --- /dev/null +++ b/Makefile @@ -0,0 +1,47 @@ +.PHONY: all clean libbinding.a + +include build.conf + +LLAMA_INCLUDE := $(LLAMA_CPP_PATH)/include +LLAMA_COMMON := $(LLAMA_CPP_PATH)/common +LLAMA_GGML := $(LLAMA_CPP_PATH)/ggml/include + +CXXFLAGS := -std=c++17 -O3 -DNDEBUG -fPIC -pthread \ + -I$(LLAMA_INCLUDE) -I$(LLAMA_COMMON) -I$(LLAMA_GGML) -I. 
+ +LDFLAGS_LIBS := \ + -L$(LLAMA_BUILD_PATH)/src -lllama \ + -L$(LLAMA_BUILD_PATH)/common -lllama-common \ + -L$(LLAMA_BUILD_PATH)/ggml/src -lggml -lggml-cpu -lggml-base \ + -L$(LLAMA_BUILD_PATH)/vendor/cpp-httplib -lcpp-httplib \ + -lpthread -fopenmp -ldl -lm -lstdc++ + +all: libbinding.a cgo_flags.go + +# Обновить пути в cgo_flags.go из build.conf +cgo_flags.go: build.conf + @LLAMA=$$(grep '^LLAMA_CPP_PATH=' build.conf | cut -d= -f2); \ + BUILD=$$(grep '^LLAMA_BUILD_PATH=' build.conf | cut -d= -f2); \ + printf '%s\n' \ + 'package llama' \ + '' \ + '/*' \ + "#cgo CXXFLAGS: -std=c++17 -I$$LLAMA/include -I$$LLAMA/common -I$$LLAMA/ggml/include -I\$${SRCDIR}" \ + "#cgo LDFLAGS: -L\$${SRCDIR} -lbinding -L$$BUILD/src -lllama -L$$BUILD/common -lllama-common -lllama-common-base -L$$BUILD/ggml/src -lggml -lggml-cpu -lggml-base -L$$BUILD/vendor/cpp-httplib -lcpp-httplib -lstdc++ -lm -lpthread -fopenmp -ldl" \ + '*/' \ + 'import "C"' \ + > cgo_flags.go + +$(LLAMA_BUILD_PATH)/src/libllama.a: + cd $(LLAMA_BUILD_PATH) && cmake .. -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF && \ + cmake --build . --target llama llama-common -j$$(nproc) + +binding.o: binding.cpp binding.h $(LLAMA_BUILD_PATH)/src/libllama.a + $(CXX) $(CXXFLAGS) -c binding.cpp -o binding.o + +libbinding.a: binding.o + ar rcs libbinding.a binding.o + @echo "Собрано: libbinding.a. Линковка llama.cpp — через cgo_flags.go." + +clean: + rm -f binding.o libbinding.a diff --git a/README.md b/README.md new file mode 100644 index 0000000..e9f7fc9 --- /dev/null +++ b/README.md @@ -0,0 +1,198 @@ +# go-llama-new.cpp + +Go-обёртка над [llama.cpp](https://github.com/ggml-org/llama.cpp) с API, совместимым с [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp): те же имена типов, функций и экспортируемых переменных (`LLama`, `New`, `Predict`, `SetContext`, `EnableEmbeddings` и т.д.). + +Ядро собирается из локальных исходников llama.cpp (не из submodule внутри репозитория). 
Пути к исходникам задаются в файле `build.conf`, переменные окружения для этого не используются. + +## Требования + +- **Go** 1.21 или новее (с поддержкой CGO) +- **Компилятор C++** с поддержкой C++17 (`g++` / `clang++`) +- **CMake** 3.14+ +- **make**, **ar** +- **OpenMP** (обычно пакет `libgomp` в Linux) +- Инструменты сборки: `git`, `build-essential` (или аналог) + +Для линковки также нужны статические библиотеки, которые CMake собирает из llama.cpp: `libllama.a`, `libllama-common.a`, `libllama-common-base.a`, `libggml*.a`, `libcpp-httplib.a`. + +## Настройка путей + +Отредактируйте `build.conf` в корне модуля: + +```ini +# Пути к исходникам llama.cpp (без переменных окружения) +LLAMA_CPP_PATH=/home/admin/cpp/llama.cpp +LLAMA_BUILD_PATH=/home/admin/cpp/llama.cpp/build +``` + +| Параметр | Описание | +|----------|----------| +| `LLAMA_CPP_PATH` | Каталог с исходниками llama.cpp (`include/`, `common/`, `src/` и т.д.) | +| `LLAMA_BUILD_PATH` | Каталог сборки CMake (внутри него появятся `src/libllama.a` и др.) | + +После изменения `build.conf` выполните `make` — будет пересоздан `cgo_flags.go` с актуальными путями для CGO. + +## Сборка + +Сборка состоит из трёх этапов: сначала нативное ядро llama.cpp, затем C-обёртка `binding` и, наконец, Go-модуль. + +### 1. Сборка llama.cpp + +```bash +mkdir -p /home/admin/cpp/llama.cpp/build +cd /home/admin/cpp/llama.cpp/build + +cmake .. \ + -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_SHARED_LIBS=OFF + +cmake --build . --target llama llama-common -j"$(nproc)" +``` + +Проверка, что библиотеки на месте (команды выполняются в каталоге сборки): + +```bash +ls -la src/libllama.a +ls -la common/libllama-common.a +ls -la common/libllama-common-base.a +ls -la ggml/src/libggml.a +``` + +Цель `make` в каталоге модуля при необходимости запустит эту же сборку автоматически (см. `Makefile`). + +### 2. Сборка C-обёртки (libbinding.a) + +В каталоге модуля: + +```bash +cd /path/to/go-llama-new.cpp +make +``` + +Будет выполнено: + +1. 
Генерация `cgo_flags.go` из `build.conf` +2. Компиляция `binding.cpp` → `binding.o` +3. Создание архива `libbinding.a` + +Очистка артефактов обёртки: + +```bash +make clean +``` + +### 3. Сборка Go-модуля + +```bash +go build ./... +``` + +Или пример: + +```bash +go build -o llama-example ./examples/ +go run ./examples/main.go /path/to/model.gguf "Привет, мир" +``` + +При первой сборке CGO скомпилирует `binding.cpp` ещё раз и слинкует его с библиотеками из `LLAMA_BUILD_PATH` (см. `cgo_flags.go`). + +## Использование в своём проекте + +```go +import llama "go-llama-new.cpp" + +func main() { + model, err := llama.New("/path/to/model.gguf", + llama.SetContext(4096), + llama.SetGPULayers(0), + ) + if err != nil { + panic(err) + } + defer model.Free() + + text, err := model.Predict("Привет", + llama.SetTokens(128), + llama.SetTemperature(0.8), + ) + if err != nil { + panic(err) + } + println(text) +} +``` + +В `go.mod` вашего проекта: + +```go +require go-llama-new.cpp v0.0.0 + +replace go-llama-new.cpp => /path/to/go-llama-new.cpp +``` + +Перед `go build` в проекте-потребителе должны быть собраны llama.cpp и `libbinding.a` (шаги 1–2 выше). + +## Опциональные теги сборки + +Как в оригинальном go-llama.cpp: + +| Тег | Назначение | +|-----|------------| +| `openblas` | Дополнительная линковка с OpenBLAS (`llama_openblas.go`) | +| `cublas` | CUDA (`llama_cublas.go`) — требует отдельной сборки llama.cpp с `GGML_CUDA=ON` | + +Пример: + +```bash +go build -tags openblas ./... +``` + +Для GPU нужно пересобрать llama.cpp с нужными опциями CMake (например `-DGGML_CUDA=ON`) и убедиться, что пути в `build.conf` указывают на эту сборку. + +## Устранение неполадок + +### `неопределённая ссылка на llama_compiler` / `llama_commit` / `llama_build_number` + +Не слинкована `libllama-common-base.a`. Убедитесь, что в `cgo_flags.go` в `LDFLAGS` есть `-lllama-common-base`, и пересоберите: + +```bash +make +go build ./... 
+``` + +### `cannot find -lllama` или `-lllama-common` + +Проверьте `LLAMA_BUILD_PATH` в `build.conf` и выполните сборку llama.cpp (шаг 1). + +### CGO отключён + +```bash +go env CGO_ENABLED # должно быть 1 +``` + +Установите `gcc`/`g++`, если CGO выключен из-за отсутствия компилятора C. + +### Изменили путь к llama.cpp + +1. Обновите `build.conf` +2. `make` (обновит `cgo_flags.go` и `libbinding.a`) +3. `go build ./...` + +## Структура репозитория + +``` +. +├── build.conf # пути к llama.cpp +├── binding.h +├── binding.cpp # C API для CGO +├── cgo_flags.go # флаги CGO (генерируется make) +├── llama.go +├── options.go +├── Makefile +├── examples/main.go +└── README.md +``` + +## Лицензия + +Следует лицензиям llama.cpp и исходного go-llama.cpp. Используйте в соответствии с условиями соответствующих проектов. diff --git a/binding.cpp b/binding.cpp new file mode 100644 index 0000000..abdfe84 --- /dev/null +++ b/binding.cpp @@ -0,0 +1,719 @@ +#include "binding.h" + +#include "common.h" +#include "llama.h" +#include "sampling.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +struct llama_binding_state { + common_init_result_ptr init; + llama_model * model = nullptr; + llama_context * ctx = nullptr; + common_sampler * smpl = nullptr; + bool embeddings = false; +}; + +static llama_binding_state * binding_state(void * state_pr) { + return static_cast(state_pr); +} + +static void parse_tensor_split(const char * tensorsplit, float * out, size_t n) { + for (size_t i = 0; i < n; ++i) { + out[i] = 0.0f; + } + if (tensorsplit == nullptr || tensorsplit[0] == '\0') { + return; + } + std::string arg_next = tensorsplit; + const std::regex regex{R"([,/]+)"}; + std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; + std::vector split_arg{it, {}}; + for (size_t i = 0; i < split_arg.size() && i < n; ++i) { + out[i] = std::stof(split_arg[i]); + } +} + +static void apply_model_load_options( + common_params & params, + int n_ctx, + 
int n_seed, + bool memory_f16, + bool mlock, + bool embeddings, + bool mmap, + int n_gpu, + int n_batch, + const char * maingpu, + const char * tensorsplit, + bool numa, + float rope_freq_base, + float rope_freq_scale, + const char * lora, + const char * lora_base, + bool perplexity) { + (void) lora_base; + + if (n_ctx > 0) { + params.n_ctx = n_ctx; + } + if (n_seed >= 0) { + params.sampling.seed = (uint32_t) n_seed; + } + params.use_mlock = mlock; + params.embedding = embeddings; + params.use_mmap = mmap; + params.n_gpu_layers = n_gpu; + params.n_batch = n_batch > 0 ? n_batch : params.n_batch; + params.n_ubatch = std::min(params.n_batch, params.n_ubatch); + params.numa = numa ? GGML_NUMA_STRATEGY_DISTRIBUTE : GGML_NUMA_STRATEGY_DISABLED; + params.warmup = false; + params.fit_params = false; + + if (rope_freq_base > 0.0f) { + params.rope_freq_base = rope_freq_base; + } + if (rope_freq_scale > 0.0f) { + params.rope_freq_scale = rope_freq_scale; + } + + if (memory_f16) { + params.cache_type_k = GGML_TYPE_F16; + params.cache_type_v = GGML_TYPE_F16; + } + + if (maingpu != nullptr && maingpu[0] != '\0') { + params.main_gpu = std::stoi(maingpu); + } + + parse_tensor_split(tensorsplit, params.tensor_split, sizeof(params.tensor_split) / sizeof(params.tensor_split[0])); + + if (perplexity) { + params.compute_ppl = true; + } + + if (lora != nullptr && lora[0] != '\0') { + common_adapter_lora_info la; + la.path = lora; + la.scale = 1.0f; + params.lora_adapters.push_back(la); + } +} + +static bool check_antiprompt( + const std::string & output, + const std::vector & antiprompt, + bool interactive) { + for (const auto & ap : antiprompt) { + if (ap.empty()) { + continue; + } + const size_t extra = interactive ? 0 : 2; + const size_t search_start = output.length() > ap.length() + extra + ? 
output.length() - ap.length() - extra + : 0; + if (output.find(ap, search_start) != std::string::npos) { + return true; + } + } + return false; +} + +extern "C" { + +void * load_model( + const char * fname, + int n_ctx, + int n_seed, + bool memory_f16, + bool mlock, + bool embeddings, + bool mmap, + bool low_vram, + int n_gpu, + int n_batch, + const char * maingpu, + const char * tensorsplit, + bool numa, + float rope_freq_base, + float rope_freq_scale, + bool mul_mat_q, + const char * lora, + const char * lora_base, + bool perplexity) { + (void) low_vram; + (void) mul_mat_q; + + common_init(); + llama_backend_init(); + + common_params params; + params.model.path = fname; + + apply_model_load_options( + params, n_ctx, n_seed, memory_f16, mlock, embeddings, mmap, + n_gpu, n_batch, maingpu, tensorsplit, numa, + rope_freq_base, rope_freq_scale, lora, lora_base, perplexity); + + llama_numa_init(params.numa); + + auto * binding = new llama_binding_state(); + binding->init = common_init_from_params(params); + if (!binding->init || binding->init->context() == nullptr) { + delete binding; + return nullptr; + } + + binding->model = binding->init->model(); + binding->ctx = binding->init->context(); + binding->smpl = binding->init->sampler(0); + binding->embeddings = embeddings; + + return binding; +} + +void llama_binding_free_model(void * state_pr) { + delete binding_state(state_pr); +} + +int load_state(void * state_pr, char * statefile, char * modes) { + (void) modes; + auto * state = binding_state(state_pr); + if (state == nullptr || state->ctx == nullptr) { + return 1; + } + + std::vector tokens(llama_n_ctx(state->ctx)); + size_t n_out = 0; + if (!llama_state_load_file(state->ctx, statefile, tokens.data(), tokens.size(), &n_out)) { + return 1; + } + return 0; +} + +void save_state(void * state_pr, char * dst, char * modes) { + (void) modes; + auto * state = binding_state(state_pr); + if (state == nullptr || state->ctx == nullptr) { + return; + } + 
llama_state_save_file(state->ctx, dst, nullptr, 0); +} + +void * llama_allocate_params( + const char * prompt, + int seed, + int threads, + int tokens, + int top_k, + float top_p, + float temp, + float repeat_penalty, + int repeat_last_n, + bool ignore_eos, + bool memory_f16, + int n_batch, + int n_keep, + const char ** antiprompt, + int antiprompt_count, + float tfs_z, + float typical_p, + float frequency_penalty, + float presence_penalty, + int mirostat, + float mirostat_eta, + float mirostat_tau, + bool penalize_nl, + const char * logit_bias, + const char * session_file, + bool prompt_cache_all, + bool mlock, + bool mmap, + const char * maingpu, + const char * tensorsplit, + bool prompt_cache_ro, + const char * grammar, + float rope_freq_base, + float rope_freq_scale, + float negative_prompt_scale, + const char * negative_prompt, + int n_draft) { + (void) tfs_z; + (void) penalize_nl; + (void) negative_prompt_scale; + (void) negative_prompt; + (void) memory_f16; + + auto * params = new common_params(); + params->prompt = prompt != nullptr ? prompt : ""; + params->n_predict = tokens; + params->n_batch = n_batch > 0 ? n_batch : params->n_batch; + params->n_keep = n_keep; + params->use_mlock = mlock; + params->use_mmap = mmap; + params->path_prompt_cache = session_file != nullptr ? session_file : ""; + params->prompt_cache_all = prompt_cache_all; + params->prompt_cache_ro = prompt_cache_ro; + + if (rope_freq_base > 0.0f) { + params->rope_freq_base = rope_freq_base; + } + if (rope_freq_scale > 0.0f) { + params->rope_freq_scale = rope_freq_scale; + } + + params->sampling.seed = seed >= 0 ? (uint32_t) seed : LLAMA_DEFAULT_SEED; + params->cpuparams.n_threads = threads > 0 ? 
threads : 4; + params->cpuparams_batch.n_threads = params->cpuparams.n_threads; + params->sampling.top_k = top_k; + params->sampling.top_p = top_p; + params->sampling.temp = temp; + params->sampling.penalty_repeat = repeat_penalty; + params->sampling.penalty_last_n = repeat_last_n; + params->sampling.penalty_freq = frequency_penalty; + params->sampling.penalty_present = presence_penalty; + params->sampling.typ_p = typical_p > 0 ? typical_p : 1.0f; + params->sampling.mirostat = mirostat; + params->sampling.mirostat_eta = mirostat_eta; + params->sampling.mirostat_tau = mirostat_tau; + params->sampling.ignore_eos = ignore_eos; + + if (grammar != nullptr && grammar[0] != '\0') { + params->sampling.grammar = common_grammar(COMMON_GRAMMAR_TYPE_USER, grammar); + } + + if (maingpu != nullptr && maingpu[0] != '\0') { + params->main_gpu = std::stoi(maingpu); + } + parse_tensor_split(tensorsplit, params->tensor_split, sizeof(params->tensor_split) / sizeof(params->tensor_split[0])); + + if (antiprompt_count > 0 && antiprompt != nullptr) { + params->antiprompt = create_vector(antiprompt, antiprompt_count); + } + + if (logit_bias != nullptr && logit_bias[0] != '\0') { + std::stringstream ss(logit_bias); + llama_token key; + char sign = 0; + std::string value_str; + if (ss >> key >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { + params->sampling.logit_bias.push_back({key, std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f)}); + } + } + + params->speculative.draft.n_max = n_draft > 0 ? n_draft : params->speculative.draft.n_max; + + return params; +} + +void llama_free_params(void * params_ptr) { + delete static_cast(params_ptr); +} + +int eval(void * params_ptr, void * state_pr, char * text) { + auto * params = static_cast(params_ptr); + auto * state = binding_state(state_pr); + if (state == nullptr || state->ctx == nullptr) { + return 1; + } + + std::string str = text != nullptr ? 
text : params->prompt; + auto embd = common_tokenize(state->ctx, str, true, true); + if (embd.empty()) { + return 1; + } + + int n_past = 0; + if (!common_prompt_batch_decode(state->ctx, embd, n_past, params->n_batch, "", false)) { + return 1; + } + return 0; +} + +int get_embeddings(void * params_ptr, void * state_pr, float * res_embeddings) { + auto * params = static_cast(params_ptr); + auto * state = binding_state(state_pr); + if (state == nullptr || state->ctx == nullptr || !state->embeddings) { + return 1; + } + + auto embd = common_tokenize(state->ctx, params->prompt, true, true); + if (!embd.empty()) { + int n_past = 0; + if (!common_prompt_batch_decode(state->ctx, embd, n_past, params->n_batch, "", false)) { + return 1; + } + } + + const int n_embd = llama_model_n_embd(state->model); + const float * emb = llama_get_embeddings_ith(state->ctx, -1); + if (emb == nullptr) { + emb = llama_get_embeddings(state->ctx); + } + if (emb == nullptr) { + return 1; + } + + for (int i = 0; i < n_embd; ++i) { + res_embeddings[i] = emb[i]; + } + return 0; +} + +int get_token_embeddings(void * params_ptr, void * state_pr, int * tokens, int tokenSize, float * res_embeddings) { + auto * params = static_cast(params_ptr); + auto * state = binding_state(state_pr); + if (state == nullptr || state->ctx == nullptr) { + return 1; + } + + std::string text; + for (int i = 0; i < tokenSize; ++i) { + text += common_token_to_piece(state->ctx, tokens[i]); + } + params->prompt = text; + return get_embeddings(params_ptr, state_pr, res_embeddings); +} + +int llama_tokenize_string(void * params_ptr, void * state_pr, int * result) { + auto * params = static_cast(params_ptr); + auto * state = binding_state(state_pr); + if (state == nullptr || state->ctx == nullptr) { + return -1; + } + + const llama_vocab * vocab = llama_model_get_vocab(state->model); + const bool add_bos = llama_vocab_get_add_bos(vocab); + const int32_t max_tokens = params->n_ctx > 0 ? 
params->n_ctx : 4096; + + return llama_tokenize( + vocab, + params->prompt.c_str(), + (int32_t) params->prompt.size(), + reinterpret_cast(result), + max_tokens, + add_bos, + true); +} + +int llama_predict(void * params_ptr, void * state_pr, char * result, bool debug) { + auto * params = static_cast(params_ptr); + auto * state = binding_state(state_pr); + if (state == nullptr || state->ctx == nullptr || state->smpl == nullptr) { + return 1; + } + + llama_context * ctx = state->ctx; + llama_model * model = state->model; + const llama_vocab * vocab = llama_model_get_vocab(model); + llama_memory_t mem = llama_get_memory(ctx); + + common_sampler_ptr smpl_ptr(common_sampler_init(model, params->sampling)); + if (!smpl_ptr) { + return 1; + } + common_sampler * smpl = smpl_ptr.get(); + + const int n_ctx = llama_n_ctx(ctx); + if (params->n_predict < 0) { + params->n_predict = 128; + } + + llama_set_n_threads(ctx, params->cpuparams.n_threads, params->cpuparams_batch.n_threads); + + std::string path_session = params->path_prompt_cache; + std::vector session_tokens; + + if (!path_session.empty()) { + session_tokens.resize(n_ctx); + size_t n_out = 0; + if (std::ifstream(path_session).good()) { + llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size(), &n_out); + session_tokens.resize(n_out); + } + } + + const bool add_bos = llama_vocab_get_add_bos(vocab); + std::vector embd_inp = common_tokenize(ctx, params->prompt, add_bos, true); + if (embd_inp.empty()) { + embd_inp.push_back(llama_vocab_bos(vocab)); + } + + if ((int) embd_inp.size() > n_ctx - 4) { + return 1; + } + + if (params->n_keep < 0 || params->n_keep > (int) embd_inp.size()) { + params->n_keep = (int) embd_inp.size(); + } + + common_sampler_reset(smpl); + + int n_past = 0; + int n_remain = params->n_predict; + int n_consumed = 0; + int n_session_consumed = 0; + bool is_antiprompt = false; + bool need_save_session = !path_session.empty() && !params->prompt_cache_ro; + + std::vector 
embd; + std::string res; + + while (n_remain > 0 && !is_antiprompt) { + if (!embd.empty()) { + const int max_embd_size = n_ctx - 4; + if ((int) embd.size() > max_embd_size) { + embd.resize(max_embd_size); + } + + if (n_past + (int) embd.size() >= n_ctx) { + const int n_left = n_past - params->n_keep; + const int n_discard = n_left / 2; + llama_memory_seq_rm(mem, 0, params->n_keep, params->n_keep + n_discard); + llama_memory_seq_add(mem, 0, params->n_keep + n_discard, n_past, -n_discard); + n_past -= n_discard; + path_session.clear(); + } + + if (n_session_consumed < (int) session_tokens.size()) { + size_t i = 0; + for (; i < embd.size(); ++i) { + if (embd[i] != session_tokens[n_session_consumed]) { + session_tokens.resize(n_session_consumed); + break; + } + n_past++; + n_session_consumed++; + if (n_session_consumed >= (int) session_tokens.size()) { + ++i; + break; + } + } + if (i > 0) { + embd.erase(embd.begin(), embd.begin() + i); + } + } + + if (!embd.empty()) { + const bool save_now = need_save_session && n_consumed >= (int) embd_inp.size(); + if (!common_prompt_batch_decode(ctx, embd, n_past, params->n_batch, path_session, save_now)) { + return 1; + } + session_tokens.insert(session_tokens.end(), embd.begin(), embd.end()); + n_session_consumed = session_tokens.size(); + need_save_session = false; + } + } + + embd.clear(); + + if ((int) embd_inp.size() <= n_consumed) { + const llama_token id = common_sampler_sample(smpl, ctx, -1); + common_sampler_accept(smpl, id, true); + embd.push_back(id); + + auto piece = common_token_to_piece(ctx, id); + if (!tokenCallback(state_pr, const_cast(piece.c_str()))) { + break; + } + + res += piece; + --n_remain; + + if (llama_vocab_is_eog(vocab, id)) { + break; + } + } else { + while ((int) embd_inp.size() > n_consumed) { + embd.push_back(embd_inp[n_consumed]); + common_sampler_accept(smpl, embd_inp[n_consumed], false); + ++n_consumed; + if ((int) embd.size() >= params->n_batch) { + break; + } + } + } + + for (const auto id : 
embd) { + res += common_token_to_piece(ctx, id); + } + + if ((int) embd_inp.size() <= n_consumed && !params->antiprompt.empty()) { + is_antiprompt = check_antiprompt(res, params->antiprompt, false); + } + } + + if (!path_session.empty() && params->prompt_cache_all && !params->prompt_cache_ro) { + llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); + } + + if (debug) { + common_perf_print(ctx, smpl); + } + + if (result != nullptr) { + std::strncpy(result, res.c_str(), params->n_predict > 0 ? (size_t) params->n_predict : res.size()); + result[params->n_predict > 0 ? params->n_predict - 1 : res.size()] = '\0'; + } + + return 0; +} + +int speculative_sampling(void * params_ptr, void * target_model, void * draft_model, char * result, bool debug) { + auto * params = static_cast(params_ptr); + auto * tgt = binding_state(target_model); + auto * dft = binding_state(draft_model); + if (tgt == nullptr || dft == nullptr || tgt->ctx == nullptr || dft->ctx == nullptr) { + return 1; + } + + llama_context * ctx_tgt = tgt->ctx; + llama_context * ctx_dft = dft->ctx; + const llama_vocab * vocab = llama_model_get_vocab(tgt->model); + + common_sampler_ptr smpl_ptr(common_sampler_init(tgt->model, params->sampling)); + if (!smpl_ptr) { + return 1; + } + common_sampler * smpl_tgt = smpl_ptr.get(); + + auto inp = common_tokenize(ctx_tgt, params->prompt, true, true); + const int max_tokens = llama_n_ctx(ctx_tgt) - 4; + if ((int) inp.size() > max_tokens) { + return 1; + } + + int n_past_tgt = 0; + int n_past_dft = 0; + if (!inp.empty()) { + if (!common_prompt_batch_decode(ctx_tgt, inp, n_past_tgt, params->n_batch, "", false)) { + return 1; + } + if (!common_prompt_batch_decode(ctx_dft, inp, n_past_dft, params->n_batch, "", false)) { + return 1; + } + } + + const int n_draft = params->speculative.draft.n_max > 0 ? 
params->speculative.draft.n_max : 16; + int n_predict = 0; + std::string res; + bool has_eos = false; + + std::vector drafted; + std::vector last_tokens(llama_n_ctx(ctx_tgt), 0); + for (auto id : inp) { + last_tokens.erase(last_tokens.begin()); + last_tokens.push_back(id); + } + + while (n_predict < params->n_predict && !has_eos) { + int i_dft = 0; + while (true) { + const llama_token id = common_sampler_sample(smpl_tgt, ctx_tgt, -1); + common_sampler_accept(smpl_tgt, id, true); + + last_tokens.erase(last_tokens.begin()); + last_tokens.push_back(id); + + auto piece = common_token_to_piece(ctx_tgt, id); + if (!tokenCallback(draft_model, const_cast(piece.c_str()))) { + break; + } + res += piece; + + if (llama_vocab_is_eog(vocab, id)) { + has_eos = true; + } + + ++n_predict; + + if (i_dft < (int) drafted.size() && id == drafted[i_dft]) { + ++i_dft; + continue; + } + + llama_token dft_id = id; + llama_batch batch = llama_batch_get_one(&dft_id, 1); + if (llama_decode(ctx_dft, batch) != 0) { + return 1; + } + ++n_past_dft; + + drafted.clear(); + drafted.push_back(id); + break; + } + + if (n_predict >= params->n_predict || has_eos) { + break; + } + + int n_past_cur = n_past_dft; + for (int i = 0; i < n_draft; ++i) { + float * logits = llama_get_logits(ctx_dft); + const int n_vocab = llama_vocab_n_tokens(vocab); + + llama_token draft_id = 0; + float max_logit = logits[0]; + for (llama_token t = 1; t < n_vocab; ++t) { + if (logits[t] > max_logit) { + max_logit = logits[t]; + draft_id = t; + } + } + drafted.push_back(draft_id); + + if (i == n_draft - 1) { + break; + } + + llama_batch batch = llama_batch_get_one(&draft_id, 1); + if (llama_decode(ctx_dft, batch) != 0) { + return 1; + } + ++n_past_cur; + } + + llama_batch batch = llama_batch_get_one(drafted.data(), (int32_t) drafted.size()); + if (llama_decode(ctx_tgt, batch) != 0) { + return 1; + } + ++n_past_tgt; + + if (!drafted.empty()) { + drafted.erase(drafted.begin()); + } + } + + if (debug) { + 
common_perf_print(ctx_tgt, smpl_tgt); + common_perf_print(ctx_dft, nullptr); + } + + if (result != nullptr) { + std::strncpy(result, res.c_str(), params->n_predict > 0 ? (size_t) params->n_predict : res.size()); + } + + return 0; +} + +} // extern "C" + +std::vector create_vector(const char ** strings, int count) { + std::vector vec; + for (int i = 0; i < count; ++i) { + vec.emplace_back(strings[i]); + } + return vec; +} + +void delete_vector(std::vector * vec) { + delete vec; +} diff --git a/binding.h b/binding.h new file mode 100644 index 0000000..44664eb --- /dev/null +++ b/binding.h @@ -0,0 +1,63 @@ +#ifdef __cplusplus +#include +#include +extern "C" { +#endif + +#include + +extern unsigned char tokenCallback(void *, char *); + +int load_state(void *ctx, char *statefile, char*modes); + +int eval(void* params_ptr, void *ctx, char*text); + +void save_state(void *ctx, char *dst, char*modes); + +void* load_model(const char *fname, + int n_ctx, + int n_seed, + bool memory_f16, + bool mlock, + bool embeddings, + bool mmap, + bool low_vram, + int n_gpu, + int n_batch, + const char *maingpu, + const char *tensorsplit, + bool numa, + float rope_freq_base, + float rope_freq_scale, + bool mul_mat_q, const char *lora, const char *lora_base, bool perplexity + ); + +int get_embeddings(void* params_ptr, void* state_pr, float * res_embeddings); + +int get_token_embeddings(void* params_ptr, void* state_pr, int *tokens, int tokenSize, float * res_embeddings); + +void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens, + int top_k, float top_p, float temp, float repeat_penalty, + int repeat_last_n, bool ignore_eos, bool memory_f16, + int n_batch, int n_keep, const char** antiprompt, int antiprompt_count, + float tfs_z, float typical_p, float frequency_penalty, float presence_penalty, int mirostat, float mirostat_eta, float mirostat_tau, bool penalize_nl, const char *logit_bias, const char *session_file, bool prompt_cache_all, bool mlock, bool mmap, const 
char *maingpu, const char *tensorsplit , + bool prompt_cache_ro, const char *grammar, float rope_freq_base, float rope_freq_scale, float negative_prompt_scale, const char* negative_prompt, + int n_draft); + +int speculative_sampling(void* params_ptr, void* target_model, void* draft_model, char* result, bool debug); + +void llama_free_params(void* params_ptr); + +void llama_binding_free_model(void* state); + +int llama_tokenize_string(void* params_ptr, void* state_pr, int* result); + +int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug); + +#ifdef __cplusplus +} + + +std::vector create_vector(const char** strings, int count); +void delete_vector(std::vector* vec); +#endif diff --git a/build.conf b/build.conf new file mode 100644 index 0000000..3884c3e --- /dev/null +++ b/build.conf @@ -0,0 +1,3 @@ +# Пути к исходникам llama.cpp (без переменных окружения) +LLAMA_CPP_PATH=/home/admin/cpp/llama.cpp +LLAMA_BUILD_PATH=/home/admin/cpp/llama.cpp/build diff --git a/cgo_flags.go b/cgo_flags.go new file mode 100644 index 0000000..111815a --- /dev/null +++ b/cgo_flags.go @@ -0,0 +1,10 @@ +package llama + +// Флаги CGO генерируются из build.conf (пути к /home/admin/cpp/llama.cpp). +// При смене пути отредактируйте build.conf и этот файл. 
+ +/* +#cgo CXXFLAGS: -std=c++17 -I/home/admin/cpp/llama.cpp/include -I/home/admin/cpp/llama.cpp/common -I/home/admin/cpp/llama.cpp/ggml/include -I${SRCDIR} +#cgo LDFLAGS: -L${SRCDIR} -lbinding -L/home/admin/cpp/llama.cpp/build/src -lllama -L/home/admin/cpp/llama.cpp/build/common -lllama-common -lllama-common-base -L/home/admin/cpp/llama.cpp/build/ggml/src -lggml -lggml-cpu -lggml-base -L/home/admin/cpp/llama.cpp/build/vendor/cpp-httplib -lcpp-httplib -lstdc++ -lm -lpthread -fopenmp -ldl +*/ +import "C" diff --git a/examples/main.go b/examples/main.go new file mode 100644 index 0000000..94904b1 --- /dev/null +++ b/examples/main.go @@ -0,0 +1,48 @@ +package main + +import ( + "bufio" + "fmt" + "os" + "strings" + + llama "go-llama-new.cpp" +) + +func main() { + if len(os.Args) < 2 { + fmt.Fprintf(os.Stderr, "usage: %s [prompt]\n", os.Args[0]) + os.Exit(1) + } + + modelPath := os.Args[1] + prompt := "Hello" + if len(os.Args) > 2 { + prompt = strings.Join(os.Args[2:], " ") + } + + l, err := llama.New(modelPath, llama.SetContext(512), llama.SetGPULayers(0)) + if err != nil { + fmt.Fprintf(os.Stderr, "load model: %v\n", err) + os.Exit(1) + } + defer l.Free() + + out, err := l.Predict(prompt, llama.SetTokens(64), llama.SetThreads(4)) + if err != nil { + fmt.Fprintf(os.Stderr, "predict: %v\n", err) + os.Exit(1) + } + + fmt.Println(out) + + reader := bufio.NewReader(os.Stdin) + fmt.Print("\nТокенизация (введите текст): ") + line, _ := reader.ReadString('\n') + _, tokens, err := l.TokenizeString(strings.TrimSpace(line)) + if err != nil { + fmt.Fprintf(os.Stderr, "tokenize: %v\n", err) + return + } + fmt.Printf("токенов: %d, ids: %v\n", len(tokens), tokens) +} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..97f0e89 --- /dev/null +++ b/go.mod @@ -0,0 +1,3 @@ +module go-llama-new.cpp + +go 1.21 diff --git a/llama.go b/llama.go new file mode 100644 index 0000000..5c21ec0 --- /dev/null +++ b/llama.go @@ -0,0 +1,409 @@ +package llama + +// #include "binding.h" +// 
#include +import "C" +import ( + "fmt" + "os" + "strings" + "sync" + "unsafe" +) + +type LLama struct { + state unsafe.Pointer + embeddings bool + contextSize int +} + +func New(model string, opts ...ModelOption) (*LLama, error) { + mo := NewModelOptions(opts...) + modelPath := C.CString(model) + defer C.free(unsafe.Pointer(modelPath)) + loraBase := C.CString(mo.LoraBase) + defer C.free(unsafe.Pointer(loraBase)) + loraAdapter := C.CString(mo.LoraAdapter) + defer C.free(unsafe.Pointer(loraAdapter)) + + MulMatQ := true + + if mo.MulMatQ != nil { + MulMatQ = *mo.MulMatQ + } + + result := C.load_model(modelPath, + C.int(mo.ContextSize), C.int(mo.Seed), + C.bool(mo.F16Memory), C.bool(mo.MLock), C.bool(mo.Embeddings), C.bool(mo.MMap), C.bool(mo.LowVRAM), + C.int(mo.NGPULayers), C.int(mo.NBatch), C.CString(mo.MainGPU), C.CString(mo.TensorSplit), C.bool(mo.NUMA), + C.float(mo.FreqRopeBase), C.float(mo.FreqRopeScale), + C.bool(MulMatQ), loraAdapter, loraBase, C.bool(mo.Perplexity), + ) + + if result == nil { + return nil, fmt.Errorf("failed loading model") + } + + ll := &LLama{state: result, contextSize: mo.ContextSize, embeddings: mo.Embeddings} + return ll, nil +} + +func (l *LLama) Free() { + C.llama_binding_free_model(l.state) +} + +func (l *LLama) LoadState(state string) error { + d := C.CString(state) + w := C.CString("rb") + result := C.load_state(l.state, d, w) + + defer C.free(unsafe.Pointer(d)) + defer C.free(unsafe.Pointer(w)) + + if result != 0 { + return fmt.Errorf("error while loading state") + } + + return nil +} + +func (l *LLama) SaveState(dst string) error { + d := C.CString(dst) + w := C.CString("wb") + + C.save_state(l.state, d, w) + + defer C.free(unsafe.Pointer(d)) + defer C.free(unsafe.Pointer(w)) + + _, err := os.Stat(dst) + return err +} + +// Token Embeddings +func (l *LLama) TokenEmbeddings(tokens []int, opts ...PredictOption) ([]float32, error) { + if !l.embeddings { + return []float32{}, fmt.Errorf("model loaded without embeddings") + } + + po 
:= NewPredictOptions(opts...) + + outSize := po.Tokens + if po.Tokens == 0 { + outSize = 9999999 + } + + floats := make([]float32, outSize) + + myArray := (*C.int)(C.malloc(C.size_t(len(tokens)) * C.sizeof_int)) + + for i, v := range tokens { + (*[1 << 31]int32)(unsafe.Pointer(myArray))[i] = int32(v) + } + + params := C.llama_allocate_params(C.CString(""), C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK), + C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat), + C.bool(po.IgnoreEOS), C.bool(po.F16KV), + C.int(po.Batch), C.int(po.NKeep), nil, C.int(0), + C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty), + C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), C.CString(po.LogitBias), + C.CString(po.PathPromptCache), C.bool(po.PromptCacheAll), C.bool(po.MLock), C.bool(po.MMap), + C.CString(po.MainGPU), C.CString(po.TensorSplit), + C.bool(po.PromptCacheRO), + C.CString(po.Grammar), + C.float(po.RopeFreqBase), C.float(po.RopeFreqScale), C.float(po.NegativePromptScale), C.CString(po.NegativePrompt), + C.int(po.NDraft), + ) + ret := C.get_token_embeddings(params, l.state, myArray, C.int(len(tokens)), (*C.float)(&floats[0])) + C.free(unsafe.Pointer(myArray)) + C.llama_free_params(params) + if ret != 0 { + return floats, fmt.Errorf("embedding inference failed") + } + return floats, nil +} + +// Embeddings +func (l *LLama) Embeddings(text string, opts ...PredictOption) ([]float32, error) { + if !l.embeddings { + return []float32{}, fmt.Errorf("model loaded without embeddings") + } + + po := NewPredictOptions(opts...) 
+ + input := C.CString(text) + defer C.free(unsafe.Pointer(input)) + if po.Tokens == 0 { + po.Tokens = 99999999 + } + floats := make([]float32, po.Tokens) + reverseCount := len(po.StopPrompts) + reversePrompt := make([]*C.char, reverseCount) + var pass **C.char + for i, s := range po.StopPrompts { + cs := C.CString(s) + defer C.free(unsafe.Pointer(cs)) + reversePrompt[i] = cs + pass = &reversePrompt[0] + } + + params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK), + C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat), + C.bool(po.IgnoreEOS), C.bool(po.F16KV), + C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount), + C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty), + C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), C.CString(po.LogitBias), + C.CString(po.PathPromptCache), C.bool(po.PromptCacheAll), C.bool(po.MLock), C.bool(po.MMap), + C.CString(po.MainGPU), C.CString(po.TensorSplit), + C.bool(po.PromptCacheRO), + C.CString(po.Grammar), + C.float(po.RopeFreqBase), C.float(po.RopeFreqScale), C.float(po.NegativePromptScale), C.CString(po.NegativePrompt), + C.int(po.NDraft), + ) + + ret := C.get_embeddings(params, l.state, (*C.float)(&floats[0])) + C.llama_free_params(params) + if ret != 0 { + return floats, fmt.Errorf("embedding inference failed") + } + + return floats, nil +} + +func (l *LLama) Eval(text string, opts ...PredictOption) error { + po := NewPredictOptions(opts...) 
+ + input := C.CString(text) + defer C.free(unsafe.Pointer(input)) + if po.Tokens == 0 { + po.Tokens = 99999999 + } + + reverseCount := len(po.StopPrompts) + reversePrompt := make([]*C.char, reverseCount) + var pass **C.char + for i, s := range po.StopPrompts { + cs := C.CString(s) + defer C.free(unsafe.Pointer(cs)) + reversePrompt[i] = cs + pass = &reversePrompt[0] + } + + params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK), + C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat), + C.bool(po.IgnoreEOS), C.bool(po.F16KV), + C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount), + C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty), + C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), C.CString(po.LogitBias), + C.CString(po.PathPromptCache), C.bool(po.PromptCacheAll), C.bool(po.MLock), C.bool(po.MMap), + C.CString(po.MainGPU), C.CString(po.TensorSplit), + C.bool(po.PromptCacheRO), + C.CString(po.Grammar), + C.float(po.RopeFreqBase), C.float(po.RopeFreqScale), C.float(po.NegativePromptScale), C.CString(po.NegativePrompt), + C.int(po.NDraft), + ) + ret := C.eval(params, l.state, input) + C.llama_free_params(params) + if ret != 0 { + return fmt.Errorf("inference failed") + } + + return nil +} + +func (l *LLama) SpeculativeSampling(ll *LLama, text string, opts ...PredictOption) (string, error) { + po := NewPredictOptions(opts...) 
+ + if po.TokenCallback != nil { + setCallback(l.state, po.TokenCallback) + } + + input := C.CString(text) + defer C.free(unsafe.Pointer(input)) + if po.Tokens == 0 { + po.Tokens = 99999999 + } + out := make([]byte, po.Tokens) + + reverseCount := len(po.StopPrompts) + reversePrompt := make([]*C.char, reverseCount) + var pass **C.char + for i, s := range po.StopPrompts { + cs := C.CString(s) + defer C.free(unsafe.Pointer(cs)) + reversePrompt[i] = cs + pass = &reversePrompt[0] + } + + params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK), + C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat), + C.bool(po.IgnoreEOS), C.bool(po.F16KV), + C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount), + C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty), + C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), C.CString(po.LogitBias), + C.CString(po.PathPromptCache), C.bool(po.PromptCacheAll), C.bool(po.MLock), C.bool(po.MMap), + C.CString(po.MainGPU), C.CString(po.TensorSplit), + C.bool(po.PromptCacheRO), + C.CString(po.Grammar), + C.float(po.RopeFreqBase), C.float(po.RopeFreqScale), C.float(po.NegativePromptScale), C.CString(po.NegativePrompt), + C.int(po.NDraft), + ) + ret := C.speculative_sampling(params, l.state, ll.state, (*C.char)(unsafe.Pointer(&out[0])), C.bool(po.DebugMode)) + C.llama_free_params(params) + + if po.TokenCallback != nil { + setCallback(l.state, nil) + } + + if ret != 0 { + return "", fmt.Errorf("inference failed") + } + res := C.GoString((*C.char)(unsafe.Pointer(&out[0]))) + + res = strings.TrimPrefix(res, " ") + res = strings.TrimPrefix(res, text) + res = strings.TrimPrefix(res, "\n") + + for _, s := range po.StopPrompts { + res = strings.TrimRight(res, s) + } + + return res, nil +} + +func (l *LLama) Predict(text string, opts ...PredictOption) (string, error) { + po 
:= NewPredictOptions(opts...) + + if po.TokenCallback != nil { + setCallback(l.state, po.TokenCallback) + } + + input := C.CString(text) + defer C.free(unsafe.Pointer(input)) + if po.Tokens == 0 { + po.Tokens = 99999999 + } + out := make([]byte, po.Tokens) + + reverseCount := len(po.StopPrompts) + reversePrompt := make([]*C.char, reverseCount) + var pass **C.char + for i, s := range po.StopPrompts { + cs := C.CString(s) + defer C.free(unsafe.Pointer(cs)) + reversePrompt[i] = cs + pass = &reversePrompt[0] + } + + params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK), + C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat), + C.bool(po.IgnoreEOS), C.bool(po.F16KV), + C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount), + C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty), + C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), C.CString(po.LogitBias), + C.CString(po.PathPromptCache), C.bool(po.PromptCacheAll), C.bool(po.MLock), C.bool(po.MMap), + C.CString(po.MainGPU), C.CString(po.TensorSplit), + C.bool(po.PromptCacheRO), + C.CString(po.Grammar), + C.float(po.RopeFreqBase), C.float(po.RopeFreqScale), C.float(po.NegativePromptScale), C.CString(po.NegativePrompt), + C.int(po.NDraft), + ) + ret := C.llama_predict(params, l.state, (*C.char)(unsafe.Pointer(&out[0])), C.bool(po.DebugMode)) + C.llama_free_params(params) + + if po.TokenCallback != nil { + setCallback(l.state, nil) + } + + if ret != 0 { + return "", fmt.Errorf("inference failed") + } + res := C.GoString((*C.char)(unsafe.Pointer(&out[0]))) + + res = strings.TrimPrefix(res, " ") + res = strings.TrimPrefix(res, text) + res = strings.TrimPrefix(res, "\n") + + for _, s := range po.StopPrompts { + res = strings.TrimRight(res, s) + } + + return res, nil +} + +func (l *LLama) TokenizeString(text string, opts ...PredictOption) 
(int32, []int32, error) { + po := NewPredictOptions(opts...) + + input := C.CString(text) + defer C.free(unsafe.Pointer(input)) + if po.Tokens == 0 { + po.Tokens = 4096 + } + out := make([]C.int, po.Tokens) + + var fakeDblPtr **C.char + + params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK), + C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat), + C.bool(po.IgnoreEOS), C.bool(po.F16KV), + C.int(po.Batch), C.int(po.NKeep), fakeDblPtr, C.int(0), + C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty), + C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), C.CString(po.LogitBias), + C.CString(po.PathPromptCache), C.bool(po.PromptCacheAll), C.bool(po.MLock), C.bool(po.MMap), + C.CString(po.MainGPU), C.CString(po.TensorSplit), + C.bool(po.PromptCacheRO), + C.CString(po.Grammar), + C.float(po.RopeFreqBase), C.float(po.RopeFreqScale), C.float(po.NegativePromptScale), C.CString(po.NegativePrompt), + C.int(po.NDraft), + ) + + tokRet := C.llama_tokenize_string(params, l.state, (*C.int)(unsafe.Pointer(&out[0]))) + C.llama_free_params(params) + + if tokRet < 0 { + return int32(tokRet), []int32{}, fmt.Errorf("llama_tokenize_string returned negative count %d", tokRet) + } + + gTokRet := int32(tokRet) + + gLenOut := min(len(out), int(gTokRet)) + + goSlice := make([]int32, gLenOut) + for i := 0; i < gLenOut; i++ { + goSlice[i] = int32(out[i]) + } + + return gTokRet, goSlice, nil +} + +func (l *LLama) SetTokenCallback(callback func(token string) bool) { + setCallback(l.state, callback) +} + +var ( + m sync.RWMutex + callbacks = map[uintptr]func(string) bool{} +) + +//export tokenCallback +func tokenCallback(statePtr unsafe.Pointer, token *C.char) bool { + m.RLock() + defer m.RUnlock() + + if callback, ok := callbacks[uintptr(statePtr)]; ok { + return callback(C.GoString(token)) + } + + return true +} + 
+func setCallback(statePtr unsafe.Pointer, callback func(string) bool) { + m.Lock() + defer m.Unlock() + + if callback == nil { + delete(callbacks, uintptr(statePtr)) + } else { + callbacks[uintptr(statePtr)] = callback + } +} diff --git a/llama_cublas.go b/llama_cublas.go new file mode 100644 index 0000000..efd1519 --- /dev/null +++ b/llama_cublas.go @@ -0,0 +1,9 @@ +//go:build cublas +// +build cublas + +package llama + +/* +#cgo LDFLAGS: -lcublas -lcudart -L/usr/local/cuda/lib64/ +*/ +import "C" diff --git a/llama_openblas.go b/llama_openblas.go new file mode 100644 index 0000000..31e09f7 --- /dev/null +++ b/llama_openblas.go @@ -0,0 +1,9 @@ +//go:build openblas +// +build openblas + +package llama + +/* +#cgo LDFLAGS: -lopenblas +*/ +import "C" diff --git a/options.go b/options.go new file mode 100644 index 0000000..bfac3b8 --- /dev/null +++ b/options.go @@ -0,0 +1,460 @@ +package llama + +type ModelOptions struct { + ContextSize int + Seed int + NBatch int + F16Memory bool + MLock bool + MMap bool + LowVRAM bool + Embeddings bool + NUMA bool + NGPULayers int + MainGPU string + TensorSplit string + FreqRopeBase float32 + FreqRopeScale float32 + MulMatQ *bool + LoraBase string + LoraAdapter string + Perplexity bool +} + +type PredictOptions struct { + Seed, Threads, Tokens, TopK, Repeat, Batch, NKeep int + TopP, Temperature, Penalty float32 + NDraft int + F16KV bool + DebugMode bool + StopPrompts []string + IgnoreEOS bool + + TailFreeSamplingZ float32 + TypicalP float32 + FrequencyPenalty float32 + PresencePenalty float32 + Mirostat int + MirostatETA float32 + MirostatTAU float32 + PenalizeNL bool + LogitBias string + TokenCallback func(string) bool + + PathPromptCache string + MLock, MMap, PromptCacheAll bool + PromptCacheRO bool + Grammar string + MainGPU string + TensorSplit string + + // Rope parameters + RopeFreqBase float32 + RopeFreqScale float32 + + // Negative prompt parameters + NegativePromptScale float32 + NegativePrompt string +} + +type 
PredictOption func(p *PredictOptions) + +type ModelOption func(p *ModelOptions) + +var DefaultModelOptions ModelOptions = ModelOptions{ + ContextSize: 512, + Seed: 0, + F16Memory: false, + MLock: false, + Embeddings: false, + MMap: true, + LowVRAM: false, + NBatch: 512, + FreqRopeBase: 10000, + FreqRopeScale: 1.0, +} + +var DefaultOptions PredictOptions = PredictOptions{ + Seed: -1, + Threads: 4, + Tokens: 128, + Penalty: 1.1, + Repeat: 64, + Batch: 512, + NKeep: 64, + TopK: 40, + TopP: 0.95, + TailFreeSamplingZ: 1.0, + TypicalP: 1.0, + Temperature: 0.8, + FrequencyPenalty: 0.0, + PresencePenalty: 0.0, + Mirostat: 0, + MirostatTAU: 5.0, + MirostatETA: 0.1, + MMap: true, + RopeFreqBase: 10000, + RopeFreqScale: 1.0, +} + +func SetMulMatQ(b bool) ModelOption { + return func(p *ModelOptions) { + p.MulMatQ = &b + } +} + +func SetLoraBase(s string) ModelOption { + return func(p *ModelOptions) { + p.LoraBase = s + } +} + +func SetLoraAdapter(s string) ModelOption { + return func(p *ModelOptions) { + p.LoraAdapter = s + } +} + +// SetContext sets the context size. +func SetContext(c int) ModelOption { + return func(p *ModelOptions) { + p.ContextSize = c + } +} + +func WithRopeFreqBase(f float32) ModelOption { + return func(p *ModelOptions) { + p.FreqRopeBase = f + } +} + +func WithRopeFreqScale(f float32) ModelOption { + return func(p *ModelOptions) { + p.FreqRopeScale = f + } +} + +func SetModelSeed(c int) ModelOption { + return func(p *ModelOptions) { + p.Seed = c + } +} + +// SetContext sets the context size. 
+func SetMMap(b bool) ModelOption { + return func(p *ModelOptions) { + p.MMap = b + } +} + +// SetNBatch sets the n_Batch +func SetNBatch(n_batch int) ModelOption { + return func(p *ModelOptions) { + p.NBatch = n_batch + } +} + +// Set sets the tensor split for the GPU +func SetTensorSplit(maingpu string) ModelOption { + return func(p *ModelOptions) { + p.TensorSplit = maingpu + } +} + +// SetMainGPU sets the main_gpu +func SetMainGPU(maingpu string) ModelOption { + return func(p *ModelOptions) { + p.MainGPU = maingpu + } +} + +// SetPredictionTensorSplit sets the tensor split for the GPU +func SetPredictionTensorSplit(maingpu string) PredictOption { + return func(p *PredictOptions) { + p.TensorSplit = maingpu + } +} + +// SetPredictionMainGPU sets the main_gpu +func SetPredictionMainGPU(maingpu string) PredictOption { + return func(p *PredictOptions) { + p.MainGPU = maingpu + } +} + +// Rope and negative prompt parameters +func SetRopeFreqBase(rfb float32) PredictOption { + return func(p *PredictOptions) { + p.RopeFreqBase = rfb + } +} + +func SetRopeFreqScale(rfs float32) PredictOption { + return func(p *PredictOptions) { + p.RopeFreqScale = rfs + } +} + +func SetNDraft(nd int) PredictOption { + return func(p *PredictOptions) { + p.NDraft = nd + } +} + +func SetPerplexity(b bool) ModelOption { + return func(p *ModelOptions) { + p.Perplexity = b + } +} + +func SetNegativePromptScale(nps float32) PredictOption { + return func(p *PredictOptions) { + p.NegativePromptScale = nps + } +} + +func SetNegativePrompt(np string) PredictOption { + return func(p *PredictOptions) { + p.NegativePrompt = np + } +} + +var EnabelLowVRAM ModelOption = func(p *ModelOptions) { + p.LowVRAM = true +} + +var EnableNUMA ModelOption = func(p *ModelOptions) { + p.NUMA = true +} + +var EnableEmbeddings ModelOption = func(p *ModelOptions) { + p.Embeddings = true +} + +var EnableF16Memory ModelOption = func(p *ModelOptions) { + p.F16Memory = true +} + +var EnableF16KV PredictOption = func(p 
*PredictOptions) { + p.F16KV = true +} + +var Debug PredictOption = func(p *PredictOptions) { + p.DebugMode = true +} + +var EnablePromptCacheAll PredictOption = func(p *PredictOptions) { + p.PromptCacheAll = true +} + +var EnablePromptCacheRO PredictOption = func(p *PredictOptions) { + p.PromptCacheRO = true +} + +var EnableMLock ModelOption = func(p *ModelOptions) { + p.MLock = true +} + +// Create a new PredictOptions object with the given options. +func NewModelOptions(opts ...ModelOption) ModelOptions { + p := DefaultModelOptions + for _, opt := range opts { + opt(&p) + } + return p +} + +var IgnoreEOS PredictOption = func(p *PredictOptions) { + p.IgnoreEOS = true +} + +// WithGrammar sets the grammar to constrain the output of the LLM response +func WithGrammar(s string) PredictOption { + return func(p *PredictOptions) { + p.Grammar = s + } +} + +// SetMlock sets the memory lock. +func SetMlock(b bool) PredictOption { + return func(p *PredictOptions) { + p.MLock = b + } +} + +// SetMemoryMap sets memory mapping. +func SetMemoryMap(b bool) PredictOption { + return func(p *PredictOptions) { + p.MMap = b + } +} + +// SetGPULayers sets the number of GPU layers to use to offload computation +func SetGPULayers(n int) ModelOption { + return func(p *ModelOptions) { + p.NGPULayers = n + } +} + +// SetTokenCallback sets the prompts that will stop predictions. +func SetTokenCallback(fn func(string) bool) PredictOption { + return func(p *PredictOptions) { + p.TokenCallback = fn + } +} + +// SetStopWords sets the prompts that will stop predictions. +func SetStopWords(stop ...string) PredictOption { + return func(p *PredictOptions) { + p.StopPrompts = stop + } +} + +// SetSeed sets the random seed for sampling text generation. +func SetSeed(seed int) PredictOption { + return func(p *PredictOptions) { + p.Seed = seed + } +} + +// SetThreads sets the number of threads to use for text generation. 
+func SetThreads(threads int) PredictOption { + return func(p *PredictOptions) { + p.Threads = threads + } +} + +// SetTokens sets the number of tokens to generate. +func SetTokens(tokens int) PredictOption { + return func(p *PredictOptions) { + p.Tokens = tokens + } +} + +// SetTopK sets the value for top-K sampling. +func SetTopK(topk int) PredictOption { + return func(p *PredictOptions) { + p.TopK = topk + } +} + +// SetTopP sets the value for nucleus sampling. +func SetTopP(topp float32) PredictOption { + return func(p *PredictOptions) { + p.TopP = topp + } +} + +// SetTemperature sets the temperature value for text generation. +func SetTemperature(temp float32) PredictOption { + return func(p *PredictOptions) { + p.Temperature = temp + } +} + +// SetPathPromptCache sets the session file to store the prompt cache. +func SetPathPromptCache(f string) PredictOption { + return func(p *PredictOptions) { + p.PathPromptCache = f + } +} + +// SetPenalty sets the repetition penalty for text generation. +func SetPenalty(penalty float32) PredictOption { + return func(p *PredictOptions) { + p.Penalty = penalty + } +} + +// SetRepeat sets the number of times to repeat text generation. +func SetRepeat(repeat int) PredictOption { + return func(p *PredictOptions) { + p.Repeat = repeat + } +} + +// SetBatch sets the batch size. +func SetBatch(size int) PredictOption { + return func(p *PredictOptions) { + p.Batch = size + } +} + +// SetKeep sets the number of tokens from initial prompt to keep. +func SetNKeep(n int) PredictOption { + return func(p *PredictOptions) { + p.NKeep = n + } +} + +// Create a new PredictOptions object with the given options. +func NewPredictOptions(opts ...PredictOption) PredictOptions { + p := DefaultOptions + for _, opt := range opts { + opt(&p) + } + return p +} + +// SetTailFreeSamplingZ sets the tail free sampling, parameter z. 
+func SetTailFreeSamplingZ(tfz float32) PredictOption { + return func(p *PredictOptions) { + p.TailFreeSamplingZ = tfz + } +} + +// SetTypicalP sets the typicality parameter, p_typical. +func SetTypicalP(tp float32) PredictOption { + return func(p *PredictOptions) { + p.TypicalP = tp + } +} + +// SetFrequencyPenalty sets the frequency penalty parameter, freq_penalty. +func SetFrequencyPenalty(fp float32) PredictOption { + return func(p *PredictOptions) { + p.FrequencyPenalty = fp + } +} + +// SetPresencePenalty sets the presence penalty parameter, presence_penalty. +func SetPresencePenalty(pp float32) PredictOption { + return func(p *PredictOptions) { + p.PresencePenalty = pp + } +} + +// SetMirostat sets the mirostat parameter. +func SetMirostat(m int) PredictOption { + return func(p *PredictOptions) { + p.Mirostat = m + } +} + +// SetMirostatETA sets the mirostat ETA parameter. +func SetMirostatETA(me float32) PredictOption { + return func(p *PredictOptions) { + p.MirostatETA = me + } +} + +// SetMirostatTAU sets the mirostat TAU parameter. +func SetMirostatTAU(mt float32) PredictOption { + return func(p *PredictOptions) { + p.MirostatTAU = mt + } +} + +// SetPenalizeNL sets whether to penalize newlines or not. +func SetPenalizeNL(pnl bool) PredictOption { + return func(p *PredictOptions) { + p.PenalizeNL = pnl + } +} + +// SetLogitBias sets the logit bias parameter. +func SetLogitBias(lb string) PredictOption { + return func(p *PredictOptions) { + p.LogitBias = lb + } +}