first commit

This commit is contained in:
admin 2026-05-15 13:45:21 +07:00
commit 05cfbaa1b8
13 changed files with 1982 additions and 0 deletions

4
.gitignore vendored Normal file
View File

@ -0,0 +1,4 @@
*.o
*.a
binding.o
libbinding.a

47
Makefile Normal file
View File

@ -0,0 +1,47 @@
# Builds the C++ binding archive (libbinding.a) and regenerates cgo_flags.go.
# The llama.cpp source and build directories are read from build.conf.
# libbinding.a is declared phony, so the archive step runs on every `make`.
.PHONY: all clean libbinding.a
include build.conf
# Header search paths inside the llama.cpp source tree.
LLAMA_INCLUDE := $(LLAMA_CPP_PATH)/include
LLAMA_COMMON := $(LLAMA_CPP_PATH)/common
LLAMA_GGML := $(LLAMA_CPP_PATH)/ggml/include
# Flags used to compile binding.cpp into binding.o.
CXXFLAGS := -std=c++17 -O3 -DNDEBUG -fPIC -pthread \
-I$(LLAMA_INCLUDE) -I$(LLAMA_COMMON) -I$(LLAMA_GGML) -I.
# NOTE(review): LDFLAGS_LIBS is not referenced by any rule in this Makefile;
# the link flags that are actually used are the ones written into cgo_flags.go.
LDFLAGS_LIBS := \
-L$(LLAMA_BUILD_PATH)/src -lllama \
-L$(LLAMA_BUILD_PATH)/common -lllama-common \
-L$(LLAMA_BUILD_PATH)/ggml/src -lggml -lggml-cpu -lggml-base \
-L$(LLAMA_BUILD_PATH)/vendor/cpp-httplib -lcpp-httplib \
-lpthread -fopenmp -ldl -lm -lstdc++
all: libbinding.a cgo_flags.go
# Rewrite cgo_flags.go with the paths taken from build.conf.
cgo_flags.go: build.conf
@LLAMA=$$(grep '^LLAMA_CPP_PATH=' build.conf | cut -d= -f2); \
BUILD=$$(grep '^LLAMA_BUILD_PATH=' build.conf | cut -d= -f2); \
printf '%s\n' \
'package llama' \
'' \
'/*' \
"#cgo CXXFLAGS: -std=c++17 -I$$LLAMA/include -I$$LLAMA/common -I$$LLAMA/ggml/include -I\$${SRCDIR}" \
"#cgo LDFLAGS: -L\$${SRCDIR} -lbinding -L$$BUILD/src -lllama -L$$BUILD/common -lllama-common -lllama-common-base -L$$BUILD/ggml/src -lggml -lggml-cpu -lggml-base -L$$BUILD/vendor/cpp-httplib -lcpp-httplib -lstdc++ -lm -lpthread -fopenmp -ldl" \
'*/' \
'import "C"' \
> cgo_flags.go
# Configure and build the static llama.cpp libraries when libllama.a is missing.
$(LLAMA_BUILD_PATH)/src/libllama.a:
cd $(LLAMA_BUILD_PATH) && cmake .. -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF && \
cmake --build . --target llama llama-common -j$$(nproc)
# Compile the binding; depends on libllama.a so llama.cpp is built first.
binding.o: binding.cpp binding.h $(LLAMA_BUILD_PATH)/src/libllama.a
$(CXX) $(CXXFLAGS) -c binding.cpp -o binding.o
# Pack the binding object into a static archive.
libbinding.a: binding.o
ar rcs libbinding.a binding.o
@echo "Собрано: libbinding.a. Линковка llama.cpp — через cgo_flags.go."
# Remove the binding artifacts only; cgo_flags.go and the llama.cpp build are kept.
clean:
rm -f binding.o libbinding.a

198
README.md Normal file
View File

@ -0,0 +1,198 @@
# go-llama-new.cpp
Go-обёртка над [llama.cpp](https://github.com/ggml-org/llama.cpp) с API, совместимым с [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp): те же имена типов, функций и экспортируемых переменных (`LLama`, `New`, `Predict`, `SetContext`, `EnableEmbeddings` и т.д.).
Ядро собирается из локальных исходников llama.cpp (не из submodule внутри репозитория). Пути к исходникам задаются в файле `build.conf`, переменные окружения для этого не используются.
## Требования
- **Go** 1.21 или новее (с поддержкой CGO)
- **Компилятор C++** с поддержкой C++17 (`g++` / `clang++`)
- **CMake** 3.14+
- **make**, **ar**
- **OpenMP** (обычно пакет `libgomp` в Linux)
- Инструменты сборки: `git`, `build-essential` (или аналог)
Для линковки также нужны статические библиотеки, которые CMake собирает из llama.cpp: `libllama.a`, `libllama-common.a`, `libllama-common-base.a`, `libggml*.a`, `libcpp-httplib.a`.
## Настройка путей
Отредактируйте `build.conf` в корне модуля:
```ini
# Пути к исходникам llama.cpp (без переменных окружения)
LLAMA_CPP_PATH=/home/admin/cpp/llama.cpp
LLAMA_BUILD_PATH=/home/admin/cpp/llama.cpp/build
```
| Параметр | Описание |
|----------|----------|
| `LLAMA_CPP_PATH` | Каталог с исходниками llama.cpp (`include/`, `common/`, `src/` и т.д.) |
| `LLAMA_BUILD_PATH` | Каталог сборки CMake (в нём появятся `src/libllama.a`, `common/libllama-common.a` и др.) |
После изменения `build.conf` выполните `make` — будет пересоздан `cgo_flags.go` с актуальными путями для CGO.
## Сборка
Сборка состоит из двух этапов: сначала нативное ядро llama.cpp, затем Go-модуль с C-обёрткой `binding`.
### 1. Сборка llama.cpp
```bash
mkdir -p /home/admin/cpp/llama.cpp/build
cd /home/admin/cpp/llama.cpp/build
cmake .. \
-DCMAKE_BUILD_TYPE=Release \
-DBUILD_SHARED_LIBS=OFF
cmake --build . --target llama llama-common -j"$(nproc)"
```
Проверка, что библиотеки на месте (команды выполняются из корня каталога llama.cpp, а не из `build/`):
```bash
ls -la build/src/libllama.a
ls -la build/common/libllama-common.a
ls -la build/common/libllama-common-base.a
ls -la build/ggml/src/libggml.a
```
Цель `make` в каталоге модуля при необходимости запустит эту же сборку автоматически (см. `Makefile`).
### 2. Сборка C-обёртки (libbinding.a)
В каталоге модуля:
```bash
cd /path/to/go-llama-new.cpp
make
```
Будет выполнено:
1. Генерация `cgo_flags.go` из `build.conf`
2. Компиляция `binding.cpp` → `binding.o`
3. Создание архива `libbinding.a`
Очистка артефактов обёртки:
```bash
make clean
```
### 3. Сборка Go-модуля
```bash
go build ./...
```
Или пример:
```bash
go build -o llama-example ./examples/
go run ./examples/main.go /path/to/model.gguf "Привет, мир"
```
При первой сборке CGO скомпилирует `binding.cpp` ещё раз и слинкует его с библиотеками из `LLAMA_BUILD_PATH` (см. `cgo_flags.go`).
## Использование в своём проекте
```go
import llama "go-llama-new.cpp"
func main() {
model, err := llama.New("/path/to/model.gguf",
llama.SetContext(4096),
llama.SetGPULayers(0),
)
if err != nil {
panic(err)
}
defer model.Free()
text, err := model.Predict("Привет",
llama.SetTokens(128),
llama.SetTemperature(0.8),
)
if err != nil {
panic(err)
}
println(text)
}
```
В `go.mod` вашего проекта:
```go
require go-llama-new.cpp v0.0.0
replace go-llama-new.cpp => /path/to/go-llama-new.cpp
```
Перед `go build` в проекте-потребителе должны быть собраны llama.cpp и `libbinding.a` (шаги 1–2 выше).
## Опциональные теги сборки
Как в оригинальном go-llama.cpp:
| Тег | Назначение |
|-----|------------|
| `openblas` | Дополнительная линковка с OpenBLAS (`llama_openblas.go`) |
| `cublas` | CUDA (`llama_cublas.go`) — требует отдельной сборки llama.cpp с `GGML_CUDA=ON` |
Пример:
```bash
go build -tags openblas ./...
```
Для GPU нужно пересобрать llama.cpp с нужными опциями CMake (например `-DGGML_CUDA=ON`) и убедиться, что пути в `build.conf` указывают на эту сборку.
## Устранение неполадок
### `неопределённая ссылка на llama_compiler` / `llama_commit` / `llama_build_number`
Не слинкована `libllama-common-base.a`. Убедитесь, что в `cgo_flags.go` в `LDFLAGS` есть `-lllama-common-base`, и пересоберите:
```bash
make
go build ./...
```
### `cannot find -lllama` или `-lllama-common`
Проверьте `LLAMA_BUILD_PATH` в `build.conf` и выполните сборку llama.cpp (шаг 1).
### CGO отключён
```bash
go env CGO_ENABLED # должно быть 1
```
Установите `gcc`/`g++`, если CGO выключен из-за отсутствия компилятора C.
### Изменили путь к llama.cpp
1. Обновите `build.conf`
2. `make` (обновит `cgo_flags.go` и `libbinding.a`)
3. `go build ./...`
## Структура репозитория
```
.
├── build.conf # пути к llama.cpp
├── binding.h
├── binding.cpp # C API для CGO
├── cgo_flags.go # флаги CGO (генерируется make)
├── llama.go
├── options.go
├── Makefile
├── examples/main.go
└── README.md
```
## Лицензия
Следует лицензиям llama.cpp и исходного go-llama.cpp. Используйте в соответствии с условиями соответствующих проектов.

719
binding.cpp Normal file
View File

@ -0,0 +1,719 @@
#include "binding.h"
#include "common.h"
#include "llama.h"
#include "sampling.h"
#include <algorithm>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <regex>
#include <sstream>
#include <string>
#include <vector>
// Per-model state that load_model() allocates and hands to the caller as an
// opaque pointer; every other binding function casts that pointer back to
// this type via binding_state().
struct llama_binding_state {
// Result of common_init_from_params(); model, ctx and smpl below are read from it in load_model().
common_init_result_ptr init;
llama_model * model = nullptr;
llama_context * ctx = nullptr;
common_sampler * smpl = nullptr;
// Copy of the `embeddings` flag passed to load_model(); get_embeddings() refuses to run when it is false.
bool embeddings = false;
};
// Recovers the typed binding state from the opaque handle that crosses the
// C boundary. A null handle yields a null pointer.
static llama_binding_state * binding_state(void * state_pr) {
    auto * typed = static_cast<llama_binding_state *>(state_pr);
    return typed;
}
// Parses a tensor-split specification such as "3,1" or "3/1" into `out`.
//
// All `n` slots of `out` are first reset to 0. The text is split on runs of
// ',' and '/', and at most `n` fields are converted to float, in order.
// A null or empty `tensorsplit` leaves `out` all zeros.
//
// A field that is empty or not a number is stored as 0 instead of letting
// std::stof throw: this helper is reached from extern "C" entry points, and
// an exception must not unwind into the C/Go caller.
static void parse_tensor_split(const char * tensorsplit, float * out, size_t n) {
    for (size_t i = 0; i < n; ++i) {
        out[i] = 0.0f;
    }
    if (tensorsplit == nullptr || tensorsplit[0] == '\0') {
        return;
    }

    const std::string spec = tensorsplit;
    const std::regex  separators{R"([,/]+)"};
    std::sregex_token_iterator first{spec.begin(), spec.end(), separators, -1};
    const std::vector<std::string> fields{first, {}};

    for (size_t i = 0; i < fields.size() && i < n; ++i) {
        try {
            out[i] = std::stof(fields[i]);
        } catch (...) {
            // Malformed or out-of-range field: keep the zero default.
            out[i] = 0.0f;
        }
    }
}
// Copies the load-time options received from the C API into `params`.
//
// Values that are not positive (n_ctx, n_batch, rope_freq_base,
// rope_freq_scale) or negative (n_seed) leave the corresponding default in
// `params` untouched. `lora_base` is accepted for API compatibility and
// ignored. Warmup and parameter fitting are always switched off.
static void apply_model_load_options(
        common_params & params,
        int n_ctx,
        int n_seed,
        bool memory_f16,
        bool mlock,
        bool embeddings,
        bool mmap,
        int n_gpu,
        int n_batch,
        const char * maingpu,
        const char * tensorsplit,
        bool numa,
        float rope_freq_base,
        float rope_freq_scale,
        const char * lora,
        const char * lora_base,
        bool perplexity) {
    (void) lora_base;

    if (n_ctx > 0) {
        params.n_ctx = n_ctx;
    }
    if (n_seed >= 0) {
        params.sampling.seed = (uint32_t) n_seed;
    }

    params.use_mlock    = mlock;
    params.embedding    = embeddings;
    params.use_mmap     = mmap;
    params.n_gpu_layers = n_gpu;
    params.n_batch      = n_batch > 0 ? n_batch : params.n_batch;
    // The micro-batch may never exceed the logical batch size.
    params.n_ubatch     = std::min(params.n_batch, params.n_ubatch);
    params.numa         = numa ? GGML_NUMA_STRATEGY_DISTRIBUTE : GGML_NUMA_STRATEGY_DISABLED;
    params.warmup       = false;
    params.fit_params   = false;

    if (rope_freq_base > 0.0f) {
        params.rope_freq_base = rope_freq_base;
    }
    if (rope_freq_scale > 0.0f) {
        params.rope_freq_scale = rope_freq_scale;
    }
    if (memory_f16) {
        params.cache_type_k = GGML_TYPE_F16;
        params.cache_type_v = GGML_TYPE_F16;
    }

    if (maingpu != nullptr && maingpu[0] != '\0') {
        // std::stoi throws on non-numeric or out-of-range text; this helper is
        // reached from an extern "C" entry point, so a bad value keeps the
        // default main_gpu instead of propagating an exception.
        try {
            params.main_gpu = std::stoi(maingpu);
        } catch (...) {
        }
    }

    parse_tensor_split(tensorsplit, params.tensor_split, sizeof(params.tensor_split) / sizeof(params.tensor_split[0]));

    if (perplexity) {
        params.compute_ppl = true;
    }

    if (lora != nullptr && lora[0] != '\0') {
        // A single adapter is registered at full strength.
        common_adapter_lora_info la;
        la.path  = lora;
        la.scale = 1.0f;
        params.lora_adapters.push_back(la);
    }
}
// Reports whether any non-empty stop string occurs near the end of `output`.
//
// Only the tail of `output` is searched: the last `stop.length()` characters,
// plus two extra characters when `interactive` is false. Empty stop strings
// never match.
static bool check_antiprompt(
        const std::string & output,
        const std::vector<std::string> & antiprompt,
        bool interactive) {
    const size_t slack = interactive ? 0 : 2;
    return std::any_of(antiprompt.begin(), antiprompt.end(), [&](const std::string & stop) {
        if (stop.empty()) {
            return false;
        }
        const size_t window = stop.length() + slack;
        const size_t from   = output.length() > window ? output.length() - window : 0;
        return output.find(stop, from) != std::string::npos;
    });
}
extern "C" {
void * load_model(
const char * fname,
int n_ctx,
int n_seed,
bool memory_f16,
bool mlock,
bool embeddings,
bool mmap,
bool low_vram,
int n_gpu,
int n_batch,
const char * maingpu,
const char * tensorsplit,
bool numa,
float rope_freq_base,
float rope_freq_scale,
bool mul_mat_q,
const char * lora,
const char * lora_base,
bool perplexity) {
(void) low_vram;
(void) mul_mat_q;
common_init();
llama_backend_init();
common_params params;
params.model.path = fname;
apply_model_load_options(
params, n_ctx, n_seed, memory_f16, mlock, embeddings, mmap,
n_gpu, n_batch, maingpu, tensorsplit, numa,
rope_freq_base, rope_freq_scale, lora, lora_base, perplexity);
llama_numa_init(params.numa);
auto * binding = new llama_binding_state();
binding->init = common_init_from_params(params);
if (!binding->init || binding->init->context() == nullptr) {
delete binding;
return nullptr;
}
binding->model = binding->init->model();
binding->ctx = binding->init->context();
binding->smpl = binding->init->sampler(0);
binding->embeddings = embeddings;
return binding;
}
void llama_binding_free_model(void * state_pr) {
delete binding_state(state_pr);
}
int load_state(void * state_pr, char * statefile, char * modes) {
(void) modes;
auto * state = binding_state(state_pr);
if (state == nullptr || state->ctx == nullptr) {
return 1;
}
std::vector<llama_token> tokens(llama_n_ctx(state->ctx));
size_t n_out = 0;
if (!llama_state_load_file(state->ctx, statefile, tokens.data(), tokens.size(), &n_out)) {
return 1;
}
return 0;
}
void save_state(void * state_pr, char * dst, char * modes) {
(void) modes;
auto * state = binding_state(state_pr);
if (state == nullptr || state->ctx == nullptr) {
return;
}
llama_state_save_file(state->ctx, dst, nullptr, 0);
}
// Allocates a common_params object describing one request and returns it as
// an opaque pointer; release it with llama_free_params().
//
// Every string argument is copied or parsed before returning, so the caller
// may free its buffers right after the call. `tfs_z`, `penalize_nl`,
// `negative_prompt_scale`, `negative_prompt` and `memory_f16` are accepted
// for API compatibility and ignored. A non-positive `threads` selects 4
// threads, a negative `seed` selects LLAMA_DEFAULT_SEED, and a non-positive
// `typical_p` selects 1.0.
//
// `logit_bias` is expected as "<token id><+|-><value>"; a malformed bias or a
// non-numeric `maingpu` is ignored rather than throwing, because an exception
// must not cross this extern "C" boundary (and would leak the allocation).
void * llama_allocate_params(
        const char * prompt,
        int seed,
        int threads,
        int tokens,
        int top_k,
        float top_p,
        float temp,
        float repeat_penalty,
        int repeat_last_n,
        bool ignore_eos,
        bool memory_f16,
        int n_batch,
        int n_keep,
        const char ** antiprompt,
        int antiprompt_count,
        float tfs_z,
        float typical_p,
        float frequency_penalty,
        float presence_penalty,
        int mirostat,
        float mirostat_eta,
        float mirostat_tau,
        bool penalize_nl,
        const char * logit_bias,
        const char * session_file,
        bool prompt_cache_all,
        bool mlock,
        bool mmap,
        const char * maingpu,
        const char * tensorsplit,
        bool prompt_cache_ro,
        const char * grammar,
        float rope_freq_base,
        float rope_freq_scale,
        float negative_prompt_scale,
        const char * negative_prompt,
        int n_draft) {
    (void) tfs_z;
    (void) penalize_nl;
    (void) negative_prompt_scale;
    (void) negative_prompt;
    (void) memory_f16;

    auto * params = new common_params();

    // Prompt, batching and prompt-cache settings.
    params->prompt            = prompt != nullptr ? prompt : "";
    params->n_predict         = tokens;
    params->n_batch           = n_batch > 0 ? n_batch : params->n_batch;
    params->n_keep            = n_keep;
    params->use_mlock         = mlock;
    params->use_mmap          = mmap;
    params->path_prompt_cache = session_file != nullptr ? session_file : "";
    params->prompt_cache_all  = prompt_cache_all;
    params->prompt_cache_ro   = prompt_cache_ro;

    if (rope_freq_base > 0.0f) {
        params->rope_freq_base = rope_freq_base;
    }
    if (rope_freq_scale > 0.0f) {
        params->rope_freq_scale = rope_freq_scale;
    }

    // Threads: the same count is used for generation and batch processing.
    params->sampling.seed             = seed >= 0 ? (uint32_t) seed : LLAMA_DEFAULT_SEED;
    params->cpuparams.n_threads       = threads > 0 ? threads : 4;
    params->cpuparams_batch.n_threads = params->cpuparams.n_threads;

    // Sampler settings.
    params->sampling.top_k           = top_k;
    params->sampling.top_p           = top_p;
    params->sampling.temp            = temp;
    params->sampling.penalty_repeat  = repeat_penalty;
    params->sampling.penalty_last_n  = repeat_last_n;
    params->sampling.penalty_freq    = frequency_penalty;
    params->sampling.penalty_present = presence_penalty;
    params->sampling.typ_p           = typical_p > 0 ? typical_p : 1.0f;
    params->sampling.mirostat        = mirostat;
    params->sampling.mirostat_eta    = mirostat_eta;
    params->sampling.mirostat_tau    = mirostat_tau;
    params->sampling.ignore_eos      = ignore_eos;

    if (grammar != nullptr && grammar[0] != '\0') {
        params->sampling.grammar = common_grammar(COMMON_GRAMMAR_TYPE_USER, grammar);
    }

    if (maingpu != nullptr && maingpu[0] != '\0') {
        try {
            params->main_gpu = std::stoi(maingpu);
        } catch (...) {
            // Non-numeric or out-of-range GPU index: keep the default.
        }
    }

    parse_tensor_split(tensorsplit, params->tensor_split, sizeof(params->tensor_split) / sizeof(params->tensor_split[0]));

    if (antiprompt_count > 0 && antiprompt != nullptr) {
        params->antiprompt = create_vector(antiprompt, antiprompt_count);
    }

    if (logit_bias != nullptr && logit_bias[0] != '\0') {
        std::stringstream ss(logit_bias);
        llama_token key;
        char sign = 0;
        std::string value_str;
        if (ss >> key >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
            try {
                const float magnitude = std::stof(value_str);
                params->sampling.logit_bias.push_back({key, magnitude * ((sign == '-') ? -1.0f : 1.0f)});
            } catch (...) {
                // Bias value is not a number: skip the entry.
            }
        }
    }

    params->speculative.draft.n_max = n_draft > 0 ? n_draft : params->speculative.draft.n_max;
    return params;
}
void llama_free_params(void * params_ptr) {
delete static_cast<common_params *>(params_ptr);
}
int eval(void * params_ptr, void * state_pr, char * text) {
auto * params = static_cast<common_params *>(params_ptr);
auto * state = binding_state(state_pr);
if (state == nullptr || state->ctx == nullptr) {
return 1;
}
std::string str = text != nullptr ? text : params->prompt;
auto embd = common_tokenize(state->ctx, str, true, true);
if (embd.empty()) {
return 1;
}
int n_past = 0;
if (!common_prompt_batch_decode(state->ctx, embd, n_past, params->n_batch, "", false)) {
return 1;
}
return 0;
}
int get_embeddings(void * params_ptr, void * state_pr, float * res_embeddings) {
auto * params = static_cast<common_params *>(params_ptr);
auto * state = binding_state(state_pr);
if (state == nullptr || state->ctx == nullptr || !state->embeddings) {
return 1;
}
auto embd = common_tokenize(state->ctx, params->prompt, true, true);
if (!embd.empty()) {
int n_past = 0;
if (!common_prompt_batch_decode(state->ctx, embd, n_past, params->n_batch, "", false)) {
return 1;
}
}
const int n_embd = llama_model_n_embd(state->model);
const float * emb = llama_get_embeddings_ith(state->ctx, -1);
if (emb == nullptr) {
emb = llama_get_embeddings(state->ctx);
}
if (emb == nullptr) {
return 1;
}
for (int i = 0; i < n_embd; ++i) {
res_embeddings[i] = emb[i];
}
return 0;
}
int get_token_embeddings(void * params_ptr, void * state_pr, int * tokens, int tokenSize, float * res_embeddings) {
auto * params = static_cast<common_params *>(params_ptr);
auto * state = binding_state(state_pr);
if (state == nullptr || state->ctx == nullptr) {
return 1;
}
std::string text;
for (int i = 0; i < tokenSize; ++i) {
text += common_token_to_piece(state->ctx, tokens[i]);
}
params->prompt = text;
return get_embeddings(params_ptr, state_pr, res_embeddings);
}
int llama_tokenize_string(void * params_ptr, void * state_pr, int * result) {
auto * params = static_cast<common_params *>(params_ptr);
auto * state = binding_state(state_pr);
if (state == nullptr || state->ctx == nullptr) {
return -1;
}
const llama_vocab * vocab = llama_model_get_vocab(state->model);
const bool add_bos = llama_vocab_get_add_bos(vocab);
const int32_t max_tokens = params->n_ctx > 0 ? params->n_ctx : 4096;
return llama_tokenize(
vocab,
params->prompt.c_str(),
(int32_t) params->prompt.size(),
reinterpret_cast<llama_token *>(result),
max_tokens,
add_bos,
true);
}
// Runs text generation for the prompt stored in the params.
//
// The prompt is fed in batches of at most n_batch tokens, then tokens are
// sampled one at a time until n_predict tokens were produced, an
// end-of-generation token is sampled, tokenCallback() returns 0, or a stop
// string from params->antiprompt appears at the end of the text.
//
// The accumulated text (prompt pieces followed by generated pieces) is copied
// into `result`, truncated to n_predict - 1 bytes and NUL-terminated, so
// `result` must provide at least n_predict bytes. A negative n_predict is
// replaced by 128. When a prompt-cache path is set, a matching token prefix
// from the saved session is reused instead of being decoded again.
//
// Returns 0 on success and 1 on an invalid handle, sampler creation failure,
// a prompt longer than n_ctx - 4 tokens, or a decode error.
int llama_predict(void * params_ptr, void * state_pr, char * result, bool debug) {
    auto * params = static_cast<common_params *>(params_ptr);
    auto * state  = binding_state(state_pr);
    if (state == nullptr || state->ctx == nullptr || state->smpl == nullptr) {
        return 1;
    }

    llama_context * ctx       = state->ctx;
    llama_model * model       = state->model;
    const llama_vocab * vocab = llama_model_get_vocab(model);
    llama_memory_t mem        = llama_get_memory(ctx);

    // A sampler is created per call from this request's sampling settings.
    common_sampler_ptr smpl_ptr(common_sampler_init(model, params->sampling));
    if (!smpl_ptr) {
        return 1;
    }
    common_sampler * smpl = smpl_ptr.get();

    const int n_ctx = llama_n_ctx(ctx);
    if (params->n_predict < 0) {
        params->n_predict = 128;
    }

    llama_set_n_threads(ctx, params->cpuparams.n_threads, params->cpuparams_batch.n_threads);

    // Load a previously saved session, if a cache path was given and the file exists.
    std::string path_session = params->path_prompt_cache;
    std::vector<llama_token> session_tokens;
    if (!path_session.empty()) {
        session_tokens.resize(n_ctx);
        size_t n_out = 0;
        if (std::ifstream(path_session).good()) {
            llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size(), &n_out);
            session_tokens.resize(n_out);
        }
    }

    const bool add_bos = llama_vocab_get_add_bos(vocab);
    std::vector<llama_token> embd_inp = common_tokenize(ctx, params->prompt, add_bos, true);
    if (embd_inp.empty()) {
        embd_inp.push_back(llama_vocab_bos(vocab));
    }
    if ((int) embd_inp.size() > n_ctx - 4) {
        return 1;
    }
    if (params->n_keep < 0 || params->n_keep > (int) embd_inp.size()) {
        params->n_keep = (int) embd_inp.size();
    }

    common_sampler_reset(smpl);

    int n_past             = 0;
    int n_remain           = params->n_predict;
    int n_consumed         = 0;
    int n_session_consumed = 0;
    bool is_antiprompt     = false;
    bool need_save_session = !path_session.empty() && !params->prompt_cache_ro;

    std::vector<llama_token> embd;
    std::string res;

    while (n_remain > 0 && !is_antiprompt) {
        if (!embd.empty()) {
            const int max_embd_size = n_ctx - 4;
            if ((int) embd.size() > max_embd_size) {
                embd.resize(max_embd_size);
            }

            // Context full: drop half of the tokens after the first n_keep and
            // shift the remainder back to make room.
            if (n_past + (int) embd.size() >= n_ctx) {
                const int n_left    = n_past - params->n_keep;
                const int n_discard = n_left / 2;
                llama_memory_seq_rm(mem, 0, params->n_keep, params->n_keep + n_discard);
                llama_memory_seq_add(mem, 0, params->n_keep + n_discard, n_past, -n_discard);
                n_past -= n_discard;
                path_session.clear();
            }

            // Skip tokens that match the loaded session prefix.
            if (n_session_consumed < (int) session_tokens.size()) {
                size_t i = 0;
                for (; i < embd.size(); ++i) {
                    if (embd[i] != session_tokens[n_session_consumed]) {
                        session_tokens.resize(n_session_consumed);
                        break;
                    }
                    n_past++;
                    n_session_consumed++;
                    if (n_session_consumed >= (int) session_tokens.size()) {
                        ++i;
                        break;
                    }
                }
                if (i > 0) {
                    embd.erase(embd.begin(), embd.begin() + i);
                }
            }

            if (!embd.empty()) {
                const bool save_now = need_save_session && n_consumed >= (int) embd_inp.size();
                if (!common_prompt_batch_decode(ctx, embd, n_past, params->n_batch, path_session, save_now)) {
                    return 1;
                }
                session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
                n_session_consumed = session_tokens.size();
                need_save_session  = false;
            }
        }

        embd.clear();

        if ((int) embd_inp.size() <= n_consumed) {
            // Prompt fully consumed: sample the next token.
            const llama_token id = common_sampler_sample(smpl, ctx, -1);
            common_sampler_accept(smpl, id, true);
            embd.push_back(id);

            auto piece = common_token_to_piece(ctx, id);
            if (!tokenCallback(state_pr, const_cast<char *>(piece.c_str()))) {
                break;
            }
            // The generated piece is appended exactly once, here.
            res += piece;
            --n_remain;

            if (llama_vocab_is_eog(vocab, id)) {
                break;
            }
        } else {
            // Queue the next batch of prompt tokens.
            while ((int) embd_inp.size() > n_consumed) {
                embd.push_back(embd_inp[n_consumed]);
                common_sampler_accept(smpl, embd_inp[n_consumed], false);
                ++n_consumed;
                if ((int) embd.size() >= params->n_batch) {
                    break;
                }
            }
            // Echo the queued prompt pieces into the result. This used to run
            // for sampled tokens as well, which duplicated every generated
            // piece in the output.
            for (const auto id : embd) {
                res += common_token_to_piece(ctx, id);
            }
        }

        if ((int) embd_inp.size() <= n_consumed && !params->antiprompt.empty()) {
            is_antiprompt = check_antiprompt(res, params->antiprompt, false);
        }
    }

    if (!path_session.empty() && params->prompt_cache_all && !params->prompt_cache_ro) {
        llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
    }

    if (debug) {
        common_perf_print(ctx, smpl);
    }

    if (result != nullptr) {
        // Copy at most n_predict bytes and force termination inside that bound.
        std::strncpy(result, res.c_str(), params->n_predict > 0 ? (size_t) params->n_predict : res.size());
        result[params->n_predict > 0 ? params->n_predict - 1 : res.size()] = '\0';
    }
    return 0;
}
int speculative_sampling(void * params_ptr, void * target_model, void * draft_model, char * result, bool debug) {
auto * params = static_cast<common_params *>(params_ptr);
auto * tgt = binding_state(target_model);
auto * dft = binding_state(draft_model);
if (tgt == nullptr || dft == nullptr || tgt->ctx == nullptr || dft->ctx == nullptr) {
return 1;
}
llama_context * ctx_tgt = tgt->ctx;
llama_context * ctx_dft = dft->ctx;
const llama_vocab * vocab = llama_model_get_vocab(tgt->model);
common_sampler_ptr smpl_ptr(common_sampler_init(tgt->model, params->sampling));
if (!smpl_ptr) {
return 1;
}
common_sampler * smpl_tgt = smpl_ptr.get();
auto inp = common_tokenize(ctx_tgt, params->prompt, true, true);
const int max_tokens = llama_n_ctx(ctx_tgt) - 4;
if ((int) inp.size() > max_tokens) {
return 1;
}
int n_past_tgt = 0;
int n_past_dft = 0;
if (!inp.empty()) {
if (!common_prompt_batch_decode(ctx_tgt, inp, n_past_tgt, params->n_batch, "", false)) {
return 1;
}
if (!common_prompt_batch_decode(ctx_dft, inp, n_past_dft, params->n_batch, "", false)) {
return 1;
}
}
const int n_draft = params->speculative.draft.n_max > 0 ? params->speculative.draft.n_max : 16;
int n_predict = 0;
std::string res;
bool has_eos = false;
std::vector<llama_token> drafted;
std::vector<llama_token> last_tokens(llama_n_ctx(ctx_tgt), 0);
for (auto id : inp) {
last_tokens.erase(last_tokens.begin());
last_tokens.push_back(id);
}
while (n_predict < params->n_predict && !has_eos) {
int i_dft = 0;
while (true) {
const llama_token id = common_sampler_sample(smpl_tgt, ctx_tgt, -1);
common_sampler_accept(smpl_tgt, id, true);
last_tokens.erase(last_tokens.begin());
last_tokens.push_back(id);
auto piece = common_token_to_piece(ctx_tgt, id);
if (!tokenCallback(draft_model, const_cast<char *>(piece.c_str()))) {
break;
}
res += piece;
if (llama_vocab_is_eog(vocab, id)) {
has_eos = true;
}
++n_predict;
if (i_dft < (int) drafted.size() && id == drafted[i_dft]) {
++i_dft;
continue;
}
llama_token dft_id = id;
llama_batch batch = llama_batch_get_one(&dft_id, 1);
if (llama_decode(ctx_dft, batch) != 0) {
return 1;
}
++n_past_dft;
drafted.clear();
drafted.push_back(id);
break;
}
if (n_predict >= params->n_predict || has_eos) {
break;
}
int n_past_cur = n_past_dft;
for (int i = 0; i < n_draft; ++i) {
float * logits = llama_get_logits(ctx_dft);
const int n_vocab = llama_vocab_n_tokens(vocab);
llama_token draft_id = 0;
float max_logit = logits[0];
for (llama_token t = 1; t < n_vocab; ++t) {
if (logits[t] > max_logit) {
max_logit = logits[t];
draft_id = t;
}
}
drafted.push_back(draft_id);
if (i == n_draft - 1) {
break;
}
llama_batch batch = llama_batch_get_one(&draft_id, 1);
if (llama_decode(ctx_dft, batch) != 0) {
return 1;
}
++n_past_cur;
}
llama_batch batch = llama_batch_get_one(drafted.data(), (int32_t) drafted.size());
if (llama_decode(ctx_tgt, batch) != 0) {
return 1;
}
++n_past_tgt;
if (!drafted.empty()) {
drafted.erase(drafted.begin());
}
}
if (debug) {
common_perf_print(ctx_tgt, smpl_tgt);
common_perf_print(ctx_dft, nullptr);
}
if (result != nullptr) {
std::strncpy(result, res.c_str(), params->n_predict > 0 ? (size_t) params->n_predict : res.size());
}
return 0;
}
} // extern "C"
// Copies `count` C strings into a vector of std::string, preserving order.
// A non-positive `count` yields an empty vector.
std::vector<std::string> create_vector(const char ** strings, int count) {
    std::vector<std::string> copied;
    int idx = 0;
    while (idx < count) {
        copied.push_back(std::string(strings[idx]));
        ++idx;
    }
    return copied;
}
// Destroys a heap-allocated string vector. A null pointer is ignored.
void delete_vector(std::vector<std::string> * vec) {
    if (vec != nullptr) {
        delete vec;
    }
}

63
binding.h Normal file
View File

@ -0,0 +1,63 @@
/* C interface of the llama.cpp binding, implemented in binding.cpp. */
#ifdef __cplusplus
#include <vector>
#include <string>
extern "C" {
#endif
#include <stdbool.h>
/* Declared here, defined outside binding.cpp. llama_predict and
 * speculative_sampling call it once per generated piece and stop generating
 * when it returns 0. */
extern unsigned char tokenCallback(void *, char *);
/* Restores the context state from `statefile`; returns 0 on success, 1 on failure. `modes` is unused. */
int load_state(void *ctx, char *statefile, char*modes);
/* Tokenizes and decodes `text` (or the params' prompt when `text` is NULL); returns 0 on success. */
int eval(void* params_ptr, void *ctx, char*text);
/* Writes the context state to `dst`. `modes` is unused. */
void save_state(void *ctx, char *dst, char*modes);
/* Loads a model and returns an opaque handle, or NULL on failure.
 * `low_vram`, `mul_mat_q` and `lora_base` are accepted but ignored by the implementation. */
void* load_model(const char *fname,
int n_ctx,
int n_seed,
bool memory_f16,
bool mlock,
bool embeddings,
bool mmap,
bool low_vram,
int n_gpu,
int n_batch,
const char *maingpu,
const char *tensorsplit,
bool numa,
float rope_freq_base,
float rope_freq_scale,
bool mul_mat_q, const char *lora, const char *lora_base, bool perplexity
);
/* Writes the embedding of the params' prompt into `res_embeddings`; returns 0 on success. */
int get_embeddings(void* params_ptr, void* state_pr, float * res_embeddings);
/* Same as get_embeddings, for text rebuilt from `tokenSize` token ids. */
int get_token_embeddings(void* params_ptr, void* state_pr, int *tokens, int tokenSize, float * res_embeddings);
/* Allocates a request parameter object; release it with llama_free_params.
 * `memory_f16`, `tfs_z`, `penalize_nl`, `negative_prompt_scale` and
 * `negative_prompt` are accepted but ignored by the implementation. */
void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens,
int top_k, float top_p, float temp, float repeat_penalty,
int repeat_last_n, bool ignore_eos, bool memory_f16,
int n_batch, int n_keep, const char** antiprompt, int antiprompt_count,
float tfs_z, float typical_p, float frequency_penalty, float presence_penalty, int mirostat, float mirostat_eta, float mirostat_tau, bool penalize_nl, const char *logit_bias, const char *session_file, bool prompt_cache_all, bool mlock, bool mmap, const char *maingpu, const char *tensorsplit ,
bool prompt_cache_ro, const char *grammar, float rope_freq_base, float rope_freq_scale, float negative_prompt_scale, const char* negative_prompt,
int n_draft);
/* Generates text using a target and a draft model handle; returns 0 on success. */
int speculative_sampling(void* params_ptr, void* target_model, void* draft_model, char* result, bool debug);
/* Frees an object returned by llama_allocate_params. */
void llama_free_params(void* params_ptr);
/* Frees a handle returned by load_model. */
void llama_binding_free_model(void* state);
/* Tokenizes the params' prompt into `result`; returns -1 when the handle has no context. */
int llama_tokenize_string(void* params_ptr, void* state_pr, int* result);
/* Generates text for the params' prompt into `result`; returns 0 on success. */
int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug);
#ifdef __cplusplus
}
/* C++-only helpers: copy an array of C strings into a vector / delete such a vector. */
std::vector<std::string> create_vector(const char** strings, int count);
void delete_vector(std::vector<std::string>* vec);
#endif

3
build.conf Normal file
View File

@ -0,0 +1,3 @@
# Пути к исходникам llama.cpp (без переменных окружения)
LLAMA_CPP_PATH=/home/admin/cpp/llama.cpp
LLAMA_BUILD_PATH=/home/admin/cpp/llama.cpp/build

10
cgo_flags.go Normal file
View File

@ -0,0 +1,10 @@
package llama

// CGO compiler and linker flags for building against llama.cpp.
// The `cgo_flags.go` target of the Makefile regenerates this file from
// build.conf, so when the llama.cpp location changes, edit build.conf and run
// `make` instead of editing the paths below by hand.

/*
#cgo CXXFLAGS: -std=c++17 -I/home/admin/cpp/llama.cpp/include -I/home/admin/cpp/llama.cpp/common -I/home/admin/cpp/llama.cpp/ggml/include -I${SRCDIR}
#cgo LDFLAGS: -L${SRCDIR} -lbinding -L/home/admin/cpp/llama.cpp/build/src -lllama -L/home/admin/cpp/llama.cpp/build/common -lllama-common -lllama-common-base -L/home/admin/cpp/llama.cpp/build/ggml/src -lggml -lggml-cpu -lggml-base -L/home/admin/cpp/llama.cpp/build/vendor/cpp-httplib -lcpp-httplib -lstdc++ -lm -lpthread -fopenmp -ldl
*/
import "C"

48
examples/main.go Normal file
View File

@ -0,0 +1,48 @@
package main
import (
"bufio"
"fmt"
"os"
"strings"
llama "go-llama-new.cpp"
)
// main runs the example and exits with the status returned by run.
func main() {
	os.Exit(run())
}

// run loads the model named by the first argument, generates a completion for
// the prompt (the remaining arguments, or "Hello"), then reads one line from
// stdin and prints its tokens. It returns the process exit code.
//
// The work lives here rather than in main so that the deferred l.Free() runs
// before the process exits; os.Exit does not run deferred calls.
func run() int {
	if len(os.Args) < 2 {
		fmt.Fprintf(os.Stderr, "usage: %s <model.gguf> [prompt]\n", os.Args[0])
		return 1
	}

	modelPath := os.Args[1]
	prompt := "Hello"
	if len(os.Args) > 2 {
		prompt = strings.Join(os.Args[2:], " ")
	}

	l, err := llama.New(modelPath, llama.SetContext(512), llama.SetGPULayers(0))
	if err != nil {
		fmt.Fprintf(os.Stderr, "load model: %v\n", err)
		return 1
	}
	defer l.Free()

	out, err := l.Predict(prompt, llama.SetTokens(64), llama.SetThreads(4))
	if err != nil {
		fmt.Fprintf(os.Stderr, "predict: %v\n", err)
		return 1
	}
	fmt.Println(out)

	reader := bufio.NewReader(os.Stdin)
	fmt.Print("\nТокенизация (введите текст): ")
	line, err := reader.ReadString('\n')
	if err != nil && line == "" {
		// Nothing was read (for example stdin is closed): report it instead
		// of tokenizing an empty string.
		fmt.Fprintf(os.Stderr, "read input: %v\n", err)
		return 0
	}

	_, tokens, err := l.TokenizeString(strings.TrimSpace(line))
	if err != nil {
		fmt.Fprintf(os.Stderr, "tokenize: %v\n", err)
		return 0
	}
	fmt.Printf("токенов: %d, ids: %v\n", len(tokens), tokens)
	return 0
}

3
go.mod Normal file
View File

@ -0,0 +1,3 @@
module go-llama-new.cpp
go 1.21

409
llama.go Normal file
View File

@ -0,0 +1,409 @@
package llama
// #include "binding.h"
// #include <stdlib.h>
import "C"
import (
"fmt"
"os"
"strings"
"sync"
"unsafe"
)
// LLama is a handle to a model loaded through the C binding.
type LLama struct {
// state is the opaque pointer returned by C.load_model.
state unsafe.Pointer
// embeddings records whether the model was loaded with embeddings enabled.
embeddings bool
// contextSize is the context size requested when the model was loaded.
contextSize int
}
// New loads the model file at the given path and returns a handle to it.
//
// Load-time settings are supplied through ModelOption values. The returned
// LLama owns native resources; release them with Free. An error is returned
// when the native loader fails.
func New(model string, opts ...ModelOption) (*LLama, error) {
	mo := NewModelOptions(opts...)

	// load_model copies or parses every string argument before it returns, so
	// all C copies made here are released when New exits.
	modelPath := C.CString(model)
	defer C.free(unsafe.Pointer(modelPath))
	loraBase := C.CString(mo.LoraBase)
	defer C.free(unsafe.Pointer(loraBase))
	loraAdapter := C.CString(mo.LoraAdapter)
	defer C.free(unsafe.Pointer(loraAdapter))
	mainGPU := C.CString(mo.MainGPU)
	defer C.free(unsafe.Pointer(mainGPU))
	tensorSplit := C.CString(mo.TensorSplit)
	defer C.free(unsafe.Pointer(tensorSplit))

	// MulMatQ defaults to true when the option was not set.
	mulMatQ := true
	if mo.MulMatQ != nil {
		mulMatQ = *mo.MulMatQ
	}

	result := C.load_model(modelPath,
		C.int(mo.ContextSize), C.int(mo.Seed),
		C.bool(mo.F16Memory), C.bool(mo.MLock), C.bool(mo.Embeddings), C.bool(mo.MMap), C.bool(mo.LowVRAM),
		C.int(mo.NGPULayers), C.int(mo.NBatch), mainGPU, tensorSplit, C.bool(mo.NUMA),
		C.float(mo.FreqRopeBase), C.float(mo.FreqRopeScale),
		C.bool(mulMatQ), loraAdapter, loraBase, C.bool(mo.Perplexity),
	)
	if result == nil {
		return nil, fmt.Errorf("failed loading model")
	}

	ll := &LLama{state: result, contextSize: mo.ContextSize, embeddings: mo.Embeddings}
	return ll, nil
}
// Free releases the native model state held by l.
//
// It is safe to call on a nil receiver and to call more than once: the handle
// is cleared after the first call so the native state is never freed twice.
func (l *LLama) Free() {
	if l == nil || l.state == nil {
		return
	}
	C.llama_binding_free_model(l.state)
	l.state = nil
}
// LoadState restores the model state from the file at the given path.
// It returns an error when the native loader reports a failure.
func (l *LLama) LoadState(state string) error {
	path := C.CString(state)
	defer C.free(unsafe.Pointer(path))
	mode := C.CString("rb")
	defer C.free(unsafe.Pointer(mode))

	if C.load_state(l.state, path, mode) == 0 {
		return nil
	}
	return fmt.Errorf("error while loading state")
}
// SaveState writes the model state to dst.
//
// The native call reports no status, so success is judged by checking that
// dst exists afterwards; the os.Stat error, if any, is returned.
func (l *LLama) SaveState(dst string) error {
	path := C.CString(dst)
	defer C.free(unsafe.Pointer(path))
	mode := C.CString("wb")
	defer C.free(unsafe.Pointer(mode))

	C.save_state(l.state, path, mode)

	if _, err := os.Stat(dst); err != nil {
		return err
	}
	return nil
}
// Token Embeddings
// TokenEmbeddings computes an embedding vector for a sequence of token ids.
//
// The model must have been loaded with embeddings enabled. The returned slice
// has po.Tokens elements (9999999 when Tokens is 0); the native side fills in
// the leading entries.
// NOTE(review): the native side writes one float per embedding dimension, so
// Tokens must be at least the model's embedding size — confirm with callers.
func (l *LLama) TokenEmbeddings(tokens []int, opts ...PredictOption) ([]float32, error) {
	if !l.embeddings {
		return []float32{}, fmt.Errorf("model loaded without embeddings")
	}

	po := NewPredictOptions(opts...)

	outSize := po.Tokens
	if po.Tokens == 0 {
		outSize = 9999999
	}
	floats := make([]float32, outSize)

	// Every C string handed to llama_allocate_params is copied or parsed on
	// the native side, so all of them are released when this method returns.
	var owned []unsafe.Pointer
	defer func() {
		for _, p := range owned {
			C.free(p)
		}
	}()
	cstr := func(s string) *C.char {
		p := C.CString(s)
		owned = append(owned, unsafe.Pointer(p))
		return p
	}

	// Copy the token ids into C memory as C ints.
	myArray := (*C.int)(C.malloc(C.size_t(len(tokens)) * C.sizeof_int))
	defer C.free(unsafe.Pointer(myArray))
	cTokens := unsafe.Slice(myArray, len(tokens))
	for i, v := range tokens {
		cTokens[i] = C.int(v)
	}

	params := C.llama_allocate_params(cstr(""), C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
		C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
		C.bool(po.IgnoreEOS), C.bool(po.F16KV),
		C.int(po.Batch), C.int(po.NKeep), nil, C.int(0),
		C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty),
		C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), cstr(po.LogitBias),
		cstr(po.PathPromptCache), C.bool(po.PromptCacheAll), C.bool(po.MLock), C.bool(po.MMap),
		cstr(po.MainGPU), cstr(po.TensorSplit),
		C.bool(po.PromptCacheRO),
		cstr(po.Grammar),
		C.float(po.RopeFreqBase), C.float(po.RopeFreqScale), C.float(po.NegativePromptScale), cstr(po.NegativePrompt),
		C.int(po.NDraft),
	)
	defer C.llama_free_params(params)

	ret := C.get_token_embeddings(params, l.state, myArray, C.int(len(tokens)), (*C.float)(&floats[0]))
	if ret != 0 {
		return floats, fmt.Errorf("embedding inference failed")
	}
	return floats, nil
}
// Embeddings computes embeddings for text.
//
// It fails immediately when the model was loaded without embeddings. The
// returned slice has po.Tokens elements; when Tokens is 0 it is sized to
// 99999999 float32 values (roughly 400 MB). The slice is returned even when
// the native call reports an error.
func (l *LLama) Embeddings(text string, opts ...PredictOption) ([]float32, error) {
	if !l.embeddings {
		return []float32{}, fmt.Errorf("model loaded without embeddings")
	}

	po := NewPredictOptions(opts...)
	if po.Tokens == 0 {
		po.Tokens = 99999999
	}
	floats := make([]float32, po.Tokens)

	// Track every C string created for this call so that all of them are
	// released on return instead of being leaked.
	var owned []unsafe.Pointer
	defer func() {
		for _, p := range owned {
			C.free(p)
		}
	}()
	cstr := func(s string) *C.char {
		cs := C.CString(s)
		owned = append(owned, unsafe.Pointer(cs))
		return cs
	}

	input := cstr(text)

	// Stop prompts are passed as a C array of C strings; pass stays nil when
	// there are none.
	reverseCount := len(po.StopPrompts)
	reversePrompt := make([]*C.char, reverseCount)
	for i, s := range po.StopPrompts {
		reversePrompt[i] = cstr(s)
	}
	var pass **C.char
	if reverseCount > 0 {
		pass = &reversePrompt[0]
	}

	params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
		C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
		C.bool(po.IgnoreEOS), C.bool(po.F16KV),
		C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
		C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty),
		C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), cstr(po.LogitBias),
		cstr(po.PathPromptCache), C.bool(po.PromptCacheAll), C.bool(po.MLock), C.bool(po.MMap),
		cstr(po.MainGPU), cstr(po.TensorSplit),
		C.bool(po.PromptCacheRO),
		cstr(po.Grammar),
		C.float(po.RopeFreqBase), C.float(po.RopeFreqScale), C.float(po.NegativePromptScale), cstr(po.NegativePrompt),
		C.int(po.NDraft),
	)
	ret := C.get_embeddings(params, l.state, (*C.float)(&floats[0]))
	C.llama_free_params(params)

	if ret != 0 {
		return floats, fmt.Errorf("embedding inference failed")
	}
	return floats, nil
}
// Eval passes text and the prediction options to the native eval function and
// returns an error when it reports a non-zero status.
func (l *LLama) Eval(text string, opts ...PredictOption) error {
	po := NewPredictOptions(opts...)
	if po.Tokens == 0 {
		po.Tokens = 99999999
	}

	// Track every C string created for this call so that all of them are
	// released on return instead of being leaked.
	var owned []unsafe.Pointer
	defer func() {
		for _, p := range owned {
			C.free(p)
		}
	}()
	cstr := func(s string) *C.char {
		cs := C.CString(s)
		owned = append(owned, unsafe.Pointer(cs))
		return cs
	}

	input := cstr(text)

	// Stop prompts are passed as a C array of C strings; pass stays nil when
	// there are none.
	reverseCount := len(po.StopPrompts)
	reversePrompt := make([]*C.char, reverseCount)
	for i, s := range po.StopPrompts {
		reversePrompt[i] = cstr(s)
	}
	var pass **C.char
	if reverseCount > 0 {
		pass = &reversePrompt[0]
	}

	params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
		C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
		C.bool(po.IgnoreEOS), C.bool(po.F16KV),
		C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
		C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty),
		C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), cstr(po.LogitBias),
		cstr(po.PathPromptCache), C.bool(po.PromptCacheAll), C.bool(po.MLock), C.bool(po.MMap),
		cstr(po.MainGPU), cstr(po.TensorSplit),
		C.bool(po.PromptCacheRO),
		cstr(po.Grammar),
		C.float(po.RopeFreqBase), C.float(po.RopeFreqScale), C.float(po.NegativePromptScale), cstr(po.NegativePrompt),
		C.int(po.NDraft),
	)
	ret := C.eval(params, l.state, input)
	C.llama_free_params(params)

	if ret != 0 {
		return fmt.Errorf("inference failed")
	}
	return nil
}
// SpeculativeSampling generates a completion for text by calling the native
// speculative_sampling function with the states of both l and ll.
//
// A TokenCallback option, when present, is registered on l for the duration
// of the call and removed afterwards. On success the native output is
// returned with one leading space, the echoed prompt, one leading newline and
// any trailing stop word removed.
func (l *LLama) SpeculativeSampling(ll *LLama, text string, opts ...PredictOption) (string, error) {
	po := NewPredictOptions(opts...)

	if po.TokenCallback != nil {
		setCallback(l.state, po.TokenCallback)
		defer setCallback(l.state, nil)
	}

	if po.Tokens == 0 {
		po.Tokens = 99999999
	}
	// NOTE(review): the output buffer is sized in bytes from the token count;
	// confirm the native side never writes more than po.Tokens bytes.
	out := make([]byte, po.Tokens)

	// Track every C string created for this call so that all of them are
	// released on return instead of being leaked.
	var owned []unsafe.Pointer
	defer func() {
		for _, p := range owned {
			C.free(p)
		}
	}()
	cstr := func(s string) *C.char {
		cs := C.CString(s)
		owned = append(owned, unsafe.Pointer(cs))
		return cs
	}

	input := cstr(text)

	// Stop prompts are passed as a C array of C strings; pass stays nil when
	// there are none.
	reverseCount := len(po.StopPrompts)
	reversePrompt := make([]*C.char, reverseCount)
	for i, s := range po.StopPrompts {
		reversePrompt[i] = cstr(s)
	}
	var pass **C.char
	if reverseCount > 0 {
		pass = &reversePrompt[0]
	}

	params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
		C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
		C.bool(po.IgnoreEOS), C.bool(po.F16KV),
		C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
		C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty),
		C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), cstr(po.LogitBias),
		cstr(po.PathPromptCache), C.bool(po.PromptCacheAll), C.bool(po.MLock), C.bool(po.MMap),
		cstr(po.MainGPU), cstr(po.TensorSplit),
		C.bool(po.PromptCacheRO),
		cstr(po.Grammar),
		C.float(po.RopeFreqBase), C.float(po.RopeFreqScale), C.float(po.NegativePromptScale), cstr(po.NegativePrompt),
		C.int(po.NDraft),
	)
	ret := C.speculative_sampling(params, l.state, ll.state, (*C.char)(unsafe.Pointer(&out[0])), C.bool(po.DebugMode))
	C.llama_free_params(params)

	if ret != 0 {
		return "", fmt.Errorf("inference failed")
	}

	res := C.GoString((*C.char)(unsafe.Pointer(&out[0])))
	res = strings.TrimPrefix(res, " ")
	res = strings.TrimPrefix(res, text)
	res = strings.TrimPrefix(res, "\n")
	// Remove a trailing stop word as a whole suffix. TrimRight would treat
	// the stop word as a set of characters and could eat unrelated trailing
	// characters of the answer.
	for _, s := range po.StopPrompts {
		res = strings.TrimSuffix(res, s)
	}
	return res, nil
}
// Predict generates a completion for text by calling the native
// llama_predict function with the given options.
//
// A TokenCallback option, when present, is registered for the duration of the
// call and removed afterwards. On success the native output is returned with
// one leading space, the echoed prompt, one leading newline and any trailing
// stop word removed.
func (l *LLama) Predict(text string, opts ...PredictOption) (string, error) {
	po := NewPredictOptions(opts...)

	if po.TokenCallback != nil {
		setCallback(l.state, po.TokenCallback)
		defer setCallback(l.state, nil)
	}

	if po.Tokens == 0 {
		po.Tokens = 99999999
	}
	// NOTE(review): the output buffer is sized in bytes from the token count;
	// confirm the native side never writes more than po.Tokens bytes.
	out := make([]byte, po.Tokens)

	// Track every C string created for this call so that all of them are
	// released on return instead of being leaked.
	var owned []unsafe.Pointer
	defer func() {
		for _, p := range owned {
			C.free(p)
		}
	}()
	cstr := func(s string) *C.char {
		cs := C.CString(s)
		owned = append(owned, unsafe.Pointer(cs))
		return cs
	}

	input := cstr(text)

	// Stop prompts are passed as a C array of C strings; pass stays nil when
	// there are none.
	reverseCount := len(po.StopPrompts)
	reversePrompt := make([]*C.char, reverseCount)
	for i, s := range po.StopPrompts {
		reversePrompt[i] = cstr(s)
	}
	var pass **C.char
	if reverseCount > 0 {
		pass = &reversePrompt[0]
	}

	params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
		C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
		C.bool(po.IgnoreEOS), C.bool(po.F16KV),
		C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
		C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty),
		C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), cstr(po.LogitBias),
		cstr(po.PathPromptCache), C.bool(po.PromptCacheAll), C.bool(po.MLock), C.bool(po.MMap),
		cstr(po.MainGPU), cstr(po.TensorSplit),
		C.bool(po.PromptCacheRO),
		cstr(po.Grammar),
		C.float(po.RopeFreqBase), C.float(po.RopeFreqScale), C.float(po.NegativePromptScale), cstr(po.NegativePrompt),
		C.int(po.NDraft),
	)
	ret := C.llama_predict(params, l.state, (*C.char)(unsafe.Pointer(&out[0])), C.bool(po.DebugMode))
	C.llama_free_params(params)

	if ret != 0 {
		return "", fmt.Errorf("inference failed")
	}

	res := C.GoString((*C.char)(unsafe.Pointer(&out[0])))
	res = strings.TrimPrefix(res, " ")
	res = strings.TrimPrefix(res, text)
	res = strings.TrimPrefix(res, "\n")
	// Remove a trailing stop word as a whole suffix. TrimRight would treat
	// the stop word as a set of characters and could eat unrelated trailing
	// characters of the answer.
	for _, s := range po.StopPrompts {
		res = strings.TrimSuffix(res, s)
	}
	return res, nil
}
// TokenizeString tokenizes text with the native llama_tokenize_string
// function.
//
// It returns the count reported by the native call together with the tokens
// copied out of a buffer of po.Tokens entries (4096 when Tokens is 0). The
// returned slice holds min(buffer length, reported count) tokens, so the count
// may exceed len(slice). A negative count is returned as an error.
func (l *LLama) TokenizeString(text string, opts ...PredictOption) (int32, []int32, error) {
	po := NewPredictOptions(opts...)
	if po.Tokens == 0 {
		po.Tokens = 4096
	}
	out := make([]C.int, po.Tokens)

	// Track every C string created for this call so that all of them are
	// released on return instead of being leaked.
	var owned []unsafe.Pointer
	defer func() {
		for _, p := range owned {
			C.free(p)
		}
	}()
	cstr := func(s string) *C.char {
		cs := C.CString(s)
		owned = append(owned, unsafe.Pointer(cs))
		return cs
	}

	input := cstr(text)

	// No stop prompts are used for tokenization.
	var fakeDblPtr **C.char
	params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
		C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
		C.bool(po.IgnoreEOS), C.bool(po.F16KV),
		C.int(po.Batch), C.int(po.NKeep), fakeDblPtr, C.int(0),
		C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty),
		C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), cstr(po.LogitBias),
		cstr(po.PathPromptCache), C.bool(po.PromptCacheAll), C.bool(po.MLock), C.bool(po.MMap),
		cstr(po.MainGPU), cstr(po.TensorSplit),
		C.bool(po.PromptCacheRO),
		cstr(po.Grammar),
		C.float(po.RopeFreqBase), C.float(po.RopeFreqScale), C.float(po.NegativePromptScale), cstr(po.NegativePrompt),
		C.int(po.NDraft),
	)
	tokRet := C.llama_tokenize_string(params, l.state, &out[0])
	C.llama_free_params(params)

	if tokRet < 0 {
		return int32(tokRet), []int32{}, fmt.Errorf("llama_tokenize_string returned negative count %d", tokRet)
	}

	gTokRet := int32(tokRet)
	gLenOut := min(len(out), int(gTokRet))
	goSlice := make([]int32, gLenOut)
	for i := range goSlice {
		goSlice[i] = int32(out[i])
	}
	return gTokRet, goSlice, nil
}
// SetTokenCallback registers callback under this model's native state pointer
// via setCallback. Passing nil removes any callback registered for the state.
func (l *LLama) SetTokenCallback(callback func(token string) bool) {
setCallback(l.state, callback)
}
// Registry of token callbacks, keyed by the native state pointer converted to
// uintptr. m guards every access to callbacks.
var (
	m         sync.RWMutex
	callbacks = make(map[uintptr]func(string) bool)
)
// tokenCallback is exported to C. It looks up the Go callback registered for
// statePtr and forwards token to it as a Go string, returning the callback's
// result. When no callback is registered it returns true.
//
// The registry lock is released before the callback runs, so a callback may
// itself register or remove callbacks without deadlocking on m.
//
//export tokenCallback
func tokenCallback(statePtr unsafe.Pointer, token *C.char) bool {
	m.RLock()
	callback, ok := callbacks[uintptr(statePtr)]
	m.RUnlock()

	if !ok {
		return true
	}
	return callback(C.GoString(token))
}
// setCallback stores callback in the registry under statePtr, or removes the
// entry for statePtr when callback is nil. The registry is updated while
// holding the write lock.
func setCallback(statePtr unsafe.Pointer, callback func(string) bool) {
	key := uintptr(statePtr)

	m.Lock()
	defer m.Unlock()

	if callback != nil {
		callbacks[key] = callback
		return
	}
	delete(callbacks, key)
}

9
llama_cublas.go Normal file
View File

@ -0,0 +1,9 @@
//go:build cublas
// +build cublas
package llama
/*
#cgo LDFLAGS: -lcublas -lcudart -L/usr/local/cuda/lib64/
*/
import "C"

9
llama_openblas.go Normal file
View File

@ -0,0 +1,9 @@
//go:build openblas
// +build openblas
package llama
/*
#cgo LDFLAGS: -lopenblas
*/
import "C"

460
options.go Normal file
View File

@ -0,0 +1,460 @@
package llama
// ModelOptions collects the settings used when a model is loaded. A value
// starts as a copy of DefaultModelOptions and is adjusted by ModelOption
// functions (see NewModelOptions).
type ModelOptions struct {
ContextSize int // set by SetContext
Seed int // set by SetModelSeed
NBatch int // set by SetNBatch
F16Memory bool // enabled by EnableF16Memory
MLock bool // enabled by EnableMLock
MMap bool // set by SetMMap
LowVRAM bool // enabled by EnabelLowVRAM
Embeddings bool // enabled by EnableEmbeddings
NUMA bool // enabled by EnableNUMA
NGPULayers int // set by SetGPULayers
MainGPU string // set by SetMainGPU
TensorSplit string // set by SetTensorSplit
FreqRopeBase float32 // set by WithRopeFreqBase
FreqRopeScale float32 // set by WithRopeFreqScale
MulMatQ *bool // set by SetMulMatQ; nil until an option sets it
LoraBase string // set by SetLoraBase
LoraAdapter string // set by SetLoraAdapter
Perplexity bool // set by SetPerplexity
}
// PredictOptions collects the per-call settings for prediction, embedding,
// evaluation and tokenization. A value starts as a copy of DefaultOptions and
// is adjusted by PredictOption functions (see NewPredictOptions).
type PredictOptions struct {
	Seed    int
	Threads int
	Tokens  int
	TopK    int
	Repeat  int
	Batch   int
	NKeep   int

	TopP        float32
	Temperature float32
	Penalty     float32

	NDraft      int
	F16KV       bool
	DebugMode   bool
	StopPrompts []string
	IgnoreEOS   bool

	TailFreeSamplingZ float32
	TypicalP          float32
	FrequencyPenalty  float32
	PresencePenalty   float32
	Mirostat          int
	MirostatETA       float32
	MirostatTAU       float32
	PenalizeNL        bool
	LogitBias         string

	// TokenCallback, when non-nil, is registered for the duration of a
	// prediction call.
	TokenCallback func(string) bool

	PathPromptCache string
	MLock           bool
	MMap            bool
	PromptCacheAll  bool
	PromptCacheRO   bool
	Grammar         string
	MainGPU         string
	TensorSplit     string

	// Rope parameters.
	RopeFreqBase  float32
	RopeFreqScale float32

	// Negative prompt parameters.
	NegativePromptScale float32
	NegativePrompt      string
}
type (
	// PredictOption modifies a PredictOptions value in place.
	PredictOption func(*PredictOptions)

	// ModelOption modifies a ModelOptions value in place.
	ModelOption func(*ModelOptions)
)
// DefaultModelOptions is the starting point used by NewModelOptions. Fields
// not listed here (Seed, F16Memory, MLock, Embeddings, LowVRAM, ...) keep
// their zero values.
var DefaultModelOptions = ModelOptions{
	ContextSize:   512,
	NBatch:        512,
	MMap:          true,
	FreqRopeBase:  10000,
	FreqRopeScale: 1.0,
}
// DefaultOptions is the starting point used by NewPredictOptions. Fields not
// listed here (FrequencyPenalty, PresencePenalty, Mirostat, ...) keep their
// zero values.
var DefaultOptions = PredictOptions{
	Seed:    -1,
	Threads: 4,
	Tokens:  128,
	Batch:   512,
	NKeep:   64,
	MMap:    true,

	// Sampling.
	TopK:              40,
	TopP:              0.95,
	Temperature:       0.8,
	TailFreeSamplingZ: 1.0,
	TypicalP:          1.0,
	Penalty:           1.1,
	Repeat:            64,
	MirostatTAU:       5.0,
	MirostatETA:       0.1,

	// Rope.
	RopeFreqBase:  10000,
	RopeFreqScale: 1.0,
}
// SetMulMatQ returns a ModelOption that points ModelOptions.MulMatQ at b.
func SetMulMatQ(b bool) ModelOption {
	return func(mo *ModelOptions) { mo.MulMatQ = &b }
}
// SetLoraBase returns a ModelOption that sets ModelOptions.LoraBase.
func SetLoraBase(s string) ModelOption {
	return func(mo *ModelOptions) { mo.LoraBase = s }
}
// SetLoraAdapter returns a ModelOption that sets ModelOptions.LoraAdapter.
func SetLoraAdapter(s string) ModelOption {
	return func(mo *ModelOptions) { mo.LoraAdapter = s }
}
// SetContext returns a ModelOption that sets the context size
// (ModelOptions.ContextSize).
func SetContext(c int) ModelOption {
	return func(mo *ModelOptions) { mo.ContextSize = c }
}
// WithRopeFreqBase returns a ModelOption that sets ModelOptions.FreqRopeBase.
func WithRopeFreqBase(f float32) ModelOption {
	return func(mo *ModelOptions) { mo.FreqRopeBase = f }
}
// WithRopeFreqScale returns a ModelOption that sets ModelOptions.FreqRopeScale.
func WithRopeFreqScale(f float32) ModelOption {
	return func(mo *ModelOptions) { mo.FreqRopeScale = f }
}
// SetModelSeed returns a ModelOption that sets ModelOptions.Seed.
func SetModelSeed(c int) ModelOption {
	return func(mo *ModelOptions) { mo.Seed = c }
}
// SetMMap returns a ModelOption that sets ModelOptions.MMap.
func SetMMap(b bool) ModelOption {
	return func(mo *ModelOptions) { mo.MMap = b }
}
// SetNBatch returns a ModelOption that sets ModelOptions.NBatch.
func SetNBatch(n_batch int) ModelOption {
	return func(mo *ModelOptions) { mo.NBatch = n_batch }
}
// SetTensorSplit returns a ModelOption that sets the tensor split for the GPU
// (ModelOptions.TensorSplit).
func SetTensorSplit(maingpu string) ModelOption {
	return func(mo *ModelOptions) { mo.TensorSplit = maingpu }
}
// SetMainGPU returns a ModelOption that sets ModelOptions.MainGPU.
func SetMainGPU(maingpu string) ModelOption {
	return func(mo *ModelOptions) { mo.MainGPU = maingpu }
}
// SetPredictionTensorSplit returns a PredictOption that sets the tensor split
// for the GPU (PredictOptions.TensorSplit).
func SetPredictionTensorSplit(maingpu string) PredictOption {
	return func(po *PredictOptions) { po.TensorSplit = maingpu }
}
// SetPredictionMainGPU returns a PredictOption that sets
// PredictOptions.MainGPU.
func SetPredictionMainGPU(maingpu string) PredictOption {
	return func(po *PredictOptions) { po.MainGPU = maingpu }
}
// Rope and negative prompt parameters.

// SetRopeFreqBase returns a PredictOption that sets
// PredictOptions.RopeFreqBase.
func SetRopeFreqBase(rfb float32) PredictOption {
	return func(po *PredictOptions) { po.RopeFreqBase = rfb }
}
// SetRopeFreqScale returns a PredictOption that sets
// PredictOptions.RopeFreqScale.
func SetRopeFreqScale(rfs float32) PredictOption {
	return func(po *PredictOptions) { po.RopeFreqScale = rfs }
}
// SetNDraft returns a PredictOption that sets PredictOptions.NDraft.
func SetNDraft(nd int) PredictOption {
	return func(po *PredictOptions) { po.NDraft = nd }
}
// SetPerplexity returns a ModelOption that sets ModelOptions.Perplexity.
func SetPerplexity(b bool) ModelOption {
	return func(mo *ModelOptions) { mo.Perplexity = b }
}
// SetNegativePromptScale returns a PredictOption that sets
// PredictOptions.NegativePromptScale.
func SetNegativePromptScale(nps float32) PredictOption {
	return func(po *PredictOptions) { po.NegativePromptScale = nps }
}
// SetNegativePrompt returns a PredictOption that sets
// PredictOptions.NegativePrompt.
func SetNegativePrompt(np string) PredictOption {
	return func(po *PredictOptions) { po.NegativePrompt = np }
}
// Ready-made options that switch a single boolean setting on.
var (
	// EnabelLowVRAM sets ModelOptions.LowVRAM. The exported name is
	// misspelled ("Enabel"); it is kept as is for compatibility.
	EnabelLowVRAM ModelOption = func(mo *ModelOptions) { mo.LowVRAM = true }

	// EnableNUMA sets ModelOptions.NUMA.
	EnableNUMA ModelOption = func(mo *ModelOptions) { mo.NUMA = true }

	// EnableEmbeddings sets ModelOptions.Embeddings.
	EnableEmbeddings ModelOption = func(mo *ModelOptions) { mo.Embeddings = true }

	// EnableF16Memory sets ModelOptions.F16Memory.
	EnableF16Memory ModelOption = func(mo *ModelOptions) { mo.F16Memory = true }

	// EnableF16KV sets PredictOptions.F16KV.
	EnableF16KV PredictOption = func(po *PredictOptions) { po.F16KV = true }

	// Debug sets PredictOptions.DebugMode.
	Debug PredictOption = func(po *PredictOptions) { po.DebugMode = true }

	// EnablePromptCacheAll sets PredictOptions.PromptCacheAll.
	EnablePromptCacheAll PredictOption = func(po *PredictOptions) { po.PromptCacheAll = true }

	// EnablePromptCacheRO sets PredictOptions.PromptCacheRO.
	EnablePromptCacheRO PredictOption = func(po *PredictOptions) { po.PromptCacheRO = true }

	// EnableMLock sets ModelOptions.MLock.
	EnableMLock ModelOption = func(mo *ModelOptions) { mo.MLock = true }
)
// NewModelOptions returns a copy of DefaultModelOptions with every option in
// opts applied to it, in order.
func NewModelOptions(opts ...ModelOption) ModelOptions {
	cfg := DefaultModelOptions
	for i := range opts {
		opts[i](&cfg)
	}
	return cfg
}
// IgnoreEOS is a PredictOption that sets PredictOptions.IgnoreEOS.
var IgnoreEOS PredictOption = func(po *PredictOptions) { po.IgnoreEOS = true }
// WithGrammar returns a PredictOption that sets the grammar used to constrain
// the output of the LLM response (PredictOptions.Grammar).
func WithGrammar(s string) PredictOption {
	return func(po *PredictOptions) { po.Grammar = s }
}
// SetMlock returns a PredictOption that sets the memory lock flag
// (PredictOptions.MLock).
func SetMlock(b bool) PredictOption {
	return func(po *PredictOptions) { po.MLock = b }
}
// SetMemoryMap returns a PredictOption that sets the memory mapping flag
// (PredictOptions.MMap).
func SetMemoryMap(b bool) PredictOption {
	return func(po *PredictOptions) { po.MMap = b }
}
// SetGPULayers returns a ModelOption that sets the number of GPU layers used
// to offload computation (ModelOptions.NGPULayers).
func SetGPULayers(n int) ModelOption {
	return func(mo *ModelOptions) { mo.NGPULayers = n }
}
// SetTokenCallback returns a PredictOption that sets
// PredictOptions.TokenCallback to fn.
func SetTokenCallback(fn func(string) bool) PredictOption {
	return func(po *PredictOptions) { po.TokenCallback = fn }
}
// SetStopWords returns a PredictOption that sets the prompts that will stop
// predictions (PredictOptions.StopPrompts). The slice is stored as given, not
// copied.
func SetStopWords(stop ...string) PredictOption {
	return func(po *PredictOptions) { po.StopPrompts = stop }
}
// SetSeed returns a PredictOption that sets the random seed for sampling
// (PredictOptions.Seed).
func SetSeed(seed int) PredictOption {
	return func(po *PredictOptions) { po.Seed = seed }
}
// SetThreads returns a PredictOption that sets the number of threads to use
// (PredictOptions.Threads).
func SetThreads(threads int) PredictOption {
	return func(po *PredictOptions) { po.Threads = threads }
}
// SetTokens returns a PredictOption that sets the number of tokens to
// generate (PredictOptions.Tokens).
func SetTokens(tokens int) PredictOption {
	return func(po *PredictOptions) { po.Tokens = tokens }
}
// SetTopK returns a PredictOption that sets the value for top-K sampling
// (PredictOptions.TopK).
func SetTopK(topk int) PredictOption {
	return func(po *PredictOptions) { po.TopK = topk }
}
// SetTopP returns a PredictOption that sets the value for nucleus sampling
// (PredictOptions.TopP).
func SetTopP(topp float32) PredictOption {
	return func(po *PredictOptions) { po.TopP = topp }
}
// SetTemperature returns a PredictOption that sets the sampling temperature
// (PredictOptions.Temperature).
func SetTemperature(temp float32) PredictOption {
	return func(po *PredictOptions) { po.Temperature = temp }
}
// SetPathPromptCache returns a PredictOption that sets the session file used
// to store the prompt cache (PredictOptions.PathPromptCache).
func SetPathPromptCache(f string) PredictOption {
	return func(po *PredictOptions) { po.PathPromptCache = f }
}
// SetPenalty returns a PredictOption that sets the repetition penalty
// (PredictOptions.Penalty).
func SetPenalty(penalty float32) PredictOption {
	return func(po *PredictOptions) { po.Penalty = penalty }
}
// SetRepeat returns a PredictOption that sets PredictOptions.Repeat.
//
// NOTE(review): Repeat is passed to llama_allocate_params right after the
// repetition penalty, so it is presumably the number of recent tokens the
// penalty looks at rather than a repetition count — confirm against the
// binding.
func SetRepeat(repeat int) PredictOption {
	return func(po *PredictOptions) { po.Repeat = repeat }
}
// SetBatch returns a PredictOption that sets the batch size
// (PredictOptions.Batch).
func SetBatch(size int) PredictOption {
	return func(po *PredictOptions) { po.Batch = size }
}
// SetNKeep returns a PredictOption that sets the number of tokens from the
// initial prompt to keep (PredictOptions.NKeep).
func SetNKeep(n int) PredictOption {
	return func(po *PredictOptions) { po.NKeep = n }
}
// NewPredictOptions returns a copy of DefaultOptions with every option in
// opts applied to it, in order.
func NewPredictOptions(opts ...PredictOption) PredictOptions {
	cfg := DefaultOptions
	for i := range opts {
		opts[i](&cfg)
	}
	return cfg
}
// SetTailFreeSamplingZ returns a PredictOption that sets the tail free
// sampling parameter z (PredictOptions.TailFreeSamplingZ).
func SetTailFreeSamplingZ(tfz float32) PredictOption {
	return func(po *PredictOptions) { po.TailFreeSamplingZ = tfz }
}
// SetTypicalP returns a PredictOption that sets the typicality parameter
// p_typical (PredictOptions.TypicalP).
func SetTypicalP(tp float32) PredictOption {
	return func(po *PredictOptions) { po.TypicalP = tp }
}
// SetFrequencyPenalty returns a PredictOption that sets the frequency penalty
// parameter freq_penalty (PredictOptions.FrequencyPenalty).
func SetFrequencyPenalty(fp float32) PredictOption {
	return func(po *PredictOptions) { po.FrequencyPenalty = fp }
}
// SetPresencePenalty returns a PredictOption that sets the presence penalty
// parameter presence_penalty (PredictOptions.PresencePenalty).
func SetPresencePenalty(pp float32) PredictOption {
	return func(po *PredictOptions) { po.PresencePenalty = pp }
}
// SetMirostat returns a PredictOption that sets the mirostat parameter
// (PredictOptions.Mirostat).
func SetMirostat(m int) PredictOption {
	return func(po *PredictOptions) { po.Mirostat = m }
}
// SetMirostatETA returns a PredictOption that sets the mirostat ETA parameter
// (PredictOptions.MirostatETA).
func SetMirostatETA(me float32) PredictOption {
	return func(po *PredictOptions) { po.MirostatETA = me }
}
// SetMirostatTAU returns a PredictOption that sets the mirostat TAU parameter
// (PredictOptions.MirostatTAU).
func SetMirostatTAU(mt float32) PredictOption {
	return func(po *PredictOptions) { po.MirostatTAU = mt }
}
// SetPenalizeNL returns a PredictOption that sets whether newlines are
// penalized (PredictOptions.PenalizeNL).
func SetPenalizeNL(pnl bool) PredictOption {
	return func(po *PredictOptions) { po.PenalizeNL = pnl }
}
// SetLogitBias returns a PredictOption that sets the logit bias parameter
// (PredictOptions.LogitBias).
func SetLogitBias(lb string) PredictOption {
	return func(po *PredictOptions) { po.LogitBias = lb }
}