first commit
This commit is contained in:
commit
05cfbaa1b8
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@ -0,0 +1,4 @@
|
||||
*.o
|
||||
*.a
|
||||
binding.o
|
||||
libbinding.a
|
||||
47
Makefile
Normal file
47
Makefile
Normal file
@ -0,0 +1,47 @@
|
||||
# Builds the C++ binding archive (libbinding.a) and regenerates cgo_flags.go
# from the paths configured in build.conf.
#
# libbinding.a is a real file, so it is intentionally NOT listed as phony:
# make can then skip re-archiving when binding.o is unchanged.
.PHONY: all clean

# Provides LLAMA_CPP_PATH and LLAMA_BUILD_PATH.
include build.conf

LLAMA_INCLUDE := $(LLAMA_CPP_PATH)/include
LLAMA_COMMON := $(LLAMA_CPP_PATH)/common
LLAMA_GGML := $(LLAMA_CPP_PATH)/ggml/include

CXXFLAGS := -std=c++17 -O3 -DNDEBUG -fPIC -pthread \
	-I$(LLAMA_INCLUDE) -I$(LLAMA_COMMON) -I$(LLAMA_GGML) -I.

# Reference link line; mirrors the LDFLAGS written to cgo_flags.go below
# (including -lllama-common-base).
LDFLAGS_LIBS := \
	-L$(LLAMA_BUILD_PATH)/src -lllama \
	-L$(LLAMA_BUILD_PATH)/common -lllama-common -lllama-common-base \
	-L$(LLAMA_BUILD_PATH)/ggml/src -lggml -lggml-cpu -lggml-base \
	-L$(LLAMA_BUILD_PATH)/vendor/cpp-httplib -lcpp-httplib \
	-lpthread -fopenmp -ldl -lm -lstdc++

all: libbinding.a cgo_flags.go

# Refresh the paths in cgo_flags.go from build.conf.
# `cut -f2-` keeps the whole value even if the path itself contains '='.
cgo_flags.go: build.conf
	@LLAMA=$$(grep '^LLAMA_CPP_PATH=' build.conf | cut -d= -f2-); \
	BUILD=$$(grep '^LLAMA_BUILD_PATH=' build.conf | cut -d= -f2-); \
	printf '%s\n' \
	'package llama' \
	'' \
	'/*' \
	"#cgo CXXFLAGS: -std=c++17 -I$$LLAMA/include -I$$LLAMA/common -I$$LLAMA/ggml/include -I\$${SRCDIR}" \
	"#cgo LDFLAGS: -L\$${SRCDIR} -lbinding -L$$BUILD/src -lllama -L$$BUILD/common -lllama-common -lllama-common-base -L$$BUILD/ggml/src -lggml -lggml-cpu -lggml-base -L$$BUILD/vendor/cpp-httplib -lcpp-httplib -lstdc++ -lm -lpthread -fopenmp -ldl" \
	'*/' \
	'import "C"' \
	> cgo_flags.go

# Configure and build the static llama.cpp libraries.
# -S/-B name the source and build trees explicitly, so this works when the
# build directory does not exist yet or is not a direct child of the sources.
$(LLAMA_BUILD_PATH)/src/libllama.a:
	cmake -S $(LLAMA_CPP_PATH) -B $(LLAMA_BUILD_PATH) -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF
	cmake --build $(LLAMA_BUILD_PATH) --target llama llama-common -j$$(nproc)

binding.o: binding.cpp binding.h $(LLAMA_BUILD_PATH)/src/libllama.a
	$(CXX) $(CXXFLAGS) -c binding.cpp -o binding.o

libbinding.a: binding.o
	ar rcs libbinding.a binding.o
	@echo "Собрано: libbinding.a. Линковка llama.cpp — через cgo_flags.go."

clean:
	rm -f binding.o libbinding.a
|
||||
198
README.md
Normal file
198
README.md
Normal file
@ -0,0 +1,198 @@
|
||||
# go-llama-new.cpp
|
||||
|
||||
Go-обёртка над [llama.cpp](https://github.com/ggml-org/llama.cpp) с API, совместимым с [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp): те же имена типов, функций и экспортируемых переменных (`LLama`, `New`, `Predict`, `SetContext`, `EnableEmbeddings` и т.д.).
|
||||
|
||||
Ядро собирается из локальных исходников llama.cpp (не из submodule внутри репозитория). Пути к исходникам задаются в файле `build.conf`, переменные окружения для этого не используются.
|
||||
|
||||
## Требования
|
||||
|
||||
- **Go** 1.21 или новее (с поддержкой CGO)
|
||||
- **Компилятор C++** с поддержкой C++17 (`g++` / `clang++`)
|
||||
- **CMake** 3.14+
|
||||
- **make**, **ar**
|
||||
- **OpenMP** (обычно пакет `libgomp` в Linux)
|
||||
- Инструменты сборки: `git`, `build-essential` (или аналог)
|
||||
|
||||
Для линковки также нужны статические библиотеки, которые CMake собирает из llama.cpp: `libllama.a`, `libllama-common.a`, `libllama-common-base.a`, `libggml*.a`, `libcpp-httplib.a`.
|
||||
|
||||
## Настройка путей
|
||||
|
||||
Отредактируйте `build.conf` в корне модуля:
|
||||
|
||||
```ini
|
||||
# Пути к исходникам llama.cpp (без переменных окружения)
|
||||
LLAMA_CPP_PATH=/home/admin/cpp/llama.cpp
|
||||
LLAMA_BUILD_PATH=/home/admin/cpp/llama.cpp/build
|
||||
```
|
||||
|
||||
| Параметр | Описание |
|
||||
|----------|----------|
|
||||
| `LLAMA_CPP_PATH` | Каталог с исходниками llama.cpp (`include/`, `common/`, `src/` и т.д.) |
|
||||
| `LLAMA_BUILD_PATH` | Каталог сборки CMake (в нём появятся `src/libllama.a`, `common/libllama-common.a` и др.) |
|
||||
|
||||
После изменения `build.conf` выполните `make` — будет пересоздан `cgo_flags.go` с актуальными путями для CGO.
|
||||
|
||||
## Сборка
|
||||
|
||||
Сборка состоит из двух этапов: сначала нативное ядро llama.cpp, затем Go-модуль с C-обёрткой `binding`.
|
||||
|
||||
### 1. Сборка llama.cpp
|
||||
|
||||
```bash
|
||||
mkdir -p /home/admin/cpp/llama.cpp/build
|
||||
cd /home/admin/cpp/llama.cpp/build
|
||||
|
||||
cmake .. \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DBUILD_SHARED_LIBS=OFF
|
||||
|
||||
cmake --build . --target llama llama-common -j"$(nproc)"
|
||||
```
|
||||
|
||||
Проверка, что библиотеки на месте:
|
||||
|
||||
```bash
|
||||
ls -la src/libllama.a
|
||||
ls -la common/libllama-common.a
|
||||
ls -la common/libllama-common-base.a
|
||||
ls -la ggml/src/libggml.a
|
||||
```
|
||||
|
||||
Цель `make` в каталоге модуля при необходимости запустит эту же сборку автоматически (см. `Makefile`).
|
||||
|
||||
### 2. Сборка C-обёртки (libbinding.a)
|
||||
|
||||
В каталоге модуля:
|
||||
|
||||
```bash
|
||||
cd /path/to/go-llama-new.cpp
|
||||
make
|
||||
```
|
||||
|
||||
Будет выполнено:
|
||||
|
||||
1. Генерация `cgo_flags.go` из `build.conf`
|
||||
2. Компиляция `binding.cpp` → `binding.o`
|
||||
3. Создание архива `libbinding.a`
|
||||
|
||||
Очистка артефактов обёртки:
|
||||
|
||||
```bash
|
||||
make clean
|
||||
```
|
||||
|
||||
### 3. Сборка Go-модуля
|
||||
|
||||
```bash
|
||||
go build ./...
|
||||
```
|
||||
|
||||
Или пример:
|
||||
|
||||
```bash
|
||||
go build -o llama-example ./examples/
|
||||
go run ./examples/main.go /path/to/model.gguf "Привет, мир"
|
||||
```
|
||||
|
||||
При первой сборке CGO скомпилирует `binding.cpp` ещё раз и слинкует его с библиотеками из `LLAMA_BUILD_PATH` (см. `cgo_flags.go`).
|
||||
|
||||
## Использование в своём проекте
|
||||
|
||||
```go
|
||||
import llama "go-llama-new.cpp"
|
||||
|
||||
func main() {
|
||||
model, err := llama.New("/path/to/model.gguf",
|
||||
llama.SetContext(4096),
|
||||
llama.SetGPULayers(0),
|
||||
)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
defer model.Free()
|
||||
|
||||
text, err := model.Predict("Привет",
|
||||
llama.SetTokens(128),
|
||||
llama.SetTemperature(0.8),
|
||||
)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
println(text)
|
||||
}
|
||||
```
|
||||
|
||||
В `go.mod` вашего проекта:
|
||||
|
||||
```go
|
||||
require go-llama-new.cpp v0.0.0
|
||||
|
||||
replace go-llama-new.cpp => /path/to/go-llama-new.cpp
|
||||
```
|
||||
|
||||
Перед `go build` в проекте-потребителе должны быть собраны llama.cpp и `libbinding.a` (шаги 1–2 выше).
|
||||
|
||||
## Опциональные теги сборки
|
||||
|
||||
Как в оригинальном go-llama.cpp:
|
||||
|
||||
| Тег | Назначение |
|
||||
|-----|------------|
|
||||
| `openblas` | Дополнительная линковка с OpenBLAS (`llama_openblas.go`) |
|
||||
| `cublas` | CUDA (`llama_cublas.go`) — требует отдельной сборки llama.cpp с `GGML_CUDA=ON` |
|
||||
|
||||
Пример:
|
||||
|
||||
```bash
|
||||
go build -tags openblas ./...
|
||||
```
|
||||
|
||||
Для GPU нужно пересобрать llama.cpp с нужными опциями CMake (например `-DGGML_CUDA=ON`) и убедиться, что пути в `build.conf` указывают на эту сборку.
|
||||
|
||||
## Устранение неполадок
|
||||
|
||||
### `неопределённая ссылка на llama_compiler` / `llama_commit` / `llama_build_number`
|
||||
|
||||
Не слинкована `libllama-common-base.a`. Убедитесь, что в `cgo_flags.go` в `LDFLAGS` есть `-lllama-common-base`, и пересоберите:
|
||||
|
||||
```bash
|
||||
make
|
||||
go build ./...
|
||||
```
|
||||
|
||||
### `cannot find -lllama` или `-lllama-common`
|
||||
|
||||
Проверьте `LLAMA_BUILD_PATH` в `build.conf` и выполните сборку llama.cpp (шаг 1).
|
||||
|
||||
### CGO отключён
|
||||
|
||||
```bash
|
||||
go env CGO_ENABLED # должно быть 1
|
||||
```
|
||||
|
||||
Установите `gcc`/`g++`, если CGO выключен из-за отсутствия компилятора C.
|
||||
|
||||
### Изменили путь к llama.cpp
|
||||
|
||||
1. Обновите `build.conf`
|
||||
2. `make` (обновит `cgo_flags.go` и `libbinding.a`)
|
||||
3. `go build ./...`
|
||||
|
||||
## Структура репозитория
|
||||
|
||||
```
|
||||
.
|
||||
├── build.conf # пути к llama.cpp
|
||||
├── binding.h
|
||||
├── binding.cpp # C API для CGO
|
||||
├── cgo_flags.go # флаги CGO (генерируется make)
|
||||
├── llama.go
|
||||
├── options.go
|
||||
├── Makefile
|
||||
├── examples/main.go
|
||||
└── README.md
|
||||
```
|
||||
|
||||
## Лицензия
|
||||
|
||||
Следует лицензиям llama.cpp и исходного go-llama.cpp. Используйте в соответствии с условиями соответствующих проектов.
|
||||
719
binding.cpp
Normal file
719
binding.cpp
Normal file
@ -0,0 +1,719 @@
|
||||
#include "binding.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "sampling.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <regex>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
struct llama_binding_state {
|
||||
common_init_result_ptr init;
|
||||
llama_model * model = nullptr;
|
||||
llama_context * ctx = nullptr;
|
||||
common_sampler * smpl = nullptr;
|
||||
bool embeddings = false;
|
||||
};
|
||||
|
||||
// Converts the opaque handle received through the C API back into the typed
// binding state. A null handle yields a null pointer.
static llama_binding_state * binding_state(void * state_pr) {
    auto * typed = static_cast<llama_binding_state *>(state_pr);
    return typed;
}
|
||||
|
||||
// Parses a tensor-split specification into `out`.
//
// `tensorsplit` is a list of numbers separated by ',' and/or '/' (runs of
// separators count as one). All `n` slots of `out` are zeroed first; the i-th
// parsed value is stored in out[i], and values beyond `n` are ignored.
// A null or empty specification leaves `out` zeroed.
//
// Tokens that are empty or not parseable as a float (e.g. the empty token
// produced by a leading separator) are stored as 0.0f instead of letting
// std::stof throw: this helper is reached from extern "C" entry points, and an
// exception must not propagate through them.
static void parse_tensor_split(const char * tensorsplit, float * out, size_t n) {
    for (size_t i = 0; i < n; ++i) {
        out[i] = 0.0f;
    }
    if (tensorsplit == nullptr || tensorsplit[0] == '\0') {
        return;
    }

    const std::string spec = tensorsplit;
    const std::regex separators{R"([,/]+)"};
    // Submatch index -1 iterates over the text between separator matches.
    std::sregex_token_iterator it{spec.begin(), spec.end(), separators, -1};
    const std::sregex_token_iterator end;

    for (size_t i = 0; it != end && i < n; ++it, ++i) {
        try {
            out[i] = std::stof(it->str());
        } catch (...) {
            out[i] = 0.0f;
        }
    }
}
|
||||
|
||||
// Copies the model-loading options received from the C API into `params`.
//
// Non-positive n_ctx / n_batch / rope values and a negative seed leave the
// corresponding defaults of `params` untouched. `lora_base` is accepted for
// signature compatibility and ignored.
static void apply_model_load_options(
        common_params & params,
        int n_ctx,
        int n_seed,
        bool memory_f16,
        bool mlock,
        bool embeddings,
        bool mmap,
        int n_gpu,
        int n_batch,
        const char * maingpu,
        const char * tensorsplit,
        bool numa,
        float rope_freq_base,
        float rope_freq_scale,
        const char * lora,
        const char * lora_base,
        bool perplexity) {
    (void) lora_base;

    if (n_ctx > 0) {
        params.n_ctx = n_ctx;
    }
    if (n_seed >= 0) {
        params.sampling.seed = (uint32_t) n_seed;
    }
    params.use_mlock    = mlock;
    params.embedding    = embeddings;
    params.use_mmap     = mmap;
    params.n_gpu_layers = n_gpu;
    params.n_batch      = n_batch > 0 ? n_batch : params.n_batch;
    // The micro-batch may never exceed the logical batch.
    params.n_ubatch     = std::min(params.n_batch, params.n_ubatch);
    params.numa         = numa ? GGML_NUMA_STRATEGY_DISTRIBUTE : GGML_NUMA_STRATEGY_DISABLED;
    params.warmup       = false;
    params.fit_params   = false;

    if (rope_freq_base > 0.0f) {
        params.rope_freq_base = rope_freq_base;
    }
    if (rope_freq_scale > 0.0f) {
        params.rope_freq_scale = rope_freq_scale;
    }

    if (memory_f16) {
        params.cache_type_k = GGML_TYPE_F16;
        params.cache_type_v = GGML_TYPE_F16;
    }

    if (maingpu != nullptr && maingpu[0] != '\0') {
        // std::stoi throws on non-numeric or out-of-range text. This helper is
        // reached from an extern "C" entry point, so keep the default device
        // instead of letting the exception escape.
        try {
            params.main_gpu = std::stoi(maingpu);
        } catch (...) {
            // main_gpu keeps its default value
        }
    }

    parse_tensor_split(tensorsplit, params.tensor_split, sizeof(params.tensor_split) / sizeof(params.tensor_split[0]));

    if (perplexity) {
        params.compute_ppl = true;
    }

    if (lora != nullptr && lora[0] != '\0') {
        common_adapter_lora_info la;
        la.path  = lora;
        la.scale = 1.0f;
        params.lora_adapters.push_back(la);
    }
}
|
||||
|
||||
// Reports whether any non-empty stop string occurs near the end of `output`.
//
// For each stop string only the tail of `output` is searched: the last
// stop.length() characters in interactive mode, plus two extra characters
// otherwise. If `output` is not longer than that window, all of it is searched.
static bool check_antiprompt(
        const std::string & output,
        const std::vector<std::string> & antiprompt,
        bool interactive) {
    const size_t slack = interactive ? 0 : 2;

    return std::any_of(antiprompt.begin(), antiprompt.end(), [&](const std::string & stop) {
        if (stop.empty()) {
            return false;
        }
        const size_t window = stop.length() + slack;
        const size_t from   = output.length() > window ? output.length() - window : 0;
        return output.find(stop, from) != std::string::npos;
    });
}
|
||||
|
||||
extern "C" {
|
||||
|
||||
void * load_model(
|
||||
const char * fname,
|
||||
int n_ctx,
|
||||
int n_seed,
|
||||
bool memory_f16,
|
||||
bool mlock,
|
||||
bool embeddings,
|
||||
bool mmap,
|
||||
bool low_vram,
|
||||
int n_gpu,
|
||||
int n_batch,
|
||||
const char * maingpu,
|
||||
const char * tensorsplit,
|
||||
bool numa,
|
||||
float rope_freq_base,
|
||||
float rope_freq_scale,
|
||||
bool mul_mat_q,
|
||||
const char * lora,
|
||||
const char * lora_base,
|
||||
bool perplexity) {
|
||||
(void) low_vram;
|
||||
(void) mul_mat_q;
|
||||
|
||||
common_init();
|
||||
llama_backend_init();
|
||||
|
||||
common_params params;
|
||||
params.model.path = fname;
|
||||
|
||||
apply_model_load_options(
|
||||
params, n_ctx, n_seed, memory_f16, mlock, embeddings, mmap,
|
||||
n_gpu, n_batch, maingpu, tensorsplit, numa,
|
||||
rope_freq_base, rope_freq_scale, lora, lora_base, perplexity);
|
||||
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
auto * binding = new llama_binding_state();
|
||||
binding->init = common_init_from_params(params);
|
||||
if (!binding->init || binding->init->context() == nullptr) {
|
||||
delete binding;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
binding->model = binding->init->model();
|
||||
binding->ctx = binding->init->context();
|
||||
binding->smpl = binding->init->sampler(0);
|
||||
binding->embeddings = embeddings;
|
||||
|
||||
return binding;
|
||||
}
|
||||
|
||||
void llama_binding_free_model(void * state_pr) {
|
||||
delete binding_state(state_pr);
|
||||
}
|
||||
|
||||
int load_state(void * state_pr, char * statefile, char * modes) {
|
||||
(void) modes;
|
||||
auto * state = binding_state(state_pr);
|
||||
if (state == nullptr || state->ctx == nullptr) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::vector<llama_token> tokens(llama_n_ctx(state->ctx));
|
||||
size_t n_out = 0;
|
||||
if (!llama_state_load_file(state->ctx, statefile, tokens.data(), tokens.size(), &n_out)) {
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
void save_state(void * state_pr, char * dst, char * modes) {
|
||||
(void) modes;
|
||||
auto * state = binding_state(state_pr);
|
||||
if (state == nullptr || state->ctx == nullptr) {
|
||||
return;
|
||||
}
|
||||
llama_state_save_file(state->ctx, dst, nullptr, 0);
|
||||
}
|
||||
|
||||
void * llama_allocate_params(
|
||||
const char * prompt,
|
||||
int seed,
|
||||
int threads,
|
||||
int tokens,
|
||||
int top_k,
|
||||
float top_p,
|
||||
float temp,
|
||||
float repeat_penalty,
|
||||
int repeat_last_n,
|
||||
bool ignore_eos,
|
||||
bool memory_f16,
|
||||
int n_batch,
|
||||
int n_keep,
|
||||
const char ** antiprompt,
|
||||
int antiprompt_count,
|
||||
float tfs_z,
|
||||
float typical_p,
|
||||
float frequency_penalty,
|
||||
float presence_penalty,
|
||||
int mirostat,
|
||||
float mirostat_eta,
|
||||
float mirostat_tau,
|
||||
bool penalize_nl,
|
||||
const char * logit_bias,
|
||||
const char * session_file,
|
||||
bool prompt_cache_all,
|
||||
bool mlock,
|
||||
bool mmap,
|
||||
const char * maingpu,
|
||||
const char * tensorsplit,
|
||||
bool prompt_cache_ro,
|
||||
const char * grammar,
|
||||
float rope_freq_base,
|
||||
float rope_freq_scale,
|
||||
float negative_prompt_scale,
|
||||
const char * negative_prompt,
|
||||
int n_draft) {
|
||||
(void) tfs_z;
|
||||
(void) penalize_nl;
|
||||
(void) negative_prompt_scale;
|
||||
(void) negative_prompt;
|
||||
(void) memory_f16;
|
||||
|
||||
auto * params = new common_params();
|
||||
params->prompt = prompt != nullptr ? prompt : "";
|
||||
params->n_predict = tokens;
|
||||
params->n_batch = n_batch > 0 ? n_batch : params->n_batch;
|
||||
params->n_keep = n_keep;
|
||||
params->use_mlock = mlock;
|
||||
params->use_mmap = mmap;
|
||||
params->path_prompt_cache = session_file != nullptr ? session_file : "";
|
||||
params->prompt_cache_all = prompt_cache_all;
|
||||
params->prompt_cache_ro = prompt_cache_ro;
|
||||
|
||||
if (rope_freq_base > 0.0f) {
|
||||
params->rope_freq_base = rope_freq_base;
|
||||
}
|
||||
if (rope_freq_scale > 0.0f) {
|
||||
params->rope_freq_scale = rope_freq_scale;
|
||||
}
|
||||
|
||||
params->sampling.seed = seed >= 0 ? (uint32_t) seed : LLAMA_DEFAULT_SEED;
|
||||
params->cpuparams.n_threads = threads > 0 ? threads : 4;
|
||||
params->cpuparams_batch.n_threads = params->cpuparams.n_threads;
|
||||
params->sampling.top_k = top_k;
|
||||
params->sampling.top_p = top_p;
|
||||
params->sampling.temp = temp;
|
||||
params->sampling.penalty_repeat = repeat_penalty;
|
||||
params->sampling.penalty_last_n = repeat_last_n;
|
||||
params->sampling.penalty_freq = frequency_penalty;
|
||||
params->sampling.penalty_present = presence_penalty;
|
||||
params->sampling.typ_p = typical_p > 0 ? typical_p : 1.0f;
|
||||
params->sampling.mirostat = mirostat;
|
||||
params->sampling.mirostat_eta = mirostat_eta;
|
||||
params->sampling.mirostat_tau = mirostat_tau;
|
||||
params->sampling.ignore_eos = ignore_eos;
|
||||
|
||||
if (grammar != nullptr && grammar[0] != '\0') {
|
||||
params->sampling.grammar = common_grammar(COMMON_GRAMMAR_TYPE_USER, grammar);
|
||||
}
|
||||
|
||||
if (maingpu != nullptr && maingpu[0] != '\0') {
|
||||
params->main_gpu = std::stoi(maingpu);
|
||||
}
|
||||
parse_tensor_split(tensorsplit, params->tensor_split, sizeof(params->tensor_split) / sizeof(params->tensor_split[0]));
|
||||
|
||||
if (antiprompt_count > 0 && antiprompt != nullptr) {
|
||||
params->antiprompt = create_vector(antiprompt, antiprompt_count);
|
||||
}
|
||||
|
||||
if (logit_bias != nullptr && logit_bias[0] != '\0') {
|
||||
std::stringstream ss(logit_bias);
|
||||
llama_token key;
|
||||
char sign = 0;
|
||||
std::string value_str;
|
||||
if (ss >> key >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
|
||||
params->sampling.logit_bias.push_back({key, std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f)});
|
||||
}
|
||||
}
|
||||
|
||||
params->speculative.draft.n_max = n_draft > 0 ? n_draft : params->speculative.draft.n_max;
|
||||
|
||||
return params;
|
||||
}
|
||||
|
||||
void llama_free_params(void * params_ptr) {
|
||||
delete static_cast<common_params *>(params_ptr);
|
||||
}
|
||||
|
||||
int eval(void * params_ptr, void * state_pr, char * text) {
|
||||
auto * params = static_cast<common_params *>(params_ptr);
|
||||
auto * state = binding_state(state_pr);
|
||||
if (state == nullptr || state->ctx == nullptr) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::string str = text != nullptr ? text : params->prompt;
|
||||
auto embd = common_tokenize(state->ctx, str, true, true);
|
||||
if (embd.empty()) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
int n_past = 0;
|
||||
if (!common_prompt_batch_decode(state->ctx, embd, n_past, params->n_batch, "", false)) {
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int get_embeddings(void * params_ptr, void * state_pr, float * res_embeddings) {
|
||||
auto * params = static_cast<common_params *>(params_ptr);
|
||||
auto * state = binding_state(state_pr);
|
||||
if (state == nullptr || state->ctx == nullptr || !state->embeddings) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
auto embd = common_tokenize(state->ctx, params->prompt, true, true);
|
||||
if (!embd.empty()) {
|
||||
int n_past = 0;
|
||||
if (!common_prompt_batch_decode(state->ctx, embd, n_past, params->n_batch, "", false)) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
const int n_embd = llama_model_n_embd(state->model);
|
||||
const float * emb = llama_get_embeddings_ith(state->ctx, -1);
|
||||
if (emb == nullptr) {
|
||||
emb = llama_get_embeddings(state->ctx);
|
||||
}
|
||||
if (emb == nullptr) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_embd; ++i) {
|
||||
res_embeddings[i] = emb[i];
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int get_token_embeddings(void * params_ptr, void * state_pr, int * tokens, int tokenSize, float * res_embeddings) {
|
||||
auto * params = static_cast<common_params *>(params_ptr);
|
||||
auto * state = binding_state(state_pr);
|
||||
if (state == nullptr || state->ctx == nullptr) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::string text;
|
||||
for (int i = 0; i < tokenSize; ++i) {
|
||||
text += common_token_to_piece(state->ctx, tokens[i]);
|
||||
}
|
||||
params->prompt = text;
|
||||
return get_embeddings(params_ptr, state_pr, res_embeddings);
|
||||
}
|
||||
|
||||
int llama_tokenize_string(void * params_ptr, void * state_pr, int * result) {
|
||||
auto * params = static_cast<common_params *>(params_ptr);
|
||||
auto * state = binding_state(state_pr);
|
||||
if (state == nullptr || state->ctx == nullptr) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
const llama_vocab * vocab = llama_model_get_vocab(state->model);
|
||||
const bool add_bos = llama_vocab_get_add_bos(vocab);
|
||||
const int32_t max_tokens = params->n_ctx > 0 ? params->n_ctx : 4096;
|
||||
|
||||
return llama_tokenize(
|
||||
vocab,
|
||||
params->prompt.c_str(),
|
||||
(int32_t) params->prompt.size(),
|
||||
reinterpret_cast<llama_token *>(result),
|
||||
max_tokens,
|
||||
add_bos,
|
||||
true);
|
||||
}
|
||||
|
||||
int llama_predict(void * params_ptr, void * state_pr, char * result, bool debug) {
|
||||
auto * params = static_cast<common_params *>(params_ptr);
|
||||
auto * state = binding_state(state_pr);
|
||||
if (state == nullptr || state->ctx == nullptr || state->smpl == nullptr) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_context * ctx = state->ctx;
|
||||
llama_model * model = state->model;
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
llama_memory_t mem = llama_get_memory(ctx);
|
||||
|
||||
common_sampler_ptr smpl_ptr(common_sampler_init(model, params->sampling));
|
||||
if (!smpl_ptr) {
|
||||
return 1;
|
||||
}
|
||||
common_sampler * smpl = smpl_ptr.get();
|
||||
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
if (params->n_predict < 0) {
|
||||
params->n_predict = 128;
|
||||
}
|
||||
|
||||
llama_set_n_threads(ctx, params->cpuparams.n_threads, params->cpuparams_batch.n_threads);
|
||||
|
||||
std::string path_session = params->path_prompt_cache;
|
||||
std::vector<llama_token> session_tokens;
|
||||
|
||||
if (!path_session.empty()) {
|
||||
session_tokens.resize(n_ctx);
|
||||
size_t n_out = 0;
|
||||
if (std::ifstream(path_session).good()) {
|
||||
llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size(), &n_out);
|
||||
session_tokens.resize(n_out);
|
||||
}
|
||||
}
|
||||
|
||||
const bool add_bos = llama_vocab_get_add_bos(vocab);
|
||||
std::vector<llama_token> embd_inp = common_tokenize(ctx, params->prompt, add_bos, true);
|
||||
if (embd_inp.empty()) {
|
||||
embd_inp.push_back(llama_vocab_bos(vocab));
|
||||
}
|
||||
|
||||
if ((int) embd_inp.size() > n_ctx - 4) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (params->n_keep < 0 || params->n_keep > (int) embd_inp.size()) {
|
||||
params->n_keep = (int) embd_inp.size();
|
||||
}
|
||||
|
||||
common_sampler_reset(smpl);
|
||||
|
||||
int n_past = 0;
|
||||
int n_remain = params->n_predict;
|
||||
int n_consumed = 0;
|
||||
int n_session_consumed = 0;
|
||||
bool is_antiprompt = false;
|
||||
bool need_save_session = !path_session.empty() && !params->prompt_cache_ro;
|
||||
|
||||
std::vector<llama_token> embd;
|
||||
std::string res;
|
||||
|
||||
while (n_remain > 0 && !is_antiprompt) {
|
||||
if (!embd.empty()) {
|
||||
const int max_embd_size = n_ctx - 4;
|
||||
if ((int) embd.size() > max_embd_size) {
|
||||
embd.resize(max_embd_size);
|
||||
}
|
||||
|
||||
if (n_past + (int) embd.size() >= n_ctx) {
|
||||
const int n_left = n_past - params->n_keep;
|
||||
const int n_discard = n_left / 2;
|
||||
llama_memory_seq_rm(mem, 0, params->n_keep, params->n_keep + n_discard);
|
||||
llama_memory_seq_add(mem, 0, params->n_keep + n_discard, n_past, -n_discard);
|
||||
n_past -= n_discard;
|
||||
path_session.clear();
|
||||
}
|
||||
|
||||
if (n_session_consumed < (int) session_tokens.size()) {
|
||||
size_t i = 0;
|
||||
for (; i < embd.size(); ++i) {
|
||||
if (embd[i] != session_tokens[n_session_consumed]) {
|
||||
session_tokens.resize(n_session_consumed);
|
||||
break;
|
||||
}
|
||||
n_past++;
|
||||
n_session_consumed++;
|
||||
if (n_session_consumed >= (int) session_tokens.size()) {
|
||||
++i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i > 0) {
|
||||
embd.erase(embd.begin(), embd.begin() + i);
|
||||
}
|
||||
}
|
||||
|
||||
if (!embd.empty()) {
|
||||
const bool save_now = need_save_session && n_consumed >= (int) embd_inp.size();
|
||||
if (!common_prompt_batch_decode(ctx, embd, n_past, params->n_batch, path_session, save_now)) {
|
||||
return 1;
|
||||
}
|
||||
session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
|
||||
n_session_consumed = session_tokens.size();
|
||||
need_save_session = false;
|
||||
}
|
||||
}
|
||||
|
||||
embd.clear();
|
||||
|
||||
if ((int) embd_inp.size() <= n_consumed) {
|
||||
const llama_token id = common_sampler_sample(smpl, ctx, -1);
|
||||
common_sampler_accept(smpl, id, true);
|
||||
embd.push_back(id);
|
||||
|
||||
auto piece = common_token_to_piece(ctx, id);
|
||||
if (!tokenCallback(state_pr, const_cast<char *>(piece.c_str()))) {
|
||||
break;
|
||||
}
|
||||
|
||||
res += piece;
|
||||
--n_remain;
|
||||
|
||||
if (llama_vocab_is_eog(vocab, id)) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
while ((int) embd_inp.size() > n_consumed) {
|
||||
embd.push_back(embd_inp[n_consumed]);
|
||||
common_sampler_accept(smpl, embd_inp[n_consumed], false);
|
||||
++n_consumed;
|
||||
if ((int) embd.size() >= params->n_batch) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto id : embd) {
|
||||
res += common_token_to_piece(ctx, id);
|
||||
}
|
||||
|
||||
if ((int) embd_inp.size() <= n_consumed && !params->antiprompt.empty()) {
|
||||
is_antiprompt = check_antiprompt(res, params->antiprompt, false);
|
||||
}
|
||||
}
|
||||
|
||||
if (!path_session.empty() && params->prompt_cache_all && !params->prompt_cache_ro) {
|
||||
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
|
||||
}
|
||||
|
||||
if (debug) {
|
||||
common_perf_print(ctx, smpl);
|
||||
}
|
||||
|
||||
if (result != nullptr) {
|
||||
std::strncpy(result, res.c_str(), params->n_predict > 0 ? (size_t) params->n_predict : res.size());
|
||||
result[params->n_predict > 0 ? params->n_predict - 1 : res.size()] = '\0';
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int speculative_sampling(void * params_ptr, void * target_model, void * draft_model, char * result, bool debug) {
|
||||
auto * params = static_cast<common_params *>(params_ptr);
|
||||
auto * tgt = binding_state(target_model);
|
||||
auto * dft = binding_state(draft_model);
|
||||
if (tgt == nullptr || dft == nullptr || tgt->ctx == nullptr || dft->ctx == nullptr) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_context * ctx_tgt = tgt->ctx;
|
||||
llama_context * ctx_dft = dft->ctx;
|
||||
const llama_vocab * vocab = llama_model_get_vocab(tgt->model);
|
||||
|
||||
common_sampler_ptr smpl_ptr(common_sampler_init(tgt->model, params->sampling));
|
||||
if (!smpl_ptr) {
|
||||
return 1;
|
||||
}
|
||||
common_sampler * smpl_tgt = smpl_ptr.get();
|
||||
|
||||
auto inp = common_tokenize(ctx_tgt, params->prompt, true, true);
|
||||
const int max_tokens = llama_n_ctx(ctx_tgt) - 4;
|
||||
if ((int) inp.size() > max_tokens) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
int n_past_tgt = 0;
|
||||
int n_past_dft = 0;
|
||||
if (!inp.empty()) {
|
||||
if (!common_prompt_batch_decode(ctx_tgt, inp, n_past_tgt, params->n_batch, "", false)) {
|
||||
return 1;
|
||||
}
|
||||
if (!common_prompt_batch_decode(ctx_dft, inp, n_past_dft, params->n_batch, "", false)) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
const int n_draft = params->speculative.draft.n_max > 0 ? params->speculative.draft.n_max : 16;
|
||||
int n_predict = 0;
|
||||
std::string res;
|
||||
bool has_eos = false;
|
||||
|
||||
std::vector<llama_token> drafted;
|
||||
std::vector<llama_token> last_tokens(llama_n_ctx(ctx_tgt), 0);
|
||||
for (auto id : inp) {
|
||||
last_tokens.erase(last_tokens.begin());
|
||||
last_tokens.push_back(id);
|
||||
}
|
||||
|
||||
while (n_predict < params->n_predict && !has_eos) {
|
||||
int i_dft = 0;
|
||||
while (true) {
|
||||
const llama_token id = common_sampler_sample(smpl_tgt, ctx_tgt, -1);
|
||||
common_sampler_accept(smpl_tgt, id, true);
|
||||
|
||||
last_tokens.erase(last_tokens.begin());
|
||||
last_tokens.push_back(id);
|
||||
|
||||
auto piece = common_token_to_piece(ctx_tgt, id);
|
||||
if (!tokenCallback(draft_model, const_cast<char *>(piece.c_str()))) {
|
||||
break;
|
||||
}
|
||||
res += piece;
|
||||
|
||||
if (llama_vocab_is_eog(vocab, id)) {
|
||||
has_eos = true;
|
||||
}
|
||||
|
||||
++n_predict;
|
||||
|
||||
if (i_dft < (int) drafted.size() && id == drafted[i_dft]) {
|
||||
++i_dft;
|
||||
continue;
|
||||
}
|
||||
|
||||
llama_token dft_id = id;
|
||||
llama_batch batch = llama_batch_get_one(&dft_id, 1);
|
||||
if (llama_decode(ctx_dft, batch) != 0) {
|
||||
return 1;
|
||||
}
|
||||
++n_past_dft;
|
||||
|
||||
drafted.clear();
|
||||
drafted.push_back(id);
|
||||
break;
|
||||
}
|
||||
|
||||
if (n_predict >= params->n_predict || has_eos) {
|
||||
break;
|
||||
}
|
||||
|
||||
int n_past_cur = n_past_dft;
|
||||
for (int i = 0; i < n_draft; ++i) {
|
||||
float * logits = llama_get_logits(ctx_dft);
|
||||
const int n_vocab = llama_vocab_n_tokens(vocab);
|
||||
|
||||
llama_token draft_id = 0;
|
||||
float max_logit = logits[0];
|
||||
for (llama_token t = 1; t < n_vocab; ++t) {
|
||||
if (logits[t] > max_logit) {
|
||||
max_logit = logits[t];
|
||||
draft_id = t;
|
||||
}
|
||||
}
|
||||
drafted.push_back(draft_id);
|
||||
|
||||
if (i == n_draft - 1) {
|
||||
break;
|
||||
}
|
||||
|
||||
llama_batch batch = llama_batch_get_one(&draft_id, 1);
|
||||
if (llama_decode(ctx_dft, batch) != 0) {
|
||||
return 1;
|
||||
}
|
||||
++n_past_cur;
|
||||
}
|
||||
|
||||
llama_batch batch = llama_batch_get_one(drafted.data(), (int32_t) drafted.size());
|
||||
if (llama_decode(ctx_tgt, batch) != 0) {
|
||||
return 1;
|
||||
}
|
||||
++n_past_tgt;
|
||||
|
||||
if (!drafted.empty()) {
|
||||
drafted.erase(drafted.begin());
|
||||
}
|
||||
}
|
||||
|
||||
if (debug) {
|
||||
common_perf_print(ctx_tgt, smpl_tgt);
|
||||
common_perf_print(ctx_dft, nullptr);
|
||||
}
|
||||
|
||||
if (result != nullptr) {
|
||||
std::strncpy(result, res.c_str(), params->n_predict > 0 ? (size_t) params->n_predict : res.size());
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
} // extern "C"
|
||||
|
||||
// Copies `count` C strings into a std::vector<std::string>.
//
// Returns an empty vector when `strings` is null or `count` is not positive.
// A null entry becomes an empty string (constructing std::string from a null
// pointer is undefined behaviour), so the result always has `count` elements
// and indices stay aligned with the input array.
std::vector<std::string> create_vector(const char ** strings, int count) {
    std::vector<std::string> vec;
    if (strings == nullptr || count <= 0) {
        return vec;
    }

    vec.reserve(static_cast<size_t>(count));
    for (int i = 0; i < count; ++i) {
        vec.emplace_back(strings[i] != nullptr ? strings[i] : "");
    }
    return vec;
}
|
||||
|
||||
// Frees a heap-allocated string vector. A null pointer is ignored.
void delete_vector(std::vector<std::string> * vec) {
    if (vec == nullptr) {
        return;
    }
    delete vec;
}
|
||||
63
binding.h
Normal file
63
binding.h
Normal file
@ -0,0 +1,63 @@
|
||||
/*
 * C API of the llama.cpp binding, shared by binding.cpp and the cgo preamble.
 *
 * The include guard makes the header safe to include more than once in the
 * same translation unit.
 */
#ifndef BINDING_H
#define BINDING_H

#ifdef __cplusplus
#include <vector>
#include <string>
extern "C" {
#endif

#include <stdbool.h>

/*
 * Per-token callback; the definition is not part of this header. The binding
 * stops generating when it returns 0. The first argument is the model handle
 * the binding passes through, the second the text piece of the token.
 */
extern unsigned char tokenCallback(void *, char *);

/* Restores context state from `statefile`; returns 0 on success, 1 on failure. */
int load_state(void *ctx, char *statefile, char*modes);

/* Decodes `text` (or the prompt stored in the params); returns 0 on success. */
int eval(void* params_ptr, void *ctx, char*text);

/* Saves context state to `dst`. */
void save_state(void *ctx, char *dst, char*modes);

/* Loads a model; returns an opaque handle, or NULL on failure. */
void* load_model(const char *fname,
                 int n_ctx,
                 int n_seed,
                 bool memory_f16,
                 bool mlock,
                 bool embeddings,
                 bool mmap,
                 bool low_vram,
                 int n_gpu,
                 int n_batch,
                 const char *maingpu,
                 const char *tensorsplit,
                 bool numa,
                 float rope_freq_base,
                 float rope_freq_scale,
                 bool mul_mat_q, const char *lora, const char *lora_base, bool perplexity
                 );

/* Writes the prompt embedding to `res_embeddings`; returns 0 on success. */
int get_embeddings(void* params_ptr, void* state_pr, float * res_embeddings);

/* Same as get_embeddings(), for an explicit token sequence. */
int get_token_embeddings(void* params_ptr, void* state_pr, int *tokens, int tokenSize, float * res_embeddings);

/* Allocates a parameter object; release it with llama_free_params(). */
void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens,
                            int top_k, float top_p, float temp, float repeat_penalty,
                            int repeat_last_n, bool ignore_eos, bool memory_f16,
                            int n_batch, int n_keep, const char** antiprompt, int antiprompt_count,
                            float tfs_z, float typical_p, float frequency_penalty, float presence_penalty, int mirostat, float mirostat_eta, float mirostat_tau, bool penalize_nl, const char *logit_bias, const char *session_file, bool prompt_cache_all, bool mlock, bool mmap, const char *maingpu, const char *tensorsplit ,
                            bool prompt_cache_ro, const char *grammar, float rope_freq_base, float rope_freq_scale, float negative_prompt_scale, const char* negative_prompt,
                            int n_draft);

/* Generates text with a target model assisted by a draft model. */
int speculative_sampling(void* params_ptr, void* target_model, void* draft_model, char* result, bool debug);

/* Frees an object returned by llama_allocate_params(). */
void llama_free_params(void* params_ptr);

/* Frees a handle returned by load_model(). */
void llama_binding_free_model(void* state);

/* Tokenizes the prompt stored in the params into `result`. */
int llama_tokenize_string(void* params_ptr, void* state_pr, int* result);

/* Generates text for the prompt stored in the params into `result`. */
int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug);

#ifdef __cplusplus
}


/* C++-only helpers for converting C string arrays. */
std::vector<std::string> create_vector(const char** strings, int count);
void delete_vector(std::vector<std::string>* vec);
#endif

#endif /* BINDING_H */
|
||||
3
build.conf
Normal file
3
build.conf
Normal file
@ -0,0 +1,3 @@
|
||||
# Пути к исходникам llama.cpp (без переменных окружения)
|
||||
LLAMA_CPP_PATH=/home/admin/cpp/llama.cpp
|
||||
LLAMA_BUILD_PATH=/home/admin/cpp/llama.cpp/build
|
||||
10
cgo_flags.go
Normal file
10
cgo_flags.go
Normal file
@ -0,0 +1,10 @@
|
||||
package llama
|
||||
|
||||
// CGO flags derived from build.conf (paths under /home/admin/cpp/llama.cpp).
// After changing a path in build.conf, regenerate this file with `make cgo_flags.go` or edit it to match.
|
||||
|
||||
/*
|
||||
#cgo CXXFLAGS: -std=c++17 -I/home/admin/cpp/llama.cpp/include -I/home/admin/cpp/llama.cpp/common -I/home/admin/cpp/llama.cpp/ggml/include -I${SRCDIR}
|
||||
#cgo LDFLAGS: -L${SRCDIR} -lbinding -L/home/admin/cpp/llama.cpp/build/src -lllama -L/home/admin/cpp/llama.cpp/build/common -lllama-common -lllama-common-base -L/home/admin/cpp/llama.cpp/build/ggml/src -lggml -lggml-cpu -lggml-base -L/home/admin/cpp/llama.cpp/build/vendor/cpp-httplib -lcpp-httplib -lstdc++ -lm -lpthread -fopenmp -ldl
|
||||
*/
|
||||
import "C"
|
||||
48
examples/main.go
Normal file
48
examples/main.go
Normal file
@ -0,0 +1,48 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
llama "go-llama-new.cpp"
|
||||
)
|
||||
|
||||
func main() {
|
||||
if len(os.Args) < 2 {
|
||||
fmt.Fprintf(os.Stderr, "usage: %s <model.gguf> [prompt]\n", os.Args[0])
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
modelPath := os.Args[1]
|
||||
prompt := "Hello"
|
||||
if len(os.Args) > 2 {
|
||||
prompt = strings.Join(os.Args[2:], " ")
|
||||
}
|
||||
|
||||
l, err := llama.New(modelPath, llama.SetContext(512), llama.SetGPULayers(0))
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "load model: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
defer l.Free()
|
||||
|
||||
out, err := l.Predict(prompt, llama.SetTokens(64), llama.SetThreads(4))
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "predict: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
fmt.Println(out)
|
||||
|
||||
reader := bufio.NewReader(os.Stdin)
|
||||
fmt.Print("\nТокенизация (введите текст): ")
|
||||
line, _ := reader.ReadString('\n')
|
||||
_, tokens, err := l.TokenizeString(strings.TrimSpace(line))
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "tokenize: %v\n", err)
|
||||
return
|
||||
}
|
||||
fmt.Printf("токенов: %d, ids: %v\n", len(tokens), tokens)
|
||||
}
|
||||
409
llama.go
Normal file
409
llama.go
Normal file
@ -0,0 +1,409 @@
|
||||
package llama
|
||||
|
||||
// #include "binding.h"
|
||||
// #include <stdlib.h>
|
||||
import "C"
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
// LLama is a handle to a model loaded through the native binding.
type LLama struct {
	// state is the opaque pointer returned by load_model.
	state unsafe.Pointer
	// embeddings records whether the model was loaded with embeddings enabled.
	embeddings bool
	// contextSize is the context size the model was loaded with.
	contextSize int
}
|
||||
|
||||
func New(model string, opts ...ModelOption) (*LLama, error) {
|
||||
mo := NewModelOptions(opts...)
|
||||
modelPath := C.CString(model)
|
||||
defer C.free(unsafe.Pointer(modelPath))
|
||||
loraBase := C.CString(mo.LoraBase)
|
||||
defer C.free(unsafe.Pointer(loraBase))
|
||||
loraAdapter := C.CString(mo.LoraAdapter)
|
||||
defer C.free(unsafe.Pointer(loraAdapter))
|
||||
|
||||
MulMatQ := true
|
||||
|
||||
if mo.MulMatQ != nil {
|
||||
MulMatQ = *mo.MulMatQ
|
||||
}
|
||||
|
||||
result := C.load_model(modelPath,
|
||||
C.int(mo.ContextSize), C.int(mo.Seed),
|
||||
C.bool(mo.F16Memory), C.bool(mo.MLock), C.bool(mo.Embeddings), C.bool(mo.MMap), C.bool(mo.LowVRAM),
|
||||
C.int(mo.NGPULayers), C.int(mo.NBatch), C.CString(mo.MainGPU), C.CString(mo.TensorSplit), C.bool(mo.NUMA),
|
||||
C.float(mo.FreqRopeBase), C.float(mo.FreqRopeScale),
|
||||
C.bool(MulMatQ), loraAdapter, loraBase, C.bool(mo.Perplexity),
|
||||
)
|
||||
|
||||
if result == nil {
|
||||
return nil, fmt.Errorf("failed loading model")
|
||||
}
|
||||
|
||||
ll := &LLama{state: result, contextSize: mo.ContextSize, embeddings: mo.Embeddings}
|
||||
return ll, nil
|
||||
}
|
||||
|
||||
func (l *LLama) Free() {
|
||||
C.llama_binding_free_model(l.state)
|
||||
}
|
||||
|
||||
func (l *LLama) LoadState(state string) error {
|
||||
d := C.CString(state)
|
||||
w := C.CString("rb")
|
||||
result := C.load_state(l.state, d, w)
|
||||
|
||||
defer C.free(unsafe.Pointer(d))
|
||||
defer C.free(unsafe.Pointer(w))
|
||||
|
||||
if result != 0 {
|
||||
return fmt.Errorf("error while loading state")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (l *LLama) SaveState(dst string) error {
|
||||
d := C.CString(dst)
|
||||
w := C.CString("wb")
|
||||
|
||||
C.save_state(l.state, d, w)
|
||||
|
||||
defer C.free(unsafe.Pointer(d))
|
||||
defer C.free(unsafe.Pointer(w))
|
||||
|
||||
_, err := os.Stat(dst)
|
||||
return err
|
||||
}
|
||||
|
||||
// Token Embeddings
|
||||
func (l *LLama) TokenEmbeddings(tokens []int, opts ...PredictOption) ([]float32, error) {
|
||||
if !l.embeddings {
|
||||
return []float32{}, fmt.Errorf("model loaded without embeddings")
|
||||
}
|
||||
|
||||
po := NewPredictOptions(opts...)
|
||||
|
||||
outSize := po.Tokens
|
||||
if po.Tokens == 0 {
|
||||
outSize = 9999999
|
||||
}
|
||||
|
||||
floats := make([]float32, outSize)
|
||||
|
||||
myArray := (*C.int)(C.malloc(C.size_t(len(tokens)) * C.sizeof_int))
|
||||
|
||||
for i, v := range tokens {
|
||||
(*[1 << 31]int32)(unsafe.Pointer(myArray))[i] = int32(v)
|
||||
}
|
||||
|
||||
params := C.llama_allocate_params(C.CString(""), C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
|
||||
C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
|
||||
C.bool(po.IgnoreEOS), C.bool(po.F16KV),
|
||||
C.int(po.Batch), C.int(po.NKeep), nil, C.int(0),
|
||||
C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty),
|
||||
C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), C.CString(po.LogitBias),
|
||||
C.CString(po.PathPromptCache), C.bool(po.PromptCacheAll), C.bool(po.MLock), C.bool(po.MMap),
|
||||
C.CString(po.MainGPU), C.CString(po.TensorSplit),
|
||||
C.bool(po.PromptCacheRO),
|
||||
C.CString(po.Grammar),
|
||||
C.float(po.RopeFreqBase), C.float(po.RopeFreqScale), C.float(po.NegativePromptScale), C.CString(po.NegativePrompt),
|
||||
C.int(po.NDraft),
|
||||
)
|
||||
ret := C.get_token_embeddings(params, l.state, myArray, C.int(len(tokens)), (*C.float)(&floats[0]))
|
||||
C.free(unsafe.Pointer(myArray))
|
||||
C.llama_free_params(params)
|
||||
if ret != 0 {
|
||||
return floats, fmt.Errorf("embedding inference failed")
|
||||
}
|
||||
return floats, nil
|
||||
}
|
||||
|
||||
// Embeddings
|
||||
func (l *LLama) Embeddings(text string, opts ...PredictOption) ([]float32, error) {
|
||||
if !l.embeddings {
|
||||
return []float32{}, fmt.Errorf("model loaded without embeddings")
|
||||
}
|
||||
|
||||
po := NewPredictOptions(opts...)
|
||||
|
||||
input := C.CString(text)
|
||||
defer C.free(unsafe.Pointer(input))
|
||||
if po.Tokens == 0 {
|
||||
po.Tokens = 99999999
|
||||
}
|
||||
floats := make([]float32, po.Tokens)
|
||||
reverseCount := len(po.StopPrompts)
|
||||
reversePrompt := make([]*C.char, reverseCount)
|
||||
var pass **C.char
|
||||
for i, s := range po.StopPrompts {
|
||||
cs := C.CString(s)
|
||||
defer C.free(unsafe.Pointer(cs))
|
||||
reversePrompt[i] = cs
|
||||
pass = &reversePrompt[0]
|
||||
}
|
||||
|
||||
params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
|
||||
C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
|
||||
C.bool(po.IgnoreEOS), C.bool(po.F16KV),
|
||||
C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
|
||||
C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty),
|
||||
C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), C.CString(po.LogitBias),
|
||||
C.CString(po.PathPromptCache), C.bool(po.PromptCacheAll), C.bool(po.MLock), C.bool(po.MMap),
|
||||
C.CString(po.MainGPU), C.CString(po.TensorSplit),
|
||||
C.bool(po.PromptCacheRO),
|
||||
C.CString(po.Grammar),
|
||||
C.float(po.RopeFreqBase), C.float(po.RopeFreqScale), C.float(po.NegativePromptScale), C.CString(po.NegativePrompt),
|
||||
C.int(po.NDraft),
|
||||
)
|
||||
|
||||
ret := C.get_embeddings(params, l.state, (*C.float)(&floats[0]))
|
||||
C.llama_free_params(params)
|
||||
if ret != 0 {
|
||||
return floats, fmt.Errorf("embedding inference failed")
|
||||
}
|
||||
|
||||
return floats, nil
|
||||
}
|
||||
|
||||
func (l *LLama) Eval(text string, opts ...PredictOption) error {
|
||||
po := NewPredictOptions(opts...)
|
||||
|
||||
input := C.CString(text)
|
||||
defer C.free(unsafe.Pointer(input))
|
||||
if po.Tokens == 0 {
|
||||
po.Tokens = 99999999
|
||||
}
|
||||
|
||||
reverseCount := len(po.StopPrompts)
|
||||
reversePrompt := make([]*C.char, reverseCount)
|
||||
var pass **C.char
|
||||
for i, s := range po.StopPrompts {
|
||||
cs := C.CString(s)
|
||||
defer C.free(unsafe.Pointer(cs))
|
||||
reversePrompt[i] = cs
|
||||
pass = &reversePrompt[0]
|
||||
}
|
||||
|
||||
params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
|
||||
C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
|
||||
C.bool(po.IgnoreEOS), C.bool(po.F16KV),
|
||||
C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
|
||||
C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty),
|
||||
C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), C.CString(po.LogitBias),
|
||||
C.CString(po.PathPromptCache), C.bool(po.PromptCacheAll), C.bool(po.MLock), C.bool(po.MMap),
|
||||
C.CString(po.MainGPU), C.CString(po.TensorSplit),
|
||||
C.bool(po.PromptCacheRO),
|
||||
C.CString(po.Grammar),
|
||||
C.float(po.RopeFreqBase), C.float(po.RopeFreqScale), C.float(po.NegativePromptScale), C.CString(po.NegativePrompt),
|
||||
C.int(po.NDraft),
|
||||
)
|
||||
ret := C.eval(params, l.state, input)
|
||||
C.llama_free_params(params)
|
||||
if ret != 0 {
|
||||
return fmt.Errorf("inference failed")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (l *LLama) SpeculativeSampling(ll *LLama, text string, opts ...PredictOption) (string, error) {
|
||||
po := NewPredictOptions(opts...)
|
||||
|
||||
if po.TokenCallback != nil {
|
||||
setCallback(l.state, po.TokenCallback)
|
||||
}
|
||||
|
||||
input := C.CString(text)
|
||||
defer C.free(unsafe.Pointer(input))
|
||||
if po.Tokens == 0 {
|
||||
po.Tokens = 99999999
|
||||
}
|
||||
out := make([]byte, po.Tokens)
|
||||
|
||||
reverseCount := len(po.StopPrompts)
|
||||
reversePrompt := make([]*C.char, reverseCount)
|
||||
var pass **C.char
|
||||
for i, s := range po.StopPrompts {
|
||||
cs := C.CString(s)
|
||||
defer C.free(unsafe.Pointer(cs))
|
||||
reversePrompt[i] = cs
|
||||
pass = &reversePrompt[0]
|
||||
}
|
||||
|
||||
params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
|
||||
C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
|
||||
C.bool(po.IgnoreEOS), C.bool(po.F16KV),
|
||||
C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
|
||||
C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty),
|
||||
C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), C.CString(po.LogitBias),
|
||||
C.CString(po.PathPromptCache), C.bool(po.PromptCacheAll), C.bool(po.MLock), C.bool(po.MMap),
|
||||
C.CString(po.MainGPU), C.CString(po.TensorSplit),
|
||||
C.bool(po.PromptCacheRO),
|
||||
C.CString(po.Grammar),
|
||||
C.float(po.RopeFreqBase), C.float(po.RopeFreqScale), C.float(po.NegativePromptScale), C.CString(po.NegativePrompt),
|
||||
C.int(po.NDraft),
|
||||
)
|
||||
ret := C.speculative_sampling(params, l.state, ll.state, (*C.char)(unsafe.Pointer(&out[0])), C.bool(po.DebugMode))
|
||||
C.llama_free_params(params)
|
||||
|
||||
if po.TokenCallback != nil {
|
||||
setCallback(l.state, nil)
|
||||
}
|
||||
|
||||
if ret != 0 {
|
||||
return "", fmt.Errorf("inference failed")
|
||||
}
|
||||
res := C.GoString((*C.char)(unsafe.Pointer(&out[0])))
|
||||
|
||||
res = strings.TrimPrefix(res, " ")
|
||||
res = strings.TrimPrefix(res, text)
|
||||
res = strings.TrimPrefix(res, "\n")
|
||||
|
||||
for _, s := range po.StopPrompts {
|
||||
res = strings.TrimRight(res, s)
|
||||
}
|
||||
|
||||
return res, nil
|
||||
}
|
||||
|
||||
func (l *LLama) Predict(text string, opts ...PredictOption) (string, error) {
|
||||
po := NewPredictOptions(opts...)
|
||||
|
||||
if po.TokenCallback != nil {
|
||||
setCallback(l.state, po.TokenCallback)
|
||||
}
|
||||
|
||||
input := C.CString(text)
|
||||
defer C.free(unsafe.Pointer(input))
|
||||
if po.Tokens == 0 {
|
||||
po.Tokens = 99999999
|
||||
}
|
||||
out := make([]byte, po.Tokens)
|
||||
|
||||
reverseCount := len(po.StopPrompts)
|
||||
reversePrompt := make([]*C.char, reverseCount)
|
||||
var pass **C.char
|
||||
for i, s := range po.StopPrompts {
|
||||
cs := C.CString(s)
|
||||
defer C.free(unsafe.Pointer(cs))
|
||||
reversePrompt[i] = cs
|
||||
pass = &reversePrompt[0]
|
||||
}
|
||||
|
||||
params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
|
||||
C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
|
||||
C.bool(po.IgnoreEOS), C.bool(po.F16KV),
|
||||
C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
|
||||
C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty),
|
||||
C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), C.CString(po.LogitBias),
|
||||
C.CString(po.PathPromptCache), C.bool(po.PromptCacheAll), C.bool(po.MLock), C.bool(po.MMap),
|
||||
C.CString(po.MainGPU), C.CString(po.TensorSplit),
|
||||
C.bool(po.PromptCacheRO),
|
||||
C.CString(po.Grammar),
|
||||
C.float(po.RopeFreqBase), C.float(po.RopeFreqScale), C.float(po.NegativePromptScale), C.CString(po.NegativePrompt),
|
||||
C.int(po.NDraft),
|
||||
)
|
||||
ret := C.llama_predict(params, l.state, (*C.char)(unsafe.Pointer(&out[0])), C.bool(po.DebugMode))
|
||||
C.llama_free_params(params)
|
||||
|
||||
if po.TokenCallback != nil {
|
||||
setCallback(l.state, nil)
|
||||
}
|
||||
|
||||
if ret != 0 {
|
||||
return "", fmt.Errorf("inference failed")
|
||||
}
|
||||
res := C.GoString((*C.char)(unsafe.Pointer(&out[0])))
|
||||
|
||||
res = strings.TrimPrefix(res, " ")
|
||||
res = strings.TrimPrefix(res, text)
|
||||
res = strings.TrimPrefix(res, "\n")
|
||||
|
||||
for _, s := range po.StopPrompts {
|
||||
res = strings.TrimRight(res, s)
|
||||
}
|
||||
|
||||
return res, nil
|
||||
}
|
||||
|
||||
func (l *LLama) TokenizeString(text string, opts ...PredictOption) (int32, []int32, error) {
|
||||
po := NewPredictOptions(opts...)
|
||||
|
||||
input := C.CString(text)
|
||||
defer C.free(unsafe.Pointer(input))
|
||||
if po.Tokens == 0 {
|
||||
po.Tokens = 4096
|
||||
}
|
||||
out := make([]C.int, po.Tokens)
|
||||
|
||||
var fakeDblPtr **C.char
|
||||
|
||||
params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
|
||||
C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
|
||||
C.bool(po.IgnoreEOS), C.bool(po.F16KV),
|
||||
C.int(po.Batch), C.int(po.NKeep), fakeDblPtr, C.int(0),
|
||||
C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty),
|
||||
C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), C.CString(po.LogitBias),
|
||||
C.CString(po.PathPromptCache), C.bool(po.PromptCacheAll), C.bool(po.MLock), C.bool(po.MMap),
|
||||
C.CString(po.MainGPU), C.CString(po.TensorSplit),
|
||||
C.bool(po.PromptCacheRO),
|
||||
C.CString(po.Grammar),
|
||||
C.float(po.RopeFreqBase), C.float(po.RopeFreqScale), C.float(po.NegativePromptScale), C.CString(po.NegativePrompt),
|
||||
C.int(po.NDraft),
|
||||
)
|
||||
|
||||
tokRet := C.llama_tokenize_string(params, l.state, (*C.int)(unsafe.Pointer(&out[0])))
|
||||
C.llama_free_params(params)
|
||||
|
||||
if tokRet < 0 {
|
||||
return int32(tokRet), []int32{}, fmt.Errorf("llama_tokenize_string returned negative count %d", tokRet)
|
||||
}
|
||||
|
||||
gTokRet := int32(tokRet)
|
||||
|
||||
gLenOut := min(len(out), int(gTokRet))
|
||||
|
||||
goSlice := make([]int32, gLenOut)
|
||||
for i := 0; i < gLenOut; i++ {
|
||||
goSlice[i] = int32(out[i])
|
||||
}
|
||||
|
||||
return gTokRet, goSlice, nil
|
||||
}
|
||||
|
||||
func (l *LLama) SetTokenCallback(callback func(token string) bool) {
|
||||
setCallback(l.state, callback)
|
||||
}
|
||||
|
||||
// Token callbacks registered per model state.
var (
	// m guards callbacks.
	m sync.RWMutex
	// callbacks maps a model state pointer (as uintptr) to its token callback.
	callbacks = make(map[uintptr]func(string) bool)
)
|
||||
|
||||
//export tokenCallback
|
||||
func tokenCallback(statePtr unsafe.Pointer, token *C.char) bool {
|
||||
m.RLock()
|
||||
defer m.RUnlock()
|
||||
|
||||
if callback, ok := callbacks[uintptr(statePtr)]; ok {
|
||||
return callback(C.GoString(token))
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
func setCallback(statePtr unsafe.Pointer, callback func(string) bool) {
|
||||
m.Lock()
|
||||
defer m.Unlock()
|
||||
|
||||
if callback == nil {
|
||||
delete(callbacks, uintptr(statePtr))
|
||||
} else {
|
||||
callbacks[uintptr(statePtr)] = callback
|
||||
}
|
||||
}
|
||||
9
llama_cublas.go
Normal file
9
llama_cublas.go
Normal file
@ -0,0 +1,9 @@
|
||||
//go:build cublas
|
||||
// +build cublas
|
||||
|
||||
package llama
|
||||
|
||||
/*
|
||||
#cgo LDFLAGS: -lcublas -lcudart -L/usr/local/cuda/lib64/
|
||||
*/
|
||||
import "C"
|
||||
9
llama_openblas.go
Normal file
9
llama_openblas.go
Normal file
@ -0,0 +1,9 @@
|
||||
//go:build openblas
|
||||
// +build openblas
|
||||
|
||||
package llama
|
||||
|
||||
/*
|
||||
#cgo LDFLAGS: -lopenblas
|
||||
*/
|
||||
import "C"
|
||||
460
options.go
Normal file
460
options.go
Normal file
@ -0,0 +1,460 @@
|
||||
package llama
|
||||
|
||||
// ModelOptions holds the settings New passes to the native model loader.
type ModelOptions struct {
	// Context, seed and batch sizes.
	ContextSize int
	Seed        int
	NBatch      int

	// Memory and loading flags.
	F16Memory  bool
	MLock      bool
	MMap       bool
	LowVRAM    bool
	Embeddings bool
	NUMA       bool

	// GPU placement.
	NGPULayers  int
	MainGPU     string
	TensorSplit string

	// RoPE frequency settings.
	FreqRopeBase  float32
	FreqRopeScale float32

	// MulMatQ is optional; New uses true when it is nil.
	MulMatQ *bool

	// LoRA files.
	LoraBase    string
	LoraAdapter string

	Perplexity bool
}
|
||||
|
||||
// PredictOptions holds the per-call settings passed to llama_allocate_params.
type PredictOptions struct {
	Seed    int
	Threads int
	Tokens  int
	TopK    int
	Repeat  int
	Batch   int
	NKeep   int

	TopP        float32
	Temperature float32
	Penalty     float32

	NDraft      int
	F16KV       bool
	DebugMode   bool
	StopPrompts []string
	IgnoreEOS   bool

	TailFreeSamplingZ float32
	TypicalP          float32
	FrequencyPenalty  float32
	PresencePenalty   float32
	Mirostat          int
	MirostatETA       float32
	MirostatTAU       float32
	PenalizeNL        bool
	LogitBias         string
	// TokenCallback, when set, is registered for the duration of a prediction.
	TokenCallback func(string) bool

	PathPromptCache string
	MLock           bool
	MMap            bool
	PromptCacheAll  bool
	PromptCacheRO   bool
	Grammar         string
	MainGPU         string
	TensorSplit     string

	// Rope parameters
	RopeFreqBase  float32
	RopeFreqScale float32

	// Negative prompt parameters
	NegativePromptScale float32
	NegativePrompt      string
}
|
||||
|
||||
// PredictOption mutates a PredictOptions value; see NewPredictOptions.
type PredictOption func(p *PredictOptions)
|
||||
|
||||
// ModelOption mutates a ModelOptions value; see NewModelOptions.
type ModelOption func(p *ModelOptions)
|
||||
|
||||
var DefaultModelOptions ModelOptions = ModelOptions{
|
||||
ContextSize: 512,
|
||||
Seed: 0,
|
||||
F16Memory: false,
|
||||
MLock: false,
|
||||
Embeddings: false,
|
||||
MMap: true,
|
||||
LowVRAM: false,
|
||||
NBatch: 512,
|
||||
FreqRopeBase: 10000,
|
||||
FreqRopeScale: 1.0,
|
||||
}
|
||||
|
||||
var DefaultOptions PredictOptions = PredictOptions{
|
||||
Seed: -1,
|
||||
Threads: 4,
|
||||
Tokens: 128,
|
||||
Penalty: 1.1,
|
||||
Repeat: 64,
|
||||
Batch: 512,
|
||||
NKeep: 64,
|
||||
TopK: 40,
|
||||
TopP: 0.95,
|
||||
TailFreeSamplingZ: 1.0,
|
||||
TypicalP: 1.0,
|
||||
Temperature: 0.8,
|
||||
FrequencyPenalty: 0.0,
|
||||
PresencePenalty: 0.0,
|
||||
Mirostat: 0,
|
||||
MirostatTAU: 5.0,
|
||||
MirostatETA: 0.1,
|
||||
MMap: true,
|
||||
RopeFreqBase: 10000,
|
||||
RopeFreqScale: 1.0,
|
||||
}
|
||||
|
||||
func SetMulMatQ(b bool) ModelOption {
|
||||
return func(p *ModelOptions) {
|
||||
p.MulMatQ = &b
|
||||
}
|
||||
}
|
||||
|
||||
func SetLoraBase(s string) ModelOption {
|
||||
return func(p *ModelOptions) {
|
||||
p.LoraBase = s
|
||||
}
|
||||
}
|
||||
|
||||
func SetLoraAdapter(s string) ModelOption {
|
||||
return func(p *ModelOptions) {
|
||||
p.LoraAdapter = s
|
||||
}
|
||||
}
|
||||
|
||||
// SetContext sets the context size.
|
||||
func SetContext(c int) ModelOption {
|
||||
return func(p *ModelOptions) {
|
||||
p.ContextSize = c
|
||||
}
|
||||
}
|
||||
|
||||
func WithRopeFreqBase(f float32) ModelOption {
|
||||
return func(p *ModelOptions) {
|
||||
p.FreqRopeBase = f
|
||||
}
|
||||
}
|
||||
|
||||
func WithRopeFreqScale(f float32) ModelOption {
|
||||
return func(p *ModelOptions) {
|
||||
p.FreqRopeScale = f
|
||||
}
|
||||
}
|
||||
|
||||
func SetModelSeed(c int) ModelOption {
|
||||
return func(p *ModelOptions) {
|
||||
p.Seed = c
|
||||
}
|
||||
}
|
||||
|
||||
// SetContext sets the context size.
|
||||
func SetMMap(b bool) ModelOption {
|
||||
return func(p *ModelOptions) {
|
||||
p.MMap = b
|
||||
}
|
||||
}
|
||||
|
||||
// SetNBatch sets the n_Batch
|
||||
func SetNBatch(n_batch int) ModelOption {
|
||||
return func(p *ModelOptions) {
|
||||
p.NBatch = n_batch
|
||||
}
|
||||
}
|
||||
|
||||
// Set sets the tensor split for the GPU
|
||||
func SetTensorSplit(maingpu string) ModelOption {
|
||||
return func(p *ModelOptions) {
|
||||
p.TensorSplit = maingpu
|
||||
}
|
||||
}
|
||||
|
||||
// SetMainGPU sets the main_gpu
|
||||
func SetMainGPU(maingpu string) ModelOption {
|
||||
return func(p *ModelOptions) {
|
||||
p.MainGPU = maingpu
|
||||
}
|
||||
}
|
||||
|
||||
// SetPredictionTensorSplit sets the tensor split for the GPU
|
||||
func SetPredictionTensorSplit(maingpu string) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.TensorSplit = maingpu
|
||||
}
|
||||
}
|
||||
|
||||
// SetPredictionMainGPU sets the main_gpu
|
||||
func SetPredictionMainGPU(maingpu string) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.MainGPU = maingpu
|
||||
}
|
||||
}
|
||||
|
||||
// Rope and negative prompt parameters
|
||||
func SetRopeFreqBase(rfb float32) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.RopeFreqBase = rfb
|
||||
}
|
||||
}
|
||||
|
||||
func SetRopeFreqScale(rfs float32) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.RopeFreqScale = rfs
|
||||
}
|
||||
}
|
||||
|
||||
func SetNDraft(nd int) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.NDraft = nd
|
||||
}
|
||||
}
|
||||
|
||||
func SetPerplexity(b bool) ModelOption {
|
||||
return func(p *ModelOptions) {
|
||||
p.Perplexity = b
|
||||
}
|
||||
}
|
||||
|
||||
func SetNegativePromptScale(nps float32) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.NegativePromptScale = nps
|
||||
}
|
||||
}
|
||||
|
||||
func SetNegativePrompt(np string) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.NegativePrompt = np
|
||||
}
|
||||
}
|
||||
|
||||
var EnabelLowVRAM ModelOption = func(p *ModelOptions) {
|
||||
p.LowVRAM = true
|
||||
}
|
||||
|
||||
var EnableNUMA ModelOption = func(p *ModelOptions) {
|
||||
p.NUMA = true
|
||||
}
|
||||
|
||||
var EnableEmbeddings ModelOption = func(p *ModelOptions) {
|
||||
p.Embeddings = true
|
||||
}
|
||||
|
||||
var EnableF16Memory ModelOption = func(p *ModelOptions) {
|
||||
p.F16Memory = true
|
||||
}
|
||||
|
||||
var EnableF16KV PredictOption = func(p *PredictOptions) {
|
||||
p.F16KV = true
|
||||
}
|
||||
|
||||
var Debug PredictOption = func(p *PredictOptions) {
|
||||
p.DebugMode = true
|
||||
}
|
||||
|
||||
var EnablePromptCacheAll PredictOption = func(p *PredictOptions) {
|
||||
p.PromptCacheAll = true
|
||||
}
|
||||
|
||||
var EnablePromptCacheRO PredictOption = func(p *PredictOptions) {
|
||||
p.PromptCacheRO = true
|
||||
}
|
||||
|
||||
var EnableMLock ModelOption = func(p *ModelOptions) {
|
||||
p.MLock = true
|
||||
}
|
||||
|
||||
// Create a new PredictOptions object with the given options.
|
||||
func NewModelOptions(opts ...ModelOption) ModelOptions {
|
||||
p := DefaultModelOptions
|
||||
for _, opt := range opts {
|
||||
opt(&p)
|
||||
}
|
||||
return p
|
||||
}
|
||||
|
||||
var IgnoreEOS PredictOption = func(p *PredictOptions) {
|
||||
p.IgnoreEOS = true
|
||||
}
|
||||
|
||||
// WithGrammar sets the grammar to constrain the output of the LLM response
|
||||
func WithGrammar(s string) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.Grammar = s
|
||||
}
|
||||
}
|
||||
|
||||
// SetMlock sets the memory lock.
|
||||
func SetMlock(b bool) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.MLock = b
|
||||
}
|
||||
}
|
||||
|
||||
// SetMemoryMap sets memory mapping.
|
||||
func SetMemoryMap(b bool) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.MMap = b
|
||||
}
|
||||
}
|
||||
|
||||
// SetGPULayers sets the number of GPU layers to use to offload computation
|
||||
func SetGPULayers(n int) ModelOption {
|
||||
return func(p *ModelOptions) {
|
||||
p.NGPULayers = n
|
||||
}
|
||||
}
|
||||
|
||||
// SetTokenCallback sets the prompts that will stop predictions.
|
||||
func SetTokenCallback(fn func(string) bool) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.TokenCallback = fn
|
||||
}
|
||||
}
|
||||
|
||||
// SetStopWords sets the prompts that will stop predictions.
|
||||
func SetStopWords(stop ...string) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.StopPrompts = stop
|
||||
}
|
||||
}
|
||||
|
||||
// SetSeed sets the random seed for sampling text generation.
|
||||
func SetSeed(seed int) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.Seed = seed
|
||||
}
|
||||
}
|
||||
|
||||
// SetThreads sets the number of threads to use for text generation.
|
||||
func SetThreads(threads int) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.Threads = threads
|
||||
}
|
||||
}
|
||||
|
||||
// SetTokens sets the number of tokens to generate.
|
||||
func SetTokens(tokens int) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.Tokens = tokens
|
||||
}
|
||||
}
|
||||
|
||||
// SetTopK sets the value for top-K sampling.
|
||||
func SetTopK(topk int) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.TopK = topk
|
||||
}
|
||||
}
|
||||
|
||||
// SetTopP sets the value for nucleus sampling.
|
||||
func SetTopP(topp float32) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.TopP = topp
|
||||
}
|
||||
}
|
||||
|
||||
// SetTemperature sets the temperature value for text generation.
|
||||
func SetTemperature(temp float32) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.Temperature = temp
|
||||
}
|
||||
}
|
||||
|
||||
// SetPathPromptCache sets the session file to store the prompt cache.
|
||||
func SetPathPromptCache(f string) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.PathPromptCache = f
|
||||
}
|
||||
}
|
||||
|
||||
// SetPenalty sets the repetition penalty for text generation.
|
||||
func SetPenalty(penalty float32) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.Penalty = penalty
|
||||
}
|
||||
}
|
||||
|
||||
// SetRepeat sets the number of times to repeat text generation.
|
||||
func SetRepeat(repeat int) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.Repeat = repeat
|
||||
}
|
||||
}
|
||||
|
||||
// SetBatch sets the batch size.
|
||||
func SetBatch(size int) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.Batch = size
|
||||
}
|
||||
}
|
||||
|
||||
// SetKeep sets the number of tokens from initial prompt to keep.
|
||||
func SetNKeep(n int) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.NKeep = n
|
||||
}
|
||||
}
|
||||
|
||||
// Create a new PredictOptions object with the given options.
|
||||
func NewPredictOptions(opts ...PredictOption) PredictOptions {
|
||||
p := DefaultOptions
|
||||
for _, opt := range opts {
|
||||
opt(&p)
|
||||
}
|
||||
return p
|
||||
}
|
||||
|
||||
// SetTailFreeSamplingZ sets the tail free sampling, parameter z.
|
||||
func SetTailFreeSamplingZ(tfz float32) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.TailFreeSamplingZ = tfz
|
||||
}
|
||||
}
|
||||
|
||||
// SetTypicalP sets the typicality parameter, p_typical.
|
||||
func SetTypicalP(tp float32) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.TypicalP = tp
|
||||
}
|
||||
}
|
||||
|
||||
// SetFrequencyPenalty sets the frequency penalty parameter, freq_penalty.
|
||||
func SetFrequencyPenalty(fp float32) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.FrequencyPenalty = fp
|
||||
}
|
||||
}
|
||||
|
||||
// SetPresencePenalty sets the presence penalty parameter, presence_penalty.
|
||||
func SetPresencePenalty(pp float32) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.PresencePenalty = pp
|
||||
}
|
||||
}
|
||||
|
||||
// SetMirostat sets the mirostat parameter.
|
||||
func SetMirostat(m int) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.Mirostat = m
|
||||
}
|
||||
}
|
||||
|
||||
// SetMirostatETA sets the mirostat ETA parameter.
|
||||
func SetMirostatETA(me float32) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.MirostatETA = me
|
||||
}
|
||||
}
|
||||
|
||||
// SetMirostatTAU sets the mirostat TAU parameter.
|
||||
func SetMirostatTAU(mt float32) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.MirostatTAU = mt
|
||||
}
|
||||
}
|
||||
|
||||
// SetPenalizeNL sets whether to penalize newlines or not.
|
||||
func SetPenalizeNL(pnl bool) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.PenalizeNL = pnl
|
||||
}
|
||||
}
|
||||
|
||||
// SetLogitBias sets the logit bias parameter.
|
||||
func SetLogitBias(lb string) PredictOption {
|
||||
return func(p *PredictOptions) {
|
||||
p.LogitBias = lb
|
||||
}
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user