// Copyright 2020-2021 Alpha Cephei Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/* This header contains the C API for Vosk speech recognition system */
#ifndef VOSK_API_H
#define VOSK_API_H
#ifdef __cplusplus
extern "C" {
#endif
/** Model stores all the data required for recognition.
 * It contains static data and can be shared across processing
 * threads. */
typedef struct VoskModel VoskModel;
/** Speaker model is similar to the regular model but contains the data
 * required for speaker identification. */
typedef struct VoskSpkModel VoskSpkModel;
/** Recognizer object is the main object which processes data.
 * Each recognizer usually runs in its own thread and takes audio as input.
 * Once audio is processed the recognizer returns a JSON object as a string
 * which represents the decoded information - words, confidences, times,
 * n-best lists, speaker information and so on */
typedef struct VoskRecognizer VoskRecognizer;
/**
* Batch model object
*/
typedef struct VoskBatchModel VoskBatchModel;
/**
* Batch recognizer object
*/
typedef struct VoskBatchRecognizer VoskBatchRecognizer;
/** Loads model data from the file and returns the model object
 *
 * @param model_path: the path of the model on the filesystem
 * @returns model object or NULL if a problem occurred */
VoskModel *vosk_model_new(const char *model_path);
/** Releases the model memory
 *
 * The model object is reference-counted, so if some recognizer
 * still depends on this model, the model may stay alive. When the
 * last recognizer is released, the model will be released too. */
void vosk_model_free(VoskModel *model);
/** Checks if a word can be recognized by the model
 * @param word: the word
 * @returns the word symbol if @param word exists inside the model
 * or -1 otherwise.
 * Note that word symbol 0 is reserved for <epsilon> */
int vosk_model_find_word(VoskModel *model, const char *word);
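/* Example: checking whether a word is in the model vocabulary before building
 * a grammar (a minimal sketch; "model" is assumed to be a VoskModel obtained
 * from vosk_model_new, and "hello" is just an illustrative word):
 *
 * <pre>
 *     if (vosk_model_find_word(model, "hello") == -1) {
 *         printf("'hello' is not in the model vocabulary\n");
 *     }
 * </pre>
 */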
/** Loads speaker model data from the file and returns the model object
*
* @param model_path: the path of the model on the filesystem
 * @returns model object or NULL if a problem occurred */
VoskSpkModel *vosk_spk_model_new(const char *model_path);
/** Releases the speaker model memory
 *
 * The model object is reference-counted, so if some recognizer
 * still depends on this model, the model may stay alive. When the
 * last recognizer is released, the model will be released too. */
void vosk_spk_model_free(VoskSpkModel *model);
/** Creates the recognizer object
 *
 * The recognizers process the speech and return text using shared model data
 * @param model VoskModel containing static data for the recognizer. The model can be
 *              shared across recognizers, even ones running in different threads.
 * @param sample_rate The sample rate of the audio you are going to feed into the recognizer.
 *                    Make sure this rate matches the audio content; a mismatch is a common
 *                    cause of accuracy problems.
 * @returns recognizer object or NULL if a problem occurred */
VoskRecognizer *vosk_recognizer_new(VoskModel *model, float sample_rate);
/** Creates the recognizer object with speaker recognition
 *
 * In speaker recognition mode the recognizer not only recognizes
 * text but also returns speaker vectors one can use for speaker identification
 *
 * @param model VoskModel containing static data for the recognizer. The model can be
 *              shared across recognizers, even ones running in different threads.
 * @param sample_rate The sample rate of the audio you are going to feed into the recognizer.
 *                    Make sure this rate matches the audio content; a mismatch is a common
 *                    cause of accuracy problems.
 * @param spk_model speaker model for speaker identification
 * @returns recognizer object or NULL if a problem occurred */
VoskRecognizer *vosk_recognizer_new_spk(VoskModel *model, float sample_rate, VoskSpkModel *spk_model);
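/* Example: enabling speaker identification (a sketch; "model" and "model-spk"
 * are placeholder directory names, not part of the API; with a speaker model
 * attached the result JSON is expected to carry a speaker vector alongside
 * the recognized text):
 *
 * <pre>
 *     VoskModel *model = vosk_model_new("model");
 *     VoskSpkModel *spk_model = vosk_spk_model_new("model-spk");
 *     VoskRecognizer *recognizer = vosk_recognizer_new_spk(model, 16000.0f, spk_model);
 *
 *     // ... feed audio and read results as usual ...
 *
 *     vosk_recognizer_free(recognizer);
 *     vosk_spk_model_free(spk_model);
 *     vosk_model_free(model);
 * </pre>
 */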
/** Creates the recognizer object with the phrase list
*
 * When you don't need to recognize a large vocabulary, you can specify a list of phrases
 * to restrict recognition to. This improves the recognizer's speed and accuracy, but it
 * might return [unk] if the user said something outside the list.
 *
 * Only recognizers with lookahead models support this type of quick configuration.
 * Precompiled HCLG graph models are not supported.
 *
 * @param model VoskModel containing static data for the recognizer. The model can be
 *              shared across recognizers, even ones running in different threads.
 * @param sample_rate The sample rate of the audio you are going to feed into the recognizer.
 *                    Make sure this rate matches the audio content; a mismatch is a common
 *                    cause of accuracy problems.
 * @param grammar The list of phrases to recognize as a JSON array of strings,
 *                for example "["one two three four five", "[unk]"]".
 *
 * @returns recognizer object or NULL if a problem occurred */
VoskRecognizer *vosk_recognizer_new_grm(VoskModel *model, float sample_rate, const char *grammar);
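/* Example: restricting recognition to a small phrase list (a sketch; note that
 * the grammar is itself a JSON array, so its quotes have to be escaped inside
 * a C string literal):
 *
 * <pre>
 *     const char *grammar = "[\"one two three four five\", \"[unk]\"]";
 *     VoskRecognizer *recognizer = vosk_recognizer_new_grm(model, 16000.0f, grammar);
 * </pre>
 */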
/** Adds a speaker model to an already initialized recognizer
 *
 * Useful for enabling speaker identification on a recognizer that was created
 * without a speaker model, for example a grammar-based recognizer.
 *
 * @param recognizer Already created VoskRecognizer
 * @param spk_model Speaker recognition model */
void vosk_recognizer_set_spk_model(VoskRecognizer *recognizer, VoskSpkModel *spk_model);
/** Reconfigures recognizer to use grammar
*
* @param recognizer Already running VoskRecognizer
* @param grammar Set of phrases in JSON array of strings or "[]" to use default model graph.
* See also vosk_recognizer_new_grm
*/
void vosk_recognizer_set_grm(VoskRecognizer *recognizer, char const *grammar);
/** Configures recognizer to output n-best results
*
* <pre>
* {
* "alternatives": [
* { "text": "one two three four five", "confidence": 0.97 },
* { "text": "one two three for five", "confidence": 0.03 },
* ]
* }
* </pre>
*
* @param max_alternatives - maximum alternatives to return from recognition results
*/
void vosk_recognizer_set_max_alternatives(VoskRecognizer *recognizer, int max_alternatives);
/** Enables words with times in the output
*
* <pre>
* "result" : [{
* "conf" : 1.000000,
* "end" : 1.110000,
* "start" : 0.870000,
* "word" : "what"
* }, {
* "conf" : 1.000000,
* "end" : 1.530000,
* "start" : 1.110000,
* "word" : "zero"
* }, {
* "conf" : 1.000000,
* "end" : 1.950000,
* "start" : 1.530000,
* "word" : "zero"
* }, {
* "conf" : 1.000000,
* "end" : 2.340000,
* "start" : 1.950000,
* "word" : "zero"
* }, {
* "conf" : 1.000000,
* "end" : 2.610000,
* "start" : 2.340000,
* "word" : "one"
* }],
* </pre>
*
* @param words - boolean value
*/
void vosk_recognizer_set_words(VoskRecognizer *recognizer, int words);
/** Like above, but returns words and confidences in partial results as well
*
* @param partial_words - boolean value
*/
void vosk_recognizer_set_partial_words(VoskRecognizer *recognizer, int partial_words);
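/* Example: requesting richer output before feeding audio (a sketch; the values
 * below are illustrative, not required defaults):
 *
 * <pre>
 *     vosk_recognizer_set_max_alternatives(recognizer, 5);  // n-best lists
 *     vosk_recognizer_set_words(recognizer, 1);             // word timings in final results
 *     vosk_recognizer_set_partial_words(recognizer, 1);     // word info in partial results too
 * </pre>
 */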
/** Enables NLSML output instead of the default JSON
 * @param nlsml - boolean value
 */
void vosk_recognizer_set_nlsml(VoskRecognizer *recognizer, int nlsml);
/** Accept voice data
 *
 * Accepts and processes a new chunk of voice data
 *
 * @param data - audio data in PCM 16-bit mono format
 * @param length - length of the audio data in bytes
 * @returns 1 if silence occurred and you can retrieve a new utterance with the result method,
 *          0 if decoding continues,
 *          -1 if an exception occurred */
int vosk_recognizer_accept_waveform(VoskRecognizer *recognizer, const char *data, int length);
/** Same as above, but the version with short data for language bindings where you have
 * audio as an array of shorts */
int vosk_recognizer_accept_waveform_s(VoskRecognizer *recognizer, const short *data, int length);
/** Same as above, but the version with float data for language bindings where you have
 * audio as an array of floats */
int vosk_recognizer_accept_waveform_f(VoskRecognizer *recognizer, const float *data, int length);
/** Returns speech recognition result
 *
 * @returns the result in JSON format which contains the decoded text, decoded
 *          words, times in seconds and confidences. You can parse this result
 *          with any JSON parser
*
* <pre>
* {
* "text" : "what zero zero zero one"
* }
* </pre>
*
 * If alternatives are enabled it returns the result with alternatives, see also vosk_recognizer_set_max_alternatives().
 *
 * If word times are enabled it returns word times, see also vosk_recognizer_set_words().
*/
const char *vosk_recognizer_result(VoskRecognizer *recognizer);
/** Returns partial speech recognition result
 *
 * @returns partial speech recognition text which is not yet finalized.
 *          The result may change as the recognizer processes more data.
*
* <pre>
* {
* "partial" : "cyril one eight zero"
* }
* </pre>
*/
const char *vosk_recognizer_partial_result(VoskRecognizer *recognizer);
/** Returns speech recognition result. Same as result, but doesn't wait for silence.
 * You usually call it at the end of the stream to get the final bits of audio. It
 * flushes the feature pipeline, so all remaining audio chunks get processed.
*
* @returns speech result in JSON format.
*/
const char *vosk_recognizer_final_result(VoskRecognizer *recognizer);
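/* Example: a complete decoding loop over a WAV file (a minimal sketch; it
 * assumes a 16 kHz, 16-bit mono file "test.wav" with a standard 44-byte RIFF
 * header and a model directory "model" -- both names are placeholders, and
 * error handling is omitted for brevity):
 *
 * <pre>
 *     #include <vosk_api.h>
 *     #include <stdio.h>
 *
 *     int main(void) {
 *         VoskModel *model = vosk_model_new("model");
 *         VoskRecognizer *recognizer = vosk_recognizer_new(model, 16000.0f);
 *         char buf[3200];
 *         FILE *wav = fopen("test.wav", "rb");
 *         fseek(wav, 44, SEEK_SET);                  // skip the RIFF header
 *         while (!feof(wav)) {
 *             int nread = (int) fread(buf, 1, sizeof(buf), wav);
 *             if (vosk_recognizer_accept_waveform(recognizer, buf, nread) > 0)
 *                 printf("%s\n", vosk_recognizer_result(recognizer));          // utterance finished
 *             else
 *                 printf("%s\n", vosk_recognizer_partial_result(recognizer));  // still decoding
 *         }
 *         printf("%s\n", vosk_recognizer_final_result(recognizer));            // flush remaining audio
 *         fclose(wav);
 *         vosk_recognizer_free(recognizer);
 *         vosk_model_free(model);
 *         return 0;
 *     }
 * </pre>
 */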
/** Resets the recognizer
*
* Resets current results so the recognition can continue from scratch */
void vosk_recognizer_reset(VoskRecognizer *recognizer);
/** Releases recognizer object
 *
 * The underlying model is also unreferenced and, if no longer needed, released */
void vosk_recognizer_free(VoskRecognizer *recognizer);
/** Set log level for Kaldi messages
*
* @param log_level the level
* 0 - default value to print info and error messages but no debug
* less than 0 - don't print info messages
 * greater than 0 - more verbose mode
*/
void vosk_set_log_level(int log_level);
/**
* Init, automatically select a CUDA device and allow multithreading.
* Must be called once from the main thread.
* Has no effect if HAVE_CUDA flag is not set.
*/
void vosk_gpu_init();
/**
* Init CUDA device in a multi-threaded environment.
* Must be called for each thread.
* Has no effect if HAVE_CUDA flag is not set.
*/
void vosk_gpu_thread_init();
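/* Example: expected call order for GPU usage in a multi-threaded program
 * (a sketch; thread creation is elided and depends on your threading library):
 *
 * <pre>
 *     vosk_gpu_init();          // once, from the main thread, before loading models
 *
 *     // then, inside each worker thread before it uses a recognizer:
 *     vosk_gpu_thread_init();
 * </pre>
 */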
/** Creates the batch model object
 *
 * @param model_path: the path of the model on the filesystem
 * @returns model object or NULL if a problem occurred */
VoskBatchModel *vosk_batch_model_new(const char *model_path);
/** Releases batch model object */
void vosk_batch_model_free(VoskBatchModel *model);
/** Waits for pending batch processing to complete */
void vosk_batch_model_wait(VoskBatchModel *model);
/** Creates batch recognizer object
 * @returns recognizer object or NULL if a problem occurred */
VoskBatchRecognizer *vosk_batch_recognizer_new(VoskBatchModel *model, float sample_rate);
/** Releases batch recognizer object */
void vosk_batch_recognizer_free(VoskBatchRecognizer *recognizer);
/** Accept batch voice data */
void vosk_batch_recognizer_accept_waveform(VoskBatchRecognizer *recognizer, const char *data, int length);
/** Set NLSML output
* @param nlsml - boolean value
*/
void vosk_batch_recognizer_set_nlsml(VoskBatchRecognizer *recognizer, int nlsml);
/** Closes the stream */
void vosk_batch_recognizer_finish_stream(VoskBatchRecognizer *recognizer);
/** Returns the first (front) result in the result queue */
const char *vosk_batch_recognizer_front_result(VoskBatchRecognizer *recognizer);
/** Release and free first retrieved result */
void vosk_batch_recognizer_pop(VoskBatchRecognizer *recognizer);
/** Gets the number of pending chunks, for more intelligent waiting */
int vosk_batch_recognizer_get_pending_chunks(VoskBatchRecognizer *recognizer);
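/* Example: batch (GPU) decoding of a single stream (a sketch based only on the
 * declarations above; "model" is a placeholder directory name and the audio
 * feeding loop is elided):
 *
 * <pre>
 *     VoskBatchModel *model = vosk_batch_model_new("model");
 *     VoskBatchRecognizer *recognizer = vosk_batch_recognizer_new(model, 16000.0f);
 *
 *     // ... feed audio chunks with vosk_batch_recognizer_accept_waveform ...
 *
 *     vosk_batch_recognizer_finish_stream(recognizer);   // no more audio for this stream
 *     vosk_batch_model_wait(model);                      // let the batch pipeline drain
 *     while (vosk_batch_recognizer_get_pending_chunks(recognizer) > 0)
 *         ;                                              // poll or sleep as appropriate
 *
 *     printf("%s\n", vosk_batch_recognizer_front_result(recognizer));
 *     vosk_batch_recognizer_pop(recognizer);
 *
 *     vosk_batch_recognizer_free(recognizer);
 *     vosk_batch_model_free(model);
 * </pre>
 */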
#ifdef __cplusplus
}
#endif
#endif /* VOSK_API_H */