package main import ( "os" "runtime" "strconv" "time" "go-whisper-api/api" "go-whisper-api/config" _ "github.com/joho/godotenv/autoload" "github.com/mattn/go-isatty" "github.com/rs/zerolog" "github.com/rs/zerolog/log" "github.com/urfave/cli/v2" ) var ( Version string ) func main() { isTerm := isatty.IsTerminal(os.Stdout.Fd()) zerolog.SetGlobalLevel(zerolog.InfoLevel) log.Logger = log.Output( zerolog.ConsoleWriter{ Out: os.Stderr, NoColor: !isTerm, }, ) zerolog.CallerMarshalFunc = func(pc uintptr, file string, line int) string { short := file for i := len(file) - 1; i > 0; i-- { if file[i] == '/' { short = file[i+1:] break } } file = short return file + ":" + strconv.Itoa(line) } app := cli.NewApp() app.Name = "go-whisper-api" app.Usage = "HTTP API for speech-to-text (SPR + OpenAI-compatible)." app.Copyright = "Copyright (c) " + strconv.Itoa(time.Now().Year()) + " Bo-Yi Wu" app.Authors = []*cli.Author{ { Name: "Bo-Yi Wu", Email: "appleboy.tw@gmail.com", }, } app.Version = Version app.Commands = []*cli.Command{ { Name: "serve", Aliases: []string{"s", "api"}, Usage: "start HTTP API server", Flags: serveFlags(), Action: runServe, }, } app.Flags = append(append([]cli.Flag{configFlag()}, punctuationFlags()...), serveFlags()...) app.Action = runServe if err := app.Run(os.Args); err != nil { log.Fatal().Err(err).Msg("can't run app") } } func configFlag() cli.Flag { return &cli.StringFlag{ Name: "config", Usage: "path to YAML config file (default: config.yaml if present)", EnvVars: []string{"CONFIG_PATH", "GO_WHISPER_CONFIG"}, } } func punctuationFlags() []cli.Flag { return []cli.Flag{ &cli.BoolFlag{ Name: "punctuation-enabled", Usage: "master switch: enable or disable punctuation (YAML: punctuation.enabled)", EnvVars: []string{"PUNCTUATION_ENABLED"}, }, &cli.BoolFlag{ Name: "punctuation-default-on", Usage: "apply punctuation by default when query/flags omitted (YAML: punctuation.default_on)", }, &cli.StringFlag{ Name: "punctuation-engine", Usage: "punctuation engine: off, heuristic, sherpa, sherpa-online, http, xlm", EnvVars: []string{"PUNCTUATION_ENGINE"}, }, } } func vadFlags() []cli.Flag { return []cli.Flag{ &cli.BoolFlag{ Name: "vad", Usage: "enable voice activity detection (Silero VAD)", EnvVars: []string{"API_VAD"}, }, &cli.StringFlag{ Name: "vad-model", Usage: "path to ggml Silero VAD model (e.g. models/ggml-silero-v6.2.0.bin)", EnvVars: []string{"API_VAD_MODEL"}, }, &cli.Float64Flag{ Name: "vad-threshold", Usage: "VAD speech threshold (0.0-1.0)", EnvVars: []string{"API_VAD_THRESHOLD"}, }, &cli.IntFlag{ Name: "vad-min-speech-ms", Usage: "minimum speech duration in ms", EnvVars: []string{"API_VAD_MIN_SPEECH_MS"}, }, &cli.IntFlag{ Name: "vad-min-silence-ms", Usage: "minimum silence between segments in ms", EnvVars: []string{"API_VAD_MIN_SILENCE_MS"}, }, &cli.Float64Flag{ Name: "vad-max-speech-sec", Usage: "maximum speech segment length in seconds (0 = unlimited)", EnvVars: []string{"API_VAD_MAX_SPEECH_SEC"}, }, &cli.IntFlag{ Name: "vad-speech-pad-ms", Usage: "padding around speech segments in ms", EnvVars: []string{"API_VAD_SPEECH_PAD_MS"}, }, &cli.Float64Flag{ Name: "vad-samples-overlap", Usage: "overlap between VAD segments in seconds", EnvVars: []string{"API_VAD_SAMPLES_OVERLAP"}, }, } } func serveFlags() []cli.Flag { flags := []cli.Flag{ &cli.StringFlag{ Name: "addr", Usage: "HTTP listen address", EnvVars: []string{"API_ADDR"}, Value: ":8080", }, &cli.StringFlag{ Name: "models-dir", Usage: "directory with ggml *.bin whisper models", EnvVars: []string{"API_MODELS_DIR"}, Value: "./models", }, &cli.StringFlag{ Name: "cache-dir", Usage: "directory for async task cache (waiting/ready)", EnvVars: []string{"API_CACHE_DIR"}, Value: "./cache", }, &cli.StringFlag{ Name: "language", Usage: "default language for speech recognition", EnvVars: []string{"API_LANGUAGE"}, Value: "auto", }, &cli.UintFlag{ Name: "threads", Usage: "number of threads for whisper", EnvVars: []string{"API_THREADS"}, Value: uint(runtime.NumCPU()), }, &cli.BoolFlag{ Name: "debug", Usage: "enable debug mode", EnvVars: []string{"API_DEBUG"}, }, &cli.BoolFlag{ Name: "speedup", Usage: "speed up audio by x2", EnvVars: []string{"API_SPEEDUP"}, }, &cli.BoolFlag{ Name: "translate", Usage: "translate to english", EnvVars: []string{"API_TRANSLATE"}, }, &cli.StringFlag{ Name: "prompt", Usage: "initial prompt", EnvVars: []string{"API_PROMPT"}, }, &cli.UintFlag{ Name: "max-context", Usage: "maximum text context tokens", EnvVars: []string{"API_MAX_CONTEXT"}, Value: 32, }, &cli.UintFlag{ Name: "beam-size", Usage: "beam size for beam search", EnvVars: []string{"API_BEAM_SIZE"}, Value: 5, }, &cli.Float64Flag{ Name: "entropy-thold", Usage: "entropy threshold", EnvVars: []string{"API_ENTROPY_THOLD"}, Value: 2.4, }, &cli.BoolFlag{ Name: "default-punctuation", Usage: "enable punctuation on STT by default", EnvVars: []string{"API_DEFAULT_PUNCTUATION"}, }, } return append(flags, vadFlags()...) } func runServe(c *cli.Context) error { apiCfg, err := config.APIFromCLI(c) if err != nil { return err } if apiCfg.Debug { zerolog.SetGlobalLevel(zerolog.DebugLevel) log.Logger = log.With().Caller().Logger() } vad := apiCfg.VAD.WithDefaults() if vad.Enabled { vad.Model = vad.ResolveModelPath(apiCfg.ModelsDir) if err := vad.Validate(); err != nil { return err } apiCfg.VAD = vad } tc, err := config.TranscodeFromCLI(c) if err != nil { return err } pc, err := config.PunctuationFromCLI(c) if err != nil { return err } dc, err := config.DiarizationFromCLI(c) if err != nil { return err } return api.Run(c.Context, apiCfg, tc, pc, dc) }