admin b5c083e06f
Some checks failed
CodeQL / Analyze (go) (push) Successful in 6m28s
Docker Image / build-docker (push) Failing after 13m26s
Lint and Testing / lint (push) Successful in 11m17s
Lint and Testing / test (push) Successful in 11m17s
Lint and Testing / golangci (push) Successful in 2m40s
first commit
2026-06-04 18:10:52 +07:00

252 lines
7.5 KiB
Go

package main
import (
"os"
"runtime"
"strconv"
"time"
"go-whisper-api/api"
"go-whisper-api/config"
_ "github.com/joho/godotenv/autoload"
"github.com/mattn/go-isatty"
"github.com/rs/zerolog"
"github.com/rs/zerolog/log"
"github.com/urfave/cli/v2"
)
// Version is the application version string; main copies it into the CLI
// app's Version field. Nothing in this file assigns it, so it stays empty
// unless set elsewhere (presumably injected at build time via
// -ldflags "-X main.Version=..." — confirm against the build scripts).
var Version string
// main is the program entry point. It configures the global zerolog logger
// (console output on stderr, info level), installs a short "file:line" caller
// formatter, builds the urfave/cli application and runs it with os.Args.
// Any error returned by the application is logged at fatal level.
func main() {
	// Decide on colour from the stream the console writer actually targets.
	// Logs go to stderr, so stderr (not stdout) must be the terminal check;
	// otherwise redirecting only one of the two streams yields ANSI escapes
	// in a file or colourless output on a real terminal.
	isTerm := isatty.IsTerminal(os.Stderr.Fd())
	zerolog.SetGlobalLevel(zerolog.InfoLevel)
	log.Logger = log.Output(
		zerolog.ConsoleWriter{
			Out:     os.Stderr,
			NoColor: !isTerm,
		},
	)

	// Render caller information as "<base file name>:<line>" instead of the
	// full source path.
	zerolog.CallerMarshalFunc = func(pc uintptr, file string, line int) string {
		short := file
		for i := len(file) - 1; i > 0; i-- {
			if file[i] == '/' {
				short = file[i+1:]
				break
			}
		}
		return short + ":" + strconv.Itoa(line)
	}

	app := cli.NewApp()
	app.Name = "go-whisper-api"
	app.Usage = "HTTP API for speech-to-text (SPR + OpenAI-compatible)."
	app.Copyright = "Copyright (c) " + strconv.Itoa(time.Now().Year()) + " Bo-Yi Wu"
	app.Authors = []*cli.Author{
		{
			Name:  "Bo-Yi Wu",
			Email: "appleboy.tw@gmail.com",
		},
	}
	app.Version = Version

	// NOTE(review): the "serve" command registers only serveFlags(), while
	// configFlag() and punctuationFlags() are registered on the app alone.
	// Confirm that passing those flags after "serve" is not expected to work.
	app.Commands = []*cli.Command{
		{
			Name:    "serve",
			Aliases: []string{"s", "api"},
			Usage:   "start HTTP API server",
			Flags:   serveFlags(),
			Action:  runServe,
		},
	}

	// Running the binary without a command behaves like "serve": the app
	// carries the config, punctuation and serve flags and the same action.
	app.Flags = append(append([]cli.Flag{configFlag()}, punctuationFlags()...), serveFlags()...)
	app.Action = runServe

	if err := app.Run(os.Args); err != nil {
		log.Fatal().Err(err).Msg("can't run app")
	}
}
// configFlag builds the --config string flag that names a YAML config file.
// The value may also come from the CONFIG_PATH or GO_WHISPER_CONFIG
// environment variables. No default value is set on the flag itself.
func configFlag() cli.Flag {
	flag := &cli.StringFlag{
		Name:    "config",
		Usage:   "path to YAML config file (default: config.yaml if present)",
		EnvVars: []string{"CONFIG_PATH", "GO_WHISPER_CONFIG"},
	}
	return flag
}
// punctuationFlags returns the CLI flags that control punctuation handling,
// in display order: the master switch, the default-on toggle and the engine
// selector. Only the master switch and the engine are bound to environment
// variables; punctuation-default-on has no environment binding.
func punctuationFlags() []cli.Flag {
	enabled := &cli.BoolFlag{
		Name:    "punctuation-enabled",
		Usage:   "master switch: enable or disable punctuation (YAML: punctuation.enabled)",
		EnvVars: []string{"PUNCTUATION_ENABLED"},
	}
	defaultOn := &cli.BoolFlag{
		Name:  "punctuation-default-on",
		Usage: "apply punctuation by default when query/flags omitted (YAML: punctuation.default_on)",
	}
	engine := &cli.StringFlag{
		Name:    "punctuation-engine",
		Usage:   "punctuation engine: off, heuristic, sherpa, sherpa-online, http, xlm",
		EnvVars: []string{"PUNCTUATION_ENGINE"},
	}

	return []cli.Flag{enabled, defaultOn, engine}
}
// vadFlags returns the CLI flags for voice activity detection (VAD). Every
// flag is bound to an API_VAD* environment variable and none of them sets an
// explicit default value here.
func vadFlags() []cli.Flag {
	enabled := &cli.BoolFlag{
		Name:    "vad",
		Usage:   "enable voice activity detection (Silero VAD)",
		EnvVars: []string{"API_VAD"},
	}
	model := &cli.StringFlag{
		Name:    "vad-model",
		Usage:   "path to ggml Silero VAD model (e.g. models/ggml-silero-v6.2.0.bin)",
		EnvVars: []string{"API_VAD_MODEL"},
	}
	threshold := &cli.Float64Flag{
		Name:    "vad-threshold",
		Usage:   "VAD speech threshold (0.0-1.0)",
		EnvVars: []string{"API_VAD_THRESHOLD"},
	}
	minSpeech := &cli.IntFlag{
		Name:    "vad-min-speech-ms",
		Usage:   "minimum speech duration in ms",
		EnvVars: []string{"API_VAD_MIN_SPEECH_MS"},
	}
	minSilence := &cli.IntFlag{
		Name:    "vad-min-silence-ms",
		Usage:   "minimum silence between segments in ms",
		EnvVars: []string{"API_VAD_MIN_SILENCE_MS"},
	}
	maxSpeech := &cli.Float64Flag{
		Name:    "vad-max-speech-sec",
		Usage:   "maximum speech segment length in seconds (0 = unlimited)",
		EnvVars: []string{"API_VAD_MAX_SPEECH_SEC"},
	}
	speechPad := &cli.IntFlag{
		Name:    "vad-speech-pad-ms",
		Usage:   "padding around speech segments in ms",
		EnvVars: []string{"API_VAD_SPEECH_PAD_MS"},
	}
	overlap := &cli.Float64Flag{
		Name:    "vad-samples-overlap",
		Usage:   "overlap between VAD segments in seconds",
		EnvVars: []string{"API_VAD_SAMPLES_OVERLAP"},
	}

	return []cli.Flag{
		enabled,
		model,
		threshold,
		minSpeech,
		minSilence,
		maxSpeech,
		speechPad,
		overlap,
	}
}
// serveFlags returns the flags accepted by the HTTP API server: listen
// address, model and cache directories, whisper decoding options and the
// default-punctuation toggle, followed by every flag from vadFlags().
// The threads default is runtime.NumCPU(), evaluated on each call.
func serveFlags() []cli.Flag {
	var server []cli.Flag

	// Listener and storage locations.
	server = append(server,
		&cli.StringFlag{
			Name:    "addr",
			Usage:   "HTTP listen address",
			EnvVars: []string{"API_ADDR"},
			Value:   ":8080",
		},
		&cli.StringFlag{
			Name:    "models-dir",
			Usage:   "directory with ggml *.bin whisper models",
			EnvVars: []string{"API_MODELS_DIR"},
			Value:   "./models",
		},
		&cli.StringFlag{
			Name:    "cache-dir",
			Usage:   "directory for async task cache (waiting/ready)",
			EnvVars: []string{"API_CACHE_DIR"},
			Value:   "./cache",
		},
	)

	// Recognition behaviour.
	server = append(server,
		&cli.StringFlag{
			Name:    "language",
			Usage:   "default language for speech recognition",
			EnvVars: []string{"API_LANGUAGE"},
			Value:   "auto",
		},
		&cli.UintFlag{
			Name:    "threads",
			Usage:   "number of threads for whisper",
			EnvVars: []string{"API_THREADS"},
			Value:   uint(runtime.NumCPU()),
		},
		&cli.BoolFlag{
			Name:    "debug",
			Usage:   "enable debug mode",
			EnvVars: []string{"API_DEBUG"},
		},
		&cli.BoolFlag{
			Name:    "speedup",
			Usage:   "speed up audio by x2",
			EnvVars: []string{"API_SPEEDUP"},
		},
		&cli.BoolFlag{
			Name:    "translate",
			Usage:   "translate to english",
			EnvVars: []string{"API_TRANSLATE"},
		},
		&cli.StringFlag{
			Name:    "prompt",
			Usage:   "initial prompt",
			EnvVars: []string{"API_PROMPT"},
		},
	)

	// Decoder tuning and punctuation default.
	server = append(server,
		&cli.UintFlag{
			Name:    "max-context",
			Usage:   "maximum text context tokens",
			EnvVars: []string{"API_MAX_CONTEXT"},
			Value:   32,
		},
		&cli.UintFlag{
			Name:    "beam-size",
			Usage:   "beam size for beam search",
			EnvVars: []string{"API_BEAM_SIZE"},
			Value:   5,
		},
		&cli.Float64Flag{
			Name:    "entropy-thold",
			Usage:   "entropy threshold",
			EnvVars: []string{"API_ENTROPY_THOLD"},
			Value:   2.4,
		},
		&cli.BoolFlag{
			Name:    "default-punctuation",
			Usage:   "enable punctuation on STT by default",
			EnvVars: []string{"API_DEFAULT_PUNCTUATION"},
		},
	)

	// VAD flags always come last.
	vad := vadFlags()
	all := make([]cli.Flag, 0, len(server)+len(vad))
	all = append(all, server...)
	all = append(all, vad...)
	return all
}
// runServe is the CLI action that starts the API. It loads the API, transcode,
// punctuation and diarization configurations from the CLI context, in that
// order, and hands them to api.Run together with the context's
// context.Context. The first configuration error encountered is returned
// unchanged.
//
// When the API config has Debug set, the global log level is lowered to debug
// and caller information is added to the global logger. When VAD is enabled
// (after WithDefaults), its model path is resolved against the models
// directory and the settings are validated before being stored back on the
// API config; a disabled VAD config is left as loaded.
func runServe(c *cli.Context) error {
	cfg, err := config.APIFromCLI(c)
	if err != nil {
		return err
	}

	if cfg.Debug {
		zerolog.SetGlobalLevel(zerolog.DebugLevel)
		log.Logger = log.With().Caller().Logger()
	}

	if vadCfg := cfg.VAD.WithDefaults(); vadCfg.Enabled {
		vadCfg.Model = vadCfg.ResolveModelPath(cfg.ModelsDir)
		if vadErr := vadCfg.Validate(); vadErr != nil {
			return vadErr
		}
		cfg.VAD = vadCfg
	}

	transcodeCfg, err := config.TranscodeFromCLI(c)
	if err != nil {
		return err
	}

	punctuationCfg, err := config.PunctuationFromCLI(c)
	if err != nil {
		return err
	}

	diarizationCfg, err := config.DiarizationFromCLI(c)
	if err != nil {
		return err
	}

	return api.Run(c.Context, cfg, transcodeCfg, punctuationCfg, diarizationCfg)
}