first commit
Some checks failed
Docker Image / build-docker (push) Failing after 1m26s
Lint and Testing / lint (push) Successful in 43s
Lint and Testing / test (push) Successful in 5m38s
Lint and Testing / golangci (push) Successful in 1m14s
CodeQL / Analyze (go) (push) Successful in 6m23s

This commit is contained in:
admin 2026-06-04 19:25:56 +07:00
parent b5c083e06f
commit 318b736244
8 changed files with 832 additions and 373 deletions

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,131 @@
package apidoc_test
import (
"encoding/json"
"os"
"path/filepath"
"strings"
"testing"
)
// swaggerPath resolves ../../api/swagger.json (relative to the test's working
// directory) to an absolute path and returns it. The calling test is aborted
// if the path cannot be made absolute or the file cannot be stat'ed.
func swaggerPath(t *testing.T) string {
	t.Helper()

	specPath, absErr := filepath.Abs(filepath.Join("..", "..", "api", "swagger.json"))
	if absErr != nil {
		t.Fatal(absErr)
	}

	_, statErr := os.Stat(specPath)
	if statErr != nil {
		t.Fatal(statErr)
	}

	return specPath
}
// documentedPaths lists the (path, method) pairs that
// TestSwaggerJSON_validAndComplete requires to be present under "paths" in
// swagger.json. Methods are written in lower case because they are looked up
// verbatim as keys of each path object.
var documentedPaths = []struct {
path string
method string
}{
{"/swagger.json", "get"},
{"/spr/models", "get"},
{"/spr/hostname", "get"},
{"/spr/queue", "get"},
{"/spr/queue/{taskID}", "get"},
{"/spr/queue/{taskID}", "delete"},
{"/spr/stt/{id}", "post"},
{"/spr/result/{taskID}", "get"},
{"/spr/audio/{taskID}", "get"},
{"/spr/waveform/{taskID}", "get"},
{"/spr/import/{id}", "post"},
{"/spr/export/{id}", "get"},
{"/spr/delete/{id}", "delete"},
{"/v1/models", "get"},
{"/v1/audio/transcriptions", "post"},
// The trailing-slash variant is expected as its own entry in the spec.
{"/v1/audio/transcriptions/", "post"},
}
// TestSwaggerJSON_validAndComplete checks that swagger.json parses as JSON,
// declares Swagger version "2.0", and contains every path/method pair listed
// in documentedPaths.
func TestSwaggerJSON_validAndComplete(t *testing.T) {
	raw, err := os.ReadFile(swaggerPath(t))
	if err != nil {
		t.Fatal(err)
	}

	// Only the parts of the document the assertions need are decoded; each
	// operation body stays as raw JSON.
	type swaggerDoc struct {
		Swagger string                                `json:"swagger"`
		Paths   map[string]map[string]json.RawMessage `json:"paths"`
		Info    struct {
			Title string `json:"title"`
		} `json:"info"`
	}

	var doc swaggerDoc
	if err = json.Unmarshal(raw, &doc); err != nil {
		t.Fatalf("invalid JSON: %v", err)
	}
	if doc.Swagger != "2.0" {
		t.Fatalf("swagger version: got %q", doc.Swagger)
	}

	for _, entry := range documentedPaths {
		operations, found := doc.Paths[entry.path]
		if !found {
			t.Errorf("missing path %s", entry.path)
			continue
		}
		if _, found = operations[entry.method]; found {
			continue
		}
		t.Errorf("path %s missing method %s", entry.path, entry.method)
	}
}
// TestSwaggerJSON_sttQueryParamsMatchImplementation inspects the POST
// operation of /spr/stt/{id} in swagger.json: a fixed set of query parameters
// must be documented, and another fixed set must not be.
func TestSwaggerJSON_sttQueryParamsMatchImplementation(t *testing.T) {
	raw, err := os.ReadFile(swaggerPath(t))
	if err != nil {
		t.Fatal(err)
	}

	type parameter struct {
		Name string `json:"name"`
		In   string `json:"in"`
	}
	type pathItem struct {
		Post struct {
			Parameters []parameter `json:"parameters"`
		} `json:"post"`
	}
	var doc struct {
		Paths map[string]pathItem `json:"paths"`
	}
	if err = json.Unmarshal(raw, &doc); err != nil {
		t.Fatal(err)
	}

	// Collect the names of parameters declared with in == "query". A missing
	// path yields the zero pathItem and therefore an empty list.
	var documented []string
	for _, param := range doc.Paths["/spr/stt/{id}"].Post.Parameters {
		if param.In != "query" {
			continue
		}
		documented = append(documented, param.Name)
	}

	expected := []string{"async", "language", "punctuation", "speakers", "speaker_counter"}
	for _, name := range expected {
		if contains(documented, name) {
			continue
		}
		t.Errorf("STT missing query param %q", name)
	}

	unwanted := []string{"webhook", "toxicity", "normalization", "vad", "classifiers"}
	for _, name := range unwanted {
		if !contains(documented, name) {
			continue
		}
		t.Errorf("STT documents unused param %q", name)
	}
}
// TestSwaggerJSON_hasOpenAIPaths verifies that the raw text of swagger.json
// mentions the /v1/audio/transcriptions path somewhere.
func TestSwaggerJSON_hasOpenAIPaths(t *testing.T) {
	raw, readErr := os.ReadFile(swaggerPath(t))
	if readErr != nil {
		t.Fatal(readErr)
	}

	if strings.Contains(string(raw), "/v1/audio/transcriptions") {
		return
	}
	t.Fatal("missing OpenAI transcription path")
}
// contains reports whether s is an element of ss.
func contains(ss []string, s string) bool {
	for i := 0; i < len(ss); i++ {
		if ss[i] != s {
			continue
		}
		return true
	}
	return false
}

View File

@ -69,14 +69,6 @@ func heuristicEN(s string) string {
return s
}
// hasTerminalPunct reports whether s, with surrounding whitespace ignored,
// ends in '.', '?', '!' or '…'. An empty or all-whitespace string yields false.
func hasTerminalPunct(s string) bool {
	trimmed := strings.TrimSpace(s)
	if len(trimmed) == 0 {
		return false
	}
	last, _ := utf8.DecodeLastRuneInString(trimmed)
	switch last {
	case '.', '?', '!', '…':
		return true
	default:
		return false
	}
}
func ensureTerminalPunct(s string) string {
if hasTerminalPunct(s) {

70
punctuation/normalize.go Normal file
View File

@ -0,0 +1,70 @@
package punctuation
import (
"strings"
"unicode"
"unicode/utf8"
)
// terminalPunctRunes lists the marks after which no additional sentence-final
// period is appended. Besides sentence enders it includes comma-like marks and
// closing brackets/quotes.
//
// Non-ASCII entries that are easy to lose in transit are spelled as \u escapes.
// NOTE(review): the four escaped entries replace rune literals that were empty
// (and therefore did not compile); the chosen code points are a best-effort
// reconstruction — confirm against the intended set.
var terminalPunctRunes = map[rune]bool{
	'.': true, '?': true, '!': true, '…': true,
	',': true, ';': true, ':': true,
	')': true, ']': true, '"': true, '\'': true,
	'»': true, '”': true,
	'\u2019': true, // right single quotation mark
	'。':      true,
	'\uFF1F': true, // fullwidth question mark
	'\uFF01': true, // fullwidth exclamation mark
	'\uFF0C': true, // fullwidth comma
}
// CleanExcessive collapses duplicate and conflicting punctuation marks.
func CleanExcessive(s string) string {
s = strings.TrimSpace(s)
if s == "" {
return s
}
var b strings.Builder
b.Grow(len(s))
prevClass := 0 // 0 none, 1 comma-like, 2 end, 3 other punct
for i := 0; i < len(s); {
r, size := utf8.DecodeRuneInString(s[i:])
cls := punctClass(r)
if cls != 0 && cls == prevClass {
i += size
continue
}
if cls == 2 && prevClass == 1 {
// drop sentence end right after comma-like (e.g. "привет,.")
i += size
continue
}
b.WriteRune(r)
if cls != 0 {
prevClass = cls
} else if !unicode.IsSpace(r) {
prevClass = 0
}
i += size
}
return strings.TrimSpace(b.String())
}
// punctClass assigns r to one of the classes used by CleanExcessive:
//
//	0 - not punctuation
//	1 - comma-like separator
//	2 - sentence-ending mark
//	3 - any other rune for which unicode.IsPunct is true
//
// Non-ASCII marks are written as \u escapes so they cannot be lost or visually
// reordered. NOTE(review): the fullwidth comma, question mark and exclamation
// mark replace rune literals that were empty (and did not compile); confirm
// they are the intended characters.
func punctClass(r rune) int {
	switch r {
	case ',', ';', ':',
		'\uFF0C', // fullwidth comma
		'\u3001', // ideographic comma
		'\u060C', // Arabic comma
		'\u061B': // Arabic semicolon
		return 1
	case '.', '?', '!',
		'\u2026', // horizontal ellipsis
		'\u3002', // ideographic full stop
		'\uFF1F', // fullwidth question mark
		'\uFF01': // fullwidth exclamation mark
		return 2
	}
	if unicode.IsPunct(r) {
		return 3
	}
	return 0
}
// hasTerminalPunct reports whether the last rune of s, after trimming
// surrounding whitespace, is listed in terminalPunctRunes. Empty or
// all-whitespace input yields false.
func hasTerminalPunct(s string) bool {
	trimmed := strings.TrimSpace(s)
	if len(trimmed) == 0 {
		return false
	}
	last, _ := utf8.DecodeLastRuneInString(trimmed)
	listed, known := terminalPunctRunes[last]
	return known && listed
}

View File

@ -0,0 +1,46 @@
package punctuation
import (
"context"
"strings"
"testing"
)
// TestCleanExcessive runs CleanExcessive over a table of inputs with repeated
// or conflicting punctuation and compares each result with the expected text.
func TestCleanExcessive(t *testing.T) {
	type testCase struct {
		in, want string
	}
	table := []testCase{
		{"привет,,", "привет,"},
		{"привет,.", "привет,"},
		{"hello..", "hello."},
		{"what??", "what?"},
		{"ok!!!", "ok!"},
		{"а. б. в.", "а. б. в."},
	}
	for i := range table {
		input, expected := table[i].in, table[i].want
		if actual := CleanExcessive(input); actual != expected {
			t.Errorf("CleanExcessive(%q) = %q, want %q", input, actual, expected)
		}
	}
}
// TestHasTerminalPunct_comma asserts that a trailing comma is treated as
// terminal punctuation while a bare word is not.
func TestHasTerminalPunct_comma(t *testing.T) {
	if withComma := hasTerminalPunct("привет,"); !withComma {
		t.Fatal("comma should count as terminal for heuristic")
	}
	if bareWord := hasTerminalPunct("привет"); bareWord {
		t.Fatal("bare word should not")
	}
}
// TestHeuristic_noCommaPeriod restores punctuation for a short Russian phrase
// with a zero-value Heuristic and fails if the output contains ",.".
func TestHeuristic_noCommaPeriod(t *testing.T) {
	var h Heuristic
	restored, err := h.Restore(context.Background(), "привет, мир", "ru")
	if err != nil {
		t.Fatal(err)
	}
	if commaPeriod := strings.Contains(restored, ",."); commaPeriod {
		t.Fatalf("unexpected comma+period: %q", restored)
	}
}

View File

@ -116,7 +116,11 @@ func Apply(ctx context.Context, r Restorer, enabled bool, text, language string)
if text == "" {
return text, nil
}
return r.Restore(ctx, text, language)
out, err := r.Restore(ctx, text, language)
if err != nil {
return "", err
}
return CleanExcessive(out), nil
}
func Close(r Restorer) {

View File

@ -135,7 +135,8 @@ func min32(a, b float32) float32 {
return b
}
// PunctuateSegments runs punctuation on each segment separately (preserves line breaks).
// PunctuateSegments runs punctuation per Whisper segment (legacy helper).
// Prefer punctuating the full transcript after FormatSegments (see Engine.Result).
func PunctuateSegments(segments []wpkg.Segment, restore func(text string) (string, error)) ([]wpkg.Segment, error) {
out := make([]wpkg.Segment, len(segments))
copy(out, segments)

View File

@ -186,13 +186,12 @@ func (e *Engine) SetTranscriptText(text string) {
func (e *Engine) Result() TranscriptResult {
segments := e.segments
if e.runOpts.PunctuateRestore != nil {
updated, err := PunctuateSegments(segments, e.runOpts.PunctuateRestore)
if err == nil {
segments = updated
text := FormatSegments(segments, e.runOpts.Turns, e.runOpts.Format)
if e.runOpts.PunctuateRestore != nil && text != "" {
if updated, err := e.runOpts.PunctuateRestore(text); err == nil && strings.TrimSpace(updated) != "" {
text = updated
}
}
text := FormatSegments(segments, e.runOpts.Turns, e.runOpts.Format)
var words []Word
for _, segment := range segments {
words = append(words, segmentWords(segment)...)