admin 318b736244
Some checks failed
Docker Image / build-docker (push) Failing after 1m26s
Lint and Testing / lint (push) Successful in 43s
Lint and Testing / test (push) Successful in 5m38s
Lint and Testing / golangci (push) Successful in 1m14s
CodeQL / Analyze (go) (push) Successful in 6m23s
first commit
2026-06-04 19:25:56 +07:00

79 lines
2.0 KiB
Go

package punctuation
import (
"context"
"regexp"
"strings"
"unicode"
"unicode/utf8"
)
// Heuristic is a rule-based punctuation restorer. It has no fields; its
// methods rely only on package-level helpers and precompiled patterns.
type Heuristic struct{}
// Active always returns true.
//
// NOTE(review): presumably callers use this to check whether the restorer is
// enabled — confirm against the interface this type is meant to satisfy.
func (Heuristic) Active() bool {
return true
}
// Restore tidies text and appends terminal punctuation using simple rules.
//
// The input is trimmed; an empty result is returned as-is. Otherwise runs of
// whitespace are collapsed to single spaces and the first letter is
// upper-cased. The language argument (trimmed, case-insensitive) selects the
// heuristic: "ru", "rus", "russian" and "auto" use the Russian rules, any
// other value uses the English rules. The result always passes through
// ensureTerminalPunct before being returned.
//
// The returned error is always nil.
func (Heuristic) Restore(ctx context.Context, text, language string) (string, error) {
	_ = ctx // no blocking work is done here, so the context is not consulted

	cleaned := strings.TrimSpace(text)
	if cleaned == "" {
		return cleaned, nil
	}
	cleaned = capitalizeFirst(normalizeSpaces(cleaned))

	switch strings.ToLower(strings.TrimSpace(language)) {
	case "ru", "rus", "russian", "auto":
		cleaned = heuristicRU(cleaned)
	default:
		cleaned = heuristicEN(cleaned)
	}
	return ensureTerminalPunct(cleaned), nil
}
// normalizeSpaces splits s on runs of Unicode whitespace and rejoins the
// pieces with a single ASCII space, which also drops leading and trailing
// whitespace.
func normalizeSpaces(s string) string {
	const sep = " "
	words := strings.Fields(s)
	return strings.Join(words, sep)
}
// capitalizeFirst returns s with its first rune upper-cased. When the first
// rune decodes to utf8.RuneError (which includes the empty string and a
// leading invalid byte), s is returned unchanged.
func capitalizeFirst(s string) string {
	first, width := utf8.DecodeRuneInString(s)
	if first != utf8.RuneError {
		head := string(unicode.ToUpper(first))
		return head + s[width:]
	}
	return s
}
// Question-detection patterns used by the language heuristics. Both are
// case-insensitive and compiled once at package initialization.
var (
	// reQuestionRU matches text in which a Russian interrogative word (or the
	// particle "ли") stands as a whole word — at the very start or right after
	// whitespace — and is followed by whitespace plus further text that contains
	// none of '.', '?' or '!' up to the end of the input. An interrogative that
	// is the last word therefore does not match.
	reQuestionRU = regexp.MustCompile(`(?i)(^|.*\s)(как|что|где|когда|почему|зачем|кто|чей|какой|какая|какое|какие|сколько|откуда|куда|ли)(\s+[^.?!]+)$`)

	// reQuestionEN matches text that begins with an English interrogative or
	// auxiliary verb ending at an ASCII word boundary, so "is it" matches but
	// "island" does not.
	reQuestionEN = regexp.MustCompile(`(?i)^(who|what|when|where|why|how|which|whose|whom|is|are|am|was|were|do|does|did|can|could|would|will|shall|should)\b`)
)
// heuristicRU appends terminal punctuation to s using the Russian rules:
//   - if reQuestionRU matches and s does not already end in "?", a "?" is added;
//   - otherwise, if s has no terminal punctuation and is at most 24
//     whitespace-separated words long, a "." is added;
//   - otherwise s is returned unchanged.
func heuristicRU(s string) string {
	switch {
	case reQuestionRU.MatchString(s) && !strings.HasSuffix(s, "?"):
		return s + "?"
	case !hasTerminalPunct(s) && len(strings.Fields(s)) <= 24:
		return s + "."
	default:
		return s
	}
}
func heuristicEN(s string) string {
lower := strings.ToLower(s)
if reQuestionEN.MatchString(lower) && !strings.HasSuffix(s, "?") {
return s + "?"
}
if !hasTerminalPunct(s) && len(strings.Fields(s)) <= 24 {
return s + "."
}
return s
}
// ensureTerminalPunct returns s unchanged when hasTerminalPunct reports that
// it is already punctuated, and s with a trailing "." otherwise.
func ensureTerminalPunct(s string) string {
	if !hasTerminalPunct(s) {
		s += "."
	}
	return s
}