79 lines
2.0 KiB
Go
79 lines
2.0 KiB
Go
package punctuation
|
|
|
|
import (
|
|
"context"
|
|
"regexp"
|
|
"strings"
|
|
"unicode"
|
|
"unicode/utf8"
|
|
)
|
|
|
|
// Heuristic is a rule-based punctuation restorer. It is an empty struct and
// carries no state; all behavior lives in its methods.
type Heuristic struct{}
|
|
|
|
func (Heuristic) Active() bool {
|
|
return true
|
|
}
|
|
|
|
func (Heuristic) Restore(ctx context.Context, text, language string) (string, error) {
|
|
_ = ctx
|
|
text = strings.TrimSpace(text)
|
|
if text == "" {
|
|
return text, nil
|
|
}
|
|
text = normalizeSpaces(text)
|
|
text = capitalizeFirst(text)
|
|
lang := strings.ToLower(strings.TrimSpace(language))
|
|
if lang == "ru" || lang == "rus" || lang == "russian" || lang == "auto" {
|
|
text = heuristicRU(text)
|
|
} else {
|
|
text = heuristicEN(text)
|
|
}
|
|
return ensureTerminalPunct(text), nil
|
|
}
|
|
|
|
// normalizeSpaces splits s on runs of whitespace and rejoins the pieces with
// a single space, which also drops leading and trailing whitespace.
func normalizeSpaces(s string) string {
	var b strings.Builder
	for i, word := range strings.Fields(s) {
		if i > 0 {
			b.WriteByte(' ')
		}
		b.WriteString(word)
	}
	return b.String()
}
|
|
|
|
// capitalizeFirst returns s with its first rune upper-cased. When the first
// rune decodes to utf8.RuneError (which includes the empty string), s is
// returned untouched.
func capitalizeFirst(s string) string {
	head, width := utf8.DecodeRuneInString(s)
	if head != utf8.RuneError {
		return string(unicode.ToUpper(head)) + s[width:]
	}
	return s
}
|
|
|
|
// Question-detection patterns, compiled once at package initialization.
var (
	// reQuestionEN matches, case-insensitively, text that begins with an
	// English interrogative or auxiliary word followed by a word boundary.
	reQuestionEN = regexp.MustCompile(`(?i)^(who|what|when|where|why|how|which|whose|whom|is|are|am|was|were|do|does|did|can|could|would|will|shall|should)\b`)

	// reQuestionRU matches, case-insensitively, text that contains a Russian
	// question word (at the start or after whitespace) followed by whitespace
	// and a tail, running to the end, that contains none of '.', '?', '!'.
	// NOTE(review): "зачем" is listed twice in the alternation; the repeat is
	// redundant.
	reQuestionRU = regexp.MustCompile(`(?i)(^|.*\s)(как|что|где|когда|почему|зачем|кто|чей|какой|какая|какое|какие|сколько|зачем|откуда|куда|ли)(\s+[^.?!]+)$`)
)
|
|
|
|
func heuristicRU(s string) string {
|
|
if reQuestionRU.MatchString(s) && !strings.HasSuffix(s, "?") {
|
|
return s + "?"
|
|
}
|
|
if !hasTerminalPunct(s) && len(strings.Fields(s)) <= 24 {
|
|
return s + "."
|
|
}
|
|
return s
|
|
}
|
|
|
|
func heuristicEN(s string) string {
|
|
lower := strings.ToLower(s)
|
|
if reQuestionEN.MatchString(lower) && !strings.HasSuffix(s, "?") {
|
|
return s + "?"
|
|
}
|
|
if !hasTerminalPunct(s) && len(strings.Fields(s)) <= 24 {
|
|
return s + "."
|
|
}
|
|
return s
|
|
}
|
|
|
|
|
|
func ensureTerminalPunct(s string) string {
|
|
if hasTerminalPunct(s) {
|
|
return s
|
|
}
|
|
return s + "."
|
|
}
|