admin 318b736244
Some checks failed
Docker Image / build-docker (push) Failing after 1m26s
Lint and Testing / lint (push) Successful in 43s
Lint and Testing / test (push) Successful in 5m38s
Lint and Testing / golangci (push) Successful in 1m14s
CodeQL / Analyze (go) (push) Successful in 6m23s
first commit
2026-06-04 19:25:56 +07:00

71 lines
1.8 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package punctuation
import (
"strings"
"unicode"
"unicode/utf8"
)
// terminalPunctRunes is the set of marks after which no additional
// sentence-final period is appended.
//
// Characters that are easily confused with ASCII punctuation are written as
// \u escapes so that they stay visible and survive copy/paste.
//
// NOTE(review): the four escaped keys replace rune literals that were empty
// (invisible) in the reviewed copy of this file. They were reconstructed from
// the neighbouring keys; confirm them against the original source.
var terminalPunctRunes = map[rune]bool{
	'.': true, '?': true, '!': true, '…': true,
	',': true, ';': true, ':': true,
	')': true, ']': true, '"': true, '\'': true,
	'»': true, '”': true,
	'\u2019': true, // right single quotation mark
	'。':      true,
	'\uFF1F': true, // fullwidth question mark
	'\uFF01': true, // fullwidth exclamation mark
	'\uFF0C': true, // fullwidth comma
}
// CleanExcessive collapses duplicate and conflicting punctuation marks.
func CleanExcessive(s string) string {
s = strings.TrimSpace(s)
if s == "" {
return s
}
var b strings.Builder
b.Grow(len(s))
prevClass := 0 // 0 none, 1 comma-like, 2 end, 3 other punct
for i := 0; i < len(s); {
r, size := utf8.DecodeRuneInString(s[i:])
cls := punctClass(r)
if cls != 0 && cls == prevClass {
i += size
continue
}
if cls == 2 && prevClass == 1 {
// drop sentence end right after comma-like (e.g. "привет,.")
i += size
continue
}
b.WriteRune(r)
if cls != 0 {
prevClass = cls
} else if !unicode.IsSpace(r) {
prevClass = 0
}
i += size
}
return strings.TrimSpace(b.String())
}
// punctClass classifies r for punctuation clean-up.
//
// It returns 1 for comma-like marks (comma, semicolon, colon and the
// fullwidth, ideographic and Arabic counterparts listed in the first case),
// 2 for sentence-ending marks (period, question mark, exclamation mark,
// ellipsis and the ideographic and fullwidth counterparts listed in the
// second case), 3 for any other rune for which unicode.IsPunct reports true,
// and 0 for everything else.
//
// NOTE(review): the three \u-escaped runes replace rune literals that were
// empty (invisible) in the reviewed copy of this file. They were
// reconstructed from the neighbouring marks; confirm them against the
// original source.
func punctClass(r rune) int {
	switch r {
	case ',', '\uFF0C' /* fullwidth comma */, '、', '،', ';', '؛', ':':
		return 1
	case '.', '?', '!', '…', '。', '\uFF1F' /* fullwidth ? */, '\uFF01' /* fullwidth ! */ :
		return 2
	}
	if unicode.IsPunct(r) {
		return 3
	}
	return 0
}
// hasTerminalPunct reports whether the last rune of s, ignoring surrounding
// whitespace, is a key of terminalPunctRunes mapped to true. It returns false
// for an empty or whitespace-only string.
func hasTerminalPunct(s string) bool {
	trimmed := strings.TrimSpace(s)
	if len(trimmed) == 0 {
		return false
	}
	last, _ := utf8.DecodeLastRuneInString(trimmed)
	terminal, known := terminalPunctRunes[last]
	return known && terminal
}