// Package punctuation provides helpers for normalising punctuation in text.
package punctuation
|
||
|
||
import (
|
||
"strings"
|
||
"unicode"
|
||
"unicode/utf8"
|
||
)
|
||
|
||
// terminalPunctRunes lists the marks after which no further sentence-final
// period is added. Besides sentence enders it contains comma-like marks,
// closing brackets and closing quotes.
//
// The full-width comma, question mark and exclamation mark are written as
// \u escapes: if they are folded to their ASCII look-alikes they repeat keys
// already present above, and duplicate constant keys in a map literal do not
// compile.
var terminalPunctRunes = map[rune]bool{
	'.': true, '?': true, '!': true, '…': true,
	',': true, ';': true, ':': true,
	')': true, ']': true, '"': true, '\'': true,
	'»': true, '”': true, '’': true,
	// Ideographic full stop, then full-width comma, question mark, exclamation mark.
	'。': true, '\uFF0C': true, '\uFF1F': true, '\uFF01': true,
}
|
||
|
||
// CleanExcessive collapses duplicate and conflicting punctuation marks.
|
||
func CleanExcessive(s string) string {
|
||
s = strings.TrimSpace(s)
|
||
if s == "" {
|
||
return s
|
||
}
|
||
var b strings.Builder
|
||
b.Grow(len(s))
|
||
prevClass := 0 // 0 none, 1 comma-like, 2 end, 3 other punct
|
||
for i := 0; i < len(s); {
|
||
r, size := utf8.DecodeRuneInString(s[i:])
|
||
cls := punctClass(r)
|
||
if cls != 0 && cls == prevClass {
|
||
i += size
|
||
continue
|
||
}
|
||
if cls == 2 && prevClass == 1 {
|
||
// drop sentence end right after comma-like (e.g. "привет,.")
|
||
i += size
|
||
continue
|
||
}
|
||
b.WriteRune(r)
|
||
if cls != 0 {
|
||
prevClass = cls
|
||
} else if !unicode.IsSpace(r) {
|
||
prevClass = 0
|
||
}
|
||
i += size
|
||
}
|
||
return strings.TrimSpace(b.String())
|
||
}
|
||
|
||
// punctClass classifies r for punctuation clean-up.
//
// It returns:
//
//	1 for comma-like marks (commas, semicolons and the colon, including the
//	  ideographic, Arabic and full-width forms listed below),
//	2 for sentence-ending marks (period, question mark, exclamation mark,
//	  ellipsis and their CJK / full-width forms),
//	3 for any other rune that unicode.IsPunct reports as punctuation,
//	0 for everything else.
//
// The full-width comma, question mark and exclamation mark are written as
// \u escapes: if they are folded to their ASCII look-alikes the case lists
// contain duplicate constants, which the compiler rejects, and the
// full-width forms would only ever be classified as 3.
func punctClass(r rune) int {
	switch r {
	case ',', '\uFF0C', '、', '،', ';', '؛', ':':
		return 1
	case '.', '?', '!', '…', '。', '\uFF1F', '\uFF01':
		return 2
	}
	if unicode.IsPunct(r) {
		return 3
	}
	return 0
}
|
||
|
||
func hasTerminalPunct(s string) bool {
|
||
s = strings.TrimSpace(s)
|
||
if s == "" {
|
||
return false
|
||
}
|
||
r, _ := utf8.DecodeLastRuneInString(s)
|
||
return terminalPunctRunes[r]
|
||
}
|