admin b5c083e06f
Some checks failed
CodeQL / Analyze (go) (push) Successful in 6m28s
Docker Image / build-docker (push) Failing after 13m26s
Lint and Testing / lint (push) Successful in 11m17s
Lint and Testing / test (push) Successful in 11m17s
Lint and Testing / golangci (push) Successful in 2m40s
first commit
2026-06-04 18:10:52 +07:00

57 lines
1.3 KiB
Go

package garbage
import (
"regexp"
"strings"
)
// spaceCollapse matches a run of one or more whitespace characters. In Go's
// regexp syntax `\s` is the ASCII class [\t\n\f\r ], so other Unicode spaces
// are not matched. It is compiled once when the package is initialized.
var spaceCollapse = regexp.MustCompile(`\s+`)
// Word is a timed token for garbage filtering (mirrors whisper.Word JSON shape).
//
// Of the three fields, the filtering code visible in this file inspects only
// Word; Start and Stop are carried through unchanged on the tokens it keeps.
type Word struct {
// Word is the token text; encoded in JSON as "word".
Word string `json:"word"`
// Start is encoded in JSON as "start". Presumably the token's start
// position in time; the unit is not visible here (TODO: confirm against
// whisper.Word).
Start int `json:"start"`
// Stop is encoded in JSON as "stop". Presumably the token's end position,
// in the same unit as Start (TODO: confirm against whisper.Word).
Stop int `json:"stop"`
}
// FilterText strips every non-blank pattern from text and tidies the spacing.
//
// Patterns are applied in slice order as literal substrings, not as regular
// expressions. Each pattern is trimmed first, and a pattern that is empty
// after trimming is ignored. Every occurrence is replaced by a single space so
// that the neighbouring words do not fuse together. Afterwards each whitespace
// run matched by spaceCollapse becomes one space and the result is trimmed.
func FilterText(text string, patterns []string) string {
	cleaned := text
	for i := range patterns {
		artifact := strings.TrimSpace(patterns[i])
		if artifact != "" {
			cleaned = strings.ReplaceAll(cleaned, artifact, " ")
		}
	}
	collapsed := spaceCollapse.ReplaceAllString(cleaned, " ")
	return strings.TrimSpace(collapsed)
}
// FilterWords returns the tokens whose text matches none of the garbage
// patterns.
//
// Matching is delegated to matchesGarbage and applied to each token's Word
// field. The relative order of the kept tokens is preserved. An empty or nil
// input is returned as-is; otherwise the result is a freshly allocated slice
// and words itself is not modified.
func FilterWords(words []Word, patterns []string) []Word {
	n := len(words)
	if n == 0 {
		return words
	}
	kept := make([]Word, 0, n)
	for i := range words {
		if !matchesGarbage(words[i].Word, patterns) {
			kept = append(kept, words[i])
		}
	}
	return kept
}
// matchesGarbage reports whether word contains any of the garbage patterns.
//
// Both word and each pattern are trimmed of surrounding whitespace before
// comparison, and a pattern that is empty after trimming never matches. The
// check is a case-sensitive substring test, which also covers the case where
// the word equals the pattern exactly.
func matchesGarbage(word string, patterns []string) bool {
	token := strings.TrimSpace(word)
	for i := range patterns {
		needle := strings.TrimSpace(patterns[i])
		if needle != "" && strings.Contains(token, needle) {
			return true
		}
	}
	return false
}