Some checks failed
CodeQL / Analyze (go) (push) Successful in 6m28s
Docker Image / build-docker (push) Failing after 13m26s
Lint and Testing / lint (push) Successful in 11m17s
Lint and Testing / test (push) Successful in 11m17s
Lint and Testing / golangci (push) Successful in 2m40s
57 lines
1.3 KiB
Go
57 lines
1.3 KiB
Go
package garbage
|
|
|
|
import (
|
|
"regexp"
|
|
"strings"
|
|
)
|
|
|
|
// spaceCollapse matches one or more consecutive whitespace characters (`\s+`).
// FilterText uses it to squeeze each such run down to a single space.
var spaceCollapse = regexp.MustCompile(`\s+`)
|
|
|
|
// Word is a single timed token considered by the garbage filter. Its JSON
// field names ("word", "start", "stop") mirror the whisper.Word JSON shape.
type Word struct {
	// Word is the token text; FilterWords checks it against the patterns.
	Word string `json:"word"`
	// Start is the token's start position.
	// NOTE(review): units are not visible in this file — confirm against whisper.Word.
	Start int `json:"start"`
	// Stop is the token's end position, in the same units as Start.
	Stop int `json:"stop"`
}
|
|
|
|
// FilterText removes configured artifact substrings and normalizes whitespace.
|
|
func FilterText(text string, patterns []string) string {
|
|
for _, p := range patterns {
|
|
p = strings.TrimSpace(p)
|
|
if p == "" {
|
|
continue
|
|
}
|
|
text = strings.ReplaceAll(text, p, " ")
|
|
}
|
|
return strings.TrimSpace(spaceCollapse.ReplaceAllString(text, " "))
|
|
}
|
|
|
|
// FilterWords drops tokens that match any garbage pattern.
|
|
func FilterWords(words []Word, patterns []string) []Word {
|
|
if len(words) == 0 {
|
|
return words
|
|
}
|
|
out := make([]Word, 0, len(words))
|
|
for _, w := range words {
|
|
if matchesGarbage(w.Word, patterns) {
|
|
continue
|
|
}
|
|
out = append(out, w)
|
|
}
|
|
return out
|
|
}
|
|
|
|
// matchesGarbage reports whether word, once trimmed of surrounding
// whitespace, contains any of the given patterns as a substring.
//
// Patterns are trimmed as well, and patterns that are empty after trimming
// are skipped so they can never match.
func matchesGarbage(word string, patterns []string) bool {
	token := strings.TrimSpace(word)
	for i := range patterns {
		needle := strings.TrimSpace(patterns[i])
		if needle == "" {
			continue
		}
		// Equality is just a special case of containment, so a single
		// substring check covers both exact and partial matches.
		if strings.Contains(token, needle) {
			return true
		}
	}
	return false
}
|