156 lines
4.0 KiB
Go
156 lines
4.0 KiB
Go
package whisper
|
|
|
|
import (
|
|
"fmt"
|
|
"strings"
|
|
"time"
|
|
|
|
wpkg "github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
|
|
)
|
|
|
|
// Turn describes one diarization interval during which a single speaker is
// active. Start and End are offsets in seconds.
type Turn struct {
	// Start and End bound the interval, in seconds.
	Start, End float32
	// Speaker is the speaker index for this interval.
	Speaker int
}
|
|
|
|
// FormatOptions configures how FormatSegments merges Whisper segments into a
// single newline-separated string.
type FormatOptions struct {
	// PauseGap is the minimum gap between the end of one segment and the
	// start of the next that puts the next segment on a new line. Values
	// <= 0 make FormatSegments use 1500 ms.
	PauseGap time.Duration
	// SpeakerLabel is the word written before the speaker number. When it is
	// blank after trimming, FormatSegments uses "Спикер".
	SpeakerLabel string
	// UseSpeakers enables speaker attribution from diarization turns and the
	// "<label> <n>: " prefixes.
	UseSpeakers bool
}
|
|
|
|
// FormatSegments joins segment texts with \n on long pauses and optional speaker labels.
|
|
func FormatSegments(segments []wpkg.Segment, turns []Turn, opts FormatOptions) string {
|
|
if len(segments) == 0 {
|
|
return ""
|
|
}
|
|
if opts.PauseGap <= 0 {
|
|
opts.PauseGap = 1500 * time.Millisecond
|
|
}
|
|
label := strings.TrimSpace(opts.SpeakerLabel)
|
|
if label == "" {
|
|
label = "Спикер"
|
|
}
|
|
|
|
lines := make([]segmentLine, len(segments))
|
|
for i, seg := range segments {
|
|
lines[i] = segmentLine{
|
|
Text: strings.TrimSpace(seg.Text),
|
|
Start: seg.Start,
|
|
End: seg.End,
|
|
Speaker: -1,
|
|
}
|
|
}
|
|
if opts.UseSpeakers && len(turns) > 0 {
|
|
assignSpeakers(lines, turns)
|
|
}
|
|
|
|
var b strings.Builder
|
|
prevSpeaker := -2
|
|
prevIdx := -1
|
|
for i, line := range lines {
|
|
if line.Text == "" {
|
|
continue
|
|
}
|
|
if b.Len() > 0 && prevIdx >= 0 {
|
|
speakerBreak := opts.UseSpeakers && line.Speaker >= 0 && line.Speaker != prevSpeaker
|
|
pauseBreak := line.Start-lines[prevIdx].End >= opts.PauseGap
|
|
switch {
|
|
case speakerBreak:
|
|
b.WriteString("\n\n")
|
|
fmt.Fprintf(&b, "%s %d: ", label, line.Speaker+1)
|
|
case pauseBreak:
|
|
b.WriteString("\n")
|
|
default:
|
|
if !strings.HasSuffix(b.String(), " ") && !strings.HasSuffix(b.String(), "\n") {
|
|
b.WriteByte(' ')
|
|
}
|
|
}
|
|
} else if opts.UseSpeakers && line.Speaker >= 0 {
|
|
fmt.Fprintf(&b, "%s %d: ", label, line.Speaker+1)
|
|
}
|
|
b.WriteString(line.Text)
|
|
if line.Speaker >= 0 {
|
|
prevSpeaker = line.Speaker
|
|
}
|
|
prevIdx = i
|
|
}
|
|
return strings.TrimSpace(b.String())
|
|
}
|
|
|
|
// segmentLine is the per-segment working record used while formatting: the
// trimmed text, its time span, and the speaker assigned to it.
type segmentLine struct {
	Text string
	// Start and End are the segment's time offsets.
	Start, End time.Duration
	// Speaker is the assigned speaker index; FormatSegments treats negative
	// values as "not attributed".
	Speaker int
}
|
|
|
|
func assignSpeakers(lines []segmentLine, turns []Turn) {
|
|
for i := range lines {
|
|
mid := lines[i].Start + (lines[i].End-lines[i].Start)/2
|
|
lines[i].Speaker = speakerAt(mid, turns)
|
|
}
|
|
}
|
|
|
|
func speakerAt(t time.Duration, turns []Turn) int {
|
|
sec := float32(t.Seconds())
|
|
bestSpeaker := -1
|
|
bestOverlap := float32(0)
|
|
for _, tr := range turns {
|
|
if sec >= tr.Start && sec < tr.End {
|
|
return tr.Speaker
|
|
}
|
|
overlap := intervalOverlap(sec, sec, tr.Start, tr.End)
|
|
if overlap > bestOverlap {
|
|
bestOverlap = overlap
|
|
bestSpeaker = tr.Speaker
|
|
}
|
|
}
|
|
return bestSpeaker
|
|
}
|
|
|
|
// intervalOverlap returns the length of the intersection of the intervals
// [a0, a1] and [b0, b1], or 0 when they are disjoint or only touch.
func intervalOverlap(a0, a1, b0, b1 float32) float32 {
	// The intersection begins at the later of the two starts...
	lo := b0
	if a0 > b0 {
		lo = a0
	}
	// ...and finishes at the earlier of the two ends.
	hi := b1
	if a1 < b1 {
		hi = a1
	}
	if hi <= lo {
		return 0
	}
	return hi - lo
}
|
|
|
|
// max32 returns a when a > b and b otherwise (so b is returned whenever the
// comparison is false, including when either operand is NaN).
func max32(a, b float32) float32 {
	larger := b
	if a > b {
		larger = a
	}
	return larger
}
|
|
|
|
// min32 returns a when a < b and b otherwise (so b is returned whenever the
// comparison is false, including when either operand is NaN).
func min32(a, b float32) float32 {
	smaller := b
	if a < b {
		smaller = a
	}
	return smaller
}
|
|
|
|
// PunctuateSegments runs punctuation per Whisper segment (legacy helper).
|
|
// Prefer punctuating the full transcript after FormatSegments (see Engine.Result).
|
|
func PunctuateSegments(segments []wpkg.Segment, restore func(text string) (string, error)) ([]wpkg.Segment, error) {
|
|
out := make([]wpkg.Segment, len(segments))
|
|
copy(out, segments)
|
|
for i := range out {
|
|
t := strings.TrimSpace(out[i].Text)
|
|
if t == "" {
|
|
continue
|
|
}
|
|
p, err := restore(t)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
out[i].Text = " " + strings.TrimSpace(p)
|
|
}
|
|
return out, nil
|
|
}
|