package whisper import ( "fmt" "strings" "time" wpkg "github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper" ) // Turn is a speaker-active time range from diarization (seconds). type Turn struct { Start float32 End float32 Speaker int } // FormatOptions controls joining Whisper segments into one string with newlines. type FormatOptions struct { PauseGap time.Duration SpeakerLabel string UseSpeakers bool } // FormatSegments joins segment texts with \n on long pauses and optional speaker labels. func FormatSegments(segments []wpkg.Segment, turns []Turn, opts FormatOptions) string { if len(segments) == 0 { return "" } if opts.PauseGap <= 0 { opts.PauseGap = 1500 * time.Millisecond } label := strings.TrimSpace(opts.SpeakerLabel) if label == "" { label = "Спикер" } lines := make([]segmentLine, len(segments)) for i, seg := range segments { lines[i] = segmentLine{ Text: strings.TrimSpace(seg.Text), Start: seg.Start, End: seg.End, Speaker: -1, } } if opts.UseSpeakers && len(turns) > 0 { assignSpeakers(lines, turns) } var b strings.Builder prevSpeaker := -2 prevIdx := -1 for i, line := range lines { if line.Text == "" { continue } if b.Len() > 0 && prevIdx >= 0 { speakerBreak := opts.UseSpeakers && line.Speaker >= 0 && line.Speaker != prevSpeaker pauseBreak := line.Start-lines[prevIdx].End >= opts.PauseGap switch { case speakerBreak: b.WriteString("\n\n") fmt.Fprintf(&b, "%s %d: ", label, line.Speaker+1) case pauseBreak: b.WriteString("\n") default: if !strings.HasSuffix(b.String(), " ") && !strings.HasSuffix(b.String(), "\n") { b.WriteByte(' ') } } } else if opts.UseSpeakers && line.Speaker >= 0 { fmt.Fprintf(&b, "%s %d: ", label, line.Speaker+1) } b.WriteString(line.Text) if line.Speaker >= 0 { prevSpeaker = line.Speaker } prevIdx = i } return strings.TrimSpace(b.String()) } type segmentLine struct { Text string Start time.Duration End time.Duration Speaker int } func assignSpeakers(lines []segmentLine, turns []Turn) { for i := range lines { mid := lines[i].Start + (lines[i].End-lines[i].Start)/2 lines[i].Speaker = speakerAt(mid, turns) } } func speakerAt(t time.Duration, turns []Turn) int { sec := float32(t.Seconds()) bestSpeaker := -1 bestOverlap := float32(0) for _, tr := range turns { if sec >= tr.Start && sec < tr.End { return tr.Speaker } overlap := intervalOverlap(sec, sec, tr.Start, tr.End) if overlap > bestOverlap { bestOverlap = overlap bestSpeaker = tr.Speaker } } return bestSpeaker } func intervalOverlap(a0, a1, b0, b1 float32) float32 { start := max32(a0, b0) end := min32(a1, b1) if end <= start { return 0 } return end - start } func max32(a, b float32) float32 { if a > b { return a } return b } func min32(a, b float32) float32 { if a < b { return a } return b } // PunctuateSegments runs punctuation per Whisper segment (legacy helper). // Prefer punctuating the full transcript after FormatSegments (see Engine.Result). func PunctuateSegments(segments []wpkg.Segment, restore func(text string) (string, error)) ([]wpkg.Segment, error) { out := make([]wpkg.Segment, len(segments)) copy(out, segments) for i := range out { t := strings.TrimSpace(out[i].Text) if t == "" { continue } p, err := restore(t) if err != nil { return nil, err } out[i].Text = " " + strings.TrimSpace(p) } return out, nil }