go-whisper-api/transcode/mp4_aac_decode.go
admin b5c083e06f
Some checks failed
CodeQL / Analyze (go) (push) Successful in 6m28s
Docker Image / build-docker (push) Failing after 13m26s
Lint and Testing / lint (push) Successful in 11m17s
Lint and Testing / test (push) Successful in 11m17s
Lint and Testing / golangci (push) Successful in 2m40s
first commit
2026-06-04 18:10:52 +07:00

289 lines
9.2 KiB
Go

package transcode
import (
"errors"
"fmt"
"io"
"github.com/Eyevinn/mp4ff/mp4"
"github.com/olivier-w/climp-aac-decoder/aacfile"
aacdec "github.com/skrashevich/go-aac/pkg/decoder"
)
// mp4AACSample records where one audio sample lives in the input so that it
// can later be fetched with a single ReadAt call.
type mp4AACSample struct {
// offset is the byte position of the sample, measured from the start of the
// reader the file was demuxed from.
offset int64
// size is the length of the sample in bytes.
size int
}
// isMP4SampleDeltaError reports whether err wraps an
// *aacfile.UnsupportedFeatureError whose Feature is either "MP4 sample delta"
// or "MP4 sample delta layout". Any other error, including nil, yields false.
func isMP4SampleDeltaError(err error) bool {
	var unsupported *aacfile.UnsupportedFeatureError
	if errors.As(err, &unsupported) {
		switch unsupported.Feature {
		case "MP4 sample delta", "MP4 sample delta layout":
			return true
		}
	}
	return false
}
// decodeMP4AACRelaxed demuxes MP4/M4A with mp4ff (ignoring stts sample deltas) and
// decodes raw AAC frames with go-aac. Used when climp-aac-decoder rejects stts
// entries whose delta is not exactly 1024 (common in ffmpeg/phone muxers).
//
// r must expose the whole file and size is its length in bytes. On success the
// results are, in order: the decoded PCM as float64 samples, the sample rate
// taken from the decoder configuration, and the channel count of the returned
// data. Multi-channel audio is averaged down to mono, so the returned channel
// count is always 1. The leading per-channel samples reported by demuxMP4AAC
// are dropped from the start of the output.
func decodeMP4AACRelaxed(r io.ReaderAt, size int64) ([]float64, int, int, error) {
	asc, samples, leading, err := demuxMP4AAC(r, size)
	if err != nil {
		return nil, 0, 0, err
	}

	dec := aacdec.New()
	if err := dec.SetASC(asc); err != nil {
		return nil, 0, 0, fmt.Errorf("aac config: %w", err)
	}
	// NOTE(review): ChanConfig is used directly as the number of interleaved
	// channels in DecodeFrame's output. Confirm this holds for configurations
	// where the AAC channel-configuration index differs from the channel count.
	ch := dec.Config.ChanConfig
	if ch < 1 {
		return nil, 0, 0, fmt.Errorf("aac config: invalid channel count %d", ch)
	}
	sr := dec.Config.SampleRate
	if sr <= 0 {
		return nil, 0, 0, fmt.Errorf("aac config: invalid sample rate %d", sr)
	}

	// One scratch buffer sized for the largest sample is reused for every read.
	maxSize := 0
	for _, s := range samples {
		if s.size > maxSize {
			maxSize = s.size
		}
	}
	buf := make([]byte, maxSize)

	var pcm []float32
	for i, s := range samples {
		frame := buf[:s.size]
		// io.ReaderAt allows a read that ends exactly at the end of the input
		// to return io.EOF together with a full byte count, so success is
		// judged by the number of bytes read rather than by err alone.
		n, err := r.ReadAt(frame, s.offset)
		if n < len(frame) {
			if err == nil {
				err = io.ErrUnexpectedEOF
			}
			return nil, 0, 0, fmt.Errorf("read mp4 aac sample %d: %w", i, err)
		}
		out, err := dec.DecodeFrame(frame)
		if err != nil {
			return nil, 0, 0, fmt.Errorf("decode mp4 aac sample %d: %w", i, err)
		}
		pcm = append(pcm, out...)
	}

	// Drop the leading samples. The comparison is done in per-channel units so
	// that a huge (or negative) leading value coming from a crafted file can
	// never overflow leading*ch into an out-of-range slice index.
	skip := 0
	switch {
	case leading <= 0:
		// Nothing to trim.
	case leading > len(pcm)/ch:
		skip = len(pcm)
	default:
		skip = leading * ch
	}
	pcm = pcm[skip:]

	samplesF64 := make([]float64, len(pcm))
	for i, v := range pcm {
		samplesF64[i] = float64(v)
	}
	if ch > 1 {
		samplesF64 = float32InterleavedToMono(samplesF64, ch)
		ch = 1
	}
	return samplesF64, sr, ch, nil
}
// float32InterleavedToMono collapses interleaved multi-channel samples into a
// single channel by taking the arithmetic mean of each frame.
//
// When channels is 1 or less the input slice is returned as is. Otherwise a
// new slice of len(samples)/channels values is returned; samples left over
// after the last complete frame are ignored.
func float32InterleavedToMono(samples []float64, channels int) []float64 {
	if channels <= 1 {
		return samples
	}
	mono := make([]float64, len(samples)/channels)
	divisor := float64(channels)
	for f := range mono {
		var acc float64
		for _, v := range samples[f*channels : (f+1)*channels] {
			acc += v
		}
		mono[f] = acc / divisor
	}
	return mono
}
// demuxMP4AAC parses the MP4 container exposed by r (size bytes long) and
// extracts what is needed to decode its audio track.
//
// It returns a copy of the AudioSpecificConfig bytes found in the esds box,
// the location of every audio sample, and the leading trim reported by
// mp4LeadingTrimRelaxed. An error is returned when the file is fragmented,
// has no moov box, does not contain exactly one "soun" track, has more than
// one sample description, is encrypted, is not an mp4a entry, lacks an
// AudioSpecificConfig, or has an unusable or empty sample table.
func demuxMP4AAC(r io.ReaderAt, size int64) (asc []byte, samples []mp4AACSample, leading int, err error) {
	parsed, err := mp4.DecodeFile(io.NewSectionReader(r, 0, size), mp4.WithDecodeMode(mp4.DecModeLazyMdat))
	if err != nil {
		return nil, nil, 0, fmt.Errorf("mp4 decode: %w", err)
	}
	switch {
	case parsed.IsFragmented():
		return nil, nil, 0, errors.New("fragmented mp4 is not supported")
	case parsed.Moov == nil:
		return nil, nil, 0, errors.New("mp4: missing moov")
	}

	// Count the sound tracks and remember the first one; exactly one is allowed.
	var audio *mp4.TrakBox
	audioCount := 0
	for _, candidate := range parsed.Moov.Traks {
		if candidate == nil || candidate.Mdia == nil || candidate.Mdia.Hdlr == nil {
			continue
		}
		if candidate.Mdia.Hdlr.HandlerType != "soun" {
			continue
		}
		if audio == nil {
			audio = candidate
		}
		audioCount++
	}
	if audioCount != 1 {
		return nil, nil, 0, fmt.Errorf("mp4: expected one audio track, found %d", audioCount)
	}

	minf := audio.Mdia.Minf
	if minf == nil || minf.Stbl == nil || minf.Stbl.Stsd == nil {
		return nil, nil, 0, errors.New("mp4: incomplete audio track")
	}
	stsd := minf.Stbl.Stsd
	if len(stsd.Children) != 1 {
		return nil, nil, 0, errors.New("mp4: multiple sample descriptions")
	}
	if stsd.Enca != nil {
		return nil, nil, 0, errors.New("mp4: encrypted audio")
	}
	mp4a := stsd.Mp4a
	if mp4a == nil {
		return nil, nil, 0, fmt.Errorf("mp4: unsupported audio sample entry %s", stsd.Children[0].Type())
	}
	if mp4a.Sinf != nil {
		return nil, nil, 0, errors.New("mp4: encrypted audio")
	}

	esds := mp4a.Esds
	if esds == nil || esds.DecConfigDescriptor == nil || esds.DecConfigDescriptor.DecSpecificInfo == nil {
		return nil, nil, 0, errors.New("mp4: missing AudioSpecificConfig")
	}
	rawASC := esds.DecConfigDescriptor.DecSpecificInfo.DecConfig
	if len(rawASC) == 0 {
		return nil, nil, 0, errors.New("mp4: missing AudioSpecificConfig")
	}
	// Copy so the result does not alias the parsed box.
	asc = append([]byte(nil), rawASC...)

	// The error result is discarded: mp4LeadingTrimRelaxed falls back to a
	// zero trim for edit lists it does not handle and reports no errors.
	leading, _ = mp4LeadingTrimRelaxed(audio)

	if samples, err = buildMP4AACSamples(audio, size); err != nil {
		return nil, nil, 0, err
	}
	if len(samples) == 0 {
		return nil, nil, 0, errors.New("mp4: no audio samples")
	}
	return asc, samples, leading, nil
}
// mp4LeadingTrimRelaxed returns the media time of the track's edit list when
// that list has the one simple shape this code understands: exactly one elst
// box holding exactly one entry with a media rate of 1.0 and a non-negative
// media time. Every other shape, including a missing edit list or a nil
// track, yields 0. The error result is always nil.
//
// A media time too large for int is clamped to the largest int value instead
// of being truncated, so callers see "trim everything" rather than an
// arbitrary wrapped number on platforms where int is 32 bits wide.
//
// NOTE(review): the caller multiplies this value by the channel count and
// uses it as a number of PCM samples, which presumes the track's media
// timescale equals the audio sample rate. Confirm against real files.
func mp4LeadingTrimRelaxed(trak *mp4.TrakBox) (int, error) {
	const maxInt = int(^uint(0) >> 1)

	if trak == nil || trak.Edts == nil || len(trak.Edts.Elst) != 1 {
		return 0, nil
	}
	if len(trak.Edts.Elst[0].Entries) != 1 {
		return 0, nil
	}
	entry := trak.Edts.Elst[0].Entries[0]
	if entry.MediaRateInteger != 1 || entry.MediaRateFraction != 0 {
		return 0, nil
	}
	if entry.MediaTime < 0 {
		return 0, nil
	}
	if entry.MediaTime > int64(maxInt) {
		return maxInt, nil
	}
	return int(entry.MediaTime), nil
}
// buildMP4AACSamples resolves the byte offset and size of every sample of
// trak by walking its chunk offsets together with the sample-to-chunk (stsc)
// and sample-size (stsz) tables.
//
// Samples inside a chunk are laid out back to back starting at the chunk
// offset. size is the total input length; any sample that would start before
// byte 0, overflow, or end past size is rejected. An error is also returned
// when a required table is missing or empty, when an stsc entry declares zero
// samples per chunk, or when the chunks do not account for exactly the number
// of samples the track declares.
func buildMP4AACSamples(trak *mp4.TrakBox, size int64) ([]mp4AACSample, error) {
	if trak.Mdia == nil || trak.Mdia.Minf == nil || trak.Mdia.Minf.Stbl == nil {
		return nil, errors.New("mp4: incomplete sample table")
	}
	stbl := trak.Mdia.Minf.Stbl
	switch {
	case stbl.Stsc == nil || stbl.Stsz == nil:
		return nil, errors.New("mp4: incomplete sample table")
	case stbl.Stco == nil && stbl.Co64 == nil:
		return nil, errors.New("mp4: missing chunk offsets")
	case len(stbl.Stsc.Entries) == 0:
		return nil, errors.New("mp4: empty chunk map")
	}

	want := int(trak.GetNrSamples())
	if want <= 0 {
		return nil, errors.New("mp4: empty sample table")
	}
	sizes, err := mp4AACSampleSizes(stbl.Stsz, want)
	if err != nil {
		return nil, err
	}
	offsets, err := mp4AACChunkOffsets(stbl)
	if err != nil {
		return nil, err
	}

	runs := stbl.Stsc.Entries
	run := 0 // index of the stsc entry that governs the current chunk
	located := make([]mp4AACSample, 0, want)
	for c, pos := range offsets {
		if len(located) >= want {
			break
		}
		// Chunk numbers in stsc are 1-based; advance to the last entry whose
		// FirstChunk does not exceed the current chunk number.
		chunkNr := uint32(c + 1)
		for run+1 < len(runs) && runs[run+1].FirstChunk <= chunkNr {
			run++
		}
		if runs[run].SamplesPerChunk == 0 {
			return nil, errors.New("mp4: zero samples per chunk")
		}
		perChunk := int(runs[run].SamplesPerChunk)
		for k := 0; k < perChunk && len(located) < want; k++ {
			idx := len(located)
			sz := sizes[idx]
			next := pos + int64(sz)
			// next < pos catches int64 wrap-around of the addition above.
			if pos < 0 || next < pos || next > size {
				return nil, fmt.Errorf("mp4: invalid sample bounds at sample %d", idx+1)
			}
			located = append(located, mp4AACSample{offset: pos, size: sz})
			pos = next
		}
	}
	if len(located) != want {
		return nil, errors.New("mp4: sample table mismatch")
	}
	return located, nil
}
// mp4AACSampleSizes expands the stsz box into one size per sample.
//
// totalSamples must equal the sample count the box itself reports. When the
// box carries a non-zero uniform size, every entry of the result is that
// size; otherwise the per-sample table is copied and must contain exactly
// totalSamples entries. A nil box or any count mismatch is an error.
func mp4AACSampleSizes(stsz *mp4.StszBox, totalSamples int) ([]int, error) {
	if stsz == nil {
		return nil, errors.New("mp4: missing sample sizes")
	}
	if declared := int(stsz.GetNrSamples()); declared != totalSamples {
		return nil, errors.New("mp4: sample size count mismatch")
	}

	result := make([]int, totalSamples)
	if uniform := int(stsz.SampleUniformSize); uniform != 0 {
		for i := range result {
			result[i] = uniform
		}
		return result, nil
	}

	table := stsz.SampleSize
	if len(table) != totalSamples {
		return nil, errors.New("mp4: sample size table mismatch")
	}
	for i := range result {
		result[i] = int(table[i])
	}
	return result, nil
}
// mp4AACChunkOffsets returns the chunk offsets of stbl as int64 values.
//
// The stco box is used when present; otherwise the co64 box is used, and any
// co64 offset that does not fit in an int64 is rejected. A nil stbl, or one
// with neither box, is an error.
func mp4AACChunkOffsets(stbl *mp4.StblBox) ([]int64, error) {
	if stbl == nil {
		return nil, errors.New("mp4: incomplete sample table")
	}

	if stco := stbl.Stco; stco != nil {
		converted := make([]int64, 0, len(stco.ChunkOffset))
		for _, v := range stco.ChunkOffset {
			converted = append(converted, int64(v))
		}
		return converted, nil
	}

	co64 := stbl.Co64
	if co64 == nil {
		return nil, errors.New("mp4: missing chunk offsets")
	}
	// Largest value representable as int64.
	const maxOffset = uint64(1<<63 - 1)
	converted := make([]int64, 0, len(co64.ChunkOffset))
	for _, v := range co64.ChunkOffset {
		if v > maxOffset {
			return nil, errors.New("mp4: invalid chunk offset")
		}
		converted = append(converted, int64(v))
	}
	return converted, nil
}