package transcode

import (
	"errors"
	"fmt"
	"io"
	"math"

	"github.com/Eyevinn/mp4ff/mp4"
	"github.com/olivier-w/climp-aac-decoder/aacfile"
	aacdec "github.com/skrashevich/go-aac/pkg/decoder"
)

// mp4AACSample locates one AAC access unit inside the source file.
type mp4AACSample struct {
	offset int64 // absolute byte offset of the sample within the file
	size   int   // sample length in bytes
}

// isMP4SampleDeltaError reports whether err wraps an
// aacfile.UnsupportedFeatureError whose Feature is "MP4 sample delta" or
// "MP4 sample delta layout".
func isMP4SampleDeltaError(err error) bool {
	var uf *aacfile.UnsupportedFeatureError
	if !errors.As(err, &uf) {
		return false
	}
	return uf.Feature == "MP4 sample delta" || uf.Feature == "MP4 sample delta layout"
}

// decodeMP4AACRelaxed demuxes MP4/M4A with mp4ff (ignoring stts sample deltas) and
// decodes raw AAC frames with go-aac. Used when climp-aac-decoder rejects stts
// entries whose delta is not exactly 1024 (common in ffmpeg/phone muxers).
//
// r must provide random access to the first size bytes of the file. On success
// it returns the decoded PCM as float64, the sample rate reported by the
// decoder configuration, and the channel count of the returned PCM. Multi-channel
// audio is averaged down to mono, so the returned channel count is always 1.
func decodeMP4AACRelaxed(r io.ReaderAt, size int64) ([]float64, int, int, error) {
	asc, samples, leading, err := demuxMP4AAC(r, size)
	if err != nil {
		return nil, 0, 0, err
	}

	dec := aacdec.New()
	if err := dec.SetASC(asc); err != nil {
		return nil, 0, 0, fmt.Errorf("aac config: %w", err)
	}
	// NOTE(review): ChanConfig is used directly as the interleaved channel
	// count; confirm this holds for every configuration the decoder accepts.
	ch := dec.Config.ChanConfig
	if ch < 1 {
		return nil, 0, 0, fmt.Errorf("aac config: invalid channel count %d", ch)
	}
	sr := dec.Config.SampleRate
	if sr <= 0 {
		return nil, 0, 0, fmt.Errorf("aac config: invalid sample rate %d", sr)
	}

	// One scratch buffer sized for the largest sample is reused for every read.
	maxSize := 0
	for _, s := range samples {
		if s.size > maxSize {
			maxSize = s.size
		}
	}
	buf := make([]byte, maxSize)

	var pcm []float32
	for i, s := range samples {
		frame := buf[:s.size]
		// io.ReaderAt may return io.EOF together with a complete read that
		// ends exactly at the end of the input, so judge success by the byte
		// count rather than by err alone.
		n, err := r.ReadAt(frame, s.offset)
		if n < len(frame) {
			if err == nil {
				err = io.ErrUnexpectedEOF
			}
			return nil, 0, 0, fmt.Errorf("read mp4 aac sample %d: %w", i, err)
		}
		out, err := dec.DecodeFrame(frame)
		if err != nil {
			return nil, 0, 0, fmt.Errorf("decode mp4 aac sample %d: %w", i, err)
		}
		pcm = append(pcm, out...)
	}

	// Drop the leading frames requested by the edit list. The value comes from
	// the file, so clamp it to the decoded frame count before multiplying by
	// the channel count: an oversized value would otherwise overflow to a
	// negative slice index and panic.
	// NOTE(review): leading is the raw edit-list media time; this assumes the
	// track timescale equals the decoded sample rate. Confirm.
	frames := len(pcm) / ch
	if leading < 0 {
		leading = 0
	}
	if leading > frames {
		leading = frames
	}
	pcm = pcm[leading*ch:]

	samplesF64 := make([]float64, len(pcm))
	for i, v := range pcm {
		samplesF64[i] = float64(v)
	}
	if ch > 1 {
		samplesF64 = float32InterleavedToMono(samplesF64, ch)
		ch = 1
	}
	return samplesF64, sr, ch, nil
}

// float32InterleavedToMono averages interleaved multi-channel samples into a
// single channel. With channels <= 1 the input slice is returned unchanged.
// Trailing samples that do not form a complete frame are dropped.
func float32InterleavedToMono(samples []float64, channels int) []float64 {
	if channels <= 1 {
		return samples
	}
	nFrames := len(samples) / channels
	out := make([]float64, nFrames)
	for i := 0; i < nFrames; i++ {
		var sum float64
		for c := 0; c < channels; c++ {
			sum += samples[i*channels+c]
		}
		out[i] = sum / float64(channels)
	}
	return out
}

// demuxMP4AAC parses a non-fragmented MP4 that contains exactly one audio
// track with a single unencrypted mp4a sample entry. It returns a copy of the
// AudioSpecificConfig bytes, the location of every audio sample, and the
// leading trim taken from the track's edit list (0 when there is none or it
// has an unsupported shape).
func demuxMP4AAC(r io.ReaderAt, size int64) (asc []byte, samples []mp4AACSample, leading int, err error) {
	file, err := mp4.DecodeFile(io.NewSectionReader(r, 0, size), mp4.WithDecodeMode(mp4.DecModeLazyMdat))
	if err != nil {
		return nil, nil, 0, fmt.Errorf("mp4 decode: %w", err)
	}
	if file.IsFragmented() {
		return nil, nil, 0, errors.New("fragmented mp4 is not supported")
	}
	if file.Moov == nil {
		return nil, nil, 0, errors.New("mp4: missing moov")
	}

	var audioTracks []*mp4.TrakBox
	for _, trak := range file.Moov.Traks {
		if trak != nil && trak.Mdia != nil && trak.Mdia.Hdlr != nil && trak.Mdia.Hdlr.HandlerType == "soun" {
			audioTracks = append(audioTracks, trak)
		}
	}
	if len(audioTracks) != 1 {
		return nil, nil, 0, fmt.Errorf("mp4: expected one audio track, found %d", len(audioTracks))
	}

	trak := audioTracks[0]
	if trak.Mdia == nil || trak.Mdia.Minf == nil || trak.Mdia.Minf.Stbl == nil || trak.Mdia.Minf.Stbl.Stsd == nil {
		return nil, nil, 0, errors.New("mp4: incomplete audio track")
	}
	stsd := trak.Mdia.Minf.Stbl.Stsd
	if len(stsd.Children) != 1 {
		return nil, nil, 0, errors.New("mp4: multiple sample descriptions")
	}
	if stsd.Enca != nil {
		return nil, nil, 0, errors.New("mp4: encrypted audio")
	}
	sampleEntry := stsd.Mp4a
	if sampleEntry == nil {
		return nil, nil, 0, fmt.Errorf("mp4: unsupported audio sample entry %s", stsd.Children[0].Type())
	}
	if sampleEntry.Sinf != nil {
		return nil, nil, 0, errors.New("mp4: encrypted audio")
	}
	if sampleEntry.Esds == nil ||
		sampleEntry.Esds.DecConfigDescriptor == nil ||
		sampleEntry.Esds.DecConfigDescriptor.DecSpecificInfo == nil ||
		len(sampleEntry.Esds.DecConfigDescriptor.DecSpecificInfo.DecConfig) == 0 {
		return nil, nil, 0, errors.New("mp4: missing AudioSpecificConfig")
	}
	// Copy the config so the result does not alias the parsed box tree.
	asc = append([]byte(nil), sampleEntry.Esds.DecConfigDescriptor.DecSpecificInfo.DecConfig...)

	// The trim is best-effort: mp4LeadingTrimRelaxed reports every edit list
	// it cannot interpret as "no trim" and currently never returns a non-nil
	// error, so the error result is deliberately discarded.
	leading, _ = mp4LeadingTrimRelaxed(trak)

	samples, err = buildMP4AACSamples(trak, size)
	if err != nil {
		return nil, nil, 0, err
	}
	if len(samples) == 0 {
		return nil, nil, 0, errors.New("mp4: no audio samples")
	}
	return asc, samples, leading, nil
}

// mp4LeadingTrimRelaxed returns the media time of the track's edit list when
// the track has exactly one elst box holding exactly one entry with media rate
// 1.0 and a non-negative media time. Every other layout yields 0. The returned
// error is always nil.
func mp4LeadingTrimRelaxed(trak *mp4.TrakBox) (int, error) {
	if trak.Edts == nil || len(trak.Edts.Elst) == 0 {
		return 0, nil
	}
	if len(trak.Edts.Elst) != 1 || len(trak.Edts.Elst[0].Entries) != 1 {
		return 0, nil
	}
	entry := trak.Edts.Elst[0].Entries[0]
	if entry.MediaRateInteger != 1 || entry.MediaRateFraction != 0 {
		return 0, nil
	}
	if entry.MediaTime < 0 {
		return 0, nil
	}
	return int(entry.MediaTime), nil
}

// buildMP4AACSamples walks the track's chunk map (stsc), sample sizes (stsz)
// and chunk offsets (stco/co64) and returns the absolute offset and size of
// every sample in order. size is the total file length and is used to reject
// samples that would extend past the end of the file.
func buildMP4AACSamples(trak *mp4.TrakBox, size int64) ([]mp4AACSample, error) {
	if trak.Mdia == nil || trak.Mdia.Minf == nil || trak.Mdia.Minf.Stbl == nil {
		return nil, errors.New("mp4: incomplete sample table")
	}
	stbl := trak.Mdia.Minf.Stbl
	if stbl.Stsc == nil || stbl.Stsz == nil {
		return nil, errors.New("mp4: incomplete sample table")
	}
	if stbl.Stco == nil && stbl.Co64 == nil {
		return nil, errors.New("mp4: missing chunk offsets")
	}
	if len(stbl.Stsc.Entries) == 0 {
		return nil, errors.New("mp4: empty chunk map")
	}

	totalSamples := int(trak.GetNrSamples())
	if totalSamples <= 0 {
		return nil, errors.New("mp4: empty sample table")
	}

	sampleSizes, err := mp4AACSampleSizes(stbl.Stsz, totalSamples)
	if err != nil {
		return nil, err
	}
	chunkOffsets, err := mp4AACChunkOffsets(stbl)
	if err != nil {
		return nil, err
	}

	out := make([]mp4AACSample, 0, totalSamples)
	sampleIndex := 0
	entryIndex := 0
	entry := stbl.Stsc.Entries[entryIndex]
	for chunkIndex := 0; chunkIndex < len(chunkOffsets) && sampleIndex < totalSamples; chunkIndex++ {
		// Chunk numbers in stsc are 1-based. Advance to the last entry whose
		// FirstChunk is not beyond the current chunk.
		chunkNr := uint32(chunkIndex + 1)
		for entryIndex+1 < len(stbl.Stsc.Entries) && chunkNr >= stbl.Stsc.Entries[entryIndex+1].FirstChunk {
			entryIndex++
			entry = stbl.Stsc.Entries[entryIndex]
		}
		if entry.SamplesPerChunk == 0 {
			return nil, errors.New("mp4: zero samples per chunk")
		}

		// Samples are laid out back to back starting at the chunk offset.
		offset := chunkOffsets[chunkIndex]
		samplesPerChunk := int(entry.SamplesPerChunk)
		for i := 0; i < samplesPerChunk && sampleIndex < totalSamples; i++ {
			sampleSize := sampleSizes[sampleIndex]
			end := offset + int64(sampleSize)
			// end < offset catches int64 wrap-around of the addition above.
			if offset < 0 || end < offset || end > size {
				return nil, fmt.Errorf("mp4: invalid sample bounds at sample %d", sampleIndex+1)
			}
			out = append(out, mp4AACSample{offset: offset, size: sampleSize})
			offset = end
			sampleIndex++
		}
	}
	if sampleIndex != totalSamples {
		return nil, errors.New("mp4: sample table mismatch")
	}
	return out, nil
}

// mp4AACSampleSizes expands the stsz box into one size per sample. A non-zero
// SampleUniformSize applies to every sample; otherwise the per-sample table
// must contain exactly totalSamples entries.
func mp4AACSampleSizes(stsz *mp4.StszBox, totalSamples int) ([]int, error) {
	if stsz == nil {
		return nil, errors.New("mp4: missing sample sizes")
	}
	if int(stsz.GetNrSamples()) != totalSamples {
		return nil, errors.New("mp4: sample size count mismatch")
	}

	sizes := make([]int, totalSamples)
	if stsz.SampleUniformSize != 0 {
		sz := int(stsz.SampleUniformSize)
		for i := range sizes {
			sizes[i] = sz
		}
		return sizes, nil
	}
	if len(stsz.SampleSize) != totalSamples {
		return nil, errors.New("mp4: sample size table mismatch")
	}
	for i, sz := range stsz.SampleSize {
		sizes[i] = int(sz)
	}
	return sizes, nil
}

// mp4AACChunkOffsets returns the chunk offsets of the sample table as int64,
// reading stco when present and co64 otherwise. co64 offsets that do not fit
// in an int64 are rejected.
func mp4AACChunkOffsets(stbl *mp4.StblBox) ([]int64, error) {
	switch {
	case stbl == nil:
		return nil, errors.New("mp4: incomplete sample table")
	case stbl.Stco != nil:
		offsets := make([]int64, len(stbl.Stco.ChunkOffset))
		for i, off := range stbl.Stco.ChunkOffset {
			offsets[i] = int64(off)
		}
		return offsets, nil
	case stbl.Co64 != nil:
		offsets := make([]int64, len(stbl.Co64.ChunkOffset))
		for i, off := range stbl.Co64.ChunkOffset {
			if off > math.MaxInt64 {
				return nil, errors.New("mp4: invalid chunk offset")
			}
			offsets[i] = int64(off)
		}
		return offsets, nil
	default:
		return nil, errors.New("mp4: missing chunk offsets")
	}
}