scuffle_transmuxer/
lib.rs

//! A crate for transmuxing video streams.
#![cfg_attr(feature = "docs", doc = "\n\nSee the [changelog][changelog] for a full release history.")]
#![cfg_attr(feature = "docs", doc = "## Feature flags")]
#![cfg_attr(feature = "docs", doc = document_features::document_features!())]
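//!
//! ## Example
//!
//! A minimal sketch of the intended flow. The `transmux` helper below is purely
//! illustrative, and `body` is assumed to hold the FLV tag stream that follows the
//! 9-byte file header:
//!
//! ```
//! use bytes::Bytes;
//! use scuffle_transmuxer::{TransmuxError, TransmuxResult, Transmuxer};
//!
//! fn transmux(body: Vec<u8>) -> Result<Vec<Bytes>, TransmuxError> {
//!     let mut transmuxer = Transmuxer::new();
//!     transmuxer.demux(Bytes::from(body))?;
//!
//!     let mut output = Vec::new();
//!     while let Some(result) = transmuxer.mux()? {
//!         match result {
//!             // The first packet is the fMP4 init segment (ftyp + moov).
//!             TransmuxResult::InitSegment { data, .. } => output.push(data),
//!             // Every later packet is a single-frame media segment (moof + mdat).
//!             TransmuxResult::MediaSegment(_segment) => { /* write the segment out as well */ }
//!         }
//!     }
//!     Ok(output)
//! }
//! ```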
//! ## License
//!
//! This project is licensed under the MIT or Apache-2.0 license.
//! You can choose either of them if you use this work.
//!
//! `SPDX-License-Identifier: MIT OR Apache-2.0`
#![allow(clippy::single_match)]
// #![deny(missing_docs)]
#![deny(unsafe_code)]
#![deny(unreachable_pub)]

use std::collections::VecDeque;
use std::fmt::Debug;
use std::io;

use byteorder::{BigEndian, ReadBytesExt};
use bytes::{Buf, Bytes};
use scuffle_flv::audio::AudioData;
use scuffle_flv::audio::body::AudioTagBody;
use scuffle_flv::audio::body::legacy::LegacyAudioTagBody;
use scuffle_flv::audio::body::legacy::aac::AacAudioData;
use scuffle_flv::audio::header::AudioTagHeader;
use scuffle_flv::audio::header::legacy::{LegacyAudioTagHeader, SoundType};
use scuffle_flv::script::{OnMetaData, ScriptData};
use scuffle_flv::tag::{FlvTag, FlvTagData};
use scuffle_flv::video::VideoData;
use scuffle_flv::video::body::VideoTagBody;
use scuffle_flv::video::body::enhanced::{ExVideoTagBody, VideoPacket, VideoPacketCodedFrames, VideoPacketSequenceStart};
use scuffle_flv::video::body::legacy::LegacyVideoTagBody;
use scuffle_flv::video::header::enhanced::VideoFourCc;
use scuffle_flv::video::header::legacy::{LegacyVideoTagHeader, LegacyVideoTagHeaderAvcPacket};
use scuffle_flv::video::header::{VideoFrameType, VideoTagHeader, VideoTagHeaderData};
use scuffle_h264::Sps;
use scuffle_mp4::BoxType;
use scuffle_mp4::codec::{AudioCodec, VideoCodec};
use scuffle_mp4::types::ftyp::{FourCC, Ftyp};
use scuffle_mp4::types::hdlr::{HandlerType, Hdlr};
use scuffle_mp4::types::mdat::Mdat;
use scuffle_mp4::types::mdhd::Mdhd;
use scuffle_mp4::types::mdia::Mdia;
use scuffle_mp4::types::mfhd::Mfhd;
use scuffle_mp4::types::minf::Minf;
use scuffle_mp4::types::moof::Moof;
use scuffle_mp4::types::moov::Moov;
use scuffle_mp4::types::mvex::Mvex;
use scuffle_mp4::types::mvhd::Mvhd;
use scuffle_mp4::types::smhd::Smhd;
use scuffle_mp4::types::stbl::Stbl;
use scuffle_mp4::types::stco::Stco;
use scuffle_mp4::types::stsc::Stsc;
use scuffle_mp4::types::stsd::Stsd;
use scuffle_mp4::types::stsz::Stsz;
use scuffle_mp4::types::stts::Stts;
use scuffle_mp4::types::tfdt::Tfdt;
use scuffle_mp4::types::tfhd::Tfhd;
use scuffle_mp4::types::tkhd::Tkhd;
use scuffle_mp4::types::traf::Traf;
use scuffle_mp4::types::trak::Trak;
use scuffle_mp4::types::trex::Trex;
use scuffle_mp4::types::trun::Trun;
use scuffle_mp4::types::vmhd::Vmhd;

mod codecs;
mod define;
mod errors;

pub use define::*;
pub use errors::TransmuxError;

struct Tags<'a> {
    video_sequence_header: Option<VideoSequenceHeader>,
    audio_sequence_header: Option<AudioSequenceHeader>,
    scriptdata_tag: Option<OnMetaData<'a>>,
}

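/// An FLV to fragmented-MP4 (fMP4) transmuxer.
///
/// FLV tags are queued with [`Transmuxer::demux`] or [`Transmuxer::add_tag`] and
/// turned into an init segment followed by per-frame media segments by calling
/// [`Transmuxer::mux`] repeatedly.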
#[derive(Debug, Clone)]
pub struct Transmuxer<'a> {
    // These durations are measured in timescale units
    /// audio timescale = sample rate
    audio_duration: u64,
    /// video timescale = fps * 1000
    video_duration: u64,
    sequence_number: u32,
    last_video_timestamp: u32,
    settings: Option<(VideoSettings, AudioSettings)>,
    tags: VecDeque<FlvTag<'a>>,
}

impl Default for Transmuxer<'_> {
    fn default() -> Self {
        Self::new()
    }
}

impl<'a> Transmuxer<'a> {
    pub fn new() -> Self {
        Self {
            sequence_number: 1,
            tags: VecDeque::new(),
            audio_duration: 0,
            video_duration: 0,
            last_video_timestamp: 0,
            settings: None,
        }
    }

    /// Feed raw FLV data to the transmuxer.
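    ///
    /// The buffer is expected to be the tag stream that follows the 9-byte FLV file
    /// header, i.e. a repeating `[PreviousTagSize: u32][FlvTag]` sequence; every
    /// complete tag that parses is queued for muxing.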
    pub fn demux(&mut self, data: Bytes) -> Result<(), TransmuxError> {
        let mut cursor = io::Cursor::new(data);
        while cursor.has_remaining() {
            cursor.read_u32::<BigEndian>()?; // previous tag size
            if !cursor.has_remaining() {
                break;
            }

            let tag = FlvTag::demux(&mut cursor)?;
            self.tags.push_back(tag);
        }

        Ok(())
    }

    /// Feed a single FLV tag to the transmuxer.
    pub fn add_tag(&mut self, tag: FlvTag<'a>) {
        self.tags.push_back(tag);
    }

    /// Get the next transmuxed packet. This will return `None` if there is not
    /// enough data to create a packet.
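    ///
    /// Call this repeatedly: the first packet produced (once both sequence headers
    /// have been seen) is the init segment, and each later packet is a single-frame
    /// media segment.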
    pub fn mux(&mut self) -> Result<Option<TransmuxResult>, TransmuxError> {
        let mut writer = Vec::new();

        let Some((video_settings, _)) = &self.settings else {
            let Some((video_settings, audio_settings)) = self.init_sequence(&mut writer)? else {
                if self.tags.len() > 30 {
                    // We are clearly not getting any sequence headers, so we should just give up
                    return Err(TransmuxError::NoSequenceHeaders);
                }

                // We don't have enough tags to create an init segment yet
                return Ok(None);
            };

            self.settings = Some((video_settings.clone(), audio_settings.clone()));

            return Ok(Some(TransmuxResult::InitSegment {
                data: Bytes::from(writer),
                audio_settings,
                video_settings,
            }));
        };

        loop {
            let Some(tag) = self.tags.pop_front() else {
                return Ok(None);
            };

            let mdat_data;
            let total_duration;
            let trun_sample;
            let mut is_audio = false;
            let mut is_keyframe = false;

            let duration =
                if self.last_video_timestamp == 0 || tag.timestamp_ms == 0 || tag.timestamp_ms < self.last_video_timestamp {
                    // The first frame is always 1000 ticks, where the timescale is 1000 * fps.
                    1000
                } else {
                    // The delta is in milliseconds (i.e. 1/1000 of a second), so rounding errors
                    // happen: our precision is only 1/1000 of a second. A 30fps video should have
                    // a delta of 33.33ms (1000/30), but we can only represent it as 33ms or 34ms.
                    // To fix this we check whether the delta is within 1 of the expected delta,
                    // and if it is we use the expected delta instead.
                    // The reason we use a timescale of 1000 * fps is that the per-frame delta can
                    // then always be represented as an integer; with a timescale of 1000 we would
                    // run into the same rounding errors.
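                    // Worked example with assumed numbers: at 30fps the expected delta is
                    // 1000 / 30 = 33.33ms, so an observed delta of 33ms or 34ms is within 1.0
                    // of it and we emit the nominal 1000-tick duration rather than
                    // 33 * 30 = 990 or 34 * 30 = 1020 ticks.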
                    let delta = tag.timestamp_ms as f64 - self.last_video_timestamp as f64;
                    let expected_delta = 1000.0 / video_settings.framerate;
                    if (delta - expected_delta).abs() <= 1.0 {
                        1000
                    } else {
                        (delta * video_settings.framerate) as u32
                    }
                };

            match tag.data {
                FlvTagData::Audio(AudioData {
                    body: AudioTagBody::Legacy(LegacyAudioTagBody::Aac(AacAudioData::Raw(data))),
                    ..
                }) => {
                    let (sample, duration) = codecs::aac::trun_sample(&data)?;

                    trun_sample = sample;
                    mdat_data = data;
                    total_duration = duration;
                    is_audio = true;
                }
                FlvTagData::Video(VideoData {
                    header:
                        VideoTagHeader {
                            frame_type,
                            data:
                                VideoTagHeaderData::Legacy(LegacyVideoTagHeader::AvcPacket(
                                    LegacyVideoTagHeaderAvcPacket::Nalu { composition_time_offset },
                                )),
                        },
                    body: VideoTagBody::Legacy(LegacyVideoTagBody::Other { data }),
                    ..
                }) => {
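                    // The FLV composition time offset is in milliseconds; convert it to
                    // timescale ticks (ms * fps) and snap down to a whole frame (a multiple
                    // of 1000 ticks).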
                    let composition_time =
                        ((composition_time_offset as f64 * video_settings.framerate) / 1000.0).floor() * 1000.0;

                    let sample = codecs::avc::trun_sample(frame_type, composition_time as u32, duration, &data)?;

                    trun_sample = sample;
                    total_duration = duration;
                    mdat_data = data;

                    is_keyframe = frame_type == VideoFrameType::KeyFrame;
                }
                FlvTagData::Video(VideoData {
                    header: VideoTagHeader { frame_type, .. },
                    body:
                        VideoTagBody::Enhanced(ExVideoTagBody::NoMultitrack {
                            video_four_cc: VideoFourCc::Av1,
                            packet: VideoPacket::CodedFrames(VideoPacketCodedFrames::Other(data)),
                        }),
                    ..
                }) => {
                    let sample = codecs::av1::trun_sample(frame_type, duration, &data)?;

                    trun_sample = sample;
                    total_duration = duration;
                    mdat_data = data;

                    is_keyframe = frame_type == VideoFrameType::KeyFrame;
                }
                FlvTagData::Video(VideoData {
                    header: VideoTagHeader { frame_type, .. },
                    body:
                        VideoTagBody::Enhanced(ExVideoTagBody::NoMultitrack {
                            video_four_cc: VideoFourCc::Hevc,
                            packet,
                        }),
                    ..
                }) => {
                    let (composition_time, data) = match packet {
                        VideoPacket::CodedFrames(VideoPacketCodedFrames::Hevc {
                            composition_time_offset,
                            data,
                        }) => (Some(composition_time_offset), data),
                        VideoPacket::CodedFramesX { data } => (None, data),
                        _ => continue,
                    };

                    let composition_time =
                        ((composition_time.unwrap_or_default() as f64 * video_settings.framerate) / 1000.0).floor() * 1000.0;

                    let sample = codecs::hevc::trun_sample(frame_type, composition_time as i32, duration, &data)?;

                    trun_sample = sample;
                    total_duration = duration;
                    mdat_data = data;

                    is_keyframe = frame_type == VideoFrameType::KeyFrame;
                }
                _ => {
                    // We don't support anything else
                    continue;
                }
            }

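            // Track IDs mirror the traks written in the init segment: track 1 is video,
            // track 2 is audio.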
            let trafs = {
                let (main_duration, main_id) = if is_audio {
                    (self.audio_duration, 2)
                } else {
                    (self.video_duration, 1)
                };

                let mut traf = Traf::new(
                    Tfhd::new(main_id, None, None, None, None, None),
                    Some(Trun::new(vec![trun_sample], None)),
                    Some(Tfdt::new(main_duration)),
                );
                traf.optimize();

                vec![traf]
            };

            let mut moof = Moof::new(Mfhd::new(self.sequence_number), trafs);

            // We need to get the moof size so that we can set the data offset.
            let moof_size = moof.size();

            // We just created the moof with a single traf (video or audio), so we can
            // unwrap it and set its data offset.
            let traf = moof.traf.get_mut(0).expect("we just created the moof with a traf");

            // Again we know the trun exists because we just created it.
            let trun = traf.trun.as_mut().expect("we just created the traf with a trun");

            // We now define the offset: the sample data will start at the size of the
            // moof plus 8 bytes for the mdat header.
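            // For example (hypothetical size): a moof that serializes to 120 bytes puts
            // the first byte of sample data at offset 128, just past the 8-byte mdat
            // header that follows it.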
            trun.data_offset = Some(moof_size as i32 + 8);

            // We then write the moof to the writer.
            moof.mux(&mut writer)?;

            // We create an mdat box and write it to the writer.
            Mdat::new(vec![mdat_data]).mux(&mut writer)?;

            // Increase our sequence number and duration.
            self.sequence_number += 1;

            if is_audio {
                self.audio_duration += total_duration as u64;
                return Ok(Some(TransmuxResult::MediaSegment(MediaSegment {
                    data: Bytes::from(writer),
                    ty: MediaType::Audio,
                    keyframe: false,
                    timestamp: self.audio_duration - total_duration as u64,
                })));
            } else {
                self.video_duration += total_duration as u64;
                self.last_video_timestamp = tag.timestamp_ms;
                return Ok(Some(TransmuxResult::MediaSegment(MediaSegment {
                    data: Bytes::from(writer),
                    ty: MediaType::Video,
                    keyframe: is_keyframe,
                    timestamp: self.video_duration - total_duration as u64,
                })));
            }
        }
    }

    /// Internal function to find the tags we need to create the init segment.
    fn find_tags(&self) -> Tags<'a> {
        let tags = self.tags.iter();
        let mut video_sequence_header = None;
        let mut audio_sequence_header = None;
        let mut scriptdata_tag = None;

        for tag in tags {
            if video_sequence_header.is_some() && audio_sequence_header.is_some() && scriptdata_tag.is_some() {
                break;
            }

            match &tag.data {
                FlvTagData::Video(VideoData {
                    body: VideoTagBody::Legacy(LegacyVideoTagBody::AvcVideoPacketSeqHdr(data)),
                    ..
                }) => {
                    video_sequence_header = Some(VideoSequenceHeader::Avc(data.clone()));
                }
                FlvTagData::Video(VideoData {
                    body:
                        VideoTagBody::Enhanced(ExVideoTagBody::NoMultitrack {
                            video_four_cc: VideoFourCc::Av1,
                            packet: VideoPacket::SequenceStart(VideoPacketSequenceStart::Av1(config)),
                        }),
                    ..
                }) => {
                    video_sequence_header = Some(VideoSequenceHeader::Av1(config.clone()));
                }
                FlvTagData::Video(VideoData {
                    body:
                        VideoTagBody::Enhanced(ExVideoTagBody::NoMultitrack {
                            video_four_cc: VideoFourCc::Hevc,
                            packet: VideoPacket::SequenceStart(VideoPacketSequenceStart::Hevc(config)),
                        }),
                    ..
                }) => {
                    video_sequence_header = Some(VideoSequenceHeader::Hevc(config.clone()));
                }
                FlvTagData::Audio(AudioData {
                    body: AudioTagBody::Legacy(LegacyAudioTagBody::Aac(AacAudioData::SequenceHeader(data))),
                    header:
                        AudioTagHeader::Legacy(LegacyAudioTagHeader {
                            sound_size, sound_type, ..
                        }),
                    ..
                }) => {
                    audio_sequence_header = Some(AudioSequenceHeader {
                        data: AudioSequenceHeaderData::Aac(data.clone()),
                        sound_size: *sound_size,
                        sound_type: *sound_type,
                    });
                }
                FlvTagData::ScriptData(ScriptData::OnMetaData(metadata)) => {
                    scriptdata_tag = Some(*metadata.clone());
                }
                _ => {}
            }
        }

        Tags {
            video_sequence_header,
            audio_sequence_header,
            scriptdata_tag,
        }
    }

    /// Create the init segment.
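    ///
    /// Writes the `ftyp` + `moov` pair for a two-track (video + audio) fMP4 to `writer`
    /// and returns the settings derived from the sequence headers, or `None` if the
    /// sequence headers have not been seen yet.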
    fn init_sequence(
        &mut self,
        writer: &mut impl io::Write,
    ) -> Result<Option<(VideoSettings, AudioSettings)>, TransmuxError> {
        // We need to find the tags that carry the video sequence header
        // and the audio sequence header.
        let Tags {
            video_sequence_header,
            audio_sequence_header,
            scriptdata_tag,
        } = self.find_tags();

        let Some(video_sequence_header) = video_sequence_header else {
            return Ok(None);
        };
        let Some(audio_sequence_header) = audio_sequence_header else {
            return Ok(None);
        };

        let video_codec;
        let audio_codec;
        let video_width;
        let video_height;
        let audio_channels;
        let audio_sample_rate;
        let mut video_fps = 0.0;

        let mut estimated_video_bitrate = 0;
        let mut estimated_audio_bitrate = 0;

        if let Some(scriptdata_tag) = scriptdata_tag {
            video_fps = scriptdata_tag.framerate.unwrap_or(0.0);
            estimated_video_bitrate = scriptdata_tag.videodatarate.map(|v| (v * 1024.0) as u32).unwrap_or(0);
            estimated_audio_bitrate = scriptdata_tag.audiodatarate.map(|v| (v * 1024.0) as u32).unwrap_or(0);
        }

        let mut compatible_brands = vec![FourCC::Iso5, FourCC::Iso6];

        let video_stsd_entry = match video_sequence_header {
            VideoSequenceHeader::Avc(config) => {
                compatible_brands.push(FourCC::Avc1);
                video_codec = VideoCodec::Avc {
                    constraint_set: config.profile_compatibility,
                    level: config.level_indication,
                    profile: config.profile_indication,
                };

                let sps = Sps::parse_with_emulation_prevention(io::Cursor::new(&config.sps[0]))
                    .map_err(|_| TransmuxError::InvalidAVCDecoderConfigurationRecord)?;
                video_width = sps.width() as u32;
                video_height = sps.height() as u32;

                let frame_rate = sps.frame_rate();
                if let Some(frame_rate) = frame_rate {
                    video_fps = frame_rate;
                }

                codecs::avc::stsd_entry(config, &sps)?
            }
            VideoSequenceHeader::Av1(config) => {
                compatible_brands.push(FourCC::Av01);
                let (entry, seq_obu) = codecs::av1::stsd_entry(config)?;

                video_height = seq_obu.max_frame_height as u32;
                video_width = seq_obu.max_frame_width as u32;

                let op_point = &seq_obu.operating_points[0];

                video_codec = VideoCodec::Av1 {
                    profile: seq_obu.seq_profile,
                    level: op_point.seq_level_idx,
                    tier: op_point.seq_tier,
                    depth: seq_obu.color_config.bit_depth as u8,
                    monochrome: seq_obu.color_config.mono_chrome,
                    sub_sampling_x: seq_obu.color_config.subsampling_x,
                    sub_sampling_y: seq_obu.color_config.subsampling_y,
                    color_primaries: seq_obu.color_config.color_primaries,
                    transfer_characteristics: seq_obu.color_config.transfer_characteristics,
                    matrix_coefficients: seq_obu.color_config.matrix_coefficients,
                    full_range_flag: seq_obu.color_config.full_color_range,
                };

                entry
            }
            VideoSequenceHeader::Hevc(config) => {
                compatible_brands.push(FourCC::Hev1);
                video_codec = VideoCodec::Hevc {
                    constraint_indicator: config.general_constraint_indicator_flags,
                    level: config.general_level_idc,
                    profile: config.general_profile_idc,
                    profile_compatibility: config.general_profile_compatibility_flags,
                    tier: config.general_tier_flag,
                    general_profile_space: config.general_profile_space,
                };

                let (entry, sps) = codecs::hevc::stsd_entry(config)?;
                if let Some(info) = sps.vui_parameters.as_ref().and_then(|p| p.vui_timing_info.as_ref()) {
                    video_fps = info.time_scale.get() as f64 / info.num_units_in_tick.get() as f64;
                }

                video_width = sps.cropped_width() as u32;
                video_height = sps.cropped_height() as u32;

                entry
            }
        };

        let audio_stsd_entry = match audio_sequence_header.data {
            AudioSequenceHeaderData::Aac(data) => {
                compatible_brands.push(FourCC::Mp41);
                let (entry, config) =
                    codecs::aac::stsd_entry(audio_sequence_header.sound_size, audio_sequence_header.sound_type, data)?;

                audio_sample_rate = config.sampling_frequency;

                audio_codec = AudioCodec::Aac {
                    object_type: config.audio_object_type,
                };
                audio_channels = match audio_sequence_header.sound_type {
                    SoundType::Mono => 1,
                    SoundType::Stereo => 2,
                    _ => return Err(TransmuxError::InvalidAudioChannels),
                };

                entry
            }
        };

        if video_fps == 0.0 {
            return Err(TransmuxError::InvalidVideoFrameRate);
        }

        if video_width == 0 || video_height == 0 {
            return Err(TransmuxError::InvalidVideoDimensions);
        }

        if audio_sample_rate == 0 {
            return Err(TransmuxError::InvalidAudioSampleRate);
        }

        // The reason we multiply the FPS by 1000 is to avoid rounding errors.
        // Consider a video with a framerate of 30fps: each frame lasts 33.333333ms, but
        // with millisecond precision we could only represent that as 33ms. With a
        // timescale of 30 * 1000 = 30000 units per second, each frame is exactly 1000
        // units long instead of a rounded 33ms.
        let video_timescale = (1000.0 * video_fps) as u32;

        Ftyp::new(FourCC::Iso5, 512, compatible_brands).mux(writer)?;
        Moov::new(
            Mvhd::new(0, 0, 1000, 0, 1),
            vec![
                Trak::new(
                    Tkhd::new(0, 0, 1, 0, Some((video_width, video_height))),
                    None,
                    Mdia::new(
                        Mdhd::new(0, 0, video_timescale, 0),
                        Hdlr::new(HandlerType::Vide, "VideoHandler".to_string()),
                        Minf::new(
                            Stbl::new(
                                Stsd::new(vec![video_stsd_entry]),
                                Stts::new(vec![]),
                                Stsc::new(vec![]),
                                Stco::new(vec![]),
                                Some(Stsz::new(0, vec![])),
                            ),
                            Some(Vmhd::new()),
                            None,
                        ),
                    ),
                ),
                Trak::new(
                    Tkhd::new(0, 0, 2, 0, None),
                    None,
                    Mdia::new(
                        Mdhd::new(0, 0, audio_sample_rate, 0),
                        Hdlr::new(HandlerType::Soun, "SoundHandler".to_string()),
                        Minf::new(
                            Stbl::new(
                                Stsd::new(vec![audio_stsd_entry]),
                                Stts::new(vec![]),
                                Stsc::new(vec![]),
                                Stco::new(vec![]),
                                Some(Stsz::new(0, vec![])),
                            ),
                            None,
                            Some(Smhd::new()),
                        ),
                    ),
                ),
            ],
            Some(Mvex::new(vec![Trex::new(1), Trex::new(2)], None)),
        )
        .mux(writer)?;

        Ok(Some((
            VideoSettings {
                width: video_width,
                height: video_height,
                framerate: video_fps,
                codec: video_codec,
                bitrate: estimated_video_bitrate,
                timescale: video_timescale,
            },
            AudioSettings {
                codec: audio_codec,
                sample_rate: audio_sample_rate,
                channels: audio_channels,
                bitrate: estimated_audio_bitrate,
                timescale: audio_sample_rate,
            },
        )))
    }
}

/// Changelogs generated by [scuffle_changelog]
#[cfg(feature = "docs")]
#[scuffle_changelog::changelog]
pub mod changelog {}

#[cfg(test)]
mod tests;