diff options
| author | jacqueline <me@jacqueline.id.au> | 2023-08-01 10:14:23 +1000 |
|---|---|---|
| committer | jacqueline <me@jacqueline.id.au> | 2023-08-01 10:14:23 +1000 |
| commit | 23393312b7183fa61d4a6ba9e97af21f2337a8af (patch) | |
| tree | 16e8c632a30fffb470156cd33519aa0607e72d8a /lib/opus/doc/draft-ietf-codec-oggopus.xml | |
| parent | d41de537a0e31e1c5ad18b5024c781f6e4a07bbb (diff) | |
| download | tangara-fw-23393312b7183fa61d4a6ba9e97af21f2337a8af.tar.gz | |
checkin in opus reference decoder
Diffstat (limited to 'lib/opus/doc/draft-ietf-codec-oggopus.xml')
| -rw-r--r-- | lib/opus/doc/draft-ietf-codec-oggopus.xml | 1873 |
1 files changed, 1873 insertions, 0 deletions
diff --git a/lib/opus/doc/draft-ietf-codec-oggopus.xml b/lib/opus/doc/draft-ietf-codec-oggopus.xml new file mode 100644 index 00000000..128816e9 --- /dev/null +++ b/lib/opus/doc/draft-ietf-codec-oggopus.xml @@ -0,0 +1,1873 @@ +<?xml version="1.0" encoding="utf-8"?> +<!-- + Copyright (c) 2012-2016 Xiph.Org Foundation and contributors + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + Special permission is granted to remove the above copyright notice, list of + conditions, and disclaimer when submitting this document, with or without + modification, to the IETF. +--> +<!DOCTYPE rfc SYSTEM 'rfc2629.dtd' [ +<!ENTITY rfc2119 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.2119.xml'> +<!ENTITY rfc3533 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.3533.xml'> +<!ENTITY rfc3629 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.3629.xml'> +<!ENTITY rfc4732 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.4732.xml'> +<!ENTITY rfc5226 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.5226.xml'> +<!ENTITY rfc5334 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.5334.xml'> +<!ENTITY rfc6381 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.6381.xml'> +<!ENTITY rfc6716 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.6716.xml'> +<!ENTITY rfc6982 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.6982.xml'> +<!ENTITY rfc7587 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.7587.xml'> +]> +<?rfc toc="yes" symrefs="yes" ?> + +<rfc ipr="trust200902" category="std" docName="draft-ietf-codec-oggopus-14" + updates="5334"> + +<front> +<title abbrev="Ogg Opus">Ogg Encapsulation for the Opus Audio Codec</title> +<author initials="T.B." surname="Terriberry" fullname="Timothy B. Terriberry"> +<organization>Mozilla Corporation</organization> +<address> +<postal> +<street>650 Castro Street</street> +<city>Mountain View</city> +<region>CA</region> +<code>94041</code> +<country>USA</country> +</postal> +<phone>+1 650 903-0800</phone> +<email>tterribe@xiph.org</email> +</address> +</author> + +<author initials="R." surname="Lee" fullname="Ron Lee"> +<organization>Voicetronix</organization> +<address> +<postal> +<street>246 Pulteney Street, Level 1</street> +<city>Adelaide</city> +<region>SA</region> +<code>5000</code> +<country>Australia</country> +</postal> +<phone>+61 8 8232 9112</phone> +<email>ron@debian.org</email> +</address> +</author> + +<author initials="R." surname="Giles" fullname="Ralph Giles"> +<organization>Mozilla Corporation</organization> +<address> +<postal> +<street>163 West Hastings Street</street> +<city>Vancouver</city> +<region>BC</region> +<code>V6B 1H5</code> +<country>Canada</country> +</postal> +<phone>+1 778 785 1540</phone> +<email>giles@xiph.org</email> +</address> +</author> + +<date day="22" month="February" year="2016"/> +<area>RAI</area> +<workgroup>codec</workgroup> + +<abstract> +<t> +This document defines the Ogg encapsulation for the Opus interactive speech and + audio codec. +This allows data encoded in the Opus format to be stored in an Ogg logical + bitstream. +</t> +</abstract> +</front> + +<middle> +<section anchor="intro" title="Introduction"> +<t> +The IETF Opus codec is a low-latency audio codec optimized for both voice and + general-purpose audio. +See <xref target="RFC6716"/> for technical details. +This document defines the encapsulation of Opus in a continuous, logical Ogg + bitstream <xref target="RFC3533"/>. +Ogg encapsulation provides Opus with a long-term storage format supporting + all of the essential features, including metadata, fast and accurate seeking, + corruption detection, recapture after errors, low overhead, and the ability to + multiplex Opus with other codecs (including video) with minimal buffering. +It also provides a live streamable format, capable of delivery over a reliable + stream-oriented transport, without requiring all the data, or even the total + length of the data, up-front, in a form that is identical to the on-disk + storage format. +</t> +<t> +Ogg bitstreams are made up of a series of 'pages', each of which contains data + from one or more 'packets'. +Pages are the fundamental unit of multiplexing in an Ogg stream. +Each page is associated with a particular logical stream and contains a capture + pattern and checksum, flags to mark the beginning and end of the logical + stream, and a 'granule position' that represents an absolute position in the + stream, to aid seeking. +A single page can contain up to 65,025 octets of packet data from up to 255 + different packets. +Packets can be split arbitrarily across pages, and continued from one page to + the next (allowing packets much larger than would fit on a single page). +Each page contains 'lacing values' that indicate how the data is partitioned + into packets, allowing a demultiplexer (demuxer) to recover the packet + boundaries without examining the encoded data. +A packet is said to 'complete' on a page when the page contains the final + lacing value corresponding to that packet. +</t> +<t> +This encapsulation defines the contents of the packet data, including + the necessary headers, the organization of those packets into a logical + stream, and the interpretation of the codec-specific granule position field. +It does not attempt to describe or specify the existing Ogg container format. +Readers unfamiliar with the basic concepts mentioned above are encouraged to + review the details in <xref target="RFC3533"/>. +</t> + +</section> + +<section anchor="terminology" title="Terminology"> +<t> +The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", + "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and "OPTIONAL" in this + document are to be interpreted as described in <xref target="RFC2119"/>. +</t> + +</section> + +<section anchor="packet_organization" title="Packet Organization"> +<t> +An Ogg Opus stream is organized as follows (see + <xref target="packet-org-example"/> for an example). +</t> + +<figure anchor="packet-org-example" + title="Example packet organization for a logical Ogg Opus stream" + align="center"> +<artwork align="center"><![CDATA[ + Page 0 Pages 1 ... n Pages (n+1) ... + +------------+ +---+ +---+ ... +---+ +-----------+ +---------+ +-- + | | | | | | | | | | | | | + |+----------+| |+-----------------+| |+-------------------+ +----- + |||ID Header|| || Comment Header || ||Audio Data Packet 1| | ... + |+----------+| |+-----------------+| |+-------------------+ +----- + | | | | | | | | | | | | | + +------------+ +---+ +---+ ... +---+ +-----------+ +---------+ +-- + ^ ^ ^ + | | | + | | Mandatory Page Break + | | + | ID header is contained on a single page + | + 'Beginning Of Stream' +]]></artwork> +</figure> + +<t> +There are two mandatory header packets. +The first packet in the logical Ogg bitstream MUST contain the identification + (ID) header, which uniquely identifies a stream as Opus audio. +The format of this header is defined in <xref target="id_header"/>. +It is placed alone (without any other packet data) on the first page of + the logical Ogg bitstream, and completes on that page. +This page has its 'beginning of stream' flag set. +</t> +<t> +The second packet in the logical Ogg bitstream MUST contain the comment header, + which contains user-supplied metadata. +The format of this header is defined in <xref target="comment_header"/>. +It MAY span multiple pages, beginning on the second page of the logical + stream. +However many pages it spans, the comment header packet MUST finish the page on + which it completes. +</t> +<t> +All subsequent pages are audio data pages, and the Ogg packets they contain are + audio data packets. +Each audio data packet contains one Opus packet for each of N different + streams, where N is typically one for mono or stereo, but MAY be greater than + one for multichannel audio. +The value N is specified in the ID header (see + <xref target="channel_mapping"/>), and is fixed over the entire length of the + logical Ogg bitstream. +</t> +<t> +The first (N - 1) Opus packets, if any, are packed one after another + into the Ogg packet, using the self-delimiting framing from Appendix B of + <xref target="RFC6716"/>. +The remaining Opus packet is packed at the end of the Ogg packet using the + regular, undelimited framing from Section 3 of <xref target="RFC6716"/>. +All of the Opus packets in a single Ogg packet MUST be constrained to have the + same duration. +An implementation of this specification SHOULD treat any Opus packet whose + duration is different from that of the first Opus packet in an Ogg packet as + if it were a malformed Opus packet with an invalid Table Of Contents (TOC) + sequence. +</t> +<t> +The TOC sequence at the beginning of each Opus packet indicates the coding + mode, audio bandwidth, channel count, duration (frame size), and number of + frames per packet, as described in Section 3.1 + of <xref target="RFC6716"/>. +The coding mode is one of SILK, Hybrid, or Constrained Energy Lapped Transform + (CELT). +The combination of coding mode, audio bandwidth, and frame size is referred to + as the configuration of an Opus packet. +</t> +<t> +Packets are placed into Ogg pages in order until the end of stream. +Audio data packets might span page boundaries. +The first audio data page could have the 'continued packet' flag set + (indicating the first audio data packet is continued from a previous page) if, + for example, it was a live stream joined mid-broadcast, with the headers + pasted on the front. +If a page has the 'continued packet' flag set and one of the following + conditions is also true: +<list style="symbols"> +<t>the previous page with packet data does not end in a continued packet (does + not end with a lacing value of 255) OR</t> +<t>the page sequence numbers are not consecutive,</t> +</list> + then a demuxer MUST NOT attempt to decode the data for the first packet on the + page unless the demuxer has some special knowledge that would allow it to + interpret this data despite the missing pieces. +An implementation MUST treat a zero-octet audio data packet as if it were a + malformed Opus packet as described in + Section 3.4 of <xref target="RFC6716"/>. +</t> +<t> +A logical stream ends with a page with the 'end of stream' flag set, but + implementations need to be prepared to deal with truncated streams that do not + have a page marked 'end of stream'. +There is no reason for the final packet on the last page to be a continued + packet, i.e., for the final lacing value to be 255. +However, demuxers might encounter such streams, possibly as the result of a + transfer that did not complete or of corruption. +If a packet continues onto a subsequent page (i.e., when the page ends with a + lacing value of 255) and one of the following conditions is also true: +<list style="symbols"> +<t>the next page with packet data does not have the 'continued packet' flag + set OR</t> +<t>there is no next page with packet data OR</t> +<t>the page sequence numbers are not consecutive,</t> +</list> + then a demuxer MUST NOT attempt to decode the data from that packet unless the + demuxer has some special knowledge that would allow it to interpret this data + despite the missing pieces. +There MUST NOT be any more pages in an Opus logical bitstream after a page + marked 'end of stream'. +</t> +</section> + +<section anchor="granpos" title="Granule Position"> +<t> +The granule position MUST be zero for the ID header page and the + page where the comment header completes. +That is, the first page in the logical stream, and the last header + page before the first audio data page both have a granule position of zero. +</t> +<t> +The granule position of an audio data page encodes the total number of PCM + samples in the stream up to and including the last fully-decodable sample from + the last packet completed on that page. +The granule position of the first audio data page will usually be larger than + zero, as described in <xref target="start_granpos_restrictions"/>. +</t> + +<t> +A page that is entirely spanned by a single packet (that completes on a + subsequent page) has no granule position, and the granule position field is + set to the special value '-1' in two's complement. +</t> + +<t> +The granule position of an audio data page is in units of PCM audio samples at + a fixed rate of 48 kHz (per channel; a stereo stream's granule position + does not increment at twice the speed of a mono stream). +It is possible to run an Opus decoder at other sampling rates, + but all Opus packets encode samples at a sampling rate that evenly divides + 48 kHz. +Therefore, the value in the granule position field always counts samples + assuming a 48 kHz decoding rate, and the rest of this specification makes + the same assumption. +</t> + +<t> +The duration of an Opus packet as defined in <xref target="RFC6716"/> can be + any multiple of 2.5 ms, up to a maximum of 120 ms. +This duration is encoded in the TOC sequence at the beginning of each packet. +The number of samples returned by a decoder corresponds to this duration + exactly, even for the first few packets. +For example, a 20 ms packet fed to a decoder running at 48 kHz will + always return 960 samples. +A demuxer can parse the TOC sequence at the beginning of each Ogg packet to + work backwards or forwards from a packet with a known granule position (i.e., + the last packet completed on some page) in order to assign granule positions + to every packet, or even every individual sample. +The one exception is the last page in the stream, as described below. +</t> + +<t> +All other pages with completed packets after the first MUST have a granule + position equal to the number of samples contained in packets that complete on + that page plus the granule position of the most recent page with completed + packets. +This guarantees that a demuxer can assign individual packets the same granule + position when working forwards as when working backwards. +For this to work, there cannot be any gaps. +</t> + +<section anchor="gap-repair" title="Repairing Gaps in Real-time Streams"> +<t> +In order to support capturing a real-time stream that has lost or not + transmitted packets, a multiplexer (muxer) SHOULD emit packets that explicitly + request the use of Packet Loss Concealment (PLC) in place of the missing + packets. +Implementations that fail to do so still MUST NOT increment the granule + position for a page by anything other than the number of samples contained in + packets that actually complete on that page. +</t> +<t> +Only gaps that are a multiple of 2.5 ms are repairable, as these are the + only durations that can be created by packet loss or discontinuous + transmission. +Muxers need not handle other gap sizes. +Creating the necessary packets involves synthesizing a TOC byte (defined in +Section 3.1 of <xref target="RFC6716"/>)—and whatever + additional internal framing is needed—to indicate the packet duration + for each stream. +The actual length of each missing Opus frame inside the packet is zero bytes, + as defined in Section 3.2.1 of <xref target="RFC6716"/>. +</t> + +<t> +Zero-byte frames MAY be packed into packets using any of codes 0, 1, + 2, or 3. +When successive frames have the same configuration, the higher code packings + reduce overhead. +Likewise, if the TOC configuration matches, the muxer MAY further combine the + empty frames with previous or subsequent non-zero-length frames (using + code 2 or VBR code 3). +</t> + +<t> +<xref target="RFC6716"/> does not impose any requirements on the PLC, but this + section outlines choices that are expected to have a positive influence on + most PLC implementations, including the reference implementation. +Synthesized TOC sequences SHOULD maintain the same mode, audio bandwidth, + channel count, and frame size as the previous packet (if any). +This is the simplest and usually the most well-tested case for the PLC to + handle and it covers all losses that do not include a configuration switch, + as defined in Section 4.5 of <xref target="RFC6716"/>. +</t> + +<t> +When a previous packet is available, keeping the audio bandwidth and channel + count the same allows the PLC to provide maximum continuity in the concealment + data it generates. +However, if the size of the gap is not a multiple of the most recent frame + size, then the frame size will have to change for at least some frames. +Such changes SHOULD be delayed as long as possible to simplify + things for PLC implementations. +</t> + +<t> +As an example, a 95 ms gap could be encoded as nineteen 5 ms frames + in two bytes with a single CBR code 3 packet. +If the previous frame size was 20 ms, using four 20 ms frames + followed by three 5 ms frames requires 4 bytes (plus an extra byte + of Ogg lacing overhead), but allows the PLC to use its well-tested steady + state behavior for as long as possible. +The total bitrate of the latter approach, including Ogg overhead, is about + 0.4 kbps, so the impact on file size is minimal. +</t> + +<t> +Changing modes is discouraged, since this causes some decoder implementations + to reset their PLC state. +However, SILK and Hybrid mode frames cannot fill gaps that are not a multiple + of 10 ms. +If switching to CELT mode is needed to match the gap size, a muxer SHOULD do + so at the end of the gap to allow the PLC to function for as long as possible. +</t> + +<t> +In the example above, if the previous frame was a 20 ms SILK mode frame, + the better solution is to synthesize a packet describing four 20 ms SILK + frames, followed by a packet with a single 10 ms SILK + frame, and finally a packet with a 5 ms CELT frame, to fill the 95 ms + gap. +This also requires four bytes to describe the synthesized packet data (two + bytes for a CBR code 3 and one byte each for two code 0 packets) but three + bytes of Ogg lacing overhead are needed to mark the packet boundaries. +At 0.6 kbps, this is still a minimal bitrate impact over a naive, low quality + solution. +</t> + +<t> +Since medium-band audio is an option only in the SILK mode, wideband frames + SHOULD be generated if switching from that configuration to CELT mode, to + ensure that any PLC implementation which does try to migrate state between + the modes will be able to preserve all of the available audio bandwidth. +</t> + +</section> + +<section anchor="preskip" title="Pre-skip"> +<t> +There is some amount of latency introduced during the decoding process, to + allow for overlap in the CELT mode, stereo mixing in the SILK mode, and + resampling. +The encoder might have introduced additional latency through its own resampling + and analysis (though the exact amount is not specified). +Therefore, the first few samples produced by the decoder do not correspond to + real input audio, but are instead composed of padding inserted by the encoder + to compensate for this latency. +These samples need to be stored and decoded, as Opus is an asymptotically + convergent predictive codec, meaning the decoded contents of each frame depend + on the recent history of decoder inputs. +However, a player will want to skip these samples after decoding them. +</t> + +<t> +A 'pre-skip' field in the ID header (see <xref target="id_header"/>) signals + the number of samples that SHOULD be skipped (decoded but discarded) at the + beginning of the stream, though some specific applications might have a reason + for looking at that data. +This amount need not be a multiple of 2.5 ms, MAY be smaller than a single + packet, or MAY span the contents of several packets. +These samples are not valid audio. +</t> + +<t> +For example, if the first Opus frame uses the CELT mode, it will always + produce 120 samples of windowed overlap-add data. +However, the overlap data is initially all zeros (since there is no prior + frame), meaning this cannot, in general, accurately represent the original + audio. +The SILK mode requires additional delay to account for its analysis and + resampling latency. +The encoder delays the original audio to avoid this problem. +</t> + +<t> +The pre-skip field MAY also be used to perform sample-accurate cropping of + already encoded streams. +In this case, a value of at least 3840 samples (80 ms) provides + sufficient history to the decoder that it will have converged + before the stream's output begins. +</t> + +</section> + +<section anchor="pcm_sample_position" title="PCM Sample Position"> +<t> +The PCM sample position is determined from the granule position using the + formula +</t> +<figure align="center"> +<artwork align="center"><![CDATA[ +'PCM sample position' = 'granule position' - 'pre-skip' . +]]></artwork> +</figure> + +<t> +For example, if the granule position of the first audio data page is 59,971, + and the pre-skip is 11,971, then the PCM sample position of the last decoded + sample from that page is 48,000. +</t> +<t> +This can be converted into a playback time using the formula +</t> +<figure align="center"> +<artwork align="center"><![CDATA[ + 'PCM sample position' +'playback time' = --------------------- . + 48000.0 +]]></artwork> +</figure> + +<t> +The initial PCM sample position before any samples are played is normally '0'. +In this case, the PCM sample position of the first audio sample to be played + starts at '1', because it marks the time on the clock + <spanx style="emph">after</spanx> that sample has been played, and a stream + that is exactly one second long has a final PCM sample position of '48000', + as in the example here. +</t> + +<t> +Vorbis streams use a granule position smaller than the number of audio samples + contained in the first audio data page to indicate that some of those samples + are trimmed from the output (see <xref target="vorbis-trim"/>). +However, to do so, Vorbis requires that the first audio data page contains + exactly two packets, in order to allow the decoder to perform PCM position + adjustments before needing to return any PCM data. +Opus uses the pre-skip mechanism for this purpose instead, since the encoder + might introduce more than a single packet's worth of latency, and since very + large packets in streams with a very large number of channels might not fit + on a single page. +</t> +</section> + +<section anchor="end_trimming" title="End Trimming"> +<t> +The page with the 'end of stream' flag set MAY have a granule position that + indicates the page contains less audio data than would normally be returned by + decoding up through the final packet. +This is used to end the stream somewhere other than an even frame boundary. +The granule position of the most recent audio data page with completed packets + is used to make this determination, or '0' is used if there were no previous + audio data pages with a completed packet. +The difference between these granule positions indicates how many samples to + keep after decoding the packets that completed on the final page. +The remaining samples are discarded. +The number of discarded samples SHOULD be no larger than the number decoded + from the last packet. +</t> +</section> + +<section anchor="start_granpos_restrictions" + title="Restrictions on the Initial Granule Position"> +<t> +The granule position of the first audio data page with a completed packet MAY + be larger than the number of samples contained in packets that complete on + that page, however it MUST NOT be smaller, unless that page has the 'end of + stream' flag set. +Allowing a granule position larger than the number of samples allows the + beginning of a stream to be cropped or a live stream to be joined without + rewriting the granule position of all the remaining pages. +This means that the PCM sample position just before the first sample to be + played MAY be larger than '0'. +Synchronization when multiplexing with other logical streams still uses the PCM + sample position relative to '0' to compute sample times. +This does not affect the behavior of pre-skip: exactly 'pre-skip' samples + SHOULD be skipped from the beginning of the decoded output, even if the + initial PCM sample position is greater than zero. +</t> + +<t> +On the other hand, a granule position that is smaller than the number of + decoded samples prevents a demuxer from working backwards to assign each + packet or each individual sample a valid granule position, since granule + positions are non-negative. +An implementation MUST treat any stream as invalid if the granule position + is smaller than the number of samples contained in packets that complete on + the first audio data page with a completed packet, unless that page has the + 'end of stream' flag set. +It MAY defer this action until it decodes the last packet completed on that + page. +</t> + +<t> +If that page has the 'end of stream' flag set, a demuxer MUST treat any stream + as invalid if its granule position is smaller than the 'pre-skip' amount. +This would indicate that there are more samples to be skipped from the initial + decoded output than exist in the stream. +If the granule position is smaller than the number of decoded samples produced + by the packets that complete on that page, then a demuxer MUST use an initial + granule position of '0', and can work forwards from '0' to timestamp + individual packets. +If the granule position is larger than the number of decoded samples available, + then the demuxer MUST still work backwards as described above, even if the + 'end of stream' flag is set, to determine the initial granule position, and + thus the initial PCM sample position. +Both of these will be greater than '0' in this case. +</t> +</section> + +<section anchor="seeking_and_preroll" title="Seeking and Pre-roll"> +<t> +Seeking in Ogg files is best performed using a bisection search for a page + whose granule position corresponds to a PCM position at or before the seek + target. +With appropriately weighted bisection, accurate seeking can be performed in + just one or two bisections on average, even in multi-gigabyte files. +See <xref target="seeking"/> for an example of general implementation guidance. +</t> + +<t> +When seeking within an Ogg Opus stream, an implementation SHOULD start decoding + (and discarding the output) at least 3840 samples (80 ms) prior to + the seek target in order to ensure that the output audio is correct by the + time it reaches the seek target. +This 'pre-roll' is separate from, and unrelated to, the 'pre-skip' used at the + beginning of the stream. +If the point 80 ms prior to the seek target comes before the initial PCM + sample position, an implementation SHOULD start decoding from the beginning of + the stream, applying pre-skip as normal, regardless of whether the pre-skip is + larger or smaller than 80 ms, and then continue to discard samples + to reach the seek target (if any). +</t> +</section> + +</section> + +<section anchor="headers" title="Header Packets"> +<t> +An Ogg Opus logical stream contains exactly two mandatory header packets: + an identification header and a comment header. +</t> + +<section anchor="id_header" title="Identification Header"> + +<figure anchor="id_header_packet" title="ID Header Packet" align="center"> +<artwork align="center"><![CDATA[ + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| 'O' | 'p' | 'u' | 's' | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| 'H' | 'e' | 'a' | 'd' | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| Version = 1 | Channel Count | Pre-skip | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| Input Sample Rate (Hz) | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| Output Gain (Q7.8 in dB) | Mapping Family| | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ : +| | +: Optional Channel Mapping Table... : +| | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +]]></artwork> +</figure> + +<t> +The fields in the identification (ID) header have the following meaning: +<list style="numbers"> +<t>Magic Signature: +<vspace blankLines="1"/> +This is an 8-octet (64-bit) field that allows codec identification and is + human-readable. +It contains, in order, the magic numbers: +<list style="empty"> +<t>0x4F 'O'</t> +<t>0x70 'p'</t> +<t>0x75 'u'</t> +<t>0x73 's'</t> +<t>0x48 'H'</t> +<t>0x65 'e'</t> +<t>0x61 'a'</t> +<t>0x64 'd'</t> +</list> +Starting with "Op" helps distinguish it from audio data packets, as this is an + invalid TOC sequence. +<vspace blankLines="1"/> +</t> +<t>Version (8 bits, unsigned): +<vspace blankLines="1"/> +The version number MUST always be '1' for this version of the encapsulation + specification. +Implementations SHOULD treat streams where the upper four bits of the version + number match that of a recognized specification as backwards-compatible with + that specification. +That is, the version number can be split into "major" and "minor" version + sub-fields, with changes to the "minor" sub-field (in the lower four bits) + signaling compatible changes. +For example, an implementation of this specification SHOULD accept any stream + with a version number of '15' or less, and SHOULD assume any stream with a + version number '16' or greater is incompatible. +The initial version '1' was chosen to keep implementations from relying on this + octet as a null terminator for the "OpusHead" string. +<vspace blankLines="1"/> +</t> +<t>Output Channel Count 'C' (8 bits, unsigned): +<vspace blankLines="1"/> +This is the number of output channels. +This might be different than the number of encoded channels, which can change + on a packet-by-packet basis. +This value MUST NOT be zero. +The maximum allowable value depends on the channel mapping family, and might be + as large as 255. +See <xref target="channel_mapping"/> for details. +<vspace blankLines="1"/> +</t> +<t>Pre-skip (16 bits, unsigned, little + endian): +<vspace blankLines="1"/> +This is the number of samples (at 48 kHz) to discard from the decoder + output when starting playback, and also the number to subtract from a page's + granule position to calculate its PCM sample position. +When cropping the beginning of existing Ogg Opus streams, a pre-skip of at + least 3,840 samples (80 ms) is RECOMMENDED to ensure complete + convergence in the decoder. +<vspace blankLines="1"/> +</t> +<t>Input Sample Rate (32 bits, unsigned, little + endian): +<vspace blankLines="1"/> +This is the sample rate of the original input (before encoding), in Hz. +This field is <spanx style="emph">not</spanx> the sample rate to use for + playback of the encoded data. +<vspace blankLines="1"/> +Opus can switch between internal audio bandwidths of 4, 6, 8, 12, and + 20 kHz. +Each packet in the stream can have a different audio bandwidth. +Regardless of the audio bandwidth, the reference decoder supports decoding any + stream at a sample rate of 8, 12, 16, 24, or 48 kHz. +The original sample rate of the audio passed to the encoder is not preserved + by the lossy compression. +<vspace blankLines="1"/> +An Ogg Opus player SHOULD select the playback sample rate according to the + following procedure: +<list style="numbers"> +<t>If the hardware supports 48 kHz playback, decode at 48 kHz.</t> +<t>Otherwise, if the hardware's highest available sample rate is a supported + rate, decode at this sample rate.</t> +<t>Otherwise, if the hardware's highest available sample rate is less than + 48 kHz, decode at the next higher Opus supported rate above the highest + available hardware rate and resample.</t> +<t>Otherwise, decode at 48 kHz and resample.</t> +</list> +However, the 'Input Sample Rate' field allows the muxer to pass the sample + rate of the original input stream as metadata. +This is useful when the user requires the output sample rate to match the + input sample rate. +For example, when not playing the output, an implementation writing PCM format + samples to disk might choose to resample the audio back to the original input + sample rate to reduce surprise to the user, who might reasonably expect to get + back a file with the same sample rate. +<vspace blankLines="1"/> +A value of zero indicates 'unspecified'. +Muxers SHOULD write the actual input sample rate or zero, but implementations + which do something with this field SHOULD take care to behave sanely if given + crazy values (e.g., do not actually upsample the output to 10 MHz if + requested). +Implementations SHOULD support input sample rates between 8 kHz and + 192 kHz (inclusive). +Rates outside this range MAY be ignored by falling back to the default rate of + 48 kHz instead. +<vspace blankLines="1"/> +</t> +<t>Output Gain (16 bits, signed, little endian): +<vspace blankLines="1"/> +This is a gain to be applied when decoding. +It is 20*log10 of the factor by which to scale the decoder output to achieve + the desired playback volume, stored in a 16-bit, signed, two's complement + fixed-point value with 8 fractional bits (i.e., + Q7.8 <xref target="q-notation"/>). +<vspace blankLines="1"/> +To apply the gain, an implementation could use +<figure align="center"> +<artwork align="center"><![CDATA[ +sample *= pow(10, output_gain/(20.0*256)) , +]]></artwork> +</figure> + where output_gain is the raw 16-bit value from the header. +<vspace blankLines="1"/> +Players and media frameworks SHOULD apply it by default. +If a player chooses to apply any volume adjustment or gain modification, such + as the R128_TRACK_GAIN (see <xref target="comment_header"/>), the adjustment + MUST be applied in addition to this output gain in order to achieve playback + at the normalized volume. +<vspace blankLines="1"/> +A muxer SHOULD set this field to zero, and instead apply any gain prior to + encoding, when this is possible and does not conflict with the user's wishes. +A nonzero output gain indicates the gain was adjusted after encoding, or that + a user wished to adjust the gain for playback while preserving the ability + to recover the original signal amplitude. +<vspace blankLines="1"/> +Although the output gain has enormous range (+/- 128 dB, enough to amplify + inaudible sounds to the threshold of physical pain), most applications can + only reasonably use a small portion of this range around zero. +The large range serves in part to ensure that gain can always be losslessly + transferred between OpusHead and R128 gain tags (see below) without + saturating. +<vspace blankLines="1"/> +</t> +<t>Channel Mapping Family (8 bits, unsigned): +<vspace blankLines="1"/> +This octet indicates the order and semantic meaning of the output channels. +<vspace blankLines="1"/> +Each currently specified value of this octet indicates a mapping family, which + defines a set of allowed channel counts, and the ordered set of channel names + for each allowed channel count. +The details are described in <xref target="channel_mapping"/>. +</t> +<t>Channel Mapping Table: +This table defines the mapping from encoded streams to output channels. +Its contents are specified in <xref target="channel_mapping"/>. +</t> +</list> +</t> + +<t> +All fields in the ID headers are REQUIRED, except for the channel mapping + table, which MUST be omitted when the channel mapping family is 0, but + is REQUIRED otherwise. +Implementations SHOULD treat a stream as invalid if it contains an ID header + that does not have enough data for these fields, even if it contain a valid + Magic Signature. +Future versions of this specification, even backwards-compatible versions, + might include additional fields in the ID header. +If an ID header has a compatible major version, but a larger minor version, + an implementation MUST NOT treat it as invalid for containing additional data + not specified here, provided it still completes on the first page. +</t> + +<section anchor="channel_mapping" title="Channel Mapping"> +<t> +An Ogg Opus stream allows mapping one number of Opus streams (N) to a possibly + larger number of decoded channels (M + N) to yet another number of + output channels (C), which might be larger or smaller than the number of + decoded channels. +The order and meaning of these channels are defined by a channel mapping, + which consists of the 'channel mapping family' octet and, for channel mapping + families other than family 0, a channel mapping table, as illustrated in + <xref target="channel_mapping_table"/>. +</t> + +<figure anchor="channel_mapping_table" title="Channel Mapping Table" + align="center"> +<artwork align="center"><![CDATA[ + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+ + | Stream Count | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| Coupled Count | Channel Mapping... : ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +]]></artwork> +</figure> + +<t> +The fields in the channel mapping table have the following meaning: +<list style="numbers" counter="8"> +<t>Stream Count 'N' (8 bits, unsigned): +<vspace blankLines="1"/> +This is the total number of streams encoded in each Ogg packet. +This value is necessary to correctly parse the packed Opus packets inside an + Ogg packet, as described in <xref target="packet_organization"/>. +This value MUST NOT be zero, as without at least one Opus packet with a valid + TOC sequence, a demuxer cannot recover the duration of an Ogg packet. +<vspace blankLines="1"/> +For channel mapping family 0, this value defaults to 1, and is not coded. +<vspace blankLines="1"/> +</t> +<t>Coupled Stream Count 'M' (8 bits, unsigned): +This is the number of streams whose decoders are to be configured to produce + two channels (stereo). +This MUST be no larger than the total number of streams, N. +<vspace blankLines="1"/> +Each packet in an Opus stream has an internal channel count of 1 or 2, which + can change from packet to packet. +This is selected by the encoder depending on the bitrate and the audio being + encoded. +The original channel count of the audio passed to the encoder is not + necessarily preserved by the lossy compression. +<vspace blankLines="1"/> +Regardless of the internal channel count, any Opus stream can be decoded as + mono (a single channel) or stereo (two channels) by appropriate initialization + of the decoder. +The 'coupled stream count' field indicates that the decoders for the first M + Opus streams are to be initialized for stereo (two-channel) output, and the + remaining (N - M) decoders are to be initialized for mono (a single + channel) only. +The total number of decoded channels, (M + N), MUST be no larger than + 255, as there is no way to index more channels than that in the channel + mapping. +<vspace blankLines="1"/> +For channel mapping family 0, this value defaults to (C - 1) + (i.e., 0 for mono and 1 for stereo), and is not coded. +<vspace blankLines="1"/> +</t> +<t>Channel Mapping (8*C bits): +This contains one octet per output channel, indicating which decoded channel + is to be used for each one. +Let 'index' be the value of this octet for a particular output channel. +This value MUST either be smaller than (M + N), or be the special + value 255. +If 'index' is less than 2*M, the output MUST be taken from decoding stream + ('index'/2) as stereo and selecting the left channel if 'index' is even, and + the right channel if 'index' is odd. +If 'index' is 2*M or larger, but less than 255, the output MUST be taken from + decoding stream ('index' - M) as mono. +If 'index' is 255, the corresponding output channel MUST contain pure silence. +<vspace blankLines="1"/> +The number of output channels, C, is not constrained to match the number of + decoded channels (M + N). +A single index value MAY appear multiple times, i.e., the same decoded channel + might be mapped to multiple output channels. +Some decoded channels might not be assigned to any output channel, as well. +<vspace blankLines="1"/> +For channel mapping family 0, the first index defaults to 0, and if + C == 2, the second index defaults to 1. +Neither index is coded. +</t> +</list> +</t> + +<t> +After producing the output channels, the channel mapping family determines the + semantic meaning of each one. +There are three defined mapping families in this specification. +</t> + +<section anchor="channel_mapping_0" title="Channel Mapping Family 0"> +<t> +Allowed numbers of channels: 1 or 2. +RTP mapping. +This is the same channel interpretation as <xref target="RFC7587"/>. +</t> +<t> +<list style="symbols"> +<t>1 channel: monophonic (mono).</t> +<t>2 channels: stereo (left, right).</t> +</list> +Special mapping: This channel mapping value also + indicates that the contents consists of a single Opus stream that is stereo if + and only if C == 2, with stream index 0 mapped to output + channel 0 (mono, or left channel) and stream index 1 mapped to + output channel 1 (right channel) if stereo. +When the 'channel mapping family' octet has this value, the channel mapping + table MUST be omitted from the ID header packet. +</t> +</section> + +<section anchor="channel_mapping_1" title="Channel Mapping Family 1"> +<t> +Allowed numbers of channels: 1...8. +Vorbis channel order (see below). +</t> +<t> +Each channel is assigned to a speaker location in a conventional surround + arrangement. +Specific locations depend on the number of channels, and are given below + in order of the corresponding channel indices. +<list style="symbols"> + <t>1 channel: monophonic (mono).</t> + <t>2 channels: stereo (left, right).</t> + <t>3 channels: linear surround (left, center, right)</t> + <t>4 channels: quadraphonic (front left, front right, rear left, rear right).</t> + <t>5 channels: 5.0 surround (front left, front center, front right, rear left, rear right).</t> + <t>6 channels: 5.1 surround (front left, front center, front right, rear left, rear right, LFE).</t> + <t>7 channels: 6.1 surround (front left, front center, front right, side left, side right, rear center, LFE).</t> + <t>8 channels: 7.1 surround (front left, front center, front right, side left, side right, rear left, rear right, LFE)</t> +</list> +</t> +<t> +This set of surround options and speaker location orderings is the same + as those used by the Vorbis codec <xref target="vorbis-mapping"/>. +The ordering is different from the one used by the + WAVE <xref target="wave-multichannel"/> and + Free Lossless Audio Codec (FLAC) <xref target="flac"/> formats, + so correct ordering requires permutation of the output channels when decoding + to or encoding from those formats. +'LFE' here refers to a Low Frequency Effects channel, often mapped to a + subwoofer with no particular spatial position. +Implementations SHOULD identify 'side' or 'rear' speaker locations with + 'surround' and 'back' as appropriate when interfacing with audio formats + or systems which prefer that terminology. +</t> +</section> + +<section anchor="channel_mapping_255" + title="Channel Mapping Family 255"> +<t> +Allowed numbers of channels: 1...255. +No defined channel meaning. +</t> +<t> +Channels are unidentified. +General-purpose players SHOULD NOT attempt to play these streams. +Offline implementations MAY deinterleave the output into separate PCM files, + one per channel. +Implementations SHOULD NOT produce output for channels mapped to stream index + 255 (pure silence) unless they have no other way to indicate the index of + non-silent channels. +</t> +</section> + +<section anchor="channel_mapping_undefined" + title="Undefined Channel Mappings"> +<t> +The remaining channel mapping families (2...254) are reserved. +A demuxer implementation encountering a reserved channel mapping family value + SHOULD act as though the value is 255. +</t> +</section> + +<section anchor="downmix" title="Downmixing"> +<t> +An Ogg Opus player MUST support any valid channel mapping with a channel + mapping family of 0 or 1, even if the number of channels does not match the + physically connected audio hardware. +Players SHOULD perform channel mixing to increase or reduce the number of + channels as needed. +</t> + +<t> +Implementations MAY use the matrices in + Figures <xref target="downmix-matrix-3" format="counter"/> + through <xref target="downmix-matrix-8" format="counter"/> to implement + downmixing from multichannel files using + <xref target="channel_mapping_1">Channel Mapping Family 1</xref>, which are + known to give acceptable results for stereo. +Matrices for 3 and 4 channels are normalized so each coefficient row sums + to 1 to avoid clipping. +For 5 or more channels they are normalized to 2 as a compromise between + clipping and dynamic range reduction. +</t> +<t> +In these matrices the front left and front right channels are generally +passed through directly. +When a surround channel is split between both the left and right stereo + channels, coefficients are chosen so their squares sum to 1, which + helps preserve the perceived intensity. +Rear channels are mixed more diffusely or attenuated to maintain focus + on the front channels. +</t> + +<figure anchor="downmix-matrix-3" + title="Stereo downmix matrix for the linear surround channel mapping" + align="center"> +<artwork align="center"><![CDATA[ +L output = ( 0.585786 * left + 0.414214 * center ) +R output = ( 0.414214 * center + 0.585786 * right ) +]]></artwork> +<postamble> +Exact coefficient values are 1 and 1/sqrt(2), multiplied by + 1/(1 + 1/sqrt(2)) for normalization. +</postamble> +</figure> + +<figure anchor="downmix-matrix-4" + title="Stereo downmix matrix for the quadraphonic channel mapping" + align="center"> +<artwork align="center"><![CDATA[ +/ \ / \ / FL \ +| L output | | 0.422650 0.000000 0.366025 0.211325 | | FR | +| R output | = | 0.000000 0.422650 0.211325 0.366025 | | RL | +\ / \ / \ RR / +]]></artwork> +<postamble> +Exact coefficient values are 1, sqrt(3)/2 and 1/2, multiplied by + 1/(1 + sqrt(3)/2 + 1/2) for normalization. +</postamble> +</figure> + +<figure anchor="downmix-matrix-5" + title="Stereo downmix matrix for the 5.0 surround mapping" + align="center"> +<artwork align="center"><![CDATA[ + / FL \ +/ \ / \ | FC | +| L | | 0.650802 0.460186 0.000000 0.563611 0.325401 | | FR | +| R | = | 0.000000 0.460186 0.650802 0.325401 0.563611 | | RL | +\ / \ / | RR | + \ / +]]></artwork> +<postamble> +Exact coefficient values are 1, 1/sqrt(2), sqrt(3)/2 and 1/2, multiplied by + 2/(1 + 1/sqrt(2) + sqrt(3)/2 + 1/2) + for normalization. +</postamble> +</figure> + +<figure anchor="downmix-matrix-6" + title="Stereo downmix matrix for the 5.1 surround mapping" + align="center"> +<artwork align="center"><![CDATA[ + /FL \ +/ \ / \ |FC | +|L| | 0.529067 0.374107 0.000000 0.458186 0.264534 0.374107 | |FR | +|R| = | 0.000000 0.374107 0.529067 0.264534 0.458186 0.374107 | |RL | +\ / \ / |RR | + \LFE/ +]]></artwork> +<postamble> +Exact coefficient values are 1, 1/sqrt(2), sqrt(3)/2 and 1/2, multiplied by +2/(1 + 1/sqrt(2) + sqrt(3)/2 + 1/2 + 1/sqrt(2)) + for normalization. +</postamble> +</figure> + +<figure anchor="downmix-matrix-7" + title="Stereo downmix matrix for the 6.1 surround mapping" + align="center"> +<artwork align="center"><![CDATA[ + / \ + | 0.455310 0.321953 0.000000 0.394310 0.227655 0.278819 0.321953 | + | 0.000000 0.321953 0.455310 0.227655 0.394310 0.278819 0.321953 | + \ / +]]></artwork> +<postamble> +Exact coefficient values are 1, 1/sqrt(2), sqrt(3)/2, 1/2 and + sqrt(3)/2/sqrt(2), multiplied by + 2/(1 + 1/sqrt(2) + sqrt(3)/2 + 1/2 + + sqrt(3)/2/sqrt(2) + 1/sqrt(2)) for normalization. +The coefficients are in the same order as in <xref target="channel_mapping_1" />, + and the matrices above. +</postamble> +</figure> + +<figure anchor="downmix-matrix-8" + title="Stereo downmix matrix for the 7.1 surround mapping" + align="center"> +<artwork align="center"><![CDATA[ +/ \ +| .388631 .274804 .000000 .336565 .194316 .336565 .194316 .274804 | +| .000000 .274804 .388631 .194316 .336565 .194316 .336565 .274804 | +\ / +]]></artwork> +<postamble> +Exact coefficient values are 1, 1/sqrt(2), sqrt(3)/2 and 1/2, multiplied by + 2/(2 + 2/sqrt(2) + sqrt(3)) for normalization. +The coefficients are in the same order as in <xref target="channel_mapping_1" />, + and the matrices above. +</postamble> +</figure> + +</section> + +</section> <!-- end channel_mapping_table --> + +</section> <!-- end id_header --> + +<section anchor="comment_header" title="Comment Header"> + +<figure anchor="comment_header_packet" title="Comment Header Packet" + align="center"> +<artwork align="center"><![CDATA[ + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| 'O' | 'p' | 'u' | 's' | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| 'T' | 'a' | 'g' | 's' | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| Vendor String Length | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| | +: Vendor String... : +| | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| User Comment List Length | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| User Comment #0 String Length | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| | +: User Comment #0 String... : +| | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| User Comment #1 String Length | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +: : +]]></artwork> +</figure> + +<t> +The comment header consists of a 64-bit magic signature, followed by data in + the same format as the <xref target="vorbis-comment"/> header used in Ogg + Vorbis, except (like Ogg Theora and Speex) the final "framing bit" specified + in the Vorbis spec is not present. +<list style="numbers"> +<t>Magic Signature: +<vspace blankLines="1"/> +This is an 8-octet (64-bit) field that allows codec identification and is + human-readable. +It contains, in order, the magic numbers: +<list style="empty"> +<t>0x4F 'O'</t> +<t>0x70 'p'</t> +<t>0x75 'u'</t> +<t>0x73 's'</t> +<t>0x54 'T'</t> +<t>0x61 'a'</t> +<t>0x67 'g'</t> +<t>0x73 's'</t> +</list> +Starting with "Op" helps distinguish it from audio data packets, as this is an + invalid TOC sequence. +<vspace blankLines="1"/> +</t> +<t>Vendor String Length (32 bits, unsigned, little endian): +<vspace blankLines="1"/> +This field gives the length of the following vendor string, in octets. +It MUST NOT indicate that the vendor string is longer than the rest of the + packet. +<vspace blankLines="1"/> +</t> +<t>Vendor String (variable length, UTF-8 vector): +<vspace blankLines="1"/> +This is a simple human-readable tag for vendor information, encoded as a UTF-8 + string <xref target="RFC3629"/>. +No terminating null octet is necessary. +<vspace blankLines="1"/> +This tag is intended to identify the codec encoder and encapsulation + implementations, for tracing differences in technical behavior. +User-facing applications can use the 'ENCODER' user comment tag to identify + themselves. +<vspace blankLines="1"/> +</t> +<t>User Comment List Length (32 bits, unsigned, little endian): +<vspace blankLines="1"/> +This field indicates the number of user-supplied comments. +It MAY indicate there are zero user-supplied comments, in which case there are + no additional fields in the packet. +It MUST NOT indicate that there are so many comments that the comment string + lengths would require more data than is available in the rest of the packet. +<vspace blankLines="1"/> +</t> +<t>User Comment #i String Length (32 bits, unsigned, little endian): +<vspace blankLines="1"/> +This field gives the length of the following user comment string, in octets. +There is one for each user comment indicated by the 'user comment list length' + field. +It MUST NOT indicate that the string is longer than the rest of the packet. +<vspace blankLines="1"/> +</t> +<t>User Comment #i String (variable length, UTF-8 vector): +<vspace blankLines="1"/> +This field contains a single user comment encoded as a UTF-8 + string <xref target="RFC3629"/>. +There is one for each user comment indicated by the 'user comment list length' + field. +</t> +</list> +</t> + +<t> +The vendor string length and user comment list length are REQUIRED, and + implementations SHOULD treat a stream as invalid if it contains a comment + header that does not have enough data for these fields, or that does not + contain enough data for the corresponding vendor string or user comments they + describe. +Making this check before allocating the associated memory to contain the data + helps prevent a possible Denial-of-Service (DoS) attack from small comment + headers that claim to contain strings longer than the entire packet or more + user comments than than could possibly fit in the packet. +</t> + +<t> +Immediately following the user comment list, the comment header MAY + contain zero-padding or other binary data which is not specified here. +If the least-significant bit of the first byte of this data is 1, then editors + SHOULD preserve the contents of this data when updating the tags, but if this + bit is 0, all such data MAY be treated as padding, and truncated or discarded + as desired. +This allows informal experimentation with the format of this binary data until + it can be specified later. +</t> + +<t> +The comment header can be arbitrarily large and might be spread over a large + number of Ogg pages. +Implementations MUST avoid attempting to allocate excessive amounts of memory + when presented with a very large comment header. +To accomplish this, implementations MAY treat a stream as invalid if it has a + comment header larger than 125,829,120 octets (120 MB), and MAY + ignore individual comments that are not fully contained within the first + 61,440 octets of the comment header. +</t> + +<section anchor="comment_format" title="Tag Definitions"> +<t> +The user comment strings follow the NAME=value format described by + <xref target="vorbis-comment"/> with the same recommended tag names: + ARTIST, TITLE, DATE, ALBUM, and so on. +</t> +<t> +Two new comment tags are introduced here: +</t> + +<t>First, an optional gain for track normalization:</t> +<figure align="center"> +<artwork align="left"><![CDATA[ +R128_TRACK_GAIN=-573 +]]></artwork> +</figure> +<t> + representing the volume shift needed to normalize the track's volume + during isolated playback, in random shuffle, and so on. +The gain is a Q7.8 fixed point number in dB, as in the ID header's 'output + gain' field. +This tag is similar to the REPLAYGAIN_TRACK_GAIN tag in + Vorbis <xref target="replay-gain"/>, except that the normal volume + reference is the <xref target="EBU-R128"/> standard. +</t> +<t>Second, an optional gain for album normalization:</t> +<figure align="center"> +<artwork align="left"><![CDATA[ +R128_ALBUM_GAIN=111 +]]></artwork> +</figure> +<t> + representing the volume shift needed to normalize the overall volume when + played as part of a particular collection of tracks. +The gain is also a Q7.8 fixed point number in dB, as in the ID header's + 'output gain' field. +The values '-573' and '111' given here are just examples. +</t> +<t> +An Ogg Opus stream MUST NOT have more than one of each of these tags, and if + present their values MUST be an integer from -32768 to 32767, inclusive, + represented in ASCII as a base 10 number with no whitespace. +A leading '+' or '-' character is valid. +Leading zeros are also permitted, but the value MUST be represented by + no more than 6 characters. +Other non-digit characters MUST NOT be present. +</t> +<t> +If present, R128_TRACK_GAIN and R128_ALBUM_GAIN MUST correctly represent + the R128 normalization gain relative to the 'output gain' field specified + in the ID header. +If a player chooses to make use of the R128_TRACK_GAIN tag or the + R128_ALBUM_GAIN tag, it MUST apply those gains + <spanx style="emph">in addition</spanx> to the 'output gain' value. +If a tool modifies the ID header's 'output gain' field, it MUST also update or + remove the R128_TRACK_GAIN and R128_ALBUM_GAIN comment tags if present. +A muxer SHOULD place the gain it wants other tools to use by default into the + 'output gain' field, and not the comment tag. +</t> +<t> +To avoid confusion with multiple normalization schemes, an Opus comment header + SHOULD NOT contain any of the REPLAYGAIN_TRACK_GAIN, REPLAYGAIN_TRACK_PEAK, + REPLAYGAIN_ALBUM_GAIN, or REPLAYGAIN_ALBUM_PEAK tags, unless they are only + to be used in some context where there is guaranteed to be no such confusion. +<xref target="EBU-R128"/> normalization is preferred to the earlier + REPLAYGAIN schemes because of its clear definition and adoption by industry. +Peak normalizations are difficult to calculate reliably for lossy codecs + because of variation in excursion heights due to decoder differences. +In the authors' investigations they were not applied consistently or broadly + enough to merit inclusion here. +</t> +</section> <!-- end comment_format --> +</section> <!-- end comment_header --> + +</section> <!-- end headers --> + +<section anchor="packet_size_limits" title="Packet Size Limits"> +<t> +Technically, valid Opus packets can be arbitrarily large due to the padding + format, although the amount of non-padding data they can contain is bounded. +These packets might be spread over a similarly enormous number of Ogg pages. +When encoding, implementations SHOULD limit the use of padding in audio data + packets to no more than is necessary to make a variable bitrate (VBR) stream + constant bitrate (CBR), unless they have no reasonable way to determine what + is necessary. +Demuxers SHOULD treat audio data packets as invalid (treat them as if they were + malformed Opus packets with an invalid TOC sequence) if they are larger than + 61,440 octets per Opus stream, unless they have a specific reason for + allowing extra padding. +Such packets necessarily contain more padding than needed to make a stream CBR. +Demuxers MUST avoid attempting to allocate excessive amounts of memory when + presented with a very large packet. +Demuxers MAY treat audio data packets as invalid or partially process them if + they are larger than 61,440 octets in an Ogg Opus stream with channel + mapping families 0 or 1. +Demuxers MAY treat audio data packets as invalid or partially process them in + any Ogg Opus stream if the packet is larger than 61,440 octets and also + larger than 7,680 octets per Opus stream. +The presence of an extremely large packet in the stream could indicate a + memory exhaustion attack or stream corruption. +</t> +<t> +In an Ogg Opus stream, the largest possible valid packet that does not use + padding has a size of (61,298*N - 2) octets. +With 255 streams, this is 15,630,988 octets and can + span up to 61,298 Ogg pages, all but one of which will have a granule + position of -1. +This is of course a very extreme packet, consisting of 255 streams, each + containing 120 ms of audio encoded as 2.5 ms frames, each frame + using the maximum possible number of octets (1275) and stored in the least + efficient manner allowed (a VBR code 3 Opus packet). +Even in such a packet, most of the data will be zeros as 2.5 ms frames + cannot actually use all 1275 octets. +</t> +<t> +The largest packet consisting of entirely useful data is + (15,326*N - 2) octets. +This corresponds to 120 ms of audio encoded as 10 ms frames in either + SILK or Hybrid mode, but at a data rate of over 1 Mbps, which makes little + sense for the quality achieved. +</t> +<t> +A more reasonable limit is (7,664*N - 2) octets. +This corresponds to 120 ms of audio encoded as 20 ms stereo CELT mode + frames, with a total bitrate just under 511 kbps (not counting the Ogg + encapsulation overhead). +For channel mapping family 1, N=8 provides a reasonable upper bound, as it + allows for each of the 8 possible output channels to be decoded from a + separate stereo Opus stream. +This gives a size of 61,310 octets, which is rounded up to a multiple of + 1,024 octets to yield the audio data packet size of 61,440 octets + that any implementation is expected to be able to process successfully. +</t> +</section> + +<section anchor="encoder" title="Encoder Guidelines"> +<t> +When encoding Opus streams, Ogg muxers SHOULD take into account the + algorithmic delay of the Opus encoder. +</t> +<t> +In encoders derived from the reference + implementation <xref target="RFC6716"/>, the number of samples can be + queried with: +</t> +<figure align="center"> +<artwork align="center"><![CDATA[ + opus_encoder_ctl(encoder_state, OPUS_GET_LOOKAHEAD(&delay_samples)); +]]></artwork> +</figure> +<t> +To achieve good quality in the very first samples of a stream, implementations + MAY use linear predictive coding (LPC) extrapolation to generate at least 120 + extra samples at the beginning to avoid the Opus encoder having to encode a + discontinuous signal. +For more information on linear prediction, see + <xref target="linear-prediction"/>. +For an input file containing 'length' samples, the implementation SHOULD set + the pre-skip header value to (delay_samples + extra_samples), encode + at least (length + delay_samples + extra_samples) + samples, and set the granule position of the last page to + (length + delay_samples + extra_samples). +This ensures that the encoded file has the same duration as the original, with + no time offset. The best way to pad the end of the stream is to also use LPC + extrapolation, but zero-padding is also acceptable. +</t> + +<section anchor="lpc" title="LPC Extrapolation"> +<t> +The first step in LPC extrapolation is to compute linear prediction + coefficients. <xref target="lpc-sample"/> +When extending the end of the signal, order-N (typically with N ranging from 8 + to 40) LPC analysis is performed on a window near the end of the signal. +The last N samples are used as memory to an infinite impulse response (IIR) + filter. +</t> +<t> +The filter is then applied on a zero input to extrapolate the end of the signal. +Let a(k) be the kth LPC coefficient and x(n) be the nth sample of the signal, + each new sample past the end of the signal is computed as: +</t> +<figure align="center"> +<artwork align="center"><![CDATA[ + N + --- +x(n) = \ a(k)*x(n-k) + / + --- + k=1 +]]></artwork> +</figure> +<t> +The process is repeated independently for each channel. +It is possible to extend the beginning of the signal by applying the same + process backward in time. +When extending the beginning of the signal, it is best to apply a "fade in" to + the extrapolated signal, e.g. by multiplying it by a half-Hanning window + <xref target="hanning"/>. +</t> + +</section> + +<section anchor="continuous_chaining" title="Continuous Chaining"> +<t> +In some applications, such as Internet radio, it is desirable to cut a long + stream into smaller chains, e.g. so the comment header can be updated. +This can be done simply by separating the input streams into segments and + encoding each segment independently. +The drawback of this approach is that it creates a small discontinuity + at the boundary due to the lossy nature of Opus. +A muxer MAY avoid this discontinuity by using the following procedure: +<list style="numbers"> +<t>Encode the last frame of the first segment as an independent frame by + turning off all forms of inter-frame prediction. +De-emphasis is allowed.</t> +<t>Set the granule position of the last page to a point near the end of the + last frame.</t> +<t>Begin the second segment with a copy of the last frame of the first + segment.</t> +<t>Set the pre-skip value of the second stream in such a way as to properly + join the two streams.</t> +<t>Continue the encoding process normally from there, without any reset to + the encoder.</t> +</list> +</t> +<t> +In encoders derived from the reference implementation, inter-frame prediction + can be turned off by calling: +</t> +<figure align="center"> +<artwork align="center"><![CDATA[ + opus_encoder_ctl(encoder_state, OPUS_SET_PREDICTION_DISABLED(1)); +]]></artwork> +</figure> +<t> +For best results, this implementation requires that prediction be explicitly + enabled again before resuming normal encoding, even after a reset. +</t> + +</section> + +</section> + +<section anchor="implementation" title="Implementation Status"> +<t> +A brief summary of major implementations of this draft is available + at <eref target="https://wiki.xiph.org/OggOpusImplementation"/>, + along with their status. +</t> +<t> +[Note to RFC Editor: please remove this entire section before + final publication per <xref target="RFC6982"/>, along with + its references.] +</t> +</section> + +<section anchor="security" title="Security Considerations"> +<t> +Implementations of the Opus codec need to take appropriate security + considerations into account, as outlined in <xref target="RFC4732"/>. +This is just as much a problem for the container as it is for the codec itself. +Malicious payloads and/or input streams can be used to attack codec + implementations. +Implementations MUST NOT overrun their allocated memory nor consume excessive + resources when decoding payloads or processing input streams. +Although problems in encoding applications are typically rarer, this still + applies to a muxer, as vulnerabilities would allow an attacker to attack + transcoding gateways. +</t> + +<t> +Header parsing code contains the most likely area for potential overruns. +It is important for implementations to ensure their buffers contain enough + data for all of the required fields before attempting to read it (for example, + for all of the channel map data in the ID header). +Implementations would do well to validate the indices of the channel map, also, + to ensure they meet all of the restrictions outlined in + <xref target="channel_mapping"/>, in order to avoid attempting to read data + from channels that do not exist. +</t> + +<t> +To avoid excessive resource usage, we advise implementations to be especially + wary of streams that might cause them to process far more data than was + actually transmitted. +For example, a relatively small comment header may contain values for the + string lengths or user comment list length that imply that it is many + gigabytes in size. +Even computing the size of the required buffer could overflow a 32-bit integer, + and actually attempting to allocate such a buffer before verifying it would be + a reasonable size is a bad idea. +After reading the user comment list length, implementations might wish to + verify that the header contains at least the minimum amount of data for that + many comments (4 additional octets per comment, to indicate each has a + length of zero) before proceeding any further, again taking care to avoid + overflow in these calculations. +If allocating an array of pointers to point at these strings, the size of the + pointers may be larger than 4 octets, potentially requiring a separate + overflow check. +</t> + +<t> +Another bug in this class we have observed more than once involves the handling + of invalid data at the end of a stream. +Often, implementations will seek to the end of a stream to locate the last + timestamp in order to compute its total duration. +If they do not find a valid capture pattern and Ogg page from the desired + logical stream, they will back up and try again. +If care is not taken to avoid re-scanning data that was already scanned, this + search can quickly devolve into something with a complexity that is quadratic + in the amount of invalid data. +</t> + +<t> +In general when seeking, implementations will wish to be cautious about the + effects of invalid granule position values, and ensure all algorithms will + continue to make progress and eventually terminate, even if these are missing + or out-of-order. +</t> + +<t> +Like most other container formats, Ogg Opus streams SHOULD NOT be used with + insecure ciphers or cipher modes that are vulnerable to known-plaintext + attacks. +Elements such as the Ogg page capture pattern and the magic signatures in the + ID header and the comment header all have easily predictable values, in + addition to various elements of the codec data itself. +</t> +</section> + +<section anchor="content_type" title="Content Type"> +<t> +An "Ogg Opus file" consists of one or more sequentially multiplexed segments, + each containing exactly one Ogg Opus stream. +The RECOMMENDED mime-type for Ogg Opus files is "audio/ogg". +</t> + +<t> +If more specificity is desired, one MAY indicate the presence of Opus streams + using the codecs parameter defined in <xref target="RFC6381"/> and + <xref target="RFC5334"/>, e.g., +</t> +<figure> +<artwork align="center"><![CDATA[ + audio/ogg; codecs=opus +]]></artwork> +</figure> +<t> + for an Ogg Opus file. +</t> + +<t> +The RECOMMENDED filename extension for Ogg Opus files is '.opus'. +</t> + +<t> +When Opus is concurrently multiplexed with other streams in an Ogg container, + one SHOULD use one of the "audio/ogg", "video/ogg", or "application/ogg" + mime-types, as defined in <xref target="RFC5334"/>. +Such streams are not strictly "Ogg Opus files" as described above, + since they contain more than a single Opus stream per sequentially + multiplexed segment, e.g. video or multiple audio tracks. +In such cases the the '.opus' filename extension is NOT RECOMMENDED. +</t> + +<t> +In either case, this document updates <xref target="RFC5334"/> + to add 'opus' as a codecs parameter value with char[8]: 'OpusHead' + as Codec Identifier. +</t> +</section> + +<section anchor="iana" title="IANA Considerations"> +<t> +This document updates the IANA Media Types registry to add .opus + as a file extension for "audio/ogg", and to add itself as a reference + alongside <xref target="RFC5334"/> for "audio/ogg", "video/ogg", and + "application/ogg" Media Types. +</t> +<t> +This document defines a new registry "Opus Channel Mapping Families" to + indicate how the semantic meanings of the channels in a multi-channel Opus + stream are described. +IANA is requested to create a new name space of "Opus Channel Mapping + Families". +This will be a new registry on the IANA Matrix, and not a subregistry of an + existing registry. +Modifications to this registry follow the "Specification Required" registration + policy as defined in <xref target="RFC5226"/>. +Each registry entry consists of a Channel Mapping Family Number, which is + specified in decimal in the range 0 to 255, inclusive, and a Reference (or + list of references) +Each Reference must point to sufficient documentation to describe what + information is coded in the Opus identification header for this channel + mapping family, how a demuxer determines the Stream Count ('N') and Coupled + Stream Count ('M') from this information, and how it determines the proper + interpretation of each of the decoded channels. +</t> +<t> +This document defines three initial assignments for this registry. +</t> +<texttable> +<ttcol>Value</ttcol><ttcol>Reference</ttcol> +<c>0</c><c>[RFCXXXX] <xref target="channel_mapping_0"/></c> +<c>1</c><c>[RFCXXXX] <xref target="channel_mapping_1"/></c> +<c>255</c><c>[RFCXXXX] <xref target="channel_mapping_255"/></c> +</texttable> +<t> +The designated expert will determine if the Reference points to a specification + that meets the requirements for permanence and ready availability laid out + in <xref target="RFC5226"/> and that it specifies the information + described above with sufficient clarity to allow interoperable + implementations. +</t> +</section> + +<section anchor="Acknowledgments" title="Acknowledgments"> +<t> +Thanks to Ben Campbell, Joel M. Halpern, Mark Harris, Greg Maxwell, + Christopher "Monty" Montgomery, Jean-Marc Valin, Stephan Wenger, and Mo Zanaty + for their valuable contributions to this document. +Additional thanks to Andrew D'Addesio, Greg Maxwell, and Vincent Penquerc'h for + their feedback based on early implementations. +</t> +</section> + +<section title="RFC Editor Notes"> +<t> +In <xref target="iana"/>, "RFCXXXX" is to be replaced with the RFC number + assigned to this draft. +</t> +</section> + +</middle> +<back> +<references title="Normative References"> + &rfc2119; + &rfc3533; + &rfc3629; + &rfc5226; + &rfc5334; + &rfc6381; + &rfc6716; + +<reference anchor="EBU-R128" target="https://tech.ebu.ch/loudness"> +<front> + <title>Loudness Recommendation EBU R128</title> + <author> + <organization>EBU Technical Committee</organization> + </author> + <date month="August" year="2011"/> +</front> +</reference> + +<reference anchor="vorbis-comment" + target="https://www.xiph.org/vorbis/doc/v-comment.html"> +<front> +<title>Ogg Vorbis I Format Specification: Comment Field and Header + Specification</title> +<author initials="C." surname="Montgomery" + fullname="Christopher "Monty" Montgomery"/> +<date month="July" year="2002"/> +</front> +</reference> + +</references> + +<references title="Informative References"> + +<!--?rfc include="http://xml.resource.org/public/rfc/bibxml/reference.RFC.3550.xml"?--> + &rfc4732; + &rfc6982; + &rfc7587; + +<reference anchor="flac" + target="https://xiph.org/flac/format.html"> + <front> + <title>FLAC - Free Lossless Audio Codec Format Description</title> + <author initials="J." surname="Coalson" fullname="Josh Coalson"/> + <date month="January" year="2008"/> + </front> +</reference> + +<reference anchor="hanning" + target="https://en.wikipedia.org/w/index.php?title=Window_function&oldid=703074467#Hann_.28Hanning.29_window"> + <front> + <title>Hann window</title> + <author> + <organization>Wikipedia</organization> + </author> + <date month="February" year="2016"/> + </front> +</reference> + +<reference anchor="linear-prediction" + target="https://en.wikipedia.org/w/index.php?title=Linear_predictive_coding&oldid=687498962"> + <front> + <title>Linear Predictive Coding</title> + <author> + <organization>Wikipedia</organization> + </author> + <date month="October" year="2015"/> + </front> +</reference> + +<reference anchor="lpc-sample" + target="https://svn.xiph.org/trunk/vorbis/lib/lpc.c"> +<front> + <title>Autocorrelation LPC coeff generation algorithm + (Vorbis source code)</title> +<author initials="J." surname="Degener" fullname="Jutta Degener"/> +<author initials="C." surname="Bormann" fullname="Carsten Bormann"/> +<date month="November" year="1994"/> +</front> +</reference> + +<reference anchor="q-notation" + target="https://en.wikipedia.org/w/index.php?title=Q_%28number_format%29&oldid=697252615"> +<front> +<title>Q (number format)</title> +<author><organization>Wikipedia</organization></author> +<date month="December" year="2015"/> +</front> +</reference> + +<reference anchor="replay-gain" + target="https://wiki.xiph.org/VorbisComment#Replay_Gain"> +<front> +<title>VorbisComment: Replay Gain</title> +<author initials="C." surname="Parker" fullname="Conrad Parker"/> +<author initials="M." surname="Leese" fullname="Martin Leese"/> +<date month="June" year="2009"/> +</front> +</reference> + +<reference anchor="seeking" + target="https://wiki.xiph.org/Seeking"> +<front> +<title>Granulepos Encoding and How Seeking Really Works</title> +<author initials="S." surname="Pfeiffer" fullname="Silvia Pfeiffer"/> +<author initials="C." surname="Parker" fullname="Conrad Parker"/> +<author initials="G." surname="Maxwell" fullname="Greg Maxwell"/> +<date month="May" year="2012"/> +</front> +</reference> + +<reference anchor="vorbis-mapping" + target="https://www.xiph.org/vorbis/doc/Vorbis_I_spec.html#x1-810004.3.9"> +<front> +<title>The Vorbis I Specification, Section 4.3.9 Output Channel Order</title> +<author initials="C." surname="Montgomery" + fullname="Christopher "Monty" Montgomery"/> +<date month="January" year="2010"/> +</front> +</reference> + +<reference anchor="vorbis-trim" + target="https://xiph.org/vorbis/doc/Vorbis_I_spec.html#x1-132000A.2"> + <front> + <title>The Vorbis I Specification, Appendix A: Embedding Vorbis + into an Ogg stream</title> + <author initials="C." surname="Montgomery" + fullname="Christopher "Monty" Montgomery"/> + <date month="November" year="2008"/> + </front> +</reference> + +<reference anchor="wave-multichannel" + target="http://msdn.microsoft.com/en-us/windows/hardware/gg463006.aspx"> + <front> + <title>Multiple Channel Audio Data and WAVE Files</title> + <author> + <organization>Microsoft Corporation</organization> + </author> + <date month="March" year="2007"/> + </front> +</reference> + +</references> + +</back> +</rfc> |
