# EBML/Matroska parser
# Copyright (C) 2010 Johannes Sasongko <sasongko@gmail.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
#
# The developers of the Exaile media player hereby grant permission
# for non-GPL compatible GStreamer and Exaile plugins to be used and
# distributed together with GStreamer and Exaile. This permission is
# above and beyond the permissions granted by the GPL license by which
# Exaile is covered. If you modify this code, you may extend this
# exception to your version of the code, but you are not obligated to
# do so. If you do not wish to do so, delete this exception statement
# from your version.


# This code is heavily based on public domain code by "Omion" (from the
# Hydrogenaudio forums), as obtained from Matroska's Subversion repository at
# revision 858 (2004-10-03), under "/trunk/Perl.Parser/MatroskaParser.pm".
|
|
|
|
|
|
import sys
|
|
from struct import unpack
|
|
|
|
# Element data types, used as the `type` member of tag table entries
# ({ id: (name, type) }).
SINT = 0
UINT = 1
FLOAT = 2
STRING = 3
UTF8 = 4
DATE = 5
MASTER = 6
BINARY = 7
|
|
|
|
class EbmlException(Exception):
    """Raised by the EBML reader when data cannot be decoded."""
|
|
|
|
class BinaryData(bytes):
    """Raw payload of a BINARY element.

    Derives from `bytes`, which is an alias of `str` on Python 2.6+, so
    behaviour there is unchanged. On Python 3 a `str` base would turn the
    payload into its textual ``"b'...'"`` representation instead of keeping
    the raw octets.
    """
|
|
class UnknownData:
    """Empty marker class; nothing in this module instantiates it."""
|
|
|
|
class Ebml:
|
|
"""EBML parser.
|
|
|
|
Usage: Ebml(location, tags).parse()
|
|
tags is a dictionary of the form { id: (name, type) }.
|
|
"""
|
|
|
|
## Constructor and destructor
|
|
|
|
def __init__(self, location, tags):
|
|
self.tags = tags
|
|
self.open(location)
|
|
|
|
def __del__(self):
|
|
self.close()
|
|
|
|
## File access.
|
|
## These can be overridden to provide network support.
|
|
|
|
def open(self, location):
|
|
"""Open a location and set self.size."""
|
|
self.file = f = open(location, 'rb')
|
|
f = self.file
|
|
f.seek(0, 2)
|
|
self.size = f.tell()
|
|
f.seek(0, 0)
|
|
|
|
def seek(self, offset, mode):
|
|
self.file.seek(offset, mode)
|
|
|
|
def tell(self):
|
|
return self.file.tell()
|
|
|
|
def read(self, length):
|
|
return self.file.read(length)
|
|
|
|
def close(self):
|
|
self.file.close()
|
|
|
|
## Element reading
|
|
|
|
def readSize(self):
|
|
b1 = self.read(1)
|
|
b1b = ord(b1)
|
|
if b1b & 0x80:
|
|
# 1 byte
|
|
return b1b & 0x7f
|
|
elif b1b & 0x40:
|
|
# 2 bytes
|
|
# JS: BE-ushort
|
|
return unpack(">H", chr(0x40 ^ b1b) + self.read(1))[0]
|
|
elif b1b & 0x20:
|
|
# 3 bytes
|
|
# JS: BE-ulong
|
|
return unpack(">L", "\0" + chr(0x20 ^ b1b) + self.read(2))[0]
|
|
elif b1b & 0x10:
|
|
# 4 bytes
|
|
# JS: BE-ulong
|
|
return unpack(">L", chr(0x10 ^ b1b) + self.read(3))[0]
|
|
elif b1b & 0x08:
|
|
# 5 bytes
|
|
# JS: uchar BE-ulong. We change this to BE uchar ulong.
|
|
high, low = unpack(">BL", chr(0x08 ^ b1b) + self.read(4))
|
|
return high * 4294967296 + low
|
|
elif b1b & 0x04:
|
|
# 6 bytes
|
|
# JS: BE-slong BE-ulong
|
|
high, low = unpack(">HL", chr(0x04 ^ b1b) + self.read(5))
|
|
return high * 4294967296 + low
|
|
elif b1b & 0x02:
|
|
# 7 bytes
|
|
# JS: BE-ulong BE-ulong
|
|
high, low = unpack(">LL",
|
|
"\0" + chr(0x02 ^ b1b) + self.read(6))
|
|
return high * 4294967296 + low
|
|
elif b1b & 0x01:
|
|
# 8 bytes
|
|
# JS: BE-ulong BE-ulong
|
|
high, low = unpack(">LL", chr(0x01 ^ b1b) + self.read(7))
|
|
return high * 4294967296 + low
|
|
else:
|
|
raise EbmlException(
|
|
"invalid element size with leading byte 0x%X" % b1b)
|
|
|
|
def readInteger(self, length):
|
|
if length == 1:
|
|
# 1 byte
|
|
return ord(self.read(1))
|
|
elif length == 2:
|
|
# 2 bytes
|
|
return unpack(">H", self.read(2))[0]
|
|
elif length == 3:
|
|
# 3 bytes
|
|
return unpack(">L", "\0" + self.read(3))[0]
|
|
elif length == 4:
|
|
# 4 bytes
|
|
return unpack(">L", self.read(4))[0]
|
|
elif length == 5:
|
|
# 5 bytes
|
|
high, low = unpack(">BL", self.read(5))
|
|
return high * 4294967296 + low
|
|
elif length == 6:
|
|
# 6 bytes
|
|
high, low = unpack(">HL", self.read(6))
|
|
return high * 4294967296 + low
|
|
elif length == 7:
|
|
# 7 bytes
|
|
high, low = unpack(">LL", "\0" + (self.read(7)))
|
|
return high * 4294967296 + low
|
|
elif length == 8:
|
|
# 8 bytes
|
|
high, low = unpack(">LL", self.read(8))
|
|
return high * 4294967296 + low
|
|
else:
|
|
raise EbmlException(
|
|
"don't know how to read %r-byte integer" % length)
|
|
|
|
def readFloat(self, length):
|
|
# Need to reverse the bytes for little-endian machines
|
|
if length == 4:
|
|
# single
|
|
return unpack('@f', self.read(4)[::-1])[0]
|
|
elif length == 8:
|
|
# double
|
|
return unpack('@d', self.read(8)[::-1])[0]
|
|
elif length == 10:
|
|
# extended (don't know how to handle it)
|
|
return 'EXTENDED'
|
|
else:
|
|
raise EbmlException("don't know how to read %r-byte float" % length)
|
|
|
|
def readID(self):
|
|
b1 = self.read(1)
|
|
b1b = ord(b1)
|
|
if b1b & 0x80:
|
|
# 1 byte
|
|
return b1b & 0x7f
|
|
elif b1b & 0x40:
|
|
# 2 bytes
|
|
return unpack(">H", chr(0x40 ^ b1b) + self.read(1))[0]
|
|
elif b1b & 0x20:
|
|
# 3 bytes
|
|
return unpack(">L", "\0" + chr(0x20 ^ b1b) + self.read(2))[0]
|
|
elif b1b & 0x10:
|
|
# 4 bytes
|
|
return unpack(">L", chr(0x10 ^ b1b) + self.read(3))[0]
|
|
else:
|
|
raise EbmlException(
|
|
"invalid element ID with leading byte 0x%X" % b1b)
|
|
|
|
## Parsing
|
|
|
|
def parse(self, from_=0, to=None):
|
|
"""Parses EBML from `from_` to `to`.
|
|
|
|
Note that not all streams support seeking backwards, so prepare to handle
|
|
an exception if you try to parse from arbitrary position.
|
|
"""
|
|
if to is None:
|
|
to = self.size
|
|
self.seek(from_, 0)
|
|
node = {}
|
|
# Iterate over current node's children.
|
|
while self.tell() < to:
|
|
try:
|
|
id = self.readID()
|
|
except EbmlException, e:
|
|
# Invalid EBML header. We can't reliably get any more data from
|
|
# this level, so just return anything we have.
|
|
print >>sys.stderr, "ERROR:", e
|
|
return node
|
|
size = self.readSize()
|
|
try:
|
|
key, type_ = self.tags[id]
|
|
except KeyError:
|
|
self.seek(size, 1)
|
|
else:
|
|
try:
|
|
if type_ is MASTER:
|
|
tell = self.tell()
|
|
value = self.parse(tell, tell + size)
|
|
elif type_ in (SINT, UINT, DATE):
|
|
value = self.readInteger(size)
|
|
elif type_ is FLOAT:
|
|
value = self.readFloat(size)
|
|
elif type_ is STRING:
|
|
value = unicode(self.read(size), 'ascii')
|
|
elif type_ is UTF8:
|
|
value = unicode(self.read(size), 'utf-8')
|
|
elif type_ is BINARY:
|
|
value = BinaryData(self.read(size))
|
|
else:
|
|
assert False
|
|
except (EbmlException, UnicodeDecodeError), e:
|
|
print >>sys.stderr, "WARNING:", e
|
|
try:
|
|
parentval = node[key]
|
|
except KeyError:
|
|
parentval = node[key] = []
|
|
parentval.append(value)
|
|
return node
|
|
|
|
'''Hydrus Dev deleted this!
|
|
## GIO-specific code
|
|
|
|
import gio
|
|
|
|
class GioEbml(Ebml):
|
|
# NOTE: All seeks are faked using InputStream.skip because we need to use
|
|
# BufferedInputStream but it does not implement Seekable.
|
|
|
|
def open(self, location):
|
|
f = gio.File(location)
|
|
self.buffer = gio.BufferedInputStream(f.read())
|
|
self._tell = 0
|
|
|
|
self.size = f.query_info('standard::size').get_size()
|
|
|
|
def seek(self, offset, mode):
|
|
if mode == 0:
|
|
skip = offset - self._tell
|
|
elif mode == 1:
|
|
skip = offset
|
|
elif mode == 2:
|
|
skip = self.size - self._tell + offset
|
|
else:
|
|
raise ValueError("invalid seek mode: %r" % offset)
|
|
if skip < 0:
|
|
raise gio.Error("cannot seek backwards from %d" % self._tell)
|
|
self._tell += skip
|
|
self.buffer.skip(skip)
|
|
|
|
def tell(self):
|
|
return self._tell
|
|
|
|
def read(self, length):
|
|
result = self.buffer.read(length)
|
|
self._tell += len(result)
|
|
return result
|
|
|
|
def close(self):
|
|
self.buffer.close()
|
|
'''
|
|
|
|
## Matroska-specific code
|
|
|
|
# Interesting Matroska tags.
|
|
# Tags not defined here are skipped while parsing.
|
|
# Keys are element IDs with the length-marker bit cleared, i.e. in the form
# Ebml.readID() returns them; values are (name, type) pairs.
MatroskaTags = {
    0x0A45DFA3: ('EBML', MASTER),
    0x0282: ('DocType', STRING),  # hydrus dev added this
    # Segment
    0x08538067: ('Segment', MASTER),
    # Segment Information
    0x0549A966: ('Info', MASTER),
    0x3384: ('SegmentFilename', UTF8),
    0x0AD7B1: ('TimecodeScale', UINT),
    0x0489: ('Duration', FLOAT),
    0x0461: ('DateUTC', DATE),
    0x3BA9: ('Title', UTF8),
    0x0D80: ('MuxingApp', UTF8),
    0x1741: ('WritingApp', UTF8),
    # Track
    0x0654AE6B: ('Tracks', MASTER),
    0x2E: ('TrackEntry', MASTER),
    0x57: ('TrackNumber', UINT),
    0x03: ('TrackType', UINT),
    0x29: ('FlagEnabled', UINT),
    0x08: ('FlagDefault', UINT),
    0x03E383: ('DefaultDuration', UINT),
    0x03314F: ('TrackTimecodeScale', FLOAT),
    0x137F: ('TrackOffset', SINT),
    0x136E: ('Name', UTF8),
    0x02B59C: ('Language', STRING),
    0x06: ('CodecID', STRING),
    0x058688: ('CodecName', UTF8),
    0x1A9697: ('CodecSettings', UTF8),
    0x1B4040: ('CodecInfoURL', STRING),
    0x06B240: ('CodecDownloadURL', STRING),
    0x2A: ('CodecDecodeAll', UINT),
    0x2FAB: ('TrackOverlay', UINT),
    # Video
    0x60: ('Video', MASTER),
    0x30: ('PixelWidth', UINT),  # hydrus dev added this
    0x3A: ('PixelHeight', UINT),  # hydrus dev added this
    # Audio
    0x61: ('Audio', MASTER),
    # The two sampling frequencies are float elements in Matroska; reading
    # them as integers yields the raw IEEE bit pattern, not the rate in Hz.
    0x35: ('SamplingFrequency', FLOAT),
    0x38B5: ('OutputSamplingFrequency', FLOAT),
    0x1F: ('Channels', UINT),
    0x3D7B: ('ChannelPositions', BINARY),
    0x2264: ('BitDepth', UINT),
    # Content Encoding
    0x2D80: ('ContentEncodings', MASTER),
    0x2240: ('ContentEncoding', MASTER),
    0x1031: ('ContentEncodingOrder', UINT),
    0x1032: ('ContentEncodingScope', UINT),
    0x1033: ('ContentEncodingType', UINT),
    0x1034: ('ContentCompression', MASTER),
    0x0254: ('ContentCompAlgo', UINT),
    0x0255: ('ContentCompSettings', BINARY),
    # Chapters
    0x0043A770: ('Chapters', MASTER),
    0x05B9: ('EditionEntry', MASTER),
    0x05BC: ('EditionUID', UINT),
    0x05BD: ('EditionFlagHidden', UINT),
    0x05DB: ('EditionFlagDefault', UINT),
    0x05DD: ('EditionManaged', UINT),
    0x36: ('ChapterAtom', MASTER),
    0x33C4: ('ChapterUID', UINT),
    0x11: ('ChapterTimeStart', UINT),
    0x12: ('ChapterTimeEnd', UINT),
    0x18: ('ChapterFlagHidden', UINT),
    0x0598: ('ChapterFlagEnabled', UINT),
    0x23C3: ('ChapterPhysicalEquiv', UINT),
    0x0F: ('ChapterTrack', MASTER),
    0x09: ('ChapterTrackNumber', UINT),
    0x00: ('ChapterDisplay', MASTER),
    0x05: ('ChapString', UTF8),
    0x037C: ('ChapLanguage', STRING),
    0x037E: ('ChapCountry', STRING),
    # Tagging
    0x0254C367: ('Tags', MASTER),
    0x3373: ('Tag', MASTER),
    0x23C0: ('Targets', MASTER),
    0x28CA: ('TargetTypevalue', UINT),
    0x23CA: ('TargetType', STRING),
    0x23C9: ('EditionUID', UINT),
    0x23C4: ('ChapterUID', UINT),
    0x23C5: ('TrackUID', UINT),
    0x23C6: ('AttachmentUID', UINT),
    0x27C8: ('SimpleTag', MASTER),
    0x05A3: ('TagName', UTF8),
    0x047A: ('TagLanguage', STRING),
    0x0484: ('TagDefault', UINT),
    0x0487: ('TagString', UTF8),
    0x0485: ('TagBinary', BINARY),
}
|
|
|
|
def parse(location):
    """Parse the Matroska file at `location` and return its element tree.

    The result is the nested dictionary produced by Ebml.parse() using the
    MatroskaTags table.
    """
    reader = Ebml(location, MatroskaTags)
    try:
        return reader.parse()
    finally:
        # Release the file now instead of waiting for the reader to be
        # garbage-collected.
        reader.close()
|
|
|
|
def dump(location):
    """Pretty-print the full parsed element tree of `location`."""
    import pprint

    tree = parse(location)
    pprint.pprint(tree)
|
|
|
|
def dump_tags(location):
    """Print the duration and the tags of the Matroska file at `location`.

    Raises KeyError if the file has no Segment, Info, Duration or Tags
    element among the parsed data.
    """
    from pprint import pprint

    mka = parse(location)
    segment = mka['Segment'][0]
    info = segment['Info'][0]
    # TimecodeScale is optional in the file; Matroska defines its default as
    # 1,000,000 ns per tick.
    scale = info.get('TimecodeScale', [1000000])[0]
    length = info['Duration'][0] * scale / 1e9
    # Parenthesised single argument prints identically on Python 2 and 3.
    print("Length = %f seconds" % length)
    pprint(segment['Tags'][0]['Tag'])
|
|
|
|
if __name__ == '__main__':
    import sys

    # Script entry point: dump the tags of the file named by the first
    # command-line argument.
    location = sys.argv[1]
    is_local_win32_path = sys.platform == 'win32' and '://' not in location
    if is_local_win32_path:
        # XXX: This is most likely a bug in the Win32 GIO port; it converts
        # paths into UTF-8 and requires them to be specified in UTF-8 as well.
        # Here we decode the path according to the FS encoding to get the
        # Unicode representation first. If the path is in a different encoding,
        # this step will fail.
        fs_encoding = sys.getfilesystemencoding()
        location = location.decode(fs_encoding).encode('utf-8')
    dump_tags(location)
|
|
|
|
|
|
# vi: et sts=4 sw=4 ts=4
|