Merge pull request #2427 from ujjwal96/protobuf
Kaitai parser for protobuf
This commit is contained in:
commit
3814f171dd
|
@ -1,6 +1,63 @@
|
|||
import subprocess
|
||||
import io
|
||||
|
||||
from kaitaistruct import KaitaiStream
|
||||
from . import base
|
||||
from mitmproxy.contrib.kaitaistruct import google_protobuf
|
||||
|
||||
|
||||
def write_buf(out, field_tag, body, indent_level):
    """Write one line of the text rendering of a protobuf message to *out*.

    Exactly one of three line shapes is emitted, each prefixed with
    ``indent_level`` spaces:

    * ``body`` is not None        -> ``<field_tag>: <body>``
    * only ``field_tag`` is given -> ``<field_tag> {`` (start of a nested block)
    * both are None               -> ``}`` (end of a nested block)

    Args:
        out: Text stream with a ``write`` method (e.g. ``io.StringIO``).
        field_tag: Field number to print, or None for a closing brace.
        body: Scalar value to print: an ``int`` is printed as is, anything
            else is treated as a bytes-like object and decoded as UTF-8.
        indent_level: Number of leading spaces.
    """
    indent = ' ' * indent_level

    if body is not None:
        if isinstance(body, int):
            text = body
        else:
            # Length-delimited payloads are arbitrary bytes, not necessarily
            # text. Escape undecodable bytes as \xNN instead of raising
            # UnicodeDecodeError and losing the whole rendering.
            text = str(body, 'utf-8', 'backslashreplace')
        out.write("{}{}: {}\n".format(indent, field_tag, text))
    elif field_tag is not None:
        out.write(indent + str(field_tag) + " {\n")
    else:
        out.write(indent + "}\n")
|
||||
|
||||
|
||||
def format_pbuf(raw):
    """Render a serialized protobuf message as indented text, without a schema.

    The message is parsed with the Kaitai ``GoogleProtobuf`` parser. Every
    length-delimited field whose payload itself parses as protobuf is shown
    as a nested ``<tag> { ... }`` block; everything else is shown as
    ``<tag>: <value>``. Nesting is walked with an explicit stack rather than
    recursion.

    Args:
        raw: The serialized message as bytes.

    Returns:
        The rendered text, or ``False`` if *raw* cannot be parsed at all.
    """
    try:
        buf = google_protobuf.GoogleProtobuf(KaitaiStream(io.BytesIO(raw)))
    except Exception:
        return False

    out = io.StringIO()
    # Pairs are pushed in reverse so that pop() yields them in wire order.
    stack = [(pair, 0) for pair in reversed(buf.pairs)]

    while stack:
        pair, indent_level = stack.pop()
        wire_type = pair.wire_type
        wire_types = pair.WireTypes

        field_tag = pair.field_tag
        body = None
        children = None

        if wire_type == wire_types.group_start:
            # Rendered as "<tag> {"; the parser stores no value for groups.
            pass
        elif wire_type == wire_types.group_end:
            # Rendered as a bare "}". A local is used instead of overwriting
            # the parser object's memoized field tag.
            field_tag = None
        elif wire_type == wire_types.len_delimited:
            body = pair.value.body
            # An empty payload trivially "parses" as a message with no
            # fields, which would open a brace that is never closed. Only
            # non-empty payloads are candidates for nested messages.
            if body:
                try:
                    nested = google_protobuf.GoogleProtobuf(KaitaiStream(io.BytesIO(body)))
                except Exception:
                    # Not valid protobuf: fall back to printing it as a value.
                    children = None
                else:
                    children = nested.pairs
        elif wire_type == wire_types.varint:
            body = pair.value.value
        else:
            # Fixed-width 32/64-bit values are stored directly as integers.
            body = pair.value

        if children is not None:
            stack.extend((child, indent_level + 2) for child in reversed(children))
            write_buf(out, field_tag, None, indent_level)
        else:
            write_buf(out, field_tag, body, indent_level)

        # If the next pair to be processed sits at a shallower level (or
        # nothing is left), this pair was the last one of its enclosing
        # block(s): emit one closing brace per level being left.
        next_level = stack[-1][1] if stack else 0
        for level in range(indent_level - 2, next_level - 2, -2):
            write_buf(out, None, None, level)

    return out.getvalue()
|
||||
|
||||
|
||||
class ViewProtobuf(base.View):
|
||||
|
@ -15,28 +72,9 @@ class ViewProtobuf(base.View):
|
|||
"application/x-protobuffer",
|
||||
]
|
||||
|
||||
def is_available(self):
|
||||
try:
|
||||
p = subprocess.Popen(
|
||||
["protoc", "--version"],
|
||||
stdout=subprocess.PIPE
|
||||
)
|
||||
out, _ = p.communicate()
|
||||
return out.startswith(b"libprotoc")
|
||||
except:
|
||||
return False
|
||||
|
||||
def __call__(self, data, **metadata):
|
||||
if not self.is_available():
|
||||
raise NotImplementedError("protoc not found. Please make sure 'protoc' is available in $PATH.")
|
||||
|
||||
# if Popen raises OSError, it will be caught in
|
||||
# get_content_view and fall back to Raw
|
||||
p = subprocess.Popen(['protoc', '--decode_raw'],
|
||||
stdin=subprocess.PIPE,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE)
|
||||
decoded, _ = p.communicate(input=data)
|
||||
decoded = format_pbuf(data)
|
||||
if not decoded:
|
||||
raise ValueError("Failed to parse input.")
|
||||
|
||||
return "Protobuf", base.format_text(decoded)
|
||||
|
|
|
@ -0,0 +1,124 @@
|
|||
# This is a generated file! Please edit source .ksy file and use kaitai-struct-compiler to rebuild
|
||||
|
||||
from pkg_resources import parse_version
|
||||
from kaitaistruct import __version__ as ks_version, KaitaiStruct, KaitaiStream, BytesIO
|
||||
from enum import Enum
|
||||
|
||||
|
||||
if parse_version(ks_version) < parse_version('0.7'):
|
||||
raise Exception("Incompatible Kaitai Struct Python API: 0.7 or later is required, but you have %s" % (ks_version))
|
||||
|
||||
from .vlq_base128_le import VlqBase128Le
|
||||
class GoogleProtobuf(KaitaiStruct):
    """Google Protocol Buffers (AKA protobuf) is a popular data
    serialization scheme used for communication protocols, data storage,
    etc. Implementations are available for almost every popular
    language. The focus points of this scheme are brevity (data is
    encoded in a very size-efficient manner) and extensibility (one can
    add keys to the structure, while keeping it readable in previous
    versions of software).

    Protobuf uses a semi-self-describing encoding scheme for its
    messages. It means that it is possible to parse the overall structure
    of the message (skipping over fields one can't understand), but to
    fully understand the message, one needs a protocol definition file
    (`.proto`). To be specific:

    * "Keys" in key-value pairs provided in the message are identified
      only with an integer "field tag". The `.proto` file provides info
      on which symbolic field names these field tags map to.
    * "Keys" also provide something called "wire type". It's not a data
      type in its common sense (i.e. you can't, for example, distinguish
      `sint32` vs `uint32` vs some enum, or `string` from `bytes`), but
      it's enough information to determine how many bytes to
      parse. Interpretation of the value should be done according to the
      type specified in the `.proto` file.
    * There's no direct information on which fields are optional /
      required, which fields may be repeated or constitute a map, what
      restrictions are placed on fields usage in a single message, what
      are the fields' default values, etc, etc.

    .. seealso::
       Source - https://developers.google.com/protocol-buffers/docs/encoding
    """

    def __init__(self, _io, _parent=None, _root=None):
        self._io = _io
        self._parent = _parent
        self._root = _root or self
        self._read()

    def _read(self):
        # Consume key-value pairs until the stream is exhausted.
        stream = self._io
        pairs = self.pairs = []
        while not stream.is_eof():
            pairs.append(self._root.Pair(stream, self, self._root))

    class Pair(KaitaiStruct):
        """Key-value pair."""

        class WireTypes(Enum):
            varint = 0
            bit_64 = 1
            len_delimited = 2
            group_start = 3
            group_end = 4
            bit_32 = 5

        def __init__(self, _io, _parent=None, _root=None):
            self._io = _io
            self._parent = _parent
            self._root = _root or self
            self._read()

        def _read(self):
            # The key is a varint carrying both the field tag and wire type.
            self.key = VlqBase128Le(self._io)
            kind = self.wire_type
            wire_types = self._root.Pair.WireTypes
            if kind == wire_types.varint:
                self.value = VlqBase128Le(self._io)
            elif kind == wire_types.len_delimited:
                self.value = self._root.DelimitedBytes(self._io, self, self._root)
            elif kind == wire_types.bit_64:
                self.value = self._io.read_u8le()
            elif kind == wire_types.bit_32:
                self.value = self._io.read_u4le()
            # group_start / group_end carry no payload: `value` is left unset.

        @property
        def wire_type(self):
            """The "wire type" is the part of the key that carries enough
            information to parse the value from the wire, i.e. read the
            correct amount of bytes, but there's not enough information to
            interpret it unambiguously. For example, one can't clearly
            distinguish 64-bit fixed-sized integers from 64-bit floats,
            signed zigzag-encoded varints from regular unsigned varints,
            arbitrary bytes from UTF-8 encoded strings, etc.
            """
            # Computed once from the low 3 bits of the key, then memoized.
            if not hasattr(self, '_m_wire_type'):
                self._m_wire_type = self._root.Pair.WireTypes(self.key.value & 7)
            return self._m_wire_type

        @property
        def field_tag(self):
            """Identifies a field of the protocol. One can look up the
            symbolic field name in a `.proto` file by this field tag.
            """
            # Computed once from the key bits above the wire type, then memoized.
            if not hasattr(self, '_m_field_tag'):
                self._m_field_tag = self.key.value >> 3
            return self._m_field_tag

    class DelimitedBytes(KaitaiStruct):
        """Length-prefixed byte string: a varint length followed by that many bytes."""

        def __init__(self, _io, _parent=None, _root=None):
            self._io = _io
            self._parent = _parent
            self._root = _root or self
            self._read()

        def _read(self):
            self.len = VlqBase128Le(self._io)
            self.body = self._io.read_bytes(self.len.value)
|
||||
|
||||
|
||||
|
|
@ -7,5 +7,7 @@ wget -N https://raw.githubusercontent.com/kaitai-io/kaitai_struct_formats/master
|
|||
wget -N https://raw.githubusercontent.com/kaitai-io/kaitai_struct_formats/master/image/jpeg.ksy
|
||||
wget -N https://raw.githubusercontent.com/kaitai-io/kaitai_struct_formats/master/image/png.ksy
|
||||
wget -N https://raw.githubusercontent.com/kaitai-io/kaitai_struct_formats/master/image/ico.ksy
|
||||
wget -N https://raw.githubusercontent.com/kaitai-io/kaitai_struct_formats/master/common/vlq_base128_le.ksy
|
||||
wget -N https://raw.githubusercontent.com/kaitai-io/kaitai_struct_formats/master/serialization/google_protobuf.ksy
|
||||
|
||||
kaitai-struct-compiler --target python --opaque-types=true *.ksy
|
||||
|
|
|
@ -0,0 +1,94 @@
|
|||
# This is a generated file! Please edit source .ksy file and use kaitai-struct-compiler to rebuild
|
||||
|
||||
from pkg_resources import parse_version
|
||||
from kaitaistruct import __version__ as ks_version, KaitaiStruct, KaitaiStream, BytesIO
|
||||
|
||||
|
||||
if parse_version(ks_version) < parse_version('0.7'):
|
||||
raise Exception("Incompatible Kaitai Struct Python API: 0.7 or later is required, but you have %s" % (ks_version))
|
||||
|
||||
class VlqBase128Le(KaitaiStruct):
    """A variable-length unsigned integer using base128 encoding. 1-byte groups
    consist of a 1-bit continuation flag and a 7-bit value, and are ordered
    "least significant group first", i.e. in a "little-endian" manner.

    This particular encoding is specified and used in:

    * DWARF debug file format, where it's dubbed "unsigned LEB128" or "ULEB128".
      http://dwarfstd.org/doc/dwarf-2.0.0.pdf - page 139
    * Google Protocol Buffers, where it's called "Base 128 Varints".
      https://developers.google.com/protocol-buffers/docs/encoding?csw=1#varints
    * Apache Lucene, where it's called "VInt"
      http://lucene.apache.org/core/3_5_0/fileformats.html#VInt
    * Apache Avro uses this as a basis for integer encoding, adding ZigZag on
      top of it for signed ints
      http://avro.apache.org/docs/current/spec.html#binary_encode_primitive

    More information on this encoding is available at https://en.wikipedia.org/wiki/LEB128

    This implementation combines every group that was read, so the decoded
    value is not limited to a fixed number of serialized bytes.
    """

    def __init__(self, _io, _parent=None, _root=None):
        self._io = _io
        self._parent = _parent
        self._root = _root or self
        self._read()

    def _read(self):
        # Read one byte group at a time until a group without the
        # continuation flag is seen.
        self.groups = []
        while True:
            group = self._root.Group(self._io, self, self._root)
            self.groups.append(group)
            if not group.has_next:
                break

    class Group(KaitaiStruct):
        """One byte group, clearly divided into a 7-bit "value" and a 1-bit
        "has continuation in the next byte" flag.
        """

        def __init__(self, _io, _parent=None, _root=None):
            self._io = _io
            self._parent = _parent
            self._root = _root or self
            self._read()

        def _read(self):
            self.b = self._io.read_u1()

        @property
        def has_next(self):
            """If true, then we have more bytes to read."""
            if not hasattr(self, '_m_has_next'):
                self._m_has_next = (self.b & 128) != 0
            return self._m_has_next

        @property
        def value(self):
            """The 7-bit (base128) numeric value of this group."""
            if not hasattr(self, '_m_value'):
                self._m_value = self.b & 127
            return self._m_value

    @property
    def len(self):
        """Number of byte groups that make up this integer."""
        if not hasattr(self, '_m_len'):
            self._m_len = len(self.groups)
        return self._m_len

    @property
    def value(self):
        """Resulting value as normal integer."""
        if not hasattr(self, '_m_value'):
            # Group i contributes its 7 bits at bit offset 7 * i. Summing all
            # groups (rather than a fixed first eight) keeps the high bits of
            # longer encodings, e.g. 9- and 10-byte protobuf varints.
            total = 0
            for index, group in enumerate(self.groups):
                total += group.value << (7 * index)
            self._m_value = total
        return self._m_value
|
||||
|
||||
|
|
@ -1,52 +1,31 @@
|
|||
from unittest import mock
|
||||
import pytest
|
||||
|
||||
from mitmproxy.contentviews import protobuf
|
||||
from mitmproxy.test import tutils
|
||||
from . import full_eval
|
||||
|
||||
data = tutils.test_data.push("mitmproxy/contentviews/test_protobuf_data/")
|
||||
|
||||
|
||||
def test_view_protobuf_request():
|
||||
v = full_eval(protobuf.ViewProtobuf())
|
||||
p = tutils.test_data.path("mitmproxy/data/protobuf01")
|
||||
p = data.path("protobuf01")
|
||||
|
||||
with mock.patch('mitmproxy.contentviews.protobuf.ViewProtobuf.is_available'):
|
||||
with mock.patch('subprocess.Popen') as n:
|
||||
m = mock.Mock()
|
||||
attrs = {'communicate.return_value': (b'1: "3bbc333c-e61c-433b-819a-0b9a8cc103b8"', True)}
|
||||
m.configure_mock(**attrs)
|
||||
n.return_value = m
|
||||
|
||||
with open(p, "rb") as f:
|
||||
data = f.read()
|
||||
content_type, output = v(data)
|
||||
assert content_type == "Protobuf"
|
||||
assert output[0] == [('text', b'1: "3bbc333c-e61c-433b-819a-0b9a8cc103b8"')]
|
||||
|
||||
m.communicate = mock.MagicMock()
|
||||
m.communicate.return_value = (None, None)
|
||||
with pytest.raises(ValueError, matches="Failed to parse input."):
|
||||
v(b'foobar')
|
||||
with open(p, "rb") as f:
|
||||
raw = f.read()
|
||||
content_type, output = v(raw)
|
||||
assert content_type == "Protobuf"
|
||||
assert output == [[('text', '1: 3bbc333c-e61c-433b-819a-0b9a8cc103b8')]]
|
||||
with pytest.raises(ValueError, matches="Failed to parse input."):
|
||||
v(b'foobar')
|
||||
|
||||
|
||||
def test_view_protobuf_availability():
|
||||
with mock.patch('subprocess.Popen') as n:
|
||||
m = mock.Mock()
|
||||
attrs = {'communicate.return_value': (b'libprotoc fake version', True)}
|
||||
m.configure_mock(**attrs)
|
||||
n.return_value = m
|
||||
assert protobuf.ViewProtobuf().is_available()
|
||||
@pytest.mark.parametrize("filename", ["protobuf02", "protobuf03"])
|
||||
def test_format_pbuf(filename):
|
||||
path = data.path(filename)
|
||||
with open(path, "rb") as f:
|
||||
input = f.read()
|
||||
with open(path + "-decoded") as f:
|
||||
expected = f.read()
|
||||
|
||||
m = mock.Mock()
|
||||
attrs = {'communicate.return_value': (b'command not found', True)}
|
||||
m.configure_mock(**attrs)
|
||||
n.return_value = m
|
||||
assert not protobuf.ViewProtobuf().is_available()
|
||||
|
||||
|
||||
def test_view_protobuf_fallback():
|
||||
with mock.patch('subprocess.Popen.communicate') as m:
|
||||
m.side_effect = OSError()
|
||||
v = full_eval(protobuf.ViewProtobuf())
|
||||
with pytest.raises(NotImplementedError, matches='protoc not found'):
|
||||
v(b'foobar')
|
||||
assert protobuf.format_pbuf(input) == expected
|
||||
|
|
Binary file not shown.
|
@ -0,0 +1,65 @@
|
|||
1 {
|
||||
1: tpbuf
|
||||
4 {
|
||||
1: Person
|
||||
2 {
|
||||
1: name
|
||||
3: 1
|
||||
4: 2
|
||||
5: 9
|
||||
}
|
||||
2 {
|
||||
1: id
|
||||
3: 2
|
||||
4: 2
|
||||
5: 5
|
||||
}
|
||||
2 {
|
||||
1 {
|
||||
12: 1818845549
|
||||
}
|
||||
3: 3
|
||||
4: 1
|
||||
5: 9
|
||||
}
|
||||
2 {
|
||||
1: phone
|
||||
3: 4
|
||||
4: 3
|
||||
5: 11
|
||||
6: .Person.PhoneNumber
|
||||
}
|
||||
3 {
|
||||
1: PhoneNumber
|
||||
2 {
|
||||
1: number
|
||||
3: 1
|
||||
4: 2
|
||||
5: 9
|
||||
}
|
||||
2 {
|
||||
1: type
|
||||
3: 2
|
||||
4: 1
|
||||
5: 14
|
||||
6: .Person.PhoneType
|
||||
7: HOME
|
||||
}
|
||||
}
|
||||
4 {
|
||||
1: PhoneType
|
||||
2 {
|
||||
1: MOBILE
|
||||
2: 0
|
||||
}
|
||||
2 {
|
||||
1: HOME
|
||||
2: 1
|
||||
}
|
||||
2 {
|
||||
1: WORK
|
||||
2: 2
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1 @@
|
|||
<18> <20>
|
|
@ -0,0 +1,4 @@
|
|||
2 {
|
||||
3: 3840
|
||||
4: 2160
|
||||
}
|
Loading…
Reference in New Issue