diff --git a/examples/har_extractor.py b/examples/har_extractor.py
new file mode 100644
index 000000000..531f32aaa
--- /dev/null
+++ b/examples/har_extractor.py
@@ -0,0 +1,212 @@
+"""
+    This inline script uses harparser.HAR from https://github.com/JustusW/harparser
+    to generate a HAR log object.
+"""
+try:
+    from harparser import HAR
+    from pytz import UTC
+except ImportError:
+    import sys
+    print >> sys.stderr, "\r\nMissing dependencies: please run `pip install mitmproxy[examples]`.\r\n"
+    raise
+
+from datetime import datetime
+
+
+class _HARLog(HAR.log):
+    # The attributes need to be registered here for them to actually be available later via self. This is
+    # due to HAREncodable linking __getattr__ to __getitem__. Anything that is set only in __init__ will
+    # just be added as a key/value pair to self.__classes__.
+    __page_list__ = []
+    __page_count__ = 0
+    __page_ref__ = {}
+
+    def __init__(self, page_list):
+        self.__page_list__ = page_list
+        self.__page_count__ = 0
+        self.__page_ref__ = {}
+
+        HAR.log.__init__(self, {"version": "1.2",
+                                "creator": {"name": "MITMPROXY HARExtractor",
+                                            "version": "0.1",
+                                            "comment": ""},
+                                "pages": [],
+                                "entries": []})
+
+    def reset(self):
+        self.__init__(self.__page_list__)
+
+    def add(self, obj):
+        if isinstance(obj, HAR.pages):
+            self['pages'].append(obj)
+        if isinstance(obj, HAR.entries):
+            self['entries'].append(obj)
+
+    def create_page_id(self):
+        self.__page_count__ += 1
+        return "autopage_%s" % str(self.__page_count__)
+
+    def set_page_ref(self, page, ref):
+        self.__page_ref__[page] = ref
+
+    def get_page_ref(self, page):
+        return self.__page_ref__.get(page, None)
+
+    def get_page_list(self):
+        return self.__page_list__
+
+
+def start(context, argv):
+    """
+    On start we create a HARLog instance. You will have to adapt this to suit your actual needs
+    of HAR generation, as it will probably be necessary to cluster logs by IP or to reset them
+    from time to time.
+    """
+    context.dump_file = None
+    if len(argv) > 1:
+        context.dump_file = argv[1]
+    else:
+        raise ValueError('Usage: -s "har_extractor.py filename" '
+                         '(- will output to stdout, filenames ending with .zhar will result in compressed har)')
+    context.HARLog = _HARLog(['https://github.com'])
+    context.seen_server = set()
+
+
+def response(context, flow):
+    """
+    Called when a server response has been received. At the time of this message both
+    a request and a response are present and completely done.
+    """
+    # Values are converted from float seconds to int milliseconds later.
+    ssl_time = -.001
+    connect_time = -.001
+    if flow.server_conn not in context.seen_server:
+        # Calculate the connect_time for this server_conn. Afterwards add it to the seen list, in
+        # order to avoid the connect_time being present in entries that use an existing connection.
+        connect_time = flow.server_conn.timestamp_tcp_setup - flow.server_conn.timestamp_start
+        context.seen_server.add(flow.server_conn)
+
+        if flow.server_conn.timestamp_ssl_setup is not None:
+            # Get the ssl_time for this server_conn as the difference between the start of the successful
+            # tcp setup and the successful ssl setup. If no ssl setup has been made it is left at -1 since
+            # it doesn't apply to this connection.
+            ssl_time = flow.server_conn.timestamp_ssl_setup - flow.server_conn.timestamp_tcp_setup
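+
+    # The -.001 sentinels above survive the millisecond conversion below as exactly -1
+    # (int(1000 * -.001) == -1), the value the HAR spec uses for timings that do not apply.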
+
+    # Calculate the raw timings from the different timestamps present in the request and response
+    # object. For lack of a way to measure it, DNS timings cannot be calculated. The same goes for
+    # HAR blocked: mitmproxy will open a server connection as soon as it receives the host and port
+    # from the client connection, so the time spent waiting is actually spent between
+    # request.timestamp_end and response.timestamp_start and thus correlates to HAR wait instead.
+    timings_raw = {'send': flow.request.timestamp_end - flow.request.timestamp_start,
+                   'wait': flow.response.timestamp_start - flow.request.timestamp_end,
+                   'receive': flow.response.timestamp_end - flow.response.timestamp_start,
+                   'connect': connect_time,
+                   'ssl': ssl_time}
+
+    # HAR timings are integers in ms, so we re-encode the raw timings to that format.
+    timings = dict([(key, int(1000 * value)) for key, value in timings_raw.iteritems()])
+
+    # full_time is the sum of all timings. Timings set to -1 are ignored as per spec.
+    full_time = 0
+    for item in timings.values():
+        if item > -1:
+            full_time += item
+
+    started_date_time = datetime.fromtimestamp(flow.request.timestamp_start, tz=UTC).isoformat()
+
+    request_query_string = [{"name": k, "value": v} for k, v in flow.request.get_query()]
+    request_http_version = ".".join([str(v) for v in flow.request.httpversion])
+    # Cookies are shaped as tuples by mitmproxy.
+    request_cookies = [{"name": k.strip(), "value": v[0]}
+                       for k, v in (flow.request.get_cookies() or {}).iteritems()]
+    request_headers = [{"name": k, "value": v} for k, v in flow.request.headers]
+    request_headers_size = len(str(flow.request.headers))
+    request_body_size = len(flow.request.content)
+
+    response_http_version = ".".join([str(v) for v in flow.response.httpversion])
+    # Cookies are shaped as tuples by mitmproxy.
+    response_cookies = [{"name": k.strip(), "value": v[0]}
+                        for k, v in (flow.response.get_cookies() or {}).iteritems()]
+    response_headers = [{"name": k, "value": v} for k, v in flow.response.headers]
+    response_headers_size = len(str(flow.response.headers))
+    response_body_size = len(flow.response.content)
+    response_body_decoded_size = len(flow.response.get_decoded_content())
+    response_body_compression = response_body_decoded_size - response_body_size
+    response_mime_type = flow.response.headers.get_first('Content-Type', '')
+    response_redirect_url = flow.response.headers.get_first('Location', '')
+
+    entry = HAR.entries({"startedDateTime": started_date_time,
+                         "time": full_time,
+                         "request": {"method": flow.request.method,
+                                     "url": flow.request.url,
+                                     "httpVersion": request_http_version,
+                                     "cookies": request_cookies,
+                                     "headers": request_headers,
+                                     "queryString": request_query_string,
+                                     "headersSize": request_headers_size,
+                                     "bodySize": request_body_size, },
+                         "response": {"status": flow.response.code,
+                                      "statusText": flow.response.msg,
+                                      "httpVersion": response_http_version,
+                                      "cookies": response_cookies,
+                                      "headers": response_headers,
+                                      "content": {"size": response_body_size,
+                                                  "compression": response_body_compression,
+                                                  "mimeType": response_mime_type},
+                                      "redirectURL": response_redirect_url,
+                                      "headersSize": response_headers_size,
+                                      "bodySize": response_body_size, },
+                         "cache": {},
+                         "timings": timings, })
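+
+    # Note that headersSize above is approximated as the length of the string-serialized header
+    # block; the HAR spec counts the raw bytes up to and including the final double CRLF, so
+    # these values should be treated as estimates.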
+
+    # If the current url is in the page list of context.HARLog or does not have a referrer,
+    # we add it as a new pages object.
+    if flow.request.url in context.HARLog.get_page_list() or flow.request.headers.get('Referer', None) is None:
+        page_id = context.HARLog.create_page_id()
+        context.HARLog.add(HAR.pages({"startedDateTime": entry['startedDateTime'],
+                                      "id": page_id,
+                                      "title": flow.request.url, }))
+        context.HARLog.set_page_ref(flow.request.url, page_id)
+        entry['pageref'] = page_id
+
+    # Look up the referrer in the page_ref of context.HARLog to point this entry's pageref
+    # attribute to the right pages object, then set it as a new reference to build a
+    # reference tree.
+    elif context.HARLog.get_page_ref(flow.request.headers.get('Referer', (None, ))[0]) is not None:
+        entry['pageref'] = context.HARLog.get_page_ref(flow.request.headers['Referer'][0])
+        context.HARLog.set_page_ref(flow.request.headers['Referer'][0], entry['pageref'])
+
+    context.HARLog.add(entry)
+
+
+def done(context):
+    """
+    Called once on script shutdown, after any other events.
+    """
+    from pprint import pprint
+    import json
+
+    json_dump = context.HARLog.json()
+    compressed_json_dump = context.HARLog.compress()
+
+    print "=" * 100
+    if context.dump_file == '-':
+        pprint(json.loads(json_dump))
+    elif context.dump_file.endswith('.zhar'):
+        file(context.dump_file, "w").write(compressed_json_dump)
+    else:
+        file(context.dump_file, "w").write(json_dump)
+    print "=" * 100
+    print "HAR log finished with %s bytes (%s bytes compressed)" % (len(json_dump), len(compressed_json_dump))
+    print "Compression rate is %s%%" % str(100. * len(compressed_json_dump) / len(json_dump))
+    print "=" * 100
+
+
+def print_attributes(obj, filter_string=None, hide_privates=False):
+    """
+    Useful helper method to quickly get all attributes of an object and their values.
+    """
+    for attr in dir(obj):
+        if hide_privates and "__" in attr:
+            continue
+        if filter_string is not None and filter_string not in attr:
+            continue
+        value = getattr(obj, attr)
+        print "%s.%s" % ('obj', attr), value, type(value)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index d84347b73..946e5ffe8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,3 @@
 -e git+https://github.com/mitmproxy/netlib.git#egg=netlib
 -e git+https://github.com/mitmproxy/pathod.git#egg=pathod
--e .[dev]
\ No newline at end of file
+-e .[dev,examples]
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 5674a0370..1d3c62051 100644
--- a/setup.py
+++ b/setup.py
@@ -80,6 +80,10 @@ setup(
             "pyamf>=0.6.1",
             "protobuf>=2.5.0",
             "cssutils>=1.0"
+        ],
+        'examples': [
+            "pytz",
+            "harparser",
         ]
     }
 )
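
Usage, for reference (a minimal sketch: the `-s` inline-script invocation matches the usage
message raised by the script itself; the output file names are purely illustrative):

    mitmdump -s "har_extractor.py -"          # pretty-print the HAR JSON to stdout
    mitmdump -s "har_extractor.py dump.har"   # write the HAR JSON to dump.har
    mitmdump -s "har_extractor.py dump.zhar"  # write the compressed HAR to dump.zhar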