proxy.py/proxy/http/url.py

143 lines
5.1 KiB
Python

# -*- coding: utf-8 -*-
"""
proxy.py
~~~~~~~~
⚡⚡⚡ Fast, Lightweight, Pluggable, TLS interception capable proxy server focused on
Network monitoring, controls & Application development, testing, debugging.
:copyright: (c) 2013-present by Abhinav Singh and contributors.
:license: BSD, see LICENSE for more details.

.. spelling::

   http
   url
"""
from typing import Optional, Tuple
from ..common.constants import COLON, SLASH, HTTP_URL_PREFIX, HTTPS_URL_PREFIX, AT
from ..common.utils import text_
class Url:
    """Minimal URL container and parser used by proxy.py.

    ``urllib.urlparse`` doesn't work for proxy.py, so we wrote a simple URL.
    Currently, URL only implements what is necessary for HttpParser to work.

    Textual components are stored as ``bytes`` and ``port`` as ``int``.
    Components that are not supplied default to ``None``.
    """

    def __init__(
        self,
        scheme: Optional[bytes] = None,
        username: Optional[bytes] = None,
        password: Optional[bytes] = None,
        hostname: Optional[bytes] = None,
        port: Optional[int] = None,
        remainder: Optional[bytes] = None,
    ) -> None:
        """Store the given URL components verbatim; nothing is validated."""
        self.scheme: Optional[bytes] = scheme
        self.username: Optional[bytes] = username
        self.password: Optional[bytes] = password
        self.hostname: Optional[bytes] = hostname
        self.port: Optional[int] = port
        # Everything after the authority, including its leading slash.
        self.remainder: Optional[bytes] = remainder

    @property
    def has_credentials(self) -> bool:
        """Returns true if both username and password components are present."""
        return self.username is not None and self.password is not None

    def __str__(self) -> str:
        """Render scheme, hostname, port and remainder as text.

        Username and password are never part of the output.  Falsy
        components (``None``, empty bytes, port ``0``) are skipped.
        """
        pieces = []
        if self.scheme:
            pieces.append('{0}://'.format(text_(self.scheme)))
        if self.hostname:
            pieces.append(text_(self.hostname))
        if self.port:
            pieces.append(':{0}'.format(self.port))
        if self.remainder:
            pieces.append(text_(self.remainder))
        return ''.join(pieces)

    @classmethod
    def from_bytes(cls, raw: bytes) -> 'Url':
        """Build a :class:`Url` from the raw request target.

        A URL within proxy.py core can have several styles,
        because proxy.py supports both proxy and web server use cases.

        Example:
        For a Web server, url is like ``/`` or ``/get`` or ``/get?key=value``
        For a HTTPS connect tunnel, url is like ``httpbin.org:443``
        For a HTTP proxy request, url is like ``http://httpbin.org/get``

        Further:
        1) URL may contain unicode characters
        2) URL may contain IPv4 and IPv6 format addresses instead of domain names

        We use heuristics based approach for our URL parser.

        An empty ``raw`` no longer raises ``IndexError``; it is handled by
        the host-only branch and yields a URL whose hostname is ``b''``.

        Raises:
            ValueError: if a ``host:port`` authority carries a port that
                is not an integer.
        """
        # ``startswith`` instead of ``raw[0]`` so empty input is safe.
        is_single_slash = raw.startswith(SLASH)
        is_double_slash = raw.startswith(SLASH + SLASH)
        if is_single_slash and not is_double_slash:
            # Web server style path: the whole input is the remainder.
            return cls(remainder=raw)
        is_http = raw.startswith(HTTP_URL_PREFIX)
        is_https = raw.startswith(HTTPS_URL_PREFIX)
        if not (is_http or is_https or is_double_slash):
            # CONNECT style ``host:port`` (no scheme, no path).
            username, password, host, port = cls._parse(raw)
            return cls(
                username=username, password=password,
                hostname=host, port=port,
            )
        if is_https:
            rest = raw[len(b'https://'):]
        elif is_http:
            rest = raw[len(b'http://'):]
        else:
            rest = raw[len(SLASH + SLASH):]
        # Authority is everything up to the first slash; the path keeps it.
        authority, slash, path = rest.partition(SLASH)
        username, password, host, port = cls._parse(authority)
        return cls(
            # Scheme-relative ``//host`` inputs are reported as http.
            scheme=b'https' if is_https else b'http',
            username=username,
            password=password,
            hostname=host,
            port=port,
            remainder=SLASH + path if slash else None,
        )

    @staticmethod
    def _parse(raw: bytes) -> Tuple[
            Optional[bytes],
            Optional[bytes],
            bytes,
            Optional[int],
    ]:
        """Split an authority ``[user[:password]@]host[:port]`` into parts.

        Returns a ``(username, password, host, port)`` tuple.  ``username``
        and ``password`` are ``None`` when no ``@`` is present; ``password``
        is also ``None`` when the userinfo has no colon.  A host that holds
        a colon but is not bracketed is wrapped in ``[...]``.

        Raises:
            ValueError: if the input has exactly one colon after the
                userinfo and the text following it is not an integer.
        """
        split_at = raw.split(AT, 1)
        username: Optional[bytes] = None
        password: Optional[bytes] = None
        if len(split_at) == 2:
            # ``partition`` tolerates ``user@`` (no password) and
            # passwords that themselves contain a colon, where a plain
            # ``split`` + unpack used to raise ValueError.
            username, colon, secret = split_at[0].partition(COLON)
            if colon:
                password = secret
        host_section = split_at[-1]
        parts = host_section.split(COLON, 2)
        num_parts = len(parts)
        # No port found
        if num_parts == 1:
            return username, password, parts[0], None
        # Host and port found
        if num_parts == 2:
            return username, password, parts[0], int(parts[1])
        # More than a single COLON i.e. IPv6 scenario
        port: Optional[int] = None
        try:
            # Try to resolve last part as an int port
            last_token = parts[-1].split(COLON)
            port = int(last_token[-1])
            host = COLON.join(parts[:-1]) + COLON + \
                COLON.join(last_token[:-1])
        except ValueError:
            # If unable to convert last part into port, treat the whole
            # host section as host.  The userinfo must not leak into it,
            # hence ``host_section`` rather than ``raw``.
            host, port = host_section, None
        # patch up invalid ipv6 scenario; checked on bytes so that a host
        # which is not valid UTF-8 cannot raise UnicodeDecodeError here.
        if COLON in host and \
                not host.startswith(b'[') and \
                not host.endswith(b']'):
            host = b'[' + host + b']'
        return username, password, host, port