2021-11-12 13:30:19 +00:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
"""
|
|
|
|
proxy.py
|
|
|
|
~~~~~~~~
|
|
|
|
⚡⚡⚡ Fast, Lightweight, Pluggable, TLS interception capable proxy server focused on
|
|
|
|
Network monitoring, controls & Application development, testing, debugging.
|
|
|
|
|
|
|
|
:copyright: (c) 2013-present by Abhinav Singh and contributors.
|
|
|
|
:license: BSD, see LICENSE for more details.
|
2021-11-16 23:34:29 +00:00
|
|
|
|
|
|
|
.. spelling::
|
|
|
|
|
|
|
|
http
|
|
|
|
url
|
2021-11-12 13:30:19 +00:00
|
|
|
"""
|
|
|
|
from typing import Optional, Tuple
|
|
|
|
|
2021-12-21 20:18:15 +00:00
|
|
|
from ..common.constants import COLON, SLASH, HTTP_URL_PREFIX, HTTPS_URL_PREFIX, AT
|
2021-11-19 18:35:24 +00:00
|
|
|
from ..common.utils import text_
|
2021-11-12 13:30:19 +00:00
|
|
|
|
|
|
|
|
|
|
|
class Url:
    """``urllib.urlparse`` doesn't work for proxy.py, so we wrote a simple URL.

    Currently, URL only implements what is necessary for HttpParser to work.

    All textual components are kept as ``bytes``; ``port`` is an ``int``.
    Any component that is absent from the parsed input is ``None``.
    """

    def __init__(
            self,
            scheme: Optional[bytes] = None,
            username: Optional[bytes] = None,
            password: Optional[bytes] = None,
            hostname: Optional[bytes] = None,
            port: Optional[int] = None,
            remainder: Optional[bytes] = None,
    ) -> None:
        """Store the individual URL components as given, without validation.

        ``remainder`` holds everything after the authority, i.e. the path
        together with any query string, including its leading slash.
        """
        self.scheme: Optional[bytes] = scheme
        self.username: Optional[bytes] = username
        self.password: Optional[bytes] = password
        self.hostname: Optional[bytes] = hostname
        self.port: Optional[int] = port
        self.remainder: Optional[bytes] = remainder

    @property
    def has_credentials(self) -> bool:
        """Returns true if both username and password components are present."""
        return self.username is not None and self.password is not None

    def __str__(self) -> str:
        """Rebuild a textual URL from scheme, hostname, port and remainder.

        Username and password are not included in the output, and falsy
        components (``None``, empty bytes, port ``0``) are skipped.
        """
        url = ''
        if self.scheme:
            url += '{0}://'.format(text_(self.scheme))
        if self.hostname:
            url += text_(self.hostname)
        if self.port:
            url += ':{0}'.format(self.port)
        if self.remainder:
            url += text_(self.remainder)
        return url

    @classmethod
    def from_bytes(cls, raw: bytes) -> 'Url':
        """A URL within proxy.py core can have several styles,
        because proxy.py supports both proxy and web server use cases.

        Example:
        For a Web server, url is like ``/`` or ``/get`` or ``/get?key=value``
        For a HTTPS connect tunnel, url is like ``httpbin.org:443``
        For a HTTP proxy request, url is like ``http://httpbin.org/get``

        Further:
        1) URL may contain unicode characters
        2) URL may contain IPv4 and IPv6 format addresses instead of domain names

        We use heuristics based approach for our URL parser.

        Empty input does not raise; it is parsed like a bare host and yields
        a ``Url`` whose hostname is ``b''``.
        """
        # ``startswith`` rather than ``raw[0]`` so that empty input
        # cannot raise ``IndexError``.
        is_single_slash = raw.startswith(SLASH)
        is_double_slash = raw.startswith(SLASH + SLASH)
        # Origin-form request target (web server use case): path only.
        if is_single_slash and not is_double_slash:
            return cls(remainder=raw)
        is_http = raw.startswith(HTTP_URL_PREFIX)
        is_https = raw.startswith(HTTPS_URL_PREFIX)
        if is_http or is_https or is_double_slash:
            # Strip exactly the prefix that was matched above.
            if is_https:
                rest = raw[len(HTTPS_URL_PREFIX):]
            elif is_http:
                rest = raw[len(HTTP_URL_PREFIX):]
            else:
                rest = raw[len(SLASH + SLASH):]
            # Authority is everything up to the first slash,
            # the rest (if any) is the path and query.
            parts = rest.split(SLASH, 1)
            username, password, host, port = cls._parse(parts[0])
            return cls(
                # ``//host/path`` style input is treated as plain http.
                scheme=b'https' if is_https else b'http',
                username=username,
                password=password,
                hostname=host,
                port=port,
                remainder=None if len(parts) == 1 else (
                    SLASH + parts[1]
                ),
            )
        # No scheme and no leading slash, e.g. ``httpbin.org:443``.
        username, password, host, port = cls._parse(raw)
        return cls(username=username, password=password, hostname=host, port=port)

    @staticmethod
    def _parse(raw: bytes) -> Tuple[
            Optional[bytes],
            Optional[bytes],
            bytes,
            Optional[int],
    ]:
        """Split ``[username[:password]@]host[:port]`` into its components.

        Returns a ``(username, password, host, port)`` tuple.  Username and
        password are ``None`` when ``raw`` has no ``@``; password is also
        ``None`` when the credentials contain no colon.  A host containing
        colons that is not already bracketed is returned wrapped in
        ``[`` and ``]``.

        Raises ``ValueError`` when the input has exactly one colon after the
        credentials and the text following it is not an integer port.
        """
        split_at = raw.split(AT, 1)
        username: Optional[bytes] = None
        password: Optional[bytes] = None
        if len(split_at) == 2:
            # Split on the first colon only: a password may itself contain
            # colons, and credentials without a colon carry no password.
            credentials = split_at[0].split(COLON, 1)
            username = credentials[0]
            if len(credentials) == 2:
                password = credentials[1]
        # Host and optional port, with any credentials already removed.
        authority = split_at[-1]
        parts = authority.split(COLON, 2)
        num_parts = len(parts)
        port: Optional[int] = None
        # No port found
        if num_parts == 1:
            return username, password, parts[0], None
        # Host and port found
        if num_parts == 2:
            return username, password, parts[0], int(parts[1])
        # More than a single COLON i.e. IPv6 scenario
        try:
            # Try to resolve last part as an int port
            last_token = parts[-1].split(COLON)
            port = int(last_token[-1])
            host = COLON.join(parts[:-1]) + COLON + \
                COLON.join(last_token[:-1])
        except ValueError:
            # If unable to convert last part into port,
            # treat entire data (minus any credentials) as host
            host, port = authority, None
        # patch up invalid ipv6 scenario, comparing bytes directly so that
        # non UTF-8 input cannot raise while decoding
        if COLON in host and \
                not host.startswith(b'[') and \
                not host.endswith(b']'):
            host = b'[' + host + b']'
        return username, password, host, port
|