2021-11-12 13:30:19 +00:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
"""
|
|
|
|
proxy.py
|
|
|
|
~~~~~~~~
|
|
|
|
⚡⚡⚡ Fast, Lightweight, Pluggable, TLS interception capable proxy server focused on
|
|
|
|
Network monitoring, controls & Application development, testing, debugging.
|
|
|
|
|
|
|
|
:copyright: (c) 2013-present by Abhinav Singh and contributors.
|
|
|
|
:license: BSD, see LICENSE for more details.
|
2021-11-16 23:34:29 +00:00
|
|
|
|
|
|
|
.. spelling::
|
|
|
|
|
|
|
|
http
|
|
|
|
url
|
2021-11-12 13:30:19 +00:00
|
|
|
"""
|
|
|
|
from typing import Optional, Tuple
|
|
|
|
|
2021-11-14 21:47:12 +00:00
|
|
|
from ..common.constants import COLON, SLASH
|
2021-11-12 13:30:19 +00:00
|
|
|
|
|
|
|
|
|
|
|
class Url:
    """``urllib.urlparse`` doesn't work for proxy.py, so we wrote a simple URL.

    Currently, URL only implements what is necessary for HttpParser to work.

    All components are kept as raw ``bytes`` (the port as ``int``) and every
    one of them is optional, because which components are present depends on
    the style of the URL that was parsed (see :meth:`from_bytes`).
    """

    def __init__(
        self,
        scheme: Optional[bytes] = None,
        hostname: Optional[bytes] = None,
        port: Optional[int] = None,
        remainder: Optional[bytes] = None,
    ) -> None:
        # ``b'http'`` or ``b'https'`` when parsed from an absolute URL.
        self.scheme: Optional[bytes] = scheme
        # Host part; multi-colon (IPv6 style) hosts are wrapped in ``[]``
        # by ``parse_host_and_port``.
        self.hostname: Optional[bytes] = hostname
        # Numeric port when one was present in the URL.
        self.port: Optional[int] = port
        # Everything from the first ``/`` after the host onwards
        # (path and query string), including the leading slash.
        self.remainder: Optional[bytes] = remainder

    @classmethod
    def from_bytes(cls, raw: bytes) -> 'Url':
        """Build a :class:`Url` from the raw bytes of a request target.

        A URL within proxy.py core can have several styles,
        because proxy.py supports both proxy and web server use cases.

        Example:
        For a Web server, url is like ``/`` or ``/get`` or ``/get?key=value``
        For a HTTPS connect tunnel, url is like ``httpbin.org:443``
        For a HTTP proxy request, url is like ``http://httpbin.org/get``

        Further:
        1) URL may contain unicode characters
        2) URL may contain IPv4 and IPv6 format addresses instead of domain names

        We use heuristics based approach for our URL parser.  All checks are
        performed directly on ``bytes``: the input is never decoded, so URLs
        carrying bytes that are not valid UTF-8 are parsed instead of raising
        ``UnicodeDecodeError``, and an empty input is treated as an empty
        ``host[:port]`` value instead of raising ``IndexError``.

        :param raw: Raw URL bytes.
        :returns: Parsed :class:`Url`; components absent from ``raw`` are
            ``None``.
        :raises ValueError: For a ``host:port`` value with exactly one colon
            whose port part is not an integer.
        """
        # Web server style: the whole value is the path (+ query string).
        if raw.startswith(SLASH):
            return cls(remainder=raw)

        # Proxy style: absolute http(s) URL.
        for scheme in (b'https', b'http'):
            prefix = scheme + b'://'
            if not raw.startswith(prefix):
                continue
            # Split only at the first slash so that slashes inside the
            # path or query string are preserved untouched.
            netloc, slash, path = raw[len(prefix):].partition(SLASH)
            host, port = cls.parse_host_and_port(netloc)
            return cls(
                scheme=scheme,
                hostname=host,
                port=port,
                # No slash after the host means there is no remainder at all.
                remainder=SLASH + path if slash else None,
            )

        # CONNECT tunnel style: bare ``host[:port]``.
        host, port = cls.parse_host_and_port(raw)
        return cls(hostname=host, port=port)

    @staticmethod
    def parse_host_and_port(raw: bytes) -> Tuple[bytes, Optional[int]]:
        """Split a ``host[:port]`` value into its host and port.

        :param raw: Host optionally followed by ``:port``.
        :returns: ``(host, port)``; ``port`` is ``None`` when absent.  Hosts
            that contain colons and are not already bracketed are returned
            wrapped in ``[`` and ``]``.
        :raises ValueError: When ``raw`` contains exactly one colon and the
            part after it is not an integer.
        """
        parts = raw.split(COLON)

        # No colon at all: plain host without a port.
        if len(parts) == 1:
            return raw, None

        # Exactly one colon: ``host:port``.  A non-numeric port is an error.
        if len(parts) == 2:
            return parts[0], int(parts[1])

        # More than one colon i.e. the IPv6 scenario.
        port: Optional[int] = None
        try:
            port = int(parts[-1])
            host = COLON.join(parts[:-1])
        except ValueError:
            # If unable to convert last part into port, treat the
            # entire data as host.
            host, port = raw, None

        # Patch up invalid ipv6 scenario: colon separated hosts that are
        # bracketed on neither side get wrapped in brackets.
        if (
            COLON in host
            and not host.startswith(b'[')
            and not host.endswith(b']')
        ):
            host = b'[' + host + b']'
        return host, port
|