# -*- coding: utf-8 -*-
"""
    proxy.py
    ~~~~~~~~
    ⚡⚡⚡ Fast, Lightweight, Pluggable, TLS interception capable proxy server focused on
    Network monitoring, controls & Application development, testing, debugging.

    :copyright: (c) 2013-present by Abhinav Singh and contributors.
    :license: BSD, see LICENSE for more details.
"""
from urllib import parse as urlparse
from typing import TypeVar, NamedTuple, Optional, Dict, Type, Tuple, List

from .methods import httpMethods
from .chunk_parser import ChunkParser, chunkParserStates

from ..common.constants import DEFAULT_DISABLE_HEADERS, COLON, SLASH, CRLF, WHITESPACE, HTTP_1_1, DEFAULT_HTTP_PORT
from ..common.utils import build_http_request, build_http_response, find_http_line, text_


# Parser life-cycle states, in the order a message moves through them.
HttpParserStates = NamedTuple(
    'HttpParserStates', [
        ('INITIALIZED', int),
        ('LINE_RCVD', int),
        ('RCVING_HEADERS', int),
        ('HEADERS_COMPLETE', int),
        ('RCVING_BODY', int),
        ('COMPLETE', int),
    ],
)
httpParserStates = HttpParserStates(1, 2, 3, 4, 5, 6)

# Kind of message a parser instance handles.
HttpParserTypes = NamedTuple(
    'HttpParserTypes', [
        ('REQUEST_PARSER', int),
        ('RESPONSE_PARSER', int),
    ],
)
httpParserTypes = HttpParserTypes(1, 2)


T = TypeVar('T', bound='HttpParser')


class HttpParser:
    """HTTP request/response parser.

    Feed raw bytes through :meth:`parse` (possibly over several calls) and
    inspect :attr:`state` afterwards; the message is fully parsed once the
    state equals ``httpParserStates.COMPLETE``.
    """

    def __init__(self, parser_type: int) -> None:
        """Create a parser of the given ``httpParserTypes`` kind."""
        self.type: int = parser_type
        self.state: int = httpParserStates.INITIALIZED

        # Total size of raw bytes passed for parsing
        self.total_size: int = 0

        # Buffer to hold unprocessed bytes
        self.buffer: bytes = b''

        # Maps lower-cased header name -> (name as received, value).
        self.headers: Dict[bytes, Tuple[bytes, bytes]] = {}
        self.body: Optional[bytes] = None

        self.method: Optional[bytes] = None
        self.url: Optional[urlparse.SplitResultBytes] = None
        self.code: Optional[bytes] = None
        self.reason: Optional[bytes] = None
        self.version: Optional[bytes] = None

        self.chunk_parser: Optional[ChunkParser] = None

        # This cleans up developer APIs as Python urlparse.urlsplit behaves differently
        # for incoming proxy request and incoming web request.  Web request is the one
        # which is broken.
        self.host: Optional[bytes] = None
        self.port: Optional[int] = None
        self.path: Optional[bytes] = None

    @classmethod
    def request(cls: Type[T], raw: bytes) -> T:
        """Build a request parser and feed it ``raw``."""
        parser = cls(httpParserTypes.REQUEST_PARSER)
        parser.parse(raw)
        return parser

    @classmethod
    def response(cls: Type[T], raw: bytes) -> T:
        """Build a response parser and feed it ``raw``."""
        parser = cls(httpParserTypes.RESPONSE_PARSER)
        parser.parse(raw)
        return parser

    def header(self, key: bytes) -> bytes:
        """Return the value of header ``key`` (case-insensitive lookup).

        Raises:
            KeyError: if the header is not present.
        """
        lowered = key.lower()
        if lowered not in self.headers:
            # Format the message explicitly; passing the arguments separately
            # to KeyError would leave the '%s' placeholder unformatted.
            raise KeyError('%s not found in headers' % text_(key))
        return self.headers[lowered][1]

    def has_header(self, key: bytes) -> bool:
        """Return True when header ``key`` is present (case-insensitive)."""
        return key.lower() in self.headers

    def add_header(self, key: bytes, value: bytes) -> None:
        """Add or replace a header, remembering the name as it was given."""
        self.headers[key.lower()] = (key, value)

    def add_headers(self, headers: List[Tuple[bytes, bytes]]) -> None:
        """Add or replace every ``(name, value)`` pair in ``headers``."""
        for (key, value) in headers:
            self.add_header(key, value)

    def del_header(self, header: bytes) -> None:
        """Remove ``header`` if present (case-insensitive); otherwise no-op."""
        self.headers.pop(header.lower(), None)

    def del_headers(self, headers: List[bytes]) -> None:
        """Remove every header named in ``headers``; missing ones are ignored."""
        for key in headers:
            # del_header() already normalizes the case of the name.
            self.del_header(key)

    def set_url(self, url: bytes) -> None:
        """Parse ``url`` into :attr:`url` and refresh host, port and path."""
        # Work around with urlsplit semantics.
        #
        # For CONNECT requests, request line contains
        # upstream_host:upstream_port which is not compliant
        # with urlsplit, which expects a fully qualified url.
        if self.method == httpMethods.CONNECT:
            url = b'https://' + url
        self.url = urlparse.urlsplit(url)
        self.set_line_attributes()

    def set_line_attributes(self) -> None:
        """Derive :attr:`host`, :attr:`port` and :attr:`path` from :attr:`url`.

        Only request parsers are affected; for response parsers this is a no-op.

        Raises:
            KeyError: if this is a request parser and no url has been set.
        """
        if self.type != httpParserTypes.REQUEST_PARSER:
            return
        if not self.url:
            raise KeyError(
                'Invalid request. Method: %r, Url: %r' %
                (self.method, self.url),
            )
        self.host = self.url.hostname
        if self.method == httpMethods.CONNECT:
            # CONNECT falls back to 443 only when no port was given at all.
            self.port = 443 if self.url.port is None else self.url.port
        else:
            self.port = self.url.port if self.url.port else DEFAULT_HTTP_PORT
        self.path = self.build_path()

    def is_chunked_encoded(self) -> bool:
        """Return True when the ``Transfer-Encoding`` header is exactly ``chunked``."""
        return b'transfer-encoding' in self.headers and \
            self.headers[b'transfer-encoding'][1].lower() == b'chunked'

    def body_expected(self) -> bool:
        """Return True for a positive ``Content-Length`` or chunked encoding."""
        return (
            b'content-length' in self.headers and
            int(self.header(b'content-length')) > 0
        ) or self.is_chunked_encoded()

    def parse(self, raw: bytes) -> None:
        """Parses Http request out of raw bytes.

        Check HttpParser state after parse has successfully returned.

        Bytes that could not be processed yet are kept in :attr:`buffer` and
        are prepended to ``raw`` on the next call.

        Raises:
            ValueError: if the ``Content-Length`` header is not a
                non-negative integer.
            NotImplementedError: if payload bytes follow the headers but
                neither ``Content-Length`` nor chunked encoding is declared.
        """
        self.total_size += len(raw)
        raw = self.buffer + raw
        self.buffer = b''

        more = len(raw) > 0
        while more and self.state != httpParserStates.COMPLETE:
            if self.state in (
                    httpParserStates.HEADERS_COMPLETE,
                    httpParserStates.RCVING_BODY,
            ):
                if b'content-length' in self.headers:
                    self.state = httpParserStates.RCVING_BODY
                    if self.body is None:
                        self.body = b''
                    total_size = int(self.header(b'content-length'))
                    if total_size < 0:
                        # A negative length can never be satisfied and would
                        # keep this loop spinning on the same bytes.
                        raise ValueError(
                            'Invalid content-length header value %d' % total_size,
                        )
                    remaining = max(total_size - len(self.body), 0)
                    self.body += raw[:remaining]
                    raw = raw[remaining:]
                    # Compare lengths only: an empty body is complete when
                    # Content-Length is 0. Testing the body for truthiness
                    # here would never finish such a message and loop forever
                    # if more bytes follow it.
                    if len(self.body) >= total_size:
                        self.state = httpParserStates.COMPLETE
                    more = len(raw) > 0
                elif self.is_chunked_encoded():
                    if not self.chunk_parser:
                        self.chunk_parser = ChunkParser()
                    raw = self.chunk_parser.parse(raw)
                    if self.chunk_parser.state == chunkParserStates.COMPLETE:
                        self.body = self.chunk_parser.body
                        self.state = httpParserStates.COMPLETE
                    # Stop after one pass over the chunk parser, complete or not.
                    more = False
                else:
                    raise NotImplementedError(
                        'Parser shouldn\'t have reached here. ' +
                        'This can happen when content length header is missing but their is a body in the payload',
                    )
            else:
                more, raw = self.process(raw)
        self.buffer = raw

    def process(self, raw: bytes) -> Tuple[bool, bytes]:
        """Consume one line from ``raw`` and advance the parser state.

        Returns:
            ``(more, remaining)``. ``more`` is False when no CRLF could be
            found in received bytes, or when nothing is left after the
            consumed line.
        """
        line, raw = find_http_line(raw)
        if line is None:
            return False, raw

        if self.state == httpParserStates.INITIALIZED:
            self.process_line(line)
            self.state = httpParserStates.LINE_RCVD
        elif self.state in (httpParserStates.LINE_RCVD, httpParserStates.RCVING_HEADERS):
            if self.state == httpParserStates.LINE_RCVD:
                # LINE_RCVD state is equivalent to RCVING_HEADERS
                self.state = httpParserStates.RCVING_HEADERS
            if line.strip() == b'':  # Blank line received.
                self.state = httpParserStates.HEADERS_COMPLETE
            else:
                self.process_header(line)

        # When server sends a response line without any header or body e.g.
        # HTTP/1.1 200 Connection established\r\n\r\n
        if self.state == httpParserStates.LINE_RCVD and \
                self.type == httpParserTypes.RESPONSE_PARSER and \
                raw == CRLF:
            self.state = httpParserStates.COMPLETE
        elif self.state == httpParserStates.HEADERS_COMPLETE and \
                not self.body_expected() and \
                raw == b'':
            self.state = httpParserStates.COMPLETE

        return len(raw) > 0, raw

    def process_line(self, raw: bytes) -> None:
        """Parse the request line or the status line.

        A line with too few whitespace-separated parts raises ``IndexError``.
        """
        line = raw.split(WHITESPACE)
        if self.type == httpParserTypes.REQUEST_PARSER:
            self.method = line[0].upper()
            self.set_url(line[1])
            self.version = line[2]
        else:
            self.version = line[0]
            self.code = line[1]
            # The reason phrase may itself contain whitespace.
            self.reason = WHITESPACE.join(line[2:])

    def process_header(self, raw: bytes) -> None:
        """Split a ``name: value`` line at the first colon and store it."""
        key, _, value = raw.partition(COLON)
        self.add_headers([(key.strip(), value.strip())])

    def build_path(self) -> bytes:
        """Return path, query and fragment of :attr:`url` as one byte string.

        An empty path becomes ``/``. Returns ``b'/None'`` when no url is set.
        """
        if not self.url:
            return b'/None'
        url = self.url.path
        if url == b'':
            url = b'/'
        if self.url.query != b'':
            url += b'?' + self.url.query
        if self.url.fragment != b'':
            url += b'#' + self.url.fragment
        return url

    def build(self, disable_headers: Optional[List[bytes]] = None, for_proxy: bool = False) -> bytes:
        """Rebuild the request object.

        Args:
            disable_headers: lower-cased header names to leave out; defaults
                to ``DEFAULT_DISABLE_HEADERS``.
            for_proxy: when True, the request target is the absolute URL
                (or ``host:port`` for CONNECT) instead of the path alone.
        """
        assert self.method and self.version and self.path and self.type == httpParserTypes.REQUEST_PARSER
        if disable_headers is None:
            disable_headers = DEFAULT_DISABLE_HEADERS
        # Re-apply chunked framing only when there is a body to frame.
        body: Optional[bytes] = ChunkParser.to_chunks(self.body) \
            if self.is_chunked_encoded() and self.body else \
            self.body
        path = self.path
        if for_proxy:
            assert self.url and self.host and self.port and self.path
            path = (
                self.url.scheme +
                COLON + SLASH + SLASH +
                self.host +
                COLON +
                str(self.port).encode() +
                self.path
            ) if self.method != httpMethods.CONNECT else (self.host + COLON + str(self.port).encode())
        return build_http_request(
            self.method, path, self.version,
            headers={} if not self.headers else {
                self.headers[k][0]: self.headers[k][1] for k in self.headers if
                k.lower() not in disable_headers
            },
            body=body,
        )

    def build_response(self) -> bytes:
        """Rebuild the response object.

        A response without a body (e.g. ``200 Connection established``) is
        rebuilt with whatever :attr:`body` holds, mirroring :meth:`build`.
        """
        assert self.code and self.version and self.type == httpParserTypes.RESPONSE_PARSER
        # Re-apply chunked framing only when there is a body to frame.
        body: Optional[bytes] = ChunkParser.to_chunks(self.body) \
            if self.is_chunked_encoded() and self.body else \
            self.body
        return build_http_response(
            status_code=int(self.code),
            protocol_version=self.version,
            reason=self.reason,
            headers={} if not self.headers else {
                self.headers[k][0]: self.headers[k][1] for k in self.headers
            },
            body=body,
        )

    def has_host(self) -> bool:
        """Host field SHOULD be None for incoming local WebServer requests."""
        return self.host is not None

    def is_http_1_1_keep_alive(self) -> bool:
        """Return True for HTTP/1.1 with no ``Connection`` header or ``keep-alive``."""
        return self.version == HTTP_1_1 and \
            (
                not self.has_header(b'Connection') or
                self.header(b'Connection').lower() == b'keep-alive'
            )

    def is_connection_upgrade(self) -> bool:
        """Return True for HTTP/1.1 with both ``Connection`` and ``Upgrade`` headers."""
        return self.version == HTTP_1_1 and \
            self.has_header(b'Connection') and \
            self.has_header(b'Upgrade')