diff --git a/.github/workflows/test-library.yml b/.github/workflows/test-library.yml index 6cb0fd37..3c15baaf 100644 --- a/.github/workflows/test-library.yml +++ b/.github/workflows/test-library.yml @@ -610,6 +610,7 @@ jobs: --enable-web-server --enable-reverse-proxy --plugin proxy.plugin.ReverseProxyPlugin + --rewrite-host-header && ./tests/integration/test_integration.sh 8899 diff --git a/README.md b/README.md index c8897bd9..68061779 100644 --- a/README.md +++ b/README.md @@ -1075,7 +1075,7 @@ following `Nginx` config: ```console location /get { - proxy_pass http://httpbin.org/get + proxy_pass http://httpbin.org/get; } ``` @@ -1094,6 +1094,36 @@ Verify using `curl -v localhost:8899/get`: } ``` +#### Rewrite Host Header + +With above example, you may sometimes see: + +```console +> +* Empty reply from server +* Closing connection +curl: (52) Empty reply from server +``` + +This is happenening because our default reverse proxy plugin `ReverseProxyPlugin` is configured +with a `http` and a `https` upstream server. And, by default `ReverseProxyPlugin` preserves the +original host header. While this works with `https` upstreams, this doesn't work reliably with +`http` upstreams. To work around this problem use the `--rewrite-host-header` flags. + +Example: + + +```console +❯ proxy --enable-reverse-proxy \ + --plugins proxy.plugin.ReverseProxyPlugin \ + --rewrite-host-header +``` + +This will ensure that `Host` header field is set as `httpbin.org` and works with both `http` and +`https` upstreams. + +> NOTE: Whether to use `--rewrite-host-header` or not depends upon your use-case. + ## Plugin Ordering When using multiple plugins, depending upon plugin functionality, @@ -2613,7 +2643,7 @@ usage: -m [-h] [--tunnel-hostname TUNNEL_HOSTNAME] [--tunnel-port TUNNEL_PORT] [--proxy-pool PROXY_POOL] [--enable-web-server] [--enable-static-server] [--static-server-dir STATIC_SERVER_DIR] [--min-compression-length MIN_COMPRESSION_LENGTH] - [--enable-reverse-proxy] [--enable-metrics] + [--enable-reverse-proxy] [--rewrite-host-header] [--enable-metrics] [--metrics-path METRICS_PATH] [--pac-file PAC_FILE] [--pac-file-url-path PAC_FILE_URL_PATH] [--cloudflare-dns-mode CLOUDFLARE_DNS_MODE] @@ -2622,7 +2652,7 @@ usage: -m [-h] [--tunnel-hostname TUNNEL_HOSTNAME] [--tunnel-port TUNNEL_PORT] [--filtered-client-ips FILTERED_CLIENT_IPS] [--filtered-url-regex-config FILTERED_URL_REGEX_CONFIG] -proxy.py v2.4.6.dev25+g2754b928.d20240812 +proxy.py v2.4.8.dev8+gc703edac.d20241013 options: -h, --help show this help message and exit @@ -2791,6 +2821,9 @@ options: response that will be compressed (gzipped). --enable-reverse-proxy Default: False. Whether to enable reverse proxy core. + --rewrite-host-header + Default: False. If used, reverse proxy server will + rewrite Host header field before sending to upstream. --enable-metrics Default: False. Enables metrics. --metrics-path METRICS_PATH Default: /metrics. Web server path to serve proxy.py diff --git a/proxy/common/constants.py b/proxy/common/constants.py index f1385390..5d042dbb 100644 --- a/proxy/common/constants.py +++ b/proxy/common/constants.py @@ -165,6 +165,7 @@ DEFAULT_SSL_CONTEXT_OPTIONS = ( if sys.version_info >= (3, 10) else (ssl.OP_NO_SSLv2 | ssl.OP_NO_SSLv3 | ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1) ) +DEFAULT_ENABLE_REWRITE_HOST = False DEFAULT_DEVTOOLS_DOC_URL = 'http://proxy' DEFAULT_DEVTOOLS_FRAME_ID = secrets.token_hex(8) diff --git a/proxy/common/utils.py b/proxy/common/utils.py index 84f3484a..74bad920 100644 --- a/proxy/common/utils.py +++ b/proxy/common/utils.py @@ -25,8 +25,6 @@ import contextlib from types import TracebackType from typing import Any, Dict, List, Type, Tuple, Callable, Optional -import _ssl # noqa: WPS436 - from .types import HostPort from .constants import ( CRLF, COLON, HTTP_1_1, IS_WINDOWS, WHITESPACE, DEFAULT_TIMEOUT, @@ -42,6 +40,9 @@ logger = logging.getLogger(__name__) def cert_der_to_dict(der: Optional[bytes]) -> Dict[str, Any]: """Parse a DER formatted certificate to a python dict""" + # pylint: disable=import-outside-toplevel + import _ssl # noqa: WPS436 + if not der: return {} with tempfile.NamedTemporaryFile(delete=False) as cert_file: @@ -322,6 +323,7 @@ def set_open_file_limit(soft_limit: int) -> None: if IS_WINDOWS: # pragma: no cover return + # pylint: disable=possibly-used-before-assignment curr_soft_limit, curr_hard_limit = resource.getrlimit( resource.RLIMIT_NOFILE, ) diff --git a/proxy/core/connection/connection.py b/proxy/core/connection/connection.py index 63cb62e3..2200f93e 100644 --- a/proxy/core/connection/connection.py +++ b/proxy/core/connection/connection.py @@ -49,7 +49,6 @@ class TcpConnection(ABC): def send(self, data: Union[memoryview, bytes]) -> int: """Users must handle BrokenPipeError exceptions""" - # logger.info(data.tobytes()) return self.connection.send(data) def recv( @@ -67,7 +66,7 @@ class TcpConnection(ABC): return memoryview(data) def close(self) -> bool: - if not self.closed: + if not self.closed and self.connection: self.connection.close() self.closed = True return self.closed @@ -97,8 +96,9 @@ class TcpConnection(ABC): self._num_buffer -= 1 else: self.buffer[0] = mv[sent:] - del mv logger.debug('flushed %d bytes to %s' % (sent, self.tag)) + # logger.info(mv[:sent].tobytes()) + del mv return sent def is_reusable(self) -> bool: diff --git a/proxy/http/parser/parser.py b/proxy/http/parser/parser.py index c16f74e7..6e1e8859 100644 --- a/proxy/http/parser/parser.py +++ b/proxy/http/parser/parser.py @@ -283,7 +283,12 @@ class HttpParser: self.state = httpParserStates.COMPLETE self.buffer = None if raw == b'' else raw - def build(self, disable_headers: Optional[List[bytes]] = None, for_proxy: bool = False) -> bytes: + def build( + self, + disable_headers: Optional[List[bytes]] = None, + for_proxy: bool = False, + host: Optional[bytes] = None, + ) -> bytes: """Rebuild the request object.""" assert self.method and self.version and self.type == httpParserTypes.REQUEST_PARSER if disable_headers is None: @@ -301,11 +306,22 @@ class HttpParser: path ) if not self._is_https_tunnel else (self.host + COLON + str(self.port).encode()) return build_http_request( - self.method, path, self.version, - headers={} if not self.headers else { - self.headers[k][0]: self.headers[k][1] for k in self.headers if - k.lower() not in disable_headers - }, + self.method, + path, + self.version, + headers=( + {} + if not self.headers + else { + self.headers[k][0]: ( + self.headers[k][1] + if host is None or self.headers[k][0].lower() != b'host' + else host + ) + for k in self.headers + if k.lower() not in disable_headers + } + ), body=body, no_ua=True, ) diff --git a/proxy/http/server/reverse.py b/proxy/http/server/reverse.py index 303b627f..d1d5e631 100644 --- a/proxy/http/server/reverse.py +++ b/proxy/http/server/reverse.py @@ -17,10 +17,10 @@ from proxy.http import Url from proxy.core.base import TcpUpstreamConnectionHandler from proxy.http.parser import HttpParser from proxy.http.server import HttpWebServerBasePlugin -from proxy.common.utils import text_ +from proxy.common.utils import text_, bytes_ from proxy.http.exception import HttpProtocolException from proxy.common.constants import ( - HTTPS_PROTO, DEFAULT_HTTP_PORT, DEFAULT_HTTPS_PORT, + COLON, HTTP_PROTO, HTTPS_PROTO, DEFAULT_HTTP_PORT, DEFAULT_HTTPS_PORT, DEFAULT_REVERSE_PROXY_ACCESS_LOG_FORMAT, ) from ...common.types import Readables, Writables, Descriptors @@ -111,8 +111,8 @@ class ReverseProxy(TcpUpstreamConnectionHandler, HttpWebServerBasePlugin): assert self.choice and self.choice.hostname port = ( self.choice.port or DEFAULT_HTTP_PORT - if self.choice.scheme == b'http' - else DEFAULT_HTTPS_PORT + if self.choice.scheme == HTTP_PROTO + else self.choice.port or DEFAULT_HTTPS_PORT ) self.initialize_upstream(text_(self.choice.hostname), port) assert self.upstream @@ -120,14 +120,27 @@ class ReverseProxy(TcpUpstreamConnectionHandler, HttpWebServerBasePlugin): self.upstream.connect() if self.choice.scheme == HTTPS_PROTO: self.upstream.wrap( - text_( - self.choice.hostname, - ), + text_(self.choice.hostname), as_non_blocking=True, ca_file=self.flags.ca_file, ) request.path = self.choice.remainder - self.upstream.queue(memoryview(request.build())) + self.upstream.queue( + memoryview( + request.build( + host=( + self.choice.hostname + + ( + COLON + bytes_(self.choice.port) + if self.choice.port is not None + else b'' + ) + if self.flags.rewrite_host_header + else None + ), + ), + ), + ) except ConnectionRefusedError: raise HttpProtocolException( # pragma: no cover 'Connection refused by upstream server {0}:{1}'.format( diff --git a/proxy/http/server/web.py b/proxy/http/server/web.py index cec4abbd..77547196 100644 --- a/proxy/http/server/web.py +++ b/proxy/http/server/web.py @@ -29,8 +29,9 @@ from ...common.types import Readables, Writables, Descriptors from ...common.utils import text_, build_websocket_handshake_response from ...common.constants import ( DEFAULT_ENABLE_WEB_SERVER, DEFAULT_STATIC_SERVER_DIR, - DEFAULT_ENABLE_REVERSE_PROXY, DEFAULT_ENABLE_STATIC_SERVER, - DEFAULT_WEB_ACCESS_LOG_FORMAT, DEFAULT_MIN_COMPRESSION_LENGTH, + DEFAULT_ENABLE_REWRITE_HOST, DEFAULT_ENABLE_REVERSE_PROXY, + DEFAULT_ENABLE_STATIC_SERVER, DEFAULT_WEB_ACCESS_LOG_FORMAT, + DEFAULT_MIN_COMPRESSION_LENGTH, ) @@ -78,6 +79,16 @@ flags.add_argument( help='Default: False. Whether to enable reverse proxy core.', ) +flags.add_argument( + '--rewrite-host-header', + action='store_true', + default=DEFAULT_ENABLE_REWRITE_HOST, + help='Default: ' + + str(DEFAULT_ENABLE_REWRITE_HOST) + + '. ' + + 'If used, reverse proxy server will rewrite Host header field before sending to upstream.', +) + class HttpWebServerPlugin(HttpProtocolHandlerPlugin): """HttpProtocolHandler plugin which handles incoming requests to local web server.""" diff --git a/tests/integration/test_integration.py b/tests/integration/test_integration.py index 1a143744..8b91413a 100644 --- a/tests/integration/test_integration.py +++ b/tests/integration/test_integration.py @@ -181,18 +181,30 @@ def proxy_py_subprocess(request: Any) -> Generator[int, None, None]: ca_cert_dir = TEMP_DIR / ('certificates-%s' % run_id) os.makedirs(ca_cert_dir, exist_ok=True) proxy_cmd = ( - sys.executable, '-m', 'proxy', - '--hostname', '127.0.0.1', - '--port', '0', - '--port-file', str(port_file), + sys.executable, + '-m', + 'proxy', + '--hostname', + '127.0.0.1', + '--port', + '0', + '--port-file', + str(port_file), '--enable-web-server', - '--plugin', 'proxy.plugin.WebServerPlugin', + '--plugin', + 'proxy.plugin.WebServerPlugin', '--enable-reverse-proxy', - '--plugin', 'proxy.plugin.ReverseProxyPlugin', - '--num-acceptors', '3', - '--num-workers', '3', - '--ca-cert-dir', str(ca_cert_dir), - '--log-level', 'd', + '--plugin', + 'proxy.plugin.ReverseProxyPlugin', + '--rewrite-host-header', + '--num-acceptors', + '3', + '--num-workers', + '3', + '--ca-cert-dir', + str(ca_cert_dir), + '--log-level', + 'd', ) + tuple(request.param.split()) proxy_proc = Popen(proxy_cmd, stderr=subprocess.STDOUT) # Needed because port file might not be available immediately diff --git a/tests/integration/test_integration.sh b/tests/integration/test_integration.sh index 28d5a371..303da0a0 100755 --- a/tests/integration/test_integration.sh +++ b/tests/integration/test_integration.sh @@ -16,6 +16,8 @@ # For github action, we simply bank upon GitHub # to clean up any background process including # proxy.py +# +set -x PROXY_PY_PORT=$1 if [[ -z "$PROXY_PY_PORT" ]]; then @@ -164,8 +166,14 @@ cat downloaded2.whl | $SHASUM -c downloaded2.hash VERIFIED5=$? rm downloaded2.whl downloaded2.hash +# Without --rewrite-host-header we will receive localhost: as host header back in response +# read -r -d '' REVERSE_PROXY_RESPONSE << EOM +# "localhost:$PROXY_PY_PORT" +# EOM + +# With --rewrite-host-header we will receive httpbingo.org as host header back in response read -r -d '' REVERSE_PROXY_RESPONSE << EOM -"localhost:$PROXY_PY_PORT" +"httpbingo.org" EOM echo "[Test Reverse Proxy Plugin]" @@ -174,8 +182,5 @@ RESPONSE=$($CMD 2> /dev/null) verify_contains "$RESPONSE" "$REVERSE_PROXY_RESPONSE" VERIFIED6=$? -# FIXME: VERIFIED6 NOT ASSERTED BECAUSE WE STARTED GETTING EMPTY RESPONSE FROM UPSTREAM -# AFTER CHANGE FROM HTTPBIN TO HTTPBINGO. This test works and passes perfectly when -# run from a local system -EXIT_CODE=$(( $VERIFIED1 || $VERIFIED2 || $VERIFIED3 || $VERIFIED4 || $VERIFIED5 )) +EXIT_CODE=$(( $VERIFIED1 || $VERIFIED2 || $VERIFIED3 || $VERIFIED4 || $VERIFIED5 || $VERIFIED6 )) exit $EXIT_CODE