From 549edaf64d08bedf0228489c062d1ff3f0281320 Mon Sep 17 00:00:00 2001
From: Ran Benita
Date: Tue, 6 Aug 2019 18:16:31 +0300
Subject: [PATCH 1/2] httputil: cache header normalization with @lru_cache instead of hand-rolling

Tornado is now py3-only so @lru_cache is always available.

Performance is about the same. Benchmark below. Python 3.7 on Linux.

before, cached: 0.9121252089971676
before, uncached: 13.358482279989403
after, cached: 0.9175888689933345
after, uncached: 11.085199063003529

```py
from time import perf_counter

names = [f'sOMe-RanDOM-hEAdeR-{i}' for i in range(1000)]

from tornado.httputil import _normalize_header
start = perf_counter()
for i in range(10000):
    # _normalize_header.cache_clear()
    for name in names:
        _normalize_header(name)
print(perf_counter() - start)

from tornado.httputil import _NormalizedHeaderCache
start = perf_counter()
_normalized_headers = _NormalizedHeaderCache(1000)
for i in range(10000):
    # _normalized_headers = _NormalizedHeaderCache(1000)
    for name in names:
        _normalized_headers[name]
print(perf_counter() - start)
```
---
 tornado/httputil.py | 46 ++++++++++++---------------------------------
 1 file changed, 12 insertions(+), 34 deletions(-)

diff --git a/tornado/httputil.py b/tornado/httputil.py
index 26a6c440..9d7abeb3 100644
--- a/tornado/httputil.py
+++ b/tornado/httputil.py
@@ -24,6 +24,7 @@ import collections
 import copy
 import datetime
 import email.utils
+from functools import lru_cache
 from http.client import responses
 import http.cookies
 import re
@@ -62,37 +63,14 @@ if typing.TYPE_CHECKING:
     import unittest # noqa: F401
 
 
-class _NormalizedHeaderCache(dict):
-    """Dynamic cached mapping of header names to Http-Header-Case.
+@lru_cache(1000)
+def _normalize_header(name: str) -> str:
+    """Map a header name to Http-Header-Case.
 
-    Implemented as a dict subclass so that cache hits are as fast as a
-    normal dict lookup, without the overhead of a python function
-    call.
-
-    >>> normalized_headers = _NormalizedHeaderCache(10)
-    >>> normalized_headers["coNtent-TYPE"]
+    >>> _normalize_header("coNtent-TYPE")
     'Content-Type'
     """
-
-    def __init__(self, size: int) -> None:
-        super(_NormalizedHeaderCache, self).__init__()
-        self.size = size
-        self.queue = collections.deque() # type: Deque[str]
-
-    def __missing__(self, key: str) -> str:
-        normalized = "-".join([w.capitalize() for w in key.split("-")])
-        self[key] = normalized
-        self.queue.append(key)
-        if len(self.queue) > self.size:
-            # Limit the size of the cache. LRU would be better, but this
-            # simpler approach should be fine. In Python 2.7+ we could
-            # use OrderedDict (or in 3.2+, @functools.lru_cache).
-            old_key = self.queue.popleft()
-            del self[old_key]
-        return normalized
-
-
-_normalized_headers = _NormalizedHeaderCache(1000)
+    return "-".join([w.capitalize() for w in name.split("-")])
 
 
 class HTTPHeaders(collections.abc.MutableMapping):
@@ -143,7 +121,7 @@ class HTTPHeaders(collections.abc.MutableMapping):
     def __init__(self, *args: typing.Any, **kwargs: str) -> None: # noqa: F811
         self._dict = {} # type: typing.Dict[str, str]
         self._as_list = {} # type: typing.Dict[str, typing.List[str]]
-        self._last_key = None
+        self._last_key = None # type: Optional[str]
         if len(args) == 1 and len(kwargs) == 0 and isinstance(args[0], HTTPHeaders):
             # Copy constructor
             for k, v in args[0].get_all():
@@ -156,7 +134,7 @@ class HTTPHeaders(collections.abc.MutableMapping):
 
     def add(self, name: str, value: str) -> None:
         """Adds a new value for the given key."""
-        norm_name = _normalized_headers[name]
+        norm_name = _normalize_header(name)
         self._last_key = norm_name
         if norm_name in self:
             self._dict[norm_name] = (
@@ -168,7 +146,7 @@ class HTTPHeaders(collections.abc.MutableMapping):
 
     def get_list(self, name: str) -> List[str]:
         """Returns all values for the given header as a list."""
-        norm_name = _normalized_headers[name]
+        norm_name = _normalize_header(name)
         return self._as_list.get(norm_name, [])
 
     def get_all(self) -> Iterable[Tuple[str, str]]:
@@ -230,15 +208,15 @@ class HTTPHeaders(collections.abc.MutableMapping):
     # MutableMapping abstract method implementations.
 
     def __setitem__(self, name: str, value: str) -> None:
-        norm_name = _normalized_headers[name]
+        norm_name = _normalize_header(name)
         self._dict[norm_name] = value
         self._as_list[norm_name] = [value]
 
     def __getitem__(self, name: str) -> str:
-        return self._dict[_normalized_headers[name]]
+        return self._dict[_normalize_header(name)]
 
     def __delitem__(self, name: str) -> None:
-        norm_name = _normalized_headers[name]
+        norm_name = _normalize_header(name)
         del self._dict[norm_name]
         del self._as_list[norm_name]
 

From 61a535b261117d85839621ac8de4473e9135b402 Mon Sep 17 00:00:00 2001
From: Ran Benita
Date: Tue, 6 Aug 2019 19:18:41 +0300
Subject: [PATCH 2/2] httputil: use compiled re patterns

This is slightly faster than using the builtin cache, e.g.:

With benchmark below (Python 3.7, Linux):

before: 0.7284867879934609
after: 0.2657967659761198

```py
import re
from time import perf_counter

line = 'HTTP/1.1'

_http_version_re = re.compile(r"^HTTP/1\.[0-9]$")

start = perf_counter()
for i in range(1000000):
    _http_version_re.match(line)
print(perf_counter() - start)

start = perf_counter()
for i in range(1000000):
    re.match(r"^HTTP/1\.[0-9]$", line)
print(perf_counter() - start)
```
---
 tornado/httputil.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/tornado/httputil.py b/tornado/httputil.py
index 9d7abeb3..00761178 100644
--- a/tornado/httputil.py
+++ b/tornado/httputil.py
@@ -873,6 +873,9 @@ RequestStartLine = collections.namedtuple(
 )
 
 
+_http_version_re = re.compile(r"^HTTP/1\.[0-9]$")
+
+
 def parse_request_start_line(line: str) -> RequestStartLine:
     """Returns a (method, path, version) tuple for an HTTP 1.x request line.
 
@@ -887,7 +890,7 @@ def parse_request_start_line(line: str) -> RequestStartLine:
         # https://tools.ietf.org/html/rfc7230#section-3.1.1
         # invalid request-line SHOULD respond with a 400 (Bad Request)
         raise HTTPInputError("Malformed HTTP request line")
-    if not re.match(r"^HTTP/1\.[0-9]$", version):
+    if not _http_version_re.match(version):
         raise HTTPInputError(
             "Malformed HTTP version in HTTP Request-Line: %r" % version
         )
@@ -899,6 +902,9 @@ ResponseStartLine = collections.namedtuple(
 )
 
 
+_http_response_line_re = re.compile(r"(HTTP/1.[0-9]) ([0-9]+) ([^\r]*)")
+
+
 def parse_response_start_line(line: str) -> ResponseStartLine:
     """Returns a (version, code, reason) tuple for an HTTP 1.x response line.
 
@@ -908,7 +914,7 @@ def parse_response_start_line(line: str) -> ResponseStartLine:
     ResponseStartLine(version='HTTP/1.1', code=200, reason='OK')
     """
     line = native_str(line)
-    match = re.match("(HTTP/1.[0-9]) ([0-9]+) ([^\r]*)", line)
+    match = _http_response_line_re.match(line)
     if not match:
         raise HTTPInputError("Error parsing response start line")
     return ResponseStartLine(match.group(1), int(match.group(2)), match.group(3))
@@ -1013,6 +1019,9 @@ def doctests():
     return doctest.DocTestSuite()
 
 
+_netloc_re = re.compile(r"^(.+):(\d+)$")
+
+
 def split_host_and_port(netloc: str) -> Tuple[str, Optional[int]]:
     """Returns ``(host, port)`` tuple from ``netloc``.
 
@@ -1020,7 +1029,7 @@ def split_host_and_port(netloc: str) -> Tuple[str, Optional[int]]:
 
     .. versionadded:: 4.1
     """
-    match = re.match(r"^(.+):(\d+)$", netloc)
+    match = _netloc_re.match(netloc)
     if match:
         host = match.group(1)
         port = int(match.group(2)) # type: Optional[int]