From 4026708022fe7b2b51bd234013998dd178cd6708 Mon Sep 17 00:00:00 2001 From: Mahmoud Hashemi Date: Sat, 18 Mar 2017 13:50:42 -0700 Subject: [PATCH] urlutils docs: and finally it is looking to be complete. Plenty of links and explanation, hard won. --- boltons/urlutils.py | 84 +++++++++++++------------ docs/urlutils.rst | 145 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 188 insertions(+), 41 deletions(-) diff --git a/boltons/urlutils.py b/boltons/urlutils.py index d66be49..968851d 100644 --- a/boltons/urlutils.py +++ b/boltons/urlutils.py @@ -1,23 +1,25 @@ # -*- coding: utf-8 -*- """:mod:`urlutils` is a module dedicated to one of our most versatile, well-aged, and beloved data structures: the URL, also known as the -Uniform Resource Locator. +`Uniform Resource Locator`_. Among other things, this module is a full reimplementation of URLs, -without any reliance on the :mod:`urlparse` or :mod:`urllib` -modules. The centerpiece and top-level interface of urlutils is the -:class:`URL` type. Also featured is the :func:`find_all_links` -convenience function. +without any reliance on the :mod:`urlparse` or :mod:`urllib` standard +library modules. The centerpiece and top-level interface of urlutils +is the :class:`URL` type. Also featured is the :func:`find_all_links` +convenience function. Some low-level functions and constants are also +below. The implementations in this module are based heavily on `RFC 3986`_ and -`RFC 3987`_, and incorporates details from several other RFCs and W3C -documents. +`RFC 3987`_, and incorporates details from several other RFCs and `W3C +documents`_. +.. _Uniform Resource Locator: https://en.wikipedia.org/wiki/Uniform_Resource_Locator .. _RFC 3986: https://tools.ietf.org/html/rfc3986 .. _RFC 3987: https://tools.ietf.org/html/rfc3987 +.. 
_W3C documents: https://www.w3.org/TR/uri-clarification/ """ -# TODO: add more RFC links throughout import re import socket @@ -134,6 +136,9 @@ def find_all_links(text, with_text=False, default_scheme='https', schemes=()): match in order to be included in the results. Defaults to empty, which matches all schemes. + .. note:: Currently this function does not support finding IPv6 + addresses or URLs with netloc-less schemes, like mailto. + """ text = to_unicode(text) prev_end, start, end = 0, None, None @@ -253,8 +258,8 @@ def unquote(string, encoding='utf-8', errors='replace'): default, percent-encoded sequences are decoded with UTF-8, and invalid sequences are replaced by a placeholder character. - unquote('abc%20def') -> 'abc def'. - + >>> unquote(u'abc%20def') + u'abc def' """ if '%' not in string: string.split @@ -304,8 +309,8 @@ def register_scheme(text, uses_netloc=None, default_port=None): slash behavior from the URL object. There are dozens of standard schemes preregistered, so this function is mostly meant for proprietary internal customizations or stopgaps on missing - standards information. If a scheme seems to be missing, file an - issue! + standards information. If a scheme seems to be missing, please + `file an issue`_! Args: text (str): Text representing the scheme. @@ -315,6 +320,7 @@ def register_scheme(text, uses_netloc=None, default_port=None): default_port (int): The default port, if any, for netloc-using schemes. + .. _file an issue: https://github.com/mahmoud/boltons/issues """ text = text.lower() if default_port is not None: @@ -338,9 +344,9 @@ def register_scheme(text, uses_netloc=None, default_port=None): def resolve_path_parts(path_parts): - """ - Normalize the URL path by resolving segments of '.' and '..'. - See RFC 3986 section 5.2.4, Remove Dot Segments. + """Normalize the URL path by resolving segments of '.' and '..', + resulting in a dot-free path. See RFC 3986 section 5.2.4, Remove + Dot Segments. 
""" # TODO: what to do with multiple slashes ret = [] @@ -430,7 +436,8 @@ class URL(object): Note that URL instances are mutable objects. If an immutable representation of the URL is desired, the string from :meth:`~URL.to_text()` may be used. For an immutable, but - almost-as-featureful, URL, check out the `hyperlink package`_. + almost-as-featureful, URL object, check out the `hyperlink + package`_. .. _hyperlink package: https://github.com/mahmoud/hyperlink @@ -440,17 +447,6 @@ class URL(object): _cmp_attrs = ('scheme', 'uses_netloc', 'username', 'password', 'family', 'host', 'port', 'path', 'query_params', 'fragment') - """ - Usage is -straightforward: - ->>> url = URL(u'https://boltons.readthedocs.io/?query_param=True#fragment') ->>> print(url.scheme) -https ->>> print(url.host) -boltons.readthedocs.io - """ - def __init__(self, url=''): # TODO: encoding param. The encoding that underlies the # percent-encoding is always utf8 for IRIs, but can be Latin-1 @@ -562,11 +558,11 @@ boltons.readthedocs.io @property def uses_netloc(self): - """Whether or not a URL uses ``:`` or ``://`` to separate the scheme - from the rest of the URL depends on the scheme's own standard - definition. There is no way to infer this behavior from other - parts of the URL. A scheme either supports network locations - or it does not. + """Whether or not a URL uses :code:`:` or :code:`://` to separate the + scheme from the rest of the URL depends on the scheme's own + standard definition. There is no way to infer this behavior + from other parts of the URL. A scheme either supports network + locations or it does not. 
The URL type's approach to this is to check for explicitly registered schemes, with common schemes like HTTP @@ -584,6 +580,7 @@ boltons.readthedocs.io fakescheme://test.com >>> print(URL('mockscheme:hello:world').to_text()) mockscheme:hello:world + """ default = self._netloc_sep if self.scheme in SCHEME_PORT_MAP: @@ -598,7 +595,9 @@ boltons.readthedocs.io def default_port(self): """Return the default port for the currently-set scheme. Returns ``None`` if the scheme is unrecognized. See - :func:`register_scheme` above. + :func:`register_scheme` above. If :attr:`~URL.port` matches + this value, no port is emitted in the output of + :meth:`~URL.to_text()`. Applies the same '+' heuristic detailed in :meth:`URL.uses_netloc`. """ @@ -672,9 +671,14 @@ boltons.readthedocs.io return ret def get_authority(self, full_quote=False, with_userinfo=False): - """Get the text representation of just the authority part of the - URL. Used internally by :meth:`~URL.to_text()` and can be - useful for labeling connections. + """Used by URL schemes that have a network location, + :meth:`~URL.get_authority` combines :attr:`username`, + :attr:`password`, :attr:`host`, and :attr:`port` into one + string, the *authority*, that is used for + connecting to a network-accessible resource. + + Used internally by :meth:`~URL.to_text()` and can be useful + for labeling connections. >>> url = URL('ftp://user@ftp.debian.org:2121/debian/README') >>> print(url.get_authority()) @@ -688,6 +692,7 @@ boltons.readthedocs.io with_userinfo (bool): Whether or not to include username and password, technically part of the authority. Defaults to ``False``. + """ parts = [] _add = parts.append @@ -724,8 +729,8 @@ boltons.readthedocs.io By setting the *full_quote* flag, the URL can either be fully quoted or minimally quoted. The most common characteristic of an encoded-URL is the presence of percent-encoded text (e.g., - %60). 
Minimally-encoded URLs are more readable and suitable - for display, whereas fully-encoded URLs are more conservative + %60). Unquoted URLs are more readable and suitable + for display, whereas fully-quoted URLs are more conservative and generally necessary for sending over the network. """ scheme = self.scheme @@ -911,6 +916,9 @@ DEFAULT_PARSED_URL = parse_url('') def parse_qsl(qs, keep_blank_values=True, encoding=DEFAULT_ENCODING): + """ + Converts a query string into a list of (key, value) pairs. + """ pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] ret = [] for pair in pairs: diff --git a/docs/urlutils.rst b/docs/urlutils.rst index b3987c0..4832fe1 100644 --- a/docs/urlutils.rst +++ b/docs/urlutils.rst @@ -5,11 +5,117 @@ .. versionadded:: 17.2 -The ``URL`` ------------ +The URL type +------------ .. autoclass:: boltons.urlutils.URL - :members: + + .. attribute:: URL.scheme + + The scheme is an ASCII string, normally lowercase, which + specifies the semantics for the rest of the URL, as well as + network protocol in many cases. For example, "http" in + "http://hatnote.com". + + .. attribute:: URL.username + + The username is a string used by some schemes for + authentication. For example, "public" in + "ftp://public@example.com". + + .. attribute:: URL.password + + The password is a string also used for + authentication. Technically deprecated by `RFC 3986 Section + 7.5`_, passwords are still used in cases when the URL is private or + the password is public. For example "password" in + "db://private:password@127.0.0.1". + + .. _RFC 3986 Section 7.5: https://tools.ietf.org/html/rfc3986#section-7.5 + + .. attribute:: URL.host + + The host is a string used to resolve the network location of the + resource, either empty, a domain, or IP address (v4 or + v6). "example.com", "127.0.0.1", and "::1" are all good examples + of host strings. + + Per spec, fully-encoded output from :meth:`~URL.to_text()` is + `IDNA encoded`_ for compatibility with DNS. + + .. 
_IDNA encoded: https://en.wikipedia.org/wiki/Internationalized_domain_name#Example_of_IDNA_encoding + + .. attribute:: URL.port + + The port is an integer used, along with :attr:`host`, in + connecting to network locations. ``8080`` is the port in + "http://localhost:8080/index.html". + + .. note:: + + As is the case for 80 for HTTP and 22 for SSH, many schemes have + default ports, and RFC 3986 states that when a URL's port is the + same as its scheme's default port, the port should not be + emitted:: + + >>> URL(u'https://github.com:443/mahmoud/boltons').to_text() + u'https://github.com/mahmoud/boltons' + + Custom schemes can register their port with + :func:`~boltons.urlutils.register_scheme`. See + :attr:`URL.default_port` for more info. + + .. attribute:: URL.path + + The string starting with the first leading slash after the + authority part of the URL, ending with the first question + mark. Often percent-quoted for network use. "/a/b/c" is the path + of "http://example.com/a/b/c?d=e". + + + .. attribute:: URL.path_parts + + The :class:`tuple` form of :attr:`~URL.path`, split on + slashes. Empty slash segments are preserved, including that of + the leading slash:: + + >>> url = URL(u'http://example.com/a/b/c') + >>> url.path_parts + (u'', u'a', u'b', u'c') + + + .. attribute:: URL.query_params + + An instance of :class:`~boltons.urlutils.QueryParamDict`, an + :class:`~boltons.dictutils.OrderedMultiDict` subtype, mapping + textual keys and values which follow the first question mark + after the :attr:`path`. Also available as the handy alias + ``qp``:: + + >>> url = URL('http://boltons.readthedocs.io/en/latest/?utm_source=docs&sphinx=ok') + >>> url.qp.keys() + [u'utm_source', u'sphinx'] + + Also percent-encoded for network use cases. + + .. attribute:: URL.fragment + + The string following the first '#' after the + :attr:`query_params` until the end of the URL. It has no + inherent internal structure, and is percent-quoted. + + .. 
automethod:: URL.from_parts + .. automethod:: URL.to_text + + .. autoattribute:: URL.default_port + .. autoattribute:: URL.uses_netloc + + .. automethod:: URL.get_authority + + .. automethod:: URL.normalize + .. automethod:: URL.navigate + + Related functions ~~~~~~~~~~~~~~~~~ @@ -46,3 +152,36 @@ URLs have many parts, and almost as many individual "quoting" There is however, only one unquoting strategy: .. autofunction:: boltons.urlutils.unquote + +Useful constants +---------------- + +.. attribute:: boltons.urlutils.SCHEME_PORT_MAP + + A mapping of URL schemes to their protocols' default + ports. Painstakingly assembled from the `IANA scheme registry`_, + `port registry`_, and independent research. + + Keys are lowercase strings, values are integers or None, with None + indicating that the scheme does not have a default port (or may not + support ports at all):: + + >>> boltons.urlutils.SCHEME_PORT_MAP['http'] + 80 + >>> boltons.urlutils.SCHEME_PORT_MAP['file'] + None + + See :attr:`URL.port` for more info on how it is used. See + :attr:`~boltons.urlutils.NO_NETLOC_SCHEMES` for more scheme info. + + Also `available in JSON`_. + + .. _IANA scheme registry: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml + .. _port registry: https://www.iana.org/assignments/service-names-port-numbers/service-names-port-numbers.xhtml + .. _available in JSON: https://gist.github.com/mahmoud/2fe281a8daaff26cfe9c15d2c5bf5c8b + + +.. attribute:: boltons.urlutils.NO_NETLOC_SCHEMES + + This is a :class:`set` of schemes that explicitly do not support network + resolution, such as "mailto" and "urn".