mirror of https://github.com/mahmoud/boltons.git
urlutils docs: and finally it is looking to be complete. Plenty of links and explanation, hard won.
This commit is contained in:
parent
585be1a2da
commit
4026708022
|
@ -1,23 +1,25 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
""":mod:`urlutils` is a module dedicated to one of our most versatile,
|
||||
well-aged, and beloved data structures: the URL, also known as the
|
||||
Uniform Resource Locator.
|
||||
`Uniform Resource Locator`_.
|
||||
|
||||
Among other things, this module is a full reimplementation of URLs,
|
||||
without any reliance on the :mod:`urlparse` or :mod:`urllib`
|
||||
modules. The centerpiece and top-level interface of urlutils is the
|
||||
:class:`URL` type. Also featured is the :func:`find_all_links`
|
||||
convenience function.
|
||||
without any reliance on the :mod:`urlparse` or :mod:`urllib` standard
|
||||
library modules. The centerpiece and top-level interface of urlutils
|
||||
is the :class:`URL` type. Also featured is the :func:`find_all_links`
|
||||
convenience function. Some low-level functions and constants are also
|
||||
documented below.
|
||||
|
||||
The implementations in this module are based heavily on `RFC 3986`_ and
|
||||
`RFC 3987`_, and incorporates details from several other RFCs and W3C
|
||||
documents.
|
||||
`RFC 3987`_, and incorporate details from several other RFCs and `W3C
|
||||
documents`_.
|
||||
|
||||
.. _Uniform Resource Locator: https://en.wikipedia.org/wiki/Uniform_Resource_Locator
|
||||
.. _RFC 3986: https://tools.ietf.org/html/rfc3986
|
||||
.. _RFC 3987: https://tools.ietf.org/html/rfc3987
|
||||
.. _W3C documents: https://www.w3.org/TR/uri-clarification/
|
||||
|
||||
"""
|
||||
# TODO: add more RFC links throughout
|
||||
|
||||
import re
|
||||
import socket
|
||||
|
@ -134,6 +136,9 @@ def find_all_links(text, with_text=False, default_scheme='https', schemes=()):
|
|||
match in order to be included in the results. Defaults to
|
||||
empty, which matches all schemes.
|
||||
|
||||
.. note:: Currently this function does not support finding IPv6
|
||||
addresses or URLs with netloc-less schemes, like mailto.
|
||||
|
||||
"""
|
||||
text = to_unicode(text)
|
||||
prev_end, start, end = 0, None, None
|
||||
|
@ -253,8 +258,8 @@ def unquote(string, encoding='utf-8', errors='replace'):
|
|||
default, percent-encoded sequences are decoded with UTF-8, and
|
||||
invalid sequences are replaced by a placeholder character.
|
||||
|
||||
unquote('abc%20def') -> 'abc def'.
|
||||
|
||||
>>> unquote(u'abc%20def')
|
||||
u'abc def'
|
||||
"""
|
||||
if '%' not in string:
|
||||
string.split
|
||||
|
@ -304,8 +309,8 @@ def register_scheme(text, uses_netloc=None, default_port=None):
|
|||
slash behavior from the URL object. There are dozens of standard
|
||||
schemes preregistered, so this function is mostly meant for
|
||||
proprietary internal customizations or stopgaps on missing
|
||||
standards information. If a scheme seems to be missing, file an
|
||||
issue!
|
||||
standards information. If a scheme seems to be missing, please
|
||||
`file an issue`_!
|
||||
|
||||
Args:
|
||||
text (str): Text representing the scheme.
|
||||
|
@ -315,6 +320,7 @@ def register_scheme(text, uses_netloc=None, default_port=None):
|
|||
default_port (int): The default port, if any, for netloc-using
|
||||
schemes.
|
||||
|
||||
.. _file an issue: https://github.com/mahmoud/boltons/issues
|
||||
"""
|
||||
text = text.lower()
|
||||
if default_port is not None:
|
||||
|
@ -338,9 +344,9 @@ def register_scheme(text, uses_netloc=None, default_port=None):
|
|||
|
||||
|
||||
def resolve_path_parts(path_parts):
|
||||
"""
|
||||
Normalize the URL path by resolving segments of '.' and '..'.
|
||||
See RFC 3986 section 5.2.4, Remove Dot Segments.
|
||||
"""Normalize the URL path by resolving segments of '.' and '..',
|
||||
resulting in a dot-free path. See RFC 3986 section 5.2.4, Remove
|
||||
Dot Segments.
|
||||
"""
|
||||
# TODO: what to do with multiple slashes
|
||||
ret = []
|
||||
|
@ -430,7 +436,8 @@ class URL(object):
|
|||
Note that URL instances are mutable objects. If an immutable
|
||||
representation of the URL is desired, the string from
|
||||
:meth:`~URL.to_text()` may be used. For an immutable, but
|
||||
almost-as-featureful, URL, check out the `hyperlink package`_.
|
||||
almost-as-featureful, URL object, check out the `hyperlink
|
||||
package`_.
|
||||
|
||||
.. _hyperlink package: https://github.com/mahmoud/hyperlink
|
||||
|
||||
|
@ -440,17 +447,6 @@ class URL(object):
|
|||
_cmp_attrs = ('scheme', 'uses_netloc', 'username', 'password',
|
||||
'family', 'host', 'port', 'path', 'query_params', 'fragment')
|
||||
|
||||
"""
|
||||
Usage is
|
||||
straightforward:
|
||||
|
||||
>>> url = URL(u'https://boltons.readthedocs.io/?query_param=True#fragment')
|
||||
>>> print(url.scheme)
|
||||
https
|
||||
>>> print(url.host)
|
||||
boltons.readthedocs.io
|
||||
"""
|
||||
|
||||
def __init__(self, url=''):
|
||||
# TODO: encoding param. The encoding that underlies the
|
||||
# percent-encoding is always utf8 for IRIs, but can be Latin-1
|
||||
|
@ -562,11 +558,11 @@ boltons.readthedocs.io
|
|||
|
||||
@property
|
||||
def uses_netloc(self):
|
||||
"""Whether or not a URL uses ``:`` or ``://`` to separate the scheme
|
||||
from the rest of the URL depends on the scheme's own standard
|
||||
definition. There is no way to infer this behavior from other
|
||||
parts of the URL. A scheme either supports network locations
|
||||
or it does not.
|
||||
"""Whether or not a URL uses :code:`:` or :code:`://` to separate the
|
||||
scheme from the rest of the URL depends on the scheme's own
|
||||
standard definition. There is no way to infer this behavior
|
||||
from other parts of the URL. A scheme either supports network
|
||||
locations or it does not.
|
||||
|
||||
The URL type's approach to this is to check for explicitly
|
||||
registered schemes, with common schemes like HTTP
|
||||
|
@ -584,6 +580,7 @@ boltons.readthedocs.io
|
|||
fakescheme://test.com
|
||||
>>> print(URL('mockscheme:hello:world').to_text())
|
||||
mockscheme:hello:world
|
||||
|
||||
"""
|
||||
default = self._netloc_sep
|
||||
if self.scheme in SCHEME_PORT_MAP:
|
||||
|
@ -598,7 +595,9 @@ boltons.readthedocs.io
|
|||
def default_port(self):
|
||||
"""Return the default port for the currently-set scheme. Returns
|
||||
``None`` if the scheme is unrecognized. See
|
||||
:func:`register_scheme` above.
|
||||
:func:`register_scheme` above. If :attr:`~URL.port` matches
|
||||
this value, no port is emitted in the output of
|
||||
:meth:`~URL.to_text()`.
|
||||
|
||||
Applies the same '+' heuristic detailed in :meth:`URL.uses_netloc`.
|
||||
"""
|
||||
|
@ -672,9 +671,14 @@ boltons.readthedocs.io
|
|||
return ret
|
||||
|
||||
def get_authority(self, full_quote=False, with_userinfo=False):
|
||||
"""Get the text representation of just the authority part of the
|
||||
URL. Used internally by :meth:`~URL.to_text()` and can be
|
||||
useful for labeling connections.
|
||||
"""Used by URL schemes that have a network location,
|
||||
:meth:`~URL.get_authority` combines :attr:`username`,
|
||||
:attr:`password`, :attr:`host`, and :attr:`port` into one
|
||||
string, the *authority*, that is used for
|
||||
connecting to a network-accessible resource.
|
||||
|
||||
Used internally by :meth:`~URL.to_text()` and can be useful
|
||||
for labeling connections.
|
||||
|
||||
>>> url = URL('ftp://user@ftp.debian.org:2121/debian/README')
|
||||
>>> print(url.get_authority())
|
||||
|
@ -688,6 +692,7 @@ boltons.readthedocs.io
|
|||
with_userinfo (bool): Whether or not to include username
|
||||
and password, technically part of the
|
||||
authority. Defaults to ``False``.
|
||||
|
||||
"""
|
||||
parts = []
|
||||
_add = parts.append
|
||||
|
@ -724,8 +729,8 @@ boltons.readthedocs.io
|
|||
By setting the *full_quote* flag, the URL can either be fully
|
||||
quoted or minimally quoted. The most common characteristic of
|
||||
an encoded-URL is the presence of percent-encoded text (e.g.,
|
||||
%60). Minimally-encoded URLs are more readable and suitable
|
||||
for display, whereas fully-encoded URLs are more conservative
|
||||
%60). Unquoted URLs are more readable and suitable
|
||||
for display, whereas fully-quoted URLs are more conservative
|
||||
and generally necessary for sending over the network.
|
||||
"""
|
||||
scheme = self.scheme
|
||||
|
@ -911,6 +916,9 @@ DEFAULT_PARSED_URL = parse_url('')
|
|||
|
||||
|
||||
def parse_qsl(qs, keep_blank_values=True, encoding=DEFAULT_ENCODING):
|
||||
"""
|
||||
Converts a query string into a list of (key, value) pairs.
|
||||
"""
|
||||
pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
|
||||
ret = []
|
||||
for pair in pairs:
|
||||
|
|
|
@ -5,11 +5,117 @@
|
|||
|
||||
.. versionadded:: 17.2
|
||||
|
||||
The ``URL``
|
||||
-----------
|
||||
The URL type
|
||||
------------
|
||||
|
||||
.. autoclass:: boltons.urlutils.URL
|
||||
:members:
|
||||
|
||||
.. attribute:: URL.scheme
|
||||
|
||||
The scheme is an ASCII string, normally lowercase, which
|
||||
specifies the semantics for the rest of the URL, as well as
|
||||
the network protocol in many cases. For example, "http" in
|
||||
"http://hatnote.com".
|
||||
|
||||
.. attribute:: URL.username
|
||||
|
||||
The username is a string used by some schemes for
|
||||
authentication. For example, "public" in
|
||||
"ftp://public@example.com".
|
||||
|
||||
.. attribute:: URL.password
|
||||
|
||||
The password is a string also used for
|
||||
authentication. Technically deprecated by `RFC 3986 Section
|
||||
7.5`_, passwords are still used in cases when the URL is private or
|
||||
the password is public. For example "password" in
|
||||
"db://private:password@127.0.0.1".
|
||||
|
||||
.. _RFC 3986 Section 7.5: https://tools.ietf.org/html/rfc3986#section-7.5
|
||||
|
||||
.. attribute:: URL.host
|
||||
|
||||
The host is a string used to resolve the network location of the
|
||||
resource, either empty, a domain, or IP address (v4 or
|
||||
v6). "example.com", "127.0.0.1", and "::1" are all good examples
|
||||
of host strings.
|
||||
|
||||
Per spec, fully-encoded output from :meth:`~URL.to_text()` is
|
||||
`IDNA encoded`_ for compatibility with DNS.
|
||||
|
||||
.. _IDNA encoded: https://en.wikipedia.org/wiki/Internationalized_domain_name#Example_of_IDNA_encoding
|
||||
|
||||
.. attribute:: URL.port
|
||||
|
||||
The port is an integer used, along with :attr:`host`, in
|
||||
connecting to network locations. ``8080`` is the port in
|
||||
"http://localhost:8080/index.html".
|
||||
|
||||
.. note::
|
||||
|
||||
As is the case for 80 for HTTP and 22 for SSH, many schemes have
|
||||
default ports, and RFC 3986 states that when a URL's port is the
|
||||
same as its scheme's default port, the port should not be
|
||||
emitted::
|
||||
|
||||
>>> URL(u'https://github.com:443/mahmoud/boltons').to_text()
|
||||
u'https://github.com/mahmoud/boltons'
|
||||
|
||||
Custom schemes can register their port with
|
||||
:func:`~boltons.urlutils.register_scheme`. See
|
||||
:attr:`URL.default_port` for more info.
|
||||
|
||||
.. attribute:: URL.path
|
||||
|
||||
The string starting with the first leading slash after the
|
||||
authority part of the URL, ending with the first question
|
||||
mark. Often percent-quoted for network use. "/a/b/c" is the path
|
||||
of "http://example.com/a/b/c?d=e".
|
||||
|
||||
|
||||
.. attribute:: URL.path_parts
|
||||
|
||||
The :class:`tuple` form of :attr:`~URL.path`, split on
|
||||
slashes. Empty slash segments are preserved, including that of
|
||||
the leading slash::
|
||||
|
||||
>>> url = URL(u'http://example.com/a/b/c')
|
||||
>>> url.path_parts
|
||||
(u'', u'a', u'b', u'c')
|
||||
|
||||
|
||||
.. attribute:: URL.query_params
|
||||
|
||||
An instance of :class:`~boltons.urlutils.QueryParamDict`, an
|
||||
:class:`~boltons.dictutils.OrderedMultiDict` subtype, mapping
|
||||
textual keys and values which follow the first question mark
|
||||
after the :attr:`path`. Also available as the handy alias
|
||||
``qp``::
|
||||
|
||||
>>> url = URL('http://boltons.readthedocs.io/en/latest/?utm_source=docs&sphinx=ok')
|
||||
>>> url.qp.keys()
|
||||
[u'utm_source', u'sphinx']
|
||||
|
||||
Also percent-encoded for network use cases.
|
||||
|
||||
.. attribute:: URL.fragment
|
||||
|
||||
The string following the first '#' after the
|
||||
:attr:`query_params` until the end of the URL. It has no
|
||||
inherent internal structure, and is percent-quoted.
|
||||
|
||||
.. automethod:: URL.from_parts
|
||||
.. automethod:: URL.to_text
|
||||
|
||||
.. autoattribute:: URL.default_port
|
||||
.. autoattribute:: URL.uses_netloc
|
||||
|
||||
.. automethod:: URL.get_authority
|
||||
|
||||
.. automethod:: URL.normalize
|
||||
.. automethod:: URL.navigate
|
||||
|
||||
|
||||
|
||||
Related functions
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
@ -46,3 +152,36 @@ URLs have many parts, and almost as many individual "quoting"
|
|||
There is, however, only one unquoting strategy:
|
||||
|
||||
.. autofunction:: boltons.urlutils.unquote
|
||||
|
||||
Useful constants
|
||||
----------------
|
||||
|
||||
.. attribute:: boltons.urlutils.SCHEME_PORT_MAP
|
||||
|
||||
A mapping of URL schemes to their protocols' default
|
||||
ports. Painstakingly assembled from the `IANA scheme registry`_,
|
||||
`port registry`_, and independent research.
|
||||
|
||||
Keys are lowercase strings, values are integers or None, with None
|
||||
indicating that the scheme does not have a default port (or may not
|
||||
support ports at all)::
|
||||
|
||||
>>> boltons.urlutils.SCHEME_PORT_MAP['http']
|
||||
80
|
||||
>>> boltons.urlutils.SCHEME_PORT_MAP['file']
|
||||
None
|
||||
|
||||
See :attr:`URL.port` for more info on how it is used. See
|
||||
:attr:`~boltons.urlutils.NO_NETLOC_SCHEMES` for more scheme info.
|
||||
|
||||
Also `available in JSON`_.
|
||||
|
||||
.. _IANA scheme registry: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
|
||||
.. _port registry: https://www.iana.org/assignments/service-names-port-numbers/service-names-port-numbers.xhtml
|
||||
.. _available in JSON: https://gist.github.com/mahmoud/2fe281a8daaff26cfe9c15d2c5bf5c8b
|
||||
|
||||
|
||||
.. attribute:: boltons.urlutils.NO_NETLOC_SCHEMES
|
||||
|
||||
This is a :class:`set` of schemes that explicitly do not support network
|
||||
resolution, such as "mailto" and "urn".
|
||||
|
|
Loading…
Reference in New Issue