Tidy up and document Doc, Token and Span

ines 2017-10-27 15:41:45 +02:00
parent 1a559d4c95
commit 6a0483b7aa
6 changed files with 356 additions and 173 deletions

View File

@ -2,4 +2,4 @@ from .doc import Doc
from .token import Token
from .span import Span
__all__ = [Doc, Token, Span]
__all__ = ['Doc', 'Token', 'Span']

View File

@ -23,9 +23,9 @@ from ..lexeme cimport Lexeme, EMPTY_LEXEME
from ..typedefs cimport attr_t, flags_t
from ..attrs import intify_attrs, IDS
from ..attrs cimport attr_id_t
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
from ..attrs cimport SENT_START
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
from ..attrs cimport ENT_TYPE, SENT_START
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
from ..util import normalize_slice
from ..compat import is_config, copy_reg, pickle
@ -78,24 +78,25 @@ def _get_chunker(lang):
cdef class Doc:
"""A sequence of Token objects. Access sentences and named entities, export
annotations to numpy arrays, losslessly serialize to compressed binary strings.
The `Doc` object holds an array of `TokenC` structs. The Python-level
`Token` and `Span` objects are views of this array, i.e. they don't own
the data themselves.
annotations to numpy arrays, losslessly serialize to compressed binary
strings. The `Doc` object holds an array of `TokenC` structs. The
Python-level `Token` and `Span` objects are views of this array, i.e.
they don't own the data themselves.
EXAMPLE: Construction 1
>>> doc = nlp(u'Some text')
Construction 2
>>> from spacy.tokens import Doc
>>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False])
>>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
spaces=[True, False, False])
"""
@classmethod
def set_extension(cls, name, default=None, method=None,
getter=None, setter=None):
nr_defined = sum(t is not None for t in (default, getter, setter, method))
assert nr_defined == 1
Underscore.doc_extensions[name] = (default, method, getter, setter)
Underscore.doc_extensions[name] = (default, method, getter, setter)
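For orientation, a minimal usage sketch of the extension mechanism (the attribute name is made up, and a loaded pipeline is assumed as `nlp`); note the assert above enforces that exactly one of `default`, `method`, `getter` and `setter` is supplied:

    from spacy.tokens import Doc

    # Register a custom attribute with a default value; it then becomes
    # readable and writable on every Doc via the `._` proxy (Underscore).
    Doc.set_extension('is_greeting', default=False)

    doc = nlp(u'Hello world')
    doc._.is_greeting = True
    assert doc._.is_greeting is True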
@classmethod
def get_extension(cls, name):
@ -109,15 +110,14 @@ cdef class Doc:
orths_and_spaces=None):
"""Create a Doc object.
vocab (Vocab): A vocabulary object, which must match any models you want
to use (e.g. tokenizer, parser, entity recognizer).
vocab (Vocab): A vocabulary object, which must match any models you
want to use (e.g. tokenizer, parser, entity recognizer).
words (list or None): A list of unicode strings to add to the document
as words. If `None`, defaults to empty list.
spaces (list or None): A list of boolean values, of the same length as
words. True means that the word is followed by a space, False means
it is not. If `None`, defaults to `[True]*len(words)`
user_data (dict or None): Optional extra data to attach to the Doc.
RETURNS (Doc): The newly constructed object.
"""
self.vocab = vocab
@ -153,10 +153,10 @@ cdef class Doc:
spaces = [True] * len(words)
elif len(spaces) != len(words):
raise ValueError(
"Arguments 'words' and 'spaces' should be sequences of the "
"same length, or 'spaces' should be left default at None. "
"spaces should be a sequence of booleans, with True meaning "
"that the word owns a ' ' character following it.")
"Arguments 'words' and 'spaces' should be sequences of "
"the same length, or 'spaces' should be left default at "
"None. spaces should be a sequence of booleans, with True "
"meaning that the word owns a ' ' character following it.")
orths_and_spaces = zip(words, spaces)
if orths_and_spaces is not None:
for orth_space in orths_and_spaces:
@ -166,7 +166,8 @@ cdef class Doc:
elif isinstance(orth_space, bytes):
raise ValueError(
"orths_and_spaces expects either List(unicode) or "
"List((unicode, bool)). Got bytes instance: %s" % (str(orth_space)))
"List((unicode, bool)). "
"Got bytes instance: %s" % (str(orth_space)))
else:
orth, has_space = orth_space
# Note that we pass self.mem here --- we have ownership, if LexemeC
@ -186,7 +187,8 @@ cdef class Doc:
def __getitem__(self, object i):
"""Get a `Token` or `Span` object.
i (int or tuple) The index of the token, or the slice of the document to get.
i (int or tuple): The index of the token, or the slice of the document
to get.
RETURNS (Token or Span): The token at `doc[i]`, or the span at
`doc[start : end]`.
@ -199,11 +201,11 @@ cdef class Doc:
>>> doc[start : end]
Get a `Span` object, starting at position `start` and ending at
position `end`, where `start` and `end` are token indices. For
instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and 4.
Stepped slices (e.g. `doc[start : end : step]`) are not supported,
as `Span` objects must be contiguous (cannot have gaps). You can use
negative indices and open-ended ranges, which have their normal
Python semantics.
instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and
4. Stepped slices (e.g. `doc[start : end : step]`) are not
supported, as `Span` objects must be contiguous (cannot have gaps).
You can use negative indices and open-ended ranges, which have
their normal Python semantics.
"""
if isinstance(i, slice):
start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
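A short sketch of the indexing semantics documented above (assuming a loaded pipeline `nlp`):

    doc = nlp(u'Give it back! He pleaded.')
    assert doc[0].text == u'Give'   # Token at position 0
    assert doc[-1].text == u'.'     # negative indices, usual Python semantics
    span = doc[1:3]                 # Span over tokens 1 and 2
    span = doc[3:]                  # open-ended ranges are fine
    # doc[1:5:2] raises a ValueError: stepped slices are not supported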
@ -262,8 +264,10 @@ cdef class Doc:
doc (Doc): The parent document.
start (int): The index of the first character of the span.
end (int): The index of the first character after the span.
label (uint64 or string): A label to attach to the Span, e.g. for named entities.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
label (uint64 or string): A label to attach to the Span, e.g. for
named entities.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
the span.
RETURNS (Span): The newly constructed object.
"""
if not isinstance(label, int):
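The hunk above appears to belong to `Doc.char_span`. A sketch under that assumption, with a pipeline loaded as `nlp`:

    doc = nlp(u'I like New York')
    # 7 and 15 are the character offsets of 'New York' in the text
    span = doc.char_span(7, 15, label=u'GPE')
    assert span.text == u'New York'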
@ -377,13 +381,14 @@ cdef class Doc:
return self.text
property ents:
"""Iterate over the entities in the document. Yields named-entity `Span`
objects, if the entity recognizer has been applied to the document.
"""Iterate over the entities in the document. Yields named-entity
`Span` objects, if the entity recognizer has been applied to the
document.
YIELDS (Span): Entities in the document.
EXAMPLE: Iterate over the span to get individual Token objects, or access
the label:
EXAMPLE: Iterate over the span to get individual Token objects,
or access the label:
>>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
>>> ents = list(tokens.ents)
@ -456,10 +461,11 @@ cdef class Doc:
property noun_chunks:
"""Iterate over the base noun phrases in the document. Yields base
noun-phrase #[code Span] objects, if the document has been syntactically
parsed. A base noun phrase, or "NP chunk", is a noun phrase that does
not permit other NPs to be nested within it so no NP-level
coordination, no prepositional phrases, and no relative clauses.
noun-phrase #[code Span] objects, if the document has been
syntactically parsed. A base noun phrase, or "NP chunk", is a noun
phrase that does not permit other NPs to be nested within it, so no
NP-level coordination, no prepositional phrases, and no relative
clauses.
YIELDS (Span): Noun chunks in the document.
"""
@ -467,12 +473,14 @@ cdef class Doc:
if not self.is_parsed:
raise ValueError(
"noun_chunks requires the dependency parse, which "
"requires data to be installed. For more info, see the "
"requires a statistical model to be installed and loaded. "
"For more info, see the "
"documentation: \n%s\n" % about.__docs_models__)
# Accumulate the result before beginning to iterate over it. This prevents
# the tokenisation from being changed out from under us during the iteration.
# The tricky thing here is that Span accepts its tokenisation changing,
# so it's okay once we have the Span objects. See Issue #375
# Accumulate the result before beginning to iterate over it. This
# prevents the tokenisation from being changed out from under us
# during the iteration. The tricky thing here is that Span accepts
# its tokenisation changing, so it's okay once we have the Span
# objects. See Issue #375.
spans = []
for start, end, label in self.noun_chunks_iterator(self):
spans.append(Span(self, start, end, label=label))
@ -497,8 +505,9 @@ cdef class Doc:
if not self.is_parsed:
raise ValueError(
"sentence boundary detection requires the dependency parse, which "
"requires data to be installed. For more info, see the "
"Sentence boundary detection requires the dependency "
"parse, which requires a statistical model to be "
"installed and loaded. For more info, see the "
"documentation: \n%s\n" % about.__docs_models__)
cdef int i
start = 0
@ -537,12 +546,11 @@ cdef class Doc:
@cython.boundscheck(False)
cpdef np.ndarray to_array(self, object py_attr_ids):
"""Export given token attributes to a numpy `ndarray`.
If `attr_ids` is a sequence of M attributes, the output array will
be of shape `(N, M)`, where N is the length of the `Doc`
(in tokens). If `attr_ids` is a single attribute, the output shape will
be (N,). You can specify attributes by integer ID (e.g. spacy.attrs.LEMMA)
or string name (e.g. 'LEMMA' or 'lemma').
If `attr_ids` is a sequence of M attributes, the output array will be
of shape `(N, M)`, where N is the length of the `Doc` (in tokens). If
`attr_ids` is a single attribute, the output shape will be (N,). You
can specify attributes by integer ID (e.g. spacy.attrs.LEMMA) or
string name (e.g. 'LEMMA' or 'lemma').
attr_ids (list): A list of attributes (int IDs or string names).
RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
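A sketch of the shapes described above (assuming `nlp` is loaded):

    from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA

    doc = nlp(u'Check out https://spacy.io')
    # M attributes give an (N, M) matrix, one row per token
    np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
    assert np_array.shape == (len(doc), 4)
    # per the docstring, a single attribute (or string name) gives (N,)
    assert doc.to_array('LOWER').shape == (len(doc),)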
@ -641,13 +649,12 @@ cdef class Doc:
def from_array(self, attrs, array):
if SENT_START in attrs and HEAD in attrs:
raise ValueError(
"Conflicting attributes specified in doc.from_array():\n"
"Conflicting attributes specified in doc.from_array(): "
"(HEAD, SENT_START)\n"
"The HEAD attribute currently sets sentence boundaries implicitly,\n"
"based on the tree structure. This means the HEAD attribute would "
"potentially override the sentence boundaries set by SENT_START.\n"
"See https://github.com/spacy-io/spaCy/issues/235 for details and "
"workarounds, and to propose solutions.")
"The HEAD attribute currently sets sentence boundaries "
"implicitly, based on the tree structure. This means the HEAD "
"attribute would potentially override the sentence boundaries "
"set by SENT_START.")
cdef int i, col
cdef attr_id_t attr_id
cdef TokenC* tokens = self.c
@ -675,18 +682,14 @@ cdef class Doc:
return self
def get_lca_matrix(self):
'''
Calculates the lowest common ancestor matrix
for a given Spacy doc.
Returns LCA matrix containing the integer index
of the ancestor, or -1 if no common ancestor is
found (ex if span excludes a necessary ancestor).
Apologies about the recursion, but the
impact on performance is negligible given
the natural limitations on the depth of a typical human sentence.
'''
"""Calculates the lowest common ancestor matrix for a given `Doc`.
Returns LCA matrix containing the integer index of the ancestor, or -1
if no common ancestor is found (e.g. if the span excludes a necessary
ancestor). Apologies about the recursion, but the impact on
performance is negligible given the natural limitations on the depth
of a typical human sentence.
"""
# Efficiency notes:
#
# We can easily improve the performance here by iterating in Cython.
# To loop over the tokens in Cython, the easiest way is:
# for token in doc.c[:doc.c.length]:
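A usage sketch, mirroring the `Span.get_lca_matrix` example in the docs below (assumes a parsed doc from a loaded pipeline `nlp`):

    doc = nlp(u'I like New York in Autumn')
    lca = doc.get_lca_matrix()
    # lca[j][k] holds the token index of the lowest common ancestor of
    # tokens j and k, or -1 if there is none; lca[j][j] == j.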
@ -719,7 +722,6 @@ cdef class Doc:
token_k = self[k]
lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix)
lca_matrix[k][j] = lca_matrix[j][k]
return lca_matrix
def to_disk(self, path, **exclude):
@ -819,14 +821,15 @@ cdef class Doc:
return self
def merge(self, int start_idx, int end_idx, *args, **attributes):
"""Retokenize the document, such that the span at `doc.text[start_idx : end_idx]`
is merged into a single token. If `start_idx` and `end_idx `do not mark
start and end token boundaries, the document remains unchanged.
"""Retokenize the document, such that the span at
`doc.text[start_idx : end_idx]` is merged into a single token. If
`start_idx` and `end_idx` do not mark start and end token boundaries,
the document remains unchanged.
start_idx (int): The character index of the start of the slice to merge.
end_idx (int): The character index after the end of the slice to merge.
start_idx (int): Character index of the start of the slice to merge.
end_idx (int): Character index after the end of the slice to merge.
**attributes: Attributes to assign to the merged token. By default,
attributes are inherited from the syntactic root token of the span.
attributes are inherited from the syntactic root of the span.
RETURNS (Token): The newly merged token, or `None` if the start and end
indices did not fall at token boundaries.
"""
@ -847,10 +850,10 @@ cdef class Doc:
attributes[ENT_TYPE] = attributes['ent_type']
elif args:
raise ValueError(
"Doc.merge received %d non-keyword arguments. "
"Expected either 3 arguments (deprecated), or 0 (use keyword arguments). "
"Doc.merge received %d non-keyword arguments. Expected either "
"3 arguments (deprecated), or 0 (use keyword arguments). "
"Arguments supplied:\n%s\n"
"Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes)))
"Keyword arguments: %s\n" % (len(args), repr(args), repr(attributes)))
# More deprecated attribute handling =/
if 'label' in attributes:
@ -882,8 +885,9 @@ cdef class Doc:
Token.set_struct_attr(token, attr_name, attr_value)
# Begin by setting all the head indices to absolute token positions
# This is easier to work with for now than the offsets
# Before thinking of something simpler, beware the case where a dependency
# bridges over the entity. Here the alignment of the tokens changes.
# Before thinking of something simpler, beware the case where a
# dependency bridges over the entity. Here the alignment of the
# tokens changes.
span_root = span.root.i
token.dep = span.root.dep
# We update token.lex after keeping span root and dep, since
@ -932,8 +936,9 @@ cdef class Doc:
>>> trees = doc.print_tree()
>>> trees[1]
{'modifiers': [
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice',
'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP',
'lemma': 'Alice'},
{'modifiers': [
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
@ -1008,7 +1013,7 @@ def pickle_doc(doc):
def unpickle_doc(vocab, hooks_and_data, bytes_data):
user_data, doc_hooks, span_hooks, token_hooks = dill.loads(hooks_and_data)
doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data,
exclude='user_data')
doc.user_hooks.update(doc_hooks)
@ -1018,4 +1023,3 @@ def unpickle_doc(vocab, hooks_and_data, bytes_data):
copy_reg.pickle(Doc, pickle_doc, unpickle_doc)

View File

@ -35,15 +35,16 @@ cdef class Span:
def has_extension(cls, name):
return name in Underscore.span_extensions
def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None,
vector_norm=None):
def __cinit__(self, Doc doc, int start, int end, attr_t label=0,
vector=None, vector_norm=None):
"""Create a `Span` object from the slice `doc[start : end]`.
doc (Doc): The parent document.
start (int): The index of the first token of the span.
end (int): The index of the first token after the span.
label (uint64): A label to attach to the Span, e.g. for named entities.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation
of the span.
RETURNS (Span): The newly constructed object.
"""
if not (0 <= start <= end <= len(doc)):
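A construction sketch (assuming `nlp` is loaded; the label is interned through the StringStore, since the argument is a uint64):

    from spacy.tokens import Span

    doc = nlp(u'I like New York')
    span = Span(doc, 2, 4, label=doc.vocab.strings.add(u'GPE'))
    assert span.text == u'New York'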
@ -162,7 +163,8 @@ cdef class Span:
attributes are inherited from the syntactic root token of the span.
RETURNS (Token): The newly merged token.
"""
return self.doc.merge(self.start_char, self.end_char, *args, **attributes)
return self.doc.merge(self.start_char, self.end_char, *args,
**attributes)
def similarity(self, other):
"""Make a semantic similarity estimate. The default estimate is cosine
@ -179,24 +181,19 @@ cdef class Span:
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
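A sketch, assuming a model with word vectors is loaded as `nlp`:

    doc = nlp(u'green apples and red oranges')
    apples, oranges = doc[0:2], doc[3:5]
    score = apples.similarity(oranges)   # cosine similarity, a float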
def get_lca_matrix(self):
'''
Calculates the lowest common ancestor matrix
for a given Spacy span.
Returns LCA matrix containing the integer index
of the ancestor, or -1 if no common ancestor is
found (ex if span excludes a necessary ancestor).
Apologies about the recursion, but the
impact on performance is negligible given
the natural limitations on the depth of a typical human sentence.
'''
"""Calculates the lowest common ancestor matrix for a given `Span`.
Returns LCA matrix containing the integer index of the ancestor, or -1
if no common ancestor is found (e.g. if the span excludes a necessary
ancestor). Apologies about the recursion, but the impact on
performance is negligible given the natural limitations on the depth
of a typical human sentence.
"""
def __pairwise_lca(token_j, token_k, lca_matrix, margins):
offset = margins[0]
token_k_head = token_k.head if token_k.head.i in range(*margins) else token_k
token_j_head = token_j.head if token_j.head.i in range(*margins) else token_j
token_j_i = token_j.i - offset
token_k_i = token_k.i - offset
if lca_matrix[token_j_i][token_k_i] != -2:
return lca_matrix[token_j_i][token_k_i]
elif token_j == token_k:
@ -209,23 +206,19 @@ cdef class Span:
lca_index = -1
else:
lca_index = __pairwise_lca(token_j_head, token_k_head, lca_matrix, margins)
lca_matrix[token_j_i][token_k_i] = lca_index
lca_matrix[token_k_i][token_j_i] = lca_index
return lca_index
lca_matrix = numpy.empty((len(self), len(self)), dtype=numpy.int32)
lca_matrix.fill(-2)
margins = [self.start, self.end]
for j in range(len(self)):
token_j = self[j]
for k in range(len(self)):
token_k = self[k]
lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix, margins)
lca_matrix[k][j] = lca_matrix[j][k]
return lca_matrix
cpdef np.ndarray to_array(self, object py_attr_ids):
@ -349,7 +342,8 @@ cdef class Span:
"""The text content of the span with a trailing whitespace character if
the last token has one.
RETURNS (unicode): The text content of the span (with trailing whitespace).
RETURNS (unicode): The text content of the span (with trailing
whitespace).
"""
def __get__(self):
return u''.join([t.text_with_ws for t in self])
@ -358,7 +352,8 @@ cdef class Span:
"""Yields base noun-phrase `Span` objects, if the document has been
syntactically parsed. A base noun phrase, or "NP chunk", is a noun
phrase that does not permit other NPs to be nested within it, so no
NP-level coordination, no prepositional phrases, and no relative clauses.
NP-level coordination, no prepositional phrases, and no relative
clauses.
YIELDS (Span): Base noun-phrase `Span` objects
"""
@ -366,7 +361,8 @@ cdef class Span:
if not self.doc.is_parsed:
raise ValueError(
"noun_chunks requires the dependency parse, which "
"requires data to be installed. For more info, see the "
"requires a statistical model to be installed and loaded. "
"For more info, see the "
"documentation: \n%s\n" % about.__docs_models__)
# Accumulate the result before beginning to iterate over it. This prevents
# the tokenisation from being changed out from under us during the iteration.
@ -385,9 +381,9 @@ cdef class Span:
RETURNS (Token): The root token.
EXAMPLE: The root token has the shortest path to the root of the sentence
(or is the root itself). If multiple words are equally high in the
tree, the first word is taken. For example:
EXAMPLE: The root token has the shortest path to the root of the
sentence (or is the root itself). If multiple words are equally
high in the tree, the first word is taken. For example:
>>> toks = nlp(u'I like New York in Autumn.')
@ -437,11 +433,11 @@ cdef class Span:
if self.doc.c[i].head == 0:
return self.doc[i]
# If we don't have a sentence root, we do something that's not so
# algorithmically clever, but I think should be quite fast, especially
# for short spans.
# algorithmically clever, but I think should be quite fast,
# especially for short spans.
# For each word, we count the path length, and arg min this measure.
# We could use better tree logic to save steps here...But I think this
# should be okay.
# We could use better tree logic to save steps here...But I
# think this should be okay.
cdef int current_best = self.doc.length
cdef int root = -1
for i in range(self.start, self.end):
@ -463,7 +459,7 @@ cdef class Span:
YIELDS (Token): A left-child of a token of the span.
"""
def __get__(self):
for token in reversed(self): # Reverse, so we get the tokens in order
for token in reversed(self): # Reverse, so we get tokens in order
for left in token.lefts:
if left.i < self.start:
yield left
@ -493,7 +489,7 @@ cdef class Span:
yield from word.subtree
property ent_id:
"""An (integer) entity ID. Usually assigned by patterns in the `Matcher`.
"""An (integer) entity ID.
RETURNS (uint64): The entity ID.
"""
@ -503,8 +499,8 @@ cdef class Span:
def __set__(self, hash_t key):
# TODO
raise NotImplementedError(
"Can't yet set ent_id from Span. Vote for this feature on the issue "
"tracker: http://github.com/explosion/spaCy/issues")
"Can't yet set ent_id from Span. Vote for this feature on "
"the issue tracker: http://github.com/explosion/spaCy/issues")
property ent_id_:
"""A (string) entity ID. Usually assigned by patterns in the `Matcher`.
@ -517,13 +513,16 @@ cdef class Span:
def __set__(self, hash_t key):
# TODO
raise NotImplementedError(
"Can't yet set ent_id_ from Span. Vote for this feature on the issue "
"tracker: http://github.com/explosion/spaCy/issues")
"Can't yet set ent_id_ from Span. Vote for this feature on the "
"issue tracker: http://github.com/explosion/spaCy/issues")
property orth_:
# TODO: docstring
"""Verbatim text content (identical to Span.text). Exists mostly for
consistency with other attributes.
RETURNS (unicode): The span's text."""
def __get__(self):
return ''.join([t.string for t in self]).strip()
return ''.join([t.orth_ for t in self]).strip()
property lemma_:
"""The span's lemma.
@ -534,19 +533,19 @@ cdef class Span:
return ' '.join([t.lemma_ for t in self]).strip()
property upper_:
# TODO: docstring
"""Deprecated. Use Span.text.upper() instead."""
def __get__(self):
return ''.join([t.string.upper() for t in self]).strip()
return ''.join([t.text_with_ws.upper() for t in self]).strip()
property lower_:
# TODO: docstring
"""Deprecated. Use Span.text.lower() instead."""
def __get__(self):
return ''.join([t.string.lower() for t in self]).strip()
return ''.join([t.text_with_ws.lower() for t in self]).strip()
property string:
# TODO: docstring
"""Deprecated: Use Span.text instead."""
def __get__(self):
return ''.join([t.string for t in self])
return ''.join([t.text_with_ws for t in self])
property label_:
"""The span's label.
@ -570,7 +569,8 @@ cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
n += 1
if n >= sent_length:
raise RuntimeError(
"Array bounds exceeded while searching for root word. This likely "
"means the parse tree is in an invalid state. Please report this "
"issue here: http://github.com/explosion/spaCy/issues")
"Array bounds exceeded while searching for root word. This "
"likely means the parse tree is in an invalid state. Please "
"report this issue here: "
"http://github.com/explosion/spaCy/issues")
return n

View File

@ -14,17 +14,18 @@ from ..typedefs cimport hash_t
from ..lexeme cimport Lexeme
from .. import parts_of_speech
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_OOV
from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from ..attrs cimport LEMMA, POS, TAG, DEP
from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL
from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX
from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP
from ..compat import is_config
from .. import about
from .underscore import Underscore
cdef class Token:
"""An individual token i.e. a word, punctuation symbol, whitespace, etc."""
"""An individual token i.e. a word, punctuation symbol, whitespace,
etc."""
@classmethod
def set_extension(cls, name, default=None, method=None,
getter=None, setter=None):
@ -171,10 +172,11 @@ cdef class Token:
return self.orth_
property text_with_ws:
"""The text content of the token with a trailing whitespace character if
it has one.
"""The text content of the token with a trailing whitespace character
if it has one.
RETURNS (unicode): The text content of the span (with trailing whitespace).
RETURNS (unicode): The text content of the token (with trailing
whitespace).
"""
def __get__(self):
cdef unicode orth = self.vocab.strings[self.c.lex.orth]
@ -306,9 +308,8 @@ cdef class Token:
def __set__(self, value):
if self.doc.is_parsed:
raise ValueError(
'Refusing to write to token.sent_start if its document is parsed, '
'because this may cause inconsistent state. '
'See https://github.com/spacy-io/spaCy/issues/235 for workarounds.')
"Refusing to write to token.sent_start if its document "
"is parsed, because this may cause inconsistent state.")
if value is None:
self.c.sent_start = 0
elif value is True:
@ -316,13 +317,12 @@ cdef class Token:
elif value is False:
self.c.sent_start = -1
else:
raise ValueError("Invalid value for token.sent_start -- must be one of "
"None, True, False")
raise ValueError("Invalid value for token.sent_start. Must be "
"one of: None, True, False")
property lefts:
def __get__(self):
"""
The leftward immediate children of the word, in the syntactic
"""The leftward immediate children of the word, in the syntactic
dependency parse.
"""
cdef int nr_iter = 0
@ -334,13 +334,12 @@ cdef class Token:
nr_iter += 1
# This is ugly, but it's a way to guard out infinite loops
if nr_iter >= 10000000:
raise RuntimeError(
"Possibly infinite loop encountered while looking for token.lefts")
raise RuntimeError("Possibly infinite loop encountered "
"while looking for token.lefts")
property rights:
def __get__(self):
"""
The rightward immediate children of the word, in the syntactic
"""The rightward immediate children of the word, in the syntactic
dependency parse.
"""
cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i)
@ -352,27 +351,26 @@ cdef class Token:
ptr -= 1
nr_iter += 1
if nr_iter >= 10000000:
raise RuntimeError(
"Possibly infinite loop encountered while looking for token.rights")
raise RuntimeError("Possibly infinite loop encountered "
"while looking for token.rights")
tokens.reverse()
for t in tokens:
yield t
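A sketch of `lefts` and `rights` (the exact children depend on the loaded model's parse):

    doc = nlp(u'I like New York in Autumn.')
    lefts = [t.text for t in doc[1].lefts]     # e.g. ['I']
    rights = [t.text for t in doc[1].rights]   # e.g. ['York', 'in', '.']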
property children:
"""
A sequence of the token's immediate syntactic children.
"""A sequence of the token's immediate syntactic children.
Yields: Token A child token such that child.head==self
YIELDS (Token): A child token such that child.head==self
"""
def __get__(self):
yield from self.lefts
yield from self.rights
property subtree:
"""
A sequence of all the token's syntactic descendents.
"""A sequence of all the token's syntactic descendents.
Yields: Token A descendent token such that self.is_ancestor(descendent)
YIELDS (Token): A descendant token such that
`self.is_ancestor(descendant)`.
"""
def __get__(self):
for word in self.lefts:
@ -456,13 +454,15 @@ cdef class Token:
if self.c.head > 0: # left dependent
old_head.c.l_kids -= 1
if self.c.l_edge == old_head.c.l_edge:
# the token dominates the left edge so the left edge of the head
# may change when the token is reattached
# it may not change if the new head is a descendant of the current head
# the token dominates the left edge so the left edge of
# the head may change when the token is reattached, it may
# not change if the new head is a descendant of the current
# head
new_edge = self.c.l_edge
# the new l_edge is the left-most l_edge on any of the other dependents
# where the l_edge is left of the head, otherwise it is the head
# the new l_edge is the left-most l_edge on any of the
# other dependents where the l_edge is left of the head,
# otherwise it is the head
if not is_desc:
new_edge = old_head.i
for child in old_head.children:
@ -472,14 +472,15 @@ cdef class Token:
new_edge = child.c.l_edge
old_head.c.l_edge = new_edge
# walk up the tree from old_head and assign new l_edge to ancestors
# until an ancestor already has an l_edge that's further left
# walk up the tree from old_head and assign new l_edge to
# ancestors until an ancestor already has an l_edge that's
# further left
for anc in old_head.ancestors:
if anc.c.l_edge <= new_edge:
break
anc.c.l_edge = new_edge
elif self.c.head < 0: # right dependent
elif self.c.head < 0: # right dependent
old_head.c.r_kids -= 1
# do the same thing as for l_edge
if self.c.r_edge == old_head.c.r_edge:
@ -500,7 +501,7 @@ cdef class Token:
anc.c.r_edge = new_edge
# update number of deps of new head
if rel_newhead_i > 0: # left dependent
if rel_newhead_i > 0: # left dependent
new_head.c.l_kids += 1
# walk up the tree from new head and set l_edge to self.l_edge
# until you hit a token with an l_edge further to the left
@ -511,7 +512,7 @@ cdef class Token:
break
anc.c.l_edge = self.c.l_edge
elif rel_newhead_i < 0: # right dependent
elif rel_newhead_i < 0: # right dependent
new_head.c.r_kids += 1
# do the same as for l_edge
if self.c.r_edge > new_head.c.r_edge:
@ -572,8 +573,8 @@ cdef class Token:
property ent_iob_:
"""IOB code of named entity tag. "B" means the token begins an entity,
"I" means it is inside an entity, "O" means it is outside an entity, and
"" means no entity tag is set.
"I" means it is inside an entity, "O" means it is outside an entity,
and "" means no entity tag is set.
RETURNS (unicode): IOB code of named entity tag.
"""
@ -582,8 +583,7 @@ cdef class Token:
return iob_strings[self.c.ent_iob]
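A sketch (the exact codes depend on the loaded NER model):

    doc = nlp(u'Mr. Best flew to New York')
    iob = [t.ent_iob_ for t in doc]
    # e.g. ['O', 'B', 'O', 'O', 'B', 'I'] -- 'New' begins an entity,
    # 'York' continues it, and '' would mean no entity tag is set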
property ent_id:
"""ID of the entity the token is an instance of, if any. Usually
assigned by patterns in the Matcher.
"""ID of the entity the token is an instance of, if any.
RETURNS (uint64): ID of the entity.
"""
@ -594,8 +594,7 @@ cdef class Token:
self.c.ent_id = key
property ent_id_:
"""ID of the entity the token is an instance of, if any. Usually
assigned by patterns in the Matcher.
"""ID of the entity the token is an instance of, if any.
RETURNS (unicode): ID of the entity.
"""
@ -606,34 +605,70 @@ cdef class Token:
self.c.ent_id = self.vocab.strings.add(name)
property whitespace_:
"""Trailing space character if present.
RETURNS (unicode): The whitespace character.
"""
def __get__(self):
return ' ' if self.c.spacy else ''
property orth_:
"""Verbatim text content (identical to `Token.text`). Existst mostly
for consistency with the other attributes.
RETURNS (unicode): The token text.
"""
def __get__(self):
return self.vocab.strings[self.c.lex.orth]
property lower_:
"""Lowercase form of the token text. Equivalent to
`Token.text.lower()`.
RETURNS (unicode): The lowercase token text.
"""
def __get__(self):
return self.vocab.strings[self.c.lex.lower]
property norm_:
"""The token's norm, i.e. a normalised form of the token text.
Usually set in the language's tokenizer exceptions or norm exceptions.
RETURNS (unicode): The norm.
"""
def __get__(self):
return self.vocab.strings[self.c.lex.norm]
property shape_:
"""Transform of the tokens's string, to show orthographic features.
For example, "Xxxx" or "dd".
RETURNS (unicode): The token shape.
"""
def __get__(self):
return self.vocab.strings[self.c.lex.shape]
property prefix_:
"""A length-N substring from the start of the token. Defaults to `N=1`.
RETURNS (unicode): The token's prefix.
"""
def __get__(self):
return self.vocab.strings[self.c.lex.prefix]
property suffix_:
"""A length-N substring from the end of the token. Defaults to `N=3`.
RETURNS (unicode): The token's suffix.
"""
def __get__(self):
return self.vocab.strings[self.c.lex.suffix]
property lang_:
"""Language of the parent document's vocabulary, e.g. 'en'.
RETURNS (unicode): The language code.
"""
def __get__(self):
return self.vocab.strings[self.c.lex.lang]
@ -648,65 +683,152 @@ cdef class Token:
self.c.lemma = self.vocab.strings.add(lemma_)
property pos_:
"""Coarse-grained part-of-speech.
RETURNS (unicode): The part-of-speech tag.
"""
def __get__(self):
return parts_of_speech.NAMES[self.c.pos]
property tag_:
"""Fine-grained part-of-speech.
RETURNS (unicode): The part-of-speech tag.
"""
def __get__(self):
return self.vocab.strings[self.c.tag]
def __set__(self, tag):
self.tag = self.vocab.strings.add(tag)
property dep_:
"""Syntactic dependency relation.
RETURNS (unicode): The dependency label.
"""
def __get__(self):
return self.vocab.strings[self.c.dep]
def __set__(self, unicode label):
self.c.dep = self.vocab.strings.add(label)
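A sketch of the string-view attributes (assuming `nlp` is loaded; `tag_` and `dep_` are writable, and setting the string updates the underlying ID):

    doc = nlp(u'Give it back!')
    token = doc[0]
    print(token.pos_, token.tag_, token.dep_)   # e.g. VERB VB ROOT
    token.dep_ = u'ROOT'   # interned via the StringStore, sets token.c.dep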
property is_oov:
"""Is the token out-of-vocabulary?
RETURNS (bool): Whether the token is out-of-vocabulary.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV)
property is_stop:
"""Is the token part of a "stop list"? (defined by the language data)
RETURNS (bool): Whether the token is a stop word.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_STOP)
property is_alpha:
"""Does the token consist of alphabetic characters? Equivalent to
`token.text.isalpha()`.
RETURNS (bool): Whether the token consists of alpha characters.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ALPHA)
property is_ascii:
"""Does the token consist of ASCII characters? Equivalent to
`[any(ord(c) >= 128 for c in token.text)]`.
RETURNS (bool): Whether the token consists of ASCII characters.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ASCII)
property is_digit:
"""Does the token consist of digits? Equivalent to
`token.text.isdigit()`.
RETURNS (bool): Whether the token consists of digits.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_DIGIT)
property is_lower:
"""Is the token in lowercase? Equivalent to `token.text.islower()`.
RETURNS (bool): Whether the token is in lowercase.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LOWER)
property is_upper:
"""Is the token in uppercase? Equivalent to `token.text.isupper()`.
RETURNS (bool): Whether the token is in uppercase.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_UPPER)
property is_title:
"""Is the token in titlecase? Equivalent to `token.text.istitle()`.
RETURNS (bool): Whether the token is in titlecase.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_TITLE)
property is_punct:
"""Is the token punctuation?
RETURNS (bool): Whether the token is punctuation.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_PUNCT)
property is_space:
"""Does the token consist of whitespace characters? Equivalent to
`token.text.isspace()`.
RETURNS (bool): Whether the token consists of whitespace characters.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE)
property is_bracket:
"""Is the token a bracket?
RETURNS (bool): Whether the token is a bracket.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)
property is_quote:
"""Is the token a quotation mark?
RETURNS (bool): Whether the token is a quotation mark.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)
property is_left_punct:
"""Is the token a left punctuation mark, e.g. "("?
RETURNS (bool): Whether the token is a left punctuation mark.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)
property is_right_punct:
"""Is the token a left punctuation mark, e.g. "("?
RETURNS (bool): Whether the token is a left punctuation mark.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)
property like_url:
"""Does the token resemble a URL?
RETURNS (bool): Whether the token resembles a URL.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_URL)
property like_num:
"""Does the token represent a number? e.g. "10.9", "10", "ten", etc.
RETURNS (bool): Whether the token resembles a number.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_NUM)
property like_email:
"""Does the token resemble an email address?
RETURNS (bool): Whether the token resembles an email address.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL)
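A sketch pulling several of the flags together (assumes the tokenizer keeps the email address as a single token, as the English defaults do):

    doc = nlp(u'Send help to help@example.com ASAP!')
    assert doc[0].is_title      # 'Send'
    assert doc[3].like_email    # 'help@example.com'
    assert doc[4].is_upper      # 'ASAP'
    assert doc[-1].is_punct     # '!'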

View File

@ -248,6 +248,28 @@ p
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "get_lca_matrix") Span.get_lca_matrix
+tag method
p
| Calculates the lowest common ancestor matrix for a given #[code Span].
| Returns LCA matrix containing the integer index of the ancestor, or
| #[code -1] if no common ancestor is found, e.g. if span excludes a
| necessary ancestor.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn')
span = doc[1:4]
matrix = span.get_lca_matrix()
# array([[0, 0, 0], [0, 1, 2], [0, 2, 2]], dtype=int32)
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
+cell The lowest common ancestor matrix of the #[code Span].
+h(2, "to_array") Span.to_array
+tag method
+tag-new(2)
@ -495,6 +517,18 @@ p
| The text content of the span with a trailing whitespace character
| if the last token has one.
+row
+cell #[code orth]
+cell int
+cell ID of the verbatim text content.
+row
+cell #[code orth_]
+cell unicode
+cell
| Verbatim text content (identical to #[code Span.text]). Exists
| mostly for consistency with the other attributes.
+row
+cell #[code label]
+cell int

View File

@ -489,15 +489,35 @@ p The L2 norm of the token's vector representation.
+cell unicode
+cell Base form of the token, with no inflectional suffixes.
+row
+cell #[code norm]
+cell int
+cell
| The token's norm, i.e. a normalised form of the token text.
| Usually set in the language's
| #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions] or
| #[+a("/usage/adding-languages#norm-exceptions") norm exceptions].
+row
+cell #[code norm_]
+cell unicode
+cell
| The token's norm, i.e. a normalised form of the token text.
| Usually set in the language's
| #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions] or
| #[+a("/usage/adding-languages#norm-exceptions") norm exceptions].
+row
+cell #[code lower]
+cell int
+cell Lower-case form of the token.
+cell Lowercase form of the token.
+row
+cell #[code lower_]
+cell unicode
+cell Lower-case form of the token.
+cell
| Lowercase form of the token text. Equivalent to
| #[code Token.text.lower()].
+row
+cell #[code shape]
@ -537,7 +557,9 @@ p The L2 norm of the token's vector representation.
+row
+cell #[code suffix_]
+cell unicode
+cell Length-N substring from the end of the token. Defaults to #[code N=3].
+cell
| Length-N substring from the end of the token. Defaults to
| #[code N=3].
+row
+cell #[code is_alpha]
@ -672,6 +694,7 @@ p The L2 norm of the token's vector representation.
+cell #[code lang]
+cell int
+cell Language of the parent document's vocabulary.
+row
+cell #[code lang_]
+cell unicode