From 1b520e7bab0b5499f0614773569c6bfd270c09d4 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 28 Sep 2016 11:15:13 +0200
Subject: [PATCH] Improve docstrings for Doc object

---
 spacy/tokens/doc.pyx | 253 ++++++++++++++++++++++++++++++-------------
 1 file changed, 179 insertions(+), 74 deletions(-)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 418c3f8f0..4916df077 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -59,10 +59,42 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
 
 cdef class Doc:
     """
-    Container class for annotated text. Constructed via English.__call__ or
-    Tokenizer.__call__.
+    A sequence of `Token` objects. Access sentences and named entities,
+    export annotations to numpy arrays, and losslessly serialize to
+    compressed binary strings.
+
+    Aside: Internals
+        The `Doc` object holds an array of `TokenC` structs.
+        The Python-level `Token` and `Span` objects are views of this
+        array, i.e. they don't own the data themselves.
+
+    Code: Construction 1
+        doc = nlp.tokenizer(u'Some text')
+
+    Code: Construction 2
+        doc = Doc(nlp.vocab, orths_and_spaces=[(u'Some', True), (u'text', True)])
+
     """
     def __init__(self, Vocab vocab, orths_and_spaces=None):
+        '''
+        Create a Doc object.
+
+        Aside: Implementation
+            This method of constructing a `Doc` object is usually only used
+            for deserialization. Standard usage is to construct the document
+            via a call to the language object.
+
+        Arguments:
+            vocab:
+                A Vocabulary object, which must match any models you want to
+                use (e.g. tokenizer, parser, entity recognizer).
+
+            orths_and_spaces:
+                A list of tokens in the document as a sequence of
+                `(orth_id, has_space)` tuples, where `orth_id` is an
+                integer and `has_space` is a boolean, indicating whether the
+                token has a trailing space.
+        '''
         self.vocab = vocab
         size = 20
         self.mem = Pool()
@@ -100,13 +132,23 @@ cdef class Doc:
                 # must be created.
                 self.push_back(
                     self.vocab.get(self.mem, orth), has_space)
-    
+
     def __getitem__(self, object i):
-        """Get a Token or a Span from the Doc.
-
-        Returns:
-            token (Token) or span (Span):
-        """
+        '''
+        doc[i]
+            Get the `Token` object at position `i`, where `i` is an integer.
+            Negative indexing is supported, and follows the usual Python
+            semantics, i.e. `doc[-2]` is `doc[len(doc) - 2]`.
+        doc[start : end]
+            Get a `Span` object, starting at position `start` and ending
+            at position `end`, where `start` and `end` are token indices.
+            For instance, `doc[2:5]` produces a span consisting of tokens
+            2, 3 and 4. Stepped slices (e.g. `doc[start : end : step]`)
+            are not supported, as `Span` objects must be contiguous (cannot
+            have gaps). You can use negative indices and open-ended ranges,
+            which have their normal Python semantics.
+        '''
         if isinstance(i, slice):
             start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
             return Span(self, start, stop, label=0)
@@ -120,11 +162,15 @@ cdef class Doc:
         return Token.cinit(self.vocab, &self.c[i], i, self)
 
     def __iter__(self):
-        """Iterate over the tokens.
-
-        Yields:
-            token (Token):
-        """
+        '''
+        for token in doc
+            Iterate over `Token` objects, from which the annotations can be
+            easily accessed. This is the main way of accessing `Token`
+            objects, which are the main way annotations are accessed from
+            Python. If faster-than-Python speeds are required, you can
+            instead access the annotations as a numpy array, or access the
+            underlying C data directly from Cython.
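+
+        Example:
+            # A minimal sketch; assumes `nlp` is an already-loaded pipeline.
+            doc = nlp(u'Some text')
+            assert [t.orth_ for t in doc] == [u'Some', u'text']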
+        '''
         cdef int i
         for i in range(self.length):
             if self._py_tokens[i] is not None:
                 yield self._py_tokens[i]
             else:
                 yield Token.cinit(self.vocab, &self.c[i], i, self)
@@ -133,6 +179,10 @@ cdef class Doc:
 
     def __len__(self):
+        '''
+        len(doc)
+            The number of tokens in the document.
+        '''
         return self.length
 
     def __unicode__(self):
@@ -161,7 +211,10 @@ cdef class Doc:
     property vector:
         def __get__(self):
            if self._vector is None:
-                self._vector = sum(t.vector for t in self) / len(self)
+                if len(self):
+                    self._vector = sum(t.vector for t in self) / len(self)
+                else:
+                    return numpy.zeros((self.vocab.vectors_length,), dtype='float32')
             return self._vector
 
         def __set__(self, value):
@@ -193,18 +246,22 @@ cdef class Doc:
         return u''.join(t.text_with_ws for t in self)
 
     property ents:
-        def __get__(self):
-            """Yields named-entity Span objects.
-
-            Iterate over the span to get individual Token objects, or access the label:
-
-            >>> from spacy.en import English
-            >>> nlp = English()
-            >>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
-            >>> ents = list(tokens.ents)
-            >>> ents[0].label, ents[0].label_, ''.join(t.orth_ for t in ents[0])
-            (112504, u'PERSON', u'Best ')
-            """
+        '''
+        Yields named-entity `Span` objects, if the entity recognizer
+        has been applied to the document. Iterate over the span to get
+        individual Token objects, or access the label:
+
+        Example:
+            from spacy.en import English
+            nlp = English()
+            tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
+            ents = list(tokens.ents)
+            assert ents[0].label == 346
+            assert ents[0].label_ == 'PERSON'
+            assert ents[0].orth_ == 'Best'
+            assert ents[0].text == 'Best'
+        '''
+        def __get__(self):
             cdef int i
             cdef const TokenC* token
             cdef int start = -1
@@ -263,44 +320,59 @@ cdef class Doc:
             # Set start as B
             self.c[start].ent_iob = 3
 
-    @property
-    def noun_chunks(self):
-        """Yield spans for base noun phrases."""
-        if not self.is_parsed:
-            raise ValueError(
-                "noun_chunks requires the dependency parse, which "
-                "requires data to be installed. If you haven't done so, run: "
-                "\npython -m spacy.%s.download all\n"
-                "to install the data" % self.vocab.lang)
-        # Accumulate the result before beginning to iterate over it. This prevents
-        # the tokenisation from being changed out from under us during the iteration.
-        # The tricky thing here is that Span accepts its tokenisation changing,
-        # so it's okay once we have the Span objects. See Issue #375
-        spans = []
-        for start, end, label in self.noun_chunks_iterator(self):
-            spans.append(Span(self, start, end, label=label))
-        for span in spans:
-            yield span
+    property noun_chunks:
+        '''
+        Yields base noun-phrase `Span` objects, if the document
+        has been syntactically parsed. A base noun phrase, or
+        'NP chunk', is a noun phrase that does not permit other NPs to
+        be nested within it – so no NP-level coordination, no prepositional
+        phrases, and no relative clauses.
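+
+        Example:
+            # A sketch of the expected chunks; assumes an English model is
+            # installed (the exact spans depend on the parser).
+            doc = nlp(u'A phrase with another phrase occurs.')
+            chunks = list(doc.noun_chunks)
+            assert chunks[0].text == u'A phrase'
+            assert chunks[1].text == u'another phrase'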
+        '''
+        def __get__(self):
+            if not self.is_parsed:
+                raise ValueError(
+                    "noun_chunks requires the dependency parse, which "
+                    "requires data to be installed. If you haven't done so, run: "
+                    "\npython -m spacy.%s.download all\n"
+                    "to install the data" % self.vocab.lang)
+            # Accumulate the result before beginning to iterate over it. This prevents
+            # the tokenisation from being changed out from under us during the iteration.
+            # The tricky thing here is that Span accepts its tokenisation changing,
+            # so it's okay once we have the Span objects. See Issue #375
+            spans = []
+            for start, end, label in self.noun_chunks_iterator(self):
+                spans.append(Span(self, start, end, label=label))
+            for span in spans:
+                yield span
 
-    @property
-    def sents(self):
+    property sents:
         """
-        Yield a list of sentence Span objects, calculated from the dependency parse.
+        Yields sentence `Span` objects. Sentence spans have no label.
+        To improve accuracy on informal texts, spaCy calculates sentence
+        boundaries from the syntactic dependency parse. If the parser is
+        disabled, the `sents` iterator will be unavailable.
+
+        Example:
+            from spacy.en import English
+            nlp = English()
+            doc = nlp("This is a sentence. Here's another...")
+            assert [s.root.orth_ for s in doc.sents] == ["is", "'s"]
         """
-        if not self.is_parsed:
-            raise ValueError(
-                "sentence boundary detection requires the dependency parse, which "
-                "requires data to be installed. If you haven't done so, run: "
-                "\npython -m spacy.%s.download all\n"
-                "to install the data" % self.vocab.lang)
-        cdef int i
-        start = 0
-        for i in range(1, self.length):
-            if self.c[i].sent_start:
-                yield Span(self, start, i)
-                start = i
-        if start != self.length:
-            yield Span(self, start, self.length)
+        def __get__(self):
+            if not self.is_parsed:
+                raise ValueError(
+                    "sentence boundary detection requires the dependency parse, which "
+                    "requires data to be installed. If you haven't done so, run: "
+                    "\npython -m spacy.%s.download all\n"
+                    "to install the data" % self.vocab.lang)
+            cdef int i
+            start = 0
+            for i in range(1, self.length):
+                if self.c[i].sent_start:
+                    yield Span(self, start, i)
+                    start = i
+            if start != self.length:
+                yield Span(self, start, self.length)
 
     cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
         if self.length == self.max_length:
@@ -324,9 +396,17 @@ cdef class Doc:
 
     @cython.boundscheck(False)
     cpdef np.ndarray to_array(self, object py_attr_ids):
-        """Given a list of M attribute IDs, export the tokens to a numpy ndarray
-        of shape N*M, where N is the length of the sentence.
+        """
+        Given a list of M attribute IDs, export the tokens to a numpy
+        `ndarray` of shape `(N, M)`, where `N` is the length of the
+        document. The values will be 32-bit integers.
 
+        Example:
+            from spacy import attrs
+            doc = nlp(text)
+            # All strings mapped to integers, for easy export to numpy
+            np_array = doc.to_array([attrs.LOWER, attrs.POS, attrs.ENT_TYPE, attrs.IS_ALPHA])
+
         Arguments:
             attr_ids (list[int]): A list of attribute ID ints.
@@ -351,16 +431,22 @@ cdef class Doc:
         """Produce a dict of {attribute (int): count (ints)} frequencies, keyed
         by the values of the given attribute ID.
 
-        >>> from spacy.en import English, attrs
-        >>> nlp = English()
-        >>> tokens = nlp(u'apple apple orange banana')
-        >>> tokens.count_by(attrs.ORTH)
-        {12800L: 1, 11880L: 2, 7561L: 1}
-        >>> tokens.to_array([attrs.ORTH])
-        array([[11880],
-               [11880],
-               [ 7561],
-               [12800]])
+        Example:
+            from spacy.en import English, attrs
+            nlp = English()
+            tokens = nlp(u'apple apple orange banana')
+            tokens.count_by(attrs.ORTH)
+            # {12800L: 1, 11880L: 2, 7561L: 1}
+            tokens.to_array([attrs.ORTH])
+            # array([[11880],
+            #        [11880],
+            #        [ 7561],
+            #        [12800]])
+
+        Arguments:
+            attr_id (int): The attribute ID to key the counts by.
         """
         cdef int i
         cdef attr_t attr
@@ -408,6 +494,8 @@ cdef class Doc:
             self.c[i] = parsed[i]
 
     def from_array(self, attrs, array):
+        '''Write to a `Doc` object, from an `(N, M)` array of attributes,
+        where `N` is the length of the document and `M` is the number of
+        attribute columns, matching the output of `to_array`.
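+
+        Example:
+            # A sketch of round-tripping annotations; assumes `doc` is an
+            # existing parsed Doc and `doc2` is a Doc over the same words.
+            # Only annotation attributes (e.g. TAG, HEAD, DEP, ENT_IOB,
+            # ENT_TYPE) can be written back to a Doc.
+            from spacy.attrs import TAG, DEP
+            array = doc.to_array([TAG, DEP])
+            doc2.from_array([TAG, DEP], array)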
+        '''
         cdef int i, col
         cdef attr_id_t attr_id
         cdef TokenC* tokens = self.c
@@ -448,16 +536,34 @@ cdef class Doc:
         return self
 
     def to_bytes(self):
+        '''Serialize, producing a byte string.'''
         byte_string = self.vocab.serializer.pack(self)
         cdef uint32_t length = len(byte_string)
         return struct.pack('I', length) + byte_string
 
     def from_bytes(self, data):
+        '''Deserialize, loading from bytes.'''
         self.vocab.serializer.unpack_into(data[4:], self)
         return self
 
     @staticmethod
     def read_bytes(file_):
+        '''
+        A static method, used to read serialized `Doc` objects from
+        a file.
+
+        Example:
+            from spacy.tokens.doc import Doc
+            loc = 'test_serialize.bin'
+            with open(loc, 'wb') as file_:
+                file_.write(nlp(u'This is a document.').to_bytes())
+                file_.write(nlp(u'This is another.').to_bytes())
+            docs = []
+            with open(loc, 'rb') as file_:
+                for byte_string in Doc.read_bytes(file_):
+                    docs.append(Doc(nlp.vocab).from_bytes(byte_string))
+            assert len(docs) == 2
+        '''
         keep_reading = True
         while keep_reading:
             try:
@@ -472,8 +578,7 @@ cdef class Doc:
 
     def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
               unicode ent_type):
-        """Merge a multi-word expression into a single token. Currently
-        experimental; API is likely to change."""
+        """Merge a multi-word expression into a single token."""
         cdef int start = token_by_start(self.c, self.length, start_idx)
         if start == -1:
             return None