Update docstrings and API docs for GoldParse

2017-05-21 13:53:46 +02:00 · 2017-05-21 13:53:46 +02:00 · 075f5ff87a
parent 465a1dd710
commit 075f5ff87a
2 changed files with 95 additions and 58 deletions
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -225,25 +225,17 @@ cdef class GoldParse:

    def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
                 deps=None, entities=None, make_projective=False):
-        """
-        Create a GoldParse.
+        """Create a GoldParse.

-        Arguments:
-            doc (Doc):
-                The document the annotations refer to.
-            words:
-                A sequence of unicode word strings.
-            tags:
-                A sequence of strings, representing tag annotations.
-            heads:
-                A sequence of integers, representing syntactic head offsets.
-            deps:
-                A sequence of strings, representing the syntactic relation types.
-            entities:
-                A sequence of named entity annotations, either as BILUO tag strings,
-                or as (start_char, end_char, label) tuples, representing the entity
-                positions.
-        Returns (GoldParse): The newly constructed object.
+        doc (Doc): The document the annotations refer to.
+        words (iterable): A sequence of unicode word strings.
+        tags (iterable): A sequence of strings, representing tag annotations.
+        heads (iterable): A sequence of integers, representing syntactic head offsets.
+        deps (iterable): A sequence of strings, representing the syntactic relation types.
+        entities (iterable): A sequence of named entity annotations, either as
+            BILUO tag strings, or as `(start_char, end_char, label)` tuples,
+            representing the entity positions.
+        RETURNS (GoldParse): The newly constructed object.
        """
        if words is None:
            words = [token.text for token in doc]
@@ -308,55 +300,45 @@ cdef class GoldParse:
            self.heads = proj_heads

    def __len__(self):
-        """
-        Get the number of gold-standard tokens.
+        """Get the number of gold-standard tokens.

-        Returns (int): The number of gold-standard tokens.
+        RETURNS (int): The number of gold-standard tokens.
        """
        return self.length

    @property
    def is_projective(self):
-        """
-        Whether the provided syntactic annotations form a projective dependency
-        tree.
+        """Whether the provided syntactic annotations form a projective
+        dependency tree.
        """
        return not nonproj.is_nonproj_tree(self.heads)


 def biluo_tags_from_offsets(doc, entities):
-    """
-    Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
-    scheme (biluo).
+    """Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
+    scheme (BILUO).

-    Arguments:
-        doc (Doc):
-            The document that the entity offsets refer to. The output tags will
-            refer to the token boundaries within the document.
+    doc (Doc): The document that the entity offsets refer to. The output tags
+        will refer to the token boundaries within the document.
+    entities (iterable): A sequence of `(start, end, label)` triples. `start` and
+        `end` should be character-offset integers denoting the slice into the
+        original string.

-        entities (sequence):
-            A sequence of (start, end, label) triples. start and end should be
-            character-offset integers denoting the slice into the original string.
+    RETURNS (list): A list of unicode strings, describing the tags. Each tag
+        string will be of the form either "-", "O" or "{action}-{label}", where
+        action is one of "B", "I", "L", "U". The string "-" is used where the
+        entity offsets don't align with the tokenization in the `Doc` object. The
+        training algorithm will view these as missing values. "O" denotes a
+        non-entity token. "B" denotes the beginning of a multi-token entity,
+        "I" the inside of an entity of three or more tokens, and "L" the end
+        of an entity of two or more tokens. "U" denotes a single-token entity.

-    Returns:
-        tags (list):
-            A list of unicode strings, describing the tags. Each tag string will
-            be of the form either "", "O" or "{action}-{label}", where action is one
-            of "B", "I", "L", "U". The string "-" is used where the entity
-            offsets don't align with the tokenization in the Doc object. The
-            training algorithm will view these as missing values. "O" denotes
-            a non-entity token. "B" denotes the beginning of a multi-token entity,
-            "I" the inside of an entity of three or more tokens, and "L" the end
-            of an entity of two or more tokens. "U" denotes a single-token entity.
-
-    Example:
-        text = 'I like London.'
-        entities = [(len('I like '), len('I like London'), 'LOC')]
-        doc = nlp.tokenizer(text)
-
-        tags = biluo_tags_from_offsets(doc, entities)
-
-        assert tags == ['O', 'O', 'U-LOC', 'O']
+    EXAMPLE:
+        >>> text = 'I like London.'
+        >>> entities = [(len('I like '), len('I like London'), 'LOC')]
+        >>> doc = nlp.tokenizer(text)
+        >>> tags = biluo_tags_from_offsets(doc, entities)
+        >>> assert tags == ['O', 'O', 'U-LOC', 'O']
    """
    starts = {token.idx: token.i for token in doc}
    ends = {token.idx+len(token): token.i for token in doc}
--- a/website/docs/api/goldparse.jade
+++ b/website/docs/api/goldparse.jade
@@ -17,27 +17,27 @@ p Create a GoldParse.

    +row
        +cell #[code words]
-        +cell -
+        +cell iterable
        +cell A sequence of unicode word strings.

    +row
        +cell #[code tags]
-        +cell -
+        +cell iterable
        +cell A sequence of strings, representing tag annotations.

    +row
        +cell #[code heads]
-        +cell -
+        +cell iterable
        +cell A sequence of integers, representing syntactic head offsets.

    +row
        +cell #[code deps]
-        +cell -
+        +cell iterable
        +cell A sequence of strings, representing the syntactic relation types.

    +row
        +cell #[code entities]
-        +cell -
+        +cell iterable
        +cell A sequence of named entity annotations, either as BILUO tag strings, or as #[code (start_char, end_char, label)] tuples, representing the entity positions.

    +footrow
@@ -102,3 +102,58 @@ p
        +cell #[code gold_to_cand]
        +cell list
        +cell The alignment from gold tokenization to candidate tokenization.
+
+
+h(2, "util") Utilities
+
+h(3, "biluo_tags_from_offsets") gold.biluo_tags_from_offsets
+    +tag function
+
+p
+    |  Encode labelled spans into per-token tags, using the
+    |  #[+a("/docs/api/annotation#biluo") BILUO scheme] (Begin/In/Last/Unit/Out).
+
+p
+    |  Returns a list of unicode strings, describing the tags. Each tag string
+    |  will be of the form either #[code &quot;-&quot;], #[code "O"] or
+    |  #[code "{action}-{label}"], where action is one of #[code "B"],
+    |  #[code "I"], #[code "L"], #[code "U"]. The string #[code &quot;-&quot;]
+    |  is used where the entity offsets don't align with the tokenization in the
+    |  #[code Doc] object. The training algorithm will view these as missing
+    |  values. #[code O] denotes a non-entity token. #[code B] denotes the
+    |  beginning of a multi-token entity, #[code I] the inside of an entity
+    |  of three or more tokens, and #[code L] the end of an entity of two or
+    |  more tokens. #[code U] denotes a single-token entity.
+
+aside-code("Example").
+    from spacy.gold import biluo_tags_from_offsets
+    text = 'I like London.'
+    entities = [(len('I like '), len('I like London'), 'LOC')]
+    doc = nlp.tokenizer(text)
+    tags = biluo_tags_from_offsets(doc, entities)
+    assert tags == ['O', 'O', 'U-LOC', 'O']
+
+table(["Name", "Type", "Description"])
+    +row
+        +cell #[code doc]
+        +cell #[code Doc]
+        +cell
+            |  The document that the entity offsets refer to. The output tags
+            |  will refer to the token boundaries within the document.
+
+    +row
+        +cell #[code entities]
+        +cell iterable
+        +cell
+            |  A sequence of #[code (start, end, label)] triples. #[code start]
+            |  and #[code end] should be character-offset integers denoting the
+            |  slice into the original string.
+
+    +footrow
+        +cell returns
+        +cell list
+        +cell
+            |  Unicode strings, describing the
+            |  #[+a("/docs/api/annotation#biluo") BILUO] tags.
+
+