Update docstrings and API docs for GoldParse

This commit is contained in:
ines 2017-05-21 13:53:46 +02:00
parent 465a1dd710
commit 075f5ff87a
2 changed files with 95 additions and 58 deletions

View File

@ -225,25 +225,17 @@ cdef class GoldParse:
def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
deps=None, entities=None, make_projective=False):
"""
Create a GoldParse.
"""Create a GoldParse.
Arguments:
doc (Doc):
The document the annotations refer to.
words:
A sequence of unicode word strings.
tags:
A sequence of strings, representing tag annotations.
heads:
A sequence of integers, representing syntactic head offsets.
deps:
A sequence of strings, representing the syntactic relation types.
entities:
A sequence of named entity annotations, either as BILUO tag strings,
or as (start_char, end_char, label) tuples, representing the entity
positions.
Returns (GoldParse): The newly constructed object.
doc (Doc): The document the annotations refer to.
words (iterable): A sequence of unicode word strings.
tags (iterable): A sequence of strings, representing tag annotations.
heads (iterable): A sequence of integers, representing syntactic head offsets.
deps (iterable): A sequence of strings, representing the syntactic relation types.
entities (iterable): A sequence of named entity annotations, either as
BILUO tag strings, or as `(start_char, end_char, label)` tuples,
representing the entity positions.
RETURNS (GoldParse): The newly constructed object.
"""
if words is None:
words = [token.text for token in doc]
@ -308,55 +300,45 @@ cdef class GoldParse:
self.heads = proj_heads
def __len__(self):
"""
Get the number of gold-standard tokens.
"""Get the number of gold-standard tokens.
Returns (int): The number of gold-standard tokens.
RETURNS (int): The number of gold-standard tokens.
"""
return self.length
@property
def is_projective(self):
"""
Whether the provided syntactic annotations form a projective dependency
tree.
"""Whether the provided syntactic annotations form a projective
dependency tree.
"""
return not nonproj.is_nonproj_tree(self.heads)
def biluo_tags_from_offsets(doc, entities):
"""
Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
scheme (biluo).
"""Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
scheme (BILUO).
Arguments:
doc (Doc):
The document that the entity offsets refer to. The output tags will
refer to the token boundaries within the document.
doc (Doc): The document that the entity offsets refer to. The output tags
will refer to the token boundaries within the document.
entities (iterable): A sequence of `(start, end, label)` triples. `start` and
`end` should be character-offset integers denoting the slice into the
original string.
entities (sequence):
A sequence of (start, end, label) triples. start and end should be
character-offset integers denoting the slice into the original string.
RETURNS (list): A list of unicode strings, describing the tags. Each tag
string will be of the form either "-", "O" or "{action}-{label}", where
action is one of "B", "I", "L", "U". The string "-" is used where the
entity offsets don't align with the tokenization in the `Doc` object. The
training algorithm will view these as missing values. "O" denotes a
non-entity token. "B" denotes the beginning of a multi-token entity,
"I" the inside of an entity of three or more tokens, and "L" the end
of an entity of two or more tokens. "U" denotes a single-token entity.
Returns:
tags (list):
A list of unicode strings, describing the tags. Each tag string will
be of the form either "", "O" or "{action}-{label}", where action is one
of "B", "I", "L", "U". The string "-" is used where the entity
offsets don't align with the tokenization in the Doc object. The
training algorithm will view these as missing values. "O" denotes
a non-entity token. "B" denotes the beginning of a multi-token entity,
"I" the inside of an entity of three or more tokens, and "L" the end
of an entity of two or more tokens. "U" denotes a single-token entity.
Example:
text = 'I like London.'
entities = [(len('I like '), len('I like London'), 'LOC')]
doc = nlp.tokenizer(text)
tags = biluo_tags_from_offsets(doc, entities)
assert tags == ['O', 'O', 'U-LOC', 'O']
EXAMPLE:
>>> text = 'I like London.'
>>> entities = [(len('I like '), len('I like London'), 'LOC')]
>>> doc = nlp.tokenizer(text)
>>> tags = biluo_tags_from_offsets(doc, entities)
>>> assert tags == ['O', 'O', 'U-LOC', 'O']
"""
starts = {token.idx: token.i for token in doc}
ends = {token.idx+len(token): token.i for token in doc}

View File

@ -17,27 +17,27 @@ p Create a GoldParse.
+row
+cell #[code words]
+cell -
+cell iterable
+cell A sequence of unicode word strings.
+row
+cell #[code tags]
+cell -
+cell iterable
+cell A sequence of strings, representing tag annotations.
+row
+cell #[code heads]
+cell -
+cell iterable
+cell A sequence of integers, representing syntactic head offsets.
+row
+cell #[code deps]
+cell -
+cell iterable
+cell A sequence of strings, representing the syntactic relation types.
+row
+cell #[code entities]
+cell -
+cell iterable
+cell A sequence of named entity annotations, either as BILUO tag strings, or as #[code (start_char, end_char, label)] tuples, representing the entity positions.
+footrow
@ -102,3 +102,58 @@ p
+cell #[code gold_to_cand]
+cell list
+cell The alignment from gold tokenization to candidate tokenization.
+h(2, "util") Utilities
+h(3, "biluo_tags_from_offsets") gold.biluo_tags_from_offsets
+tag function
p
| Encode labelled spans into per-token tags, using the
| #[+a("/docs/api/annotation#biluo") BILUO scheme] (Begin/In/Last/Unit/Out).
p
| Returns a list of unicode strings, describing the tags. Each tag string
| will be of the form either #[code "-"], #[code "O"] or
| #[code "{action}-{label}"], where action is one of #[code "B"],
| #[code "I"], #[code "L"], #[code "U"]. The string #[code "-"]
| is used where the entity offsets don't align with the tokenization in the
| #[code Doc] object. The training algorithm will view these as missing
| values. #[code O] denotes a non-entity token. #[code B] denotes the
| beginning of a multi-token entity, #[code I] the inside of an entity
| of three or more tokens, and #[code L] the end of an entity of two or
| more tokens. #[code U] denotes a single-token entity.
+aside-code("Example").
from spacy.gold import biluo_tags_from_offsets
text = 'I like London.'
entities = [(len('I like '), len('I like London'), 'LOC')]
doc = nlp.tokenizer(text)
tags = biluo_tags_from_offsets(doc, entities)
assert tags == ['O', 'O', 'U-LOC', 'O']
+table(["Name", "Type", "Description"])
+row
+cell #[code doc]
+cell #[code Doc]
+cell
| The document that the entity offsets refer to. The output tags
| will refer to the token boundaries within the document.
+row
+cell #[code entities]
+cell iterable
+cell
| A sequence of #[code (start, end, label)] triples. #[code start]
| and #[code end] should be character-offset integers denoting the
| slice into the original string.
+footrow
+cell returns
+cell list
+cell
| Unicode strings, describing the
| #[+a("/docs/api/annotation#biluo") BILUO] tags.