diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 7e00030a4..18a34e156 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -225,25 +225,17 @@ cdef class GoldParse: def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None, deps=None, entities=None, make_projective=False): - """ - Create a GoldParse. + """Create a GoldParse. - Arguments: - doc (Doc): - The document the annotations refer to. - words: - A sequence of unicode word strings. - tags: - A sequence of strings, representing tag annotations. - heads: - A sequence of integers, representing syntactic head offsets. - deps: - A sequence of strings, representing the syntactic relation types. - entities: - A sequence of named entity annotations, either as BILUO tag strings, - or as (start_char, end_char, label) tuples, representing the entity - positions. - Returns (GoldParse): The newly constructed object. + doc (Doc): The document the annotations refer to. + words (iterable): A sequence of unicode word strings. + tags (iterable): A sequence of strings, representing tag annotations. + heads (iterable): A sequence of integers, representing syntactic head offsets. + deps (iterable): A sequence of strings, representing the syntactic relation types. + entities (iterable): A sequence of named entity annotations, either as + BILUO tag strings, or as `(start_char, end_char, label)` tuples, + representing the entity positions. + RETURNS (GoldParse): The newly constructed object. """ if words is None: words = [token.text for token in doc] @@ -308,55 +300,45 @@ cdef class GoldParse: self.heads = proj_heads def __len__(self): - """ - Get the number of gold-standard tokens. + """Get the number of gold-standard tokens. - Returns (int): The number of gold-standard tokens. + RETURNS (int): The number of gold-standard tokens. """ return self.length @property def is_projective(self): - """ - Whether the provided syntactic annotations form a projective dependency - tree. 
+ """Whether the provided syntactic annotations form a projective + dependency tree. """ return not nonproj.is_nonproj_tree(self.heads) def biluo_tags_from_offsets(doc, entities): - """ - Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out - scheme (biluo). + """Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out + scheme (BILUO). - Arguments: - doc (Doc): - The document that the entity offsets refer to. The output tags will - refer to the token boundaries within the document. + doc (Doc): The document that the entity offsets refer to. The output tags + will refer to the token boundaries within the document. + entities (iterable): A sequence of `(start, end, label)` triples. `start` and + `end` should be character-offset integers denoting the slice into the + original string. - entities (sequence): - A sequence of (start, end, label) triples. start and end should be - character-offset integers denoting the slice into the original string. + RETURNS (list): A list of unicode strings, describing the tags. Each tag + string will be of the form either "-", "O" or "{action}-{label}", where + action is one of "B", "I", "L", "U". The string "-" is used where the + entity offsets don't align with the tokenization in the `Doc` object. The + training algorithm will view these as missing values. "O" denotes a + non-entity token. "B" denotes the beginning of a multi-token entity, + "I" the inside of an entity of three or more tokens, and "L" the end + of an entity of two or more tokens. "U" denotes a single-token entity. - Returns: - tags (list): - A list of unicode strings, describing the tags. Each tag string will - be of the form either "", "O" or "{action}-{label}", where action is one - of "B", "I", "L", "U". The string "-" is used where the entity - offsets don't align with the tokenization in the Doc object. The - training algorithm will view these as missing values. "O" denotes - a non-entity token. 
"B" denotes the beginning of a multi-token entity, - "I" the inside of an entity of three or more tokens, and "L" the end - of an entity of two or more tokens. "U" denotes a single-token entity. - - Example: - text = 'I like London.' - entities = [(len('I like '), len('I like London'), 'LOC')] - doc = nlp.tokenizer(text) - - tags = biluo_tags_from_offsets(doc, entities) - - assert tags == ['O', 'O', 'U-LOC', 'O'] + EXAMPLE: + >>> text = 'I like London.' + >>> entities = [(len('I like '), len('I like London'), 'LOC')] + >>> doc = nlp.tokenizer(text) + >>> tags = biluo_tags_from_offsets(doc, entities) + >>> assert tags == ['O', 'O', 'U-LOC', 'O'] """ starts = {token.idx: token.i for token in doc} ends = {token.idx+len(token): token.i for token in doc} diff --git a/website/docs/api/goldparse.jade b/website/docs/api/goldparse.jade index be6c97648..f39558b35 100644 --- a/website/docs/api/goldparse.jade +++ b/website/docs/api/goldparse.jade @@ -17,27 +17,27 @@ p Create a GoldParse. +row +cell #[code words] - +cell - + +cell iterable +cell A sequence of unicode word strings. +row +cell #[code tags] - +cell - + +cell iterable +cell A sequence of strings, representing tag annotations. +row +cell #[code heads] - +cell - + +cell iterable +cell A sequence of integers, representing syntactic head offsets. +row +cell #[code deps] - +cell - + +cell iterable +cell A sequence of strings, representing the syntactic relation types. +row +cell #[code entities] - +cell - + +cell iterable +cell A sequence of named entity annotations, either as BILUO tag strings, or as #[code (start_char, end_char, label)] tuples, representing the entity positions. +footrow @@ -102,3 +102,58 @@ p +cell #[code gold_to_cand] +cell list +cell The alignment from gold tokenization to candidate tokenization. 
+ + +h(2, "util") Utilities + +h(3, "biluo_tags_from_offsets") gold.biluo_tags_from_offsets + +tag function + +p + | Encode labelled spans into per-token tags, using the + | #[+a("/docs/api/annotation#biluo") BILUO scheme] (Begin/In/Last/Unit/Out). + +p + | Returns a list of unicode strings, describing the tags. Each tag string + | will be of the form either #[code "-"], #[code "O"] or + | #[code "{action}-{label}"], where action is one of #[code "B"], + | #[code "I"], #[code "L"], #[code "U"]. The string #[code "-"] + | is used where the entity offsets don't align with the tokenization in the + | #[code Doc] object. The training algorithm will view these as missing + | values. #[code O] denotes a non-entity token. #[code B] denotes the + | beginning of a multi-token entity, #[code I] the inside of an entity + | of three or more tokens, and #[code L] the end of an entity of two or + | more tokens. #[code U] denotes a single-token entity. + +aside-code("Example"). + from spacy.gold import biluo_tags_from_offsets + text = 'I like London.' + entities = [(len('I like '), len('I like London'), 'LOC')] + doc = tokenizer(text) + tags = biluo_tags_from_offsets(doc, entities) + assert tags == ['O', 'O', 'U-LOC', 'O'] + +table(["Name", "Type", "Description"]) + +row + +cell #[code doc] + +cell #[code Doc] + +cell + | The document that the entity offsets refer to. The output tags + | will refer to the token boundaries within the document. + + +row + +cell #[code entities] + +cell iterable + +cell + | A sequence of #[code (start, end, label)] triples. #[code start] + | and #[code end] should be character-offset integers denoting the + | slice into the original string. + + +footrow + +cell returns + +cell list + +cell + | Unicode strings, describing the + | #[+a("/docs/api/annotation#biluo") BILUO] tags. + +