spaCy/website/api/goldparse.jade

168 lines
4.7 KiB
Plaintext
Raw Normal View History

2016-10-31 18:04:15 +00:00
//- 💫 DOCS > API > GOLDPARSE
2017-10-03 12:27:22 +00:00
include ../_includes/_mixins
2016-10-31 18:04:15 +00:00
p Collection for training annotations.
+h(2, "init") GoldParse.__init__
+tag method
p Create a #[code GoldParse].
2016-10-31 18:04:15 +00:00
+table(["Name", "Type", "Description"])
+row
+cell #[code doc]
+cell #[code Doc]
+cell The document the annotations refer to.
+row
+cell #[code words]
+cell iterable
2016-10-31 18:04:15 +00:00
+cell A sequence of unicode word strings.
+row
+cell #[code tags]
+cell iterable
2016-10-31 18:04:15 +00:00
+cell A sequence of strings, representing tag annotations.
+row
+cell #[code heads]
+cell iterable
2016-10-31 18:04:15 +00:00
+cell A sequence of integers, representing syntactic head offsets.
+row
+cell #[code deps]
+cell iterable
2016-10-31 18:04:15 +00:00
+cell A sequence of strings, representing the syntactic relation types.
+row
+cell #[code entities]
+cell iterable
2016-10-31 18:04:15 +00:00
+cell A sequence of named entity annotations, either as BILUO tag strings, or as #[code (start_char, end_char, label)] tuples, representing the entity positions.
2017-10-03 12:27:22 +00:00
+row("foot")
+cell returns
2016-10-31 18:04:15 +00:00
+cell #[code GoldParse]
+cell The newly constructed object.
+h(2, "len") GoldParse.__len__
+tag method
p Get the number of gold-standard tokens.
+table(["Name", "Type", "Description"])
2017-10-03 12:27:22 +00:00
+row("foot")
+cell returns
2016-10-31 18:04:15 +00:00
+cell int
+cell The number of gold-standard tokens.
+h(2, "is_projective") GoldParse.is_projective
+tag property
p
| Whether the provided syntactic annotations form a projective dependency
| tree.
+table(["Name", "Type", "Description"])
2017-10-03 12:27:22 +00:00
+row("foot")
+cell returns
2016-10-31 18:04:15 +00:00
+cell bool
+cell Whether the annotations form a projective tree.
+h(2, "attributes") Attributes
+table(["Name", "Type", "Description"])
+row
+cell #[code tags]
+cell list
+cell The part-of-speech tag annotations.
+row
+cell #[code heads]
+cell list
+cell The syntactic head annotations.
+row
+cell #[code labels]
+cell list
+cell The syntactic relation-type annotations.
+row
+cell #[code ents]
+cell list
+cell The named entity annotations.
+row
+cell #[code cand_to_gold]
+cell list
+cell The alignment from candidate tokenization to gold tokenization.
+row
+cell #[code gold_to_cand]
+cell list
+cell The alignment from gold tokenization to candidate tokenization.
2017-07-22 15:55:35 +00:00
+row
+cell #[code cats] #[+tag-new(2)]
+cell list
+cell
| Entries in the list should be either a label, or a
| #[code (start, end, label)] triple. The tuple form is used for
| categories applied to spans of the document.
+h(2, "util") Utilities
+h(3, "biluo_tags_from_offsets") gold.biluo_tags_from_offsets
+tag function
p
| Encode labelled spans into per-token tags, using the
| #[+a("/api/annotation#biluo") BILUO scheme] (Begin/In/Last/Unit/Out).
p
| Returns a list of unicode strings, describing the tags. Each tag string
| will be one of #[code ""], #[code "-"], #[code "O"] or
| #[code "{action}-{label}"], where action is one of #[code "B"],
| #[code "I"], #[code "L"], #[code "U"]. The string #[code "-"]
| is used where the entity offsets don't align with the tokenization in the
| #[code Doc] object. The training algorithm will view these as missing
| values. #[code O] denotes a non-entity token. #[code B] denotes the
| beginning of a multi-token entity, #[code I] the inside of an entity
| of three or more tokens, and #[code L] the end of an entity of two or
| more tokens. #[code U] denotes a single-token entity.
+aside-code("Example").
from spacy.gold import biluo_tags_from_offsets
text = 'I like London.'
entities = [(len('I like '), len('I like London'), 'LOC')]
doc = tokenizer(text)
tags = biluo_tags_from_offsets(doc, entities)
assert tags == ['O', 'O', 'U-LOC', 'O']
+table(["Name", "Type", "Description"])
+row
+cell #[code doc]
+cell #[code Doc]
+cell
| The document that the entity offsets refer to. The output tags
| will refer to the token boundaries within the document.
+row
+cell #[code entities]
+cell iterable
+cell
| A sequence of #[code (start, end, label)] triples. #[code start]
| and #[code end] should be character-offset integers denoting the
| slice into the original string.
2017-10-03 12:27:22 +00:00
+row("foot")
+cell returns
+cell list
+cell
| Unicode strings, describing the
| #[+a("/api/annotation#biluo") BILUO] tags.