mirror of https://github.com/explosion/spaCy.git
168 lines
4.7 KiB
Plaintext
168 lines
4.7 KiB
Plaintext
//- 💫 DOCS > API > GOLDPARSE
|
|
|
|
include ../_includes/_mixins
|
|
|
|
p Collection for training annotations.
|
|
|
|
+h(2, "init") GoldParse.__init__
|
|
+tag method
|
|
|
|
p Create a #[code GoldParse].
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row
|
|
+cell #[code doc]
|
|
+cell #[code Doc]
|
|
+cell The document the annotations refer to.
|
|
|
|
+row
|
|
+cell #[code words]
|
|
+cell iterable
|
|
+cell A sequence of unicode word strings.
|
|
|
|
+row
|
|
+cell #[code tags]
|
|
+cell iterable
|
|
+cell A sequence of strings, representing tag annotations.
|
|
|
|
+row
|
|
+cell #[code heads]
|
|
+cell iterable
|
|
+cell A sequence of integers, representing syntactic head offsets.
|
|
|
|
+row
|
|
+cell #[code deps]
|
|
+cell iterable
|
|
+cell A sequence of strings, representing the syntactic relation types.
|
|
|
|
+row
|
|
+cell #[code entities]
|
|
+cell iterable
|
|
+cell A sequence of named entity annotations, either as BILUO tag strings, or as #[code (start_char, end_char, label)] tuples, representing the entity positions.
|
|
|
|
+row("foot")
|
|
+cell returns
|
|
+cell #[code GoldParse]
|
|
+cell The newly constructed object.
|
|
|
|
+h(2, "len") GoldParse.__len__
|
|
+tag method
|
|
|
|
p Get the number of gold-standard tokens.
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row("foot")
|
|
+cell returns
|
|
+cell int
|
|
+cell The number of gold-standard tokens.
|
|
|
|
+h(2, "is_projective") GoldParse.is_projective
|
|
+tag property
|
|
|
|
p
|
|
| Whether the provided syntactic annotations form a projective dependency
|
|
| tree.
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row("foot")
|
|
+cell returns
|
|
+cell bool
|
|
+cell Whether the annotations form a projective tree.
|
|
|
|
|
|
+h(2, "attributes") Attributes
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row
|
|
+cell #[code tags]
|
|
+cell list
|
|
+cell The part-of-speech tag annotations.
|
|
|
|
+row
|
|
+cell #[code heads]
|
|
+cell list
|
|
+cell The syntactic head annotations.
|
|
|
|
+row
|
|
+cell #[code labels]
|
|
+cell list
|
|
+cell The syntactic relation-type annotations.
|
|
|
|
+row
|
|
+cell #[code ents]
|
|
+cell list
|
|
+cell The named entity annotations.
|
|
|
|
+row
|
|
+cell #[code cand_to_gold]
|
|
+cell list
|
|
+cell The alignment from candidate tokenization to gold tokenization.
|
|
|
|
+row
|
|
+cell #[code gold_to_cand]
|
|
+cell list
|
|
+cell The alignment from gold tokenization to candidate tokenization.
|
|
|
|
+row
|
|
+cell #[code cats] #[+tag-new(2)]
|
|
+cell list
|
|
+cell
|
|
| Entries in the list should be either a label, or a
|
|
| #[code (start, end, label)] triple. The tuple form is used for
|
|
| categories applied to spans of the document.
|
|
|
|
|
|
+h(2, "util") Utilities
|
|
|
|
+h(3, "biluo_tags_from_offsets") gold.biluo_tags_from_offsets
|
|
+tag function
|
|
|
|
p
|
|
| Encode labelled spans into per-token tags, using the
|
|
| #[+a("/api/annotation#biluo") BILUO scheme] (Begin/In/Last/Unit/Out).
|
|
|
|
p
|
|
| Returns a list of unicode strings, describing the tags. Each tag string
|
|
| will take one of the forms #[code "-"], #[code "O"] or
|
|
| #[code "{action}-{label}"], where action is one of #[code "B"],
|
|
| #[code "I"], #[code "L"], #[code "U"]. The string #[code "-"]
|
|
| is used where the entity offsets don't align with the tokenization in the
|
|
| #[code Doc] object. The training algorithm will view these as missing
|
|
| values. #[code O] denotes a non-entity token. #[code B] denotes the
|
|
| beginning of a multi-token entity, #[code I] the inside of an entity
|
|
| of three or more tokens, and #[code L] the end of an entity of two or
|
|
| more tokens. #[code U] denotes a single-token entity.
|
|
|
|
+aside-code("Example").
|
|
from spacy.gold import biluo_tags_from_offsets
|
|
text = 'I like London.'
|
|
entities = [(len('I like '), len('I like London'), 'LOC')]
|
|
doc = tokenizer(text)
|
|
tags = biluo_tags_from_offsets(doc, entities)
|
|
assert tags == ['O', 'O', 'U-LOC', 'O']
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row
|
|
+cell #[code doc]
|
|
+cell #[code Doc]
|
|
+cell
|
|
| The document that the entity offsets refer to. The output tags
|
|
| will refer to the token boundaries within the document.
|
|
|
|
+row
|
|
+cell #[code entities]
|
|
+cell iterable
|
|
+cell
|
|
| A sequence of #[code (start, end, label)] triples. #[code start]
|
|
| and #[code end] should be character-offset integers denoting the
|
|
| slice into the original string.
|
|
|
|
+row("foot")
|
|
+cell returns
|
|
+cell list
|
|
+cell
|
|
| Unicode strings, describing the
|
|
| #[+a("/api/annotation#biluo") BILUO] tags.
|
|
|
|
|