spaCy/website/api/goldparse.jade

//- 💫 DOCS > API > GOLDPARSE

include ../_includes/_mixins

p Collection for training annotations.

+h(2, "init") GoldParse.__init__
    +tag method

p Create a #[code GoldParse].

+table(["Name", "Type", "Description"])
    +row
        +cell #[code doc]
        +cell #[code Doc]
        +cell The document the annotations refer to.

    +row
        +cell #[code words]
        +cell iterable
        +cell A sequence of unicode word strings.

    +row
        +cell #[code tags]
        +cell iterable
        +cell A sequence of strings, representing tag annotations.

    +row
        +cell #[code heads]
        +cell iterable
        +cell A sequence of integers, representing syntactic head offsets.

    +row
        +cell #[code deps]
        +cell iterable
        +cell A sequence of strings, representing the syntactic relation types.

    +row
        +cell #[code entities]
        +cell iterable
        +cell A sequence of named entity annotations, either as BILUO tag strings, or as #[code (start_char, end_char, label)] tuples, representing the entity positions.

    +row("foot")
        +cell returns
        +cell #[code GoldParse]
        +cell The newly constructed object.

+h(2, "len") GoldParse.__len__
    +tag method

p Get the number of gold-standard tokens.

+table(["Name", "Type", "Description"])
    +row("foot")
        +cell returns
        +cell int
        +cell The number of gold-standard tokens.

+h(2, "is_projective") GoldParse.is_projective
    +tag property

p
    |  Whether the provided syntactic annotations form a projective dependency
    |  tree.

+table(["Name", "Type", "Description"])
    +row("foot")
        +cell returns
        +cell bool
        +cell Whether the annotations form a projective tree.


+h(2, "attributes") Attributes

+table(["Name", "Type", "Description"])
    +row
        +cell #[code tags]
        +cell list
        +cell The part-of-speech tag annotations.

    +row
        +cell #[code heads]
        +cell list
        +cell The syntactic head annotations.

    +row
        +cell #[code labels]
        +cell list
        +cell The syntactic relation-type annotations.

    +row
        +cell #[code ents]
        +cell list
        +cell The named entity annotations.

    +row
        +cell #[code cand_to_gold]
        +cell list
        +cell The alignment from candidate tokenization to gold tokenization.

    +row
        +cell #[code gold_to_cand]
        +cell list
        +cell The alignment from gold tokenization to candidate tokenization.

    +row
        +cell #[code cats] #[+tag-new(2)]
        +cell list
        +cell
            |  The category annotations. Entries in the list should be either
            |  a label, or a #[code (start, end, label)] triple. The tuple
            |  form is used for categories applied to spans of the document.


+h(2, "util") Utilities

+h(3, "biluo_tags_from_offsets") gold.biluo_tags_from_offsets
    +tag function

p
    |  Encode labelled spans into per-token tags, using the
    |  #[+a("/api/annotation#biluo") BILUO scheme] (Begin/In/Last/Unit/Out).

p
    |  Returns a list of unicode strings, describing the tags. Each tag string
    |  will be one of #[code &quot;-&quot;], #[code "O"] or
    |  #[code "{action}-{label}"], where action is one of #[code "B"],
    |  #[code "I"], #[code "L"], #[code "U"]. The string #[code &quot;-&quot;]
    |  is used where the entity offsets don't align with the tokenization in the
    |  #[code Doc] object. The training algorithm will view these as missing
    |  values. #[code O] denotes a non-entity token. #[code B] denotes the
    |  beginning of a multi-token entity, #[code I] the inside of an entity
    |  of three or more tokens, and #[code L] the end of an entity of two or
    |  more tokens. #[code U] denotes a single-token entity.

+aside-code("Example").
    from spacy.gold import biluo_tags_from_offsets
    text = 'I like London.'
    entities = [(len('I like '), len('I like London'), 'LOC')]
    doc = tokenizer(text)
    tags = biluo_tags_from_offsets(doc, entities)
    assert tags == ['O', 'O', 'U-LOC', 'O']

+table(["Name", "Type", "Description"])
    +row
        +cell #[code doc]
        +cell #[code Doc]
        +cell
            |  The document that the entity offsets refer to. The output tags
            |  will refer to the token boundaries within the document.

    +row
        +cell #[code entities]
        +cell iterable
        +cell
            |  A sequence of #[code (start, end, label)] triples. #[code start]
            |  and #[code end] should be character-offset integers denoting the
            |  slice into the original string.

    +row("foot")
        +cell returns
        +cell list
        +cell
            |  Unicode strings, describing the
            |  #[+a("/api/annotation#biluo") BILUO] tags.