spaCy/website/api/_cython/_tokenc.jade

//- 💫 DOCS > API > CYTHON > STRUCTS > TOKENC

p
    |  Cython data container for the #[code Token] object.

+aside-code("Example").
    token = &doc.c[3]
    token_ptr = &doc.c[3]

+table(["Name", "Type", "Description"])
    +row
        +cell #[code lex]
        +cell #[code const LexemeC*]
        +cell A pointer to the lexeme for the token.

    +row
        +cell #[code morph]
        +cell #[code uint64_t]
        +cell An ID allowing lookup of morphological attributes.

    +row
        +cell #[code pos]
        +cell #[code univ_pos_t]
        +cell Coarse-grained part-of-speech tag.

    +row
        +cell #[code spacy]
        +cell #[code bint]
        +cell A binary value indicating whether the token has trailing whitespace.

    +row
        +cell #[code tag]
        +cell #[+abbr("uint64_t") #[code attr_t]]
        +cell Fine-grained part-of-speech tag.

    +row
        +cell #[code idx]
        +cell #[code int]
        +cell The character offset of the token within the parent document.

    +row
        +cell #[code lemma]
        +cell #[+abbr("uint64_t") #[code attr_t]]
        +cell Base form of the token, with no inflectional suffixes.

    +row
        +cell #[code sense]
        +cell #[+abbr("uint64_t") #[code attr_t]]
        +cell Space for storing a word sense ID, currently unused.

    +row
        +cell #[code head]
        +cell #[code int]
        +cell Offset of the syntactic parent relative to the token.

    +row
        +cell #[code dep]
        +cell #[+abbr("uint64_t") #[code attr_t]]
        +cell Syntactic dependency relation.

    +row
        +cell #[code l_kids]
        +cell #[code uint32_t]
        +cell Number of left children.

    +row
        +cell #[code r_kids]
        +cell #[code uint32_t]
        +cell Number of right children.

    +row
        +cell #[code l_edge]
        +cell #[code uint32_t]
        +cell Offset of the leftmost token of this token's syntactic descendants.

    +row
        +cell #[code r_edge]
        +cell #[code uint32_t]
        +cell Offset of the rightmost token of this token's syntactic descendants.

    +row
        +cell #[code sent_start]
        +cell #[code int]
        +cell
            |  Ternary value indicating whether the token is the first word of
            |  a sentence. #[code 0] indicates a missing value, #[code -1]
            |  indicates #[code False] and #[code 1] indicates #[code True]. The default value, 0,
            |  is interpreted as no sentence break. Sentence boundary detectors will usually
            |  set 0 for all tokens except tokens that follow a sentence boundary.

    +row
        +cell #[code ent_iob]
        +cell #[code int]
        +cell
            |  IOB code of named entity tag. #[code 0] indicates a missing
            |  value, #[code 1] indicates #[code I], #[code 2] indicates
            |  #[code O] and #[code 3] indicates #[code B].

    +row
        +cell #[code ent_type]
        +cell #[+abbr("uint64_t") #[code attr_t]]
        +cell Named entity type.

    +row
        +cell #[code ent_id]
        +cell #[+abbr("uint64_t") #[code hash_t]]
        +cell
            |  ID of the entity the token is an instance of, if any. Currently
            |  not used, but potentially useful for coreference resolution.

+h(3, "token_get_struct_attr", "spacy/tokens/token.pxd") Token.get_struct_attr
    +tag staticmethod
    +tag nogil

p Get the value of an attribute from the #[code TokenC] struct by attribute ID.

+aside-code("Example").
    from spacy.attrs cimport IS_ALPHA
    from spacy.tokens cimport Token

    is_alpha = Token.get_struct_attr(&doc.c[3], IS_ALPHA)

+table(["Name", "Type", "Description"])
    +row
        +cell #[code token]
        +cell #[code const TokenC*]
        +cell A pointer to a #[code TokenC] struct.

    +row
        +cell #[code feat_name]
        +cell #[code attr_id_t]
        +cell
            |  The ID of the attribute to look up. The attributes are
            |  enumerated in #[code spacy.attrs].

    +row("foot")
        +cell returns
        +cell #[+abbr("uint64_t") #[code attr_t]]
        +cell The value of the attribute.

+h(3, "token_set_struct_attr", "spacy/tokens/token.pxd") Token.set_struct_attr
    +tag staticmethod
    +tag nogil

p Set the value of an attribute of the #[code TokenC] struct by attribute ID.

+aside-code("Example").
    from spacy.attrs cimport TAG
    from spacy.tokens cimport Token

    token = &doc.c[3]
    Token.set_struct_attr(token, TAG, 0)

+table(["Name", "Type", "Description"])
    +row
        +cell #[code token]
        +cell #[code TokenC*]
        +cell A pointer to a #[code TokenC] struct.

    +row
        +cell #[code feat_name]
        +cell #[code attr_id_t]
        +cell
            |  The ID of the attribute to set. The attributes are
            |  enumerated in #[code spacy.attrs].

    +row
        +cell #[code value]
        +cell #[+abbr("uint64_t") #[code attr_t]]
        +cell The value to set.

+h(3, "token_by_start", "spacy/tokens/doc.pxd") token_by_start
    +tag function

p Find a token in a #[code TokenC*] array by the offset of its first character.

+aside-code("Example").
    from spacy.tokens.doc cimport Doc, token_by_start
    from spacy.vocab cimport Vocab

    doc = Doc(Vocab(), words=[u'hello', u'world'])
    assert token_by_start(doc.c, doc.length, 6) == 1
    assert token_by_start(doc.c, doc.length, 4) == -1

+table(["Name", "Type", "Description"])
    +row
        +cell #[code tokens]
        +cell #[code const TokenC*]
        +cell A #[code TokenC*] array.

    +row
        +cell #[code length]
        +cell #[code int]
        +cell The number of tokens in the array.

    +row
        +cell #[code start_char]
        +cell #[code int]
        +cell The start index to search for.

    +row("foot")
        +cell returns
        +cell #[code int]
        +cell The index of the token in the array or #[code -1] if not found.

+h(3, "token_by_end", "spacy/tokens/doc.pxd") token_by_end
    +tag function

p Find a token in a #[code TokenC*] array by the offset of its final character.

+aside-code("Example").
    from spacy.tokens.doc cimport Doc, token_by_end
    from spacy.vocab cimport Vocab

    doc = Doc(Vocab(), words=[u'hello', u'world'])
    assert token_by_end(doc.c, doc.length, 5) == 0
    assert token_by_end(doc.c, doc.length, 1) == -1

+table(["Name", "Type", "Description"])
    +row
        +cell #[code tokens]
        +cell #[code const TokenC*]
        +cell A #[code TokenC*] array.

    +row
        +cell #[code length]
        +cell #[code int]
        +cell The number of tokens in the array.

    +row
        +cell #[code end_char]
        +cell #[code int]
        +cell The end index to search for.

    +row("foot")
        +cell returns
        +cell #[code int]
        +cell The index of the token in the array or #[code -1] if not found.

+h(3, "set_children_from_heads", "spacy/tokens/doc.pxd") set_children_from_heads
    +tag function

p
    |  Set attributes that allow lookup of syntactic children on a
    |  #[code TokenC*] array. This function must be called after making changes
    |  to the #[code TokenC.head] attribute, in order to make the parse tree
    |  navigation consistent.

+aside-code("Example").
    from spacy.tokens.doc cimport Doc, set_children_from_heads
    from spacy.vocab cimport Vocab

    doc = Doc(Vocab(), words=[u'Baileys', u'from', u'a', u'shoe'])
    doc.c[0].head = 0
    doc.c[1].head = 0
    doc.c[2].head = 3
    doc.c[3].head = 1
    set_children_from_heads(doc.c, doc.length)
    assert doc.c[3].l_kids == 1

+table(["Name", "Type", "Description"])
    +row
        +cell #[code tokens]
        +cell #[code const TokenC*]
        +cell A #[code TokenC*] array.

    +row
        +cell #[code length]
        +cell #[code int]
        +cell The number of tokens in the array.
💫 Document Cython API (#2433) ## Description This PR adds the most relevant documentation of spaCy's Cython API. (Todo for when we publish this: rewrite `/api/#section-cython` and `/api/#cython` to `/api/cython#conventions`.) ### Types of change docs ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. 2018-06-11 15:47:46 +00:00			`//- 💫 DOCS > API > CYTHON > STRUCTS > TOKENC`

			`p`
			`\| Cython data container for the #[code Token] object.`

			`+aside-code("Example").`
			`token = &doc.c[3]`
			`token_ptr = &doc.c[3]`

			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code lex]`
			`+cell #[code const LexemeC*]`
			`+cell A pointer to the lexeme for the token.`

			`+row`
			`+cell #[code morph]`
			`+cell #[code uint64_t]`
			`+cell An ID allowing lookup of morphological attributes.`

			`+row`
			`+cell #[code pos]`
			`+cell #[code univ_pos_t]`
			`+cell Coarse-grained part-of-speech tag.`

			`+row`
			`+cell #[code spacy]`
			`+cell #[code bint]`
			`+cell A binary value indicating whether the token has trailing whitespace.`

			`+row`
			`+cell #[code tag]`
			`+cell #[+abbr("uint64_t") #[code attr_t]]`
			`+cell Fine-grained part-of-speech tag.`

			`+row`
			`+cell #[code idx]`
			`+cell #[code int]`
			`+cell The character offset of the token within the parent document.`

			`+row`
			`+cell #[code lemma]`
			`+cell #[+abbr("uint64_t") #[code attr_t]]`
			`+cell Base form of the token, with no inflectional suffixes.`

			`+row`
			`+cell #[code sense]`
			`+cell #[+abbr("uint64_t") #[code attr_t]]`
			`+cell Space for storing a word sense ID, currently unused.`

			`+row`
			`+cell #[code head]`
			`+cell #[code int]`
			`+cell Offset of the syntactic parent relative to the token.`

			`+row`
			`+cell #[code dep]`
			`+cell #[+abbr("uint64_t") #[code attr_t]]`
			`+cell Syntactic dependency relation.`

			`+row`
			`+cell #[code l_kids]`
			`+cell #[code uint32_t]`
			`+cell Number of left children.`

			`+row`
			`+cell #[code r_kids]`
			`+cell #[code uint32_t]`
			`+cell Number of right children.`

			`+row`
			`+cell #[code l_edge]`
			`+cell #[code uint32_t]`
			`+cell Offset of the leftmost token of this token's syntactic descendants.`

			`+row`
			`+cell #[code r_edge]`
			`+cell #[code uint32_t]`
			`+cell Offset of the rightmost token of this token's syntactic descendants.`

			`+row`
			`+cell #[code sent_start]`
			`+cell #[code int]`
			`+cell`
			`\| Ternary value indicating whether the token is the first word of`
			`\| a sentence. #[code 0] indicates a missing value, #[code -1]`
			`\| indicates #[code False] and #[code 1] indicates #[code True]. The default value, 0,`
			`\| is interpreted as no sentence break. Sentence boundary detectors will usually`
			`\| set 0 for all tokens except tokens that follow a sentence boundary.`

			`+row`
			`+cell #[code ent_iob]`
			`+cell #[code int]`
			`+cell`
			`\| IOB code of named entity tag. #[code 0] indicates a missing`
			`\| value, #[code 1] indicates #[code I], #[code 2] indicates`
			`\| #[code O] and #[code 3] indicates #[code B].`

			`+row`
			`+cell #[code ent_type]`
			`+cell #[+abbr("uint64_t") #[code attr_t]]`
			`+cell Named entity type.`

			`+row`
			`+cell #[code ent_id]`
			`+cell #[+abbr("uint64_t") #[code hash_t]]`
			`+cell`
			`\| ID of the entity the token is an instance of, if any. Currently`
			`\| not used, but potentially for coreference resolution.`

			`+h(3, "token_get_struct_attr", "spacy/tokens/token.pxd") Token.get_struct_attr`
			`+tag staticmethod`
			`+tag nogil`

			`p Get the value of an attribute from the #[code TokenC] struct by attribute ID.`

			`+aside-code("Example").`
			`from spacy.attrs cimport IS_ALPHA`
			`from spacy.tokens cimport Token`

			`is_alpha = Token.get_struct_attr(&doc.c[3], IS_ALPHA)`

			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code token]`
			`+cell #[code const TokenC*]`
			`+cell A pointer to a #[code TokenC] struct.`

			`+row`
			`+cell #[code feat_name]`
			`+cell #[code attr_id_t]`
			`+cell`
			`\| The ID of the attribute to look up. The attributes are`
			`\| enumerated in #[code spacy.typedefs].`

			`+row("foot")`
			`+cell returns`
			`+cell #[+abbr("uint64_t") #[code attr_t]]`
			`+cell The value of the attribute.`

			`+h(3, "token_set_struct_attr", "spacy/tokens/token.pxd") Token.set_struct_attr`
			`+tag staticmethod`
			`+tag nogil`

			`p Set the value of an attribute of the #[code TokenC] struct by attribute ID.`

			`+aside-code("Example").`
			`from spacy.attrs cimport TAG`
			`from spacy.tokens cimport Token`

			`token = &doc.c[3]`
			`Token.set_struct_attr(token, TAG, 0)`

			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code token]`
			`+cell #[code const TokenC*]`
			`+cell A pointer to a #[code TokenC] struct.`

			`+row`
			`+cell #[code feat_name]`
			`+cell #[code attr_id_t]`
			`+cell`
			`\| The ID of the attribute to look up. The attributes are`
			`\| enumerated in #[code spacy.typedefs].`

			`+row`
			`+cell #[code value]`
			`+cell #[+abbr("uint64_t") #[code attr_t]]`
			`+cell The value to set.`

			`+h(3, "token_by_start", "spacy/tokens/doc.pxd") token_by_start`
			`+tag function`

			`p Find a token in a #[code TokenC*] array by the offset of its first character.`

			`+aside-code("Example").`
			`from spacy.tokens.doc cimport Doc, token_by_start`
			`from spacy.vocab cimport Vocab`

			`doc = Doc(Vocab(), words=[u'hello', u'world'])`
			`assert token_by_start(doc.c, doc.length, 6) == 1`
			`assert token_by_start(doc.c, doc.length, 4) == -1`

			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code tokens]`
			`+cell #[code const TokenC*]`
			`+cell A #[code TokenC*] array.`

			`+row`
			`+cell #[code length]`
			`+cell #[code int]`
			`+cell The number of tokens in the array.`

			`+row`
			`+cell #[code start_char]`
			`+cell #[code int]`
			`+cell The start index to search for.`

			`+row("foot")`
			`+cell returns`
			`+cell #[code int]`
			`+cell The index of the token in the array or #[code -1] if not found.`

			`+h(3, "token_by_end", "spacy/tokens/doc.pxd") token_by_end`
			`+tag function`

			`p Find a token in a #[code TokenC*] array by the offset of its final character.`

			`+aside-code("Example").`
			`from spacy.tokens.doc cimport Doc, token_by_end`
			`from spacy.vocab cimport Vocab`

			`doc = Doc(Vocab(), words=[u'hello', u'world'])`
			`assert token_by_end(doc.c, doc.length, 5) == 0`
			`assert token_by_end(doc.c, doc.length, 1) == -1`

			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code tokens]`
			`+cell #[code const TokenC*]`
			`+cell A #[code TokenC*] array.`

			`+row`
			`+cell #[code length]`
			`+cell #[code int]`
			`+cell The number of tokens in the array.`

			`+row`
			`+cell #[code end_char]`
			`+cell #[code int]`
			`+cell The end index to search for.`

			`+row("foot")`
			`+cell returns`
			`+cell #[code int]`
			`+cell The index of the token in the array or #[code -1] if not found.`

			`+h(3, "set_children_from_heads", "spacy/tokens/doc.pxd") set_children_from_heads`
			`+tag function`

			`p`
			`\| Set attributes that allow lookup of syntactic children on a`
			`\| #[code TokenC*] array. This function must be called after making changes`
			`\| to the #[code TokenC.head] attribute, in order to make the parse tree`
			`\| navigation consistent.`

			`+aside-code("Example").`
			`from spacy.tokens.doc cimport Doc, set_children_from_heads`
			`from spacy.vocab cimport Vocab`

			`doc = Doc(Vocab(), words=[u'Baileys', u'from', u'a', u'shoe'])`
			`doc.c[0].head = 0`
			`doc.c[1].head = 0`
			`doc.c[2].head = 3`
			`doc.c[3].head = 1`
			`set_children_from_heads(doc.c, doc.length)`
			`assert doc.c[3].l_kids == 1`

			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code tokens]`
			`+cell #[code const TokenC*]`
			`+cell A #[code TokenC*] array.`

			`+row`
			`+cell #[code length]`
			`+cell #[code int]`
			`+cell The number of tokens in the array.`