spaCy/website/docs/_api-token.jade

//- ----------------------------------
//- 💫 DOCS > API > TOKEN
//- ----------------------------------

+section("token")
    +h(2, "token", "https://github.com/" + SOCIAL.github + "/spaCy/blob/master/spacy/tokens/token.pyx")
        | #[+tag class] Token

    p.
        A Token represents a single word, punctuation or significant whitespace
        symbol. Integer IDs are provided for all string features. The (unicode)
        string is provided by an attribute of the same name followed by an underscore,
        e.g. #[code token.orth] is an integer ID, #[code token.orth_] is the unicode
        value. The only exception is the #[code token.text] attribute, which is (unicode)
        string-typed.

    +section("token-init")
        +h(3, "token-init")
            | Token.__init__

        +code("python", "Definition").
            def __init__(vocab, doc, offset):
                return Token()

        +table(["Name", "Type", "Description"])
                +row
                    +cell vocab
                    +cell Vocab
                    +cell A Vocab object

                +row
                    +cell doc
                    +cell Doc
                    +cell The parent sequence

                +row
                    +cell offset
                    +cell #[+a(link_int) int]
                    +cell The index of the token within the document

    +section("token-stringfeatures")
        +h(3, "token-stringfeatures")
            | String Features

        +table(["Name", "Description"])
            +row
                +cell lemma / lemma_
                +cell.
                    The "base" of the word, with no inflectional suffixes, e.g.
                    the lemma of "developing" is "develop", the lemma of "geese"
                    is "goose", etc.  Note that #[em derivational] suffixes are
                    not stripped, e.g. the lemma of "institutions" is "institution",
                    not "institute".  Lemmatization is performed using the WordNet
                    data, but extended to also cover closed-class words such as
                    pronouns.  By default, the WN lemmatizer returns "hi" as the
                    lemma of "his". We assign pronouns the lemma #[code -PRON-].

            +row
                +cell orth / orth_
                +cell.
                    The form of the word with no string normalization or processing,
                    as it appears in the string, without trailing whitespace.

            +row
                +cell lower / lower_
                +cell.
                    The form of the word, but forced to lower-case, i.e.
                    #[code lower = word.orth_.lower()]

            +row
                +cell shape / shape_
                +cell.
                    A transform of the word's string, to show orthographic features.
                    The characters a-z are mapped to x, A-Z is mapped to X, 0-9
                    is mapped to d. After these mappings, sequences of 4 or more
                    of the same character are truncated to length 4. Examples:
                    C3Po --&gt; XdXx, favorite --&gt; xxxx, :) --&gt; :)

            +row
                +cell prefix / prefix_
                +cell.
                    A length-N substring from the start of the word. Length may
                    vary by language; currently for English n=1, i.e.
                    #[code prefix = word.orth_[:1]]

            +row
                +cell suffix / suffix_
                +cell.
                    A length-N substring from the end of the word. Length may
                    vary by language; currently for English n=3, i.e.
                    #[code suffix = word.orth_[-3:]]

    +section("token-booleanflags")
        +h(3, "token-booleanflags")
            | Boolean Flags

        +table(["Name", "Description"])
            +row
                +cell is_alpha
                +cell.
                    Equivalent to #[code word.orth_.isalpha()]

            +row
                +cell is_ascii
                +cell.
                    Equivalent to #[code all(ord(c) &lt; 128 for c in word.orth_)]

            +row
                +cell is_digit
                +cell.
                    Equivalent to #[code word.orth_.isdigit()]

            +row
                +cell is_lower
                +cell.
                    Equivalent to #[code word.orth_.islower()]

            +row
                +cell is_title
                +cell.
                    Equivalent to #[code word.orth_.istitle()]

            +row
                +cell is_punct
                +cell.
                    Is the word punctuation? Note that Python strings have no
                    built-in #[code ispunct()] method, so there is no direct
                    #[code str] equivalent.

            +row
                +cell is_space
                +cell.
                    Equivalent to #[code word.orth_.isspace()]

            +row
                +cell like_url
                +cell.
                    Does the word resemble a URL?

            +row
                +cell like_num
                +cell.
                    Does the word represent a number? e.g. “10.9”, “10”, “ten”, etc.

            +row
                +cell like_email
                +cell.
                    Does the word resemble an email?

            +row
                +cell is_oov
                +cell.
                    Is the word out-of-vocabulary?

            +row
                +cell is_stop
                +cell.
                    Is the word part of a "stop list"? Stop lists are used to
                    improve the quality of topic models, by filtering out common,
                    domain-general words.

    +section("token-distributional")
        +h(3, "token-distributional")
            | Distributional Features

        +table(["Name", "Description"])
            +row
                +cell prob
                +cell.
                    The unigram log-probability of the word, estimated from
                    counts from a large corpus, smoothed using Simple Good Turing
                    estimation.

            +row
                +cell cluster
                +cell.
                    The Brown cluster ID of the word. These are often useful features
                    for linear models. If you’re using a non-linear model, particularly
                    a neural net or random forest, consider using the real-valued
                    word representation vector, in #[code Token.vector], instead.

            +row
                +cell vector
                +cell.
                    A "word embedding" representation: a dense real-valued vector
                    that supports similarity queries between words. By default,
                    spaCy currently loads vectors produced by the Levy and
                    Goldberg (2014) dependency-based word2vec model.

            +row
                +cell has_vector
                +cell.
                    A boolean value indicating whether a word vector is associated
                    with the token.

    +section("token-alignment")
        +h(3, "token-alignment")
            | Alignment and Output

        +table(["Name", "Description"])
            +row
                +cell idx
                +cell.
                    Start index of the token in the string

            +row
                +cell len(token)
                +cell.
                    Length of the token's orth string, in unicode code-points.

            +row
                +cell unicode(token)
                +cell.
                    Same as #[code token.orth_].

            +row
                +cell str(token)
                +cell.
                    In Python 3, returns #[code token.orth_]. In Python 2, returns
                    #[code token.orth_.encode('utf8')].

            +row
                +cell text
                +cell.
                    An alias for #[code token.orth_].

            +row
                +cell text_with_ws
                +cell.
                    #[code token.orth_ + token.whitespace_], i.e. the form of the
                    word as it appears in the string, including trailing whitespace. This is
                    useful when you need to use linguistic features to add inline
                    mark-up to the string.

            +row
                +cell whitespace_
                +cell.
                    The trailing whitespace string that follows the word in the
                    string, if any (empty if the word has no trailing whitespace).

    +section("token-postags")
        +h(3, "token-postags")
            | Part-of-Speech Tags

        +table(["Name", "Description"])
            +row
                +cell pos / pos_
                +cell.
                    A coarse-grained, less detailed tag that represents the
                    word-class of the token. The set of #[code .pos] tags is
                    consistent across languages. The available tags are #[code ADJ],
                    #[code ADP], #[code ADV], #[code AUX], #[code CONJ], #[code DET],
                    #[code INTJ], #[code NOUN], #[code NUM], #[code PART],
                    #[code PRON], #[code PROPN], #[code PUNCT], #[code SCONJ],
                    #[code SYM], #[code VERB], #[code X], #[code EOL], #[code SPACE].

            +row
                +cell tag / tag_
                +cell.
                    A fine-grained, more detailed tag that represents the
                    word-class and some basic morphological information for the
                    token. These tags are primarily designed to be good features
                    for subsequent models, particularly the syntactic parser.
                        They are language and treebank dependent. The tagger is
                        trained to predict these fine-grained tags, and then a
                        mapping table is used to reduce them to the coarse-grained
                        #[code .pos] tags.

    +section("token-navigating")
        +h(3, "token-navigating") Navigating the Parse Tree

        +table(["Name", "Description"])
            +row
                +cell dep / dep_
                +cell.
                    The syntactic relation type, aka the dependency label, connecting the word to its head.
            +row
                +cell head
                +cell.
                    The immediate syntactic head of the token. If the token is the
                    root of its sentence, it is the token itself, i.e.
                    #[code root_token.head is root_token].

            +row
                +cell children
                +cell.
                    An iterator that yields from lefts, and then yields from rights.

            +row
                +cell subtree
                +cell.
                    An iterator for the part of the sentence syntactically governed
                    by the word, including the word itself.

            +row
                +cell left_edge
                +cell.
                    The leftmost edge of the token's subtree.

            +row
                +cell right_edge
                +cell.
                    The rightmost edge of the token's subtree.

            +row
                +cell nbor(i=1)
                +cell.
                    Get the #[code i]#[sup th] next / previous neighboring token.

    +section("token-namedentities")
        +h(3, "token-namedentities")
            | Named Entity Recognition

        +table(["Name", "Description"])
            +row
                +cell ent_type
                +cell.
                    If the token is part of an entity, its entity type.

            +row
                +cell ent_iob
                +cell.
                    The IOB (inside, outside, begin) entity recognition tag for
                    the token.
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								//- ----------------------------------
 								//- 💫 DOCS > API > TOKEN
 								//- ----------------------------------
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								+section("token")
 								    +h(2, "token", "https://github.com/" + SOCIAL.github + "/spaCy/blob/master/spacy/tokens/token.pyx")
 								        | #[+tag class] Token
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
 								    p.
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								        A Token represents a single word, punctuation or significant whitespace
 								        symbol. Integer IDs are provided for all string features. The (unicode)
 								        string is provided by an attribute of the same name followed by an underscore,
 								        e.g. #[code token.orth] is an integer ID, #[code token.orth_] is the unicode
 								        value. The only exception is the #[code token.text] attribute, which is (unicode)
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								        string-typed.
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								    +section("token-init")
 								        +h(3, "token-init")
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								            | Token.__init__
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								        +code("python", "Definition").
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								            def __init__(vocab, doc, offset):
 								                return Token()
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								        +table(["Name", "Type", "Description"])
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								                +row
 								                    +cell vocab
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								                    +cell Vocab
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								                    +cell A Vocab object
 								                +row
 								                    +cell doc
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								                    +cell Doc
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								                    +cell The parent sequence
 								                +row
 								                    +cell offset
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								                    +cell #[+a(link_int) int]
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								                    +cell The index of the token within the document
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								    +section("token-stringfeatures")
 								        +h(3, "token-stringfeatures")
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								            | String Features
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								        +table(["Name", "Description"])
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								            +row
 								                +cell lemma / lemma_
 								                +cell.
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								                    The "base" of the word, with no inflectional suffixes, e.g.
 								                    the lemma of "developing" is "develop", the lemma of "geese"
 								                    is "goose", etc.  Note that #[em derivational] suffixes are
 								                    not stripped, e.g. the lemma of "instutitions" is "institution",
 								                    not "institute".  Lemmatization is performed using the WordNet
 								                    data, but extended to also cover closed-class words such as
 								                    pronouns.  By default, the WN lemmatizer returns "hi" as the
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								                    lemma of "his". We assign pronouns the lemma #[code -PRON-].
 								            +row
 								                +cell orth / orth_
 								                +cell.
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								                    The form of the word with no string normalization or processing,
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								                    as it appears in the string, without trailing whitespace.
 								            +row
 								                +cell lower / lower_
 								                +cell.
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								                    The form of the word, but forced to lower-case, i.e.
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								                    #[code lower = word.orth_.lower()]
 								            +row
 								                +cell shape / shape_
 								                +cell.
 								                    A transform of the word's string, to show orthographic features.
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								                    The characters a-z are mapped to x, A-Z is mapped to X, 0-9
 								                    is mapped to d. After these mappings, sequences of 4 or more
 								                    of the same character are truncated to length 4. Examples:
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								                    C3Po --&gt; XdXx, favorite --&gt; xxxx, :) --&gt; :)
 								            +row
 								                +cell prefix / prefix_
 								                +cell.
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								                    A length-N substring from the start of the word. Length may
 								                    vary by language; currently for English n=1, i.e.
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								                    #[code prefix = word.orth_[:1]]
 								            +row
 								                +cell suffix / suffix_
 								                +cell.
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								                    A length-N substring from the end of the word. Length may
 								                    vary by language; currently for English n=3, i.e.
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								                    #[code suffix = word.orth_[-3:]]
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								    +section("token-booleanflags")
 								        +h(3, "token-booleanflags")
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								            | Boolean Flags
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								        +table(["Name", "Description"])
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								            +row
 								                +cell is_alpha
 								                +cell.
 								                    Equivalent to #[code word.orth_.isalpha()]
 								            +row
 								                +cell is_ascii
 								                +cell.
 								                    Equivalent to any(ord(c) >= 128 for c in word.orth_)]
 								            +row
 								                +cell is_digit
 								                +cell.
 								                    Equivalent to #[code word.orth_.isdigit()]
 								            +row
 								                +cell is_lower
 								                +cell.
 								                    Equivalent to #[code word.orth_.islower()]
 								            +row
 								                +cell is_title
 								                +cell.
 								                    Equivalent to #[code word.orth_.istitle()]
 								            +row
 								                +cell is_punct
 								                +cell.
 								                    Equivalent to #[code word.orth_.ispunct()]
 								            +row
 								                +cell is_space
 								                +cell.
 								                    Equivalent to #[code word.orth_.isspace()]
 								            +row
 								                +cell like_url
 								                +cell.
 								                    Does the word resemble a URL?
 								            +row
 								                +cell like_num
 								                +cell.
 								                    Does the word represent a number? e.g. “10.9”, “10”, “ten”, etc.
 								            +row
 								                +cell like_email
 								                +cell.
 								                    Does the word resemble an email?
 								            +row
 								                +cell is_oov
 								                +cell.
 								                    Is the word out-of-vocabulary?
 								            +row
 								                +cell is_stop
 								                +cell.
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								                    Is the word part of a "stop list"? Stop lists are used to
 								                    improve the quality of topic models, by filtering out common,
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								                    domain-general words.
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								    +section("token-distributional")
 								        +h(3, "token-distributional")
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								            | Distributional Features
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								        +table(["Name", "Description"])
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								            +row
 								                +cell prob
 								                +cell.
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								                    The unigram log-probability of the word, estimated from
 								                    counts from a large corpus, smoothed using Simple Good Turing
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								                    estimation.
 								            +row
 								                +cell cluster
 								                +cell.
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								                    The Brown cluster ID of the word. These are often useful features
 								                    for linear models. If you’re using a non-linear model, particularly
 								                        a neural net or random forest, consider using the real-valued
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								                        word representation vector, in #[code Token.repvec], instead.
 								            +row
 								                +cell vector
 								                +cell.
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								                    A "word embedding" representation: a dense real-valued vector
 								                    that supports similarity queries between words. By default,
 								                    spaCy currently loads vectors produced by the Levy and
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								                    Goldberg (2014) dependency-based word2vec model.
 								            +row
 								                +cell has_vector
 								                +cell.
 								                    A boolean value indicating whether a vector.
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								    +section("token-alignment")
 								        +h(3, "token-alignment")
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								            | Alignment and Output
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								        +table(["Name", "Description"])
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								            +row
 								                +cell idx
 								                +cell.
 								                    Start index of the token in the string
 								            +row
 								                +cell len(token)
 								                +cell.
 								                    Length of the token's orth string, in unicode code-points.
 								            +row
 								                +cell unicode(token)
 								                +cell.
 								                    Same as #[code token.orth_].
 								            +row
 								                +cell str(token)
 								                +cell.
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								                    In Python 3, returns #[code token.orth_]. In Python 2, returns
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								                    #[code token.orth_.encode('utf8')].
 								            +row
 								                +cell text
 								                +cell.
 								                    An alias for #[code token.orth_].
 								            +row
 								                +cell text_with_ws
 								                +cell.
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								                    #[code token.orth_ + token.whitespace_], i.e. the form of the
 								                    word as it appears in the string, trailing whitespace. This is
 								                    useful when you need to use linguistic features to add inline
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								                    mark-up to the string.
 								            +row
 								                +cell whitespace_
 								                +cell.
 								                    The number of immediate syntactic children following the word
 								                    in the string.
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								    +section("token-postags")
 								        +h(3, "token-postags")
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								            | Part-of-Speech Tags
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								        +table(["Name", "Description"])
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								            +row
 								                +cell pos / pos_
 								                +cell.
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								                    A coarse-grained, less detailed tag that represents the
 								                    word-class of the token. The set of #[code .pos] tags are
 								                    consistent across languages. The available tags are #[code ADJ],
 								                    #[code ADP], #[code ADV], #[code AUX], #[code CONJ], #[code DET],
 								                    #[code INTJ], #[code NOUN], #[code NUM], #[code PART],
 								                    #[code PRON], #[code PROPN], #[code PUNCT], #[code SCONJ],
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								                    #[code SYM], #[code VERB], #[code X], #[code EOL], #[code SPACE].
 								            +row
 								                +cell tag / tag_
 								                +cell.
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								                    A fine-grained, more detailed tag that represents the
 								                    word-class and some basic morphological information for the
 								                    token. These tags are primarily designed to be good features
 								                    for subsequent models, particularly the syntactic parser.
 								                        They are language and treebank dependent. The tagger is
 								                        trained to predict these fine-grained tags, and then a
 								                        mapping table is used to reduce them to the coarse-grained
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								                        #[code .pos] tags.
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								    +section("token-navigating")
 								        +h(3, "token-navigating") Navigating the Parse Tree
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								        +table(["Name", "Description"])
 								            +row
 								                +cell dep / dep_
 								                +cell.
 								                    The syntactic relation type, aka the dependency label, connecting the word to its head.
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								            +row
 								                +cell head
 								                +cell.
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								                    The immediate syntactic head of the token. If the token is the
 								                    root of its sentence, it is the token itself, i.e.
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								                    #[code root_token.head is root_token].
 								            +row
 								                +cell children
 								                +cell.
 								                    An iterator that yields from lefts, and then yields from rights.
 								            +row
 								                +cell subtree
 								                +cell.
 								                    An iterator for the part of the sentence syntactically governed
 								                    by the word, including the word itself.
 								            +row
 								                +cell left_edge
 								                +cell.
 								                    The leftmost edge of the token's subtree.
 								            +row
 								                +cell right_edge
 								                +cell.
 								                    The rightmost edge of the token's subtree.
 								            +row
 								                +cell nbor(i=1)
 								                +cell.
 								                    Get the #[code i]#[sup th] next / previous neighboring token.
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								    +section("token-namedentities")
 								        +h(3, "token-namedentities")
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								            | Named Entity Recognition
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								        +table(["Name", "Description"])
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								            +row
 								                +cell ent_type
 								                +cell.
 								                    If the token is part of an entity, its entity type.
 								            +row
 								                +cell ent_iob
 								                +cell.
-												Update website

											
										
										
											2016-10-03 18:19:13 +00:00
+								                    The IOB (inside, outside, begin) entity recognition tag for
-												Replace website with new version

											
										
										
											2016-03-31 14:24:48 +00:00
+								                    the token.