spaCy/website/api/token.jade

576 lines
14 KiB
Plaintext
Raw Normal View History

2016-10-31 18:04:15 +00:00
//- 💫 DOCS > API > TOKEN
2017-10-03 12:27:22 +00:00
include ../_includes/_mixins
2016-10-31 18:04:15 +00:00
p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
+h(2, "init") Token.__init__
+tag method
p Construct a #[code Token] object.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
token = doc[0]
2017-05-19 17:59:02 +00:00
assert token.text == u'Give'
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A storage container for lexical types.
+row
+cell #[code doc]
+cell #[code Doc]
+cell The parent document.
+row
+cell #[code offset]
+cell int
+cell The index of the token within the document.
2017-10-03 12:27:22 +00:00
+row("foot")
+cell returns
+cell #[code Token]
+cell The newly constructed object.
+h(2, "len") Token.__len__
+tag method
p The number of unicode characters in the token, i.e. #[code len(token.text)].
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
token = doc[0]
assert len(token) == 4
+table(["Name", "Type", "Description"])
2017-10-03 12:27:22 +00:00
+row("foot")
+cell returns
+cell int
+cell The number of unicode characters in the token.
+h(2, "check_flag") Token.check_flag
+tag method
p Check the value of a boolean flag.
+aside-code("Example").
from spacy.attrs import IS_TITLE
doc = nlp(u'Give it back! He pleaded.')
token = doc[0]
2017-05-19 17:59:02 +00:00
assert token.check_flag(IS_TITLE) == True
+table(["Name", "Type", "Description"])
+row
+cell #[code flag_id]
+cell int
+cell The attribute ID of the flag to check.
2017-10-03 12:27:22 +00:00
+row("foot")
+cell returns
+cell bool
+cell Whether the flag is set.
+h(2, "similarity") Token.similarity
+tag method
2017-05-19 18:24:46 +00:00
+tag-model("vectors")
p Compute a semantic similarity estimate. Defaults to cosine over vectors.
+aside-code("Example").
2017-05-19 17:59:02 +00:00
apples, _, oranges = nlp(u'apples and oranges')
apples_oranges = apples.similarity(oranges)
oranges_apples = oranges.similarity(apples)
assert apples_oranges == oranges_apples
+table(["Name", "Type", "Description"])
+row
+cell other
+cell -
+cell
| The object to compare with. By default, accepts #[code Doc],
| #[code Span], #[code Token] and #[code Lexeme] objects.
2017-10-03 12:27:22 +00:00
+row("foot")
+cell returns
+cell float
+cell A scalar similarity score. Higher is more similar.
2017-05-19 17:59:02 +00:00
+h(2, "nbor") Token.nbor
+tag method
2017-05-19 17:59:02 +00:00
p Get a neighboring token.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
give_nbor = doc[0].nbor()
assert give_nbor.text == u'it'
+table(["Name", "Type", "Description"])
+row
2017-05-19 17:59:02 +00:00
+cell #[code i]
+cell int
+cell The relative position of the token to get. Defaults to #[code 1].
2017-10-03 12:27:22 +00:00
+row("foot")
+cell returns
2017-05-19 17:59:02 +00:00
+cell #[code Token]
+cell The token at position #[code self.doc[self.i+i]].
2017-05-19 17:59:02 +00:00
+h(2, "is_ancestor") Token.is_ancestor
+tag method
2017-05-19 18:24:46 +00:00
+tag-model("parse")
p
2017-05-19 17:59:02 +00:00
| Check whether this token is a parent, grandparent, etc. of another
| in the dependency tree.
+aside-code("Example").
2017-05-19 17:59:02 +00:00
doc = nlp(u'Give it back! He pleaded.')
give = doc[0]
it = doc[1]
assert give.is_ancestor(it)
+table(["Name", "Type", "Description"])
2017-05-19 17:59:02 +00:00
+row
+cell descendant
+cell #[code Token]
+cell Another token.
2017-10-03 12:27:22 +00:00
+row("foot")
+cell returns
+cell bool
2017-05-19 17:59:02 +00:00
+cell Whether this token is the ancestor of the descendant.
2017-05-19 17:59:02 +00:00
+h(2, "ancestors") Token.ancestors
+tag property
2017-05-19 18:24:46 +00:00
+tag-model("parse")
2017-05-19 17:59:02 +00:00
p A sequence of the token's syntactic ancestors (parent, grandparent, etc.).
+aside-code("Example").
2017-05-19 17:59:02 +00:00
doc = nlp(u'Give it back! He pleaded.')
it_ancestors = doc[1].ancestors
assert [t.text for t in it_ancestors] == [u'Give']
he_ancestors = doc[4].ancestors
assert [t.text for t in he_ancestors] == [u'pleaded']
+table(["Name", "Type", "Description"])
2017-10-03 12:27:22 +00:00
+row("foot")
2017-05-19 17:59:02 +00:00
+cell yields
+cell #[code Token]
+cell
| A sequence of ancestor tokens such that
| #[code ancestor.is_ancestor(self)].
+h(2, "conjuncts") Token.conjuncts
+tag property
2017-05-19 18:24:46 +00:00
+tag-model("parse")
p A sequence of coordinated tokens, not including the token itself.
2017-05-19 17:59:02 +00:00
+aside-code("Example").
doc = nlp(u'I like apples and oranges')
apples_conjuncts = doc[2].conjuncts
assert [t.text for t in apples_conjuncts] == [u'oranges']
+table(["Name", "Type", "Description"])
2017-10-03 12:27:22 +00:00
+row("foot")
+cell yields
+cell #[code Token]
+cell A coordinated token.
+h(2, "children") Token.children
+tag property
2017-05-19 18:24:46 +00:00
+tag-model("parse")
p A sequence of the token's immediate syntactic children.
2017-05-19 17:59:02 +00:00
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
give_children = doc[0].children
assert [t.text for t in give_children] == [u'it', u'back', u'!']
+table(["Name", "Type", "Description"])
2017-10-03 12:27:22 +00:00
+row("foot")
+cell yields
+cell #[code Token]
+cell A child token such that #[code child.head==self].
+h(2, "subtree") Token.subtree
+tag property
2017-05-19 18:24:46 +00:00
+tag-model("parse")
p A sequence of all the token's syntactic descendants.
2017-05-19 17:59:02 +00:00
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
give_subtree = doc[0].subtree
assert [t.text for t in give_subtree] == [u'Give', u'it', u'back', u'!']
+table(["Name", "Type", "Description"])
2017-10-03 12:27:22 +00:00
+row("foot")
+cell yields
+cell #[code Token]
+cell A descendant token such that #[code self.is_ancestor(descendant)].
2017-05-19 17:59:02 +00:00
+h(2, "has_vector") Token.has_vector
+tag property
2017-05-19 18:24:46 +00:00
+tag-model("vectors")
2017-05-19 17:59:02 +00:00
p
| A boolean value indicating whether a word vector is associated with the
| token.
+aside-code("Example").
doc = nlp(u'I like apples')
apples = doc[2]
assert apples.has_vector
+table(["Name", "Type", "Description"])
2017-10-03 12:27:22 +00:00
+row("foot")
2017-05-19 17:59:02 +00:00
+cell returns
+cell bool
+cell Whether the token has vector data attached.
+h(2, "vector") Token.vector
+tag property
2017-05-19 18:24:46 +00:00
+tag-model("vectors")
2017-05-19 17:59:02 +00:00
p A real-valued meaning representation.
2017-05-19 17:59:02 +00:00
+aside-code("Example").
doc = nlp(u'I like apples')
apples = doc[2]
assert apples.vector.dtype == 'float32'
assert apples.vector.shape == (300,)
+table(["Name", "Type", "Description"])
2017-10-03 12:27:22 +00:00
+row("foot")
2017-05-19 17:59:02 +00:00
+cell returns
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
2017-05-19 17:59:02 +00:00
+cell A 1D numpy array representing the token's semantics.
+h(2, "vector_norm") Token.vector_norm
+tag property
2017-05-19 18:24:46 +00:00
+tag-model("vectors")
2017-05-19 17:59:02 +00:00
p The L2 norm of the token's vector representation.
2017-05-19 17:59:02 +00:00
+aside-code("Example").
doc = nlp(u'I like apples and pasta')
apples = doc[2]
pasta = doc[4]
apples.vector_norm # 6.89589786529541
pasta.vector_norm # 7.759851932525635
assert apples.vector_norm != pasta.vector_norm
+table(["Name", "Type", "Description"])
2017-10-03 12:27:22 +00:00
+row("foot")
2017-05-19 17:59:02 +00:00
+cell returns
+cell float
+cell The L2 norm of the vector representation.
2016-10-31 18:04:15 +00:00
+h(2, "attributes") Attributes
+table(["Name", "Type", "Description"])
+row
+cell #[code text]
+cell unicode
+cell Verbatim text content.
2017-10-03 12:27:22 +00:00
+row
+cell #[code text_with_ws]
+cell unicode
+cell Text content, with trailing space character if present.
+row
+cell #[code whitespace_]
+cell unicode
+cell Trailing space character if present.
2017-10-03 12:27:22 +00:00
+row
+cell #[code orth]
+cell int
+cell ID of the verbatim text content.
+row
+cell #[code orth_]
+cell unicode
+cell
| Verbatim text content (identical to #[code Token.text]). Exists
| mostly for consistency with the other attributes.
2016-10-31 18:04:15 +00:00
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell The vocab object of the parent #[code Doc].
+row
+cell #[code doc]
+cell #[code Doc]
+cell The parent document.
+row
+cell #[code head]
+cell #[code Token]
+cell The syntactic parent, or "governor", of this token.
+row
+cell #[code left_edge]
+cell #[code Token]
+cell The leftmost token of this token's syntactic descendants.
+row
+cell #[code right_edge]
+cell #[code Token]
+cell The rightmost token of this token's syntactic descendants.
2016-10-31 18:04:15 +00:00
+row
+cell #[code i]
+cell int
+cell The index of the token within the parent document.
2016-10-31 18:04:15 +00:00
+row
+cell #[code ent_type]
+cell int
+cell Named entity type.
2016-10-31 18:04:15 +00:00
+row
+cell #[code ent_type_]
+cell unicode
+cell Named entity type.
+row
+cell #[code ent_iob]
+cell int
+cell
2017-05-23 21:15:50 +00:00
| IOB code of named entity tag. #[code 3]
| means the token begins an entity, #[code 1] means it is inside
| an entity, #[code 2] means it is outside an entity, and
| #[code 0] means no entity tag is set.
2016-10-31 18:04:15 +00:00
+row
+cell #[code ent_iob_]
+cell unicode
+cell
| IOB code of named entity tag. #[code "B"]
| means the token begins an entity, #[code "I"] means it is inside
| an entity, #[code "O"] means it is outside an entity, and
2016-10-31 18:04:15 +00:00
| #[code ""] means no entity tag is set.
+row
+cell #[code ent_id]
+cell int
+cell
| ID of the entity the token is an instance of, if any. Usually
| assigned by patterns in the Matcher.
2016-10-31 18:04:15 +00:00
+row
+cell #[code ent_id_]
+cell unicode
+cell
| ID of the entity the token is an instance of, if any. Usually
| assigned by patterns in the Matcher.
2016-10-31 18:04:15 +00:00
+row
+cell #[code lemma]
+cell int
+cell
2017-05-26 10:43:16 +00:00
| Base form of the token, with no inflectional suffixes.
2016-10-31 18:04:15 +00:00
+row
+cell #[code lemma_]
+cell unicode
2017-05-26 10:43:16 +00:00
+cell Base form of the token, with no inflectional suffixes.
2016-10-31 18:04:15 +00:00
+row
+cell #[code lower]
+cell int
2017-05-26 10:43:16 +00:00
+cell Lower-case form of the token.
2016-10-31 18:04:15 +00:00
+row
+cell #[code lower_]
+cell unicode
2017-05-26 10:43:16 +00:00
+cell Lower-case form of the token.
2016-10-31 18:04:15 +00:00
+row
+cell #[code shape]
+cell int
2017-05-26 10:43:16 +00:00
+cell
| Transform of the token's string, to show orthographic features.
| For example, "Xxxx" or "dd".
2016-10-31 18:04:15 +00:00
+row
+cell #[code shape_]
+cell unicode
+cell
2017-05-26 10:43:16 +00:00
| Transform of the token's string, to show orthographic features.
| For example, "Xxxx" or "dd".
2016-10-31 18:04:15 +00:00
+row
+cell #[code prefix]
+cell int
+cell
| Hash value of a length-N substring from the start of the
2017-05-26 10:43:16 +00:00
| token. Defaults to #[code N=1].
2016-10-31 18:04:15 +00:00
+row
+cell #[code prefix_]
+cell unicode
+cell
2017-05-26 10:43:16 +00:00
| A length-N substring from the start of the token. Defaults to
2016-10-31 18:04:15 +00:00
| #[code N=1].
+row
+cell #[code suffix]
+cell int
+cell
| Hash value of a length-N substring from the end of the token.
| Defaults to #[code N=3].
2016-10-31 18:04:15 +00:00
+row
+cell #[code suffix_]
+cell unicode
2017-05-26 10:43:16 +00:00
+cell Length-N substring from the end of the token. Defaults to #[code N=3].
2016-10-31 18:04:15 +00:00
+row
+cell #[code is_alpha]
+cell bool
2017-05-26 10:43:16 +00:00
+cell
| Does the token consist of alphabetic characters? Equivalent to
| #[code token.text.isalpha()].
2016-10-31 18:04:15 +00:00
+row
+cell #[code is_ascii]
+cell bool
2017-05-26 10:43:16 +00:00
+cell
| Does the token consist of ASCII characters? Equivalent to
| #[code not any(ord(c) >= 128 for c in token.text)].
2016-10-31 18:04:15 +00:00
+row
+cell #[code is_digit]
+cell bool
2017-05-26 10:43:16 +00:00
+cell
| Does the token consist of digits? Equivalent to
| #[code token.text.isdigit()].
2016-10-31 18:04:15 +00:00
+row
+cell #[code is_lower]
+cell bool
2017-05-26 10:43:16 +00:00
+cell
| Is the token in lowercase? Equivalent to
| #[code token.text.islower()].
2016-10-31 18:04:15 +00:00
2017-10-07 13:04:16 +00:00
+row
+cell #[code is_upper]
+cell bool
+cell
| Is the token in uppercase? Equivalent to
| #[code token.text.isupper()].
2016-10-31 18:04:15 +00:00
+row
+cell #[code is_title]
+cell bool
2017-05-26 10:43:16 +00:00
+cell
| Is the token in titlecase? Equivalent to
| #[code token.text.istitle()].
2016-10-31 18:04:15 +00:00
+row
+cell #[code is_punct]
+cell bool
2017-05-26 10:43:16 +00:00
+cell Is the token punctuation?
2016-10-31 18:04:15 +00:00
+row
+cell #[code is_space]
+cell bool
2017-05-26 10:43:16 +00:00
+cell
| Does the token consist of whitespace characters? Equivalent to
| #[code token.text.isspace()].
2016-10-31 18:04:15 +00:00
+row
+cell #[code like_url]
+cell bool
2017-05-26 10:43:16 +00:00
+cell Does the token resemble a URL?
2016-10-31 18:04:15 +00:00
+row
+cell #[code like_num]
+cell bool
2017-05-26 10:43:16 +00:00
+cell Does the token represent a number? e.g. "10.9", "10", "ten", etc.
2016-10-31 18:04:15 +00:00
+row
+cell #[code like_email]
+cell bool
2017-05-26 10:43:16 +00:00
+cell Does the token resemble an email address?
2016-10-31 18:04:15 +00:00
+row
+cell #[code is_oov]
+cell bool
2017-05-26 10:43:16 +00:00
+cell Is the token out-of-vocabulary?
2016-10-31 18:04:15 +00:00
+row
+cell #[code is_stop]
+cell bool
2017-05-26 10:43:16 +00:00
+cell Is the token part of a "stop list"?
2016-10-31 18:04:15 +00:00
+row
+cell #[code pos]
+cell int
+cell Coarse-grained part-of-speech.
+row
+cell #[code pos_]
+cell unicode
+cell Coarse-grained part-of-speech.
+row
+cell #[code tag]
+cell int
+cell Fine-grained part-of-speech.
+row
+cell #[code tag_]
+cell unicode
+cell Fine-grained part-of-speech.
+row
+cell #[code dep]
+cell int
+cell Syntactic dependency relation.
+row
+cell #[code dep_]
+cell unicode
+cell Syntactic dependency relation.
+row
+cell #[code lang]
+cell int
+cell Language of the parent document's vocabulary.
+row
+cell #[code lang_]
+cell unicode
+cell Language of the parent document's vocabulary.
+row
+cell #[code prob]
+cell float
+cell Smoothed log probability estimate of token's type.
+row
+cell #[code idx]
+cell int
+cell The character offset of the token within the parent document.
+row
+cell #[code sentiment]
+cell float
+cell A scalar value indicating the positivity or negativity of the token.
+row
+cell #[code lex_id]
+cell int
+cell ID of the token's lexical type.