spaCy/website/api/lexeme.jade

384 lines
9.4 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//- 💫 DOCS > API > LEXEME
include ../_includes/_mixins
p
| An entry in the vocabulary. A #[code Lexeme] has no string context it's
| a word type, as opposed to a word token. It therefore has no
| part-of-speech tag, dependency parse, or lemma (if lemmatization depends
| on the part-of-speech tag).
+h(2, "init") Lexeme.__init__
+tag method
p Create a #[code Lexeme] object.
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell The parent vocabulary.
+row
+cell #[code orth]
+cell int
+cell The orth id of the lexeme.
+row("foot")
+cell returns
+cell #[code Lexeme]
+cell The newly constructed object.
+h(2, "set_flag") Lexeme.set_flag
+tag method
p Change the value of a boolean flag.
+aside-code("Example").
COOL_FLAG = nlp.vocab.add_flag(lambda text: False)
nlp.vocab[u'spaCy'].set_flag(COOL_FLAG, True)
+table(["Name", "Type", "Description"])
+row
+cell #[code flag_id]
+cell int
+cell The attribute ID of the flag to set.
+row
+cell #[code value]
+cell bool
+cell The new value of the flag.
+h(2, "check_flag") Lexeme.check_flag
+tag method
p Check the value of a boolean flag.
+aside-code("Example").
is_my_library = lambda text: text in ['spaCy', 'Thinc']
MY_LIBRARY = nlp.vocab.add_flag(is_my_library)
assert nlp.vocab[u'spaCy'].check_flag(MY_LIBRARY) == True
+table(["Name", "Type", "Description"])
+row
+cell #[code flag_id]
+cell int
+cell The attribute ID of the flag to query.
+row("foot")
+cell returns
+cell bool
+cell The value of the flag.
+h(2, "similarity") Lexeme.similarity
+tag method
+tag-model("vectors")
p Compute a semantic similarity estimate. Defaults to cosine over vectors.
+aside-code("Example").
apple = nlp.vocab[u'apple']
orange = nlp.vocab[u'orange']
apple_orange = apple.similarity(orange)
orange_apple = orange.similarity(apple)
assert apple_orange == orange_apple
+table(["Name", "Type", "Description"])
+row
+cell other
+cell -
+cell
| The object to compare with. By default, accepts #[code Doc],
| #[code Span], #[code Token] and #[code Lexeme] objects.
+row("foot")
+cell returns
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "has_vector") Lexeme.has_vector
+tag property
+tag-model("vectors")
p
| A boolean value indicating whether a word vector is associated with the
| lexeme.
+aside-code("Example").
apple = nlp.vocab[u'apple']
assert apple.has_vector
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell bool
+cell Whether the lexeme has a vector data attached.
+h(2, "vector") Lexeme.vector
+tag property
+tag-model("vectors")
p A real-valued meaning representation.
+aside-code("Example").
apple = nlp.vocab[u'apple']
assert apple.vector.dtype == 'float32'
assert apple.vector.shape == (300,)
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the lexeme's semantics.
+h(2, "vector_norm") Lexeme.vector_norm
+tag property
+tag-model("vectors")
p The L2 norm of the lexeme's vector representation.
+aside-code("Example").
apple = nlp.vocab[u'apple']
pasta = nlp.vocab[u'pasta']
apple.vector_norm # 7.1346845626831055
pasta.vector_norm # 7.759851932525635
assert apple.vector_norm != pasta.vector_norm
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell float
+cell The L2 norm of the vector representation.
+h(2, "attributes") Attributes
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell The lexeme's vocabulary.
+row
+cell #[code text]
+cell unicode
+cell Verbatim text content.
+row
+cell #[code orth]
+cell int
+cell ID of the verbatim text content.
+row
+cell #[code orth_]
+cell unicode
+cell
| Verbatim text content (identical to #[code Lexeme.text]). Existst
| mostly for consistency with the other attributes.
+row
+cell #[code lex_id]
+cell int
+cell ID of the lexeme's lexical type.
+row
+cell #[code rank]
+cell int
+cell
| Sequential ID of the lexemes's lexical type, used to index into
| tables, e.g. for word vectors.
+row
+cell #[code flags]
+cell int
+cell Container of the lexeme's binary flags.
+row
+cell #[code norm]
+cell int
+cell The lexemes's norm, i.e. a normalised form of the lexeme text.
+row
+cell #[code norm_]
+cell unicode
+cell The lexemes's norm, i.e. a normalised form of the lexeme text.
+row
+cell #[code lower]
+cell int
+cell Lowercase form of the word.
+row
+cell #[code lower_]
+cell unicode
+cell Lowercase form of the word.
+row
+cell #[code shape]
+cell int
+cell Transform of the word's string, to show orthographic features.
+row
+cell #[code shape_]
+cell unicode
+cell Transform of the word's string, to show orthographic features.
+row
+cell #[code prefix]
+cell int
+cell
| Length-N substring from the start of the word. Defaults to
| #[code N=1].
+row
+cell #[code prefix_]
+cell unicode
+cell
| Length-N substring from the start of the word. Defaults to
| #[code N=1].
+row
+cell #[code suffix]
+cell int
+cell
| Length-N substring from the end of the word. Defaults to
| #[code N=3].
+row
+cell #[code suffix_]
+cell unicode
+cell
| Length-N substring from the start of the word. Defaults to
| #[code N=3].
+row
+cell #[code is_alpha]
+cell bool
+cell
| Does the lexeme consist of alphabetic characters? Equivalent to
| #[code lexeme.text.isalpha()].
+row
+cell #[code is_ascii]
+cell bool
+cell
| Does the lexeme consist of ASCII characters? Equivalent to
| #[code [any(ord(c) >= 128 for c in lexeme.text)]].
+row
+cell #[code is_digit]
+cell bool
+cell
| Does the lexeme consist of digits? Equivalent to
| #[code lexeme.text.isdigit()].
+row
+cell #[code is_lower]
+cell bool
+cell
| Is the lexeme in lowercase? Equivalent to
| #[code lexeme.text.islower()].
+row
+cell #[code is_upper]
+cell bool
+cell
| Is the lexeme in uppercase? Equivalent to
| #[code lexeme.text.isupper()].
+row
+cell #[code is_title]
+cell bool
+cell
| Is the lexeme in titlecase? Equivalent to
| #[code lexeme.text.istitle()].
+row
+cell #[code is_punct]
+cell bool
+cell Is the lexeme punctuation?
+row
+cell #[code is_left_punct]
+cell bool
+cell Is the lexeme a left punctuation mark, e.g. #[code (]?
+row
+cell #[code is_right_punct]
+cell bool
+cell Is the lexeme a right punctuation mark, e.g. #[code )]?
+row
+cell #[code is_space]
+cell bool
+cell
| Does the lexeme consist of whitespace characters? Equivalent to
| #[code lexeme.text.isspace()].
+row
+cell #[code is_bracket]
+cell bool
+cell Is the lexeme a bracket?
+row
+cell #[code is_quote]
+cell bool
+cell Is the lexeme a quotation mark?
+row
+cell #[code is_currency]
+cell bool
+cell Is the lexeme a currency symbol?
+row
+cell #[code like_url]
+cell bool
+cell Does the lexeme resemble a URL?
+row
+cell #[code like_num]
+cell bool
+cell Does the lexeme represent a number? e.g. "10.9", "10", "ten", etc.
+row
+cell #[code like_email]
+cell bool
+cell Does the lexeme resemble an email address?
+row
+cell #[code is_oov]
+cell bool
+cell Is the lexeme out-of-vocabulary?
+row
+cell #[code is_stop]
+cell bool
+cell Is the lexeme part of a "stop list"?
+row
+cell #[code lang]
+cell int
+cell Language of the parent vocabulary.
+row
+cell #[code lang_]
+cell unicode
+cell Language of the parent vocabulary.
+row
+cell #[code prob]
+cell float
+cell Smoothed log probability estimate of the lexeme's type.
+row
+cell #[code cluster]
+cell int
+cell Brown cluster ID.
+row
+cell #[code sentiment]
+cell float
+cell
| A scalar value indicating the positivity or negativity of the
| lexeme.