spaCy/website/api/lexeme.jade

302 lines
7.5 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//- 💫 DOCS > API > LEXEME
include ../_includes/_mixins
p
| An entry in the vocabulary. A #[code Lexeme] has no string context it's
| a word type, as opposed to a word token. It therefore has no
| part-of-speech tag, dependency parse, or lemma (if lemmatization depends
| on the part-of-speech tag).
+h(2, "init") Lexeme.__init__
+tag method
p Create a #[code Lexeme] object.
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell The parent vocabulary.
+row
+cell #[code orth]
+cell int
+cell The orth id of the lexeme.
+row("foot")
+cell returns
+cell #[code Lexeme]
+cell The newly constructed object.
+h(2, "set_flag") Lexeme.set_flag
+tag method
p Change the value of a boolean flag.
+aside-code("Example").
COOL_FLAG = nlp.vocab.add_flag(lambda text: False)
nlp.vocab[u'spaCy'].set_flag(COOL_FLAG, True)
+table(["Name", "Type", "Description"])
+row
+cell #[code flag_id]
+cell int
+cell The attribute ID of the flag to set.
+row
+cell #[code value]
+cell bool
+cell The new value of the flag.
+h(2, "check_flag") Lexeme.check_flag
+tag method
p Check the value of a boolean flag.
+aside-code("Example").
is_my_library = lambda text: text in ['spaCy', 'Thinc']
MY_LIBRARY = nlp.vocab.add_flag(is_my_library)
assert nlp.vocab[u'spaCy'].check_flag(MY_LIBRARY) == True
+table(["Name", "Type", "Description"])
+row
+cell #[code flag_id]
+cell int
+cell The attribute ID of the flag to query.
+row("foot")
+cell returns
+cell bool
+cell The value of the flag.
+h(2, "similarity") Lexeme.similarity
+tag method
+tag-model("vectors")
p Compute a semantic similarity estimate. Defaults to cosine over vectors.
+aside-code("Example").
apple = nlp.vocab[u'apple']
orange = nlp.vocab[u'orange']
apple_orange = apple.similarity(orange)
orange_apple = orange.similarity(apple)
assert apple_orange == orange_apple
+table(["Name", "Type", "Description"])
+row
+cell other
+cell -
+cell
| The object to compare with. By default, accepts #[code Doc],
| #[code Span], #[code Token] and #[code Lexeme] objects.
+row("foot")
+cell returns
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "has_vector") Lexeme.has_vector
+tag property
+tag-model("vectors")
p
| A boolean value indicating whether a word vector is associated with the
| lexeme.
+aside-code("Example").
apple = nlp.vocab[u'apple']
assert apple.has_vector
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell bool
+cell Whether the lexeme has a vector data attached.
+h(2, "vector") Lexeme.vector
+tag property
+tag-model("vectors")
p A real-valued meaning representation.
+aside-code("Example").
apple = nlp.vocab[u'apple']
assert apple.vector.dtype == 'float32'
assert apple.vector.shape == (300,)
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the lexeme's semantics.
+h(2, "vector_norm") Lexeme.vector_norm
+tag property
+tag-model("vectors")
p The L2 norm of the lexeme's vector representation.
+aside-code("Example").
apple = nlp.vocab[u'apple']
pasta = nlp.vocab[u'pasta']
apple.vector_norm # 7.1346845626831055
pasta.vector_norm # 7.759851932525635
assert apple.vector_norm != pasta.vector_norm
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell float
+cell The L2 norm of the vector representation.
+h(2, "attributes") Attributes
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell
+row
+cell #[code text]
+cell unicode
+cell Verbatim text content.
+row
+cell #[code lex_id]
+cell int
+cell ID of the lexeme's lexical type.
+row
+cell #[code lower]
+cell int
+cell Lower-case form of the word.
+row
+cell #[code lower_]
+cell unicode
+cell Lower-case form of the word.
+row
+cell #[code shape]
+cell int
+cell Transform of the word's string, to show orthographic features.
+row
+cell #[code shape_]
+cell unicode
+cell Transform of the word's string, to show orthographic features.
+row
+cell #[code prefix]
+cell int
+cell Length-N substring from the start of the word. Defaults to #[code N=1].
+row
+cell #[code prefix_]
+cell unicode
+cell Length-N substring from the start of the word. Defaults to #[code N=1].
+row
+cell #[code suffix]
+cell int
+cell Length-N substring from the end of the word. Defaults to #[code N=3].
+row
+cell #[code suffix_]
+cell unicode
+cell Length-N substring from the start of the word. Defaults to #[code N=3].
+row
+cell #[code is_alpha]
+cell bool
+cell
| Does the lexeme consist of alphabetic characters? Equivalent to
| #[code lexeme.text.isalpha()].
+row
+cell #[code is_ascii]
+cell bool
+cell
| Does the lexeme consist of ASCII characters? Equivalent to
| #[code [any(ord(c) >= 128 for c in lexeme.text)]].
+row
+cell #[code is_digit]
+cell bool
+cell
| Does the lexeme consist of digits? Equivalent to
| #[code lexeme.text.isdigit()].
+row
+cell #[code is_lower]
+cell bool
+cell
| Is the lexeme in lowercase? Equivalent to
| #[code lexeme.text.islower()].
+row
+cell #[code is_title]
+cell bool
+cell
| Is the lexeme in titlecase? Equivalent to
| #[code lexeme.text.istitle()].
+row
+cell #[code is_punct]
+cell bool
+cell Is the lexeme punctuation?
+row
+cell #[code is_space]
+cell bool
+cell
| Does the lexeme consist of whitespace characters? Equivalent to
| #[code lexeme.text.isspace()].
+row
+cell #[code like_url]
+cell bool
+cell Does the lexeme resemble a URL?
+row
+cell #[code like_num]
+cell bool
+cell Does the lexeme represent a number? e.g. "10.9", "10", "ten", etc.
+row
+cell #[code like_email]
+cell bool
+cell Does the lexeme resemble an email address?
+row
+cell #[code is_oov]
+cell bool
+cell Is the lexeme out-of-vocabulary?
+row
+cell #[code is_stop]
+cell bool
+cell Is the lexeme part of a "stop list"?
+row
+cell #[code lang]
+cell int
+cell Language of the parent vocabulary.
+row
+cell #[code lang_]
+cell unicode
+cell Language of the parent vocabulary.
+row
+cell #[code prob]
+cell float
+cell Smoothed log probability estimate of lexeme's type.
+row
+cell #[code sentiment]
+cell float
+cell A scalar value indicating the positivity or negativity of the lexeme.