mirror of https://github.com/explosion/spaCy.git
195 lines
6.8 KiB
Plaintext
195 lines
6.8 KiB
Plaintext
//- ----------------------------------
|
||
//- 💫 DOCS > API > LEXEME
|
||
//- ----------------------------------
|
||
|
||
+section("lexeme")
|
||
+h(2, "lexeme", "https://github.com/" + SOCIAL.github + "/spaCy/blob/master/spacy/lexeme.pyx")
|
||
| #[+tag class] Lexeme
|
||
|
||
p.
|
||
The Lexeme object represents a lexical type, stored in the vocabulary –
|
||
as opposed to a token, occurring in a document.
|
||
|
||
p.
|
||
Each Token object receives a reference to a lexeme object (specifically,
|
||
it receives a pointer to a #[code LexemeC] struct). This allows features
|
||
to be computed and saved once per type, rather than once per token. As
|
||
job sizes grow, this amounts to substantial efficiency improvements, as
|
||
the vocabulary size (number of types) will be much smaller than the total
|
||
number of words processed (number of tokens).
|
||
|
||
p.
|
||
All Lexeme attributes are therefore context independent, as a single lexeme
|
||
is reused for all usages of that word. Lexemes are keyed by the #[code orth]
|
||
attribute.
|
||
|
||
p.
|
||
Most Lexeme attributes can be set, with the exception of the primary key,
|
||
#[code orth]. Assigning to an attribute of the #[code Lexeme] object writes
|
||
to the underlying struct, so all tokens that are backed by that
|
||
#[code Lexeme] will inherit the new value.
|
||
|
||
+code("python", "Overview").
|
||
class Lexeme:
|
||
def __init__(self, vocab, key):
|
||
return self
|
||
|
||
int rank
|
||
|
||
int orth, lower, shape, prefix, suffix
|
||
|
||
unicode orth_, lower_, shape_, prefix_, suffix_
|
||
|
||
bool is_alpha, is_ascii, is_digit, is_lower, is_title, is_punct, is_space, like_url, like_num, like_email, is_oov, is_stop
|
||
|
||
float prob
|
||
int cluster
|
||
numpy.ndarray[float64] vector
|
||
bool has_vector
|
||
|
||
def set_flag(self, flag_id, value):
|
||
return None
|
||
|
||
def check_flag(self, flag_id):
|
||
return bool
|
||
|
||
def similarity(self, other):
|
||
return float
|
||
|
||
+table(["Example", "Description"])
|
||
+row
|
||
+cell #[code.lang-python lexeme = nlp.vocab[string]]
|
||
+cell Lookup by string
|
||
+row
|
||
+cell #[code.lang-python lexeme = vocab[i]]
|
||
+cell Lookup by integer
|
||
|
||
+section("lexeme-stringfeatures")
|
||
+h(3, "lexeme-stringfeatures").
|
||
String Features
|
||
|
||
+table(["Name", "Description"])
|
||
+row
|
||
+cell orth / orth_
|
||
+cell.
|
||
The form of the word with no string normalization or processing,
|
||
as it appears in the string, without trailing whitespace.
|
||
|
||
+row
|
||
+cell lower / lower_
|
||
+cell.
|
||
The form of the word, but forced to lower-case, i.e.
|
||
#[code lower = word.orth_.lower()]
|
||
|
||
+row
|
||
+cell shape / shape_
|
||
+cell.
|
||
A transform of the word's string, to show orthographic features.
|
||
The characters a-z are mapped to x, A-Z are mapped to X, and 0-9
|
||
are mapped to d. After these mappings, sequences of 4 or more
|
||
of the same character are truncated to length 4. Examples:
|
||
C3Po --> XdXx, favorite --> xxxx, :) --> :)
|
||
|
||
+row
|
||
+cell prefix / prefix_
|
||
+cell.
|
||
A length-N substring from the start of the word. Length may
|
||
vary by language; currently for English n=1, i.e.
|
||
#[code prefix = word.orth_[:1]]
|
||
|
||
+row
|
||
+cell suffix / suffix_
|
||
+cell.
|
||
A length-N substring from the end of the word. Length may vary
|
||
by language; currently for English n=3, i.e.
|
||
#[code suffix = word.orth_[-3:]]
|
||
|
||
+section("lexeme-booleanflags")
|
||
+h(3, "lexeme-booleanflags")
|
||
| Boolean Flags
|
||
|
||
+table(["Name", "Description"])
|
||
+row
|
||
+cell is_alpha
|
||
+cell Equivalent to #[code word.orth_.isalpha()]
|
||
|
||
+row
|
||
+cell is_ascii
|
||
+cell Equivalent to #[code not any(ord(c) >= 128 for c in word.orth_)]
|
||
|
||
+row
|
||
+cell is_digit
|
||
+cell Equivalent to #[code word.orth_.isdigit()]
|
||
|
||
+row
|
||
+cell is_lower
|
||
+cell Equivalent to #[code word.orth_.islower()]
|
||
|
||
+row
|
||
+cell is_title
|
||
+cell Equivalent to #[code word.orth_.istitle()]
|
||
|
||
+row
|
||
+cell is_punct
|
||
+cell Equivalent to #[code word.orth_.ispunct()]
|
||
|
||
+row
|
||
+cell is_space
|
||
+cell Equivalent to #[code word.orth_.isspace()]
|
||
|
||
+row
|
||
+cell like_url
|
||
+cell Does the word resemble a URL?
|
||
|
||
+row
|
||
+cell like_num
|
||
+cell Does the word represent a number? e.g. “10.9”, “10”, “ten”, etc.
|
||
|
||
+row
|
||
+cell like_email
|
||
+cell Does the word resemble an email?
|
||
|
||
+row
|
||
+cell is_oov
|
||
+cell Is the word out-of-vocabulary?
|
||
|
||
+row
|
||
+cell is_stop
|
||
+cell.
|
||
Is the word part of a "stop list"? Stop lists are used to
|
||
improve the quality of topic models, by filtering out common,
|
||
domain-general words.
|
||
|
||
+section("lexeme-distributional")
|
||
+h(3, "lexeme-distributional")
|
||
| Distributional Features
|
||
|
||
+table(["Name", "Description"])
|
||
+row
|
||
+cell prob
|
||
+cell.
|
||
The unigram log-probability of the word, estimated from
|
||
counts from a large corpus, smoothed using Simple Good Turing
|
||
estimation.
|
||
|
||
+row
|
||
+cell cluster
|
||
+cell.
|
||
The Brown cluster ID of the word. These are often useful features
|
||
for linear models. If you’re using a non-linear model, particularly
|
||
a neural net or random forest, consider using the real-valued
|
||
word representation vector, in #[code Lexeme.vector], instead.
|
||
|
||
+row
|
||
+cell vector
|
||
+cell.
|
||
A "word embedding" representation: a dense real-valued vector
|
||
that supports similarity queries between words. By default,
|
||
spaCy currently loads vectors produced by the Levy and
|
||
Goldberg (2014) dependency-based word2vec model.
|
||
|
||
+row
|
||
+cell has_vector
|
||
+cell.
|
||
A boolean value indicating whether a word vector is associated with the lexeme.
|