mirror of https://github.com/explosion/spaCy.git

Tidy up and document Doc, Token and Span

This commit is contained in:
parent 1a559d4c95
commit 6a0483b7aa
@@ -2,4 +2,4 @@ from .doc import Doc
 from .token import Token
 from .span import Span
 
-__all__ = [Doc, Token, Span]
+__all__ = ['Doc', 'Token', 'Span']
@@ -23,9 +23,9 @@ from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
 from ..attrs import intify_attrs, IDS
 from ..attrs cimport attr_id_t
-from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
-from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
-from ..attrs cimport SENT_START
+from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
+from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
+from ..attrs cimport ENT_TYPE, SENT_START
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
 from ..util import normalize_slice
 from ..compat import is_config, copy_reg, pickle
@@ -78,24 +78,25 @@ def _get_chunker(lang):
 
 cdef class Doc:
     """A sequence of Token objects. Access sentences and named entities, export
-    annotations to numpy arrays, losslessly serialize to compressed binary strings.
-    The `Doc` object holds an array of `TokenC` structs. The Python-level
-    `Token` and `Span` objects are views of this array, i.e. they don't own
-    the data themselves.
+    annotations to numpy arrays, losslessly serialize to compressed binary
+    strings. The `Doc` object holds an array of `TokenC` structs. The
+    Python-level `Token` and `Span` objects are views of this array, i.e.
+    they don't own the data themselves.
 
     EXAMPLE: Construction 1
         >>> doc = nlp(u'Some text')
 
        Construction 2
        >>> from spacy.tokens import Doc
-        >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False])
+        >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
+                      spaces=[True, False, False])
    """
    @classmethod
    def set_extension(cls, name, default=None, method=None,
                      getter=None, setter=None):
        nr_defined = sum(t is not None for t in (default, getter, setter, method))
        assert nr_defined == 1
        Underscore.doc_extensions[name] = (default, method, getter, setter)
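A usage sketch for the extension registry above (assuming spaCy v2's custom-attribute API, where registered extensions are exposed on `doc._`; exactly one of `default`, `method`, `getter` or `setter` may be passed, per the assert):

    >>> from spacy.vocab import Vocab
    >>> from spacy.tokens import Doc
    >>> Doc.set_extension('is_greeting', default=False)
    >>> doc = Doc(Vocab(), words=[u'hello', u'world'])
    >>> doc._.is_greeting = True
    >>> doc._.is_greeting
    True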
 
    @classmethod
    def get_extension(cls, name):
@@ -109,15 +110,14 @@ cdef class Doc:
                  orths_and_spaces=None):
        """Create a Doc object.
 
-        vocab (Vocab): A vocabulary object, which must match any models you want
-            to use (e.g. tokenizer, parser, entity recognizer).
+        vocab (Vocab): A vocabulary object, which must match any models you
+            want to use (e.g. tokenizer, parser, entity recognizer).
        words (list or None): A list of unicode strings to add to the document
            as words. If `None`, defaults to empty list.
        spaces (list or None): A list of boolean values, of the same length as
            words. True means that the word is followed by a space, False means
            it is not. If `None`, defaults to `[True]*len(words)`
        user_data (dict or None): Optional extra data to attach to the Doc.
 
        RETURNS (Doc): The newly constructed object.
        """
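A minimal construction sketch using only the arguments documented above; no statistical model is needed (assumes an empty `Vocab` is acceptable for the use case):

    >>> from spacy.vocab import Vocab
    >>> from spacy.tokens import Doc
    >>> doc = Doc(Vocab(), words=[u'hello', u'world'], spaces=[True, False])
    >>> doc.text
    u'hello world'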
        self.vocab = vocab
@@ -153,10 +153,10 @@ cdef class Doc:
                spaces = [True] * len(words)
            elif len(spaces) != len(words):
                raise ValueError(
-                    "Arguments 'words' and 'spaces' should be sequences of the "
-                    "same length, or 'spaces' should be left default at None. "
-                    "spaces should be a sequence of booleans, with True meaning "
-                    "that the word owns a ' ' character following it.")
+                    "Arguments 'words' and 'spaces' should be sequences of "
+                    "the same length, or 'spaces' should be left default at "
+                    "None. spaces should be a sequence of booleans, with True "
+                    "meaning that the word owns a ' ' character following it.")
            orths_and_spaces = zip(words, spaces)
        if orths_and_spaces is not None:
            for orth_space in orths_and_spaces:
@@ -166,7 +166,8 @@ cdef class Doc:
                elif isinstance(orth_space, bytes):
                    raise ValueError(
                        "orths_and_spaces expects either List(unicode) or "
-                        "List((unicode, bool)). Got bytes instance: %s" % (str(orth_space)))
+                        "List((unicode, bool)). "
+                        "Got bytes instance: %s" % (str(orth_space)))
                else:
                    orth, has_space = orth_space
                # Note that we pass self.mem here --- we have ownership, if LexemeC
@@ -186,7 +187,8 @@ cdef class Doc:
    def __getitem__(self, object i):
        """Get a `Token` or `Span` object.
 
-        i (int or tuple) The index of the token, or the slice of the document to get.
+        i (int or tuple): The index of the token, or the slice of the document
+            to get.
        RETURNS (Token or Span): The token at `doc[i]`, or the span at
            `doc[start : end]`.
 
@@ -199,11 +201,11 @@ cdef class Doc:
            >>> doc[start : end]
            Get a `Span` object, starting at position `start` and ending at
            position `end`, where `start` and `end` are token indices. For
-            instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and 4.
-            Stepped slices (e.g. `doc[start : end : step]`) are not supported,
-            as `Span` objects must be contiguous (cannot have gaps). You can use
-            negative indices and open-ended ranges, which have their normal
-            Python semantics.
+            instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and
+            4. Stepped slices (e.g. `doc[start : end : step]`) are not
+            supported, as `Span` objects must be contiguous (cannot have gaps).
+            You can use negative indices and open-ended ranges, which have
+            their normal Python semantics.
        """
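For example, under the slice semantics described above (assuming a `doc` with at least five tokens):

    >>> span = doc[2:5]   # a Span over tokens 2, 3 and 4
    >>> span = doc[-3:]   # negative and open-ended indices work
    >>> doc[1:5:2]        # stepped slice: raises ValueError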
        if isinstance(i, slice):
            start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
@@ -262,8 +264,10 @@ cdef class Doc:
        doc (Doc): The parent document.
        start (int): The index of the first character of the span.
        end (int): The index of the first character after the span.
-        label (uint64 or string): A label to attach to the Span, e.g. for named entities.
-        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
+        label (uint64 or string): A label to attach to the Span, e.g. for
+            named entities.
+        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
+            the span.
        RETURNS (Span): The newly constructed object.
        """
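The enclosing method's name falls outside this hunk; in spaCy v2 this character-offset lookup is `Doc.char_span`, which returns `None` when the offsets don't align with token boundaries. A hedged sketch, assuming a loaded `nlp` pipeline:

    >>> doc = nlp(u'I like New York')
    >>> span = doc.char_span(7, 15, label=u'GPE')
    >>> span.text
    u'New York'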
        if not isinstance(label, int):
@@ -377,13 +381,14 @@ cdef class Doc:
        return self.text
 
    property ents:
-        """Iterate over the entities in the document. Yields named-entity `Span`
-        objects, if the entity recognizer has been applied to the document.
+        """Iterate over the entities in the document. Yields named-entity
+        `Span` objects, if the entity recognizer has been applied to the
+        document.
 
        YIELDS (Span): Entities in the document.
 
-        EXAMPLE: Iterate over the span to get individual Token objects, or access
-        the label:
+        EXAMPLE: Iterate over the span to get individual Token objects,
+        or access the label:
 
            >>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
            >>> ents = list(tokens.ents)
@@ -456,10 +461,11 @@ cdef class Doc:
 
    property noun_chunks:
        """Iterate over the base noun phrases in the document. Yields base
-        noun-phrase #[code Span] objects, if the document has been syntactically
-        parsed. A base noun phrase, or "NP chunk", is a noun phrase that does
-        not permit other NPs to be nested within it – so no NP-level
-        coordination, no prepositional phrases, and no relative clauses.
+        noun-phrase `Span` objects, if the document has been
+        syntactically parsed. A base noun phrase, or "NP chunk", is a noun
+        phrase that does not permit other NPs to be nested within it – so no
+        NP-level coordination, no prepositional phrases, and no relative
+        clauses.
 
        YIELDS (Span): Noun chunks in the document.
        """
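For instance (assuming a model with a dependency parser is loaded):

    >>> doc = nlp(u'The quick brown fox jumped over the lazy dog')
    >>> [chunk.text for chunk in doc.noun_chunks]
    [u'The quick brown fox', u'the lazy dog']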
@@ -467,12 +473,14 @@ cdef class Doc:
            if not self.is_parsed:
                raise ValueError(
                    "noun_chunks requires the dependency parse, which "
-                    "requires data to be installed. For more info, see the "
+                    "requires a statistical model to be installed and loaded. "
+                    "For more info, see the "
                    "documentation: \n%s\n" % about.__docs_models__)
-            # Accumulate the result before beginning to iterate over it. This prevents
-            # the tokenisation from being changed out from under us during the iteration.
-            # The tricky thing here is that Span accepts its tokenisation changing,
-            # so it's okay once we have the Span objects. See Issue #375
+            # Accumulate the result before beginning to iterate over it. This
+            # prevents the tokenisation from being changed out from under us
+            # during the iteration. The tricky thing here is that Span accepts
+            # its tokenisation changing, so it's okay once we have the Span
+            # objects. See Issue #375.
            spans = []
            for start, end, label in self.noun_chunks_iterator(self):
                spans.append(Span(self, start, end, label=label))
@@ -497,8 +505,9 @@ cdef class Doc:
 
            if not self.is_parsed:
                raise ValueError(
-                    "sentence boundary detection requires the dependency parse, which "
-                    "requires data to be installed. For more info, see the "
+                    "Sentence boundary detection requires the dependency "
+                    "parse, which requires a statistical model to be "
+                    "installed and loaded. For more info, see the "
                    "documentation: \n%s\n" % about.__docs_models__)
            cdef int i
            start = 0
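Once a parse is available, the property yields one `Span` per sentence, e.g. (assuming a loaded model with a parser):

    >>> doc = nlp(u'This is a sentence. Here is another one.')
    >>> [sent.text for sent in doc.sents]
    [u'This is a sentence.', u'Here is another one.']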
@@ -537,12 +546,11 @@ cdef class Doc:
    @cython.boundscheck(False)
    cpdef np.ndarray to_array(self, object py_attr_ids):
        """Export given token attributes to a numpy `ndarray`.
 
-        If `attr_ids` is a sequence of M attributes, the output array will
-        be of shape `(N, M)`, where N is the length of the `Doc`
-        (in tokens). If `attr_ids` is a single attribute, the output shape will
-        be (N,). You can specify attributes by integer ID (e.g. spacy.attrs.LEMMA)
-        or string name (e.g. 'LEMMA' or 'lemma').
+        If `attr_ids` is a sequence of M attributes, the output array will be
+        of shape `(N, M)`, where N is the length of the `Doc` (in tokens). If
+        `attr_ids` is a single attribute, the output shape will be (N,). You
+        can specify attributes by integer ID (e.g. spacy.attrs.LEMMA) or
+        string name (e.g. 'LEMMA' or 'lemma').
 
        attr_ids (list[]): A list of attributes (int IDs or string names).
        RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
@@ -641,13 +649,12 @@ cdef class Doc:
    def from_array(self, attrs, array):
        if SENT_START in attrs and HEAD in attrs:
            raise ValueError(
-                "Conflicting attributes specified in doc.from_array():\n"
+                "Conflicting attributes specified in doc.from_array(): "
                "(HEAD, SENT_START)\n"
-                "The HEAD attribute currently sets sentence boundaries implicitly,\n"
-                "based on the tree structure. This means the HEAD attribute would "
-                "potentially override the sentence boundaries set by SENT_START.\n"
-                "See https://github.com/spacy-io/spaCy/issues/235 for details and "
-                "workarounds, and to propose solutions.")
+                "The HEAD attribute currently sets sentence boundaries "
+                "implicitly, based on the tree structure. This means the HEAD "
+                "attribute would potentially override the sentence boundaries "
+                "set by SENT_START.")
        cdef int i, col
        cdef attr_id_t attr_id
        cdef TokenC* tokens = self.c
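A round-trip sketch for `to_array`/`from_array` that avoids passing HEAD and SENT_START together, per the check above (assumes a loaded `nlp` pipeline):

    >>> from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
    >>> doc = nlp(u'Hello world')
    >>> array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
    >>> doc2 = Doc(doc.vocab, words=[t.text for t in doc])
    >>> doc2 = doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], array)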
@@ -675,18 +682,14 @@ cdef class Doc:
        return self
 
    def get_lca_matrix(self):
-        '''
-        Calculates the lowest common ancestor matrix
-        for a given Spacy doc.
-        Returns LCA matrix containing the integer index
-        of the ancestor, or -1 if no common ancestor is
-        found (ex if span excludes a necessary ancestor).
-        Apologies about the recursion, but the
-        impact on performance is negligible given
-        the natural limitations on the depth of a typical human sentence.
-        '''
+        """Calculates the lowest common ancestor matrix for a given `Doc`.
+        Returns LCA matrix containing the integer index of the ancestor, or -1
+        if no common ancestor is found (ex if span excludes a necessary
+        ancestor). Apologies about the recursion, but the impact on
+        performance is negligible given the natural limitations on the depth
+        of a typical human sentence.
+        """
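A shape sketch (the cell values depend on the parse; compare the worked `Span.get_lca_matrix` example added to the docs later in this commit):

    >>> doc = nlp(u'I like New York in Autumn')
    >>> matrix = doc.get_lca_matrix()
    >>> matrix.shape   # (len(doc), len(doc)), dtype int32
    (6, 6)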
        # Efficiency notes:
        #
        # We can easily improve the performance here by iterating in Cython.
        # To loop over the tokens in Cython, the easiest way is:
        # for token in doc.c[:doc.c.length]:
@@ -719,7 +722,6 @@ cdef class Doc:
                token_k = self[k]
                lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix)
                lca_matrix[k][j] = lca_matrix[j][k]
-
        return lca_matrix
 
    def to_disk(self, path, **exclude):
@@ -819,14 +821,15 @@ cdef class Doc:
        return self
 
    def merge(self, int start_idx, int end_idx, *args, **attributes):
-        """Retokenize the document, such that the span at `doc.text[start_idx : end_idx]`
-        is merged into a single token. If `start_idx` and `end_idx `do not mark
-        start and end token boundaries, the document remains unchanged.
+        """Retokenize the document, such that the span at
+        `doc.text[start_idx : end_idx]` is merged into a single token. If
+        `start_idx` and `end_idx` do not mark start and end token boundaries,
+        the document remains unchanged.
 
-        start_idx (int): The character index of the start of the slice to merge.
-        end_idx (int): The character index after the end of the slice to merge.
+        start_idx (int): Character index of the start of the slice to merge.
+        end_idx (int): Character index after the end of the slice to merge.
        **attributes: Attributes to assign to the merged token. By default,
-            attributes are inherited from the syntactic root token of the span.
+            attributes are inherited from the syntactic root of the span.
        RETURNS (Token): The newly merged token, or `None` if the start and end
            indices did not fall at token boundaries.
        """
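A sketch of the keyword-argument calling convention (assuming a loaded `nlp` pipeline; the attribute values are illustrative):

    >>> doc = nlp(u'Los Angeles started.')
    >>> token = doc.merge(0, len(u'Los Angeles'), tag=u'NNP',
    ...                   lemma=u'Los Angeles', ent_type=u'GPE')
    >>> doc[0].text
    u'Los Angeles'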
@@ -847,10 +850,10 @@ cdef class Doc:
            attributes[ENT_TYPE] = attributes['ent_type']
        elif args:
            raise ValueError(
-                "Doc.merge received %d non-keyword arguments. "
-                "Expected either 3 arguments (deprecated), or 0 (use keyword arguments). "
+                "Doc.merge received %d non-keyword arguments. Expected either "
+                "3 arguments (deprecated), or 0 (use keyword arguments). "
                "Arguments supplied:\n%s\n"
-                "Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes)))
+                "Keyword arguments: %s\n" % (len(args), repr(args), repr(attributes)))
 
        # More deprecated attribute handling =/
        if 'label' in attributes:
@@ -882,8 +885,9 @@ cdef class Doc:
            Token.set_struct_attr(token, attr_name, attr_value)
        # Begin by setting all the head indices to absolute token positions
        # This is easier to work with for now than the offsets
-        # Before thinking of something simpler, beware the case where a dependency
-        # bridges over the entity. Here the alignment of the tokens changes.
+        # Before thinking of something simpler, beware the case where a
+        # dependency bridges over the entity. Here the alignment of the
+        # tokens changes.
        span_root = span.root.i
        token.dep = span.root.dep
        # We update token.lex after keeping span root and dep, since
@@ -932,8 +936,9 @@ cdef class Doc:
            >>> trees = doc.print_tree()
            >>> trees[1]
            {'modifiers': [
-                {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
-                 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
+                {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice',
+                 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP',
+                 'lemma': 'Alice'},
                {'modifiers': [
                    {'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
                     'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
@@ -1008,7 +1013,7 @@ def pickle_doc(doc):
 
 def unpickle_doc(vocab, hooks_and_data, bytes_data):
     user_data, doc_hooks, span_hooks, token_hooks = dill.loads(hooks_and_data)
 
     doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data,
                                                      exclude='user_data')
     doc.user_hooks.update(doc_hooks)
@@ -1018,4 +1023,3 @@ def unpickle_doc(vocab, hooks_and_data, bytes_data):
 
 
 copy_reg.pickle(Doc, pickle_doc, unpickle_doc)
-
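With the reducer registered above, a `Doc` pickles like any other object; a sketch (assuming `doc` comes from a loaded pipeline):

    >>> import pickle
    >>> doc2 = pickle.loads(pickle.dumps(doc))
    >>> doc2.text == doc.text
    True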
@@ -35,15 +35,16 @@ cdef class Span:
    def has_extension(cls, name):
        return name in Underscore.span_extensions
 
-    def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None,
-                  vector_norm=None):
+    def __cinit__(self, Doc doc, int start, int end, attr_t label=0,
+                  vector=None, vector_norm=None):
        """Create a `Span` object from the slice `doc[start : end]`.
 
        doc (Doc): The parent document.
        start (int): The index of the first token of the span.
        end (int): The index of the first token after the span.
        label (uint64): A label to attach to the Span, e.g. for named entities.
-        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
+        vector (ndarray[ndim=1, dtype='float32']): A meaning representation
+            of the span.
        RETURNS (Span): The newly constructed object.
        """
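Direct construction is equivalent to slicing the doc, e.g. (assuming a tokenized `doc` of at least four tokens):

    >>> from spacy.tokens import Span
    >>> doc = nlp(u'I like New York')
    >>> span = Span(doc, 2, 4, label=doc.vocab.strings[u'GPE'])
    >>> span.text
    u'New York'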
        if not (0 <= start <= end <= len(doc)):
@@ -162,7 +163,8 @@ cdef class Span:
            attributes are inherited from the syntactic root token of the span.
        RETURNS (Token): The newly merged token.
        """
-        return self.doc.merge(self.start_char, self.end_char, *args, **attributes)
+        return self.doc.merge(self.start_char, self.end_char, *args,
+                              **attributes)
 
    def similarity(self, other):
        """Make a semantic similarity estimate. The default estimate is cosine
@@ -179,24 +181,19 @@ cdef class Span:
        return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
 
    def get_lca_matrix(self):
-        '''
-        Calculates the lowest common ancestor matrix
-        for a given Spacy span.
-        Returns LCA matrix containing the integer index
-        of the ancestor, or -1 if no common ancestor is
-        found (ex if span excludes a necessary ancestor).
-        Apologies about the recursion, but the
-        impact on performance is negligible given
-        the natural limitations on the depth of a typical human sentence.
-        '''
-
+        """Calculates the lowest common ancestor matrix for a given `Span`.
+        Returns LCA matrix containing the integer index of the ancestor, or -1
+        if no common ancestor is found (ex if span excludes a necessary
+        ancestor). Apologies about the recursion, but the impact on
+        performance is negligible given the natural limitations on the depth
+        of a typical human sentence.
+        """
        def __pairwise_lca(token_j, token_k, lca_matrix, margins):
            offset = margins[0]
            token_k_head = token_k.head if token_k.head.i in range(*margins) else token_k
            token_j_head = token_j.head if token_j.head.i in range(*margins) else token_j
            token_j_i = token_j.i - offset
            token_k_i = token_k.i - offset
 
            if lca_matrix[token_j_i][token_k_i] != -2:
                return lca_matrix[token_j_i][token_k_i]
            elif token_j == token_k:
@@ -209,23 +206,19 @@ cdef class Span:
                lca_index = -1
            else:
                lca_index = __pairwise_lca(token_j_head, token_k_head, lca_matrix, margins)
 
            lca_matrix[token_j_i][token_k_i] = lca_index
            lca_matrix[token_k_i][token_j_i] = lca_index
 
            return lca_index
 
        lca_matrix = numpy.empty((len(self), len(self)), dtype=numpy.int32)
        lca_matrix.fill(-2)
        margins = [self.start, self.end]
 
        for j in range(len(self)):
            token_j = self[j]
            for k in range(len(self)):
                token_k = self[k]
                lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix, margins)
                lca_matrix[k][j] = lca_matrix[j][k]
 
        return lca_matrix
 
    cpdef np.ndarray to_array(self, object py_attr_ids):
@@ -349,7 +342,8 @@ cdef class Span:
        """The text content of the span with a trailing whitespace character if
        the last token has one.
 
-        RETURNS (unicode): The text content of the span (with trailing whitespace).
+        RETURNS (unicode): The text content of the span (with trailing
+            whitespace).
        """
        def __get__(self):
            return u''.join([t.text_with_ws for t in self])
@@ -358,7 +352,8 @@ cdef class Span:
        """Yields base noun-phrase `Span` objects, if the document has been
        syntactically parsed. A base noun phrase, or "NP chunk", is a noun
        phrase that does not permit other NPs to be nested within it – so no
-        NP-level coordination, no prepositional phrases, and no relative clauses.
+        NP-level coordination, no prepositional phrases, and no relative
+        clauses.
 
        YIELDS (Span): Base noun-phrase `Span` objects
        """
@@ -366,7 +361,8 @@ cdef class Span:
            if not self.doc.is_parsed:
                raise ValueError(
                    "noun_chunks requires the dependency parse, which "
-                    "requires data to be installed. For more info, see the "
+                    "requires a statistical model to be installed and loaded. "
+                    "For more info, see the "
                    "documentation: \n%s\n" % about.__docs_models__)
            # Accumulate the result before beginning to iterate over it. This prevents
            # the tokenisation from being changed out from under us during the iteration.
@@ -385,9 +381,9 @@ cdef class Span:
 
        RETURNS (Token): The root token.
 
-        EXAMPLE: The root token has the shortest path to the root of the sentence
-            (or is the root itself). If multiple words are equally high in the
-            tree, the first word is taken. For example:
+        EXAMPLE: The root token has the shortest path to the root of the
+            sentence (or is the root itself). If multiple words are equally
+            high in the tree, the first word is taken. For example:
 
            >>> toks = nlp(u'I like New York in Autumn.')
 
@@ -437,11 +433,11 @@ cdef class Span:
                if self.doc.c[i].head == 0:
                    return self.doc[i]
            # If we don't have a sentence root, we do something that's not so
-            # algorithmically clever, but I think should be quite fast, especially
-            # for short spans.
+            # algorithmically clever, but I think should be quite fast,
+            # especially for short spans.
            # For each word, we count the path length, and arg min this measure.
-            # We could use better tree logic to save steps here...But I think this
-            # should be okay.
+            # We could use better tree logic to save steps here...But I
+            # think this should be okay.
            cdef int current_best = self.doc.length
            cdef int root = -1
            for i in range(self.start, self.end):
@@ -463,7 +459,7 @@ cdef class Span:
        YIELDS (Token): A left-child of a token of the span.
        """
        def __get__(self):
-            for token in reversed(self): # Reverse, so we get the tokens in order
+            for token in reversed(self):  # Reverse, so we get tokens in order
                for left in token.lefts:
                    if left.i < self.start:
                        yield left
@@ -493,7 +489,7 @@ cdef class Span:
            yield from word.subtree
 
    property ent_id:
-        """An (integer) entity ID. Usually assigned by patterns in the `Matcher`.
+        """An (integer) entity ID.
 
        RETURNS (uint64): The entity ID.
        """
@@ -503,8 +499,8 @@ cdef class Span:
        def __set__(self, hash_t key):
            # TODO
            raise NotImplementedError(
-                "Can't yet set ent_id from Span. Vote for this feature on the issue "
-                "tracker: http://github.com/explosion/spaCy/issues")
+                "Can't yet set ent_id from Span. Vote for this feature on "
+                "the issue tracker: http://github.com/explosion/spaCy/issues")
 
    property ent_id_:
        """A (string) entity ID. Usually assigned by patterns in the `Matcher`.
@@ -517,13 +513,16 @@ cdef class Span:
        def __set__(self, hash_t key):
            # TODO
            raise NotImplementedError(
-                "Can't yet set ent_id_ from Span. Vote for this feature on the issue "
-                "tracker: http://github.com/explosion/spaCy/issues")
+                "Can't yet set ent_id_ from Span. Vote for this feature on the "
+                "issue tracker: http://github.com/explosion/spaCy/issues")
 
    property orth_:
-        # TODO: docstring
+        """Verbatim text content (identical to Span.text). Exists mostly for
+        consistency with other attributes.
+
+        RETURNS (unicode): The span's text."""
        def __get__(self):
-            return ''.join([t.string for t in self]).strip()
+            return ''.join([t.orth_ for t in self]).strip()
 
    property lemma_:
        """The span's lemma.
@@ -534,19 +533,19 @@ cdef class Span:
            return ' '.join([t.lemma_ for t in self]).strip()
 
    property upper_:
-        # TODO: docstring
+        """Deprecated. Use Span.text.upper() instead."""
        def __get__(self):
-            return ''.join([t.string.upper() for t in self]).strip()
+            return ''.join([t.text_with_ws.upper() for t in self]).strip()
 
    property lower_:
-        # TODO: docstring
+        """Deprecated. Use Span.text.lower() instead."""
        def __get__(self):
-            return ''.join([t.string.lower() for t in self]).strip()
+            return ''.join([t.text_with_ws.lower() for t in self]).strip()
 
    property string:
-        # TODO: docstring
+        """Deprecated: Use Span.text instead."""
        def __get__(self):
-            return ''.join([t.string for t in self])
+            return ''.join([t.text_with_ws for t in self])
 
    property label_:
        """The span's label.
@@ -570,7 +569,8 @@ cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
        n += 1
        if n >= sent_length:
            raise RuntimeError(
-                "Array bounds exceeded while searching for root word. This likely "
-                "means the parse tree is in an invalid state. Please report this "
-                "issue here: http://github.com/explosion/spaCy/issues")
+                "Array bounds exceeded while searching for root word. This "
+                "likely means the parse tree is in an invalid state. Please "
+                "report this issue here: "
+                "http://github.com/explosion/spaCy/issues")
    return n
@@ -14,17 +14,18 @@ from ..typedefs cimport hash_t
 from ..lexeme cimport Lexeme
 from .. import parts_of_speech
 from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
-from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_OOV
-from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
-from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
-from ..attrs cimport LEMMA, POS, TAG, DEP
+from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
+from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL
+from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX
+from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP
 from ..compat import is_config
 from .. import about
 from .underscore import Underscore
 
 
 cdef class Token:
-    """An individual token – i.e. a word, punctuation symbol, whitespace, etc."""
+    """An individual token – i.e. a word, punctuation symbol, whitespace,
+    etc."""
     @classmethod
     def set_extension(cls, name, default=None, method=None,
                       getter=None, setter=None):
@@ -171,10 +172,11 @@ cdef class Token:
        return self.orth_
 
    property text_with_ws:
-        """The text content of the token with a trailing whitespace character if
-        it has one.
+        """The text content of the token with a trailing whitespace character
+        if it has one.
 
-        RETURNS (unicode): The text content of the span (with trailing whitespace).
+        RETURNS (unicode): The text content of the token (with trailing
+            whitespace).
        """
        def __get__(self):
            cdef unicode orth = self.vocab.strings[self.c.lex.orth]
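A useful invariant this property supports: joining every token's `text_with_ws` reconstructs the document text exactly (holds for any tokenized `doc`):

    >>> u''.join(t.text_with_ws for t in doc) == doc.text
    True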
@@ -306,9 +308,8 @@ cdef class Token:
        def __set__(self, value):
            if self.doc.is_parsed:
                raise ValueError(
-                    'Refusing to write to token.sent_start if its document is parsed, '
-                    'because this may cause inconsistent state. '
-                    'See https://github.com/spacy-io/spaCy/issues/235 for workarounds.')
+                    "Refusing to write to token.sent_start if its document "
+                    "is parsed, because this may cause inconsistent state.")
            if value is None:
                self.c.sent_start = 0
            elif value is True:
@@ -316,13 +317,12 @@ cdef class Token:
            elif value is False:
                self.c.sent_start = -1
            else:
-                raise ValueError("Invalid value for token.sent_start -- must be one of "
-                                 "None, True, False")
+                raise ValueError("Invalid value for token.sent_start. Must be "
+                                 "one of: None, True, False")
 
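Because the setter above refuses to run on a parsed document, boundaries must be set before parsing; a sketch with hypothetical indices, assuming an unparsed `doc`:

    >>> doc[5].sent_start = True    # token 5 starts a new sentence
    >>> doc[6].sent_start = False   # token 6 may not start one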
    property lefts:
        def __get__(self):
-            """
-            The leftward immediate children of the word, in the syntactic
+            """The leftward immediate children of the word, in the syntactic
            dependency parse.
            """
            cdef int nr_iter = 0
@@ -334,13 +334,12 @@ cdef class Token:
                nr_iter += 1
                # This is ugly, but it's a way to guard out infinite loops
                if nr_iter >= 10000000:
-                    raise RuntimeError(
-                        "Possibly infinite loop encountered while looking for token.lefts")
+                    raise RuntimeError("Possibly infinite loop encountered "
+                                       "while looking for token.lefts")
 
    property rights:
        def __get__(self):
-            """
-            The rightward immediate children of the word, in the syntactic
+            """The rightward immediate children of the word, in the syntactic
            dependency parse.
            """
            cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i)
@@ -352,27 +351,26 @@ cdef class Token:
                ptr -= 1
                nr_iter += 1
                if nr_iter >= 10000000:
-                    raise RuntimeError(
-                        "Possibly infinite loop encountered while looking for token.rights")
+                    raise RuntimeError("Possibly infinite loop encountered "
+                                       "while looking for token.rights")
            tokens.reverse()
            for t in tokens:
                yield t
 
    property children:
-        """
-        A sequence of the token's immediate syntactic children.
+        """A sequence of the token's immediate syntactic children.
 
-        Yields: Token A child token such that child.head==self
+        YIELDS (Token): A child token such that child.head==self
        """
        def __get__(self):
            yield from self.lefts
            yield from self.rights
 
    property subtree:
-        """
-        A sequence of all the token's syntactic descendents.
+        """A sequence of all the token's syntactic descendents.
 
-        Yields: Token A descendent token such that self.is_ancestor(descendent)
+        YIELDS (Token): A descendent token such that
+            `self.is_ancestor(descendent)`.
        """
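For example (assuming a parsed English sentence where 'like' heads the tree):

    >>> doc = nlp(u'I like New York')
    >>> [t.text for t in doc[3].children]   # children of 'York'
    [u'New']
    >>> [t.text for t in doc[1].subtree]    # subtree of the root
    [u'I', u'like', u'New', u'York']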
        def __get__(self):
            for word in self.lefts:
@@ -456,13 +454,15 @@ cdef class Token:
            if self.c.head > 0: # left dependent
                old_head.c.l_kids -= 1
                if self.c.l_edge == old_head.c.l_edge:
-                    # the token dominates the left edge so the left edge of the head
-                    # may change when the token is reattached
-                    # it may not change if the new head is a descendant of the current head
+                    # the token dominates the left edge so the left edge of
+                    # the head may change when the token is reattached, it may
+                    # not change if the new head is a descendant of the current
+                    # head
 
                    new_edge = self.c.l_edge
-                    # the new l_edge is the left-most l_edge on any of the other dependents
-                    # where the l_edge is left of the head, otherwise it is the head
+                    # the new l_edge is the left-most l_edge on any of the
+                    # other dependents where the l_edge is left of the head,
+                    # otherwise it is the head
                    if not is_desc:
                        new_edge = old_head.i
                        for child in old_head.children:
@@ -472,14 +472,15 @@ cdef class Token:
                                new_edge = child.c.l_edge
                    old_head.c.l_edge = new_edge
 
-                    # walk up the tree from old_head and assign new l_edge to ancestors
-                    # until an ancestor already has an l_edge that's further left
+                    # walk up the tree from old_head and assign new l_edge to
+                    # ancestors until an ancestor already has an l_edge that's
+                    # further left
                    for anc in old_head.ancestors:
                        if anc.c.l_edge <= new_edge:
                            break
                        anc.c.l_edge = new_edge
 
-            elif self.c.head < 0: # right dependent
+            elif self.c.head < 0:  # right dependent
                old_head.c.r_kids -= 1
                # do the same thing as for l_edge
                if self.c.r_edge == old_head.c.r_edge:
@@ -500,7 +501,7 @@ cdef class Token:
                        anc.c.r_edge = new_edge
 
            # update number of deps of new head
-            if rel_newhead_i > 0: # left dependent
+            if rel_newhead_i > 0:  # left dependent
                new_head.c.l_kids += 1
                # walk up the tree from new head and set l_edge to self.l_edge
                # until you hit a token with an l_edge further to the left
@@ -511,7 +512,7 @@ cdef class Token:
                        break
                    anc.c.l_edge = self.c.l_edge
 
-            elif rel_newhead_i < 0: # right dependent
+            elif rel_newhead_i < 0:  # right dependent
                new_head.c.r_kids += 1
                # do the same as for l_edge
                if self.c.r_edge > new_head.c.r_edge:
@@ -572,8 +573,8 @@ cdef class Token:
 
    property ent_iob_:
        """IOB code of named entity tag. "B" means the token begins an entity,
-        "I" means it is inside an entity, "O" means it is outside an entity, and
-        "" means no entity tag is set.
+        "I" means it is inside an entity, "O" means it is outside an entity,
+        and "" means no entity tag is set.
 
        RETURNS (unicode): IOB code of named entity tag.
        """
@@ -582,8 +583,7 @@ cdef class Token:
            return iob_strings[self.c.ent_iob]
 
    property ent_id:
-        """ID of the entity the token is an instance of, if any. Usually
-        assigned by patterns in the Matcher.
+        """ID of the entity the token is an instance of, if any.
 
        RETURNS (uint64): ID of the entity.
        """
@@ -594,8 +594,7 @@ cdef class Token:
            self.c.ent_id = key
 
    property ent_id_:
-        """ID of the entity the token is an instance of, if any. Usually
-        assigned by patterns in the Matcher.
+        """ID of the entity the token is an instance of, if any.
 
        RETURNS (unicode): ID of the entity.
        """
@@ -606,34 +605,70 @@ cdef class Token:
            self.c.ent_id = self.vocab.strings.add(name)
 
    property whitespace_:
+        """Trailing space character if present.
+
+        RETURNS (unicode): The whitespace character.
+        """
        def __get__(self):
            return ' ' if self.c.spacy else ''
 
    property orth_:
+        """Verbatim text content (identical to `Token.text`). Exists mostly
+        for consistency with the other attributes.
+
+        RETURNS (unicode): The token text.
+        """
        def __get__(self):
            return self.vocab.strings[self.c.lex.orth]
 
    property lower_:
+        """Lowercase form of the token text. Equivalent to
+        `Token.text.lower()`.
+
+        RETURNS (unicode): The lowercase token text.
+        """
        def __get__(self):
            return self.vocab.strings[self.c.lex.lower]
 
    property norm_:
+        """The token's norm, i.e. a normalised form of the token text.
+        Usually set in the language's tokenizer exceptions or norm exceptions.
+
+        RETURNS (unicode): The norm.
+        """
        def __get__(self):
            return self.vocab.strings[self.c.lex.norm]
 
    property shape_:
+        """Transform of the token's string, to show orthographic features.
+        For example, "Xxxx" or "dd".
+
+        RETURNS (unicode): The token shape.
+        """
        def __get__(self):
            return self.vocab.strings[self.c.lex.shape]
 
    property prefix_:
+        """A length-N substring from the start of the token. Defaults to `N=1`.
+
+        RETURNS (unicode): The token's prefix.
+        """
        def __get__(self):
            return self.vocab.strings[self.c.lex.prefix]
 
    property suffix_:
+        """A length-N substring from the end of the token. Defaults to `N=3`.
+
+        RETURNS (unicode): The token's suffix.
+        """
        def __get__(self):
            return self.vocab.strings[self.c.lex.suffix]
 
    property lang_:
+        """Language of the parent document's vocabulary, e.g. 'en'.
+
+        RETURNS (unicode): The language code.
+        """
        def __get__(self):
            return self.vocab.strings[self.c.lex.lang]
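A side-by-side sketch of these string views (assuming a loaded `nlp` pipeline; the exact shape string depends on the language data):

    >>> t = nlp(u'Apple is huge')[0]
    >>> t.orth_, t.lower_, t.shape_, t.prefix_, t.suffix_
    (u'Apple', u'apple', u'Xxxxx', u'A', u'ple')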
@@ -648,65 +683,152 @@ cdef class Token:
            self.c.lemma = self.vocab.strings.add(lemma_)
 
    property pos_:
+        """Coarse-grained part-of-speech.
+
+        RETURNS (unicode): The part-of-speech tag.
+        """
        def __get__(self):
            return parts_of_speech.NAMES[self.c.pos]
 
    property tag_:
+        """Fine-grained part-of-speech.
+
+        RETURNS (unicode): The part-of-speech tag.
+        """
        def __get__(self):
            return self.vocab.strings[self.c.tag]
        def __set__(self, tag):
            self.tag = self.vocab.strings.add(tag)
 
    property dep_:
+        """Syntactic dependency relation.
+
+        RETURNS (unicode): The dependency label.
+        """
        def __get__(self):
            return self.vocab.strings[self.c.dep]
        def __set__(self, unicode label):
            self.c.dep = self.vocab.strings.add(label)
 
    property is_oov:
+        """Is the token out-of-vocabulary?
+
+        RETURNS (bool): Whether the token is out-of-vocabulary.
+        """
        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV)
 
    property is_stop:
+        """Is the token part of a "stop list"? (defined by the language data)
+
+        RETURNS (bool): Whether the token is a stop word.
+        """
        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_STOP)
 
    property is_alpha:
+        """Does the token consist of alphabetic characters? Equivalent to
+        `token.text.isalpha()`.
+
+        RETURNS (bool): Whether the token consists of alpha characters.
+        """
        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ALPHA)
 
    property is_ascii:
+        """Does the token consist of ASCII characters? Equivalent to
+        `all(ord(c) < 128 for c in token.text)`.
+
+        RETURNS (bool): Whether the token consists of ASCII characters.
+        """
        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ASCII)
 
    property is_digit:
+        """Does the token consist of digits? Equivalent to
+        `token.text.isdigit()`.
+
+        RETURNS (bool): Whether the token consists of digits.
+        """
        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_DIGIT)
 
    property is_lower:
+        """Is the token in lowercase? Equivalent to `token.text.islower()`.
+
+        RETURNS (bool): Whether the token is in lowercase.
+        """
        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LOWER)
 
    property is_upper:
+        """Is the token in uppercase? Equivalent to `token.text.isupper()`.
+
+        RETURNS (bool): Whether the token is in uppercase.
+        """
        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_UPPER)
 
    property is_title:
+        """Is the token in titlecase? Equivalent to `token.text.istitle()`.
+
+        RETURNS (bool): Whether the token is in titlecase.
+        """
        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_TITLE)
 
    property is_punct:
+        """Is the token punctuation?
+
+        RETURNS (bool): Whether the token is punctuation.
+        """
        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_PUNCT)
 
    property is_space:
+        """Does the token consist of whitespace characters? Equivalent to
+        `token.text.isspace()`.
+
+        RETURNS (bool): Whether the token consists of whitespace characters.
+        """
        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE)
 
    property is_bracket:
+        """Is the token a bracket?
+
+        RETURNS (bool): Whether the token is a bracket.
+        """
        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)
 
    property is_quote:
+        """Is the token a quotation mark?
+
+        RETURNS (bool): Whether the token is a quotation mark.
+        """
        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)
 
    property is_left_punct:
+        """Is the token a left punctuation mark, e.g. "("?
+
+        RETURNS (bool): Whether the token is a left punctuation mark.
+        """
        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)
 
    property is_right_punct:
+        """Is the token a right punctuation mark, e.g. ")"?
+
+        RETURNS (bool): Whether the token is a right punctuation mark.
+        """
        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)
 
    property like_url:
+        """Does the token resemble a URL?
+
+        RETURNS (bool): Whether the token resembles a URL.
+        """
        def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_URL)
 
    property like_num:
+        """Does the token represent a number? e.g. "10.9", "10", "ten", etc.
+
+        RETURNS (bool): Whether the token resembles a number.
+        """
        def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_NUM)
 
    property like_email:
+        """Does the token resemble an email address?
+
+        RETURNS (bool): Whether the token resembles an email address.
+        """
        def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL)
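The flags read straight off the underlying lexeme, so they're cheap to check in bulk; for instance (assuming an English pipeline's defaults):

    >>> doc = nlp(u'Give it back! He pleaded.')
    >>> [t.text for t in doc if t.is_punct]
    [u'!', u'.']
    >>> nlp(u'ten')[0].like_num
    True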
@@ -248,6 +248,28 @@ p
        +cell float
        +cell A scalar similarity score. Higher is more similar.
 
++h(2, "get_lca_matrix") Span.get_lca_matrix
+    +tag method
+
+p
+    | Calculates the lowest common ancestor matrix for a given #[code Span].
+    | Returns LCA matrix containing the integer index of the ancestor, or
+    | #[code -1] if no common ancestor is found, e.g. if span excludes a
+    | necessary ancestor.
+
++aside-code("Example").
+    doc = nlp(u'I like New York in Autumn')
+    span = doc[1:4]
+    matrix = span.get_lca_matrix()
+    # array([[0, 0, 0], [0, 1, 2], [0, 2, 2]], dtype=int32)
+
++table(["Name", "Type", "Description"])
+    +row("foot")
+        +cell returns
+        +cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
+        +cell The lowest common ancestor matrix of the #[code Span].
+
+
 +h(2, "to_array") Span.to_array
     +tag method
     +tag-new(2)
@@ -495,6 +517,18 @@ p
            | The text content of the span with a trailing whitespace character
            | if the last token has one.
 
+    +row
+        +cell #[code orth]
+        +cell int
+        +cell ID of the verbatim text content.
+
+    +row
+        +cell #[code orth_]
+        +cell unicode
+        +cell
+            | Verbatim text content (identical to #[code Span.text]). Exists
+            | mostly for consistency with the other attributes.
+
    +row
        +cell #[code label]
        +cell int
|
@ -489,15 +489,35 @@ p The L2 norm of the token's vector representation.
|
|||
+cell unicode
|
||||
+cell Base form of the token, with no inflectional suffixes.
|
||||
|
||||
+row
|
||||
+cell #[code norm]
|
||||
+cell int
|
||||
+cell
|
||||
| The token's norm, i.e. a normalised form of the token text.
|
||||
| Usually set in the language's
|
||||
| #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions] or
|
||||
| #[+a("/usage/adding-languages#norm-exceptions") norm exceptions].
|
||||
|
||||
+row
|
||||
+cell #[code norm_]
|
||||
+cell unicode
|
||||
+cell
|
||||
| The token's norm, i.e. a normalised form of the token text.
|
||||
| Usually set in the language's
|
||||
| #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions] or
|
||||
| #[+a("/usage/adding-languages#norm-exceptions") norm exceptions].
|
||||
|
||||
+row
|
||||
+cell #[code lower]
|
||||
+cell int
|
||||
+cell Lower-case form of the token.
|
||||
+cell Lowercase form of the token.
|
||||
|
||||
+row
|
||||
+cell #[code lower_]
|
||||
+cell unicode
|
||||
+cell Lower-case form of the token.
|
||||
+cell
|
||||
| Lowercase form of the token text. Equivalent to
|
||||
| #[code Token.text.lower()].
|
||||
|
||||
+row
|
||||
+cell #[code shape]
|
||||
|
@ -537,7 +557,9 @@ p The L2 norm of the token's vector representation.
|
|||
+row
|
||||
+cell #[code suffix_]
|
||||
+cell unicode
|
||||
+cell Length-N substring from the end of the token. Defaults to #[code N=3].
|
||||
+cell
|
||||
| Length-N substring from the end of the token. Defaults to
|
||||
| #[code N=3].
|
||||
|
||||
+row
|
||||
+cell #[code is_alpha]
|
||||
|
@ -672,6 +694,7 @@ p The L2 norm of the token's vector representation.
|
|||
+cell #[code lang]
|
||||
+cell int
|
||||
+cell Language of the parent document's vocabulary.
|
||||
|
||||
+row
|
||||
+cell #[code lang_]
|
||||
+cell unicode
|
||||
|
|