Tidy up and document Doc, Token and Span

ines 2017-10-27 15:41:45 +02:00
parent 1a559d4c95
commit 6a0483b7aa
6 changed files with 356 additions and 173 deletions

View File

@ -2,4 +2,4 @@ from .doc import Doc
from .token import Token
from .span import Span
__all__ = [Doc, Token, Span]
__all__ = ['Doc', 'Token', 'Span']

View File

@ -23,9 +23,9 @@ from ..lexeme cimport Lexeme, EMPTY_LEXEME
from ..typedefs cimport attr_t, flags_t
from ..attrs import intify_attrs, IDS
from ..attrs cimport attr_id_t
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
from ..attrs cimport SENT_START
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
from ..attrs cimport ENT_TYPE, SENT_START
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
from ..util import normalize_slice
from ..compat import is_config, copy_reg, pickle
@ -78,24 +78,25 @@ def _get_chunker(lang):
cdef class Doc:
"""A sequence of Token objects. Access sentences and named entities, export
annotations to numpy arrays, losslessly serialize to compressed binary strings.
The `Doc` object holds an array of `TokenC` structs. The Python-level
`Token` and `Span` objects are views of this array, i.e. they don't own
the data themselves.
annotations to numpy arrays, losslessly serialize to compressed binary
strings. The `Doc` object holds an array of `TokenC` structs. The
Python-level `Token` and `Span` objects are views of this array, i.e.
they don't own the data themselves.
EXAMPLE: Construction 1
>>> doc = nlp(u'Some text')
Construction 2
>>> from spacy.tokens import Doc
>>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False])
>>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
spaces=[True, False, False])
"""
@classmethod
def set_extension(cls, name, default=None, method=None,
getter=None, setter=None):
nr_defined = sum(t is not None for t in (default, getter, setter, method))
assert nr_defined == 1
Underscore.doc_extensions[name] = (default, method, getter, setter)
Underscore.doc_extensions[name] = (default, method, getter, setter)
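For orientation, a minimal usage sketch of the extension mechanism (the attribute name is made up, and a loaded pipeline is assumed as `nlp`); note the assert above enforces that exactly one of `default`, `method`, `getter` and `setter` is supplied:

    from spacy.tokens import Doc

    # Register a custom attribute with a default value; it then becomes
    # readable and writable on every Doc via the `._` proxy (Underscore).
    Doc.set_extension('is_greeting', default=False)

    doc = nlp(u'Hello world')
    doc._.is_greeting = True
    assert doc._.is_greeting is True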
@classmethod
def get_extension(cls, name):
@ -109,15 +110,14 @@ cdef class Doc:
orths_and_spaces=None):
"""Create a Doc object.
vocab (Vocab): A vocabulary object, which must match any models you want
to use (e.g. tokenizer, parser, entity recognizer).
vocab (Vocab): A vocabulary object, which must match any models you
want to use (e.g. tokenizer, parser, entity recognizer).
words (list or None): A list of unicode strings to add to the document
as words. If `None`, defaults to empty list.
spaces (list or None): A list of boolean values, of the same length as
words. True means that the word is followed by a space, False means
it is not. If `None`, defaults to `[True]*len(words)`
user_data (dict or None): Optional extra data to attach to the Doc.
RETURNS (Doc): The newly constructed object.
"""
self.vocab = vocab
@ -153,10 +153,10 @@ cdef class Doc:
spaces = [True] * len(words)
elif len(spaces) != len(words):
raise ValueError(
"Arguments 'words' and 'spaces' should be sequences of the "
"same length, or 'spaces' should be left default at None. "
"spaces should be a sequence of booleans, with True meaning "
"that the word owns a ' ' character following it.")
"Arguments 'words' and 'spaces' should be sequences of "
"the same length, or 'spaces' should be left default at "
"None. spaces should be a sequence of booleans, with True "
"meaning that the word owns a ' ' character following it.")
orths_and_spaces = zip(words, spaces)
if orths_and_spaces is not None:
for orth_space in orths_and_spaces:
@ -166,7 +166,8 @@ cdef class Doc:
elif isinstance(orth_space, bytes):
raise ValueError(
"orths_and_spaces expects either List(unicode) or "
"List((unicode, bool)). Got bytes instance: %s" % (str(orth_space)))
"List((unicode, bool)). "
"Got bytes instance: %s" % (str(orth_space)))
else:
orth, has_space = orth_space
# Note that we pass self.mem here --- we have ownership, if LexemeC
@ -186,7 +187,8 @@ cdef class Doc:
def __getitem__(self, object i):
"""Get a `Token` or `Span` object.
i (int or tuple) The index of the token, or the slice of the document to get.
i (int or tuple): The index of the token, or the slice of the document
to get.
RETURNS (Token or Span): The token at `doc[i]`, or the span at
`doc[start : end]`.
@ -199,11 +201,11 @@ cdef class Doc:
>>> doc[start : end]
Get a `Span` object, starting at position `start` and ending at
position `end`, where `start` and `end` are token indices. For
instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and 4.
Stepped slices (e.g. `doc[start : end : step]`) are not supported,
as `Span` objects must be contiguous (cannot have gaps). You can use
negative indices and open-ended ranges, which have their normal
Python semantics.
instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and
4. Stepped slices (e.g. `doc[start : end : step]`) are not
supported, as `Span` objects must be contiguous (cannot have gaps).
You can use negative indices and open-ended ranges, which have
their normal Python semantics.
"""
if isinstance(i, slice):
start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
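A short sketch of the indexing semantics documented above (assuming a loaded pipeline `nlp`):

    doc = nlp(u'Give it back! He pleaded.')
    assert doc[0].text == u'Give'   # Token at position 0
    assert doc[-1].text == u'.'     # negative indices, usual Python semantics
    span = doc[1:3]                 # Span over tokens 1 and 2
    span = doc[3:]                  # open-ended ranges are fine
    # doc[1:5:2] raises a ValueError: stepped slices are not supported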
@ -262,8 +264,10 @@ cdef class Doc:
doc (Doc): The parent document.
start (int): The index of the first character of the span.
end (int): The index of the first character after the span.
label (uint64 or string): A label to attach to the Span, e.g. for named entities.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
label (uint64 or string): A label to attach to the Span, e.g. for
named entities.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
the span.
RETURNS (Span): The newly constructed object.
"""
if not isinstance(label, int):
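The hunk above appears to belong to `Doc.char_span`. A sketch under that assumption, with a pipeline loaded as `nlp`:

    doc = nlp(u'I like New York')
    # 7 and 15 are the character offsets of 'New York' in the text
    span = doc.char_span(7, 15, label=u'GPE')
    assert span.text == u'New York'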
@ -377,13 +381,14 @@ cdef class Doc:
return self.text
property ents:
"""Iterate over the entities in the document. Yields named-entity `Span`
objects, if the entity recognizer has been applied to the document.
"""Iterate over the entities in the document. Yields named-entity
`Span` objects, if the entity recognizer has been applied to the
document.
YIELDS (Span): Entities in the document.
EXAMPLE: Iterate over the span to get individual Token objects, or access
the label:
EXAMPLE: Iterate over the span to get individual Token objects,
or access the label:
>>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
>>> ents = list(tokens.ents)
@ -456,10 +461,11 @@ cdef class Doc:
property noun_chunks:
"""Iterate over the base noun phrases in the document. Yields base
noun-phrase #[code Span] objects, if the document has been syntactically
parsed. A base noun phrase, or "NP chunk", is a noun phrase that does
not permit other NPs to be nested within it so no NP-level
coordination, no prepositional phrases, and no relative clauses.
noun-phrase #[code Span] objects, if the document has been
syntactically parsed. A base noun phrase, or "NP chunk", is a noun
phrase that does not permit other NPs to be nested within it, so no
NP-level coordination, no prepositional phrases, and no relative
clauses.
YIELDS (Span): Noun chunks in the document.
"""
@ -467,12 +473,14 @@ cdef class Doc:
if not self.is_parsed:
raise ValueError(
"noun_chunks requires the dependency parse, which "
"requires data to be installed. For more info, see the "
"requires a statistical model to be installed and loaded. "
"For more info, see the "
"documentation: \n%s\n" % about.__docs_models__)
# Accumulate the result before beginning to iterate over it. This prevents
# the tokenisation from being changed out from under us during the iteration.
# The tricky thing here is that Span accepts its tokenisation changing,
# so it's okay once we have the Span objects. See Issue #375
# Accumulate the result before beginning to iterate over it. This
# prevents the tokenisation from being changed out from under us
# during the iteration. The tricky thing here is that Span accepts
# its tokenisation changing, so it's okay once we have the Span
# objects. See Issue #375.
spans = []
for start, end, label in self.noun_chunks_iterator(self):
spans.append(Span(self, start, end, label=label))
@ -497,8 +505,9 @@ cdef class Doc:
if not self.is_parsed:
raise ValueError(
"sentence boundary detection requires the dependency parse, which "
"requires data to be installed. For more info, see the "
"Sentence boundary detection requires the dependency "
"parse, which requires a statistical model to be "
"installed and loaded. For more info, see the "
"documentation: \n%s\n" % about.__docs_models__)
cdef int i
start = 0
@ -537,12 +546,11 @@ cdef class Doc:
@cython.boundscheck(False)
cpdef np.ndarray to_array(self, object py_attr_ids):
"""Export given token attributes to a numpy `ndarray`.
If `attr_ids` is a sequence of M attributes, the output array will
be of shape `(N, M)`, where N is the length of the `Doc`
(in tokens). If `attr_ids` is a single attribute, the output shape will
be (N,). You can specify attributes by integer ID (e.g. spacy.attrs.LEMMA)
or string name (e.g. 'LEMMA' or 'lemma').
If `attr_ids` is a sequence of M attributes, the output array will be
of shape `(N, M)`, where N is the length of the `Doc` (in tokens). If
`attr_ids` is a single attribute, the output shape will be (N,). You
can specify attributes by integer ID (e.g. spacy.attrs.LEMMA) or
string name (e.g. 'LEMMA' or 'lemma').
attr_ids (list): A list of attributes (int IDs or string names).
RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
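A sketch of the shapes described above (assuming `nlp` is loaded):

    from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA

    doc = nlp(u'Check out https://spacy.io')
    # M attributes give an (N, M) matrix, one row per token
    np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
    assert np_array.shape == (len(doc), 4)
    # per the docstring, a single attribute (or string name) gives (N,)
    assert doc.to_array('LOWER').shape == (len(doc),)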
@ -641,13 +649,12 @@ cdef class Doc:
def from_array(self, attrs, array):
if SENT_START in attrs and HEAD in attrs:
raise ValueError(
"Conflicting attributes specified in doc.from_array():\n"
"Conflicting attributes specified in doc.from_array(): "
"(HEAD, SENT_START)\n"
"The HEAD attribute currently sets sentence boundaries implicitly,\n"
"based on the tree structure. This means the HEAD attribute would "
"potentially override the sentence boundaries set by SENT_START.\n"
"See https://github.com/spacy-io/spaCy/issues/235 for details and "
"workarounds, and to propose solutions.")
"The HEAD attribute currently sets sentence boundaries "
"implicitly, based on the tree structure. This means the HEAD "
"attribute would potentially override the sentence boundaries "
"set by SENT_START.")
cdef int i, col
cdef attr_id_t attr_id
cdef TokenC* tokens = self.c
@ -675,18 +682,14 @@ cdef class Doc:
return self
def get_lca_matrix(self):
'''
Calculates the lowest common ancestor matrix
for a given Spacy doc.
Returns LCA matrix containing the integer index
of the ancestor, or -1 if no common ancestor is
found (ex if span excludes a necessary ancestor).
Apologies about the recursion, but the
impact on performance is negligible given
the natural limitations on the depth of a typical human sentence.
'''
"""Calculates the lowest common ancestor matrix for a given `Doc`.
Returns LCA matrix containing the integer index of the ancestor, or -1
if no common ancestor is found (e.g. if the span excludes a necessary
ancestor). Apologies about the recursion, but the impact on
performance is negligible given the natural limitations on the depth
of a typical human sentence.
"""
# Efficiency notes:
#
# We can easily improve the performance here by iterating in Cython.
# To loop over the tokens in Cython, the easiest way is:
# for token in doc.c[:doc.c.length]:
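A usage sketch, mirroring the `Span.get_lca_matrix` example in the docs below (assumes a parsed doc from a loaded pipeline `nlp`):

    doc = nlp(u'I like New York in Autumn')
    lca = doc.get_lca_matrix()
    # lca[j][k] holds the token index of the lowest common ancestor of
    # tokens j and k, or -1 if there is none; lca[j][j] == j.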
@ -719,7 +722,6 @@ cdef class Doc:
token_k = self[k]
lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix)
lca_matrix[k][j] = lca_matrix[j][k]
return lca_matrix
def to_disk(self, path, **exclude):
@ -819,14 +821,15 @@ cdef class Doc:
return self
def merge(self, int start_idx, int end_idx, *args, **attributes):
"""Retokenize the document, such that the span at `doc.text[start_idx : end_idx]`
is merged into a single token. If `start_idx` and `end_idx `do not mark
start and end token boundaries, the document remains unchanged.
"""Retokenize the document, such that the span at
`doc.text[start_idx : end_idx]` is merged into a single token. If
`start_idx` and `end_idx` do not mark start and end token boundaries,
the document remains unchanged.
start_idx (int): The character index of the start of the slice to merge.
end_idx (int): The character index after the end of the slice to merge.
start_idx (int): Character index of the start of the slice to merge.
end_idx (int): Character index after the end of the slice to merge.
**attributes: Attributes to assign to the merged token. By default,
attributes are inherited from the syntactic root token of the span.
attributes are inherited from the syntactic root of the span.
RETURNS (Token): The newly merged token, or `None` if the start and end
indices did not fall at token boundaries.
"""
@ -847,10 +850,10 @@ cdef class Doc:
attributes[ENT_TYPE] = attributes['ent_type']
elif args:
raise ValueError(
"Doc.merge received %d non-keyword arguments. "
"Expected either 3 arguments (deprecated), or 0 (use keyword arguments). "
"Doc.merge received %d non-keyword arguments. Expected either "
"3 arguments (deprecated), or 0 (use keyword arguments). "
"Arguments supplied:\n%s\n"
"Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes)))
"Keyword arguments: %s\n" % (len(args), repr(args), repr(attributes)))
# More deprecated attribute handling =/
if 'label' in attributes:
@ -882,8 +885,9 @@ cdef class Doc:
Token.set_struct_attr(token, attr_name, attr_value)
# Begin by setting all the head indices to absolute token positions
# This is easier to work with for now than the offsets
# Before thinking of something simpler, beware the case where a dependency
# bridges over the entity. Here the alignment of the tokens changes.
# Before thinking of something simpler, beware the case where a
# dependency bridges over the entity. Here the alignment of the
# tokens changes.
span_root = span.root.i
token.dep = span.root.dep
# We update token.lex after keeping span root and dep, since
@ -932,8 +936,9 @@ cdef class Doc:
>>> trees = doc.print_tree()
>>> trees[1]
{'modifiers': [
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice',
'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP',
'lemma': 'Alice'},
{'modifiers': [
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
@ -1008,7 +1013,7 @@ def pickle_doc(doc):
def unpickle_doc(vocab, hooks_and_data, bytes_data):
user_data, doc_hooks, span_hooks, token_hooks = dill.loads(hooks_and_data)
doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data,
exclude='user_data')
doc.user_hooks.update(doc_hooks)
@ -1018,4 +1023,3 @@ def unpickle_doc(vocab, hooks_and_data, bytes_data):
copy_reg.pickle(Doc, pickle_doc, unpickle_doc)

View File

@ -35,15 +35,16 @@ cdef class Span:
def has_extension(cls, name):
return name in Underscore.span_extensions
def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None,
vector_norm=None):
def __cinit__(self, Doc doc, int start, int end, attr_t label=0,
vector=None, vector_norm=None):
"""Create a `Span` object from the slice `doc[start : end]`.
doc (Doc): The parent document.
start (int): The index of the first token of the span.
end (int): The index of the first token after the span.
label (uint64): A label to attach to the Span, e.g. for named entities.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation
of the span.
RETURNS (Span): The newly constructed object.
"""
if not (0 <= start <= end <= len(doc)):
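A construction sketch (assuming `nlp` is loaded; the label is interned through the StringStore, since the argument is a uint64):

    from spacy.tokens import Span

    doc = nlp(u'I like New York')
    span = Span(doc, 2, 4, label=doc.vocab.strings.add(u'GPE'))
    assert span.text == u'New York'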
@ -162,7 +163,8 @@ cdef class Span:
attributes are inherited from the syntactic root token of the span.
RETURNS (Token): The newly merged token.
"""
return self.doc.merge(self.start_char, self.end_char, *args, **attributes)
return self.doc.merge(self.start_char, self.end_char, *args,
**attributes)
def similarity(self, other):
"""Make a semantic similarity estimate. The default estimate is cosine
@ -179,24 +181,19 @@ cdef class Span:
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
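A sketch, assuming a model with word vectors is loaded as `nlp`:

    doc = nlp(u'green apples and red oranges')
    apples, oranges = doc[0:2], doc[3:5]
    score = apples.similarity(oranges)   # cosine similarity, a float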
def get_lca_matrix(self):
'''
Calculates the lowest common ancestor matrix
for a given Spacy span.
Returns LCA matrix containing the integer index
of the ancestor, or -1 if no common ancestor is
found (ex if span excludes a necessary ancestor).
Apologies about the recursion, but the
impact on performance is negligible given
the natural limitations on the depth of a typical human sentence.
'''
"""Calculates the lowest common ancestor matrix for a given `Span`.
Returns LCA matrix containing the integer index of the ancestor, or -1
if no common ancestor is found (e.g. if the span excludes a necessary
ancestor). Apologies about the recursion, but the impact on
performance is negligible given the natural limitations on the depth
of a typical human sentence.
"""
def __pairwise_lca(token_j, token_k, lca_matrix, margins):
offset = margins[0]
token_k_head = token_k.head if token_k.head.i in range(*margins) else token_k
token_j_head = token_j.head if token_j.head.i in range(*margins) else token_j
token_j_i = token_j.i - offset
token_k_i = token_k.i - offset
if lca_matrix[token_j_i][token_k_i] != -2:
return lca_matrix[token_j_i][token_k_i]
elif token_j == token_k:
@ -209,23 +206,19 @@ cdef class Span:
lca_index = -1
else:
lca_index = __pairwise_lca(token_j_head, token_k_head, lca_matrix, margins)
lca_matrix[token_j_i][token_k_i] = lca_index
lca_matrix[token_k_i][token_j_i] = lca_index
return lca_index
lca_matrix = numpy.empty((len(self), len(self)), dtype=numpy.int32)
lca_matrix.fill(-2)
margins = [self.start, self.end]
for j in range(len(self)):
token_j = self[j]
for k in range(len(self)):
token_k = self[k]
lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix, margins)
lca_matrix[k][j] = lca_matrix[j][k]
return lca_matrix
cpdef np.ndarray to_array(self, object py_attr_ids):
@ -349,7 +342,8 @@ cdef class Span:
"""The text content of the span with a trailing whitespace character if
the last token has one.
RETURNS (unicode): The text content of the span (with trailing whitespace).
RETURNS (unicode): The text content of the span (with trailing
whitespace).
"""
def __get__(self):
return u''.join([t.text_with_ws for t in self])
@ -358,7 +352,8 @@ cdef class Span:
"""Yields base noun-phrase `Span` objects, if the document has been
syntactically parsed. A base noun phrase, or "NP chunk", is a noun
phrase that does not permit other NPs to be nested within it, so no
NP-level coordination, no prepositional phrases, and no relative clauses.
NP-level coordination, no prepositional phrases, and no relative
clauses.
YIELDS (Span): Base noun-phrase `Span` objects
"""
@ -366,7 +361,8 @@ cdef class Span:
if not self.doc.is_parsed:
raise ValueError(
"noun_chunks requires the dependency parse, which "
"requires data to be installed. For more info, see the "
"requires a statistical model to be installed and loaded. "
"For more info, see the "
"documentation: \n%s\n" % about.__docs_models__)
# Accumulate the result before beginning to iterate over it. This prevents
# the tokenisation from being changed out from under us during the iteration.
@ -385,9 +381,9 @@ cdef class Span:
RETURNS (Token): The root token.
EXAMPLE: The root token has the shortest path to the root of the sentence
(or is the root itself). If multiple words are equally high in the
tree, the first word is taken. For example:
EXAMPLE: The root token has the shortest path to the root of the
sentence (or is the root itself). If multiple words are equally
high in the tree, the first word is taken. For example:
>>> toks = nlp(u'I like New York in Autumn.')
@ -437,11 +433,11 @@ cdef class Span:
if self.doc.c[i].head == 0:
return self.doc[i]
# If we don't have a sentence root, we do something that's not so
# algorithmically clever, but I think should be quite fast, especially
# for short spans.
# algorithmically clever, but I think should be quite fast,
# especially for short spans.
# For each word, we count the path length, and arg min this measure.
# We could use better tree logic to save steps here...But I think this
# should be okay.
# We could use better tree logic to save steps here...But I
# think this should be okay.
cdef int current_best = self.doc.length
cdef int root = -1
for i in range(self.start, self.end):
@ -463,7 +459,7 @@ cdef class Span:
YIELDS (Token): A left-child of a token of the span.
"""
def __get__(self):
for token in reversed(self): # Reverse, so we get the tokens in order
for token in reversed(self): # Reverse, so we get tokens in order
for left in token.lefts:
if left.i < self.start:
yield left
@ -493,7 +489,7 @@ cdef class Span:
yield from word.subtree
property ent_id:
"""An (integer) entity ID. Usually assigned by patterns in the `Matcher`.
"""An (integer) entity ID.
RETURNS (uint64): The entity ID.
"""
@ -503,8 +499,8 @@ cdef class Span:
def __set__(self, hash_t key):
# TODO
raise NotImplementedError(
"Can't yet set ent_id from Span. Vote for this feature on the issue "
"tracker: http://github.com/explosion/spaCy/issues")
"Can't yet set ent_id from Span. Vote for this feature on "
"the issue tracker: http://github.com/explosion/spaCy/issues")
property ent_id_:
"""A (string) entity ID. Usually assigned by patterns in the `Matcher`.
@ -517,13 +513,16 @@ cdef class Span:
def __set__(self, hash_t key):
# TODO
raise NotImplementedError(
"Can't yet set ent_id_ from Span. Vote for this feature on the issue "
"tracker: http://github.com/explosion/spaCy/issues")
"Can't yet set ent_id_ from Span. Vote for this feature on the "
"issue tracker: http://github.com/explosion/spaCy/issues")
property orth_:
# TODO: docstring
"""Verbatim text content (identical to Span.text). Exists mostly for
consistency with other attributes.
RETURNS (unicode): The span's text."""
def __get__(self):
return ''.join([t.string for t in self]).strip()
return ''.join([t.orth_ for t in self]).strip()
property lemma_:
"""The span's lemma.
@ -534,19 +533,19 @@ cdef class Span:
return ' '.join([t.lemma_ for t in self]).strip()
property upper_:
# TODO: docstring
"""Deprecated. Use Span.text.upper() instead."""
def __get__(self):
return ''.join([t.string.upper() for t in self]).strip()
return ''.join([t.text_with_ws.upper() for t in self]).strip()
property lower_:
# TODO: docstring
"""Deprecated. Use Span.text.lower() instead."""
def __get__(self):
return ''.join([t.string.lower() for t in self]).strip()
return ''.join([t.text_with_ws.lower() for t in self]).strip()
property string:
# TODO: docstring
"""Deprecated: Use Span.text instead."""
def __get__(self):
return ''.join([t.string for t in self])
return ''.join([t.text_with_ws for t in self])
property label_:
"""The span's label.
@ -570,7 +569,8 @@ cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
n += 1
if n >= sent_length:
raise RuntimeError(
"Array bounds exceeded while searching for root word. This likely "
"means the parse tree is in an invalid state. Please report this "
"issue here: http://github.com/explosion/spaCy/issues")
"Array bounds exceeded while searching for root word. This "
"likely means the parse tree is in an invalid state. Please "
"report this issue here: "
"http://github.com/explosion/spaCy/issues")
return n

View File

@ -14,17 +14,18 @@ from ..typedefs cimport hash_t
from ..lexeme cimport Lexeme
from .. import parts_of_speech
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_OOV
from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from ..attrs cimport LEMMA, POS, TAG, DEP
from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL
from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX
from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP
from ..compat import is_config
from .. import about
from .underscore import Underscore
cdef class Token:
"""An individual token i.e. a word, punctuation symbol, whitespace, etc."""
"""An individual token i.e. a word, punctuation symbol, whitespace,
etc."""
@classmethod
def set_extension(cls, name, default=None, method=None,
getter=None, setter=None):
@ -171,10 +172,11 @@ cdef class Token:
return self.orth_
property text_with_ws:
"""The text content of the token with a trailing whitespace character if
it has one.
"""The text content of the token with a trailing whitespace character
if it has one.
RETURNS (unicode): The text content of the span (with trailing whitespace).
RETURNS (unicode): The text content of the token (with trailing
whitespace).
"""
def __get__(self):
cdef unicode orth = self.vocab.strings[self.c.lex.orth]
@ -306,9 +308,8 @@ cdef class Token:
def __set__(self, value):
if self.doc.is_parsed:
raise ValueError(
'Refusing to write to token.sent_start if its document is parsed, '
'because this may cause inconsistent state. '
'See https://github.com/spacy-io/spaCy/issues/235 for workarounds.')
"Refusing to write to token.sent_start if its document "
"is parsed, because this may cause inconsistent state.")
if value is None:
self.c.sent_start = 0
elif value is True:
@ -316,13 +317,12 @@ cdef class Token:
elif value is False:
self.c.sent_start = -1
else:
raise ValueError("Invalid value for token.sent_start -- must be one of "
"None, True, False")
raise ValueError("Invalid value for token.sent_start. Must be "
"one of: None, True, False")
property lefts:
def __get__(self):
"""
The leftward immediate children of the word, in the syntactic
"""The leftward immediate children of the word, in the syntactic
dependency parse.
"""
cdef int nr_iter = 0
@ -334,13 +334,12 @@ cdef class Token:
nr_iter += 1
# This is ugly, but it's a way to guard out infinite loops
if nr_iter >= 10000000:
raise RuntimeError(
"Possibly infinite loop encountered while looking for token.lefts")
raise RuntimeError("Possibly infinite loop encountered "
"while looking for token.lefts")
property rights:
def __get__(self):
"""
The rightward immediate children of the word, in the syntactic
"""The rightward immediate children of the word, in the syntactic
dependency parse.
"""
cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i)
@ -352,27 +351,26 @@ cdef class Token:
ptr -= 1
nr_iter += 1
if nr_iter >= 10000000:
raise RuntimeError(
"Possibly infinite loop encountered while looking for token.rights")
raise RuntimeError("Possibly infinite loop encountered "
"while looking for token.rights")
tokens.reverse()
for t in tokens:
yield t
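A sketch of `lefts` and `rights` (the exact children depend on the loaded model's parse):

    doc = nlp(u'I like New York in Autumn.')
    lefts = [t.text for t in doc[1].lefts]     # e.g. ['I']
    rights = [t.text for t in doc[1].rights]   # e.g. ['York', 'in', '.']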
property children:
"""
A sequence of the token's immediate syntactic children.
"""A sequence of the token's immediate syntactic children.
Yields: Token A child token such that child.head==self
YIELDS (Token): A child token such that child.head==self
"""
def __get__(self):
yield from self.lefts
yield from self.rights
property subtree:
"""
A sequence of all the token's syntactic descendents.
"""A sequence of all the token's syntactic descendents.
Yields: Token A descendent token such that self.is_ancestor(descendent)
YIELDS (Token): A descendant token such that
`self.is_ancestor(descendant)`.
"""
def __get__(self):
for word in self.lefts:
@ -456,13 +454,15 @@ cdef class Token:
if self.c.head > 0: # left dependent
old_head.c.l_kids -= 1
if self.c.l_edge == old_head.c.l_edge:
# the token dominates the left edge so the left edge of the head
# may change when the token is reattached
# it may not change if the new head is a descendant of the current head
# the token dominates the left edge so the left edge of
# the head may change when the token is reattached, it may
# not change if the new head is a descendant of the current
# head
new_edge = self.c.l_edge
# the new l_edge is the left-most l_edge on any of the other dependents
# where the l_edge is left of the head, otherwise it is the head
# the new l_edge is the left-most l_edge on any of the
# other dependents where the l_edge is left of the head,
# otherwise it is the head
if not is_desc:
new_edge = old_head.i
for child in old_head.children:
@ -472,14 +472,15 @@ cdef class Token:
new_edge = child.c.l_edge
old_head.c.l_edge = new_edge
# walk up the tree from old_head and assign new l_edge to ancestors
# until an ancestor already has an l_edge that's further left
# walk up the tree from old_head and assign new l_edge to
# ancestors until an ancestor already has an l_edge that's
# further left
for anc in old_head.ancestors:
if anc.c.l_edge <= new_edge:
break
anc.c.l_edge = new_edge
elif self.c.head < 0: # right dependent
elif self.c.head < 0: # right dependent
old_head.c.r_kids -= 1
# do the same thing as for l_edge
if self.c.r_edge == old_head.c.r_edge:
@ -500,7 +501,7 @@ cdef class Token:
anc.c.r_edge = new_edge
# update number of deps of new head
if rel_newhead_i > 0: # left dependent
if rel_newhead_i > 0: # left dependent
new_head.c.l_kids += 1
# walk up the tree from new head and set l_edge to self.l_edge
# until you hit a token with an l_edge further to the left
@ -511,7 +512,7 @@ cdef class Token:
break
anc.c.l_edge = self.c.l_edge
elif rel_newhead_i < 0: # right dependent
elif rel_newhead_i < 0: # right dependent
new_head.c.r_kids += 1
# do the same as for l_edge
if self.c.r_edge > new_head.c.r_edge:
@ -572,8 +573,8 @@ cdef class Token:
property ent_iob_:
"""IOB code of named entity tag. "B" means the token begins an entity,
"I" means it is inside an entity, "O" means it is outside an entity, and
"" means no entity tag is set.
"I" means it is inside an entity, "O" means it is outside an entity,
and "" means no entity tag is set.
RETURNS (unicode): IOB code of named entity tag.
"""
@ -582,8 +583,7 @@ cdef class Token:
return iob_strings[self.c.ent_iob]
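A sketch (the exact codes depend on the loaded NER model):

    doc = nlp(u'Mr. Best flew to New York')
    iob = [t.ent_iob_ for t in doc]
    # e.g. ['O', 'B', 'O', 'O', 'B', 'I'] -- 'New' begins an entity,
    # 'York' continues it, and '' would mean no entity tag is set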
property ent_id:
"""ID of the entity the token is an instance of, if any. Usually
assigned by patterns in the Matcher.
"""ID of the entity the token is an instance of, if any.
RETURNS (uint64): ID of the entity.
"""
@ -594,8 +594,7 @@ cdef class Token:
self.c.ent_id = key
property ent_id_:
"""ID of the entity the token is an instance of, if any. Usually
assigned by patterns in the Matcher.
"""ID of the entity the token is an instance of, if any.
RETURNS (unicode): ID of the entity.
"""
@ -606,34 +605,70 @@ cdef class Token:
self.c.ent_id = self.vocab.strings.add(name)
property whitespace_:
"""Trailing space character if present.
RETURNS (unicode): The whitespace character.
"""
def __get__(self):
return ' ' if self.c.spacy else ''
property orth_:
"""Verbatim text content (identical to `Token.text`). Existst mostly
for consistency with the other attributes.
RETURNS (unicode): The token text.
"""
def __get__(self):
return self.vocab.strings[self.c.lex.orth]
property lower_:
"""Lowercase form of the token text. Equivalent to
`Token.text.lower()`.
RETURNS (unicode): The lowercase token text.
"""
def __get__(self):
return self.vocab.strings[self.c.lex.lower]
property norm_:
"""The token's norm, i.e. a normalised form of the token text.
Usually set in the language's tokenizer exceptions or norm exceptions.
RETURNS (unicode): The norm.
"""
def __get__(self):
return self.vocab.strings[self.c.lex.norm]
property shape_:
"""Transform of the tokens's string, to show orthographic features.
For example, "Xxxx" or "dd".
RETURNS (unicode): The token shape.
"""
def __get__(self):
return self.vocab.strings[self.c.lex.shape]
property prefix_:
"""A length-N substring from the start of the token. Defaults to `N=1`.
RETURNS (unicode): The token's prefix.
"""
def __get__(self):
return self.vocab.strings[self.c.lex.prefix]
property suffix_:
"""A length-N substring from the end of the token. Defaults to `N=3`.
RETURNS (unicode): The token's suffix.
"""
def __get__(self):
return self.vocab.strings[self.c.lex.suffix]
property lang_:
"""Language of the parent document's vocabulary, e.g. 'en'.
RETURNS (unicode): The language code.
"""
def __get__(self):
return self.vocab.strings[self.c.lex.lang]
@ -648,65 +683,152 @@ cdef class Token:
self.c.lemma = self.vocab.strings.add(lemma_)
property pos_:
"""Coarse-grained part-of-speech.
RETURNS (unicode): The part-of-speech tag.
"""
def __get__(self):
return parts_of_speech.NAMES[self.c.pos]
property tag_:
"""Fine-grained part-of-speech.
RETURNS (unicode): The part-of-speech tag.
"""
def __get__(self):
return self.vocab.strings[self.c.tag]
def __set__(self, tag):
self.tag = self.vocab.strings.add(tag)
property dep_:
"""Syntactic dependency relation.
RETURNS (unicode): The dependency label.
"""
def __get__(self):
return self.vocab.strings[self.c.dep]
def __set__(self, unicode label):
self.c.dep = self.vocab.strings.add(label)
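A sketch of the string-view attributes (assuming `nlp` is loaded; `tag_` and `dep_` are writable, and setting the string updates the underlying ID):

    doc = nlp(u'Give it back!')
    token = doc[0]
    print(token.pos_, token.tag_, token.dep_)   # e.g. VERB VB ROOT
    token.dep_ = u'ROOT'   # interned via the StringStore, sets token.c.dep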
property is_oov:
"""Is the token out-of-vocabulary?
RETURNS (bool): Whether the token is out-of-vocabulary.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV)
property is_stop:
"""Is the token part of a "stop list"? (defined by the language data)
RETURNS (bool): Whether the token is a stop word.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_STOP)
property is_alpha:
"""Does the token consist of alphabetic characters? Equivalent to
`token.text.isalpha()`.
RETURNS (bool): Whether the token consists of alpha characters.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ALPHA)
property is_ascii:
"""Does the token consist of ASCII characters? Equivalent to
`[any(ord(c) >= 128 for c in token.text)]`.
RETURNS (bool): Whether the token consists of ASCII characters.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ASCII)
property is_digit:
"""Does the token consist of digits? Equivalent to
`token.text.isdigit()`.
RETURNS (bool): Whether the token consists of digits.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_DIGIT)
property is_lower:
"""Is the token in lowercase? Equivalent to `token.text.islower()`.
RETURNS (bool): Whether the token is in lowercase.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LOWER)
property is_upper:
"""Is the token in uppercase? Equivalent to `token.text.isupper()`.
RETURNS (bool): Whether the token is in uppercase.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_UPPER)
property is_title:
"""Is the token in titlecase? Equivalent to `token.text.istitle()`.
RETURNS (bool): Whether the token is in titlecase.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_TITLE)
property is_punct:
"""Is the token punctuation?
RETURNS (bool): Whether the token is punctuation.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_PUNCT)
property is_space:
"""Does the token consist of whitespace characters? Equivalent to
`token.text.isspace()`.
RETURNS (bool): Whether the token consists of whitespace characters.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE)
property is_bracket:
"""Is the token a bracket?
RETURNS (bool): Whether the token is a bracket.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)
property is_quote:
"""Is the token a quotation mark?
RETURNS (bool): Whether the token is a quotation mark.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)
property is_left_punct:
"""Is the token a left punctuation mark, e.g. "("?
RETURNS (bool): Whether the token is a left punctuation mark.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)
property is_right_punct:
"""Is the token a left punctuation mark, e.g. "("?
RETURNS (bool): Whether the token is a left punctuation mark.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)
property like_url:
"""Does the token resemble a URL?
RETURNS (bool): Whether the token resembles a URL.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_URL)
property like_num:
"""Does the token represent a number? e.g. "10.9", "10", "ten", etc.
RETURNS (bool): Whether the token resembles a number.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_NUM)
property like_email:
"""Does the token resemble an email address?
RETURNS (bool): Whether the token resembles an email address.
"""
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL)
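A sketch pulling several of the flags together (assumes the tokenizer keeps the email address as a single token, as the English defaults do):

    doc = nlp(u'Send help to help@example.com ASAP!')
    assert doc[0].is_title      # 'Send'
    assert doc[3].like_email    # 'help@example.com'
    assert doc[4].is_upper      # 'ASAP'
    assert doc[-1].is_punct     # '!'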

View File

@ -248,6 +248,28 @@ p
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "get_lca_matrix") Span.get_lca_matrix
+tag method
p
| Calculates the lowest common ancestor matrix for a given #[code Span].
| Returns LCA matrix containing the integer index of the ancestor, or
| #[code -1] if no common ancestor is found, e.g. if span excludes a
| necessary ancestor.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn')
span = doc[1:4]
matrix = span.get_lca_matrix()
# array([[0, 0, 0], [0, 1, 2], [0, 2, 2]], dtype=int32)
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
+cell The lowest common ancestor matrix of the #[code Span].
+h(2, "to_array") Span.to_array
+tag method
+tag-new(2)
@ -495,6 +517,18 @@ p
| The text content of the span with a trailing whitespace character
| if the last token has one.
+row
+cell #[code orth]
+cell int
+cell ID of the verbatim text content.
+row
+cell #[code orth_]
+cell unicode
+cell
| Verbatim text content (identical to #[code Span.text]). Exists
| mostly for consistency with the other attributes.
+row
+cell #[code label]
+cell int

View File

@ -489,15 +489,35 @@ p The L2 norm of the token's vector representation.
+cell unicode
+cell Base form of the token, with no inflectional suffixes.
+row
+cell #[code norm]
+cell int
+cell
| The token's norm, i.e. a normalised form of the token text.
| Usually set in the language's
| #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions] or
| #[+a("/usage/adding-languages#norm-exceptions") norm exceptions].
+row
+cell #[code norm_]
+cell unicode
+cell
| The token's norm, i.e. a normalised form of the token text.
| Usually set in the language's
| #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions] or
| #[+a("/usage/adding-languages#norm-exceptions") norm exceptions].
+row
+cell #[code lower]
+cell int
+cell Lower-case form of the token.
+cell Lowercase form of the token.
+row
+cell #[code lower_]
+cell unicode
+cell Lower-case form of the token.
+cell
| Lowercase form of the token text. Equivalent to
| #[code Token.text.lower()].
+row
+cell #[code shape]
@ -537,7 +557,9 @@ p The L2 norm of the token's vector representation.
+row
+cell #[code suffix_]
+cell unicode
+cell Length-N substring from the end of the token. Defaults to #[code N=3].
+cell
| Length-N substring from the end of the token. Defaults to
| #[code N=3].
+row
+cell #[code is_alpha]
@ -672,6 +694,7 @@ p The L2 norm of the token's vector representation.
+cell #[code lang]
+cell int
+cell Language of the parent document's vocabulary.
+row
+cell #[code lang_]
+cell unicode