diff --git a/spacy/tokens/__init__.py b/spacy/tokens/__init__.py
index bc3794126..b4815abd2 100644
--- a/spacy/tokens/__init__.py
+++ b/spacy/tokens/__init__.py
@@ -2,4 +2,4 @@ from .doc import Doc
 from .token import Token
 from .span import Span
 
-__all__ = [Doc, Token, Span]
+__all__ = ['Doc', 'Token', 'Span']
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 1bd61b256..7c276e3c2 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -23,9 +23,9 @@ from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
 from ..attrs import intify_attrs, IDS
 from ..attrs cimport attr_id_t
-from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
-from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
-from ..attrs cimport SENT_START
+from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
+from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
+from ..attrs cimport ENT_TYPE, SENT_START
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
 from ..util import normalize_slice
 from ..compat import is_config, copy_reg, pickle
@@ -78,24 +78,25 @@ def _get_chunker(lang):
 
 cdef class Doc:
     """A sequence of Token objects. Access sentences and named entities, export
-    annotations to numpy arrays, losslessly serialize to compressed binary strings.
-    The `Doc` object holds an array of `TokenC` structs. The Python-level
-    `Token` and `Span` objects are views of this array, i.e. they don't own
-    the data themselves.
+    annotations to numpy arrays, losslessly serialize to compressed binary
+    strings. The `Doc` object holds an array of `TokenC` structs. The
+    Python-level `Token` and `Span` objects are views of this array, i.e.
+    they don't own the data themselves.
 
     EXAMPLE: Construction 1
         >>> doc = nlp(u'Some text')
 
         Construction 2
         >>> from spacy.tokens import Doc
-        >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False])
+        >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
+                      spaces=[True, False, False])
     """
     @classmethod
     def set_extension(cls, name, default=None, method=None, getter=None,
                       setter=None):
         nr_defined = sum(t is not None for t in (default, getter, setter, method))
         assert nr_defined == 1
-        Underscore.doc_extensions[name] = (default, method, getter, setter) 
+        Underscore.doc_extensions[name] = (default, method, getter, setter)
 
     @classmethod
     def get_extension(cls, name):
@@ -109,15 +110,14 @@ cdef class Doc:
                  orths_and_spaces=None):
         """Create a Doc object.
 
-        vocab (Vocab): A vocabulary object, which must match any models you want
-            to use (e.g. tokenizer, parser, entity recognizer).
+        vocab (Vocab): A vocabulary object, which must match any models you
+            want to use (e.g. tokenizer, parser, entity recognizer).
         words (list or None): A list of unicode strings to add to the document
             as words. If `None`, defaults to empty list.
         spaces (list or None): A list of boolean values, of the same length as
            words. True means that the word is followed by a space, False means
            it is not. If `None`, defaults to `[True]*len(words)`
         user_data (dict or None): Optional extra data to attach to the Doc.
-
         RETURNS (Doc): The newly constructed object.
         """
         self.vocab = vocab
@@ -153,10 +153,10 @@ cdef class Doc:
             spaces = [True] * len(words)
         elif len(spaces) != len(words):
             raise ValueError(
-                "Arguments 'words' and 'spaces' should be sequences of the "
-                "same length, or 'spaces' should be left default at None. "
-                "spaces should be a sequence of booleans, with True meaning "
-                "that the word owns a ' ' character following it.")
+                "Arguments 'words' and 'spaces' should be sequences of "
+                "the same length, or 'spaces' should be left default at "
+                "None. spaces should be a sequence of booleans, with True "
+                "meaning that the word owns a ' ' character following it.")
             orths_and_spaces = zip(words, spaces)
         if orths_and_spaces is not None:
             for orth_space in orths_and_spaces:
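
[Usage sketch: the `Doc` constructor and `set_extension` API from the hunks
above. Assumes spaCy v2-style usage with a vocab available as `nlp.vocab`;
the extension name is invented for illustration.]

    >>> from spacy.tokens import Doc
    >>> Doc.set_extension('is_greeting', default=False)
    >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
    ...           spaces=[True, False, False])
    >>> doc._.is_greeting = True
    >>> doc.text
    u'hello world!'
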
" - "spaces should be a sequence of booleans, with True meaning " - "that the word owns a ' ' character following it.") + "Arguments 'words' and 'spaces' should be sequences of " + "the same length, or 'spaces' should be left default at " + "None. spaces should be a sequence of booleans, with True " + "meaning that the word owns a ' ' character following it.") orths_and_spaces = zip(words, spaces) if orths_and_spaces is not None: for orth_space in orths_and_spaces: @@ -166,7 +166,8 @@ cdef class Doc: elif isinstance(orth_space, bytes): raise ValueError( "orths_and_spaces expects either List(unicode) or " - "List((unicode, bool)). Got bytes instance: %s" % (str(orth_space))) + "List((unicode, bool)). " + "Got bytes instance: %s" % (str(orth_space))) else: orth, has_space = orth_space # Note that we pass self.mem here --- we have ownership, if LexemeC @@ -186,7 +187,8 @@ cdef class Doc: def __getitem__(self, object i): """Get a `Token` or `Span` object. - i (int or tuple) The index of the token, or the slice of the document to get. + i (int or tuple) The index of the token, or the slice of the document + to get. RETURNS (Token or Span): The token at `doc[i]]`, or the span at `doc[start : end]`. @@ -199,11 +201,11 @@ cdef class Doc: >>> doc[start : end]] Get a `Span` object, starting at position `start` and ending at position `end`, where `start` and `end` are token indices. For - instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and 4. - Stepped slices (e.g. `doc[start : end : step]`) are not supported, - as `Span` objects must be contiguous (cannot have gaps). You can use - negative indices and open-ended ranges, which have their normal - Python semantics. + instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and + 4. Stepped slices (e.g. `doc[start : end : step]`) are not + supported, as `Span` objects must be contiguous (cannot have gaps). + You can use negative indices and open-ended ranges, which have + their normal Python semantics. """ if isinstance(i, slice): start, stop = normalize_slice(len(self), i.start, i.stop, i.step) @@ -262,8 +264,10 @@ cdef class Doc: doc (Doc): The parent document. start (int): The index of the first character of the span. end (int): The index of the first character after the span. - label (uint64 or string): A label to attach to the Span, e.g. for named entities. - vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. + label (uint64 or string): A label to attach to the Span, e.g. for + named entities. + vector (ndarray[ndim=1, dtype='float32']): A meaning representation of + the span. RETURNS (Span): The newly constructed object. """ if not isinstance(label, int): @@ -377,13 +381,14 @@ cdef class Doc: return self.text property ents: - """Iterate over the entities in the document. Yields named-entity `Span` - objects, if the entity recognizer has been applied to the document. + """Iterate over the entities in the document. Yields named-entity + `Span` objects, if the entity recognizer has been applied to the + document. YIELDS (Span): Entities in the document. - EXAMPLE: Iterate over the span to get individual Token objects, or access - the label: + EXAMPLE: Iterate over the span to get individual Token objects, + or access the label: >>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.') >>> ents = list(tokens.ents) @@ -456,10 +461,11 @@ cdef class Doc: property noun_chunks: """Iterate over the base noun phrases in the document. 
@@ -456,10 +461,11 @@ cdef class Doc:
 
     property noun_chunks:
         """Iterate over the base noun phrases in the document. Yields base
-        noun-phrase #[code Span] objects, if the document has been syntactically
-        parsed. A base noun phrase, or "NP chunk", is a noun phrase that does
-        not permit other NPs to be nested within it – so no NP-level
-        coordination, no prepositional phrases, and no relative clauses.
+        noun-phrase `Span` objects, if the document has been
+        syntactically parsed. A base noun phrase, or "NP chunk", is a noun
+        phrase that does not permit other NPs to be nested within it – so no
+        NP-level coordination, no prepositional phrases, and no relative
+        clauses.
 
         YIELDS (Span): Noun chunks in the document.
         """
@@ -467,12 +473,14 @@ cdef class Doc:
         if not self.is_parsed:
             raise ValueError(
                 "noun_chunks requires the dependency parse, which "
-                "requires data to be installed. For more info, see the "
+                "requires a statistical model to be installed and loaded. "
+                "For more info, see the "
                 "documentation: \n%s\n" % about.__docs_models__)
-        # Accumulate the result before beginning to iterate over it. This prevents
-        # the tokenisation from being changed out from under us during the iteration.
-        # The tricky thing here is that Span accepts its tokenisation changing,
-        # so it's okay once we have the Span objects. See Issue #375
+        # Accumulate the result before beginning to iterate over it. This
+        # prevents the tokenisation from being changed out from under us
+        # during the iteration. The tricky thing here is that Span accepts
+        # its tokenisation changing, so it's okay once we have the Span
+        # objects. See Issue #375.
         spans = []
         for start, end, label in self.noun_chunks_iterator(self):
             spans.append(Span(self, start, end, label=label))
@@ -497,8 +505,9 @@ cdef class Doc:
 
         if not self.is_parsed:
             raise ValueError(
-                "sentence boundary detection requires the dependency parse, which "
-                "requires data to be installed. For more info, see the "
+                "Sentence boundary detection requires the dependency "
+                "parse, which requires a statistical model to be "
+                "installed and loaded. For more info, see the "
                 "documentation: \n%s\n" % about.__docs_models__)
         cdef int i
         start = 0
@@ -537,12 +546,11 @@ cdef class Doc:
     @cython.boundscheck(False)
     cpdef np.ndarray to_array(self, object py_attr_ids):
         """Export given token attributes to a numpy `ndarray`.
-
-        If `attr_ids` is a sequence of M attributes, the output array will
-        be of shape `(N, M)`, where N is the length of the `Doc`
-        (in tokens). If `attr_ids` is a single attribute, the output shape will
-        be (N,). You can specify attributes by integer ID (e.g. spacy.attrs.LEMMA)
-        or string name (e.g. 'LEMMA' or 'lemma').
+        If `attr_ids` is a sequence of M attributes, the output array will be
+        of shape `(N, M)`, where N is the length of the `Doc` (in tokens). If
+        `attr_ids` is a single attribute, the output shape will be (N,). You
+        can specify attributes by integer ID (e.g. spacy.attrs.LEMMA) or
+        string name (e.g. 'LEMMA' or 'lemma').
 
         attr_ids (list[]): A list of attributes (int IDs or string names).
         RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
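
[Usage sketch: the `to_array` shapes described in the docstring above.
Assumes `nlp` is a loaded model; `LOWER` and `POS` come from `spacy.attrs`.]

    >>> from spacy.attrs import LOWER, POS
    >>> doc = nlp(u'Check the shape')
    >>> doc.to_array([LOWER, POS]).shape    # sequence of M attributes: (N, M)
    (3, 2)
    >>> doc.to_array(LOWER).shape           # single attribute: (N,)
    (3,)
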
@@ -641,13 +649,12 @@ cdef class Doc:
     def from_array(self, attrs, array):
         if SENT_START in attrs and HEAD in attrs:
             raise ValueError(
-                "Conflicting attributes specified in doc.from_array():\n"
+                "Conflicting attributes specified in doc.from_array(): "
                 "(HEAD, SENT_START)\n"
-                "The HEAD attribute currently sets sentence boundaries implicitly,\n"
-                "based on the tree structure. This means the HEAD attribute would "
-                "potentially override the sentence boundaries set by SENT_START.\n"
-                "See https://github.com/spacy-io/spaCy/issues/235 for details and "
-                "workarounds, and to propose solutions.")
+                "The HEAD attribute currently sets sentence boundaries "
+                "implicitly, based on the tree structure. This means the HEAD "
+                "attribute would potentially override the sentence boundaries "
+                "set by SENT_START.")
         cdef int i, col
         cdef attr_id_t attr_id
         cdef TokenC* tokens = self.c
@@ -675,18 +682,14 @@ cdef class Doc:
         return self
 
     def get_lca_matrix(self):
-        '''
-        Calculates the lowest common ancestor matrix
-        for a given Spacy doc.
-        Returns LCA matrix containing the integer index
-        of the ancestor, or -1 if no common ancestor is
-        found (ex if span excludes a necessary ancestor).
-        Apologies about the recursion, but the
-        impact on performance is negligible given
-        the natural limitations on the depth of a typical human sentence.
-        '''
+        """Calculates the lowest common ancestor matrix for a given `Doc`.
+        Returns LCA matrix containing the integer index of the ancestor, or -1
+        if no common ancestor is found (ex if span excludes a necessary
+        ancestor). Apologies about the recursion, but the impact on
+        performance is negligible given the natural limitations on the depth
+        of a typical human sentence.
+        """
         # Efficiency notes:
-        #
         # We can easily improve the performance here by iterating in Cython.
         # To loop over the tokens in Cython, the easiest way is:
         # for token in doc.c[:doc.c.length]:
@@ -719,7 +722,6 @@ cdef class Doc:
                 token_k = self[k]
                 lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix)
                 lca_matrix[k][j] = lca_matrix[j][k]
-
         return lca_matrix
 
     def to_disk(self, path, **exclude):
@@ -819,14 +821,15 @@ cdef class Doc:
         return self
 
     def merge(self, int start_idx, int end_idx, *args, **attributes):
-        """Retokenize the document, such that the span at `doc.text[start_idx : end_idx]`
-        is merged into a single token. If `start_idx` and `end_idx `do not mark
-        start and end token boundaries, the document remains unchanged.
+        """Retokenize the document, such that the span at
+        `doc.text[start_idx : end_idx]` is merged into a single token. If
+        `start_idx` and `end_idx` do not mark start and end token boundaries,
+        the document remains unchanged.
 
-        start_idx (int): The character index of the start of the slice to merge.
-        end_idx (int): The character index after the end of the slice to merge.
+        start_idx (int): Character index of the start of the slice to merge.
+        end_idx (int): Character index after the end of the slice to merge.
         **attributes: Attributes to assign to the merged token. By default,
-            attributes are inherited from the syntactic root token of the span.
+            attributes are inherited from the syntactic root of the span.
         RETURNS (Token): The newly merged token, or `None` if the start and
             end indices did not fall at token boundaries.
         """
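
[Usage sketch: the keyword-argument form of `Doc.merge` documented above.
Assumes a loaded English model as `nlp`; character offsets 7 and 15 delimit
'New York' here, and the attribute values are illustrative.]

    >>> doc = nlp(u'I like New York')
    >>> doc.merge(7, 15, tag=u'NNP', lemma=u'New York', ent_type=u'GPE')
    New York
    >>> len(doc)    # 'New' and 'York' are now a single token
    3
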
" "Arguments supplied:\n%s\n" - "Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes))) + "Keyword arguments: %s\n" % (len(args), repr(args), repr(attributes))) # More deprecated attribute handling =/ if 'label' in attributes: @@ -882,8 +885,9 @@ cdef class Doc: Token.set_struct_attr(token, attr_name, attr_value) # Begin by setting all the head indices to absolute token positions # This is easier to work with for now than the offsets - # Before thinking of something simpler, beware the case where a dependency - # bridges over the entity. Here the alignment of the tokens changes. + # Before thinking of something simpler, beware the case where a + # dependency bridges over the entity. Here the alignment of the + # tokens changes. span_root = span.root.i token.dep = span.root.dep # We update token.lex after keeping span root and dep, since @@ -932,8 +936,9 @@ cdef class Doc: >>> trees = doc.print_tree() >>> trees[1] {'modifiers': [ - {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', - 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, + {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', + 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', + 'lemma': 'Alice'}, {'modifiers': [ {'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], @@ -1008,7 +1013,7 @@ def pickle_doc(doc): def unpickle_doc(vocab, hooks_and_data, bytes_data): user_data, doc_hooks, span_hooks, token_hooks = dill.loads(hooks_and_data) - + doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data, exclude='user_data') doc.user_hooks.update(doc_hooks) @@ -1018,4 +1023,3 @@ def unpickle_doc(vocab, hooks_and_data, bytes_data): copy_reg.pickle(Doc, pickle_doc, unpickle_doc) - diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 963292fdb..3b2d14b2b 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -35,15 +35,16 @@ cdef class Span: def has_extension(cls, name): return name in Underscore.span_extensions - def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None, - vector_norm=None): + def __cinit__(self, Doc doc, int start, int end, attr_t label=0, + vector=None, vector_norm=None): """Create a `Span` object from the slice `doc[start : end]`. doc (Doc): The parent document. start (int): The index of the first token of the span. end (int): The index of the first token after the span. label (uint64): A label to attach to the Span, e.g. for named entities. - vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. + vector (ndarray[ndim=1, dtype='float32']): A meaning representation + of the span. RETURNS (Span): The newly constructed object. """ if not (0 <= start <= end <= len(doc)): @@ -162,7 +163,8 @@ cdef class Span: attributes are inherited from the syntactic root token of the span. RETURNS (Token): The newly merged token. """ - return self.doc.merge(self.start_char, self.end_char, *args, **attributes) + return self.doc.merge(self.start_char, self.end_char, *args, + **attributes) def similarity(self, other): """Make a semantic similarity estimate. The default estimate is cosine @@ -179,24 +181,19 @@ cdef class Span: return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) def get_lca_matrix(self): - ''' - Calculates the lowest common ancestor matrix - for a given Spacy span. - Returns LCA matrix containing the integer index - of the ancestor, or -1 if no common ancestor is - found (ex if span excludes a necessary ancestor). 
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 963292fdb..3b2d14b2b 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -35,15 +35,16 @@ cdef class Span:
     def has_extension(cls, name):
         return name in Underscore.span_extensions
 
-    def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None,
-                  vector_norm=None):
+    def __cinit__(self, Doc doc, int start, int end, attr_t label=0,
+                  vector=None, vector_norm=None):
         """Create a `Span` object from the slice `doc[start : end]`.
 
         doc (Doc): The parent document.
         start (int): The index of the first token of the span.
         end (int): The index of the first token after the span.
         label (uint64): A label to attach to the Span, e.g. for named entities.
-        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
+        vector (ndarray[ndim=1, dtype='float32']): A meaning representation
+            of the span.
         RETURNS (Span): The newly constructed object.
         """
         if not (0 <= start <= end <= len(doc)):
@@ -162,7 +163,8 @@ cdef class Span:
             attributes are inherited from the syntactic root token of the span.
         RETURNS (Token): The newly merged token.
         """
-        return self.doc.merge(self.start_char, self.end_char, *args, **attributes)
+        return self.doc.merge(self.start_char, self.end_char, *args,
+                              **attributes)
 
     def similarity(self, other):
         """Make a semantic similarity estimate. The default estimate is cosine
@@ -179,24 +181,19 @@ cdef class Span:
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
 
     def get_lca_matrix(self):
-        '''
-        Calculates the lowest common ancestor matrix
-        for a given Spacy span.
-        Returns LCA matrix containing the integer index
-        of the ancestor, or -1 if no common ancestor is
-        found (ex if span excludes a necessary ancestor).
-        Apologies about the recursion, but the
-        impact on performance is negligible given
-        the natural limitations on the depth of a typical human sentence.
-        '''
-
+        """Calculates the lowest common ancestor matrix for a given `Span`.
+        Returns LCA matrix containing the integer index of the ancestor, or -1
+        if no common ancestor is found (ex if span excludes a necessary
+        ancestor). Apologies about the recursion, but the impact on
+        performance is negligible given the natural limitations on the depth
+        of a typical human sentence.
+        """
         def __pairwise_lca(token_j, token_k, lca_matrix, margins):
             offset = margins[0]
             token_k_head = token_k.head if token_k.head.i in range(*margins) else token_k
             token_j_head = token_j.head if token_j.head.i in range(*margins) else token_j
             token_j_i = token_j.i - offset
             token_k_i = token_k.i - offset
-
             if lca_matrix[token_j_i][token_k_i] != -2:
                 return lca_matrix[token_j_i][token_k_i]
             elif token_j == token_k:
@@ -209,23 +206,19 @@ cdef class Span:
                 lca_index = -1
             else:
                 lca_index = __pairwise_lca(token_j_head, token_k_head, lca_matrix, margins)
-
             lca_matrix[token_j_i][token_k_i] = lca_index
             lca_matrix[token_k_i][token_j_i] = lca_index
-
             return lca_index
 
         lca_matrix = numpy.empty((len(self), len(self)), dtype=numpy.int32)
         lca_matrix.fill(-2)
         margins = [self.start, self.end]
-
         for j in range(len(self)):
             token_j = self[j]
             for k in range(len(self)):
                 token_k = self[k]
                 lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix, margins)
                 lca_matrix[k][j] = lca_matrix[j][k]
-
         return lca_matrix
 
     cpdef np.ndarray to_array(self, object py_attr_ids):
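
[Usage sketch: `Span.get_lca_matrix`, mirroring the example added to the
website docs below. Assumes a loaded English model as `nlp`; the exact
matrix depends on the parse.]

    >>> doc = nlp(u'I like New York in Autumn')
    >>> span = doc[1:4]
    >>> span.get_lca_matrix()
    array([[0, 0, 0],
           [0, 1, 2],
           [0, 2, 2]], dtype=int32)
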
@@ -349,7 +342,8 @@ cdef class Span:
         """The text content of the span with a trailing whitespace character
         if the last token has one.
 
-        RETURNS (unicode): The text content of the span (with trailing whitespace).
+        RETURNS (unicode): The text content of the span (with trailing
+            whitespace).
         """
         def __get__(self):
             return u''.join([t.text_with_ws for t in self])
@@ -358,7 +352,8 @@ cdef class Span:
         """Yields base noun-phrase `Span` objects, if the document has been
         syntactically parsed. A base noun phrase, or "NP chunk", is a noun
         phrase that does not permit other NPs to be nested within it – so no
-        NP-level coordination, no prepositional phrases, and no relative clauses.
+        NP-level coordination, no prepositional phrases, and no relative
+        clauses.
 
         YIELDS (Span): Base noun-phrase `Span` objects
         """
@@ -366,7 +361,8 @@ cdef class Span:
         if not self.doc.is_parsed:
             raise ValueError(
                 "noun_chunks requires the dependency parse, which "
-                "requires data to be installed. For more info, see the "
+                "requires a statistical model to be installed and loaded. "
+                "For more info, see the "
                 "documentation: \n%s\n" % about.__docs_models__)
         # Accumulate the result before beginning to iterate over it. This prevents
         # the tokenisation from being changed out from under us during the iteration.
@@ -385,9 +381,9 @@ cdef class Span:
 
         RETURNS (Token): The root token.
 
-        EXAMPLE: The root token has the shortest path to the root of the sentence
-            (or is the root itself). If multiple words are equally high in the
-            tree, the first word is taken. For example:
+        EXAMPLE: The root token has the shortest path to the root of the
+            sentence (or is the root itself). If multiple words are equally
+            high in the tree, the first word is taken. For example:
 
             >>> toks = nlp(u'I like New York in Autumn.')
@@ -437,11 +433,11 @@ cdef class Span:
             if self.doc.c[i].head == 0:
                 return self.doc[i]
         # If we don't have a sentence root, we do something that's not so
-        # algorithmically clever, but I think should be quite fast, especially
-        # for short spans.
+        # algorithmically clever, but I think should be quite fast,
+        # especially for short spans.
         # For each word, we count the path length, and arg min this measure.
-        # We could use better tree logic to save steps here...But I think this
-        # should be okay.
+        # We could use better tree logic to save steps here...But I
+        # think this should be okay.
         cdef int current_best = self.doc.length
         cdef int root = -1
         for i in range(self.start, self.end):
@@ -463,7 +459,7 @@ cdef class Span:
         YIELDS (Token):A left-child of a token of the span.
         """
         def __get__(self):
-            for token in reversed(self):  # Reverse, so we get the tokens in order
+            for token in reversed(self):  # Reverse, so we get tokens in order
                 for left in token.lefts:
                     if left.i < self.start:
                         yield left
@@ -493,7 +489,7 @@ cdef class Span:
             yield from word.subtree
 
     property ent_id:
-        """An (integer) entity ID. Usually assigned by patterns in the `Matcher`.
+        """An (integer) entity ID.
 
         RETURNS (uint64): The entity ID.
         """
@@ -503,8 +499,8 @@ cdef class Span:
         def __set__(self, hash_t key):
             # TODO
             raise NotImplementedError(
-                "Can't yet set ent_id from Span. Vote for this feature on the issue "
-                "tracker: http://github.com/explosion/spaCy/issues")
+                "Can't yet set ent_id from Span. Vote for this feature on "
+                "the issue tracker: http://github.com/explosion/spaCy/issues")
 
     property ent_id_:
         """A (string) entity ID. Usually assigned by patterns in the `Matcher`.
@@ -517,13 +513,16 @@ cdef class Span:
         def __set__(self, hash_t key):
             # TODO
             raise NotImplementedError(
-                "Can't yet set ent_id_ from Span. Vote for this feature on the issue "
-                "tracker: http://github.com/explosion/spaCy/issues")
+                "Can't yet set ent_id_ from Span. Vote for this feature on the "
+                "issue tracker: http://github.com/explosion/spaCy/issues")
 
     property orth_:
-        # TODO: docstring
+        """Verbatim text content (identical to Span.text). Exists mostly for
+        consistency with other attributes.
+
+        RETURNS (unicode): The span's text."""
         def __get__(self):
-            return ''.join([t.string for t in self]).strip()
+            return ''.join([t.orth_ for t in self]).strip()
 
     property lemma_:
         """The span's lemma.
@@ -534,19 +533,19 @@ cdef class Span:
             return ' '.join([t.lemma_ for t in self]).strip()
 
     property upper_:
-        # TODO: docstring
+        """Deprecated. Use Span.text.upper() instead."""
         def __get__(self):
-            return ''.join([t.string.upper() for t in self]).strip()
+            return ''.join([t.text_with_ws.upper() for t in self]).strip()
 
     property lower_:
-        # TODO: docstring
+        """Deprecated. Use Span.text.lower() instead."""
         def __get__(self):
-            return ''.join([t.string.lower() for t in self]).strip()
+            return ''.join([t.text_with_ws.lower() for t in self]).strip()
 
     property string:
-        # TODO: docstring
+        """Deprecated: Use Span.text instead."""
         def __get__(self):
-            return ''.join([t.string for t in self])
+            return ''.join([t.text_with_ws for t in self])
 
     property label_:
         """The span's label.
@@ -570,7 +569,8 @@ cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
         n += 1
         if n >= sent_length:
             raise RuntimeError(
-                "Array bounds exceeded while searching for root word. This likely "
-                "means the parse tree is in an invalid state. Please report this "
-                "issue here: http://github.com/explosion/spaCy/issues")
+                "Array bounds exceeded while searching for root word. This "
+                "likely means the parse tree is in an invalid state. Please "
+                "report this issue here: "
+                "http://github.com/explosion/spaCy/issues")
     return n
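
[Usage sketch: the deprecated `Span.string`, `Span.upper_` and `Span.lower_`
properties above are equivalent to operating on `Span.text`. Assumes a loaded
English model as `nlp`.]

    >>> span = nlp(u'Mr. Best flew to New York.')[4:6]
    >>> span.text, span.text.lower(), span.text.upper()
    (u'New York', u'new york', u'NEW YORK')
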
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 514934ca7..04aa3f582 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -14,17 +14,18 @@ from ..typedefs cimport hash_t
 from ..lexeme cimport Lexeme
 from .. import parts_of_speech
 from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
-from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_OOV
-from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
-from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
-from ..attrs cimport LEMMA, POS, TAG, DEP
+from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
+from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL
+from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX
+from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP
 from ..compat import is_config
 from .. import about
 from .underscore import Underscore
 
 
 cdef class Token:
-    """An individual token – i.e. a word, punctuation symbol, whitespace, etc."""
+    """An individual token – i.e. a word, punctuation symbol, whitespace,
+    etc."""
     @classmethod
     def set_extension(cls, name, default=None, method=None, getter=None,
                       setter=None):
@@ -171,10 +172,11 @@ cdef class Token:
         return self.orth_
 
     property text_with_ws:
-        """The text content of the token with a trailing whitespace character if
-        it has one.
+        """The text content of the token with a trailing whitespace character
+        if it has one.
 
-        RETURNS (unicode): The text content of the span (with trailing whitespace).
+        RETURNS (unicode): The text content of the token (with trailing
+            whitespace).
         """
         def __get__(self):
             cdef unicode orth = self.vocab.strings[self.c.lex.orth]
@@ -306,9 +308,8 @@ cdef class Token:
         def __set__(self, value):
             if self.doc.is_parsed:
                 raise ValueError(
-                    'Refusing to write to token.sent_start if its document is parsed, '
-                    'because this may cause inconsistent state. '
-                    'See https://github.com/spacy-io/spaCy/issues/235 for workarounds.')
+                    "Refusing to write to token.sent_start if its document "
+                    "is parsed, because this may cause inconsistent state.")
             if value is None:
                 self.c.sent_start = 0
             elif value is True:
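
[Usage sketch: the `sent_start` write guard above. Setting the flag is only
allowed before a document is parsed, e.g. on a manually constructed `Doc`;
the words and index are illustrative.]

    >>> from spacy.tokens import Doc
    >>> doc = Doc(nlp.vocab, words=[u'Hello', u'world', u'Goodbye', u'world'])
    >>> doc[2].sent_start = True    # mark a sentence boundary by hand
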
""" cdef int nr_iter = 0 @@ -334,13 +334,12 @@ cdef class Token: nr_iter += 1 # This is ugly, but it's a way to guard out infinite loops if nr_iter >= 10000000: - raise RuntimeError( - "Possibly infinite loop encountered while looking for token.lefts") + raise RuntimeError("Possibly infinite loop encountered " + "while looking for token.lefts") property rights: def __get__(self): - """ - The rightward immediate children of the word, in the syntactic + """The rightward immediate children of the word, in the syntactic dependency parse. """ cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i) @@ -352,27 +351,26 @@ cdef class Token: ptr -= 1 nr_iter += 1 if nr_iter >= 10000000: - raise RuntimeError( - "Possibly infinite loop encountered while looking for token.rights") + raise RuntimeError("Possibly infinite loop encountered " + "while looking for token.rights") tokens.reverse() for t in tokens: yield t property children: - """ - A sequence of the token's immediate syntactic children. + """A sequence of the token's immediate syntactic children. - Yields: Token A child token such that child.head==self + YIELDS (Token): A child token such that child.head==self """ def __get__(self): yield from self.lefts yield from self.rights property subtree: - """ - A sequence of all the token's syntactic descendents. + """A sequence of all the token's syntactic descendents. - Yields: Token A descendent token such that self.is_ancestor(descendent) + YIELDS (Token): A descendent token such that + `self.is_ancestor(descendent)`. """ def __get__(self): for word in self.lefts: @@ -456,13 +454,15 @@ cdef class Token: if self.c.head > 0: # left dependent old_head.c.l_kids -= 1 if self.c.l_edge == old_head.c.l_edge: - # the token dominates the left edge so the left edge of the head - # may change when the token is reattached - # it may not change if the new head is a descendant of the current head + # the token dominates the left edge so the left edge of + # the head may change when the token is reattached, it may + # not change if the new head is a descendant of the current + # head new_edge = self.c.l_edge - # the new l_edge is the left-most l_edge on any of the other dependents - # where the l_edge is left of the head, otherwise it is the head + # the new l_edge is the left-most l_edge on any of the + # other dependents where the l_edge is left of the head, + # otherwise it is the head if not is_desc: new_edge = old_head.i for child in old_head.children: @@ -472,14 +472,15 @@ cdef class Token: new_edge = child.c.l_edge old_head.c.l_edge = new_edge - # walk up the tree from old_head and assign new l_edge to ancestors - # until an ancestor already has an l_edge that's further left + # walk up the tree from old_head and assign new l_edge to + # ancestors until an ancestor already has an l_edge that's + # further left for anc in old_head.ancestors: if anc.c.l_edge <= new_edge: break anc.c.l_edge = new_edge - elif self.c.head < 0: # right dependent + elif self.c.head < 0: # right dependent old_head.c.r_kids -= 1 # do the same thing as for l_edge if self.c.r_edge == old_head.c.r_edge: @@ -500,7 +501,7 @@ cdef class Token: anc.c.r_edge = new_edge # update number of deps of new head - if rel_newhead_i > 0: # left dependent + if rel_newhead_i > 0: # left dependent new_head.c.l_kids += 1 # walk up the tree from new head and set l_edge to self.l_edge # until you hit a token with an l_edge further to the left @@ -511,7 +512,7 @@ cdef class Token: break anc.c.l_edge = self.c.l_edge - elif rel_newhead_i < 0: # right 
@@ -456,13 +454,15 @@ cdef class Token:
             if self.c.head > 0: # left dependent
                 old_head.c.l_kids -= 1
                 if self.c.l_edge == old_head.c.l_edge:
-                    # the token dominates the left edge so the left edge of the head
-                    # may change when the token is reattached
-                    # it may not change if the new head is a descendant of the current head
+                    # the token dominates the left edge so the left edge of
+                    # the head may change when the token is reattached, it may
+                    # not change if the new head is a descendant of the current
+                    # head
                     new_edge = self.c.l_edge
-                    # the new l_edge is the left-most l_edge on any of the other dependents
-                    # where the l_edge is left of the head, otherwise it is the head
+                    # the new l_edge is the left-most l_edge on any of the
+                    # other dependents where the l_edge is left of the head,
+                    # otherwise it is the head
                     if not is_desc:
                         new_edge = old_head.i
                         for child in old_head.children:
@@ -472,14 +472,15 @@ cdef class Token:
                                 new_edge = child.c.l_edge
                         old_head.c.l_edge = new_edge
 
-                    # walk up the tree from old_head and assign new l_edge to ancestors
-                    # until an ancestor already has an l_edge that's further left
+                    # walk up the tree from old_head and assign new l_edge to
+                    # ancestors until an ancestor already has an l_edge that's
+                    # further left
                     for anc in old_head.ancestors:
                         if anc.c.l_edge <= new_edge:
                             break
                         anc.c.l_edge = new_edge
 
-            elif self.c.head < 0: # right dependent
+            elif self.c.head < 0:  # right dependent
                 old_head.c.r_kids -= 1
                 # do the same thing as for l_edge
                 if self.c.r_edge == old_head.c.r_edge:
@@ -500,7 +501,7 @@ cdef class Token:
                         anc.c.r_edge = new_edge
 
         # update number of deps of new head
-        if rel_newhead_i > 0: # left dependent
+        if rel_newhead_i > 0:  # left dependent
             new_head.c.l_kids += 1
             # walk up the tree from new head and set l_edge to self.l_edge
             # until you hit a token with an l_edge further to the left
@@ -511,7 +512,7 @@ cdef class Token:
                     break
                 anc.c.l_edge = self.c.l_edge
 
-        elif rel_newhead_i < 0: # right dependent
+        elif rel_newhead_i < 0:  # right dependent
             new_head.c.r_kids += 1
             # do the same as for l_edge
             if self.c.r_edge > new_head.c.r_edge:
@@ -572,8 +573,8 @@ cdef class Token:
 
     property ent_iob_:
         """IOB code of named entity tag. "B" means the token begins an entity,
-        "I" means it is inside an entity, "O" means it is outside an entity, and
-        "" means no entity tag is set.
+        "I" means it is inside an entity, "O" means it is outside an entity,
+        and "" means no entity tag is set.
 
         RETURNS (unicode): IOB code of named entity tag.
         """
@@ -582,8 +583,7 @@ cdef class Token:
             return iob_strings[self.c.ent_iob]
 
     property ent_id:
-        """ID of the entity the token is an instance of, if any. Usually
-        assigned by patterns in the Matcher.
+        """ID of the entity the token is an instance of, if any.
 
         RETURNS (uint64): ID of the entity.
         """
@@ -594,8 +594,7 @@ cdef class Token:
             self.c.ent_id = key
 
     property ent_id_:
-        """ID of the entity the token is an instance of, if any. Usually
-        assigned by patterns in the Matcher.
+        """ID of the entity the token is an instance of, if any.
 
         RETURNS (unicode): ID of the entity.
         """
@@ -606,34 +605,70 @@ cdef class Token:
             self.c.ent_id = self.vocab.strings.add(name)
 
     property whitespace_:
+        """Trailing space character if present.
+
+        RETURNS (unicode): The whitespace character.
+        """
         def __get__(self):
             return ' ' if self.c.spacy else ''
 
     property orth_:
+        """Verbatim text content (identical to `Token.text`). Exists mostly
+        for consistency with the other attributes.
+
+        RETURNS (unicode): The token text.
+        """
         def __get__(self):
             return self.vocab.strings[self.c.lex.orth]
 
     property lower_:
+        """Lowercase form of the token text. Equivalent to
+        `Token.text.lower()`.
+
+        RETURNS (unicode): The lowercase token text.
+        """
         def __get__(self):
             return self.vocab.strings[self.c.lex.lower]
 
     property norm_:
+        """The token's norm, i.e. a normalised form of the token text.
+        Usually set in the language's tokenizer exceptions or norm exceptions.
+
+        RETURNS (unicode): The norm.
+        """
         def __get__(self):
             return self.vocab.strings[self.c.lex.norm]
 
     property shape_:
+        """Transform of the token's string, to show orthographic features.
+        For example, "Xxxx" or "dd".
+
+        RETURNS (unicode): The token shape.
+        """
         def __get__(self):
             return self.vocab.strings[self.c.lex.shape]
 
     property prefix_:
+        """A length-N substring from the start of the token. Defaults to `N=1`.
+
+        RETURNS (unicode): The token's prefix.
+        """
         def __get__(self):
             return self.vocab.strings[self.c.lex.prefix]
 
     property suffix_:
+        """A length-N substring from the end of the token. Defaults to `N=3`.
+
+        RETURNS (unicode): The token's suffix.
+        """
         def __get__(self):
             return self.vocab.strings[self.c.lex.suffix]
 
     property lang_:
+        """Language of the parent document's vocabulary, e.g. 'en'.
+
+        RETURNS (unicode): The language code.
+        """
         def __get__(self):
             return self.vocab.strings[self.c.lex.lang]
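
[Usage sketch: the lexical string attributes documented above. Assumes a
loaded English model as `nlp`.]

    >>> doc = nlp(u'Give it to Mr. Best')
    >>> doc[4].shape_, doc[4].prefix_, doc[4].suffix_
    (u'Xxxx', u'B', u'est')
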
+ """ def __get__(self): return self.vocab.strings[self.c.dep] def __set__(self, unicode label): self.c.dep = self.vocab.strings.add(label) property is_oov: + """Is the token out-of-vocabulary? + + RETURNS (bool): Whether the token is out-of-vocabulary. + """ def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV) property is_stop: + """Is the token part of a "stop list"? (defined by the language data) + + RETURNS (bool): Whether the token is a stop word. + """ def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_STOP) property is_alpha: + """Does the token consist of alphabetic characters? Equivalent to + `token.text.isalpha()`. + + RETURNS (bool): Whether the token consists of alpha characters. + """ def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ALPHA) property is_ascii: + """Does the token consist of ASCII characters? Equivalent to + `[any(ord(c) >= 128 for c in token.text)]`. + + RETURNS (bool): Whether the token consists of ASCII characters. + """ def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ASCII) property is_digit: + """Does the token consist of digits? Equivalent to + `token.text.isdigit()`. + + RETURNS (bool): Whether the token consists of digits. + """ def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_DIGIT) property is_lower: + """Is the token in lowercase? Equivalent to `token.text.islower()`. + + RETURNS (bool): Whether the token is in lowercase. + """ def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LOWER) + property is_upper: + """Is the token in uppercase? Equivalent to `token.text.isupper()`. + + RETURNS (bool): Whether the token is in uppercase. + """ + def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_UPPER) + property is_title: + """Is the token in titlecase? Equivalent to `token.text.istitle()`. + + RETURNS (bool): Whether the token is in titlecase. + """ def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_TITLE) property is_punct: + """Is the token punctuation? + + RETURNS (bool): Whether the token is punctuation. + """ def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_PUNCT) property is_space: + """Does the token consist of whitespace characters? Equivalent to + `token.text.isspace()`. + + RETURNS (bool): Whether the token consists of whitespace characters. + """ def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE) property is_bracket: + """Is the token a bracket? + + RETURNS (bool): Whether the token is a bracket. + """ def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET) property is_quote: + """Is the token a quotation mark? + + RETURNS (bool): Whether the token is a quotation mark. + """ def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE) property is_left_punct: + """Is the token a left punctuation mark, e.g. "("? + + RETURNS (bool): Whether the token is a left punctuation mark. + """ def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT) property is_right_punct: + """Is the token a left punctuation mark, e.g. "("? + + RETURNS (bool): Whether the token is a left punctuation mark. + """ def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT) property like_url: + """Does the token resemble a URL? + + RETURNS (bool): Whether the token resembles a URL. + """ def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_URL) property like_num: + """Does the token represent a number? e.g. "10.9", "10", "ten", etc. + + RETURNS (bool): Whether the token resembles a number. 
+ """ def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_NUM) property like_email: + """Does the token resemble an email address? + + RETURNS (bool): Whether the token resembles an email address. + """ def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL) diff --git a/website/api/span.jade b/website/api/span.jade index 2a55409f1..f00cb936f 100644 --- a/website/api/span.jade +++ b/website/api/span.jade @@ -248,6 +248,28 @@ p +cell float +cell A scalar similarity score. Higher is more similar. ++h(2, "get_lca_matrix") Span.get_lca_matrix + +tag method + +p + | Calculates the lowest common ancestor matrix for a given #[code Span]. + | Returns LCA matrix containing the integer index of the ancestor, or + | #[code -1] if no common ancestor is found, e.g. if span excludes a + | necessary ancestor. + ++aside-code("Example"). + doc = nlp(u'I like New York in Autumn') + span = doc[1:4] + matrix = span.get_lca_matrix() + # array([[0, 0, 0], [0, 1, 2], [0, 2, 2]], dtype=int32) + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell returns + +cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']] + +cell The lowest common ancestor matrix of the #[code Span]. + + +h(2, "to_array") Span.to_array +tag method +tag-new(2) @@ -495,6 +517,18 @@ p | The text content of the span with a trailing whitespace character | if the last token has one. + +row + +cell #[code orth] + +cell int + +cell ID of the verbatim text content. + + +row + +cell #[code orth_] + +cell unicode + +cell + | Verbatim text content (identical to #[code Span.text]). Existst + | mostly for consistency with the other attributes. + +row +cell #[code label] +cell int diff --git a/website/api/token.jade b/website/api/token.jade index 4062594b4..3ce11d07a 100644 --- a/website/api/token.jade +++ b/website/api/token.jade @@ -489,15 +489,35 @@ p The L2 norm of the token's vector representation. +cell unicode +cell Base form of the token, with no inflectional suffixes. + +row + +cell #[code norm] + +cell int + +cell + | The token's norm, i.e. a normalised form of the token text. + | Usually set in the language's + | #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions] or + | #[+a("/usage/adding-languages#norm-exceptions") norm exceptions]. + + +row + +cell #[code norm_] + +cell unicode + +cell + | The token's norm, i.e. a normalised form of the token text. + | Usually set in the language's + | #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions] or + | #[+a("/usage/adding-languages#norm-exceptions") norm exceptions]. + +row +cell #[code lower] +cell int - +cell Lower-case form of the token. + +cell Lowercase form of the token. +row +cell #[code lower_] +cell unicode - +cell Lower-case form of the token. + +cell + | Lowercase form of the token text. Equivalent to + | #[code Token.text.lower()]. +row +cell #[code shape] @@ -537,7 +557,9 @@ p The L2 norm of the token's vector representation. +row +cell #[code suffix_] +cell unicode - +cell Length-N substring from the end of the token. Defaults to #[code N=3]. + +cell + | Length-N substring from the end of the token. Defaults to + | #[code N=3]. +row +cell #[code is_alpha] @@ -672,6 +694,7 @@ p The L2 norm of the token's vector representation. +cell #[code lang] +cell int +cell Language of the parent document's vocabulary. + +row +cell #[code lang_] +cell unicode