From e9e62b01b0ee1eb94d831dfda35a2f3cd7652791 Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 19 May 2017 18:47:56 +0200 Subject: [PATCH] Update docstrings and API docs for Token --- spacy/tokens/token.pyx | 172 +++++++----- website/docs/api/token.jade | 504 +++++++++++++++++++----------------- 2 files changed, 374 insertions(+), 302 deletions(-) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 6430c9f29..68c19f4b5 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -23,10 +23,14 @@ from .. import about cdef class Token: - """ - An individual token --- i.e. a word, punctuation symbol, whitespace, etc. - """ + """An individual token – i.e. a word, punctuation symbol, whitespace, etc.""" def __cinit__(self, Vocab vocab, Doc doc, int offset): + """Construct a `Token` object. + + vocab (Vocab): A storage container for lexical types. + doc (Doc): The parent document. + offset (int): The index of the token within the document. + """ self.vocab = vocab self.doc = doc self.c = &self.doc.c[offset] @@ -36,8 +40,9 @@ cdef class Token: return hash((self.doc, self.i)) def __len__(self): - """ - Number of unicode characters in token.text. + """The number of unicode characters in the token, i.e. `token.text`. + + RETURNS (int): The number of unicode characters in the token. """ return self.c.lex.length @@ -75,37 +80,35 @@ cdef class Token: raise ValueError(op) cpdef bint check_flag(self, attr_id_t flag_id) except -1: - """ - Check the value of a boolean flag. + """Check the value of a boolean flag. - Arguments: - flag_id (int): The ID of the flag attribute. - Returns: - is_set (bool): Whether the flag is set. + flag_id (int): The ID of the flag attribute. + RETURNS (bool): Whether the flag is set. + + EXAMPLE: + >>> from spacy.attrs import IS_TITLE + >>> doc = nlp(u'Give it back! He pleaded.') + >>> token = doc[0] + >>> token.check_flag(IS_TITLE) + True """ return Lexeme.c_check_flag(self.c.lex, flag_id) def nbor(self, int i=1): - """ - Get a neighboring token. + """Get a neighboring token. - Arguments: - i (int): The relative position of the token to get. Defaults to 1. - Returns: - neighbor (Token): The token at position self.doc[self.i+i] + i (int): The relative position of the token to get. Defaults to 1. + RETURNS (Token): The token at position `self.doc[self.i+i]`. """ return self.doc[self.i+i] def similarity(self, other): - """ - Compute a semantic similarity estimate. Defaults to cosine over vectors. + """Make a semantic similarity estimate. The default estimate is cosine + similarity using an average of word vectors. - Arguments: - other: - The object to compare with. By default, accepts Doc, Span, - Token and Lexeme objects. - Returns: - score (float): A scalar similarity score. Higher is more similar. + other (object): The object to compare with. By default, accepts `Doc`, + `Span`, `Token` and `Lexeme` objects. + RETURNS (float): A scalar similarity score. Higher is more similar. """ if 'similarity' in self.doc.user_token_hooks: return self.doc.user_token_hooks['similarity'](self) @@ -114,10 +117,14 @@ cdef class Token: return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) property lex_id: + """ID of the token's lexical type. + + RETURNS (int): ID of the token's lexical type.""" def __get__(self): return self.c.lex.id property rank: + # TODO: add docstring def __get__(self): return self.c.lex.id @@ -126,10 +133,19 @@ cdef class Token: return self.text_with_ws property text: + """A unicode representation of the token text. 
+ + RETURNS (unicode): The original verbatim text of the token. + """ def __get__(self): return self.orth_ property text_with_ws: + """The text content of the token with a trailing whitespace character if + it has one. + + RETURNS (unicode): The text content of the span (with trailing whitespace). + """ def __get__(self): cdef unicode orth = self.vocab.strings[self.c.lex.orth] if self.c.spacy: @@ -184,6 +200,10 @@ cdef class Token: return self.c.lex.suffix property lemma: + """Base form of the word, with no inflectional suffixes. + + RETURNS (int): Token lemma. + """ def __get__(self): return self.c.lemma def __set__(self, int lemma): @@ -206,8 +226,10 @@ cdef class Token: self.c.dep = label property has_vector: - """ - A boolean value indicating whether a word vector is associated with the object. + """A boolean value indicating whether a word vector is associated with + the object. + + RETURNS (bool): Whether a word vector is associated with the object. """ def __get__(self): if 'has_vector' in self.doc.user_token_hooks: @@ -220,10 +242,10 @@ cdef class Token: return False property vector: - """ - A real-valued meaning representation. + """A real-valued meaning representation. - Type: numpy.ndarray[ndim=1, dtype='float32'] + RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array + representing the token's semantics. """ def __get__(self): if 'vector' in self.doc.user_token_hooks: @@ -239,15 +261,11 @@ cdef class Token: vector_view = self.c.lex.vector return numpy.asarray(vector_view) - property repvec: - def __get__(self): - raise AttributeError("repvec was renamed to vector in v0.100") - - property has_repvec: - def __get__(self): - raise AttributeError("has_repvec was renamed to has_vector in v0.100") - property vector_norm: + """The L2 norm of the document's vector representation. + + RETURNS (float): The L2 norm of the vector representation. + """ def __get__(self): if 'vector_norm' in self.doc.user_token_hooks: return self.doc.user_token_hooks['vector_norm'](self) @@ -324,28 +342,26 @@ cdef class Token: yield from word.subtree property left_edge: - """ - The leftmost token of this token's syntactic descendents. + """The leftmost token of this token's syntactic descendents. - Returns: Token The first token such that self.is_ancestor(token) + RETURNS (Token): The first token such that `self.is_ancestor(token)`. """ def __get__(self): return self.doc[self.c.l_edge] property right_edge: - """ - The rightmost token of this token's syntactic descendents. + """The rightmost token of this token's syntactic descendents. - Returns: Token The last token such that self.is_ancestor(token) + RETURNS (Token): The last token such that `self.is_ancestor(token)`. """ def __get__(self): return self.doc[self.c.r_edge] property ancestors: - """ - A sequence of this token's syntactic ancestors. + """A sequence of this token's syntactic ancestors. - Yields: Token A sequence of ancestor tokens such that ancestor.is_ancestor(self) + YIELDS (Token): A sequence of ancestor tokens such that + `ancestor.is_ancestor(self)`. """ def __get__(self): cdef const TokenC* head_ptr = self.c @@ -357,33 +373,25 @@ cdef class Token: yield self.doc[head_ptr - (self.c - self.i)] i += 1 - def is_ancestor_of(self, descendant): - # TODO: Remove after backward compatibility check. - return self.is_ancestor(descendant) - def is_ancestor(self, descendant): - """ - Check whether this token is a parent, grandparent, etc. of another + """Check whether this token is a parent, grandparent, etc. 
of another in the dependency tree. - Arguments: - descendant (Token): Another token. - Returns: - is_ancestor (bool): Whether this token is the ancestor of the descendant. + descendant (Token): Another token. + RETURNS (bool): Whether this token is the ancestor of the descendant. """ if self.doc is not descendant.doc: return False return any( ancestor.i == self.i for ancestor in descendant.ancestors ) property head: - """ - The syntactic parent, or "governor", of this token. + """The syntactic parent, or "governor", of this token. - Returns: Token + RETURNS (Token): The token head. """ def __get__(self): - """ - The token predicted by the parser to be the head of the current token. + """The token predicted by the parser to be the head of the current + token. """ return self.doc[self.i + self.c.head] def __set__(self, Token new_head): @@ -477,10 +485,9 @@ cdef class Token: self.c.head = rel_newhead_i property conjuncts: - """ - A sequence of coordinated tokens, including the token itself. + """A sequence of coordinated tokens, including the token itself. - Yields: Token A coordinated token + YIELDS (Token): A coordinated token. """ def __get__(self): """Get a list of conjoined words.""" @@ -495,25 +502,46 @@ cdef class Token: yield from word.conjuncts property ent_type: + """Named entity type. + + RETURNS (int): Named entity type. + """ def __get__(self): return self.c.ent_type property ent_iob: + """IOB code of named entity tag. `1="I", 2="O", 3="B"`. 0 means no tag + is assigned. + + RETURNS (int): IOB code of named entity tag. + """ def __get__(self): return self.c.ent_iob property ent_type_: + """Named entity type. + + RETURNS (unicode): Named entity type. + """ def __get__(self): return self.vocab.strings[self.c.ent_type] property ent_iob_: + """IOB code of named entity tag. "B" means the token begins an entity, + "I" means it is inside an entity, "O" means it is outside an entity, and + "" means no entity tag is set. + + RETURNS (unicode): IOB code of named entity tag. + """ def __get__(self): iob_strings = ('', 'I', 'O', 'B') return iob_strings[self.c.ent_iob] property ent_id: - """ - An (integer) entity ID. Usually assigned by patterns in the Matcher. + """ID of the entity the token is an instance of, if any. Usually + assigned by patterns in the Matcher. + + RETURNS (int): ID of the entity. """ def __get__(self): return self.c.ent_id @@ -522,8 +550,10 @@ cdef class Token: self.c.ent_id = key property ent_id_: - """ - A (string) entity ID. Usually assigned by patterns in the Matcher. + """ID of the entity the token is an instance of, if any. Usually + assigned by patterns in the Matcher. + + RETURNS (unicode): ID of the entity. """ def __get__(self): return self.vocab.strings[self.c.ent_id] @@ -564,6 +594,10 @@ cdef class Token: return self.vocab.strings[self.c.lex.lang] property lemma_: + """Base form of the word, with no inflectional suffixes. + + RETURNS (unicode): Token lemma. + """ def __get__(self): return self.vocab.strings[self.c.lemma] def __set__(self, unicode lemma_): diff --git a/website/docs/api/token.jade b/website/docs/api/token.jade index 1cd4d850d..c0b9e9e3c 100644 --- a/website/docs/api/token.jade +++ b/website/docs/api/token.jade @@ -4,9 +4,255 @@ include ../../_includes/_mixins p An individual token — i.e. a word, punctuation symbol, whitespace, etc. ++h(2, "init") Token.__init__ + +tag method + +p Construct a #[code Token] object. + ++aside-code("Example"). + doc = nlp(u'Give it back! 
He pleaded.') + token = doc[0] + ++table(["Name", "Type", "Description"]) + +row + +cell #[code vocab] + +cell #[code Vocab] + +cell A storage container for lexical types. + + +row + +cell #[code doc] + +cell #[code Doc] + +cell The parent document. + + +row + +cell #[code offset] + +cell int + +cell The index of the token within the document. + + +footrow + +cell returns + +cell #[code Token] + +cell The newly constructed object. + ++h(2, "len") Token.__len__ + +tag method + +p The number of unicode characters in the token, i.e. #[code token.text]. + ++aside-code("Example"). + doc = nlp(u'Give it back! He pleaded.') + token = doc[0] + assert len(token) == 4 + ++table(["Name", "Type", "Description"]) + +footrow + +cell returns + +cell int + +cell The number of unicode characters in the token. + ++h(2, "check_flag") Token.check_flag + +tag method + +p Check the value of a boolean flag. + ++aside-code("Example"). + from spacy.attrs import IS_TITLE + doc = nlp(u'Give it back! He pleaded.') + token = doc[0] + token.check_flag(IS_TITLE) + # True + ++table(["Name", "Type", "Description"]) + +row + +cell #[code flag_id] + +cell int + +cell The attribute ID of the flag to check. + + +footrow + +cell returns + +cell bool + +cell Whether the flag is set. + ++h(2, "nbor") Token.nbor + +tag method + +p Get a neighboring token. + ++aside-code("Example"). + doc = nlp(u'Give it back! He pleaded.') + token = doc[0] + token.nbor() + # it + ++table(["Name", "Type", "Description"]) + +row + +cell #[code i] + +cell int + +cell The relative position of the token to get. Defaults to #[code 1]. + + +footrow + +cell returns + +cell #[code Token] + +cell The token at position #[code self.doc[self.i+i]]. + ++h(2, "similarity") Token.similarity + +tag method + +p Compute a semantic similarity estimate. Defaults to cosine over vectors. + ++aside-code("Example"). + apples, and, oranges = nlp(u'apples and oranges') + apples_oranges = apples.similarity(oranges) + oranges_apples = oranges.similarity(apples) + assert apples_oranges == oranges_apples + ++table(["Name", "Type", "Description"]) + +row + +cell other + +cell - + +cell + | The object to compare with. By default, accepts #[code Doc], + | #[code Span], #[code Token] and #[code Lexeme] objects. + + +footrow + +cell returns + +cell float + +cell A scalar similarity score. Higher is more similar. + ++h(2, "is_ancestor") Token.is_ancestor + +tag method + +p + | Check whether this token is a parent, grandparent, etc. of another + | in the dependency tree. + ++table(["Name", "Type", "Description"]) + +row + +cell descendant + +cell #[code Token] + +cell Another token. + + +footrow + +cell returns + +cell bool + +cell Whether this token is the ancestor of the descendant. + ++h(2, "has_vector") Token.has_vector + +tag property + +tag requires model + +p + | A boolean value indicating whether a word vector is associated with the + | token. + ++aside-code("Example"). + apple = nlp(u'apple') + assert apple.has_vector + ++table(["Name", "Type", "Description"]) + +footrow + +cell returns + +cell bool + +cell Whether the token has a vector data attached. + ++h(2, "vector") Token.vector + +tag property + +tag requires model + +p + | A real-valued meaning representation. + ++aside-code("Example"). + apple = nlp(u'apple') + (apple.vector.dtype, apple.vector.shape) + # (dtype('float32'), (300,)) + ++table(["Name", "Type", "Description"]) + +footrow + +cell returns + +cell #[code numpy.ndarray[ndim=1, dtype='float32']] + +cell A 1D numpy array representing the token's semantics. 
+ ++h(2, "vector_norm") Span.vector_norm + +tag property + +tag requires model + +p + | The L2 norm of the token's vector representation. + ++table(["Name", "Type", "Description"]) + +footrow + +cell returns + +cell float + +cell The L2 norm of the vector representation. + ++h(2, "conjuncts") Token.conjuncts + +tag property + +p A sequence of coordinated tokens, including the token itself. + ++table(["Name", "Type", "Description"]) + +footrow + +cell yields + +cell #[code Token] + +cell A coordinated token. + ++h(2, "children") Token.children + +tag property + +p A sequence of the token's immediate syntactic children. + ++table(["Name", "Type", "Description"]) + +footrow + +cell yields + +cell #[code Token] + +cell A child token such that #[code child.head==self]. + ++h(2, "subtree") Token.subtree + +tag property + +p A sequence of all the token's syntactic descendents. + ++table(["Name", "Type", "Description"]) + +footrow + +cell yields + +cell #[code Token] + +cell A descendant token such that #[code self.is_ancestor(descendant)]. + ++h(2, "ancestors") Token.ancestors + +tag property + +p The rightmost token of this token's syntactic descendants. + ++table(["Name", "Type", "Description"]) + +footrow + +cell yields + +cell #[code Token] + +cell + | A sequence of ancestor tokens such that + | #[code ancestor.is_ancestor(self)]. + +h(2, "attributes") Attributes +table(["Name", "Type", "Description"]) + +row + +cell #[code text] + +cell unicode + +cell Verbatim text content. + +row + +cell #[code text_with_ws] + +cell unicode + +cell Text content, with trailing space character if present. + + +row + +cell #[code whitespace] + +cell int + +cell Trailing space character if present. + +row + +cell #[code whitespace_] + +cell unicode + +cell Trailing space character if present. + +row +cell #[code vocab] +cell #[code Vocab] @@ -17,14 +263,31 @@ p An individual token — i.e. a word, punctuation symbol, whitespace, etc. +cell #[code Doc] +cell The parent document. + +row + +cell #[code head] + +cell #[code Token] + +cell The syntactic parent, or "governor", of this token. + + +row + +cell #[code left_edge] + +cell #[code Token] + +cell The leftmost token of this token's syntactic descendants. + + +row + +cell #[code right_edge] + +cell #[code Token] + +cell The rightmost token of this token's syntactic descendents. + +row +cell #[code i] +cell int +cell The index of the token within the parent document. + +row +cell #[code ent_type] +cell int +cell Named entity type. + +row +cell #[code ent_type_] +cell unicode @@ -42,19 +305,23 @@ p An individual token — i.e. a word, punctuation symbol, whitespace, etc. +cell unicode +cell | IOB code of named entity tag. #[code "B"] - | means the token begins an entity, #[code "I"] means it inside an - | entity, #[code "O"] means it is outside an entity, and + | means the token begins an entity, #[code "I"] means it is inside + | an entity, #[code "O"] means it is outside an entity, and | #[code ""] means no entity tag is set. +row +cell #[code ent_id] +cell int - +cell ID of the entity the token is an instance of, if any. + +cell + | ID of the entity the token is an instance of, if any. Usually + | assigned by patterns in the Matcher. +row +cell #[code ent_id_] +cell unicode - +cell ID of the entity the token is an instance of, if any. + +cell + | ID of the entity the token is an instance of, if any. Usually + | assigned by patterns in the Matcher. +row +cell #[code lemma] @@ -229,232 +496,3 @@ p An individual token — i.e. 
a word, punctuation symbol, whitespace, etc. +cell #[code lex_id] +cell int +cell ID of the token's lexical type. - - +row - +cell #[code text] - +cell unicode - +cell Verbatim text content. - +row - +cell #[code text_with_ws] - +cell unicode - +cell Text content, with trailing space character if present. - - +row - +cell #[code whitespace] - +cell int - +cell Trailing space character if present. - +row - +cell #[code whitespace_] - +cell unicode - +cell Trailing space character if present. - - -+h(2, "init") Token.__init__ - +tag method - -p Construct a #[code Token] object. - -+table(["Name", "Type", "Description"]) - +row - +cell #[code vocab] - +cell #[code Vocab] - +cell A storage container for lexical types. - - +row - +cell #[code doc] - +cell #[code Doc] - +cell The parent document. - - +row - +cell #[code offset] - +cell int - +cell The index of the token within the document. - - +footrow - +cell returns - +cell #[code Token] - +cell The newly constructed object. - -+h(2, "len") Token.__len__ - +tag method - -p Get the number of unicode characters in the token. - -+table(["Name", "Type", "Description"]) - +footrow - +cell returns - +cell int - +cell The number of unicode characters in the token. - - -+h(2, "check_flag") Token.check_flag - +tag method - -p Check the value of a boolean flag. - -+table(["Name", "Type", "Description"]) - +row - +cell #[code flag_id] - +cell int - +cell The attribute ID of the flag to check. - - +footrow - +cell returns - +cell bool - +cell Whether the flag is set. - -+h(2, "nbor") Token.nbor - +tag method - -p Get a neighboring token. - -+table(["Name", "Type", "Description"]) - +row - +cell #[code i] - +cell int - +cell The relative position of the token to get. Defaults to #[code 1]. - - +footrow - +cell returns - +cell #[code Token] - +cell The token at position #[code self.doc[self.i+i]] - -+h(2, "similarity") Token.similarity - +tag method - -p Compute a semantic similarity estimate. Defaults to cosine over vectors. - -+table(["Name", "Type", "Description"]) - +row - +cell other - +cell - - +cell - | The object to compare with. By default, accepts #[code Doc], - | #[code Span], #[code Token] and #[code Lexeme] objects. - - +footrow - +cell returns - +cell float - +cell A scalar similarity score. Higher is more similar. - -+h(2, "is_ancestor") Token.is_ancestor - +tag method - -p - | Check whether this token is a parent, grandparent, etc. of another - | in the dependency tree. - -+table(["Name", "Type", "Description"]) - +row - +cell descendant - +cell #[code Token] - +cell Another token. - - +footrow - +cell returns - +cell bool - +cell Whether this token is the ancestor of the descendant. - - -+h(2, "vector") Token.vector - +tag property - -p A real-valued meaning representation. - -+table(["Name", "Type", "Description"]) - +footrow - +cell returns - +cell #[code numpy.ndarray[ndim=1, dtype='float32']] - +cell A 1D numpy array representing the token's semantics. - -+h(2, "has_vector") Token.has_vector - +tag property - -p - | A boolean value indicating whether a word vector is associated with the - | object. - -+table(["Name", "Type", "Description"]) - +footrow - +cell returns - +cell bool - +cell Whether the token has a vector data attached. - -+h(2, "head") Token.head - +tag property - -p The syntactic parent, or "governor", of this token. - -+table(["Name", "Type", "Description"]) - +footrow - +cell returns - +cell #[code Token] - +cell The head. 
- -+h(2, "conjuncts") Token.conjuncts - +tag property - -p A sequence of coordinated tokens, including the token itself. - -+table(["Name", "Type", "Description"]) - +footrow - +cell yields - +cell #[code Token] - +cell A coordinated token. - -+h(2, "children") Token.children - +tag property - -p A sequence of the token's immediate syntactic children. - -+table(["Name", "Type", "Description"]) - +footrow - +cell yields - +cell #[code Token] - +cell A child token such that #[code child.head==self]. - -+h(2, "subtree") Token.subtree - +tag property - -p A sequence of all the token's syntactic descendents. - -+table(["Name", "Type", "Description"]) - +footrow - +cell yields - +cell #[code Token] - +cell A descendant token such that #[code self.is_ancestor(descendant)]. - -+h(2, "left_edge") Token.left_edge - +tag property - -p The leftmost token of this token's syntactic descendants. - -+table(["Name", "Type", "Description"]) - +footrow - +cell returns - +cell #[code Token] - +cell The first token such that #[code self.is_ancestor(token)]. - -+h(2, "right_edge") Token.right_edge - +tag property - -p The rightmost token of this token's syntactic descendents. - -+table(["Name", "Type", "Description"]) - +footrow - +cell returns - +cell #[code Token] - +cell The last token such that #[code self.is_ancestor(token)]. - -+h(2, "ancestors") Token.ancestors - +tag property - -p The rightmost token of this token's syntactic descendants. - -+table(["Name", "Type", "Description"]) - +footrow - +cell yields - +cell #[code Token] - +cell - | A sequence of ancestor tokens such that - | #[code ancestor.is_ancestor(self)].