diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 55330af78..f7b10572e 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -66,6 +66,10 @@ cdef class Span: return hash((self.doc, self.label, self.start_char, self.end_char)) def __len__(self): + """Get the number of tokens in the span. + + RETURNS (int): The number of tokens in the span. + """ self._recalculate_indices() if self.end < self.start: return 0 @@ -77,6 +81,16 @@ cdef class Span: return self.text.encode('utf-8') def __getitem__(self, object i): + """Get a `Token` or a `Span` object. + + i (int or slice): The index of the token within the span, or slice of + the span to get. + RETURNS (Token or Span): The token or span at `span[i]`. + + EXAMPLE: + >>> span[0] + >>> span[1:3] + """ self._recalculate_indices() if isinstance(i, slice): start, end = normalize_slice(len(self), i.start, i.stop, i.step) @@ -88,12 +102,17 @@ cdef class Span: return self.doc[self.start + i] def __iter__(self): + """Iterate over `Token` objects. + + YIELDS (Token): A `Token` object. + """ self._recalculate_indices() for i in range(self.start, self.end): yield self.doc[i] def merge(self, *args, **attributes): - """Retokenize the document, such that the span is merged into a single token. + """Retokenize the document, such that the span is merged into a single + token. **attributes: Attributes to assign to the merged token. By default, attributes are inherited from the syntactic root token of the span. @@ -241,15 +260,15 @@ cdef class Span: The head of 'new' is 'York', and the head of "York" is "like" - >>> toks[new].head.orth_ + >>> toks[new].head.text 'York' - >>> toks[york].head.orth_ + >>> toks[york].head.text 'like' Create a span for "New York". Its root is "York". 
>>> new_york = toks[new:york+1] - >>> new_york.root.orth_ + >>> new_york.root.text 'York' Here's a more complicated case, raised by issue #214: @@ -370,7 +389,10 @@ cdef class Span: return ''.join([t.string for t in self]).strip() property lemma_: - # TODO: docstring + """The span's lemma. + + RETURNS (unicode): The span's lemma. + """ def __get__(self): return ' '.join([t.lemma_ for t in self]).strip() @@ -390,7 +412,10 @@ cdef class Span: return ''.join([t.string for t in self]) property label_: - # TODO: docstring + """The span's label. + + RETURNS (unicode): The span's label. + """ def __get__(self): return self.doc.vocab.strings[self.label] diff --git a/website/docs/api/span.jade b/website/docs/api/span.jade index 539a64311..9fa322f3e 100644 --- a/website/docs/api/span.jade +++ b/website/docs/api/span.jade @@ -2,7 +2,265 @@ include ../../_includes/_mixins -p A slice from a #[code Doc] object. +p A slice from a #[+api("doc") #[code Doc]] object. + ++h(2, "init") Span.__init__ + +tag method + +p Create a Span object from the #[code slice doc[start : end]]. + ++aside-code("Example"). + doc = nlp(u'Give it back! He pleaded.') + span = doc[1:4] + print([token.text for token in span]) + # ['it', 'back', '!'] + ++table(["Name", "Type", "Description"]) + +row + +cell #[code doc] + +cell #[code Doc] + +cell The parent document. + + +row + +cell #[code start] + +cell int + +cell The index of the first token of the span. + + +row + +cell #[code end] + +cell int + +cell The index of the first token after the span. + + +row + +cell #[code label] + +cell int + +cell A label to attach to the span, e.g. for named entities. + + +row + +cell #[code vector] + +cell #[code numpy.ndarray[ndim=1, dtype='float32']] + +cell A meaning representation of the span. + + +footrow + +cell returns + +cell #[code Span] + +cell The newly constructed object. + ++h(2, "getitem") Span.__getitem__ + +tag method + +p Get a #[code Token] object. + ++aside-code("Example"). + doc = nlp(u'Give it back! 
He pleaded.') + span = doc[1:4] + assert span[1].text == 'back' + ++table(["Name", "Type", "Description"]) + +row + +cell #[code i] + +cell int + +cell The index of the token within the span. + + +footrow + +cell returns + +cell #[code Token] + +cell The token at #[code span[i]]. + +p Get a #[code Span] object. + ++aside-code("Example"). + doc = nlp(u'Give it back! He pleaded.') + span = doc[1:4] + assert span[1:3].text == 'back!' + ++table(["Name", "Type", "Description"]) + +row + +cell #[code start_end] + +cell slice + +cell The slice of the span to get. + + +footrow + +cell returns + +cell #[code Span] + +cell The span at #[code span[start : end]]. + ++h(2, "iter") Span.__iter__ + +tag method + +p Iterate over #[code Token] objects. + ++aside-code("Example"). + doc = nlp(u'Give it back! He pleaded.') + span = doc[1:4] + print([token.text for token in span]) + # ['it', 'back', '!'] + ++table(["Name", "Type", "Description"]) + +footrow + +cell yields + +cell #[code Token] + +cell A #[code Token] object. + ++h(2, "len") Span.__len__ + +tag method + +p Get the number of tokens in the span. + ++aside-code("Example"). + doc = nlp(u'Give it back! He pleaded.') + span = doc[1:4] + assert len(span) == 3 + ++table(["Name", "Type", "Description"]) + +footrow + +cell returns + +cell int + +cell The number of tokens in the span. + ++h(2, "similarity") Span.similarity + +tag method + +tag requires model + +p + | Make a semantic similarity estimate. The default estimate is cosine + | similarity using an average of word vectors. + ++aside-code("Example"). + doc = nlp(u'apples and oranges') + apples = doc[0] + oranges = doc[2] + apples_oranges = apples.similarity(oranges) + oranges_apples = oranges.similarity(apples) + assert apples_oranges == oranges_apples + ++table(["Name", "Type", "Description"]) + +row + +cell #[code other] + +cell - + +cell + | The object to compare with. By default, accepts #[code Doc], + | #[code Span], #[code Token] and #[code Lexeme] objects. 
+ + +footrow + +cell returns + +cell float + +cell A scalar similarity score. Higher is more similar. + ++h(2, "merge") Span.merge + +tag method + +p Retokenize the document, such that the span is merged into a single token. + ++table(["Name", "Type", "Description"]) + +row + +cell #[code **attributes] + +cell - + +cell + | Attributes to assign to the merged token. By default, attributes + | are inherited from the syntactic root token of the span. + + +footrow + +cell returns + +cell #[code Token] + +cell The newly merged token. + ++h(2, "text") Span.text + +tag property + ++aside-code("Example"). + doc = nlp(u'Give it back! He pleaded.') + assert doc[1:4].text == 'it back!' + +p A unicode representation of the span text. + ++table(["Name", "Type", "Description"]) + +footrow + +cell returns + +cell unicode + +cell The original verbatim text of the span. + ++h(2, "text_with_ws") Span.text_with_ws + +tag property + ++aside-code("Example"). + doc = nlp(u'Give it back! He pleaded.') + assert doc[1:4].text_with_ws == 'it back! ' + +p + | The text content of the span with a trailing whitespace character if the + | last token has one. + ++table(["Name", "Type", "Description"]) + +footrow + +cell returns + +cell unicode + +cell The text content of the span (with trailing whitespace). + ++h(2, "sent") Span.sent + +tag property + +p The sentence span that this span is a part of. + ++table(["Name", "Type", "Description"]) + +footrow + +cell returns + +cell #[code Span] + +cell The sentence this is part of. + ++h(2, "root") Span.root + +tag property + +p + | The token within the span that's highest in the parse tree. If there's a + | tie, the earliest is preferred. + ++aside-code("Example"). 
+ tokens = nlp(u'I like New York in Autumn.') + i, like, new, york, in_, autumn, dot = range(len(tokens)) + assert tokens[new].head.text == 'York' + assert tokens[york].head.text == 'like' + new_york = tokens[new:york+1] + assert new_york.root.text == 'York' + ++table(["Name", "Type", "Description"]) + +footrow + +cell returns + +cell #[code Token] + +cell The root token. + ++h(2, "lefts") Span.lefts + +tag property + +p Tokens that are to the left of the span, whose head is within the span. + ++table(["Name", "Type", "Description"]) + +footrow + +cell yields + +cell #[code Token] + +cell A left-child of a token of the span. + ++h(2, "rights") Span.rights + +tag property + +p Tokens that are to the right of the span, whose head is within the span. + ++table(["Name", "Type", "Description"]) + +footrow + +cell yields + +cell #[code Token] + +cell A right-child of a token of the span. + ++h(2, "subtree") Span.subtree + +tag property + +p Tokens that descend from tokens in the span, but fall outside it. + ++table(["Name", "Type", "Description"]) + +footrow + +cell yields + +cell #[code Token] + +cell A descendant of a token within the span. +h(2, "attributes") Attributes @@ -56,209 +314,3 @@ p A slice from a #[code Doc] object. +cell #[code ent_id_] +cell unicode +cell The string ID of the named entity the token is an instance of. - -+h(2, "init") Span.__init__ - +tag method - -p Create a Span object from the #[code slice doc[start : end]]. - -+table(["Name", "Type", "Description"]) - +row - +cell #[code doc] - +cell #[code Doc] - +cell The parent document. - - +row - +cell #[code start] - +cell int - +cell The index of the first token of the span. - - +row - +cell #[code end] - +cell int - +cell The index of the first token after the span. - - +row - +cell #[code label] - +cell int - +cell A label to attach to the span, e.g. for named entities. 
- - +row - +cell #[code vector] - +cell #[code numpy.ndarray[ndim=1, dtype='float32']] - +cell A meaning representation of the span. - - +footrow - +cell returns - +cell #[code Span] - +cell The newly constructed object. - -+h(2, "getitem") Span.__getitem__ - +tag method - -p Get a #[code Token] object. - -+table(["Name", "Type", "Description"]) - +row - +cell #[code i] - +cell int - +cell The index of the token within the span. - - +footrow - +cell returns - +cell #[code Token] - +cell The token at #[code span[i]]. - -p Get a #[code Span] object. - -+table(["Name", "Type", "Description"]) - +row - +cell #[code start_end] - +cell tuple - +cell The slice of the span to get. - - +footrow - +cell returns - +cell #[code Span] - +cell The span at #[code span[start : end]]. - -+h(2, "iter") Span.__iter__ - +tag method - -p Iterate over #[code Token] objects. - -+table(["Name", "Type", "Description"]) - +footrow - +cell yields - +cell #[code Token] - +cell A #[code Token] object. - -+h(2, "len") Span.__len__ - +tag method - -p Get the number of tokens in the span. - -+table(["Name", "Type", "Description"]) - +footrow - +cell returns - +cell int - +cell The number of tokens in the span. - -+h(2, "similarity") Span.similarity - +tag method - -p - | Make a semantic similarity estimate. The default estimate is cosine - | similarity using an average of word vectors. - -+table(["Name", "Type", "Description"]) - +row - +cell #[code other] - +cell - - +cell - | The object to compare with. By default, accepts #[code Doc], - | #[code Span], #[code Token] and #[code Lexeme] objects. - - +footrow - +cell returns - +cell float - +cell A scalar similarity score. Higher is more similar. - -+h(2, "merge") Span.merge - +tag method - -p Retokenize the document, such that the span is merged into a single token. - -+table(["Name", "Type", "Description"]) - +row - +cell #[code **attributes] - +cell - - +cell - | Attributes to assign to the merged token. 
By default, attributes - | are inherited from the syntactic root token of the span. - - +footrow - +cell returns - +cell #[code Token] - +cell The newly merged token. - -+h(2, "text") Span.text - +tag property - -p A unicode representation of the span text. - -+table(["Name", "Type", "Description"]) - +footrow - +cell returns - +cell unicode - +cell The original verbatim text of the span. - -+h(2, "text_with_ws") Span.text_with_ws - +tag property - -p - | The text content of the span with a trailing whitespace character if the - | last token has one. - -+table(["Name", "Type", "Description"]) - +footrow - +cell returns - +cell unicode - +cell The text content of the span (with trailing whitespace). - -+h(2, "sent") Span.sent - +tag property - -p The sentence span that this span is a part of. - -+table(["Name", "Type", "Description"]) - +footrow - +cell returns - +cell #[code Span] - +cell The sentence this is part of. - -+h(2, "root") Span.root - +tag property - -p - | The token within the span that's highest in the parse tree. If there's a - | tie, the earlist is prefered. - -+table(["Name", "Type", "Description"]) - +footrow - +cell returns - +cell #[code Token] - +cell The root token. - -+h(2, "lefts") Span.lefts - +tag property - -p Tokens that are to the left of the span, whose head is within the span. - -+table(["Name", "Type", "Description"]) - +footrow - +cell yields - +cell #[code Token] - +cell A left-child of a token of the span. - -+h(2, "rights") Span.rights - +tag property - -p Tokens that are to the right of the span, whose head is within the span. - -+table(["Name", "Type", "Description"]) - +footrow - +cell yields - +cell #[code Token] - +cell A right-child of a token of the span. - -+h(2, "subtree") Span.subtree - +tag property - -p Tokens that descend from tokens in the span, but fall outside it. - -+table(["Name", "Type", "Description"]) - +footrow - +cell yields - +cell #[code Token] - +cell A descendant of a token within the span.