mirror of https://github.com/explosion/spaCy.git
Improve docstrings for Doc object
This commit is contained in:
parent 81a47c01d8
commit 1b520e7bab
@@ -59,10 +59,42 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
 
 cdef class Doc:
     """
-    Container class for annotated text. Constructed via English.__call__ or
-    Tokenizer.__call__.
+    A sequence of `Token` objects. Access sentences and named entities,
+    export annotations to numpy arrays, losslessly serialize to compressed
+    binary strings.
+
+    Aside: Internals
+        The `Doc` object holds an array of `TokenC` structs.
+        The Python-level `Token` and `Span` objects are views of this
+        array, i.e. they don't own the data themselves.
+
+    Code: Construction 1
+        doc = nlp.tokenizer(u'Some text')
+
+    Code: Construction 2
+        doc = Doc(nlp.vocab, orths_and_spaces=[(u'Some', True), (u'text', True)])
+
     """
     def __init__(self, Vocab vocab, orths_and_spaces=None):
+        '''
+        Create a Doc object.
+
+        Aside: Implementation
+            This method of constructing a `Doc` object is usually only used
+            for deserialization. Standard usage is to construct the document via
+            a call to the language object.
+
+        Arguments:
+            vocab:
+                A Vocabulary object, which must match any models you want to
+                use (e.g. tokenizer, parser, entity recognizer).
+
+            orths_and_spaces:
+                A list of tokens in the document as a sequence of
+                `(orth_id, has_space)` tuples, where `orth_id` is an
+                integer and `has_space` is a boolean, indicating whether the
+                token has a trailing space.
+        '''
        self.vocab = vocab
        size = 20
        self.mem = Pool()
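As a quick orientation for the two construction routes documented above, the snippets combine into one short runnable sketch. This is illustrative only: it assumes the spacy.en English data for this version of spaCy is installed, and the variable names are mine, not the commit's.

    from spacy.en import English
    from spacy.tokens.doc import Doc

    nlp = English()

    # Construction 1: the usual route, via the tokenizer (or nlp(...) itself).
    doc1 = nlp.tokenizer(u'Some text')

    # Construction 2: build the Doc directly from (orth, has_space) pairs,
    # as in the docstring's example; mainly useful for deserialization.
    doc2 = Doc(nlp.vocab, orths_and_spaces=[(u'Some', True), (u'text', True)])

    assert [t.orth_ for t in doc1] == [t.orth_ for t in doc2]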
@@ -102,11 +134,21 @@ cdef class Doc:
                 <const LexemeC*>self.vocab.get(self.mem, orth), has_space)
 
     def __getitem__(self, object i):
-        """Get a Token or a Span from the Doc.
-
-        Returns:
-            token (Token) or span (Span):
-        """
+        '''
+        doc[i]
+            Get the Token object at position i, where i is an integer.
+            Negative indexing is supported, and follows the usual Python
+            semantics, i.e. doc[-2] is doc[len(doc) - 2].
+        doc[start : end]
+            Get a `Span` object, starting at position `start`
+            and ending at position `end`, where `start` and
+            `end` are token indices. For instance,
+            `doc[2:5]` produces a span consisting of
+            tokens 2, 3 and 4. Stepped slices (e.g. `doc[start : end : step]`)
+            are not supported, as `Span` objects must be contiguous (cannot have gaps).
+            You can use negative indices and open-ended ranges, which have their
+            normal Python semantics.
+        '''
         if isinstance(i, slice):
             start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
             return Span(self, start, stop, label=0)
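The indexing semantics spelled out in the new docstring, condensed into one sketch; `doc` is assumed to come from a loaded pipeline.

    doc = nlp(u'Mr. Best flew to New York on Saturday morning.')

    first = doc[0]          # Token at position 0
    second_last = doc[-2]   # same as doc[len(doc) - 2]
    span = doc[2:5]         # Span over tokens 2, 3 and 4
    tail = doc[5:]          # open-ended ranges are fine
    # doc[0:6:2] is rejected: stepped slices are unsupported because
    # Span objects must be contiguous.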
@@ -120,11 +162,15 @@ cdef class Doc:
             return Token.cinit(self.vocab, &self.c[i], i, self)
 
     def __iter__(self):
-        """Iterate over the tokens.
-
-        Yields:
-            token (Token):
-        """
+        '''
+        for token in doc
+            Iterate over `Token` objects, from which the annotations can
+            be easily accessed. This is the main way of accessing Token
+            objects, which are the main way annotations are accessed from
+            Python. If faster-than-Python speeds are required, you can
+            instead access the annotations as a numpy array, or access the
+            underlying C data directly from Cython.
+        '''
         cdef int i
         for i in range(self.length):
             if self._py_tokens[i] is not None:
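A minimal sketch of the iteration path the docstring describes; for bulk access, to_array() (documented further down in this file) is the faster alternative.

    for token in doc:
        print(token.orth_)   # each Token is a view into the Doc's TokenC array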
@@ -133,6 +179,10 @@ cdef class Doc:
                 yield Token.cinit(self.vocab, &self.c[i], i, self)
 
     def __len__(self):
+        '''
+        len(doc)
+            The number of tokens in the document.
+        '''
         return self.length
 
     def __unicode__(self):
@@ -161,7 +211,10 @@ cdef class Doc:
     property vector:
         def __get__(self):
             if self._vector is None:
-                self._vector = sum(t.vector for t in self) / len(self)
+                if len(self):
+                    self._vector = sum(t.vector for t in self) / len(self)
+                else:
+                    return numpy.zeros((self.vocab.vectors_length,), dtype='float32')
             return self._vector
 
         def __set__(self, value):
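The getter now distinguishes empty documents (zero vector) from non-empty ones (mean of the token vectors). A hedged check of that behaviour, assuming the vocabulary has word vectors loaded:

    import numpy

    doc = nlp(u'Some text')
    assert doc.vector.shape == (nlp.vocab.vectors_length,)
    # The document vector is the average of the token vectors:
    assert numpy.allclose(doc.vector, sum(t.vector for t in doc) / len(doc))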
@@ -193,18 +246,22 @@ cdef class Doc:
         return u''.join(t.text_with_ws for t in self)
 
     property ents:
+        '''
+        Yields named-entity `Span` objects, if the entity recognizer
+        has been applied to the document. Iterate over the span to get
+        individual Token objects, or access the label:
+
+        Example:
+            from spacy.en import English
+            nlp = English()
+            tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
+            ents = list(tokens.ents)
+            assert ents[0].label == 346
+            assert ents[0].label_ == 'PERSON'
+            assert ents[0].orth_ == 'Best'
+            assert ents[0].text == 'Mr. Best'
+        '''
         def __get__(self):
-            """Yields named-entity Span objects.
-
-            Iterate over the span to get individual Token objects, or access the label:
-
-            >>> from spacy.en import English
-            >>> nlp = English()
-            >>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
-            >>> ents = list(tokens.ents)
-            >>> ents[0].label, ents[0].label_, ''.join(t.orth_ for t in ents[0])
-            (112504, u'PERSON', u'Best ')
-            """
             cdef int i
             cdef const TokenC* token
             cdef int start = -1
@@ -263,9 +320,15 @@ cdef class Doc:
             # Set start as B
             self.c[start].ent_iob = 3
 
-    @property
-    def noun_chunks(self):
-        """Yield spans for base noun phrases."""
-        if not self.is_parsed:
-            raise ValueError(
-                "noun_chunks requires the dependency parse, which "
+    property noun_chunks:
+        '''
+        Yields base noun-phrase `Span` objects, if the document
+        has been syntactically parsed. A base noun phrase, or
+        'NP chunk', is a noun phrase that does not permit other NPs to
+        be nested within it – so no NP-level coordination, no prepositional
+        phrases, and no relative clauses.
+        '''
+        def __get__(self):
+            if not self.is_parsed:
+                raise ValueError(
+                    "noun_chunks requires the dependency parse, which "
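An illustrative sketch of the base noun phrases the docstring defines; the exact chunks depend on the parser model, so the expected output is indicative only.

    doc = nlp(u'The quick brown fox jumped over the lazy dog.')
    chunks = [chunk.orth_ for chunk in doc.noun_chunks]
    # indicatively: [u'The quick brown fox', u'the lazy dog']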
@@ -282,11 +345,20 @@ cdef class Doc:
                 for span in spans:
                     yield span
 
-    @property
-    def sents(self):
-        """
-        Yield a list of sentence Span objects, calculated from the dependency parse.
-        """
-        if not self.is_parsed:
-            raise ValueError(
-                "sentence boundary detection requires the dependency parse, which "
+    property sents:
+        """
+        Yields sentence `Span` objects. Sentence spans have no label.
+        To improve accuracy on informal texts, spaCy calculates sentence
+        boundaries from the syntactic dependency parse. If the parser is disabled,
+        the `sents` iterator will be unavailable.
+
+        Example:
+            from spacy.en import English
+            nlp = English()
+            doc = nlp("This is a sentence. Here's another...")
+            assert [s.root.orth_ for s in doc.sents] == ["is", "'s"]
+        """
+        def __get__(self):
+            if not self.is_parsed:
+                raise ValueError(
+                    "sentence boundary detection requires the dependency parse, which "
@@ -324,8 +396,16 @@ cdef class Doc:
 
     @cython.boundscheck(False)
     cpdef np.ndarray to_array(self, object py_attr_ids):
-        """Given a list of M attribute IDs, export the tokens to a numpy ndarray
-        of shape N*M, where N is the length of the sentence.
+        """
+        Given a list of M attribute IDs, export the tokens to a numpy
+        `ndarray` of shape (N, M), where `N` is the length
+        of the document. The values will be 32-bit integers.
+
+        Example:
+            from spacy import attrs
+            doc = nlp(text)
+            # All strings mapped to integers, for easy export to numpy
+            np_array = doc.to_array([attrs.LOWER, attrs.POS, attrs.ENT_TYPE, attrs.IS_ALPHA])
 
         Arguments:
             attr_ids (list[int]): A list of attribute ID ints.
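A quick shape check for the export described above: one row per token, one column per requested attribute.

    from spacy import attrs

    doc = nlp(u'apple apple orange banana')
    np_array = doc.to_array([attrs.ORTH, attrs.IS_ALPHA])
    assert np_array.shape == (len(doc), 2)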
@@ -351,16 +431,22 @@ cdef class Doc:
         """Produce a dict of {attribute (int): count (ints)} frequencies, keyed
         by the values of the given attribute ID.
 
-        >>> from spacy.en import English, attrs
-        >>> nlp = English()
-        >>> tokens = nlp(u'apple apple orange banana')
-        >>> tokens.count_by(attrs.ORTH)
-        {12800L: 1, 11880L: 2, 7561L: 1}
-        >>> tokens.to_array([attrs.ORTH])
-        array([[11880],
-               [11880],
-               [ 7561],
-               [12800]])
+        Example:
+            from spacy.en import English, attrs
+            nlp = English()
+            tokens = nlp(u'apple apple orange banana')
+            tokens.count_by(attrs.ORTH)
+            # {12800L: 1, 11880L: 2, 7561L: 1}
+            tokens.to_array([attrs.ORTH])
+            # array([[11880],
+            #        [11880],
+            #        [ 7561],
+            #        [12800]])
+
+        Arguments:
+            attr_id
+                int
+                The attribute ID to key the counts.
         """
         cdef int i
         cdef attr_t attr
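The keys returned by count_by() are integer attribute values; they can be mapped back to text through the vocabulary's string store. A small sketch, assuming nlp.vocab.strings resolves integer IDs back to unicode, as elsewhere in spaCy of this era:

    counts = doc.count_by(attrs.ORTH)
    readable = dict((nlp.vocab.strings[key], freq) for key, freq in counts.items())
    # e.g. {u'apple': 2, u'orange': 1, u'banana': 1}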
@@ -408,6 +494,8 @@ cdef class Doc:
             self.c[i] = parsed[i]
 
     def from_array(self, attrs, array):
+        '''Write to a `Doc` object, from an `(N, M)` array of attributes.
+        '''
         cdef int i, col
         cdef attr_id_t attr_id
         cdef TokenC* tokens = self.c
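A hedged round-trip sketch for from_array(): export one attribute column with to_array() and write it onto a second Doc. It assumes TAG is among the attributes this version can write back; the sentence and variable names are illustrative.

    from spacy import attrs

    doc = nlp(u'Give it back! He pleaded.')
    tag_array = doc.to_array([attrs.TAG])

    # Rebuild a Doc with the same words, then restore the tags onto it.
    doc2 = Doc(nlp.vocab, orths_and_spaces=[(t.orth_, True) for t in doc])
    doc2.from_array([attrs.TAG], tag_array)
    assert [t.tag_ for t in doc2] == [t.tag_ for t in doc]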
@@ -448,16 +536,34 @@ cdef class Doc:
         return self
 
     def to_bytes(self):
+        '''Serialize, producing a byte string.'''
         byte_string = self.vocab.serializer.pack(self)
         cdef uint32_t length = len(byte_string)
         return struct.pack('I', length) + byte_string
 
     def from_bytes(self, data):
+        '''Deserialize, loading from bytes.'''
         self.vocab.serializer.unpack_into(data[4:], self)
         return self
 
     @staticmethod
     def read_bytes(file_):
+        '''
+        A static method, used to read serialized `Doc` objects from
+        a file.
+
+        Example:
+            from spacy.tokens.doc import Doc
+            loc = 'test_serialize.bin'
+            with open(loc, 'wb') as file_:
+                file_.write(nlp(u'This is a document.').to_bytes())
+                file_.write(nlp(u'This is another.').to_bytes())
+            docs = []
+            with open(loc, 'rb') as file_:
+                for byte_string in Doc.read_bytes(file_):
+                    docs.append(Doc(nlp.vocab).from_bytes(byte_string))
+            assert len(docs) == 2
+        '''
         keep_reading = True
         while keep_reading:
             try:
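Alongside the file-based example in the read_bytes docstring, to_bytes() and from_bytes() also round-trip a single document in memory; a minimal sketch:

    doc = nlp(u'This is a document.')
    byte_string = doc.to_bytes()

    doc2 = Doc(nlp.vocab).from_bytes(byte_string)
    assert [t.orth_ for t in doc2] == [t.orth_ for t in doc]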
@@ -472,8 +578,7 @@ cdef class Doc:
 
     def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
               unicode ent_type):
-        """Merge a multi-word expression into a single token. Currently
-        experimental; API is likely to change."""
+        """Merge a multi-word expression into a single token."""
         cdef int start = token_by_start(self.c, self.length, start_idx)
         if start == -1:
             return None
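A hedged sketch of the merge() signature this hunk keeps: start_idx and end_idx are character offsets into the document text, and the merged token takes the supplied tag, lemma and entity type. The tag, lemma and entity values below are illustrative, not prescribed by the commit.

    doc = nlp(u'Mr. Best flew to New York on Saturday morning.')
    start = doc[4].idx                  # character offset where 'New' begins
    end = doc[5].idx + len(doc[5])      # character offset where 'York' ends
    doc.merge(start, end, u'NNP', u'New York', u'GPE')
    assert doc[4].orth_ == u'New York'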