2016-10-03 18:19:13 +00:00
|
|
|
|
//- ----------------------------------
|
|
|
|
|
//- 💫 DOCS > API > DOC
|
|
|
|
|
//- ----------------------------------
|
2016-03-31 14:24:48 +00:00
|
|
|
|
|
2016-10-03 18:19:13 +00:00
|
|
|
|
+section("doc")
|
|
|
|
|
+h(2, "doc", "https://github.com/" + SOCIAL.github + "/spaCy/blob/master/spacy/tokens/doc.pyx")
|
|
|
|
|
| #[+tag class] Doc
|
2016-03-31 14:24:48 +00:00
|
|
|
|
|
|
|
|
|
p
|
2016-10-03 18:19:13 +00:00
|
|
|
|
| A sequence of #[code Token] objects. Access sentences and named entities,
|
|
|
|
|
| export annotations to numpy arrays, losslessly serialize to compressed
|
2016-03-31 14:24:48 +00:00
|
|
|
|
| binary strings.
|
|
|
|
|
|
|
|
|
|
+aside.
|
2016-10-03 18:19:13 +00:00
|
|
|
|
Internally, the #[code Doc] object holds an array of #[code TokenC] structs.
|
|
|
|
|
The Python-level #[code Token] and #[code Span] objects are views of this
|
2016-03-31 14:24:48 +00:00
|
|
|
|
array, i.e. they don't own the data themselves.
|
|
|
|
|
|
2016-10-03 18:19:13 +00:00
|
|
|
|
+code("python", "Overview").
|
2016-03-31 14:24:48 +00:00
|
|
|
|
class Doc:
|
|
|
|
|
def __init__(self, vocab, orths_and_spaces=None):
|
|
|
|
|
return self
|
|
|
|
|
|
|
|
|
|
def __getitem__(self, int i):
|
|
|
|
|
return Token()
|
|
|
|
|
def __getitem__(self, slice i_j):
|
|
|
|
|
return Span()
|
|
|
|
|
def __iter__(self):
|
|
|
|
|
yield Token()
|
|
|
|
|
def __len__(self):
|
|
|
|
|
return int
|
2016-10-03 18:19:13 +00:00
|
|
|
|
|
2016-03-31 14:24:48 +00:00
|
|
|
|
def __unicode__(self):
|
|
|
|
|
return unicode
|
|
|
|
|
def __bytes__(self):
|
|
|
|
|
return utf8
|
|
|
|
|
def __repr__(self):
|
|
|
|
|
return unicode
|
|
|
|
|
|
|
|
|
|
@property
|
|
|
|
|
def text(self):
|
|
|
|
|
return unicode
|
|
|
|
|
@property
|
|
|
|
|
def text_with_ws(self):
|
|
|
|
|
return unicode
|
|
|
|
|
|
|
|
|
|
@property
|
|
|
|
|
def vector(self):
|
|
|
|
|
return numpy.ndarray(dtype='float32')
|
|
|
|
|
@property
|
|
|
|
|
def vector_norm(self):
|
|
|
|
|
return float
|
|
|
|
|
@property
|
|
|
|
|
def ents(self):
|
|
|
|
|
yield Span()
|
|
|
|
|
@property
|
|
|
|
|
def noun_chunks(self):
|
|
|
|
|
yield Span()
|
|
|
|
|
@property
|
|
|
|
|
def sents(self):
|
|
|
|
|
yield Span()
|
|
|
|
|
|
|
|
|
|
def similarity(self, other):
|
|
|
|
|
return float
|
|
|
|
|
|
|
|
|
|
def merge(self, start_char, end_char, tag, lemma, ent_type):
|
|
|
|
|
return None
|
2016-10-03 18:19:13 +00:00
|
|
|
|
|
2016-03-31 14:24:48 +00:00
|
|
|
|
def to_array(self, attr_ids):
|
|
|
|
|
return numpy.ndarray(shape=(len(self), len(attr_ids)), dtype='int64')
|
|
|
|
|
|
|
|
|
|
def count_by(self, attr_id, exclude=None, counts=None):
|
|
|
|
|
return dict
|
|
|
|
|
|
|
|
|
|
def to_bytes(self):
|
|
|
|
|
return bytes
|
|
|
|
|
|
|
|
|
|
def from_array(self, attrs, array):
|
|
|
|
|
return None
|
2016-10-03 18:19:13 +00:00
|
|
|
|
|
2016-03-31 14:24:48 +00:00
|
|
|
|
def from_bytes(self, data):
|
|
|
|
|
return self
|
2016-10-03 18:19:13 +00:00
|
|
|
|
|
2016-03-31 14:24:48 +00:00
|
|
|
|
@staticmethod
|
|
|
|
|
def read_bytes(file_):
|
|
|
|
|
yield bytes
|
2016-10-03 18:19:13 +00:00
|
|
|
|
|
|
|
|
|
+section("doc-init")
|
|
|
|
|
+h(3, "doc-init")
|
|
|
|
|
| #[+tag method] Doc.__init__
|
2016-03-31 14:24:48 +00:00
|
|
|
|
|
|
|
|
|
.has-aside
|
2016-10-03 18:19:13 +00:00
|
|
|
|
+code("python", "Definition").
|
2016-03-31 14:24:48 +00:00
|
|
|
|
def __init__(self, vocab, orths_and_spaces=None):
|
|
|
|
|
return Doc
|
|
|
|
|
|
2016-10-03 18:19:13 +00:00
|
|
|
|
+aside("Implementation").
|
|
|
|
|
This method of constructing a #[code Doc] object is usually only used
|
|
|
|
|
for deserialization. Standard usage is to construct the document via
|
2016-03-31 14:24:48 +00:00
|
|
|
|
a call to the language object.
|
|
|
|
|
|
2016-10-03 18:19:13 +00:00
|
|
|
|
+table(["Name", "Description"])
|
2016-03-31 14:24:48 +00:00
|
|
|
|
+row
|
|
|
|
|
+cell vocab
|
|
|
|
|
+cell.
|
2016-10-03 18:19:13 +00:00
|
|
|
|
A Vocabulary object, which must match any models you want to
|
2016-03-31 14:24:48 +00:00
|
|
|
|
use (e.g. tokenizer, parser, entity recognizer).
|
|
|
|
|
|
|
|
|
|
+row
|
2016-10-03 18:19:13 +00:00
|
|
|
|
+cell orths_and_spaces
|
2016-03-31 14:24:48 +00:00
|
|
|
|
+cell.
|
2016-10-03 18:19:13 +00:00
|
|
|
|
A list of tokens in the document as a sequence of
|
|
|
|
|
#[code (orth_id, has_space)] tuples, where #[code orth_id]
|
2016-03-31 14:24:48 +00:00
|
|
|
|
is an integer and #[code has_space] is a boolean, indicating
|
|
|
|
|
whether the token has a trailing space.
|
|
|
|
|
|
2016-10-03 18:19:13 +00:00
|
|
|
|
+section("doc-sequenceapi")
|
|
|
|
|
+h(3, "doc-sequenceapi")
|
|
|
|
|
| #[+tag Section] Sequence API
|
2016-03-31 14:24:48 +00:00
|
|
|
|
|
2016-10-03 18:19:13 +00:00
|
|
|
|
+table(["Example", "Description"])
|
|
|
|
|
+row
|
|
|
|
|
+cell #[code doc[i]]
|
2016-03-31 14:24:48 +00:00
|
|
|
|
+cell.
|
2016-10-03 18:19:13 +00:00
|
|
|
|
Get the #[code Token] object at position #[code i], where #[code i] is an integer.
|
|
|
|
|
Negative indexing is supported, and follows the usual Python
|
2016-03-31 14:24:48 +00:00
|
|
|
|
semantics, i.e. #[code doc[-2]] is #[code doc[len(doc) - 2]].
|
|
|
|
|
|
|
|
|
|
+row
|
2016-10-03 18:19:13 +00:00
|
|
|
|
+cell #[code doc[start : end]]
|
2016-03-31 14:24:48 +00:00
|
|
|
|
+cell.
|
|
|
|
|
Get a #[code Span] object, starting at position #[code start]
|
|
|
|
|
and ending at position #[code end], where #[code start] and
|
|
|
|
|
#[code end] are token indices. For instance,
|
2016-10-03 18:19:13 +00:00
|
|
|
|
#[code doc[2:5]] produces a span consisting of
|
|
|
|
|
tokens 2, 3 and 4. Stepped slices (e.g. #[code doc[start : end : step]])
|
|
|
|
|
are not supported, as #[code Span] objects must be contiguous
|
2016-03-31 14:24:48 +00:00
|
|
|
|
(cannot have gaps). You can use negative indices and open-ended
|
|
|
|
|
ranges, which have their normal Python semantics.
|
|
|
|
|
|
|
|
|
|
+row
|
2016-10-03 18:19:13 +00:00
|
|
|
|
+cell #[code for token in doc]
|
2016-03-31 14:24:48 +00:00
|
|
|
|
+cell.
|
2016-10-03 18:19:13 +00:00
|
|
|
|
Iterate over Token objects, from which the annotations can
|
|
|
|
|
be easily accessed. This is the main way of accessing Token
|
|
|
|
|
objects, which are the main way annotations are accessed from
|
|
|
|
|
Python. If faster-than-Python speeds are required, you can
|
|
|
|
|
instead access the annotations as a numpy array, or access the
|
2016-03-31 14:24:48 +00:00
|
|
|
|
underlying C data directly from Cython.
|
|
|
|
|
|
|
|
|
|
+row
|
2016-10-03 18:19:13 +00:00
|
|
|
|
+cell #[code len(doc)]
|
2016-03-31 14:24:48 +00:00
|
|
|
|
+cell.
|
|
|
|
|
The number of tokens in the document.
|
|
|
|
|
|
2016-10-03 18:19:13 +00:00
|
|
|
|
+section("doc-spans")
|
|
|
|
|
+h(3, "doc-spans-sents")
|
|
|
|
|
| #[+tag property] Doc.sents
|
2016-03-31 14:24:48 +00:00
|
|
|
|
|
|
|
|
|
p.
|
|
|
|
|
Yields sentence #[code Span] objects. Sentence spans have no label.
|
|
|
|
|
To improve accuracy on informal texts, spaCy calculates sentence
|
|
|
|
|
boundaries from the syntactic dependency parse. If the parser is disabled,
|
|
|
|
|
the #[code sents] iterator will be unavailable.
|
|
|
|
|
|
2016-10-03 18:19:13 +00:00
|
|
|
|
+code("python", "Example").
|
2016-03-31 14:24:48 +00:00
|
|
|
|
from spacy.en import English
|
|
|
|
|
nlp = English()
|
|
|
|
|
doc = nlp("This is a sentence. Here's another...")
|
|
|
|
|
assert [s.root.orth_ for s in doc.sents] == ["is", "'s"]
|
|
|
|
|
|
2016-10-03 18:19:13 +00:00
|
|
|
|
+h(3, "doc-spans-ents")
|
|
|
|
|
| #[+tag property] Doc.ents
|
2016-03-31 14:24:48 +00:00
|
|
|
|
|
|
|
|
|
p.
|
|
|
|
|
Yields named-entity #[code Span] objects, if the entity recognizer
|
2016-10-03 18:19:13 +00:00
|
|
|
|
has been applied to the document. Iterate over the span to get
|
2016-03-31 14:24:48 +00:00
|
|
|
|
individual Token objects, or access the label:
|
|
|
|
|
|
2016-10-03 18:19:13 +00:00
|
|
|
|
+code("python", "Example").
|
2016-03-31 14:24:48 +00:00
|
|
|
|
from spacy.en import English
|
|
|
|
|
nlp = English()
|
|
|
|
|
tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
|
|
|
|
|
ents = list(tokens.ents)
|
|
|
|
|
assert ents[0].label == 346
|
|
|
|
|
assert ents[0].label_ == 'PERSON'
|
|
|
|
|
assert ents[0].orth_ == 'Best'
|
2016-10-03 18:19:13 +00:00
|
|
|
|
assert ents[0].text == 'Mr. Best'
|
2016-03-31 14:24:48 +00:00
|
|
|
|
|
2016-10-03 18:19:13 +00:00
|
|
|
|
+h(3, "doc-spans-nounchunks")
|
|
|
|
|
| #[+tag property] Doc.noun_chunks
|
2016-03-31 14:24:48 +00:00
|
|
|
|
|
|
|
|
|
p.
|
|
|
|
|
Yields base noun-phrase #[code Span] objects, if the document
|
2016-10-03 18:19:13 +00:00
|
|
|
|
has been syntactically parsed. A base noun phrase, or
|
|
|
|
|
'NP chunk', is a noun phrase that does not permit other NPs to
|
|
|
|
|
be nested within it – so no NP-level coordination, no prepositional
|
2016-03-31 14:24:48 +00:00
|
|
|
|
phrases, and no relative clauses. For example:
|
|
|
|
|
|
2016-10-03 18:19:13 +00:00
|
|
|
|
+code("python", "Example").
|
2016-03-31 14:24:48 +00:00
|
|
|
|
from spacy.en import English
|
|
|
|
|
nlp = English()
|
|
|
|
|
doc = nlp(u'The sentence in this example has three noun chunks.')
|
|
|
|
|
for chunk in doc.noun_chunks:
|
|
|
|
|
print(chunk.label_, chunk.orth_, '<--', chunk.root.head.orth_)
|
|
|
|
|
|
2016-10-03 18:19:13 +00:00
|
|
|
|
+section("doc-exportimport-toarray")
|
|
|
|
|
+h(3, "doc-exportimport-toarray")
|
|
|
|
|
| #[+tag method] Doc.to_array
|
|
|
|
|
|
2016-03-31 14:24:48 +00:00
|
|
|
|
p.
|
2016-10-03 18:19:13 +00:00
|
|
|
|
Given a list of M attribute IDs, export the tokens to a numpy
|
|
|
|
|
#[code ndarray] of shape #[code N*M], where #[code N] is the length
|
2016-03-31 14:24:48 +00:00
|
|
|
|
of the document. The values will be 32-bit integers.
|
|
|
|
|
|
2016-10-03 18:19:13 +00:00
|
|
|
|
+code("python", "Example").
|
2016-03-31 14:24:48 +00:00
|
|
|
|
from spacy import attrs
|
|
|
|
|
doc = nlp(text)
|
|
|
|
|
# All strings mapped to integers, for easy export to numpy
|
|
|
|
|
np_array = doc.to_array([attrs.LOWER, attrs.POS, attrs.ENT_TYPE, attrs.IS_ALPHA])
|
2016-10-03 18:19:13 +00:00
|
|
|
|
|
|
|
|
|
+code("python", "Definition").
|
2016-03-31 14:24:48 +00:00
|
|
|
|
def to_array(self, attr_ids):
|
|
|
|
|
return numpy.ndarray(shape=(len(self), len(attr_ids)), dtype='int64')
|
2016-10-03 18:19:13 +00:00
|
|
|
|
|
|
|
|
|
+table(["Name", "Type", "Description"])
|
2016-03-31 14:24:48 +00:00
|
|
|
|
+row
|
|
|
|
|
+cell attr_ids
|
|
|
|
|
+cell list of ints
|
|
|
|
|
+cell.
|
2016-10-03 18:19:13 +00:00
|
|
|
|
A list of attribute ID ints. Attribute IDs can be imported
|
2016-03-31 14:24:48 +00:00
|
|
|
|
from #[code spacy.attrs] or #[code spacy.symbols].
|
|
|
|
|
|
2016-10-03 18:19:13 +00:00
|
|
|
|
+section("doc-exportimport-countby")
|
|
|
|
|
+h(4, "doc-exportimport-countby")
|
|
|
|
|
| #[+tag method] Doc.count_by
|
2016-03-31 14:24:48 +00:00
|
|
|
|
|
|
|
|
|
p.
|
2016-10-03 18:19:13 +00:00
|
|
|
|
Produce a dict of #[code {attribute (int): count (int)}] frequencies,
|
2016-03-31 14:24:48 +00:00
|
|
|
|
keyed by the values of the given attribute ID.
|
|
|
|
|
|
2016-10-03 18:19:13 +00:00
|
|
|
|
+code("python", "Definition").
|
2016-03-31 14:24:48 +00:00
|
|
|
|
def count_by(self, attr_id):
|
|
|
|
|
return dict
|
|
|
|
|
|
2016-10-03 18:19:13 +00:00
|
|
|
|
+table(["Name", "Type", "Description"])
|
2016-03-31 14:24:48 +00:00
|
|
|
|
+row
|
|
|
|
|
+cell attr_id
|
|
|
|
|
+cell int
|
|
|
|
|
+cell.
|
|
|
|
|
The attribute ID to key the counts.
|
|
|
|
|
|
2016-10-03 18:19:13 +00:00
|
|
|
|
+section("doc-exportimport-fromarray")
|
|
|
|
|
+h(4, "doc-exportimport-fromarray")
|
|
|
|
|
| #[+tag method] Doc.from_array
|
2016-03-31 14:24:48 +00:00
|
|
|
|
|
2016-10-03 18:19:13 +00:00
|
|
|
|
p Write to a #[code Doc] object, from an M*N array of attributes.
|
2016-03-31 14:24:48 +00:00
|
|
|
|
|
2016-10-03 18:19:13 +00:00
|
|
|
|
+code("python", "Definition").
|
2016-03-31 14:24:48 +00:00
|
|
|
|
def from_array(self, attrs, array):
|
|
|
|
|
return None
|
|
|
|
|
|
2016-10-03 18:19:13 +00:00
|
|
|
|
+section("doc-exportimport-frombytes")
|
|
|
|
|
+h(4, "doc-exportimport-frombytes") #[+tag method] Doc.from_bytes
|
|
|
|
|
|
|
|
|
|
p Deserialize, loading from bytes.
|
|
|
|
|
|
|
|
|
|
+code("python", "Definition").
|
2016-03-31 14:24:48 +00:00
|
|
|
|
def from_bytes(self, byte_string):
|
|
|
|
|
return Doc
|
|
|
|
|
|
2016-10-03 18:19:13 +00:00
|
|
|
|
+section("doc-exportimport-tobytes")
|
|
|
|
|
+h(4, "doc-exportimport-tobytes")
|
|
|
|
|
| #[+tag method] Doc.to_bytes
|
2016-03-31 14:24:48 +00:00
|
|
|
|
|
2016-10-03 18:19:13 +00:00
|
|
|
|
p Serialize, producing a byte string.
|
|
|
|
|
|
|
|
|
|
+code("python", "Definition").
|
2016-03-31 14:24:48 +00:00
|
|
|
|
def to_bytes(self):
|
|
|
|
|
return bytes
|
|
|
|
|
|
2016-10-03 18:19:13 +00:00
|
|
|
|
+section("doc-exportimport-readbytes")
|
|
|
|
|
+h(4, "doc-exportimport-readbytes")
|
|
|
|
|
| #[+tag method] Doc.read_bytes
|
2016-03-31 14:24:48 +00:00
|
|
|
|
|
|
|
|
|
p.
|
2016-10-03 18:19:13 +00:00
|
|
|
|
A static method, used to read serialized #[code Doc] objects from
|
2016-03-31 14:24:48 +00:00
|
|
|
|
a file. For example:
|
|
|
|
|
|
2016-10-03 18:19:13 +00:00
|
|
|
|
+code("python", "Example").
|
2016-03-31 14:24:48 +00:00
|
|
|
|
from spacy.tokens.doc import Doc
|
|
|
|
|
loc = 'test_serialize.bin'
|
|
|
|
|
with open(loc, 'wb') as file_:
|
|
|
|
|
file_.write(nlp(u'This is a document.').to_bytes())
|
|
|
|
|
file_.write(nlp(u'This is another.').to_bytes())
|
|
|
|
|
docs = []
|
|
|
|
|
with open(loc, 'rb') as file_:
|
|
|
|
|
for byte_string in Doc.read_bytes(file_):
|
|
|
|
|
docs.append(Doc(nlp.vocab).from_bytes(byte_string))
|
|
|
|
|
assert len(docs) == 2
|
|
|
|
|
|
2016-10-03 18:19:13 +00:00
|
|
|
|
+code("python", "Definition").
|
2016-03-31 14:24:48 +00:00
|
|
|
|
@staticmethod
|
|
|
|
|
def read_bytes(file_):
|
|
|
|
|
yield bytes
|