mirror of https://github.com/explosion/spaCy.git
118 lines
4.3 KiB
Plaintext
118 lines
4.3 KiB
Plaintext
include ../../_includes/_mixins
|
|
|
|
p.u-text-large spaCy assumes by default that your data is raw text. However, sometimes your data is partially annotated, e.g. with pre-existing tokenization, part-of-speech tags, etc. This tutorial explains how to use these annotations in spaCy.
|
|
|
|
+h(2, "quick-reference") Quick Reference
|
|
|
|
+table(['Description', 'Usage'], 'code')
|
|
+row
|
|
+cell Use pre-existing tokenization
|
|
+cell #[code.lang-python doc = Doc(nlp.vocab, [('A', True), ('token', False), ('!', False)])]
|
|
+row
|
|
+cell Use pre-existing tokenization (deprecated)
|
|
+cell #[code.lang-python doc = nlp.tokenizer.tokens_from_list([u'A', u'token', u'!'])]
|
|
|
|
+row
|
|
+cell Assign pre-existing tags
|
|
+cell #[code.lang-python nlp.tagger.tag_from_strings(doc, ['DT', 'NN'])]
|
|
+row
|
|
+cell Assign named entity annotations from an array
|
|
+cell #[code.lang-python doc.from_array([ENT_TYPE, ENT_IOB], values)]
|
|
+row
|
|
+cell Assign dependency parse annotations from an array
|
|
+cell #[code.lang-python doc.from_array([HEAD, DEP], values)]
|
|
|
|
+h(2, "examples") Examples
|
|
|
|
+code('python', 'Tokenization').
|
|
import spacy
|
|
|
|
nlp = spacy.load('en')
|
|
|
|
tokens = [u'A', u'list', u'of', u'strings', u'.']
|
|
|
|
doc = nlp.tokenizer.tokens_from_list(tokens)
|
|
|
|
assert len(doc) == len(tokens)
|
|
# With this method, we don't get to specify how the corresponding string
|
|
# would be spaced, so we have to assume a space before every token.
|
|
assert doc.text == u'A list of strings .'
|
|
|
|
+code('python', 'Tokenization').
|
|
import spacy
|
|
from spacy.tokens import Doc
|
|
|
|
nlp = spacy.load('en')
|
|
|
|
tokens = [u'A', u'list', u'of', u'strings', u'.']
|
|
has_space = [True, True, True, False, False]
|
|
|
|
doc = Doc(nlp.vocab, orth_and_spaces=zip(tokens, has_space))
|
|
|
|
assert len(doc) == len(tokens)
|
|
# Spacing is correct, given by boolean values above.
|
|
assert doc.text == u'A list of strings.'
|
|
# Here's how it would look with different boolean values.
|
|
tokens = [u'A', u'list', u'of', u'strings', u'.']
|
|
has_space = [False, True, True, True, False]
|
|
doc = Doc(nlp.vocab, orth_and_spaces=zip(tokens, has_space))
|
|
assert doc.text == u'Alist of strings .'
|
|
|
|
+code('python', 'POS Tags').
|
|
import spacy
|
|
|
|
nlp = spacy.load('en')
|
|
|
|
# Tokenize a string into a Doc, but don't apply the whole pipeline ---
|
|
# that is, don't predict the part-of-speech tags, syntactic parse, named
|
|
# entities, etc.
|
|
doc = nlp.tokenizer(u'A unicode string, untokenized.')
|
|
nlp.tagger.tag_from_strings(doc, [u'DT', u'JJ', u'NN', u',', u'VBN', u'.'])
|
|
# Now predict dependency parse and named entities. Note that if you assign
|
|
# tags in a way that's very unlike the behaviour of the POS tagger model,
|
|
# the subsequent models may perform worse. These models use the POS tags
|
|
# as features, so if you give them unexpected tags, you may be giving them
|
|
# run-time conditions that don't resemble the training data.
|
|
nlp.parser(doc)
|
|
nlp.entity(doc)
|
|
|
|
+code('python', 'Dependency Parse').
|
|
import spacy
|
|
from spacy.attrs import HEAD, DEP
|
|
from spacy.symbols import det, nmod, root, punct
|
|
from numpy import ndarray
|
|
|
|
nlp = spacy.load('en')
|
|
|
|
# Get the Doc object, and apply the pipeline except the dependency parser
|
|
doc = nlp(u'A unicode string.', parse=False)
|
|
|
|
columns = [HEAD, DEP]
|
|
values = ndarray(shape=(len(doc), len(columns)), dtype='int32')
|
|
# Syntactic parse specified as head offsets
|
|
heads = [2, 1, 0, -1]
|
|
# Integer IDs for the dependency labels. See the parse in the displaCy
|
|
# demo at spacy.io/demos/displacy
|
|
labels = [det, nmod, root, punct]
|
|
values[:, 0] = heads
|
|
values[:, 1] = labels
|
|
doc.from_array(columns, values)
|
|
|
|
+code('python', 'Named Entities').
|
|
import spacy
|
|
from spacy.attrs import ENT_TYPE, ENT_IOB
|
|
from spacy.symbols import PERSON, ORG
|
|
from numpy import ndarray
|
|
|
|
nlp = spacy.load('en')
|
|
|
|
# Get the Doc object, and apply the pipeline except the entity recognizer
|
|
doc = nlp(u'My name is Matt.', entity=False)
|
|
|
|
columns = [ENT_TYPE, ENT_IOB]
|
|
values = ndarray(shape=(len(doc), len(columns)), dtype='int32')
|
|
# IOB values are 0=missing, 1=I, 2=O, 3=B
|
|
values[:, 1] = [2, 2, 2, 3, 2]
|
|
values[:, 0] = [0, 0, 0, PERSON, 0]
|
|
doc.from_array(columns, values)
|