mixin example(name)
    details
        summary
            h4= name
        block


+example("Load resources and process text")
    pre.language-python: code
        | from __future__ import unicode_literals, print_function
        | from spacy.en import English
        | nlp = English()
        | doc = nlp('Hello, world. Here are two sentences.')

+example("Get tokens and sentences")
    pre.language-python: code
        | token = doc[0]
        | sentence = next(doc.sents)
        | assert token is sentence[0]

+example("Use integer IDs for any string")
    pre.language-python: code
        | hello_id = nlp.vocab.strings['Hello']
        | hello_str = nlp.vocab.strings[hello_id]
        |
        | # NB: the exact integer is specific to the loaded vocabulary.
        | assert token.orth == hello_id == 52
        | assert token.orth_ == hello_str == 'Hello'

+example("Get and set string views and flags")
    pre.language-python: code
        | assert token.shape_ == 'Xxxxx'
        | for lexeme in nlp.vocab:
        |     if lexeme.is_alpha:
        |         lexeme.shape_ = 'W'
        |     elif lexeme.is_digit:
        |         lexeme.shape_ = 'D'
        |     elif lexeme.is_punct:
        |         lexeme.shape_ = 'P'
        |     else:
        |         lexeme.shape_ = 'M'
        | assert token.shape_ == 'W'

+example("Export to numpy arrays")
    pre.language-python: code
        | from spacy.en.attrs import ORTH, LIKE_URL, IS_OOV
        |
        | attr_ids = [ORTH, LIKE_URL, IS_OOV]
        | doc_array = doc.to_array(attr_ids)
        | assert doc_array.shape == (len(doc), len(attr_ids))
        | assert doc[0].orth == doc_array[0, 0]
        | assert doc[1].orth == doc_array[1, 0]
        | assert doc[0].like_url == doc_array[0, 1]
        | assert list(doc_array[:, 1]) == [t.like_url for t in doc]

+example("Word vectors")
    pre.language-python: code
        | doc = nlp("Apples and oranges are similar. Boots and hippos aren't.")
        |
        | apples = doc[0]
        | oranges = doc[2]
        | boots = doc[6]
        | hippos = doc[8]
        |
        | assert apples.similarity(oranges) > boots.similarity(hippos)

+example("Part-of-speech tags")
    pre.language-python: code
        | from spacy.parts_of_speech import ADV
        |
        | def is_adverb(token):
        |     return token.pos == ADV
        |
        | # These are data-specific, so no constants are provided. You have to look
        | # up the IDs from the StringStore.
        | NNS = nlp.vocab.strings['NNS']
        | NNPS = nlp.vocab.strings['NNPS']
        |
        | def is_plural_noun(token):
        |     return token.tag == NNS or token.tag == NNPS
        |
        | def print_coarse_pos(token):
        |     print(token.pos_)
        |
        | def print_fine_pos(token):
        |     print(token.tag_)

+example("Syntactic dependencies")
    pre.language-python: code
        | def dependency_labels_to_root(token):
        |     '''Walk up the syntactic tree, collecting the arc labels.'''
        |     dep_labels = []
        |     while token.head is not token:
        |         dep_labels.append(token.dep)
        |         token = token.head
        |     return dep_labels

+example("Named entities")
    pre.language-python: code
        | from collections import defaultdict
        | from spacy.parts_of_speech import VERB
        |
        | def iter_products(docs):
        |     for doc in docs:
        |         for ent in doc.ents:
        |             if ent.label_ == 'PRODUCT':
        |                 yield ent
        |
        | def word_is_in_entity(word):
        |     return word.ent_type != 0
        |
        | def count_parent_verb_by_person(docs):
        |     counts = defaultdict(lambda: defaultdict(int))
        |     for doc in docs:
        |         for ent in doc.ents:
        |             if ent.label_ == 'PERSON' and ent.root.head.pos == VERB:
        |                 counts[ent.orth_][ent.root.head.lemma_] += 1
        |     return counts

//+example("Define custom NER rules")
//    pre.language-python: code
//        | nlp.matcher

+example("Calculate inline mark-up on original string")
    pre.language-python: code
        | def put_spans_around_tokens(doc, get_classes):
        |     '''Given some function to compute class names, put each token in a
        |     span element, with the appropriate classes computed.
        |
        |     All whitespace is preserved, outside of the spans. (Yes, I know HTML
        |     won't display it. But the point is no information is lost, so you
        |     can calculate what you need, e.g. <br /> tags, <p> tags, etc.)
        |     '''
        |     output = []
        |     template = '<span class="{classes}">{word}</span>{space}'
        |     for token in doc:
        |         if token.is_space:
        |             output.append(token.orth_)
        |         else:
        |             output.append(
        |                 template.format(
        |                     classes=' '.join(get_classes(token)),
        |                     word=token.orth_,
        |                     space=token.whitespace_))
        |     string = ''.join(output)
        |     string = string.replace('\n', '<br />')
        |     string = string.replace('\t', '    ')
        |     return string
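p
    | The
    code get_classes
    | hook above can be any callable that maps a token to a list of class
    | names. As a sketch (the
    code pos_classes
    | helper is illustrative, not part of spaCy), one option is to derive a
    | CSS class from each token's coarse part-of-speech tag:

+example("Derive mark-up classes from part-of-speech tags")
    pre.language-python: code
        | def pos_classes(token):
        |     '''Illustrative get_classes hook: one CSS class per token,
        |     derived from its coarse-grained part-of-speech tag.'''
        |     return ['pos-' + token.pos_.lower()]
        |
        | markup = put_spans_around_tokens(doc, pos_classes)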
+example("Efficient binary serialization")
    pre.language-python: code
        | import spacy.en
        | from spacy.tokens.doc import Doc
        |
        | byte_string = doc.to_bytes()
        | open('/tmp/moby_dick.bin', 'wb').write(byte_string)
        |
        | nlp = spacy.en.English()
        | for byte_string in Doc.read_bytes(open('/tmp/moby_dick.bin', 'rb')):
        |     doc = Doc(nlp.vocab)
        |     doc.from_bytes(byte_string)
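p
    | A quick round-trip check, sketched on the assumption that the
    code nlp
    | and
    code doc
    | objects from the examples above are still in scope:

+example("Verify the serialization round-trip")
    pre.language-python: code
        | restored = Doc(nlp.vocab)
        | restored.from_bytes(doc.to_bytes())
        |
        | # The restored Doc exposes the same token IDs as the original.
        | assert [t.orth for t in restored] == [t.orth for t in doc]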
p
    | See the
    a(href="docs.html") docs page
    | for
    a(href="docs.html#api") API documentation
    | ,
    a(href="docs.html#tutorials") tutorials
    | , and
    a(href="docs.html#spec") annotation specs
    | .