spaCy/docs/redesign/usage_examples.jade

mixin example(name)
  details
    summary
      h4= name
    block


+example("Load resources and process text")
  pre.language-python: code
    | from __future__ import unicode_literals, print_function
    | from spacy.en import English
    | nlp = English()
    | doc = nlp('Hello, world. Here are two sentences.')

+example("Get tokens and sentences")
  pre.language-python: code
    | token = doc[0]
    | sentence = doc.sents[0]
    | assert token[0] is sentence[0]

+example("Use integer IDs for any string")
  pre.language-python: code
    | hello_id = nlp.vocab.strings['Hello']
    | hello_str = nlp.vocab.strings[hello_id]
    |
    | assert token.orth  == hello_id  == 52
    | assert token.orth_ == hello_str == 'Hello'

+example("Get and set string views and flags")
  pre.language-python: code
    | assert token.shape_ == 'Xxxx'
    | for lexeme in nlp.vocab:
    |     if lexeme.is_alpha:
    |         lexeme.shape_ = 'W'
    |     elif lexeme.is_digit:
    |         lexeme.shape_ = 'D'
    |     elif lexeme.is_punct:
    |         lexeme.shape_ = 'P'
    |     else:
    |         lexeme.shape_ = 'M'
    | assert token.shape_ == 'W'

+example("Export to numpy arrays")
  pre.language-python: code
    | from spacy.en.attrs import ORTH, LIKE_URL, IS_OOV
    |
    | attr_ids = [ORTH, LIKE_URL, IS_OOV]
    | doc_array = doc.to_array(attr_ids)
    | assert doc_array.shape == (len(doc), len(attrs)
    | assert doc[0].orth == doc_array[0, 0]
    | assert doc[1].orth == doc_array[1, 0]
    | assert doc[0].like_url == doc_array[0, 1]
    | assert doc_array[, 1] == [t.like_url for t in doc]

+example("Word vectors")
  pre.language-python: code
    | doc = nlp("Apples and oranges are similar. Boots and hippos aren't.")
    |
    | apples = doc[0]
    | oranges = doc[1]
    | boots = doc[6]
    | hippos = doc[8]
    |
    | assert apples.similarity(oranges) > boots.similarity(hippos)


+example("Part-of-speech tags")
  pre.language-python: code
    | from spacy.parts_of_speech import ADV
    |
    | def is_adverb(token):
    |     return token.pos == spacy.parts_of_speech.ADV
    |
    | # These are data-specific, so no constants are provided. You have to look
    | # up the IDs from the StringStore.
    | NNS = nlp.vocab.strings['NNS']
    | NNPS = nlp.vocab.strings['NNPS']
    | def is_plural_noun(token):
    |     return token.tag == NNS or token.tag == NNPS
    |
    | def print_coarse_pos(token):
    |     print(token.pos_)
    |
    | def print_fine_pos(token):
    |     print(token.tag_)

+example("Syntactic dependencies")
  pre.language-python: code
    | def dependency_labels_to_root(token):
    |     '''Walk up the syntactic tree, collecting the arc labels.'''
    |     dep_labels = []
    |     while token.root is not token:
    |         dep_labels.append(token.dep)
    |         token = token.head
    |     return dep_labels

+example("Named entities")
  pre.language-python: code
    | def iter_products(docs):
    |     for doc in docs:
    |         for ent in doc.ents:
    |             if ent.label_ == 'PRODUCT':
    |                 yield ent
    |
    | def word_is_in_entity(word):
    |     return word.ent_type != 0
    |
    | def count_parent_verb_by_person(docs):
    |     counts = defaultdict(defaultdict(int))
    |     for doc in docs:
    |         for ent in doc.ents:
    |             if ent.label_ == 'PERSON' and ent.root.head.pos == VERB:
    |                 counts[ent.orth_][ent.root.head.lemma_] += 1
    |     return counts

  //+example("Define custom NER rules")
  //  pre.language-python: code
  //    | nlp.matcher


+example("Calculate inline mark-up on original string")
  pre.language-python: code
    | def put_spans_around_tokens(doc, get_classes):
    |     '''Given some function to compute class names, put each token in a
    |     span element, with the appropriate classes computed.
    |
    |     All whitespace is preserved, outside of the spans. (Yes, I know HTML
    |     won't display it. But the point is no information is lost, so you can
    |     calculate what you need, e.g. <br /> tags, <p> tags, etc.)
    |     '''
    |     output = []
    |     template = '&lt;span classes="{classes}"&gt;{word}&lt;/span&gt;{space}'
    |     for token in doc:
    |         if token.is_space:
    |             output.append(token.orth_)
    |         else:
    |             output.append(
    |               template.format(
    |                 classes=' '.join(get_classes(token)),
    |                 word=token.orth_,
    |                 space=token.whitespace_))
    |     string = ''.join(output)
    |     string = string.replace('\n', '<br />')
    |     string = string.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;'
    |     return string


+example("Efficient binary serialization")
  pre.language-python: code
    |
    | byte_string = doc.as_bytes()
    | open('/tmp/moby_dick.bin', 'wb').write(byte_string)
    |
    | nlp = spacy.en.English()
    | for byte_string in Doc.read(open('/tmp/moby_dick.bin', 'rb')):
    |    doc = Doc(nlp.vocab)
    |    doc.from_bytes(byte_string)


p
  | See the
  a(href="docs.html") docs page
  | for
  a(href="docs.html#api") API documentation,
  a(href="docs.html#tutorials") tutorials,
  | and
  a(href="docs.html#spec") annotation specs.