From ee521a52a84be00bab5f83f045bcf1e92416ac94 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Thu, 24 Sep 2015 16:57:11 +0200 Subject: [PATCH 1/4] proposal for doctests --- .travis.yml | 2 +- website/Makefile | 7 ++- website/run_jade | 59 +++++++++++++++++++ website/src/jade/home/_usage.jade | 19 +----- .../tests/test_get_tokens_and_sentences.txt | 8 +++ .../test_load_resources_and_process_text.txt | 5 ++ .../test_use_interger_ids_for_any_strings.txt | 10 ++++ 7 files changed, 91 insertions(+), 19 deletions(-) create mode 100755 website/run_jade create mode 100644 website/tests/test_get_tokens_and_sentences.txt create mode 100644 website/tests/test_load_resources_and_process_text.txt create mode 100644 website/tests/test_use_interger_ids_for_any_strings.txt diff --git a/.travis.yml b/.travis.yml index f21301db1..1ea1f8375 100644 --- a/.travis.yml +++ b/.travis.yml @@ -24,4 +24,4 @@ install: # run tests script: - - "py.test tests/ -x" + - "py.test tests/ website/tests/ -x" diff --git a/website/Makefile b/website/Makefile index e2002c97f..87c9077ec 100644 --- a/website/Makefile +++ b/website/Makefile @@ -1,9 +1,12 @@ -all: site +all: dir site + +dir: + mkdir -p site site: site/index.html site/blog/ site/docs/ site/license/ site/blog/introducing-spacy/ site/blog/parsing-english-in-python/ site/blog/part-of-speech-POS-tagger-in-python/ site/tutorials/twitter-filter/ site/tutorials/syntax-search/ site/tutorials/mark-adverbs/ site/blog/writing-c-in-cython/ site/blog/how-spacy-works/ site/index.html: src/jade/header.jade src/jade/*.jade - jade -P src/jade/home/index.jade --out site/ + ./run_jade src/jade/home/index.jade $@ site/docs/: src/jade/docs/*.jade src/jade/header.jade jade -P src/jade/docs/index.jade --out $@ diff --git a/website/run_jade b/website/run_jade new file mode 100755 index 000000000..83ac87ef7 --- /dev/null +++ b/website/run_jade @@ -0,0 +1,59 @@ +#!/usr/bin/env node +'use strict'; + +var fs = require('fs'); +var jade = require('jade'); + +// returns all: code and return value (default) +jade.filters.doctest_all = function (html, _, use_rv) { + use_rv = use_rv === undefined ? true : use_rv; + + var lines = html.trim().split(/\n/), + block = [], + res = ''; + + lines.forEach(function (line) { + if (line.indexOf('>>> ') === 0) { + // we use ### to suppress lines + if (line.indexOf("###") === -1) { + block.push(line.replace(/^>>> /gm, '')); + } + } else if (block.length > 0) { + res += '
<pre class="language-python"><code>' + block.join('\n') + '</code></pre>';
+            block = [];
+
+            if (use_rv) {
+                res += '<p>Which produces:</p>';
+                res += '<pre class="language-python"><code>' + line + '</code></pre>';
+            }
+        }
+    });
+
+    if (block.length > 0) {
+        res += '<pre class="language-python"><code>' + block.join('\n') + '</code></pre>
'; + } + + return res; +}; + +// returns only code +jade.filters.doctest = function (html) { + return jade.filters.doctest_all(html, null, false); +}; + +if (process.argv[0] === "node") { + process.argv.shift(); +} + +var in_file = process.argv[1]; +var out_file = process.argv[2]; + +var html = jade.renderFile(in_file, { + pretty: true +}); + +fs.writeFile(out_file, html, function (err) { + if (err) { + throw err; + } +}); diff --git a/website/src/jade/home/_usage.jade b/website/src/jade/home/_usage.jade index a26823358..12dc9e2a5 100644 --- a/website/src/jade/home/_usage.jade +++ b/website/src/jade/home/_usage.jade @@ -6,26 +6,13 @@ mixin example(name) +example("Load resources and process text") - pre.language-python: code - | from __future__ import unicode_literals, print_function - | from spacy.en import English - | nlp = English() - | doc = nlp('Hello, world. Here are two sentences.') + include:doctest_all ../../../tests/test_load_resources_and_process_text.txt +example("Get tokens and sentences") - pre.language-python: code - | token = doc[0] - | sentence = doc.sents.next() - | assert token is sentence[0] - | assert sentence.text == 'Hello, world.' + include:doctest ../../../tests/test_get_tokens_and_sentences.txt +example("Use integer IDs for any string") - pre.language-python: code - | hello_id = nlp.vocab.strings['Hello'] - | hello_str = nlp.vocab.strings[hello_id] - | - | assert token.orth == hello_id == 469755 - | assert token.orth_ == hello_str == 'Hello' + include:doctest ../../../tests/test_use_interger_ids_for_any_strings.txt +example("Get and set string views and flags") pre.language-python: code diff --git a/website/tests/test_get_tokens_and_sentences.txt b/website/tests/test_get_tokens_and_sentences.txt new file mode 100644 index 000000000..e649a6f6d --- /dev/null +++ b/website/tests/test_get_tokens_and_sentences.txt @@ -0,0 +1,8 @@ +>>> from spacy.en import English ### +>>> nlp = English() ### +>>> doc = nlp(u'Hello, world. Here are two sentences.') ### +>>> +>>> token = doc[0] +>>> sentence = doc.sents.next() +>>> assert token is sentence[0] +>>> assert sentence.text == 'Hello, world.' diff --git a/website/tests/test_load_resources_and_process_text.txt b/website/tests/test_load_resources_and_process_text.txt new file mode 100644 index 000000000..9937be1b0 --- /dev/null +++ b/website/tests/test_load_resources_and_process_text.txt @@ -0,0 +1,5 @@ +>>> from spacy.en import English +>>> nlp = English() +>>> doc = nlp(u'Hello, world. Here are two sentences.') +>>> print([s.string for s in doc.sents]) +[u'Hello, world. ', u'Here are two sentences.'] diff --git a/website/tests/test_use_interger_ids_for_any_strings.txt b/website/tests/test_use_interger_ids_for_any_strings.txt new file mode 100644 index 000000000..ec66c2729 --- /dev/null +++ b/website/tests/test_use_interger_ids_for_any_strings.txt @@ -0,0 +1,10 @@ +>>> from spacy.en import English ### +>>> nlp = English() ### +>>> doc = nlp(u'Hello, world. 
Here are two sentences.') ### +>>> token = doc[0] ### +>>> +>>> hello_id = nlp.vocab.strings['Hello'] +>>> hello_str = nlp.vocab.strings[hello_id] +>>> +>>> assert token.orth == hello_id == 469755 +>>> assert token.orth_ == hello_str == 'Hello' From e08eca837c7cfcd24d8c84735f0fe290433ed10c Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Fri, 25 Sep 2015 11:52:14 +0200 Subject: [PATCH 2/4] new proposal for doctests --- website/Makefile | 9 +++--- website/create_code_samples | 54 +++++++++++++++++++++++++++++++++ website/run_jade | 59 ------------------------------------- website/tests/test_home.py | 40 +++++++++++++++++++++++++ 4 files changed, 99 insertions(+), 63 deletions(-) create mode 100755 website/create_code_samples delete mode 100755 website/run_jade create mode 100644 website/tests/test_home.py diff --git a/website/Makefile b/website/Makefile index 87c9077ec..ef26d410d 100644 --- a/website/Makefile +++ b/website/Makefile @@ -1,12 +1,13 @@ -all: dir site +all: src/code site -dir: - mkdir -p site +src/code: + mkdir -p src/code/ + ./create_code_samples tests/ src/code/ site: site/index.html site/blog/ site/docs/ site/license/ site/blog/introducing-spacy/ site/blog/parsing-english-in-python/ site/blog/part-of-speech-POS-tagger-in-python/ site/tutorials/twitter-filter/ site/tutorials/syntax-search/ site/tutorials/mark-adverbs/ site/blog/writing-c-in-cython/ site/blog/how-spacy-works/ site/index.html: src/jade/header.jade src/jade/*.jade - ./run_jade src/jade/home/index.jade $@ + jade -P src/jade/home/index.jade --out site/ site/docs/: src/jade/docs/*.jade src/jade/header.jade jade -P src/jade/docs/index.jade --out $@ diff --git a/website/create_code_samples b/website/create_code_samples new file mode 100755 index 000000000..a87d843c9 --- /dev/null +++ b/website/create_code_samples @@ -0,0 +1,54 @@ +#!/usr/bin/env python +import sys +import re +import os +import ast + + +src_dirname = sys.argv[1] +dst_dirname = sys.argv[2] +prefix = "test_" + + +for filename in os.listdir(src_dirname): + match = re.match(re.escape(prefix) + r"(.+)\.py", filename) + if not match: + continue + + name = match.group(1) + source = open(os.path.join(src_dirname, filename)).readlines() + tree = ast.parse("".join(source)) + + for item in tree.body: + if isinstance(item, ast.FunctionDef) and item.name.startswith(prefix): + + # only ast.expr and ast.stmt have line numbers, see: + # https://docs.python.org/2/library/ast.html#ast.AST.lineno + line_numbers = [x.lineno for x in ast.iter_child_nodes(item) + if isinstance(x, ast.expr) or + isinstance(x, ast.stmt)] + + body = source[min(line_numbers)-1:max(line_numbers)] + + # make sure we are inside an indented function body + assert all([re.match(r"\s", l[0]) for l in body]) + + offset = 0 + for line in body: + match = re.search(r"[^\s]", line) + if match: + offset = match.start(0) + + # remove indentation + assert offset > 0 + + for i in range(len(body)): + body[i] = body[i][offset:] if len(body[i]) > offset else "\n" + + # make sure empty lines contain a newline + assert all([l[-1] == "\n" for l in body]) + + code_filename = "%s.%s" % (name, item.name[len(prefix):]) + + with open(os.path.join(dst_dirname, code_filename), "w") as f: + f.write("".join(body)) diff --git a/website/run_jade b/website/run_jade deleted file mode 100755 index 83ac87ef7..000000000 --- a/website/run_jade +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env node -'use strict'; - -var fs = require('fs'); -var jade = require('jade'); - -// returns all: code and return value (default) 
-jade.filters.doctest_all = function (html, _, use_rv) {
-    use_rv = use_rv === undefined ? true : use_rv;
-
-    var lines = html.trim().split(/\n/),
-        block = [],
-        res = '';
-
-    lines.forEach(function (line) {
-        if (line.indexOf('>>> ') === 0) {
-            // we use ### to suppress lines
-            if (line.indexOf("###") === -1) {
-                block.push(line.replace(/^>>> /gm, ''));
-            }
-        } else if (block.length > 0) {
-            res += '<pre class="language-python"><code>' + block.join('\n') + '</code></pre>';
-            block = [];
-
-            if (use_rv) {
-                res += '<p>Which produces:</p>';
-                res += '<pre class="language-python"><code>' + line + '</code></pre>';
-            }
-        }
-    });
-
-    if (block.length > 0) {
-        res += '<pre class="language-python"><code>' + block.join('\n') + '</code></pre>
'; - } - - return res; -}; - -// returns only code -jade.filters.doctest = function (html) { - return jade.filters.doctest_all(html, null, false); -}; - -if (process.argv[0] === "node") { - process.argv.shift(); -} - -var in_file = process.argv[1]; -var out_file = process.argv[2]; - -var html = jade.renderFile(in_file, { - pretty: true -}); - -fs.writeFile(out_file, html, function (err) { - if (err) { - throw err; - } -}); diff --git a/website/tests/test_home.py b/website/tests/test_home.py new file mode 100644 index 000000000..7fad47e6b --- /dev/null +++ b/website/tests/test_home.py @@ -0,0 +1,40 @@ +from __future__ import unicode_literals +import pytest + + +@pytest.fixture(scope="session") +def nlp(): + from spacy.en import English + return English() + + +@pytest.fixture() +def doc(nlp): + return nlp('Hello, world. Here are two sentences.') + + +@pytest.fixture() +def token(doc): + return doc[0] + + +def test_load_resources_and_process_text(): + from spacy.en import English + nlp = English() + doc = nlp('Hello, world. Here are two sentences.') + + +def test_get_tokens_and_sentences(doc): + token = doc[0] + sentence = doc.sents.next() + + assert token is sentence[0] + assert sentence.text == 'Hello, world.' + + +def test_use_integer_ids_for_any_strings(nlp, token): + hello_id = nlp.vocab.strings['Hello'] + hello_str = nlp.vocab.strings[hello_id] + + assert token.orth == hello_id == 3404 + assert token.orth_ == hello_str == 'Hello' From 22d111203165e84cc2fc8248045e298948fb05e2 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Fri, 25 Sep 2015 11:54:32 +0200 Subject: [PATCH 3/4] new proposal for doctests --- website/src/jade/home/_usage.jade | 9 ++++++--- website/tests/test_get_tokens_and_sentences.txt | 8 -------- website/tests/test_load_resources_and_process_text.txt | 5 ----- .../tests/test_use_interger_ids_for_any_strings.txt | 10 ---------- 4 files changed, 6 insertions(+), 26 deletions(-) delete mode 100644 website/tests/test_get_tokens_and_sentences.txt delete mode 100644 website/tests/test_load_resources_and_process_text.txt delete mode 100644 website/tests/test_use_interger_ids_for_any_strings.txt diff --git a/website/src/jade/home/_usage.jade b/website/src/jade/home/_usage.jade index 12dc9e2a5..15e73a568 100644 --- a/website/src/jade/home/_usage.jade +++ b/website/src/jade/home/_usage.jade @@ -6,13 +6,16 @@ mixin example(name) +example("Load resources and process text") - include:doctest_all ../../../tests/test_load_resources_and_process_text.txt + pre.language-python: code + include ../../code/home.load_resources_and_process_text +example("Get tokens and sentences") - include:doctest ../../../tests/test_get_tokens_and_sentences.txt + pre.language-python: code + include ../../code/home.get_tokens_and_sentences +example("Use integer IDs for any string") - include:doctest ../../../tests/test_use_interger_ids_for_any_strings.txt + pre.language-python: code + include ../../code/home.use_integer_ids_for_any_strings +example("Get and set string views and flags") pre.language-python: code diff --git a/website/tests/test_get_tokens_and_sentences.txt b/website/tests/test_get_tokens_and_sentences.txt deleted file mode 100644 index e649a6f6d..000000000 --- a/website/tests/test_get_tokens_and_sentences.txt +++ /dev/null @@ -1,8 +0,0 @@ ->>> from spacy.en import English ### ->>> nlp = English() ### ->>> doc = nlp(u'Hello, world. 
Here are two sentences.') ### ->>> ->>> token = doc[0] ->>> sentence = doc.sents.next() ->>> assert token is sentence[0] ->>> assert sentence.text == 'Hello, world.' diff --git a/website/tests/test_load_resources_and_process_text.txt b/website/tests/test_load_resources_and_process_text.txt deleted file mode 100644 index 9937be1b0..000000000 --- a/website/tests/test_load_resources_and_process_text.txt +++ /dev/null @@ -1,5 +0,0 @@ ->>> from spacy.en import English ->>> nlp = English() ->>> doc = nlp(u'Hello, world. Here are two sentences.') ->>> print([s.string for s in doc.sents]) -[u'Hello, world. ', u'Here are two sentences.'] diff --git a/website/tests/test_use_interger_ids_for_any_strings.txt b/website/tests/test_use_interger_ids_for_any_strings.txt deleted file mode 100644 index ec66c2729..000000000 --- a/website/tests/test_use_interger_ids_for_any_strings.txt +++ /dev/null @@ -1,10 +0,0 @@ ->>> from spacy.en import English ### ->>> nlp = English() ### ->>> doc = nlp(u'Hello, world. Here are two sentences.') ### ->>> token = doc[0] ### ->>> ->>> hello_id = nlp.vocab.strings['Hello'] ->>> hello_str = nlp.vocab.strings[hello_id] ->>> ->>> assert token.orth == hello_id == 469755 ->>> assert token.orth_ == hello_str == 'Hello' From 936edea42583802536297dc9d29e33c56dcf0dd0 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Mon, 28 Sep 2015 02:39:14 +0200 Subject: [PATCH 4/4] doctests for website: 'home'-section --- website/Makefile | 2 +- website/create_code_samples | 23 +++++- website/src/jade/home/_usage.jade | 107 ++----------------------- website/tests/test_home.py | 125 +++++++++++++++++++++++++++++- 4 files changed, 151 insertions(+), 106 deletions(-) diff --git a/website/Makefile b/website/Makefile index ef26d410d..cb05dfc3c 100644 --- a/website/Makefile +++ b/website/Makefile @@ -1,6 +1,6 @@ all: src/code site -src/code: +src/code: tests/test_*.py mkdir -p src/code/ ./create_code_samples tests/ src/code/ diff --git a/website/create_code_samples b/website/create_code_samples index a87d843c9..978e409c4 100755 --- a/website/create_code_samples +++ b/website/create_code_samples @@ -4,6 +4,12 @@ import re import os import ast +# cgi.escape is deprecated since py32 +try: + from html import escape +except ImportError: + from cgi import escape + src_dirname = sys.argv[1] dst_dirname = sys.argv[2] @@ -24,10 +30,18 @@ for filename in os.listdir(src_dirname): # only ast.expr and ast.stmt have line numbers, see: # https://docs.python.org/2/library/ast.html#ast.AST.lineno - line_numbers = [x.lineno for x in ast.iter_child_nodes(item) - if isinstance(x, ast.expr) or - isinstance(x, ast.stmt)] + line_numbers = [] + def fill_line_numbers(node): + for child in ast.iter_child_nodes(node): + if ((isinstance(child, ast.expr) or + isinstance(child, ast.stmt)) and + child.lineno > item.lineno): + + line_numbers.append(child.lineno) + fill_line_numbers(child) + + fill_line_numbers(item) body = source[min(line_numbers)-1:max(line_numbers)] # make sure we are inside an indented function body @@ -38,6 +52,7 @@ for filename in os.listdir(src_dirname): match = re.search(r"[^\s]", line) if match: offset = match.start(0) + break # remove indentation assert offset > 0 @@ -51,4 +66,4 @@ for filename in os.listdir(src_dirname): code_filename = "%s.%s" % (name, item.name[len(prefix):]) with open(os.path.join(dst_dirname, code_filename), "w") as f: - f.write("".join(body)) + f.write(escape("".join(body))) diff --git a/website/src/jade/home/_usage.jade b/website/src/jade/home/_usage.jade index 15e73a568..c87c01ad0 
100644 --- a/website/src/jade/home/_usage.jade +++ b/website/src/jade/home/_usage.jade @@ -19,89 +19,27 @@ mixin example(name) +example("Get and set string views and flags") pre.language-python: code - | assert token.shape_ == 'Xxxxx' - | for lexeme in nlp.vocab: - | if lexeme.is_alpha: - | lexeme.shape_ = 'W' - | elif lexeme.is_digit: - | lexeme.shape_ = 'D' - | elif lexeme.is_punct: - | lexeme.shape_ = 'P' - | else: - | lexeme.shape_ = 'M' - | assert token.shape_ == 'W' + include ../../code/home.get_and_set_string_views_and_flags +example("Export to numpy arrays") pre.language-python: code - | from spacy.en.attrs import ORTH, LIKE_URL, IS_OOV - | - | attr_ids = [ORTH, LIKE_URL, IS_OOV] - | doc_array = doc.to_array(attr_ids) - | assert doc_array.shape == (len(doc), len(attr_ids)) - | assert doc[0].orth == doc_array[0, 0] - | assert doc[1].orth == doc_array[1, 0] - | assert doc[0].like_url == doc_array[0, 1] - | assert list(doc_array[:, 1]) == [t.like_url for t in doc] + include ../../code/home.export_to_numpy_arrays +example("Word vectors") pre.language-python: code - | doc = nlp("Apples and oranges are similar. Boots and hippos aren't.") - | - | apples = doc[0] - | oranges = doc[1] - | boots = doc[6] - | hippos = doc[8] - | - | assert apples.similarity(oranges) > boots.similarity(hippos) + include ../../code/home.word_vectors +example("Part-of-speech tags") pre.language-python: code - | from spacy.parts_of_speech import ADV - | - | def is_adverb(token): - | return token.pos == spacy.parts_of_speech.ADV - | - | # These are data-specific, so no constants are provided. You have to look - | # up the IDs from the StringStore. - | NNS = nlp.vocab.strings['NNS'] - | NNPS = nlp.vocab.strings['NNPS'] - | def is_plural_noun(token): - | return token.tag == NNS or token.tag == NNPS - | - | def print_coarse_pos(token): - | print(token.pos_) - | - | def print_fine_pos(token): - | print(token.tag_) + include ../../code/home.part_of_speech_tags +example("Syntactic dependencies") pre.language-python: code - | def dependency_labels_to_root(token): - | '''Walk up the syntactic tree, collecting the arc labels.''' - | dep_labels = [] - | while token.head is not token: - | dep_labels.append(token.dep) - | token = token.head - | return dep_labels + include ../../code/home.syntactic_dependencies +example("Named entities") pre.language-python: code - | def iter_products(docs): - | for doc in docs: - | for ent in doc.ents: - | if ent.label_ == 'PRODUCT': - | yield ent - | - | def word_is_in_entity(word): - | return word.ent_type != 0 - | - | def count_parent_verb_by_person(docs): - | counts = defaultdict(defaultdict(int)) - | for doc in docs: - | for ent in doc.ents: - | if ent.label_ == 'PERSON' and ent.root.head.pos == VERB: - | counts[ent.orth_][ent.root.head.lemma_] += 1 - | return counts + include ../../code/home.named_entities //+example("Define custom NER rules") // pre.language-python: code @@ -110,40 +48,11 @@ mixin example(name) +example("Calculate inline mark-up on original string") pre.language-python: code - | def put_spans_around_tokens(doc, get_classes): - | '''Given some function to compute class names, put each token in a - | span element, with the appropriate classes computed. - | - | All whitespace is preserved, outside of the spans. (Yes, I know HTML - | won't display it. But the point is no information is lost, so you can - | calculate what you need, e.g. <br /> tags, <p> tags, etc.) 
- |     '''
- |     output = []
- |     template = '<span classes="{classes}">{word}</span>{space}'
- |     for token in doc:
- |         if token.is_space:
- |             output.append(token.orth_)
- |         else:
- |             output.append(
- |               template.format(
- |                 classes=' '.join(get_classes(token)),
- |                 word=token.orth_,
- |                 space=token.whitespace_))
- |     string = ''.join(output)
- |     string = string.replace('\n', '<br />
') - | string = string.replace('\t', '    ') - | return string - + include ../../code/home.calculate_inline_mark_up_on_original_string +example("Efficient binary serialization") pre.language-python: code - | byte_string = doc.as_bytes() - | open('/tmp/moby_dick.bin', 'wb').write(byte_string) - | - | nlp = spacy.en.English() - | for byte_string in Doc.read(open('/tmp/moby_dick.bin', 'rb')): - | doc = Doc(nlp.vocab) - | doc.from_bytes(byte_string) + include ../../code/home.efficient_binary_serialization +example("Full documentation") ul diff --git a/website/tests/test_home.py b/website/tests/test_home.py index 7fad47e6b..ed710e107 100644 --- a/website/tests/test_home.py +++ b/website/tests/test_home.py @@ -27,7 +27,6 @@ def test_load_resources_and_process_text(): def test_get_tokens_and_sentences(doc): token = doc[0] sentence = doc.sents.next() - assert token is sentence[0] assert sentence.text == 'Hello, world.' @@ -36,5 +35,127 @@ def test_use_integer_ids_for_any_strings(nlp, token): hello_id = nlp.vocab.strings['Hello'] hello_str = nlp.vocab.strings[hello_id] - assert token.orth == hello_id == 3404 + assert token.orth == hello_id == 469755 assert token.orth_ == hello_str == 'Hello' + + +def test_get_and_set_string_views_and_flags(nlp, token): + assert token.shape_ == 'Xxxxx' + for lexeme in nlp.vocab: + if lexeme.is_alpha: + lexeme.shape_ = 'W' + elif lexeme.is_digit: + lexeme.shape_ = 'D' + elif lexeme.is_punct: + lexeme.shape_ = 'P' + else: + lexeme.shape_ = 'M' + assert token.shape_ == 'W' + + +def test_export_to_numpy_arrays(nlp, doc): + from spacy.en.attrs import ORTH, LIKE_URL, IS_OOV + + attr_ids = [ORTH, LIKE_URL, IS_OOV] + doc_array = doc.to_array(attr_ids) + assert doc_array.shape == (len(doc), len(attr_ids)) + assert doc[0].orth == doc_array[0, 0] + assert doc[1].orth == doc_array[1, 0] + assert doc[0].like_url == doc_array[0, 1] + assert list(doc_array[:, 1]) == [t.like_url for t in doc] + + +def test_word_vectors(nlp): + doc = nlp("Apples and oranges are similar. Boots and hippos aren't.") + + apples = doc[0] + oranges = doc[1] + boots = doc[6] + hippos = doc[8] + + assert apples.similarity(oranges) > boots.similarity(hippos) + + +def test_part_of_speech_tags(nlp): + from spacy.parts_of_speech import ADV + + def is_adverb(token): + return token.pos == spacy.parts_of_speech.ADV + + # These are data-specific, so no constants are provided. You have to look + # up the IDs from the StringStore. 
+ NNS = nlp.vocab.strings['NNS'] + NNPS = nlp.vocab.strings['NNPS'] + def is_plural_noun(token): + return token.tag == NNS or token.tag == NNPS + + def print_coarse_pos(token): + print(token.pos_) + + def print_fine_pos(token): + print(token.tag_) + + +def test_syntactic_dependencies(): + def dependency_labels_to_root(token): + '''Walk up the syntactic tree, collecting the arc labels.''' + dep_labels = [] + while token.head is not token: + dep_labels.append(token.dep) + token = token.head + return dep_labels + + +def test_named_entities(): + def iter_products(docs): + for doc in docs: + for ent in doc.ents: + if ent.label_ == 'PRODUCT': + yield ent + + def word_is_in_entity(word): + return word.ent_type != 0 + + def count_parent_verb_by_person(docs): + counts = defaultdict(defaultdict(int)) + for doc in docs: + for ent in doc.ents: + if ent.label_ == 'PERSON' and ent.root.head.pos == VERB: + counts[ent.orth_][ent.root.head.lemma_] += 1 + return counts + + +def test_calculate_inline_mark_up_on_original_string(): + def put_spans_around_tokens(doc, get_classes): + '''Given some function to compute class names, put each token in a + span element, with the appropriate classes computed. + + All whitespace is preserved, outside of the spans. (Yes, I know HTML + won't display it. But the point is no information is lost, so you can + calculate what you need, e.g.
<br /> tags, <p> tags, etc.)
+        '''
+        output = []
+        template = '<span classes="{classes}">{word}</span>{space}'
+        for token in doc:
+            if token.is_space:
+                output.append(token.orth_)
+            else:
+                output.append(
+                  template.format(
+                    classes=' '.join(get_classes(token)),
+                    word=token.orth_,
+                    space=token.whitespace_))
+        string = ''.join(output)
+        string = string.replace('\n', '<br />')
+        string = string.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;')
+        return string
+
+
+def test_efficient_binary_serialization(doc):
+    byte_string = doc.as_bytes()
+    open('/tmp/moby_dick.bin', 'wb').write(byte_string)
+
+    nlp = spacy.en.English()
+    for byte_string in Doc.read(open('/tmp/moby_dick.bin', 'rb')):
+        doc = Doc(nlp.vocab)
+        doc.from_bytes(byte_string)