diff --git a/.travis.yml b/.travis.yml
index f21301db1..1ea1f8375 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -24,4 +24,4 @@ install:
 
 # run tests
 script:
-  - "py.test tests/ -x"
+  - "py.test tests/ website/tests/ -x"
diff --git a/website/Makefile b/website/Makefile
index e2002c97f..cb05dfc3c 100644
--- a/website/Makefile
+++ b/website/Makefile
@@ -1,4 +1,8 @@
-all: site
+all: src/code site
+
+src/code: tests/test_*.py
+	mkdir -p src/code/
+	./create_code_samples tests/ src/code/
 
 site: site/index.html site/blog/ site/docs/ site/license/ site/blog/introducing-spacy/ site/blog/parsing-english-in-python/ site/blog/part-of-speech-POS-tagger-in-python/ site/tutorials/twitter-filter/ site/tutorials/syntax-search/ site/tutorials/mark-adverbs/ site/blog/writing-c-in-cython/ site/blog/how-spacy-works/
 
diff --git a/website/create_code_samples b/website/create_code_samples
new file mode 100755
index 000000000..978e409c4
--- /dev/null
+++ b/website/create_code_samples
@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+import sys
+import re
+import os
+import ast
+
+# cgi.escape is deprecated since py32
+try:
+    from html import escape
+except ImportError:
+    from cgi import escape
+
+
+src_dirname = sys.argv[1]
+dst_dirname = sys.argv[2]
+prefix = "test_"
+
+
+for filename in os.listdir(src_dirname):
+    match = re.match(re.escape(prefix) + r"(.+)\.py", filename)
+    if not match:
+        continue
+
+    name = match.group(1)
+    source = open(os.path.join(src_dirname, filename)).readlines()
+    tree = ast.parse("".join(source))
+
+    for item in tree.body:
+        if isinstance(item, ast.FunctionDef) and item.name.startswith(prefix):
+
+            # only ast.expr and ast.stmt have line numbers, see:
+            # https://docs.python.org/2/library/ast.html#ast.AST.lineno
+            line_numbers = []
+
+            def fill_line_numbers(node):
+                for child in ast.iter_child_nodes(node):
+                    if ((isinstance(child, ast.expr) or
+                         isinstance(child, ast.stmt)) and
+                            child.lineno > item.lineno):
+
+                        line_numbers.append(child.lineno)
+                        fill_line_numbers(child)
+
+            fill_line_numbers(item)
+            body = source[min(line_numbers)-1:max(line_numbers)]
+
+            # make sure we are inside an indented function body
+            assert all([re.match(r"\s", l[0]) for l in body])
+
+            offset = 0
+            for line in body:
+                match = re.search(r"[^\s]", line)
+                if match:
+                    offset = match.start(0)
+                    break
+
+            # remove indentation
+            assert offset > 0
+
+            for i in range(len(body)):
+                body[i] = body[i][offset:] if len(body[i]) > offset else "\n"
+
+            # make sure empty lines contain a newline
+            assert all([l[-1] == "\n" for l in body])
+
+            code_filename = "%s.%s" % (name, item.name[len(prefix):])
+
+            with open(os.path.join(dst_dirname, code_filename), "w") as f:
+                f.write(escape("".join(body)))
diff --git a/website/src/jade/home/_usage.jade b/website/src/jade/home/_usage.jade
index a26823358..c87c01ad0 100644
--- a/website/src/jade/home/_usage.jade
+++ b/website/src/jade/home/_usage.jade
@@ -7,111 +7,39 @@ mixin example(name)
 
 +example("Load resources and process text")
   pre.language-python: code
-    | from __future__ import unicode_literals, print_function
-    | from spacy.en import English
-    | nlp = English()
-    | doc = nlp('Hello, world. Here are two sentences.')
+    include ../../code/home.load_resources_and_process_text
 
 +example("Get tokens and sentences")
   pre.language-python: code
-    | token = doc[0]
-    | sentence = doc.sents.next()
-    | assert token is sentence[0]
-    | assert sentence.text == 'Hello, world.'
+    include ../../code/home.get_tokens_and_sentences
 
 +example("Use integer IDs for any string")
   pre.language-python: code
-    | hello_id = nlp.vocab.strings['Hello']
-    | hello_str = nlp.vocab.strings[hello_id]
-    |
-    | assert token.orth == hello_id == 469755
-    | assert token.orth_ == hello_str == 'Hello'
+    include ../../code/home.use_integer_ids_for_any_strings
 
 +example("Get and set string views and flags")
   pre.language-python: code
-    | assert token.shape_ == 'Xxxxx'
-    | for lexeme in nlp.vocab:
-    |     if lexeme.is_alpha:
-    |         lexeme.shape_ = 'W'
-    |     elif lexeme.is_digit:
-    |         lexeme.shape_ = 'D'
-    |     elif lexeme.is_punct:
-    |         lexeme.shape_ = 'P'
-    |     else:
-    |         lexeme.shape_ = 'M'
-    | assert token.shape_ == 'W'
+    include ../../code/home.get_and_set_string_views_and_flags
 
 +example("Export to numpy arrays")
   pre.language-python: code
-    | from spacy.en.attrs import ORTH, LIKE_URL, IS_OOV
-    |
-    | attr_ids = [ORTH, LIKE_URL, IS_OOV]
-    | doc_array = doc.to_array(attr_ids)
-    | assert doc_array.shape == (len(doc), len(attr_ids))
-    | assert doc[0].orth == doc_array[0, 0]
-    | assert doc[1].orth == doc_array[1, 0]
-    | assert doc[0].like_url == doc_array[0, 1]
-    | assert list(doc_array[:, 1]) == [t.like_url for t in doc]
+    include ../../code/home.export_to_numpy_arrays
 
 +example("Word vectors")
   pre.language-python: code
-    | doc = nlp("Apples and oranges are similar. Boots and hippos aren't.")
-    |
-    | apples = doc[0]
-    | oranges = doc[1]
-    | boots = doc[6]
-    | hippos = doc[8]
-    |
-    | assert apples.similarity(oranges) > boots.similarity(hippos)
+    include ../../code/home.word_vectors
 
 +example("Part-of-speech tags")
   pre.language-python: code
-    | from spacy.parts_of_speech import ADV
-    |
-    | def is_adverb(token):
-    |     return token.pos == spacy.parts_of_speech.ADV
-    |
-    | # These are data-specific, so no constants are provided. You have to look
-    | # up the IDs from the StringStore.
-    | NNS = nlp.vocab.strings['NNS']
-    | NNPS = nlp.vocab.strings['NNPS']
-    | def is_plural_noun(token):
-    |     return token.tag == NNS or token.tag == NNPS
-    |
-    | def print_coarse_pos(token):
-    |     print(token.pos_)
-    |
-    | def print_fine_pos(token):
-    |     print(token.tag_)
+    include ../../code/home.part_of_speech_tags
 
 +example("Syntactic dependencies")
   pre.language-python: code
-    | def dependency_labels_to_root(token):
-    |     '''Walk up the syntactic tree, collecting the arc labels.'''
-    |     dep_labels = []
-    |     while token.head is not token:
-    |         dep_labels.append(token.dep)
-    |         token = token.head
-    |     return dep_labels
+    include ../../code/home.syntactic_dependencies
 
 +example("Named entities")
   pre.language-python: code
-    | def iter_products(docs):
-    |     for doc in docs:
-    |         for ent in doc.ents:
-    |             if ent.label_ == 'PRODUCT':
-    |                 yield ent
-    |
-    | def word_is_in_entity(word):
-    |     return word.ent_type != 0
-    |
-    | def count_parent_verb_by_person(docs):
-    |     counts = defaultdict(defaultdict(int))
-    |     for doc in docs:
-    |         for ent in doc.ents:
-    |             if ent.label_ == 'PERSON' and ent.root.head.pos == VERB:
-    |                 counts[ent.orth_][ent.root.head.lemma_] += 1
-    |     return counts
+    include ../../code/home.named_entities
 
 //+example("Define custom NER rules")
 //  pre.language-python: code
@@ -120,40 +48,11 @@ mixin example(name)
 
 +example("Calculate inline mark-up on original string")
   pre.language-python: code
-    | def put_spans_around_tokens(doc, get_classes):
-    |     '''Given some function to compute class names, put each token in a
-    |     span element, with the appropriate classes computed.
-    |
-    |     All whitespace is preserved, outside of the spans. (Yes, I know HTML
-    |     won't display it. But the point is no information is lost, so you can
-    |     calculate what you need, e.g. <br /> tags, <p> tags, etc.)
-    |     '''
-    |     output = []
-    |     template = '<span classes="{classes}">{word}</span>{space}'
-    |     for token in doc:
-    |         if token.is_space:
-    |             output.append(token.orth_)
-    |         else:
-    |             output.append(
-    |                 template.format(
-    |                     classes=' '.join(get_classes(token)),
-    |                     word=token.orth_,
-    |                     space=token.whitespace_))
-    |     string = ''.join(output)
-    |     string = string.replace('\n', '<br />')
-    |     string = string.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;')
-    |     return string
-
+    include ../../code/home.calculate_inline_mark_up_on_original_string
 
 +example("Efficient binary serialization")
   pre.language-python: code
-    | byte_string = doc.as_bytes()
-    | open('/tmp/moby_dick.bin', 'wb').write(byte_string)
-    |
-    | nlp = spacy.en.English()
-    | for byte_string in Doc.read(open('/tmp/moby_dick.bin', 'rb')):
-    |     doc = Doc(nlp.vocab)
-    |     doc.from_bytes(byte_string)
+    include ../../code/home.efficient_binary_serialization
 
 +example("Full documentation")
   ul
diff --git a/website/tests/test_home.py b/website/tests/test_home.py
new file mode 100644
index 000000000..ed710e107
--- /dev/null
+++ b/website/tests/test_home.py
@@ -0,0 +1,161 @@
+from __future__ import unicode_literals
+import pytest
+
+
+@pytest.fixture(scope="session")
+def nlp():
+    from spacy.en import English
+    return English()
+
+
+@pytest.fixture()
+def doc(nlp):
+    return nlp('Hello, world. Here are two sentences.')
+
+
+@pytest.fixture()
+def token(doc):
+    return doc[0]
+
+
+def test_load_resources_and_process_text():
+    from spacy.en import English
+    nlp = English()
+    doc = nlp('Hello, world. Here are two sentences.')
+
+
+def test_get_tokens_and_sentences(doc):
+    token = doc[0]
+    sentence = next(doc.sents)
+    assert token is sentence[0]
+    assert sentence.text == 'Hello, world.'
+
+
+def test_use_integer_ids_for_any_strings(nlp, token):
+    hello_id = nlp.vocab.strings['Hello']
+    hello_str = nlp.vocab.strings[hello_id]
+
+    assert token.orth == hello_id == 469755
+    assert token.orth_ == hello_str == 'Hello'
+
+
+def test_get_and_set_string_views_and_flags(nlp, token):
+    assert token.shape_ == 'Xxxxx'
+    for lexeme in nlp.vocab:
+        if lexeme.is_alpha:
+            lexeme.shape_ = 'W'
+        elif lexeme.is_digit:
+            lexeme.shape_ = 'D'
+        elif lexeme.is_punct:
+            lexeme.shape_ = 'P'
+        else:
+            lexeme.shape_ = 'M'
+    assert token.shape_ == 'W'
+
+
+def test_export_to_numpy_arrays(nlp, doc):
+    from spacy.en.attrs import ORTH, LIKE_URL, IS_OOV
+
+    attr_ids = [ORTH, LIKE_URL, IS_OOV]
+    doc_array = doc.to_array(attr_ids)
+    assert doc_array.shape == (len(doc), len(attr_ids))
+    assert doc[0].orth == doc_array[0, 0]
+    assert doc[1].orth == doc_array[1, 0]
+    assert doc[0].like_url == doc_array[0, 1]
+    assert list(doc_array[:, 1]) == [t.like_url for t in doc]
+
+
+def test_word_vectors(nlp):
+    doc = nlp("Apples and oranges are similar. Boots and hippos aren't.")
+
+    apples = doc[0]
+    oranges = doc[1]
+    boots = doc[6]
+    hippos = doc[8]
+
+    assert apples.similarity(oranges) > boots.similarity(hippos)
+
+
+def test_part_of_speech_tags(nlp):
+    from spacy.parts_of_speech import ADV
+
+    def is_adverb(token):
+        return token.pos == ADV
+
+    # These are data-specific, so no constants are provided. You have to look
+    # up the IDs from the StringStore.
+    NNS = nlp.vocab.strings['NNS']
+    NNPS = nlp.vocab.strings['NNPS']
+    def is_plural_noun(token):
+        return token.tag == NNS or token.tag == NNPS
+
+    def print_coarse_pos(token):
+        print(token.pos_)
+
+    def print_fine_pos(token):
+        print(token.tag_)
+
+
+def test_syntactic_dependencies():
+    def dependency_labels_to_root(token):
+        '''Walk up the syntactic tree, collecting the arc labels.'''
+        dep_labels = []
+        while token.head is not token:
+            dep_labels.append(token.dep)
+            token = token.head
+        return dep_labels
+
+
+def test_named_entities():
+    def iter_products(docs):
+        for doc in docs:
+            for ent in doc.ents:
+                if ent.label_ == 'PRODUCT':
+                    yield ent
+
+    def word_is_in_entity(word):
+        return word.ent_type != 0
+
+    def count_parent_verb_by_person(docs):
+        counts = defaultdict(lambda: defaultdict(int))
+        for doc in docs:
+            for ent in doc.ents:
+                if ent.label_ == 'PERSON' and ent.root.head.pos == VERB:
+                    counts[ent.orth_][ent.root.head.lemma_] += 1
+        return counts
+
+
+def test_calculate_inline_mark_up_on_original_string():
+    def put_spans_around_tokens(doc, get_classes):
+        '''Given some function to compute class names, put each token in a
+        span element, with the appropriate classes computed.
+
+        All whitespace is preserved, outside of the spans. (Yes, I know HTML
+        won't display it. But the point is no information is lost, so you can
+        calculate what you need, e.g. <br /> tags, <p> tags, etc.)
+        '''
+        output = []
+        template = '<span classes="{classes}">{word}</span>{space}'
+        for token in doc:
+            if token.is_space:
+                output.append(token.orth_)
+            else:
+                output.append(
+                    template.format(
+                        classes=' '.join(get_classes(token)),
+                        word=token.orth_,
+                        space=token.whitespace_))
+        string = ''.join(output)
+        string = string.replace('\n', '<br />')
+        string = string.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;')
+        return string
+
+
+def test_efficient_binary_serialization(doc):
+    byte_string = doc.as_bytes()
+    open('/tmp/moby_dick.bin', 'wb').write(byte_string)
+
+    nlp = spacy.en.English()
+    for byte_string in Doc.read(open('/tmp/moby_dick.bin', 'rb')):
+        doc = Doc(nlp.vocab)
+        doc.from_bytes(byte_string)