diff --git a/.travis.yml b/.travis.yml
index f21301db1..1ea1f8375 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -24,4 +24,4 @@ install:
# run tests
script:
- - "py.test tests/ -x"
+ - "py.test tests/ website/tests/ -x"
diff --git a/website/Makefile b/website/Makefile
index e2002c97f..cb05dfc3c 100644
--- a/website/Makefile
+++ b/website/Makefile
@@ -1,4 +1,8 @@
-all: site
+all: src/code site
+
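+# generate the website code samples from the test suite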
+src/code: tests/test_*.py
+	mkdir -p src/code/
+	./create_code_samples tests/ src/code/
site: site/index.html site/blog/ site/docs/ site/license/ site/blog/introducing-spacy/ site/blog/parsing-english-in-python/ site/blog/part-of-speech-POS-tagger-in-python/ site/tutorials/twitter-filter/ site/tutorials/syntax-search/ site/tutorials/mark-adverbs/ site/blog/writing-c-in-cython/ site/blog/how-spacy-works/
diff --git a/website/create_code_samples b/website/create_code_samples
new file mode 100755
index 000000000..978e409c4
--- /dev/null
+++ b/website/create_code_samples
@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+import sys
+import re
+import os
+import ast
+
+# cgi.escape is deprecated since Python 3.2
+try:
+    from html import escape
+except ImportError:
+    from cgi import escape
+
+
+src_dirname = sys.argv[1]
+dst_dirname = sys.argv[2]
+prefix = "test_"
+
+
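+# For every <src>/test_<page>.py file, extract the body of each test_<name>
+# function, dedent it, HTML-escape it and write it to <dst>/<page>.<name>,
+# ready to be included by the Jade templates.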
+for filename in os.listdir(src_dirname):
+    match = re.match(re.escape(prefix) + r"(.+)\.py", filename)
+    if not match:
+        continue
+
+    name = match.group(1)
+    source = open(os.path.join(src_dirname, filename)).readlines()
+    tree = ast.parse("".join(source))
+
+    for item in tree.body:
+        if isinstance(item, ast.FunctionDef) and item.name.startswith(prefix):
+
+            # only ast.expr and ast.stmt have line numbers, see:
+            # https://docs.python.org/2/library/ast.html#ast.AST.lineno
+            line_numbers = []
+
+            def fill_line_numbers(node):
+                for child in ast.iter_child_nodes(node):
+                    if ((isinstance(child, ast.expr) or
+                         isinstance(child, ast.stmt)) and
+                            child.lineno > item.lineno):
+
+                        line_numbers.append(child.lineno)
+                        fill_line_numbers(child)
+
+            fill_line_numbers(item)
+            body = source[min(line_numbers)-1:max(line_numbers)]
+
+            # make sure we are inside an indented function body
+            assert all([re.match(r"\s", l[0]) for l in body])
+
+            offset = 0
+            for line in body:
+                match = re.search(r"[^\s]", line)
+                if match:
+                    offset = match.start(0)
+                    break
+
+            # remove indentation
+            assert offset > 0
+
+            for i in range(len(body)):
+                body[i] = body[i][offset:] if len(body[i]) > offset else "\n"
+
+            # make sure empty lines contain a newline
+            assert all([l[-1] == "\n" for l in body])
+
+            code_filename = "%s.%s" % (name, item.name[len(prefix):])
+
+            with open(os.path.join(dst_dirname, code_filename), "w") as f:
+                f.write(escape("".join(body)))
diff --git a/website/src/jade/home/_usage.jade b/website/src/jade/home/_usage.jade
index a26823358..c87c01ad0 100644
--- a/website/src/jade/home/_usage.jade
+++ b/website/src/jade/home/_usage.jade
@@ -7,111 +7,39 @@ mixin example(name)
+example("Load resources and process text")
pre.language-python: code
- | from __future__ import unicode_literals, print_function
- | from spacy.en import English
- | nlp = English()
- | doc = nlp('Hello, world. Here are two sentences.')
+ include ../../code/home.load_resources_and_process_text
+example("Get tokens and sentences")
pre.language-python: code
- | token = doc[0]
- | sentence = doc.sents.next()
- | assert token is sentence[0]
- | assert sentence.text == 'Hello, world.'
+ include ../../code/home.get_tokens_and_sentences
+example("Use integer IDs for any string")
pre.language-python: code
- | hello_id = nlp.vocab.strings['Hello']
- | hello_str = nlp.vocab.strings[hello_id]
- |
- | assert token.orth == hello_id == 469755
- | assert token.orth_ == hello_str == 'Hello'
+ include ../../code/home.use_integer_ids_for_any_strings
+example("Get and set string views and flags")
pre.language-python: code
- | assert token.shape_ == 'Xxxxx'
- | for lexeme in nlp.vocab:
- | if lexeme.is_alpha:
- | lexeme.shape_ = 'W'
- | elif lexeme.is_digit:
- | lexeme.shape_ = 'D'
- | elif lexeme.is_punct:
- | lexeme.shape_ = 'P'
- | else:
- | lexeme.shape_ = 'M'
- | assert token.shape_ == 'W'
+ include ../../code/home.get_and_set_string_views_and_flags
+example("Export to numpy arrays")
pre.language-python: code
- | from spacy.en.attrs import ORTH, LIKE_URL, IS_OOV
- |
- | attr_ids = [ORTH, LIKE_URL, IS_OOV]
- | doc_array = doc.to_array(attr_ids)
- | assert doc_array.shape == (len(doc), len(attr_ids))
- | assert doc[0].orth == doc_array[0, 0]
- | assert doc[1].orth == doc_array[1, 0]
- | assert doc[0].like_url == doc_array[0, 1]
- | assert list(doc_array[:, 1]) == [t.like_url for t in doc]
+ include ../../code/home.export_to_numpy_arrays
+example("Word vectors")
pre.language-python: code
- | doc = nlp("Apples and oranges are similar. Boots and hippos aren't.")
- |
- | apples = doc[0]
- | oranges = doc[1]
- | boots = doc[6]
- | hippos = doc[8]
- |
- | assert apples.similarity(oranges) > boots.similarity(hippos)
+ include ../../code/home.word_vectors
+example("Part-of-speech tags")
pre.language-python: code
- | from spacy.parts_of_speech import ADV
- |
- | def is_adverb(token):
- | return token.pos == spacy.parts_of_speech.ADV
- |
- | # These are data-specific, so no constants are provided. You have to look
- | # up the IDs from the StringStore.
- | NNS = nlp.vocab.strings['NNS']
- | NNPS = nlp.vocab.strings['NNPS']
- | def is_plural_noun(token):
- | return token.tag == NNS or token.tag == NNPS
- |
- | def print_coarse_pos(token):
- | print(token.pos_)
- |
- | def print_fine_pos(token):
- | print(token.tag_)
+ include ../../code/home.part_of_speech_tags
+example("Syntactic dependencies")
pre.language-python: code
- | def dependency_labels_to_root(token):
- | '''Walk up the syntactic tree, collecting the arc labels.'''
- | dep_labels = []
- | while token.head is not token:
- | dep_labels.append(token.dep)
- | token = token.head
- | return dep_labels
+ include ../../code/home.syntactic_dependencies
+example("Named entities")
pre.language-python: code
- | def iter_products(docs):
- | for doc in docs:
- | for ent in doc.ents:
- | if ent.label_ == 'PRODUCT':
- | yield ent
- |
- | def word_is_in_entity(word):
- | return word.ent_type != 0
- |
- | def count_parent_verb_by_person(docs):
- | counts = defaultdict(defaultdict(int))
- | for doc in docs:
- | for ent in doc.ents:
- | if ent.label_ == 'PERSON' and ent.root.head.pos == VERB:
- | counts[ent.orth_][ent.root.head.lemma_] += 1
- | return counts
+ include ../../code/home.named_entities
//+example("Define custom NER rules")
// pre.language-python: code
@@ -120,40 +48,11 @@ mixin example(name)
+example("Calculate inline mark-up on original string")
pre.language-python: code
- | def put_spans_around_tokens(doc, get_classes):
- | '''Given some function to compute class names, put each token in a
- | span element, with the appropriate classes computed.
- |
- | All whitespace is preserved, outside of the spans. (Yes, I know HTML
- | won't display it. But the point is no information is lost, so you can
- | calculate what you need, e.g. <br /> tags, <p> tags, etc.)
- | '''
- | output = []
- | template = '<span classes="{classes}">{word}</span>{space}'
- | for token in doc:
- | if token.is_space:
- | output.append(token.orth_)
- | else:
- | output.append(
- | template.format(
- | classes=' '.join(get_classes(token)),
- | word=token.orth_,
- | space=token.whitespace_))
- | string = ''.join(output)
- | string = string.replace('\n', '<br />')
- | string = string.replace('\t', ' ')
- | return string
-
+ include ../../code/home.calculate_inline_mark_up_on_original_string
+example("Efficient binary serialization")
pre.language-python: code
- | byte_string = doc.as_bytes()
- | open('/tmp/moby_dick.bin', 'wb').write(byte_string)
- |
- | nlp = spacy.en.English()
- | for byte_string in Doc.read(open('/tmp/moby_dick.bin', 'rb')):
- | doc = Doc(nlp.vocab)
- | doc.from_bytes(byte_string)
+ include ../../code/home.efficient_binary_serialization
+example("Full documentation")
ul
diff --git a/website/tests/test_home.py b/website/tests/test_home.py
new file mode 100644
index 000000000..ed710e107
--- /dev/null
+++ b/website/tests/test_home.py
@@ -0,0 +1,161 @@
+from __future__ import unicode_literals
+import pytest
+
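+# Each test below doubles as a published code sample: website/create_code_samples
+# extracts the test bodies into website/src/code/, from where the Jade templates
+# include them.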
+
+@pytest.fixture(scope="session")
+def nlp():
+    from spacy.en import English
+    return English()
+
+
+@pytest.fixture()
+def doc(nlp):
+    return nlp('Hello, world. Here are two sentences.')
+
+
+@pytest.fixture()
+def token(doc):
+    return doc[0]
+
+
+def test_load_resources_and_process_text():
+    from spacy.en import English
+    nlp = English()
+    doc = nlp('Hello, world. Here are two sentences.')
+
+
+def test_get_tokens_and_sentences(doc):
+    token = doc[0]
+    sentence = doc.sents.next()
+    assert token is sentence[0]
+    assert sentence.text == 'Hello, world.'
+
+
+def test_use_integer_ids_for_any_strings(nlp, token):
+    hello_id = nlp.vocab.strings['Hello']
+    hello_str = nlp.vocab.strings[hello_id]
+
+    assert token.orth == hello_id == 469755
+    assert token.orth_ == hello_str == 'Hello'
+
+
+def test_get_and_set_string_views_and_flags(nlp, token):
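+    # shape_ is a view on the shared vocab entry, so rewriting the lexeme's
+    # shape below is reflected in token.shape_ for 'Hello'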
+    assert token.shape_ == 'Xxxxx'
+    for lexeme in nlp.vocab:
+        if lexeme.is_alpha:
+            lexeme.shape_ = 'W'
+        elif lexeme.is_digit:
+            lexeme.shape_ = 'D'
+        elif lexeme.is_punct:
+            lexeme.shape_ = 'P'
+        else:
+            lexeme.shape_ = 'M'
+    assert token.shape_ == 'W'
+
+
+def test_export_to_numpy_arrays(nlp, doc):
+    from spacy.en.attrs import ORTH, LIKE_URL, IS_OOV
+
+    attr_ids = [ORTH, LIKE_URL, IS_OOV]
+    doc_array = doc.to_array(attr_ids)
+    assert doc_array.shape == (len(doc), len(attr_ids))
+    assert doc[0].orth == doc_array[0, 0]
+    assert doc[1].orth == doc_array[1, 0]
+    assert doc[0].like_url == doc_array[0, 1]
+    assert list(doc_array[:, 1]) == [t.like_url for t in doc]
+
+
+def test_word_vectors(nlp):
+    doc = nlp("Apples and oranges are similar. Boots and hippos aren't.")
+
+    apples = doc[0]
+    oranges = doc[2]
+    boots = doc[6]
+    hippos = doc[8]
+
+    assert apples.similarity(oranges) > boots.similarity(hippos)
+
+
+def test_part_of_speech_tags(nlp):
+    from spacy.parts_of_speech import ADV
+
+    def is_adverb(token):
+        return token.pos == ADV
+
+    # These are data-specific, so no constants are provided. You have to look
+    # up the IDs from the StringStore.
+    NNS = nlp.vocab.strings['NNS']
+    NNPS = nlp.vocab.strings['NNPS']
+    def is_plural_noun(token):
+        return token.tag == NNS or token.tag == NNPS
+
+    def print_coarse_pos(token):
+        print(token.pos_)
+
+    def print_fine_pos(token):
+        print(token.tag_)
+
+
+def test_syntactic_dependencies():
+    def dependency_labels_to_root(token):
+        '''Walk up the syntactic tree, collecting the arc labels.'''
+        dep_labels = []
+        while token.head is not token:
+            dep_labels.append(token.dep)
+            token = token.head
+        return dep_labels
+
+
+def test_named_entities():
+    def iter_products(docs):
+        for doc in docs:
+            for ent in doc.ents:
+                if ent.label_ == 'PRODUCT':
+                    yield ent
+
+    def word_is_in_entity(word):
+        return word.ent_type != 0
+
+    def count_parent_verb_by_person(docs):
+        counts = defaultdict(lambda: defaultdict(int))
+        for doc in docs:
+            for ent in doc.ents:
+                if ent.label_ == 'PERSON' and ent.root.head.pos == VERB:
+                    counts[ent.orth_][ent.root.head.lemma_] += 1
+        return counts
+
+
+def test_calculate_inline_mark_up_on_original_string():
+    def put_spans_around_tokens(doc, get_classes):
+        '''Given some function to compute class names, put each token in a
+        span element, with the appropriate classes computed.
+
+        All whitespace is preserved, outside of the spans. (Yes, I know HTML
+        won't display it. But the point is no information is lost, so you can
+        calculate what you need, e.g. <br /> tags, <p> tags, etc.)
+        '''
+        output = []
+        template = '<span classes="{classes}">{word}</span>{space}'
+        for token in doc:
+            if token.is_space:
+                output.append(token.orth_)
+            else:
+                output.append(
+                    template.format(
+                        classes=' '.join(get_classes(token)),
+                        word=token.orth_,
+                        space=token.whitespace_))
+        string = ''.join(output)
+        string = string.replace('\n', '<br />')
+        string = string.replace('\t', ' ')
+        return string
+
+
+def test_efficient_binary_serialization(doc):
+    byte_string = doc.as_bytes()
+    open('/tmp/moby_dick.bin', 'wb').write(byte_string)
+
+    nlp = spacy.en.English()
+    for byte_string in Doc.read(open('/tmp/moby_dick.bin', 'rb')):
+        doc = Doc(nlp.vocab)
+        doc.from_bytes(byte_string)