Merge branch 'develop' into develop-irish

Jim O'Regan 2017-08-08 17:21:27 +01:00
commit 95921d7d4c
278 changed files with 38598 additions and 25044 deletions

.appveyor.yml Normal file

@ -0,0 +1 @@
build: off

.gitignore vendored

@ -30,6 +30,7 @@ Profile.prof
__pycache__/
*.py[cod]
.env/
.env*
.~env/
.venv
venv/


@ -4,12 +4,10 @@ spaCy: Industrial-strength NLP
spaCy is a library for advanced natural language processing in Python and
Cython. spaCy is built on the very latest research, but it isn't researchware.
It was designed from day one to be used in real products. spaCy currently supports
English, German and French, as well as tokenization for Spanish, Italian,
Portuguese, Dutch, Swedish, Finnish, Norwegian, Hungarian, Bengali, Hebrew,
Chinese and Japanese. It's commercial open-source software, released under the
MIT license.
📊 **Help us improve the library!** `Take the spaCy user survey <https://survey.spacy.io>`_.
English, German, French and Spanish, as well as tokenization for Italian,
Portuguese, Dutch, Swedish, Finnish, Norwegian, Danish, Hungarian, Polish,
Bengali, Hebrew, Chinese and Japanese. It's commercial open-source software,
released under the MIT license.
💫 **Version 1.8 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_
@ -85,7 +83,7 @@ Features
* GIL-free **multi-threading**
* Efficient binary serialization
* Easy **deep learning** integration
* Statistical models for **English** and **German**
* Statistical models for **English**, **German**, **French** and **Spanish**
* State-of-the-art speed
* Robust, rigorously evaluated accuracy
@ -197,7 +195,7 @@ To load a model, use ``spacy.load()`` with the model's shortcut link:
.. code:: python
import spacy
nlp = spacy.load('en_default')
nlp = spacy.load('en')
doc = nlp(u'This is a sentence.')
If you've installed a model via pip, you can also ``import`` it directly and
@ -313,7 +311,7 @@ and ``--model`` are optional and enable additional tests:
# make sure you are using recent pytest version
python -m pip install -U pytest
python -m pytest <spacy-directory> --vectors --models --slow
python -m pytest <spacy-directory>
🛠 Changelog
============


@ -1,68 +1,27 @@
from __future__ import unicode_literals, print_function
import json
import pathlib
import random
import spacy
from spacy.pipeline import EntityRecognizer
from spacy.gold import GoldParse
from spacy.tagger import Tagger
try:
unicode
except:
unicode = str
from spacy.lang.en import English
from spacy.gold import GoldParse, biluo_tags_from_offsets
def train_ner(nlp, train_data, entity_types):
# Add new words to vocab.
for raw_text, _ in train_data:
doc = nlp.make_doc(raw_text)
for word in doc:
_ = nlp.vocab[word.orth]
# Train NER.
ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
for itn in range(5):
random.shuffle(train_data)
for raw_text, entity_offsets in train_data:
doc = nlp.make_doc(raw_text)
gold = GoldParse(doc, entities=entity_offsets)
ner.update(doc, gold)
return ner
def save_model(ner, model_dir):
model_dir = pathlib.Path(model_dir)
if not model_dir.exists():
model_dir.mkdir()
assert model_dir.is_dir()
with (model_dir / 'config.json').open('wb') as file_:
data = json.dumps(ner.cfg)
if isinstance(data, unicode):
data = data.encode('utf8')
file_.write(data)
ner.model.dump(str(model_dir / 'model'))
if not (model_dir / 'vocab').exists():
(model_dir / 'vocab').mkdir()
ner.vocab.dump(str(model_dir / 'vocab' / 'lexemes.bin'))
with (model_dir / 'vocab' / 'strings.json').open('w', encoding='utf8') as file_:
ner.vocab.strings.dump(file_)
def reformat_train_data(tokenizer, examples):
"""Reformat data to match JSON format"""
output = []
for i, (text, entity_offsets) in enumerate(examples):
doc = tokenizer(text)
ner_tags = biluo_tags_from_offsets(tokenizer(text), entity_offsets)
words = [w.text for w in doc]
tags = ['-'] * len(doc)
heads = [0] * len(doc)
deps = [''] * len(doc)
sentence = (range(len(doc)), words, tags, heads, deps, ner_tags)
output.append((text, [(sentence, [])]))
return output
def main(model_dir=None):
nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)
# v1.1.2 onwards
if nlp.tagger is None:
print('---- WARNING ----')
print('Data directory not found')
print('please run: `python -m spacy.en.download --force all` for better performance')
print('Using feature templates for tagging')
print('-----------------')
nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)
train_data = [
(
'Who is Shaka Khan?',
@ -74,23 +33,35 @@ def main(model_dir=None):
(len('I like London and '), len('I like London and Berlin'), 'LOC')]
)
]
ner = train_ner(nlp, train_data, ['PERSON', 'LOC'])
doc = nlp.make_doc('Who is Shaka Khan?')
nlp.tagger(doc)
ner(doc)
for word in doc:
print(word.text, word.orth, word.lower, word.tag_, word.ent_type_, word.ent_iob)
if model_dir is not None:
save_model(ner, model_dir)
nlp = English(pipeline=['tensorizer', 'ner'])
get_data = lambda: reformat_train_data(nlp.tokenizer, train_data)
optimizer = nlp.begin_training(get_data)
for itn in range(100):
random.shuffle(train_data)
losses = {}
for raw_text, entity_offsets in train_data:
doc = nlp.make_doc(raw_text)
gold = GoldParse(doc, entities=entity_offsets)
nlp.update(
[doc], # Batch of Doc objects
[gold], # Batch of GoldParse objects
drop=0.5, # Dropout -- make it harder to memorise data
sgd=optimizer, # Callable to update weights
losses=losses)
print(losses)
print("Save to", model_dir)
nlp.to_disk(model_dir)
print("Load from", model_dir)
nlp = spacy.lang.en.English(pipeline=['tensorizer', 'ner'])
nlp.from_disk(model_dir)
for raw_text, _ in train_data:
doc = nlp(raw_text)
for word in doc:
print(word.text, word.ent_type_, word.ent_iob_)
if __name__ == '__main__':
main('ner')
import plac
plac.call(main)
# Who "" 2
# is "" 2
# Shaka "" PERSON 3


@ -0,0 +1,109 @@
from __future__ import unicode_literals
import plac
import random
import tqdm
from thinc.neural.optimizers import Adam
from thinc.neural.ops import NumpyOps
import thinc.extra.datasets
import spacy.lang.en
from spacy.gold import GoldParse, minibatch
from spacy.util import compounding
from spacy.pipeline import TextCategorizer
def train_textcat(tokenizer, textcat,
train_texts, train_cats, dev_texts, dev_cats,
n_iter=20):
'''
Train the TextCategorizer without associated pipeline.
'''
textcat.begin_training()
optimizer = Adam(NumpyOps(), 0.001)
train_docs = [tokenizer(text) for text in train_texts]
train_gold = [GoldParse(doc, cats=cats) for doc, cats in
zip(train_docs, train_cats)]
train_data = zip(train_docs, train_gold)
batch_sizes = compounding(4., 128., 1.001)
for i in range(n_iter):
losses = {}
train_data = tqdm.tqdm(train_data, leave=False) # Progress bar
for batch in minibatch(train_data, size=batch_sizes):
docs, golds = zip(*batch)
textcat.update((docs, None), golds, sgd=optimizer, drop=0.2,
losses=losses)
with textcat.model.use_params(optimizer.averages):
scores = evaluate(tokenizer, textcat, dev_texts, dev_cats)
yield losses['textcat'], scores
def evaluate(tokenizer, textcat, texts, cats):
docs = (tokenizer(text) for text in texts)
tp = 1e-8 # True positives
fp = 1e-8 # False positives
fn = 1e-8 # False negatives
tn = 1e-8 # True negatives
for i, doc in enumerate(textcat.pipe(docs)):
gold = cats[i]
for label, score in doc.cats.items():
if score >= 0.5 and label in gold:
tp += 1.
elif score >= 0.5 and label not in gold:
fp += 1.
elif score < 0.5 and label not in gold:
tn += 1
if score < 0.5 and label in gold:
fn += 1
precis = tp / (tp + fp)
recall = tp / (tp + fn)
fscore = 2 * (precis * recall) / (precis + recall)
return {'textcat_p': precis, 'textcat_r': recall, 'textcat_f': fscore}
def load_data():
# Partition off part of the train data --- avoid running experiments
# against test.
train_data, _ = thinc.extra.datasets.imdb()
random.shuffle(train_data)
texts, labels = zip(*train_data)
cats = [(['POSITIVE'] if y else []) for y in labels]
split = int(len(train_data) * 0.8)
train_texts = texts[:split]
train_cats = cats[:split]
dev_texts = texts[split:]
dev_cats = cats[split:]
return (train_texts, train_cats), (dev_texts, dev_cats)
def main(model_loc=None):
nlp = spacy.lang.en.English()
tokenizer = nlp.tokenizer
textcat = TextCategorizer(tokenizer.vocab, labels=['POSITIVE'])
print("Load IMDB data")
(train_texts, train_cats), (dev_texts, dev_cats) = load_data()
print("Itn.\tLoss\tP\tR\tF")
progress = '{i:d} {loss:.3f} {textcat_p:.3f} {textcat_r:.3f} {textcat_f:.3f}'
for i, (loss, scores) in enumerate(train_textcat(tokenizer, textcat,
train_texts, train_cats,
dev_texts, dev_cats, n_iter=20)):
print(progress.format(i=i, loss=loss, **scores))
# How to save, load and use
nlp.pipeline.append(textcat)
if model_loc is not None:
nlp.to_disk(model_loc)
nlp = spacy.load(model_loc)
doc = nlp(u'This movie sucked!')
print(doc.cats)
if __name__ == '__main__':
plac.call(main)


@ -3,8 +3,8 @@ pathlib
numpy>=1.7
cymem>=1.30,<1.32
preshed>=1.0.0,<2.0.0
thinc>=6.6.0,<6.7.0
murmurhash>=0.26,<0.27
thinc>=6.8.0,<6.9.0
murmurhash>=0.28,<0.29
plac<1.0.0,>=0.9.6
six
ujson>=1.35
@ -14,3 +14,6 @@ regex==2017.4.5
ftfy>=4.4.2,<5.0.0
pytest>=3.0.6,<4.0.0
pip>=9.0.0,<10.0.0
mock>=2.0.0,<3.0.0
msgpack-python
msgpack-numpy


@ -44,7 +44,8 @@ MOD_NAMES = [
'spacy.matcher',
'spacy.syntax.ner',
'spacy.symbols',
'spacy.syntax.iterators']
'spacy.vectors',
]
COMPILE_OPTIONS = {
@ -188,10 +189,10 @@ def setup_package():
ext_modules=ext_modules,
install_requires=[
'numpy>=1.7',
'murmurhash>=0.26,<0.27',
'murmurhash>=0.28,<0.29',
'cymem>=1.30,<1.32',
'preshed>=1.0.0,<2.0.0',
'thinc>=6.6.0,<6.7.0',
'thinc>=6.8.0,<6.9.0',
'plac<1.0.0,>=0.9.6',
'pip>=9.0.0,<10.0.0',
'six',
@ -200,7 +201,9 @@ def setup_package():
'dill>=0.2,<0.3',
'requests>=2.13.0,<3.0.0',
'regex==2017.4.5',
'ftfy>=4.4.2,<5.0.0'],
'ftfy>=4.4.2,<5.0.0',
'msgpack-python',
'msgpack-numpy'],
classifiers=[
'Development Status :: 5 - Production/Stable',
'Environment :: Console',


@ -1,22 +1,22 @@
# coding: utf8
from __future__ import unicode_literals
import importlib
from .compat import basestring_
from .cli.info import info
from .cli.info import info as cli_info
from .glossary import explain
from .deprecated import resolve_load_name
from .about import __version__
from . import util
def load(name, **overrides):
name = resolve_load_name(name, **overrides)
model_path = util.resolve_model_path(name)
meta = util.parse_package_meta(model_path)
if 'lang' not in meta:
raise IOError('No language setting found in model meta.')
cls = util.get_lang_class(meta['lang'])
overrides['meta'] = meta
overrides['path'] = model_path
return cls(**overrides)
return util.load_model(name, **overrides)
def blank(name, **kwargs):
LangClass = util.get_lang_class(name)
return LangClass(**kwargs)
def info(model=None, markdown=False):
return cli_info(None, model, markdown)
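A minimal usage sketch of the simplified top-level helpers above (assumes an English model has been installed and linked to the 'en' shortcut; the shortcut name is illustrative):

    import spacy

    nlp = spacy.load('en')            # resolved via util.load_model()
    blank_nlp = spacy.blank('en')     # bare Language class, no model data
    spacy.info('en', markdown=True)   # delegates to the CLI info command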


@ -3,135 +3,21 @@ from __future__ import print_function
# NB! This breaks in plac on Python 2!!
#from __future__ import unicode_literals
import plac
from spacy.cli import download as cli_download
from spacy.cli import link as cli_link
from spacy.cli import info as cli_info
from spacy.cli import package as cli_package
from spacy.cli import train as cli_train
from spacy.cli import model as cli_model
from spacy.cli import convert as cli_convert
class CLI(object):
"""
Command-line interface for spaCy
"""
commands = ('download', 'link', 'info', 'package', 'train', 'model', 'convert')
@plac.annotations(
model=("model to download (shortcut or model name)", "positional", None, str),
direct=("force direct download. Needs model name with version and won't "
"perform compatibility check", "flag", "d", bool)
)
def download(self, model, direct=False):
"""
Download compatible model from default download path using pip. Model
can be shortcut, model name or, if --direct flag is set, full model name
with version.
"""
cli_download(model, direct)
@plac.annotations(
origin=("package name or local path to model", "positional", None, str),
link_name=("name of shortuct link to create", "positional", None, str),
force=("force overwriting of existing link", "flag", "f", bool)
)
def link(self, origin, link_name, force=False):
"""
Create a symlink for models within the spacy/data directory. Accepts
either the name of a pip package, or the local path to the model data
directory. Linking models allows loading them via spacy.load(link_name).
"""
cli_link(origin, link_name, force)
@plac.annotations(
model=("optional: shortcut link of model", "positional", None, str),
markdown=("generate Markdown for GitHub issues", "flag", "md", str)
)
def info(self, model=None, markdown=False):
"""
Print info about spaCy installation. If a model shortcut link is
specified as an argument, print model information. Flag --markdown
prints details in Markdown for easy copy-pasting to GitHub issues.
"""
cli_info(model, markdown)
@plac.annotations(
input_dir=("directory with model data", "positional", None, str),
output_dir=("output parent directory", "positional", None, str),
meta=("path to meta.json", "option", "m", str),
force=("force overwriting of existing folder in output directory", "flag", "f", bool)
)
def package(self, input_dir, output_dir, meta=None, force=False):
"""
Generate Python package for model data, including meta and required
installation files. A new directory will be created in the specified
output directory, and model data will be copied over.
"""
cli_package(input_dir, output_dir, meta, force)
@plac.annotations(
lang=("model language", "positional", None, str),
output_dir=("output directory to store model in", "positional", None, str),
train_data=("location of JSON-formatted training data", "positional", None, str),
dev_data=("location of JSON-formatted development data (optional)", "positional", None, str),
n_iter=("number of iterations", "option", "n", int),
nsents=("number of sentences", "option", None, int),
parser_L1=("L1 regularization penalty for parser", "option", "L", float),
use_gpu=("Use GPU", "flag", "g", bool),
no_tagger=("Don't train tagger", "flag", "T", bool),
no_parser=("Don't train parser", "flag", "P", bool),
no_ner=("Don't train NER", "flag", "N", bool)
)
def train(self, lang, output_dir, train_data, dev_data=None, n_iter=15,
nsents=0, parser_L1=0.0, use_gpu=False,
no_tagger=False, no_parser=False, no_ner=False):
"""
Train a model. Expects data in spaCy's JSON format.
"""
nsents = nsents or None
cli_train(lang, output_dir, train_data, dev_data, n_iter, nsents,
use_gpu, not no_tagger, not no_parser, not no_ner, parser_L1)
@plac.annotations(
lang=("model language", "positional", None, str),
model_dir=("output directory to store model in", "positional", None, str),
freqs_data=("tab-separated frequencies file", "positional", None, str),
clusters_data=("Brown clusters file", "positional", None, str),
vectors_data=("word vectors file", "positional", None, str)
)
def model(self, lang, model_dir, freqs_data, clusters_data=None, vectors_data=None):
"""
Initialize a new model and its data directory.
"""
cli_model(lang, model_dir, freqs_data, clusters_data, vectors_data)
@plac.annotations(
input_file=("input file", "positional", None, str),
output_dir=("output directory for converted file", "positional", None, str),
n_sents=("Number of sentences per doc", "option", "n", float),
morphology=("Enable appending morphology to tags", "flag", "m", bool)
)
def convert(self, input_file, output_dir, n_sents=10, morphology=False):
"""
Convert files into JSON format for use with train command and other
experiment management functions.
"""
cli_convert(input_file, output_dir, n_sents, morphology)
def __missing__(self, name):
print("\n Command %r does not exist."
"\n Use the --help flag for a list of available commands.\n" % name)
if __name__ == '__main__':
import plac
import sys
sys.argv[0] = 'spacy'
plac.Interpreter.call(CLI)
from spacy.cli import download, link, info, package, train, convert
from spacy.util import prints
commands = {'download': download, 'link': link, 'info': info, 'train': train,
'convert': convert, 'package': package}
if len(sys.argv) == 1:
prints(', '.join(commands), title="Available commands", exits=1)
command = sys.argv.pop(1)
sys.argv[0] = 'spacy %s' % command
if command in commands:
plac.call(commands[command])
else:
prints("Available: %s" % ', '.join(commands),
title="Unknown command: %s" % command, exits=1)


@ -1,20 +1,107 @@
import ujson
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
from thinc.neural import Model, Maxout, Softmax, Affine
from thinc.neural._classes.hash_embed import HashEmbed
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.util import get_array_module
import random
import cytoolz
from thinc.neural._classes.convolution import ExtractWindow
from thinc.neural._classes.static_vectors import StaticVectors
from thinc.neural._classes.batchnorm import BatchNorm
from thinc.neural._classes.layernorm import LayerNorm as LN
from thinc.neural._classes.resnet import Residual
from thinc.neural import ReLu
from thinc.neural._classes.selu import SELU
from thinc import describe
from thinc.describe import Dimension, Synapses, Biases, Gradient
from thinc.neural._classes.affine import _set_dimensions_if_needed
from thinc.api import FeatureExtracter, with_getitem
from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool
from thinc.neural._classes.attention import ParametricAttention
from thinc.linear.linear import LinearModel
from thinc.api import uniqued, wrap, flatten_add_lengths
from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP
from .tokens.doc import Doc
import numpy
import io
@layerize
def _flatten_add_lengths(seqs, pad=0, drop=0.):
ops = Model.ops
lengths = ops.asarray([len(seq) for seq in seqs], dtype='i')
def finish_update(d_X, sgd=None):
return ops.unflatten(d_X, lengths, pad=pad)
X = ops.flatten(seqs, pad=pad)
return (X, lengths), finish_update
@layerize
def _logistic(X, drop=0.):
xp = get_array_module(X)
if not isinstance(X, xp.ndarray):
X = xp.asarray(X)
# Clip to range (-10, 10)
X = xp.minimum(X, 10., X)
X = xp.maximum(X, -10., X)
Y = 1. / (1. + xp.exp(-X))
def logistic_bwd(dY, sgd=None):
dX = dY * (Y * (1-Y))
return dX
return Y, logistic_bwd
@layerize
def add_tuples(X, drop=0.):
"""Give inputs of sequence pairs, where each sequence is (vals, length),
sum the values, returning a single sequence.
If input is:
((vals1, length), (vals2, length)
Output is:
(vals1+vals2, length)
vals are a single tensor for the whole batch.
"""
(vals1, length1), (vals2, length2) = X
assert length1 == length2
def add_tuples_bwd(dY, sgd=None):
return (dY, dY)
return (vals1+vals2, length1), add_tuples_bwd
def _zero_init(model):
def _zero_init_impl(self, X, y):
self.W.fill(0)
model.on_data_hooks.append(_zero_init_impl)
if model.W is not None:
model.W.fill(0.)
return model
@layerize
def _preprocess_doc(docs, drop=0.):
keys = [doc.to_array([LOWER]) for doc in docs]
keys = [a[:, 0] for a in keys]
ops = Model.ops
lengths = ops.asarray([arr.shape[0] for arr in keys])
keys = ops.xp.concatenate(keys)
vals = ops.allocate(keys.shape[0]) + 1
return (keys, vals, lengths), None
def _init_for_precomputed(W, ops):
if (W**2).sum() != 0.:
return
reshaped = W.reshape((W.shape[1], W.shape[0] * W.shape[2]))
ops.xavier_uniform_init(reshaped)
W[:] = reshaped.reshape(W.shape)
@describe.on_data(_set_dimensions_if_needed)
@ -23,8 +110,8 @@ import numpy
nF=Dimension("Number of features"),
nO=Dimension("Output size"),
W=Synapses("Weights matrix",
lambda obj: (obj.nO, obj.nF, obj.nI),
lambda W, ops: ops.xavier_uniform_init(W)),
lambda obj: (obj.nF, obj.nO, obj.nI),
lambda W, ops: _init_for_precomputed(W, ops)),
b=Biases("Bias vector",
lambda obj: (obj.nO,)),
d_W=Gradient("W"),
@ -39,25 +126,25 @@ class PrecomputableAffine(Model):
def begin_update(self, X, drop=0.):
# X: (b, i)
# Xf: (b, f, i)
# Yf: (b, f, i)
# dY: (b, o)
# dYf: (b, f, o)
#Yf = numpy.einsum('bi,ofi->bfo', X, self.W)
#Yf = numpy.einsum('bi,foi->bfo', X, self.W)
Yf = self.ops.xp.tensordot(
X, self.W, axes=[[1], [2]]).transpose((0, 2, 1))
X, self.W, axes=[[1], [2]])
Yf += self.b
def backward(dY_ids, sgd=None):
tensordot = self.ops.xp.tensordot
dY, ids = dY_ids
Xf = X[ids]
#dXf = numpy.einsum('bo,foi->bfi', dY, self.W)
dXf = tensordot(dY, self.W, axes=[[1], [1]])
#dW = numpy.einsum('bo,bfi->ofi', dY, Xf)
dW = self.ops.xp.tensordot(dY, Xf, axes=[[0], [0]])
db = dY.sum(axis=0)
#dXf = numpy.einsum('bo,ofi->bfi', dY, self.W)
dXf = self.ops.xp.tensordot(dY, self.W, axes=[[1], [0]])
self.d_W += dW
self.d_b += db
dW = tensordot(dY, Xf, axes=[[0], [0]])
# ofi -> foi
self.d_W += dW.transpose((1, 0, 2))
self.d_b += dY.sum(axis=0)
if sgd is not None:
sgd(self._mem.weights, self._mem.gradient, key=self.id)
@ -80,10 +167,10 @@ class PrecomputableAffine(Model):
d_b=Gradient("b")
)
class PrecomputableMaxouts(Model):
def __init__(self, nO=None, nI=None, nF=None, pieces=3, **kwargs):
def __init__(self, nO=None, nI=None, nF=None, nP=3, **kwargs):
Model.__init__(self, **kwargs)
self.nO = nO
self.nP = pieces
self.nP = nP
self.nI = nI
self.nF = nF
@ -120,38 +207,105 @@ class PrecomputableMaxouts(Model):
return dXf
return Yfp, backward
def Tok2Vec(width, embed_size, preprocess=None):
cols = [ID, LOWER, PREFIX, SUFFIX, SHAPE]
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
lower = get_col(cols.index(LOWER)) >> HashEmbed(width, embed_size)
prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2)
suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2)
shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2)
def Tok2Vec(width, embed_size, preprocess=None):
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower')
prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2, name='embed_prefix')
suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix')
shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2, name='embed_shape')
embed = (norm | prefix | suffix | shape )
tok2vec = (
flatten
>> (lower | prefix | suffix | shape )
>> Maxout(width, width*4, pieces=3)
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
with_flatten(
asarray(Model.ops, dtype='uint64')
>> uniqued(embed, column=5)
>> LN(Maxout(width, width*4, pieces=3))
>> Residual(ExtractWindow(nW=1) >> SELU(width, width*3))
>> Residual(ExtractWindow(nW=1) >> SELU(width, width*3))
>> Residual(ExtractWindow(nW=1) >> SELU(width, width*3))
>> Residual(ExtractWindow(nW=1) >> SELU(width, width*3)),
pad=4)
)
if preprocess not in (False, None):
tok2vec = preprocess >> tok2vec
# Work around thinc API limitations :(. TODO: Revise in Thinc 7
tok2vec.nO = width
tok2vec.embed = embed
return tok2vec
def get_col(idx):
def asarray(ops, dtype):
def forward(X, drop=0.):
return ops.asarray(X, dtype=dtype), None
return layerize(forward)
def foreach(layer):
def forward(Xs, drop=0.):
results = []
backprops = []
for X in Xs:
result, bp = layer.begin_update(X, drop=drop)
results.append(result)
backprops.append(bp)
def backward(d_results, sgd=None):
dXs = []
for d_result, backprop in zip(d_results, backprops):
dXs.append(backprop(d_result, sgd))
return dXs
return results, backward
model = layerize(forward)
model._layers.append(layer)
return model
def rebatch(size, layer):
ops = layer.ops
def forward(X, drop=0.):
if X.shape[0] < size:
return layer.begin_update(X)
parts = _divide_array(X, size)
results, bp_results = zip(*[layer.begin_update(p, drop=drop)
for p in parts])
y = ops.flatten(results)
def backward(dy, sgd=None):
d_parts = [bp(y, sgd=sgd) for bp, y in
zip(bp_results, _divide_array(dy, size))]
try:
dX = ops.flatten(d_parts)
except TypeError:
dX = None
except ValueError:
dX = None
return dX
return y, backward
model = layerize(forward)
model._layers.append(layer)
return model
def _divide_array(X, size):
parts = []
index = 0
while index < len(X):
parts.append(X[index : index + size])
index += size
return parts
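An illustrative sketch of the helpers above (the _demo function is hypothetical): _divide_array() chops a batch into fixed-size chunks, which rebatch() then feeds through the wrapped layer one piece at a time.

    def _demo_divide_array():
        sample = numpy.arange(10)
        # three chunks of at most four elements each
        return [part.tolist() for part in _divide_array(sample, 4)]
        # -> [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]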
def get_col(idx):
assert idx >= 0, idx
def forward(X, drop=0.):
assert idx >= 0, idx
if isinstance(X, numpy.ndarray):
ops = NumpyOps()
else:
ops = CupyOps()
output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype)
def backward(y, sgd=None):
assert idx >= 0, idx
dX = ops.allocate(X.shape)
dX[:, idx] += y
return dX
@ -167,21 +321,17 @@ def zero_init(model):
def doc2feats(cols=None):
cols = [ID, LOWER, PREFIX, SUFFIX, SHAPE]
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
def forward(docs, drop=0.):
feats = []
for doc in docs:
if 'cached_feats' not in doc.user_data:
doc.user_data['cached_feats'] = model.ops.asarray(
doc.to_array(cols),
dtype='uint64')
feats.append(doc.user_data['cached_feats'])
assert feats[-1].dtype == 'uint64'
feats.append(doc.to_array(cols))
return feats, None
model = layerize(forward)
model.cols = cols
return model
def print_shape(prefix):
def forward(X, drop=0.):
return X, lambda dX, **kwargs: dX
@ -197,6 +347,29 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.):
return vectors, backward
def fine_tune(embedding, combine=None):
if combine is not None:
raise NotImplementedError(
"fine_tune currently only supports addition. Set combine=None")
def fine_tune_fwd(docs_tokvecs, drop=0.):
docs, tokvecs = docs_tokvecs
lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i')
vecs, bp_vecs = embedding.begin_update(docs, drop=drop)
output = embedding.ops.unflatten(
embedding.ops.flatten(tokvecs)
+ embedding.ops.flatten(vecs),
lengths)
def fine_tune_bwd(d_output, sgd=None):
bp_vecs(d_output, sgd=sgd)
return d_output
return output, fine_tune_bwd
model = wrap(fine_tune_fwd, embedding)
return model
@layerize
def flatten(seqs, drop=0.):
if isinstance(seqs[0], numpy.ndarray):
@ -210,3 +383,95 @@ def flatten(seqs, drop=0.):
return ops.unflatten(d_X, lengths)
X = ops.xp.vstack(seqs)
return X, finish_update
@layerize
def logistic(X, drop=0.):
xp = get_array_module(X)
if not isinstance(X, xp.ndarray):
X = xp.asarray(X)
# Clip to range (-10, 10)
X = xp.minimum(X, 10., X)
X = xp.maximum(X, -10., X)
Y = 1. / (1. + xp.exp(-X))
def logistic_bwd(dY, sgd=None):
dX = dY * (Y * (1-Y))
return dX
return Y, logistic_bwd
def zero_init(model):
def _zero_init_impl(self, X, y):
self.W.fill(0)
model.on_data_hooks.append(_zero_init_impl)
return model
@layerize
def preprocess_doc(docs, drop=0.):
keys = [doc.to_array([LOWER]) for doc in docs]
keys = [a[:, 0] for a in keys]
ops = Model.ops
lengths = ops.asarray([arr.shape[0] for arr in keys])
keys = ops.xp.concatenate(keys)
vals = ops.allocate(keys.shape[0]) + 1
return (keys, vals, lengths), None
def getitem(i):
def getitem_fwd(X, drop=0.):
return X[i], None
return layerize(getitem_fwd)
def build_tagger_model(nr_class, token_vector_width, **cfg):
with Model.define_operators({'>>': chain, '+': add}):
# Input: (doc, tensor) tuples
private_tok2vec = Tok2Vec(token_vector_width, 7500, preprocess=doc2feats())
model = (
fine_tune(private_tok2vec)
>> with_flatten(
Maxout(token_vector_width, token_vector_width)
>> Softmax(nr_class, token_vector_width)
)
)
model.nI = None
return model
def build_text_classifier(nr_class, width=64, **cfg):
nr_vector = cfg.get('nr_vector', 200)
with Model.define_operators({'>>': chain, '+': add, '|': concatenate, '**': clone}):
embed_lower = HashEmbed(width, nr_vector, column=1)
embed_prefix = HashEmbed(width//2, nr_vector, column=2)
embed_suffix = HashEmbed(width//2, nr_vector, column=3)
embed_shape = HashEmbed(width//2, nr_vector, column=4)
cnn_model = (
FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE])
>> _flatten_add_lengths
>> with_getitem(0,
uniqued(
(embed_lower | embed_prefix | embed_suffix | embed_shape)
>> Maxout(width, width+(width//2)*3))
>> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3))
>> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3))
>> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3))
)
>> ParametricAttention(width,)
>> Pooling(sum_pool)
>> ReLu(width, width)
>> zero_init(Affine(nr_class, width, drop_factor=0.0))
)
linear_model = (
_preprocess_doc
>> LinearModel(nr_class, drop_factor=0.)
)
model = (
(linear_model | cnn_model)
>> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0))
>> logistic
)
model.lsuv = False
return model


@ -2,16 +2,16 @@
# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
__title__ = 'spacy'
__version__ = '1.8.2'
__title__ = 'spacy-nightly'
__version__ = '2.0.0a7'
__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
__uri__ = 'https://spacy.io'
__author__ = 'Matthew Honnibal'
__email__ = 'matt@explosion.ai'
__author__ = 'Explosion AI'
__email__ = 'contact@explosion.ai'
__license__ = 'MIT'
__docs_models__ = 'https://spacy.io/docs/usage/models'
__download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
__compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json'
__shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts.json'
__model_files__ = 'https://raw.githubusercontent.com/explosion/spacy-dev-resources/master/templates/model/'
__model_files__ = 'https://raw.githubusercontent.com/explosion/spacy-dev-resources/develop/templates/model/'


@ -83,6 +83,7 @@ cpdef enum attr_id_t:
ENT_IOB
ENT_TYPE
HEAD
SENT_START
SPACY
PROB


@ -85,6 +85,7 @@ IDS = {
"ENT_IOB": ENT_IOB,
"ENT_TYPE": ENT_TYPE,
"HEAD": HEAD,
"SENT_START": SENT_START,
"SPACY": SPACY,
"PROB": PROB,
"LANG": LANG,
@ -149,6 +150,9 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
else:
int_key = IDS[name.upper()]
if strings_map is not None and isinstance(value, basestring):
value = strings_map[value]
if hasattr(strings_map, 'add'):
value = strings_map.add(value)
else:
value = strings_map[value]
inty_attrs[int_key] = value
return inty_attrs
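A usage sketch of the new branch (the import paths are assumptions, not shown in this diff): string values are now interned through StringStore.add() when the map supports it, instead of requiring the value to be present already.

    from spacy.attrs import ORTH, intify_attrs
    from spacy.strings import StringStore

    strings = StringStore()
    attrs = intify_attrs({'ORTH': 'apple'}, strings_map=strings)
    # {ORTH: <string id for 'apple'>} -- 'apple' is added to the store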


@ -2,6 +2,5 @@ from .download import download
from .info import info
from .link import link
from .package import package
from .train import train, train_config
from .model import model
from .train import train
from .convert import convert


@ -1,31 +1,43 @@
# coding: utf8
from __future__ import unicode_literals
import plac
from pathlib import Path
from .converters import conllu2json
from .converters import conllu2json, iob2json
from ..util import prints
# Converters are matched by file extension. To add a converter, add a new entry
# to this dict with the file extension mapped to the converter function imported
# from /converters.
CONVERTERS = {
'.conllu': conllu2json,
'.conll': conllu2json
'.conll': conllu2json,
'.iob': iob2json
}
def convert(input_file, output_dir, *args):
@plac.annotations(
input_file=("input file", "positional", None, str),
output_dir=("output directory for converted file", "positional", None, str),
n_sents=("Number of sentences per doc", "option", "n", float),
morphology=("Enable appending morphology to tags", "flag", "m", bool)
)
def convert(cmd, input_file, output_dir, n_sents, morphology):
"""
Convert files into JSON format for use with train command and other
experiment management functions.
"""
input_path = Path(input_file)
output_path = Path(output_dir)
if not input_path.exists():
prints(input_path, title="Input file not found", exits=True)
prints(input_path, title="Input file not found", exits=1)
if not output_path.exists():
prints(output_path, title="Output directory not found", exits=True)
prints(output_path, title="Output directory not found", exits=1)
file_ext = input_path.suffix
if not file_ext in CONVERTERS:
prints("Can't find converter for %s" % input_path.parts[-1],
title="Unknown format", exits=True)
CONVERTERS[file_ext](input_path, output_path, *args)
title="Unknown format", exits=1)
CONVERTERS[file_ext](input_path, output_path,
n_sents=n_sents, use_morphology=morphology)


@ -1 +1,2 @@
from .conllu2json import conllu2json
from .iob2json import iob2json


@ -73,10 +73,10 @@ def generate_sentence(sent):
tokens = []
for i, id in enumerate(id_):
token = {}
token["orth"] = word[id]
token["tag"] = tag[id]
token["head"] = head[id] - i
token["dep"] = dep[id]
token["orth"] = word[i]
token["tag"] = tag[i]
token["head"] = head[i] - id
token["dep"] = dep[i]
tokens.append(token)
sentence["tokens"] = tokens
return sentence


@ -0,0 +1,45 @@
# coding: utf8
from __future__ import unicode_literals
from ...compat import json_dumps, path2str
from ...util import prints
from ...gold import iob_to_biluo
def iob2json(input_path, output_path, n_sents=10, *a, **k):
"""
Convert IOB files into JSON format for use with train cli.
"""
# TODO: This isn't complete yet -- need to map from IOB to
# BILUO
with input_path.open('r', encoding='utf8') as file_:
docs = read_iob(file_)
output_filename = input_path.parts[-1].replace(".iob", ".json")
output_file = output_path / output_filename
with output_file.open('w', encoding='utf-8') as f:
f.write(json_dumps(docs))
prints("Created %d documents" % len(docs),
title="Generated output file %s" % path2str(output_file))
def read_iob(file_):
sentences = []
for line in file_:
if not line.strip():
continue
tokens = [t.split('|') for t in line.split()]
if len(tokens[0]) == 3:
words, pos, iob = zip(*tokens)
else:
words, iob = zip(*tokens)
pos = ['-'] * len(words)
biluo = iob_to_biluo(iob)
sentences.append([
{'orth': w, 'tag': p, 'ner': ent}
for (w, p, ent) in zip(words, pos, biluo)
])
sentences = [{'tokens': sent} for sent in sentences]
paragraphs = [{'sentences': [sent]} for sent in sentences]
docs = [{'id': 0, 'paragraphs': [para]} for para in paragraphs]
return docs
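A small sketch of the expected input, inferred from read_iob() above: one sentence per line, tokens separated by whitespace, fields by '|', with the POS column optional (the demo helper is hypothetical):

    def _demo_read_iob():
        from io import StringIO
        sample = StringIO(u"I|PRP|O like|VBP|O London|NNP|B-GPE\n")
        docs = read_iob(sample)
        # single-token entities come out as 'U-GPE' after the BILUO conversion
        return docs[0]['paragraphs'][0]['sentences'][0]['tokens']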


@ -1,6 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
import plac
import requests
import os
import subprocess
@ -11,7 +12,17 @@ from ..util import prints
from .. import about
def download(model, direct=False):
@plac.annotations(
model=("model to download (shortcut or model name)", "positional", None, str),
direct=("force direct download. Needs model name with version and won't "
"perform compatibility check", "flag", "d", bool)
)
def download(cmd, model, direct=False):
"""
Download compatible model from default download path using pip. Model
can be shortcut, model name or, if --direct flag is set, full model name
with version.
"""
if direct:
download_model('{m}/{m}.tar.gz'.format(m=model))
else:
@ -20,7 +31,17 @@ def download(model, direct=False):
compatibility = get_compatibility()
version = get_version(model_name, compatibility)
download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
link(model_name, model, force=True)
try:
link(None, model_name, model, force=True)
except:
# Dirty, but since spacy.download and the auto-linking are mostly
# convenience wrappers, it's best to show a success message and
# loading instructions, even if linking fails.
prints("Creating a shortcut link for 'en' didn't work (maybe you "
"don't have admin permissions?), but you can still load "
"the model via its full package name:",
"nlp = spacy.load('%s')" % model_name,
title="Download successful")
def get_json(url, desc):
@ -28,7 +49,7 @@ def get_json(url, desc):
if r.status_code != 200:
prints("Couldn't fetch %s. Please find a model for your spaCy installation "
"(v%s), and download it manually." % (desc, about.__version__),
about.__docs_models__, title="Server error (%d)" % r.status_code, exits=True)
about.__docs_models__, title="Server error (%d)" % r.status_code, exits=1)
return r.json()
@ -38,7 +59,7 @@ def get_compatibility():
comp = comp_table['spacy']
if version not in comp:
prints("No compatible models found for v%s of spaCy." % version,
title="Compatibility error", exits=True)
title="Compatibility error", exits=1)
return comp[version]
@ -46,7 +67,7 @@ def get_version(model, comp):
if model not in comp:
version = about.__version__
prints("No compatible model found for '%s' (spaCy v%s)." % (model, version),
title="Compatibility error", exits=True)
title="Compatibility error", exits=1)
return comp[model][0]


@ -1,6 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
import plac
import platform
from pathlib import Path
@ -9,17 +10,30 @@ from .. import about
from .. import util
def info(model=None, markdown=False):
@plac.annotations(
model=("optional: shortcut link of model", "positional", None, str),
markdown=("generate Markdown for GitHub issues", "flag", "md", str)
)
def info(cmd, model=None, markdown=False):
"""Print info about spaCy installation. If a model shortcut link is
specified as an argument, print model information. Flag --markdown
prints details in Markdown for easy copy-pasting to GitHub issues.
"""
if model:
data_path = util.get_data_path()
data = util.parse_package_meta(data_path / model, require=True)
model_path = Path(__file__).parent / data_path / model
if model_path.resolve() != model_path:
data['link'] = path2str(model_path)
data['source'] = path2str(model_path.resolve())
if util.is_package(model):
model_path = util.get_package_path(model)
else:
data['source'] = path2str(model_path)
print_info(data, 'model %s' % model, markdown)
model_path = util.get_data_path() / model
meta_path = model_path / 'meta.json'
if not meta_path.is_file():
util.prints(meta_path, title="Can't find model meta.json", exits=1)
meta = util.read_json(meta_path)
if model_path.resolve() != model_path:
meta['link'] = path2str(model_path)
meta['source'] = path2str(model_path.resolve())
else:
meta['source'] = path2str(model_path)
print_info(meta, 'model %s' % model, markdown)
else:
data = {'spaCy version': about.__version__,
'Location': path2str(Path(__file__).parent.parent),


@ -1,24 +1,36 @@
# coding: utf8
from __future__ import unicode_literals
import plac
from pathlib import Path
from ..compat import symlink_to, path2str
from ..util import prints
from .. import util
def link(origin, link_name, force=False):
@plac.annotations(
origin=("package name or local path to model", "positional", None, str),
link_name=("name of shortuct link to create", "positional", None, str),
force=("force overwriting of existing link", "flag", "f", bool)
)
def link(cmd, origin, link_name, force=False):
"""
Create a symlink for models within the spacy/data directory. Accepts
either the name of a pip package, or the local path to the model data
directory. Linking models allows loading them via spacy.load(link_name).
"""
if util.is_package(origin):
model_path = util.get_model_package_path(origin)
model_path = util.get_package_path(origin)
else:
model_path = Path(origin)
if not model_path.exists():
prints("The data should be located in %s" % path2str(model_path),
title="Can't locate model data", exits=True)
title="Can't locate model data", exits=1)
link_path = util.get_data_path() / link_name
if link_path.exists() and not force:
prints("To overwrite an existing link, use the --force flag.",
title="Link %s already exists" % link_name, exits=True)
title="Link %s already exists" % link_name, exits=1)
elif link_path.exists():
link_path.unlink()
try:
@ -33,5 +45,5 @@ def link(origin, link_name, force=False):
title="Error: Couldn't link model to '%s'" % link_name)
raise
prints("%s --> %s" % (path2str(model_path), path2str(link_path)),
"You can now load the model via spacy.load('%s')." % link_name,
"You can now load the model via spacy.load('%s')" % link_name,
title="Linking successful")


@ -1,122 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
import gzip
import math
from ast import literal_eval
from preshed.counter import PreshCounter
from ..vocab import write_binary_vectors
from ..compat import fix_text, path2str
from ..util import prints
from .. import util
def model(lang, model_dir, freqs_data, clusters_data, vectors_data):
model_path = util.ensure_path(model_dir)
freqs_path = util.ensure_path(freqs_data)
clusters_path = util.ensure_path(clusters_data)
vectors_path = util.ensure_path(vectors_data)
if not freqs_path.is_file():
prints(freqs_path, title="No frequencies file found", exits=True)
if clusters_path and not clusters_path.is_file():
prints(clusters_path, title="No Brown clusters file found", exits=True)
if vectors_path and not vectors_path.is_file():
prints(vectors_path, title="No word vectors file found", exits=True)
vocab = util.get_lang_class(lang).Defaults.create_vocab()
probs, oov_prob = read_probs(freqs_path)
clusters = read_clusters(clusters_path) if clusters_path else {}
populate_vocab(vocab, clusters, probs, oov_prob)
create_model(model_path, vectors_path, vocab, oov_prob)
def create_model(model_path, vectors_path, vocab, oov_prob):
vocab_path = model_path / 'vocab'
lexemes_path = vocab_path / 'lexemes.bin'
strings_path = vocab_path / 'strings.json'
oov_path = vocab_path / 'oov_prob'
if not model_path.exists():
model_path.mkdir()
if not vocab_path.exists():
vocab_path.mkdir()
vocab.dump(path2str(lexemes_path))
with strings_path.open('w') as f:
vocab.strings.dump(f)
with oov_path.open('w') as f:
f.write('%f' % oov_prob)
if vectors_path:
vectors_dest = vocab_path / 'vec.bin'
write_binary_vectors(path2str(vectors_path), path2str(vectors_dest))
def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
counts = PreshCounter()
total = 0
freqs_file = check_unzip(freqs_path)
for i, line in enumerate(freqs_file):
freq, doc_freq, key = line.rstrip().split('\t', 2)
freq = int(freq)
counts.inc(i+1, freq)
total += freq
counts.smooth()
log_total = math.log(total)
freqs_file = check_unzip(freqs_path)
probs = {}
for line in freqs_file:
freq, doc_freq, key = line.rstrip().split('\t', 2)
doc_freq = int(doc_freq)
freq = int(freq)
if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
word = literal_eval(key)
smooth_count = counts.smoother(int(freq))
probs[word] = math.log(smooth_count) - log_total
oov_prob = math.log(counts.smoother(0)) - log_total
return probs, oov_prob
def read_clusters(clusters_path):
clusters = {}
with clusters_path.open() as f:
for line in f:
try:
cluster, word, freq = line.split()
word = fix_text(word)
except ValueError:
continue
# If the clusterer has only seen the word a few times, its
# cluster is unreliable.
if int(freq) >= 3:
clusters[word] = cluster
else:
clusters[word] = '0'
# Expand clusters with re-casing
for word, cluster in list(clusters.items()):
if word.lower() not in clusters:
clusters[word.lower()] = cluster
if word.title() not in clusters:
clusters[word.title()] = cluster
if word.upper() not in clusters:
clusters[word.upper()] = cluster
return clusters
def populate_vocab(vocab, clusters, probs, oov_prob):
for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
lexeme = vocab[word]
lexeme.prob = prob
lexeme.is_oov = False
# Decode as a little-endian string, so that we can do & 15 to get
# the first 4 bits. See _parse_features.pyx
if word in clusters:
lexeme.cluster = int(clusters[word][::-1], 2)
else:
lexeme.cluster = 0
def check_unzip(file_path):
file_path_str = path2str(file_path)
if file_path_str.endswith('gz'):
return gzip.open(file_path_str)
else:
return file_path.open()


@ -1,6 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
import plac
import shutil
import requests
from pathlib import Path
@ -11,27 +12,38 @@ from .. import util
from .. import about
def package(input_dir, output_dir, meta_path, force):
@plac.annotations(
input_dir=("directory with model data", "positional", None, str),
output_dir=("output parent directory", "positional", None, str),
meta=("path to meta.json", "option", "m", str),
force=("force overwriting of existing folder in output directory", "flag", "f", bool)
)
def package(cmd, input_dir, output_dir, meta=None, force=False):
"""
Generate Python package for model data, including meta and required
installation files. A new directory will be created in the specified
output directory, and model data will be copied over.
"""
input_path = util.ensure_path(input_dir)
output_path = util.ensure_path(output_dir)
meta_path = util.ensure_path(meta_path)
meta_path = util.ensure_path(meta)
if not input_path or not input_path.exists():
prints(input_path, title="Model directory not found", exits=True)
prints(input_path, title="Model directory not found", exits=1)
if not output_path or not output_path.exists():
prints(output_path, title="Output directory not found", exits=True)
prints(output_path, title="Output directory not found", exits=1)
if meta_path and not meta_path.exists():
prints(meta_path, title="meta.json not found", exits=True)
prints(meta_path, title="meta.json not found", exits=1)
template_setup = get_template('setup.py')
template_manifest = get_template('MANIFEST.in')
template_init = get_template('en_model_name/__init__.py')
template_init = get_template('xx_model_name/__init__.py')
meta_path = meta_path or input_path / 'meta.json'
if meta_path.is_file():
prints(meta_path, title="Reading meta.json from file")
meta = util.read_json(meta_path)
else:
meta = generate_meta()
validate_meta(meta, ['lang', 'name', 'version'])
meta = validate_meta(meta, ['lang', 'name', 'version'])
model_name = meta['lang'] + '_' + meta['name']
model_name_v = model_name + '-' + meta['version']
@ -55,7 +67,7 @@ def create_dirs(package_path, force):
else:
prints(package_path, "Please delete the directory and try again, or "
"use the --force flag to overwrite existing directories.",
title="Package directory already exists", exits=True)
title="Package directory already exists", exits=1)
Path.mkdir(package_path, parents=True)
@ -68,31 +80,45 @@ def generate_meta():
settings = [('lang', 'Model language', 'en'),
('name', 'Model name', 'model'),
('version', 'Model version', '0.0.0'),
('spacy_version', 'Required spaCy version', '>=2.0.0,<3.0.0'),
('spacy_version', 'Required spaCy version', '>=%s,<3.0.0' % about.__version__),
('description', 'Model description', False),
('author', 'Author', False),
('email', 'Author email', False),
('url', 'Author website', False),
('license', 'License', 'CC BY-NC 3.0')]
prints("Enter the package settings for your model.", title="Generating meta.json")
meta = {}
for setting, desc, default in settings:
response = util.get_raw_input(desc, default)
meta[setting] = default if response == '' and default else response
meta['pipeline'] = generate_pipeline()
if about.__title__ != 'spacy':
meta['parent_package'] = about.__title__
return meta
def generate_pipeline():
prints("If set to 'True', the default pipeline is used. If set to 'False', "
"the pipeline will be disabled. Components should be specified as a "
"comma-separated list of component names, e.g. vectorizer, tagger, "
"parser, ner. For more information, see the docs on processing pipelines.",
title="Enter your model's pipeline components")
pipeline = util.get_raw_input("Pipeline components", True)
replace = {'True': True, 'False': False}
return replace[pipeline] if pipeline in replace else pipeline.split(', ')
def validate_meta(meta, keys):
for key in keys:
if key not in meta or meta[key] == '':
prints("This setting is required to build your package.",
title='No "%s" setting found in meta.json' % key, exits=True)
title='No "%s" setting found in meta.json' % key, exits=1)
return meta
def get_template(filepath):
r = requests.get(about.__model_files__ + filepath)
if r.status_code != 200:
prints("Couldn't fetch template files from GitHub.",
title="Server error (%d)" % r.status_code, exits=True)
title="Server error (%d)" % r.status_code, exits=1)
return r.text
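A direct-call sketch of the new signature (directories are hypothetical; with meta=None the command falls back to input_dir/meta.json or the interactive prompt shown above):

    package(None, '/models/en_demo', '/packages', meta=None, force=True)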


@ -1,132 +1,153 @@
# coding: utf8
from __future__ import unicode_literals, division, print_function
import plac
import json
from collections import defaultdict
import cytoolz
from pathlib import Path
import dill
import tqdm
from thinc.neural.optimizers import linear_decay
from timeit import default_timer as timer
from ..tokens.doc import Doc
from ..scorer import Scorer
from ..gold import GoldParse, merge_sents
from ..gold import read_json_file as read_gold_json
from ..gold import GoldCorpus, minibatch
from ..util import prints
from .. import util
from .. import displacy
from ..compat import json_dumps
def train(language, output_dir, train_data, dev_data, n_iter, n_sents,
use_gpu, tagger, parser, ner, parser_L1):
@plac.annotations(
lang=("model language", "positional", None, str),
output_dir=("output directory to store model in", "positional", None, str),
train_data=("location of JSON-formatted training data", "positional", None, str),
dev_data=("location of JSON-formatted development data (optional)", "positional", None, str),
n_iter=("number of iterations", "option", "n", int),
n_sents=("number of sentences", "option", "ns", int),
use_gpu=("Use GPU", "option", "g", int),
resume=("Whether to resume training", "flag", "R", bool),
no_tagger=("Don't train tagger", "flag", "T", bool),
no_parser=("Don't train parser", "flag", "P", bool),
no_entities=("Don't train NER", "flag", "N", bool)
)
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False):
"""
Train a model. Expects data in spaCy's JSON format.
"""
util.set_env_log(True)
n_sents = n_sents or None
output_path = util.ensure_path(output_dir)
train_path = util.ensure_path(train_data)
dev_path = util.ensure_path(dev_data)
if not output_path.exists():
prints(output_path, title="Output directory not found", exits=True)
output_path.mkdir()
if not train_path.exists():
prints(train_path, title="Training data not found", exits=True)
prints(train_path, title="Training data not found", exits=1)
if dev_path and not dev_path.exists():
prints(dev_path, title="Development data not found", exits=True)
prints(dev_path, title="Development data not found", exits=1)
lang = util.get_lang_class(language)
parser_cfg = {
'pseudoprojective': True,
'L1': parser_L1,
'n_iter': n_iter,
'lang': language,
'features': lang.Defaults.parser_features}
entity_cfg = {
'n_iter': n_iter,
'lang': language,
'features': lang.Defaults.entity_features}
tagger_cfg = {
'n_iter': n_iter,
'lang': language,
'features': lang.Defaults.tagger_features}
gold_train = list(read_gold_json(train_path, limit=n_sents))
gold_dev = list(read_gold_json(dev_path, limit=n_sents)) if dev_path else None
lang_class = util.get_lang_class(lang)
train_model(lang, gold_train, gold_dev, output_path, n_iter, use_gpu=use_gpu)
if gold_dev:
scorer = evaluate(lang, gold_dev, output_path)
print_results(scorer)
pipeline = ['token_vectors', 'tags', 'dependencies', 'entities']
if no_tagger and 'tags' in pipeline: pipeline.remove('tags')
if no_parser and 'dependencies' in pipeline: pipeline.remove('dependencies')
if no_entities and 'entities' in pipeline: pipeline.remove('entities')
# Take dropout and batch size as generators of values -- dropout
# starts high and decays sharply, to force the optimizer to explore.
# Batch size starts at 1 and grows, so that we make updates quickly
# at the beginning of training.
dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
util.env_opt('dropout_to', 0.2),
util.env_opt('dropout_decay', 0.0))
batch_sizes = util.compounding(util.env_opt('batch_from', 1),
util.env_opt('batch_to', 64),
util.env_opt('batch_compound', 1.001))
if resume:
prints(output_path / 'model19.pickle', title="Resuming training")
nlp = dill.load((output_path / 'model19.pickle').open('rb'))
else:
nlp = lang_class(pipeline=pipeline)
corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
n_train_words = corpus.count_train()
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
try:
for i in range(n_iter):
if resume:
i += 20
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
train_docs = corpus.train_docs(nlp, projectivize=True,
gold_preproc=False, max_length=0)
losses = {}
for batch in minibatch(train_docs, size=batch_sizes):
docs, golds = zip(*batch)
nlp.update(docs, golds, sgd=optimizer,
drop=next(dropout_rates), losses=losses,
update_tensors=True)
pbar.update(sum(len(doc) for doc in docs))
with nlp.use_params(optimizer.averages):
util.set_env_log(False)
epoch_model_path = output_path / ('model%d' % i)
nlp.to_disk(epoch_model_path)
with (output_path / ('model%d.pickle' % i)).open('wb') as file_:
dill.dump(nlp, file_, -1)
nlp_loaded = lang_class(pipeline=pipeline)
nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
scorer = nlp_loaded.evaluate(
corpus.dev_docs(
nlp_loaded,
gold_preproc=False))
acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
with acc_loc.open('w') as file_:
file_.write(json_dumps(scorer.scores))
util.set_env_log(True)
print_progress(i, losses, scorer.scores)
finally:
print("Saving model...")
with (output_path / 'model-final.pickle').open('wb') as file_:
with nlp.use_params(optimizer.averages):
dill.dump(nlp, file_, -1)
def train_config(config):
config_path = util.ensure_path(config)
if not config_path.is_file():
prints(config_path, title="Config file not found", exits=True)
config = json.load(config_path)
for setting in []:
if setting not in config.keys():
prints("%s not found in config file." % setting, title="Missing setting")
def _render_parses(i, to_render):
to_render[0].user_data['title'] = "Batch %d" % i
with Path('/tmp/entities.html').open('w') as file_:
html = displacy.render(to_render[:5], style='ent', page=True)
file_.write(html)
with Path('/tmp/parses.html').open('w') as file_:
html = displacy.render(to_render[:5], style='dep', page=True)
file_.write(html)
def train_model(Language, train_data, dev_data, output_path, n_iter, **cfg):
print("Itn.\tDep. Loss\tUAS\tNER F.\tTag %\tToken %")
nlp = Language(pipeline=['token_vectors', 'tags', 'dependencies'])
dropout = util.env_opt('dropout', 0.0)
# TODO: Get spaCy using Thinc's trainer and optimizer
with nlp.begin_training(train_data, **cfg) as (trainer, optimizer):
for itn, epoch in enumerate(trainer.epochs(n_iter, gold_preproc=True)):
losses = defaultdict(float)
to_render = []
for i, (docs, golds) in enumerate(epoch):
state = nlp.update(docs, golds, drop=dropout, sgd=optimizer)
losses['dep_loss'] += state.get('parser_loss', 0.0)
losses['tag_loss'] += state.get('tag_loss', 0.0)
to_render.insert(0, nlp(docs[-1].text))
to_render[0].user_data['title'] = "Batch %d" % i
with Path('/tmp/entities.html').open('w') as file_:
html = displacy.render(to_render[:5], style='ent', page=True)
file_.write(html)
with Path('/tmp/parses.html').open('w') as file_:
html = displacy.render(to_render[:5], style='dep', page=True)
file_.write(html)
if dev_data:
with nlp.use_params(optimizer.averages):
dev_scores = trainer.evaluate(dev_data).scores
else:
dev_scores = defaultdict(float)
print_progress(itn, losses, dev_scores)
with (output_path / 'model.bin').open('wb') as file_:
dill.dump(nlp, file_, -1)
#nlp.to_disk(output_path, tokenizer=False)
def evaluate(Language, gold_tuples, path):
with (path / 'model.bin').open('rb') as file_:
nlp = dill.load(file_)
# TODO:
# 1. This code is duplicate with spacy.train.Trainer.evaluate
# 2. There's currently a semantic difference between pipe and
# not pipe! It matters whether we batch the inputs. Must fix!
all_docs = []
all_golds = []
for raw_text, paragraph_tuples in dev_sents:
if gold_preproc:
raw_text = None
else:
paragraph_tuples = merge_sents(paragraph_tuples)
docs = self.make_docs(raw_text, paragraph_tuples)
golds = self.make_golds(docs, paragraph_tuples)
all_docs.extend(docs)
all_golds.extend(golds)
scorer = Scorer()
for doc, gold in zip(self.nlp.pipe(all_docs), all_golds):
scorer.score(doc, gold)
return scorer
def print_progress(itn, losses, dev_scores):
# TODO: Fix!
def print_progress(itn, losses, dev_scores, wps=0.0):
scores = {}
for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc', 'ents_f']:
for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
'ents_p', 'ents_r', 'ents_f', 'wps']:
scores[col] = 0.0
scores.update(losses)
scores['dep_loss'] = losses.get('parser', 0.0)
scores['tag_loss'] = losses.get('tagger', 0.0)
scores.update(dev_scores)
tpl = '{:d}\t{dep_loss:.3f}\t{tag_loss:.3f}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}'
scores['wps'] = wps
tpl = '\t'.join((
'{:d}',
'{dep_loss:.3f}',
'{uas:.3f}',
'{ents_p:.3f}',
'{ents_r:.3f}',
'{ents_f:.3f}',
'{tags_acc:.3f}',
'{token_acc:.3f}',
'{wps:.1f}'))
print(tpl.format(itn, **scores))
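A direct-call sketch of the reworked entry point (paths are hypothetical; the training and development files must be in spaCy's JSON format, and the no_* flags trim the pipeline as shown above):

    train(None, 'en', '/models/demo', 'train.json', 'dev.json',
          n_iter=10, use_gpu=-1, no_parser=True)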


@ -5,6 +5,9 @@ import six
import ftfy
import sys
import ujson
import itertools
from thinc.neural.util import copy_array
try:
import cPickle as pickle
@ -32,6 +35,8 @@ copy_reg = copy_reg
CudaStream = CudaStream
cupy = cupy
fix_text = ftfy.fix_text
copy_array = copy_array
izip = getattr(itertools, 'izip', zip)
is_python2 = six.PY2
is_python3 = six.PY3
@ -57,6 +62,19 @@ elif is_python3:
path2str = lambda path: str(path)
def b_to_str(b_str):
if is_python2:
return b_str
# important: if no encoding is set, string becomes "b'...'"
return str(b_str, encoding='utf8')
def getattr_(obj, name, *default):
if is_python3 and isinstance(name, bytes):
name = name.decode('utf8')
return getattr(obj, name, *default)
def symlink_to(orig, dest):
if is_python2 and is_windows:
import subprocess
@ -71,3 +89,16 @@ def is_config(python2=None, python3=None, windows=None, linux=None, osx=None):
(windows == None or windows == is_windows) and
(linux == None or linux == is_linux) and
(osx == None or osx == is_osx))
def normalize_string_keys(old):
'''Given a dictionary, make sure keys are unicode strings, not bytes.'''
new = {}
for key, value in old.items():
if isinstance(key, bytes_):
new[key.decode('utf8')] = value
else:
new[key] = value
return new
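A minimal usage sketch for normalize_string_keys, assuming it is exposed as spacy.compat.normalize_string_keys; the byte-string key below is only there to illustrate what the helper guards against.

    from spacy.compat import normalize_string_keys

    # mixed bytes/unicode keys come back as unicode-only keys
    mixed = {b'lang': 'en', 'pipeline': ['tagger', 'parser']}
    clean = normalize_string_keys(mixed)
    assert set(clean.keys()) == {'lang', 'pipeline'}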

View File

@ -3,6 +3,7 @@ from __future__ import unicode_literals
from .render import DependencyRenderer, EntityRenderer
from ..tokens import Doc
from ..compat import b_to_str
from ..util import prints, is_in_jupyter
@ -10,27 +11,28 @@ _html = {}
IS_JUPYTER = is_in_jupyter()
def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER, options={}):
def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
options={}, manual=False):
"""Render displaCy visualisation.
docs (list or Doc): Document(s) to visualise.
style (unicode): Visualisation style, 'dep' or 'ent'.
page (bool): Render markup as full HTML page.
minify (bool): Minify HTML markup.
jupyter (bool): Experimental, use Jupyter's display() to output markup.
jupyter (bool): Experimental, use Jupyter's `display()` to output markup.
options (dict): Visualiser-specific options, e.g. colors.
manual (bool): Don't parse `Doc` and instead, expect a dict or list of dicts.
RETURNS (unicode): Rendered HTML markup.
"""
if isinstance(docs, Doc):
docs = [docs]
if style == 'dep':
renderer = DependencyRenderer(options=options)
parsed = [parse_deps(doc, options) for doc in docs]
elif style == 'ent':
renderer = EntityRenderer(options=options)
parsed = [parse_ents(doc, options) for doc in docs]
else:
factories = {'dep': (DependencyRenderer, parse_deps),
'ent': (EntityRenderer, parse_ents)}
if style not in factories:
raise ValueError("Unknown style: %s" % style)
if isinstance(docs, Doc) or isinstance(docs, dict):
docs = [docs]
renderer, converter = factories[style]
renderer = renderer(options=options)
parsed = [converter(doc, options) for doc in docs] if not manual else docs
_html['parsed'] = renderer.render(parsed, page=page, minify=minify).strip()
html = _html['parsed']
if jupyter: # return HTML rendered by IPython display()
@ -39,7 +41,8 @@ def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER, opti
return html
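Outside the diff, a brief usage sketch of the new manual flag: a pre-built dict in the 'ent' format is rendered directly instead of being parsed from a Doc (the text and offsets below are made up for illustration).

    from spacy import displacy

    doc_data = {'text': 'But Google is starting from behind.',
                'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}],
                'title': None}
    html = displacy.render(doc_data, style='ent', manual=True, page=True)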
def serve(docs, style='dep', page=True, minify=False, options={}, port=5000):
def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
port=5000):
"""Serve displaCy visualisation.
docs (list or Doc): Document(s) to visualise.
@ -47,27 +50,36 @@ def serve(docs, style='dep', page=True, minify=False, options={}, port=5000):
page (bool): Render markup as full HTML page.
minify (bool): Minify HTML markup.
options (dict): Visualiser-specific options, e.g. colors.
manual (bool): Don't parse `Doc` and instead, expect a dict or list of dicts.
port (int): Port to serve visualisation.
"""
from wsgiref import simple_server
render(docs, style=style, page=page, minify=minify, options=options)
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
httpd = simple_server.make_server('0.0.0.0', port, app)
prints("Using the '%s' visualizer" % style, title="Serving on port %d..." % port)
httpd.serve_forever()
try:
httpd.serve_forever()
except KeyboardInterrupt:
prints("Shutting down server on port %d." % port)
finally:
httpd.server_close()
def app(environ, start_response):
start_response('200 OK', [('Content-type', 'text/html; charset=utf-8')])
# headers and status need to be bytes in Python 2, see #1227
headers = [(b_to_str(b'Content-type'), b_to_str(b'text/html; charset=utf-8'))]
start_response(b_to_str(b'200 OK'), headers)
res = _html['parsed'].encode(encoding='utf-8')
return [res]
def parse_deps(doc, options={}):
def parse_deps(orig_doc, options={}):
"""Generate dependency parse in {'words': [], 'arcs': []} format.
doc (Doc): Document do parse.
RETURNS (dict): Generated dependency parse keyed by words and arcs.
"""
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
if options.get('collapse_punct', True):
spans = []
for word in doc[:-1]:

View File

@ -18,12 +18,11 @@ class DependencyRenderer(object):
offset_x, color, bg, font)
"""
self.compact = options.get('compact', False)
distance, arrow_width = (85, 8) if self.compact else (175, 10)
self.word_spacing = options.get('word_spacing', 45)
self.arrow_spacing = options.get('arrow_spacing', 20)
self.arrow_width = options.get('arrow_width', arrow_width)
self.arrow_spacing = options.get('arrow_spacing', 12 if self.compact else 20)
self.arrow_width = options.get('arrow_width', 6 if self.compact else 10)
self.arrow_stroke = options.get('arrow_stroke', 2)
self.distance = options.get('distance', distance)
self.distance = options.get('distance', 150 if self.compact else 175)
self.offset_x = options.get('offset_x', 50)
self.color = options.get('color', '#000000')
self.bg = options.get('bg', '#ffffff')
@ -99,6 +98,8 @@ class DependencyRenderer(object):
x_end = (self.offset_x+(end-start)*self.distance+start*self.distance
-self.arrow_spacing*(self.highest_level-level)/4)
y_curve = self.offset_y-level*self.distance/2
if self.compact:
y_curve = self.offset_y-level*self.distance/6
if y_curve == 0 and len(self.levels) > 5:
y_curve = -self.distance
arrowhead = self.get_arrowhead(direction, x_start, y, x_end)
@ -175,7 +176,7 @@ class EntityRenderer(object):
minify (bool): Minify HTML markup.
RETURNS (unicode): Rendered HTML markup.
"""
rendered = [self.render_ents(p['text'], p['ents'], p['title']) for p in parsed]
rendered = [self.render_ents(p['text'], p['ents'], p.get('title', None)) for p in parsed]
if page:
docs = ''.join([TPL_FIGURE.format(content=doc) for doc in rendered])
markup = TPL_PAGE.format(content=docs)

View File

@ -21,7 +21,7 @@ TPL_DEP_WORDS = """
TPL_DEP_ARCS = """
<g class="displacy-arrow">
<path class="displacy-arc" id="arrow-{id}-{i}" stroke-width="{stroke}px" d="{arc}" fill="none" stroke="currentColor"/>
<text dy="1.25em" style="font-size: 0.8em">
<text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
<textPath xlink:href="#arrow-{id}-{i}" class="displacy-label" startOffset="50%" fill="currentColor" text-anchor="middle">{label}</textPath>
</text>
<path class="displacy-arrowhead" d="{head}" fill="currentColor"/>

View File

@ -1,13 +1,15 @@
from cymem.cymem cimport Pool
from .structs cimport TokenC
from .typedefs cimport attr_t
from .syntax.transition_system cimport Transition
cdef struct GoldParseC:
int* tags
int* heads
int* labels
int* has_dep
attr_t* labels
int** brackets
Transition* ner
@ -18,15 +20,16 @@ cdef class GoldParse:
cdef GoldParseC c
cdef int length
cdef readonly int loss
cdef readonly list words
cdef readonly list tags
cdef readonly list heads
cdef readonly list labels
cdef readonly dict orths
cdef readonly list ner
cdef readonly list ents
cdef readonly dict brackets
cdef public int loss
cdef public list words
cdef public list tags
cdef public list heads
cdef public list labels
cdef public dict orths
cdef public list ner
cdef public list ents
cdef public dict brackets
cdef public object cats
cdef readonly list cand_to_gold
cdef readonly list gold_to_cand

View File

@ -5,10 +5,13 @@ from __future__ import unicode_literals, print_function
import io
import re
import ujson
import random
import cytoolz
from .syntax import nonproj
from .util import ensure_path
from . import util
from .tokens import Doc
def tags_to_entities(tags):
@ -86,8 +89,8 @@ def _min_edit_path(cand_words, gold_words):
# TODO: Fix this --- just do it properly, make the full edit matrix and
# then walk back over it...
# Preprocess inputs
cand_words = [punct_re.sub('', w) for w in cand_words]
gold_words = [punct_re.sub('', w) for w in gold_words]
cand_words = [punct_re.sub('', w).lower() for w in cand_words]
gold_words = [punct_re.sub('', w).lower() for w in gold_words]
if cand_words == gold_words:
return 0, ''.join(['M' for _ in gold_words])
@ -139,8 +142,164 @@ def _min_edit_path(cand_words, gold_words):
return prev_costs[n_gold], previous_row[-1]
def read_json_file(loc, docs_filter=None, make_supertags=True, limit=None):
make_supertags = util.env_opt('make_supertags', make_supertags)
def minibatch(items, size=8):
'''Iterate over batches of items. `size` may be a fixed int or an iterator,
so that batch-size can vary on each step.
'''
items = iter(items)
while True:
batch_size = size if isinstance(size, int) else next(size)
batch = list(cytoolz.take(int(batch_size), items))
if len(batch) == 0:
break
yield list(batch)
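A small usage sketch for minibatch, assuming it stays importable from spacy.gold and that size may be a plain int or an iterator of sizes as above:

    import itertools
    from spacy.gold import minibatch

    items = list(range(10))
    # fixed batch size
    assert [len(b) for b in minibatch(items, size=4)] == [4, 4, 2]
    # batch size that varies per step, driven by an (endless) iterator
    sizes = itertools.cycle([2, 3])
    assert [len(b) for b in minibatch(items, size=sizes)] == [2, 3, 2, 3]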
class GoldCorpus(object):
"""An annotated corpus, using the JSON file format. Manages
annotations for tagging, dependency parsing and NER."""
def __init__(self, train_path, dev_path, gold_preproc=True, limit=None):
"""Create a GoldCorpus.
train_path (unicode or Path): File or directory of training data.
dev_path (unicode or Path): File or directory of development data.
"""
self.train_path = util.ensure_path(train_path)
self.dev_path = util.ensure_path(dev_path)
self.limit = limit
self.train_locs = self.walk_corpus(self.train_path)
self.dev_locs = self.walk_corpus(self.dev_path)
@property
def train_tuples(self):
i = 0
for loc in self.train_locs:
gold_tuples = read_json_file(loc)
for item in gold_tuples:
yield item
i += len(item[1])
if self.limit and i >= self.limit:
break
@property
def dev_tuples(self):
i = 0
for loc in self.dev_locs:
gold_tuples = read_json_file(loc)
for item in gold_tuples:
yield item
i += 1
if self.limit and i >= self.limit:
break
def count_train(self):
n = 0
i = 0
for raw_text, paragraph_tuples in self.train_tuples:
n += sum([len(s[0][1]) for s in paragraph_tuples])
if self.limit and i >= self.limit:
break
i += len(paragraph_tuples)
return n
def train_docs(self, nlp, gold_preproc=False,
projectivize=False, max_length=None,
noise_level=0.0):
train_tuples = self.train_tuples
if projectivize:
train_tuples = nonproj.preprocess_training_data(
self.train_tuples)
# random.shuffle() needs a sequence; train_tuples may be a generator
train_tuples = list(train_tuples)
random.shuffle(train_tuples)
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
max_length=max_length,
noise_level=noise_level)
yield from gold_docs
def dev_docs(self, nlp, gold_preproc=False):
gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
#gold_docs = nlp.preprocess_gold(gold_docs)
yield from gold_docs
@classmethod
def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
noise_level=0.0):
for raw_text, paragraph_tuples in tuples:
if gold_preproc:
raw_text = None
else:
paragraph_tuples = merge_sents(paragraph_tuples)
docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
gold_preproc, noise_level=noise_level)
golds = cls._make_golds(docs, paragraph_tuples)
for doc, gold in zip(docs, golds):
if (not max_length) or len(doc) < max_length:
yield doc, gold
@classmethod
def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc,
noise_level=0.0):
if raw_text is not None:
raw_text = add_noise(raw_text, noise_level)
return [nlp.make_doc(raw_text)]
else:
return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level))
for (sent_tuples, brackets) in paragraph_tuples]
@classmethod
def _make_golds(cls, docs, paragraph_tuples):
assert len(docs) == len(paragraph_tuples)
if len(docs) == 1:
return [GoldParse.from_annot_tuples(docs[0], paragraph_tuples[0][0])]
else:
return [GoldParse.from_annot_tuples(doc, sent_tuples)
for doc, (sent_tuples, brackets) in zip(docs, paragraph_tuples)]
@staticmethod
def walk_corpus(path):
if not path.is_dir():
return [path]
paths = [path]
locs = []
seen = set()
for path in paths:
if str(path) in seen:
continue
seen.add(str(path))
if path.parts[-1].startswith('.'):
continue
elif path.is_dir():
paths.extend(path.iterdir())
elif path.parts[-1].endswith('.json'):
locs.append(path)
return locs
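A rough usage sketch for GoldCorpus; the file names are placeholders for data in spaCy's JSON training format.

    from spacy.lang.en import English
    from spacy.gold import GoldCorpus

    nlp = English()
    corpus = GoldCorpus('train.json', 'dev.json', limit=1000)
    print("training examples:", corpus.count_train())
    for doc, gold in corpus.train_docs(nlp, gold_preproc=True, max_length=200):
        pass  # feed the (doc, gold) pairs to nlp.update() during training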
def add_noise(orig, noise_level):
if random.random() >= noise_level:
return orig
elif type(orig) == list:
corrupted = [_corrupt(word, noise_level) for word in orig]
corrupted = [w for w in corrupted if w]
return corrupted
else:
return ''.join(_corrupt(c, noise_level) for c in orig)
def _corrupt(c, noise_level):
if random.random() >= noise_level:
return c
elif c == ' ':
return '\n'
elif c == '\n':
return ' '
elif c in ['.', "'", "!", "?"]:
return ''
else:
return c.lower()
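For illustration, the corruption can be previewed on its own (output varies because it is driven by random()); assumes add_noise is importable from spacy.gold.

    import random
    from spacy.gold import add_noise

    random.seed(0)
    # with probability noise_level the string is rewritten character by
    # character; each character is then, again with probability noise_level,
    # swapped (space/newline), dropped (sentence-final punctuation) or lowercased
    print(add_noise("Hello, World.\nNext line!", noise_level=0.5))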
def read_json_file(loc, docs_filter=None, limit=None):
loc = ensure_path(loc)
if loc.is_dir():
for filename in loc.iterdir():
@ -173,16 +332,14 @@ def read_json_file(loc, docs_filter=None, make_supertags=True, limit=None):
if labels[-1].lower() == 'root':
labels[-1] = 'ROOT'
ner.append(token.get('ner', '-'))
if make_supertags:
tags[-1] = '-'.join((tags[-1], labels[-1], ner[-1]))
sents.append([
[ids, words, tags, heads, labels, ner],
sent.get('brackets', [])])
sent.get('brackets', [])])
if sents:
yield [paragraph.get('raw', None), sents]
def _iob_to_biluo(tags):
def iob_to_biluo(tags):
out = []
curr_label = None
tags = list(tags)
@ -224,26 +381,25 @@ cdef class GoldParse:
make_projective=make_projective)
def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
deps=None, entities=None, make_projective=False):
"""
Create a GoldParse.
deps=None, entities=None, make_projective=False,
cats=tuple()):
"""Create a GoldParse.
Arguments:
doc (Doc):
The document the annotations refer to.
words:
A sequence of unicode word strings.
tags:
A sequence of strings, representing tag annotations.
heads:
A sequence of integers, representing syntactic head offsets.
deps:
A sequence of strings, representing the syntactic relation types.
entities:
A sequence of named entity annotations, either as BILUO tag strings,
or as (start_char, end_char, label) tuples, representing the entity
positions.
Returns (GoldParse): The newly constructed object.
doc (Doc): The document the annotations refer to.
words (iterable): A sequence of unicode word strings.
tags (iterable): A sequence of strings, representing tag annotations.
heads (iterable): A sequence of integers, representing syntactic head offsets.
deps (iterable): A sequence of strings, representing the syntactic relation types.
entities (iterable): A sequence of named entity annotations, either as
BILUO tag strings, or as `(start_char, end_char, label)` tuples,
representing the entity positions.
cats (iterable): A sequence of labels for text classification. Each
label may be a string or an int, or a `(start_char, end_char, label)`
tuple, indicating that the label is applied to only part of the
document (usually a sentence). Unlike entity annotations, label
annotations can overlap, i.e. a single word can be covered by
multiple labelled spans.
RETURNS (GoldParse): The newly constructed object.
"""
if words is None:
words = [token.text for token in doc]
@ -268,9 +424,11 @@ cdef class GoldParse:
# These are filled by the tagger/parser/entity recogniser
self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.labels = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
self.cats = list(cats)
self.words = [None] * len(doc)
self.tags = [None] * len(doc)
self.heads = [None] * len(doc)
@ -295,7 +453,10 @@ cdef class GoldParse:
else:
self.words[i] = words[gold_i]
self.tags[i] = tags[gold_i]
self.heads[i] = self.gold_to_cand[heads[gold_i]]
if heads[gold_i] is None:
self.heads[i] = None
else:
self.heads[i] = self.gold_to_cand[heads[gold_i]]
self.labels[i] = deps[gold_i]
self.ner[i] = entities[gold_i]
@ -304,59 +465,49 @@ cdef class GoldParse:
raise Exception("Cycle found: %s" % cycle)
if make_projective:
proj_heads,_ = nonproj.PseudoProjectivity.projectivize(self.heads, self.labels)
proj_heads,_ = nonproj.projectivize(self.heads, self.labels)
self.heads = proj_heads
def __len__(self):
"""
Get the number of gold-standard tokens.
"""Get the number of gold-standard tokens.
Returns (int): The number of gold-standard tokens.
RETURNS (int): The number of gold-standard tokens.
"""
return self.length
@property
def is_projective(self):
"""
Whether the provided syntactic annotations form a projective dependency
tree.
"""Whether the provided syntactic annotations form a projective
dependency tree.
"""
return not nonproj.is_nonproj_tree(self.heads)
def biluo_tags_from_offsets(doc, entities):
"""
Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
scheme (biluo).
def biluo_tags_from_offsets(doc, entities, missing='O'):
"""Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
scheme (BILUO).
Arguments:
doc (Doc):
The document that the entity offsets refer to. The output tags will
refer to the token boundaries within the document.
doc (Doc): The document that the entity offsets refer to. The output tags
will refer to the token boundaries within the document.
entities (iterable): A sequence of `(start, end, label)` triples. `start` and
`end` should be character-offset integers denoting the slice into the
original string.
entities (sequence):
A sequence of (start, end, label) triples. start and end should be
character-offset integers denoting the slice into the original string.
RETURNS (list): A list of unicode strings, describing the tags. Each tag
string will be of the form either "", "O" or "{action}-{label}", where
action is one of "B", "I", "L", "U". The string "-" is used where the
entity offsets don't align with the tokenization in the `Doc` object. The
training algorithm will view these as missing values. "O" denotes a
non-entity token. "B" denotes the beginning of a multi-token entity,
"I" the inside of an entity of three or more tokens, and "L" the end
of an entity of two or more tokens. "U" denotes a single-token entity.
Returns:
tags (list):
A list of unicode strings, describing the tags. Each tag string will
be of the form either "", "O" or "{action}-{label}", where action is one
of "B", "I", "L", "U". The string "-" is used where the entity
offsets don't align with the tokenization in the Doc object. The
training algorithm will view these as missing values. "O" denotes
a non-entity token. "B" denotes the beginning of a multi-token entity,
"I" the inside of an entity of three or more tokens, and "L" the end
of an entity of two or more tokens. "U" denotes a single-token entity.
Example:
text = 'I like London.'
entities = [(len('I like '), len('I like London'), 'LOC')]
doc = nlp.tokenizer(text)
tags = biluo_tags_from_offsets(doc, entities)
assert tags == ['O', 'O', 'U-LOC', 'O']
EXAMPLE:
>>> text = 'I like London.'
>>> entities = [(len('I like '), len('I like London'), 'LOC')]
>>> doc = nlp.tokenizer(text)
>>> tags = biluo_tags_from_offsets(doc, entities)
>>> assert tags == ['O', 'O', 'U-LOC', 'O']
"""
starts = {token.idx: token.i for token in doc}
ends = {token.idx+len(token): token.i for token in doc}
@ -384,7 +535,7 @@ def biluo_tags_from_offsets(doc, entities):
if i in entity_chars:
break
else:
biluo[token.i] = 'O'
biluo[token.i] = missing
return biluo
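A short sketch of the new missing argument: tokens that no entity covers get the missing value instead of a hard-coded "O" (the alignment fallback to "-" is unchanged).

    from spacy.lang.en import English
    from spacy.gold import biluo_tags_from_offsets

    nlp = English()
    doc = nlp.make_doc('I like London.')
    entities = [(7, 13, 'LOC')]
    assert biluo_tags_from_offsets(doc, entities) == ['O', 'O', 'U-LOC', 'O']
    # mark uncovered tokens as missing values instead of explicit non-entities
    assert biluo_tags_from_offsets(doc, entities, missing='-') == ['-', '-', 'U-LOC', '-']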

View File

@ -13,21 +13,23 @@ from ...attrs import LANG
from ...util import update_exc
class BengaliDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'bn'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = TAG_MAP
stop_words = STOP_WORDS
lemma_rules = LEMMA_RULES
prefixes = tuple(TOKENIZER_PREFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
infixes = tuple(TOKENIZER_INFIXES)
class Bengali(Language):
lang = 'bn'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'bn'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = TAG_MAP
stop_words = STOP_WORDS
lemma_rules = LEMMA_RULES
prefixes = tuple(TOKENIZER_PREFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
infixes = tuple(TOKENIZER_INFIXES)
Defaults = BengaliDefaults
__all__ = ['Bengali']
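The same refactor repeats for the languages below: the Defaults subclass moves to module level and is attached by assignment. As a hedged sketch, a new language following this pattern would look roughly like:

    from spacy.language import Language
    from spacy.attrs import LANG


    class CustomDefaults(Language.Defaults):
        # copy the getters so the shared base class is not mutated in place
        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
        lex_attr_getters[LANG] = lambda text: 'xx'  # 'xx' is a placeholder code
        stop_words = set()


    class Custom(Language):
        lang = 'xx'
        Defaults = CustomDefaults


    __all__ = ['Custom']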

View File

@ -1,8 +1,8 @@
# coding: utf8
from __future__ import unicode_literals
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, UNITS
from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES, UNITS
_currency = r"\$|¢|£|€|¥|฿|৳"
@ -10,16 +10,16 @@ _quotes = QUOTES.replace("'", '')
_list_punct = LIST_PUNCT + '। ॥'.strip().split()
_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES)
_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS)
_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES +
_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
[r'(?<=[0-9])\+',
r'(?<=°[FfCcKk])\.',
r'(?<=[0-9])(?:{})'.format(_currency),
r'(?<=[0-9])(?:{})'.format(UNITS),
r'(?<=[{}(?:{})])\.'.format('|'.join([ALPHA_LOWER, r'%²\-\)\]\+', QUOTES]), _currency)])
_infixes = (LIST_ELLIPSES +
_infixes = (LIST_ELLIPSES + LIST_ICONS +
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),

View File

@ -20,7 +20,6 @@ _upper = [_latin_upper]
_lower = [_latin_lower]
_uncased = [_bengali, _hebrew]
ALPHA = merge_char_classes(_upper + _lower + _uncased)
ALPHA_LOWER = merge_char_classes(_lower + _uncased)
ALPHA_UPPER = merge_char_classes(_upper + _uncased)
@ -33,13 +32,14 @@ _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
_punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
_quotes = r'\' \'\' " ” “ `` ` ´ , „ » «'
_hyphens = '- — -- ---'
_other_symbols = r'[\p{So}]'
UNITS = merge_chars(_units)
CURRENCY = merge_chars(_currency)
QUOTES = merge_chars(_quotes)
PUNCT = merge_chars(_punct)
HYPHENS = merge_chars(_hyphens)
ICONS = _other_symbols
LIST_UNITS = split_chars(_units)
LIST_CURRENCY = split_chars(_currency)
@ -47,3 +47,4 @@ LIST_QUOTES = split_chars(_quotes)
LIST_PUNCT = split_chars(_punct)
LIST_HYPHENS = split_chars(_hyphens)
LIST_ELLIPSES = [r'\.\.+', '…']
LIST_ICONS = [_other_symbols]
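The \p{So} class (symbols, including emoji and dingbats) needs the third-party regex package rather than the stdlib re; a minimal sketch of what the new ICONS entries match:

    import regex  # the 'regex' package understands Unicode property classes

    icon_re = regex.compile(r'[\p{So}]')
    assert icon_re.match('☺')       # U+263A, category So
    assert icon_re.match('💥')      # U+1F4A5, category So
    assert not icon_re.match('a')   # letters are not matched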

View File

@ -5,20 +5,24 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class DanishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'da'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
class Danish(Language):
lang = 'da'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'da'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
Defaults = DanishDefaults
__all__ = ['Danish']

View File

@ -2,33 +2,39 @@
from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .norm_exceptions import NORM_EXCEPTIONS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .lemmatizer import LOOKUP
from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG
from ...util import update_exc
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class GermanDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'de'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
NORM_EXCEPTIONS, BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = dict(TAG_MAP)
stop_words = set(STOP_WORDS)
syntax_iterators = dict(SYNTAX_ITERATORS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
class German(Language):
lang = 'de'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'de'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = dict(TAG_MAP)
stop_words = set(STOP_WORDS)
syntax_iterators = dict(SYNTAX_ITERATORS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
Defaults = GermanDefaults
__all__ = ['German']

View File

@ -0,0 +1,17 @@
# coding: utf8
from __future__ import unicode_literals
# Here we only want to include the absolute most common words. Otherwise,
# this list would get impossibly long for German especially considering the
# old vs. new spelling rules, and all possible cases.
_exc = {
"daß": "dass"
}
NORM_EXCEPTIONS = {}
for string, norm in _exc.items():
NORM_EXCEPTIONS[string] = norm
NORM_EXCEPTIONS[string.title()] = norm
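For illustration, once these exceptions are merged into the NORM getter via add_lookups (see the __init__.py change above), the norm should surface on tokens roughly like this; it assumes both the lowercase and title-cased keys from the loop above.

    from spacy.lang.de import German

    nlp = German()
    doc = nlp(u'daß es klappt')
    print(doc[0].text, doc[0].norm_)  # expected: daß dass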

View File

@ -15,9 +15,9 @@ def noun_chunks(obj):
# and not just "eine Tasse", same for "das Thema Familie".
labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'ROOT', 'root', 'cj', 'pd', 'og', 'app']
doc = obj.doc # Ensure works on both Doc and Span.
np_label = doc.vocab.strings['NP']
np_deps = set(doc.vocab.strings[label] for label in labels)
close_app = doc.vocab.strings['nk']
np_label = doc.vocab.strings.add('NP')
np_deps = set(doc.vocab.strings.add(label) for label in labels)
close_app = doc.vocab.strings.add('nk')
rbracket = 0
for i, word in enumerate(obj):

View File

@ -8,7 +8,7 @@ from ...deprecated import PRON_LEMMA
_exc = {
"auf'm": [
{ORTH: "auf", LEMMA: "auf"},
{ORTH: "'m", LEMMA: "der", NORM: "dem" }],
{ORTH: "'m", LEMMA: "der", NORM: "dem"}],
"du's": [
{ORTH: "du", LEMMA: PRON_LEMMA, TAG: "PPER"},
@ -53,97 +53,97 @@ _exc = {
for exc_data in [
{ORTH: "'S", LEMMA: PRON_LEMMA, TAG: "PPER"},
{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER"},
{ORTH: "S'", LEMMA: PRON_LEMMA, TAG: "PPER"},
{ORTH: "s'", LEMMA: PRON_LEMMA, TAG: "PPER"},
{ORTH: "'S", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
{ORTH: "S'", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
{ORTH: "s'", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
{ORTH: "'n", LEMMA: "ein", NORM: "ein"},
{ORTH: "'ne", LEMMA: "eine", NORM: "eine"},
{ORTH: "'nen", LEMMA: "ein", NORM: "einen"},
{ORTH: "'nem", LEMMA: "ein", NORM: "einem"},
{ORTH: "Abb.", LEMMA: "Abbildung"},
{ORTH: "Abk.", LEMMA: "Abkürzung"},
{ORTH: "Abt.", LEMMA: "Abteilung"},
{ORTH: "Apr.", LEMMA: "April"},
{ORTH: "Aug.", LEMMA: "August"},
{ORTH: "Bd.", LEMMA: "Band"},
{ORTH: "Betr.", LEMMA: "Betreff"},
{ORTH: "Bf.", LEMMA: "Bahnhof"},
{ORTH: "Bhf.", LEMMA: "Bahnhof"},
{ORTH: "Bsp.", LEMMA: "Beispiel"},
{ORTH: "Dez.", LEMMA: "Dezember"},
{ORTH: "Di.", LEMMA: "Dienstag"},
{ORTH: "Do.", LEMMA: "Donnerstag"},
{ORTH: "Fa.", LEMMA: "Firma"},
{ORTH: "Fam.", LEMMA: "Familie"},
{ORTH: "Feb.", LEMMA: "Februar"},
{ORTH: "Fr.", LEMMA: "Frau"},
{ORTH: "Frl.", LEMMA: "Fräulein"},
{ORTH: "Hbf.", LEMMA: "Hauptbahnhof"},
{ORTH: "Hr.", LEMMA: "Herr"},
{ORTH: "Hrn.", LEMMA: "Herr"},
{ORTH: "Jan.", LEMMA: "Januar"},
{ORTH: "Jh.", LEMMA: "Jahrhundert"},
{ORTH: "Jhd.", LEMMA: "Jahrhundert"},
{ORTH: "Jul.", LEMMA: "Juli"},
{ORTH: "Jun.", LEMMA: "Juni"},
{ORTH: "Mi.", LEMMA: "Mittwoch"},
{ORTH: "Mio.", LEMMA: "Million"},
{ORTH: "Mo.", LEMMA: "Montag"},
{ORTH: "Mrd.", LEMMA: "Milliarde"},
{ORTH: "Mrz.", LEMMA: "März"},
{ORTH: "MwSt.", LEMMA: "Mehrwertsteuer"},
{ORTH: "Mär.", LEMMA: "März"},
{ORTH: "Nov.", LEMMA: "November"},
{ORTH: "Nr.", LEMMA: "Nummer"},
{ORTH: "Okt.", LEMMA: "Oktober"},
{ORTH: "Orig.", LEMMA: "Original"},
{ORTH: "Pkt.", LEMMA: "Punkt"},
{ORTH: "Prof.", LEMMA: "Professor"},
{ORTH: "Red.", LEMMA: "Redaktion"},
{ORTH: "Sa.", LEMMA: "Samstag"},
{ORTH: "Sep.", LEMMA: "September"},
{ORTH: "Sept.", LEMMA: "September"},
{ORTH: "So.", LEMMA: "Sonntag"},
{ORTH: "Std.", LEMMA: "Stunde"},
{ORTH: "Str.", LEMMA: "Straße"},
{ORTH: "Tel.", LEMMA: "Telefon"},
{ORTH: "Tsd.", LEMMA: "Tausend"},
{ORTH: "Univ.", LEMMA: "Universität"},
{ORTH: "abzgl.", LEMMA: "abzüglich"},
{ORTH: "allg.", LEMMA: "allgemein"},
{ORTH: "bspw.", LEMMA: "beispielsweise"},
{ORTH: "bzgl.", LEMMA: "bezüglich"},
{ORTH: "bzw.", LEMMA: "beziehungsweise"},
{ORTH: "Abb.", LEMMA: "Abbildung", NORM: "Abbildung"},
{ORTH: "Abk.", LEMMA: "Abkürzung", NORM: "Abkürzung"},
{ORTH: "Abt.", LEMMA: "Abteilung", NORM: "Abteilung"},
{ORTH: "Apr.", LEMMA: "April", NORM: "April"},
{ORTH: "Aug.", LEMMA: "August", NORM: "August"},
{ORTH: "Bd.", LEMMA: "Band", NORM: "Band"},
{ORTH: "Betr.", LEMMA: "Betreff", NORM: "Betreff"},
{ORTH: "Bf.", LEMMA: "Bahnhof", NORM: "Bahnhof"},
{ORTH: "Bhf.", LEMMA: "Bahnhof", NORM: "Bahnhof"},
{ORTH: "Bsp.", LEMMA: "Beispiel", NORM: "Beispiel"},
{ORTH: "Dez.", LEMMA: "Dezember", NORM: "Dezember"},
{ORTH: "Di.", LEMMA: "Dienstag", NORM: "Dienstag"},
{ORTH: "Do.", LEMMA: "Donnerstag", NORM: "Donnerstag"},
{ORTH: "Fa.", LEMMA: "Firma", NORM: "Firma"},
{ORTH: "Fam.", LEMMA: "Familie", NORM: "Familie"},
{ORTH: "Feb.", LEMMA: "Februar", NORM: "Februar"},
{ORTH: "Fr.", LEMMA: "Frau", NORM: "Frau"},
{ORTH: "Frl.", LEMMA: "Fräulein", NORM: "Fräulein"},
{ORTH: "Hbf.", LEMMA: "Hauptbahnhof", NORM: "Hauptbahnhof"},
{ORTH: "Hr.", LEMMA: "Herr", NORM: "Herr"},
{ORTH: "Hrn.", LEMMA: "Herr", NORM: "Herrn"},
{ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"},
{ORTH: "Jh.", LEMMA: "Jahrhundert", NORM: "Jahrhundert"},
{ORTH: "Jhd.", LEMMA: "Jahrhundert", NORM: "Jahrhundert"},
{ORTH: "Jul.", LEMMA: "Juli", NORM: "Juli"},
{ORTH: "Jun.", LEMMA: "Juni", NORM: "Juni"},
{ORTH: "Mi.", LEMMA: "Mittwoch", NORM: "Mittwoch"},
{ORTH: "Mio.", LEMMA: "Million", NORM: "Million"},
{ORTH: "Mo.", LEMMA: "Montag", NORM: "Montag"},
{ORTH: "Mrd.", LEMMA: "Milliarde", NORM: "Milliarde"},
{ORTH: "Mrz.", LEMMA: "März", NORM: "März"},
{ORTH: "MwSt.", LEMMA: "Mehrwertsteuer", NORM: "Mehrwertsteuer"},
{ORTH: "Mär.", LEMMA: "März", NORM: "März"},
{ORTH: "Nov.", LEMMA: "November", NORM: "November"},
{ORTH: "Nr.", LEMMA: "Nummer", NORM: "Nummer"},
{ORTH: "Okt.", LEMMA: "Oktober", NORM: "Oktober"},
{ORTH: "Orig.", LEMMA: "Original", NORM: "Original"},
{ORTH: "Pkt.", LEMMA: "Punkt", NORM: "Punkt"},
{ORTH: "Prof.", LEMMA: "Professor", NORM: "Professor"},
{ORTH: "Red.", LEMMA: "Redaktion", NORM: "Redaktion"},
{ORTH: "Sa.", LEMMA: "Samstag", NORM: "Samstag"},
{ORTH: "Sep.", LEMMA: "September", NORM: "September"},
{ORTH: "Sept.", LEMMA: "September", NORM: "September"},
{ORTH: "So.", LEMMA: "Sonntag", NORM: "Sonntag"},
{ORTH: "Std.", LEMMA: "Stunde", NORM: "Stunde"},
{ORTH: "Str.", LEMMA: "Straße", NORM: "Straße"},
{ORTH: "Tel.", LEMMA: "Telefon", NORM: "Telefon"},
{ORTH: "Tsd.", LEMMA: "Tausend", NORM: "Tausend"},
{ORTH: "Univ.", LEMMA: "Universität", NORM: "Universität"},
{ORTH: "abzgl.", LEMMA: "abzüglich", NORM: "abzüglich"},
{ORTH: "allg.", LEMMA: "allgemein", NORM: "allgemein"},
{ORTH: "bspw.", LEMMA: "beispielsweise", NORM: "beispielsweise"},
{ORTH: "bzgl.", LEMMA: "bezüglich", NORM: "bezüglich"},
{ORTH: "bzw.", LEMMA: "beziehungsweise", NORM: "beziehungsweise"},
{ORTH: "d.h.", LEMMA: "das heißt"},
{ORTH: "dgl.", LEMMA: "dergleichen"},
{ORTH: "ebd.", LEMMA: "ebenda"},
{ORTH: "eigtl.", LEMMA: "eigentlich"},
{ORTH: "engl.", LEMMA: "englisch"},
{ORTH: "evtl.", LEMMA: "eventuell"},
{ORTH: "frz.", LEMMA: "französisch"},
{ORTH: "gegr.", LEMMA: "gegründet"},
{ORTH: "ggf.", LEMMA: "gegebenenfalls"},
{ORTH: "ggfs.", LEMMA: "gegebenenfalls"},
{ORTH: "ggü.", LEMMA: "gegenüber"},
{ORTH: "dgl.", LEMMA: "dergleichen", NORM: "dergleichen"},
{ORTH: "ebd.", LEMMA: "ebenda", NORM: "ebenda"},
{ORTH: "eigtl.", LEMMA: "eigentlich", NORM: "eigentlich"},
{ORTH: "engl.", LEMMA: "englisch", NORM: "englisch"},
{ORTH: "evtl.", LEMMA: "eventuell", NORM: "eventuell"},
{ORTH: "frz.", LEMMA: "französisch", NORM: "französisch"},
{ORTH: "gegr.", LEMMA: "gegründet", NORM: "gegründet"},
{ORTH: "ggf.", LEMMA: "gegebenenfalls", NORM: "gegebenenfalls"},
{ORTH: "ggfs.", LEMMA: "gegebenenfalls", NORM: "gegebenenfalls"},
{ORTH: "ggü.", LEMMA: "gegenüber", NORM: "gegenüber"},
{ORTH: "i.O.", LEMMA: "in Ordnung"},
{ORTH: "i.d.R.", LEMMA: "in der Regel"},
{ORTH: "incl.", LEMMA: "inklusive"},
{ORTH: "inkl.", LEMMA: "inklusive"},
{ORTH: "insb.", LEMMA: "insbesondere"},
{ORTH: "kath.", LEMMA: "katholisch"},
{ORTH: "lt.", LEMMA: "laut"},
{ORTH: "max.", LEMMA: "maximal"},
{ORTH: "min.", LEMMA: "minimal"},
{ORTH: "mind.", LEMMA: "mindestens"},
{ORTH: "mtl.", LEMMA: "monatlich"},
{ORTH: "incl.", LEMMA: "inklusive", NORM: "inklusive"},
{ORTH: "inkl.", LEMMA: "inklusive", NORM: "inklusive"},
{ORTH: "insb.", LEMMA: "insbesondere", NORM: "insbesondere"},
{ORTH: "kath.", LEMMA: "katholisch", NORM: "katholisch"},
{ORTH: "lt.", LEMMA: "laut", NORM: "laut"},
{ORTH: "max.", LEMMA: "maximal", NORM: "maximal"},
{ORTH: "min.", LEMMA: "minimal", NORM: "minimal"},
{ORTH: "mind.", LEMMA: "mindestens", NORM: "mindestens"},
{ORTH: "mtl.", LEMMA: "monatlich", NORM: "monatlich"},
{ORTH: "n.Chr.", LEMMA: "nach Christus"},
{ORTH: "orig.", LEMMA: "original"},
{ORTH: "röm.", LEMMA: "römisch"},
{ORTH: "orig.", LEMMA: "original", NORM: "original"},
{ORTH: "röm.", LEMMA: "römisch", NORM: "römisch"},
{ORTH: "s.o.", LEMMA: "siehe oben"},
{ORTH: "sog.", LEMMA: "so genannt"},
{ORTH: "stellv.", LEMMA: "stellvertretend"},
{ORTH: "tägl.", LEMMA: "täglich"},
{ORTH: "tägl.", LEMMA: "täglich", NORM: "täglich"},
{ORTH: "u.U.", LEMMA: "unter Umständen"},
{ORTH: "u.s.w.", LEMMA: "und so weiter"},
{ORTH: "u.v.m.", LEMMA: "und vieles mehr"},
@ -153,9 +153,9 @@ for exc_data in [
{ORTH: "v.Chr.", LEMMA: "vor Christus"},
{ORTH: "v.a.", LEMMA: "vor allem"},
{ORTH: "v.l.n.r.", LEMMA: "von links nach rechts"},
{ORTH: "vgl.", LEMMA: "vergleiche"},
{ORTH: "vllt.", LEMMA: "vielleicht"},
{ORTH: "vlt.", LEMMA: "vielleicht"},
{ORTH: "vgl.", LEMMA: "vergleiche", NORM: "vergleiche"},
{ORTH: "vllt.", LEMMA: "vielleicht", NORM: "vielleicht"},
{ORTH: "vlt.", LEMMA: "vielleicht", NORM: "vielleicht"},
{ORTH: "z.B.", LEMMA: "zum Beispiel"},
{ORTH: "z.Bsp.", LEMMA: "zum Beispiel"},
{ORTH: "z.T.", LEMMA: "zum Teil"},
@ -163,7 +163,7 @@ for exc_data in [
{ORTH: "z.Zt.", LEMMA: "zur Zeit"},
{ORTH: "z.b.", LEMMA: "zum Beispiel"},
{ORTH: "zzgl.", LEMMA: "zuzüglich"},
{ORTH: "österr.", LEMMA: "österreichisch"}]:
{ORTH: "österr.", LEMMA: "österreichisch", NORM: "österreichisch"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)]

View File

@ -2,6 +2,7 @@
from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .norm_exceptions import NORM_EXCEPTIONS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
@ -10,27 +11,32 @@ from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class EnglishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: 'en'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
BASE_NORMS, NORM_EXCEPTIONS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = dict(TAG_MAP)
stop_words = set(STOP_WORDS)
morph_rules = dict(MORPH_RULES)
lemma_rules = dict(LEMMA_RULES)
lemma_index = dict(LEMMA_INDEX)
lemma_exc = dict(LEMMA_EXC)
syntax_iterators = dict(SYNTAX_ITERATORS)
class English(Language):
lang = 'en'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'en'
lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = dict(TAG_MAP)
stop_words = set(STOP_WORDS)
morph_rules = dict(MORPH_RULES)
lemma_rules = dict(LEMMA_RULES)
lemma_index = dict(LEMMA_INDEX)
lemma_exc = dict(LEMMA_EXC)
sytax_iterators = dict(SYNTAX_ITERATORS)
Defaults = EnglishDefaults
__all__ = ['English']

File diff suppressed because it is too large

View File

@ -11,9 +11,9 @@ def noun_chunks(obj):
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
'attr', 'ROOT']
doc = obj.doc # Ensure works on both Doc and Span.
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings['conj']
np_label = doc.vocab.strings['NP']
np_deps = [doc.vocab.strings.add(label) for label in labels]
conj = doc.vocab.strings.add('conj')
np_label = doc.vocab.strings.add('NP')
seen = set()
for i, word in enumerate(obj):
if word.pos not in (NOUN, PROPN, PRON):

View File

@ -15,20 +15,20 @@ _exclude = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell",
for pron in ["i"]:
for orth in [pron, pron.title()]:
_exc[orth + "'m"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'m", LEMMA: "be", NORM: "am", TAG: "VBP", "tenspect": 1, "number": 1}]
_exc[orth + "m"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }]
_exc[orth + "'ma"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'m", LEMMA: "be", NORM: "am"},
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}]
_exc[orth + "ma"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "m", LEMMA: "be", NORM: "am"},
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}]
@ -36,72 +36,72 @@ for pron in ["i"]:
for pron in ["i", "you", "he", "she", "it", "we", "they"]:
for orth in [pron, pron.title()]:
_exc[orth + "'ll"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
_exc[orth + "ll"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "ll", LEMMA: "will", TAG: "MD"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
_exc[orth + "'ll've"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "llve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "ll", LEMMA: "will", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "'d"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"}]
_exc[orth + "d"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"}]
_exc[orth + "'d've"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "dve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
for pron in ["i", "you", "we", "they"]:
for orth in [pron, pron.title()]:
_exc[orth + "'ve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "ve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
for pron in ["you", "we", "they"]:
for orth in [pron, pron.title()]:
_exc[orth + "'re"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'re", LEMMA: "be", NORM: "are"}]
_exc[orth + "re"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}]
for pron in ["he", "she", "it"]:
for orth in [pron, pron.title()]:
_exc[orth + "'s"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'s"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'s", NORM: "'s"}]
_exc[orth + "s"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "s"}]
@ -110,111 +110,111 @@ for pron in ["he", "she", "it"]:
for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
for orth in [word, word.title()]:
_exc[orth + "'s"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "'s"}]
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'s", NORM: "'s"}]
_exc[orth + "s"] = [
{ORTH: orth, LEMMA: word},
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "s"}]
_exc[orth + "'ll"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}]
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
_exc[orth + "ll"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "ll", LEMMA: "will", TAG: "MD"}]
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
_exc[orth + "'ll've"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "llve"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "ll", LEMMA: "will", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "'re"] = [
{ORTH: orth, LEMMA: word},
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'re", LEMMA: "be", NORM: "are"}]
_exc[orth + "re"] = [
{ORTH: orth, LEMMA: word},
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "re", LEMMA: "be", NORM: "are"}]
_exc[orth + "'ve"] = [
{ORTH: orth},
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
_exc[orth + "ve"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "'d"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "'d"}]
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'d", NORM: "'d"}]
_exc[orth + "d"] = [
{ORTH: orth, LEMMA: word},
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "d"}]
_exc[orth + "'d've"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "'d", LEMMA: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "dve"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "d", LEMMA: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
# Verbs
for verb_data in [
{ORTH: "ca", LEMMA: "can", TAG: "MD"},
{ORTH: "could", TAG: "MD"},
{ORTH: "do", LEMMA: "do"},
{ORTH: "does", LEMMA: "do"},
{ORTH: "did", LEMMA: "do", TAG: "VBD"},
{ORTH: "had", LEMMA: "have", TAG: "VBD"},
{ORTH: "may", TAG: "MD"},
{ORTH: "might", TAG: "MD"},
{ORTH: "must", TAG: "MD"},
{ORTH: "need"},
{ORTH: "ought"},
{ORTH: "sha", LEMMA: "shall", TAG: "MD"},
{ORTH: "should", TAG: "MD"},
{ORTH: "wo", LEMMA: "will", TAG: "MD"},
{ORTH: "would", TAG: "MD"}]:
{ORTH: "ca", LEMMA: "can", NORM: "can", TAG: "MD"},
{ORTH: "could", NORM: "could", TAG: "MD"},
{ORTH: "do", LEMMA: "do", NORM: "do"},
{ORTH: "does", LEMMA: "do", NORM: "does"},
{ORTH: "did", LEMMA: "do", NORM: "do", TAG: "VBD"},
{ORTH: "had", LEMMA: "have", NORM: "have", TAG: "VBD"},
{ORTH: "may", NORM: "may", TAG: "MD"},
{ORTH: "might", NORM: "might", TAG: "MD"},
{ORTH: "must", NORM: "must", TAG: "MD"},
{ORTH: "need", NORM: "need"},
{ORTH: "ought", NORM: "ought", TAG: "MD"},
{ORTH: "sha", LEMMA: "shall", NORM: "shall", TAG: "MD"},
{ORTH: "should", NORM: "should", TAG: "MD"},
{ORTH: "wo", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "would", NORM: "would", TAG: "MD"}]:
verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]:
_exc[data[ORTH] + "n't"] = [
dict(data),
{ORTH: "n't", LEMMA: "not", TAG: "RB"}]
{ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}]
_exc[data[ORTH] + "nt"] = [
dict(data),
{ORTH: "nt", LEMMA: "not", TAG: "RB"}]
{ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}]
_exc[data[ORTH] + "n't've"] = [
dict(data),
{ORTH: "n't", LEMMA: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
{ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[data[ORTH] + "ntve"] = [
dict(data),
{ORTH: "nt", LEMMA: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
{ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
for verb_data in [
{ORTH: "could", TAG: "MD"},
{ORTH: "might"},
{ORTH: "must"},
{ORTH: "should"}]:
{ORTH: "could", NORM: "could", TAG: "MD"},
{ORTH: "might", NORM: "might", TAG: "MD"},
{ORTH: "must", NORM: "must", TAG: "MD"},
{ORTH: "should", NORM: "should", TAG: "MD"}]:
verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]:
@ -228,21 +228,21 @@ for verb_data in [
for verb_data in [
{ORTH: "ai", TAG: "VBP", "number": 2, LEMMA: "be"},
{ORTH: "are", LEMMA: "be", TAG: "VBP", "number": 2},
{ORTH: "is", LEMMA: "be", TAG: "VBZ"},
{ORTH: "was", LEMMA: "be"},
{ORTH: "were", LEMMA: "be"}]:
{ORTH: "ai", LEMMA: "be", TAG: "VBP", "number": 2},
{ORTH: "are", LEMMA: "be", NORM: "are", TAG: "VBP", "number": 2},
{ORTH: "is", LEMMA: "be", NORM: "is", TAG: "VBZ"},
{ORTH: "was", LEMMA: "be", NORM: "was"},
{ORTH: "were", LEMMA: "be", NORM: "were"}]:
verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]:
_exc[data[ORTH] + "n't"] = [
dict(data),
{ORTH: "n't", LEMMA: "not", TAG: "RB"}]
{ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}]
_exc[data[ORTH] + "nt"] = [
dict(data),
{ORTH: "nt", LEMMA: "not", TAG: "RB"}]
{ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}]
# Other contractions with trailing apostrophe
@ -250,10 +250,10 @@ for verb_data in [
for exc_data in [
{ORTH: "doin", LEMMA: "do", NORM: "doing"},
{ORTH: "goin", LEMMA: "go", NORM: "going"},
{ORTH: "nothin", LEMMA: "nothing"},
{ORTH: "nuthin", LEMMA: "nothing"},
{ORTH: "ol", LEMMA: "old"},
{ORTH: "somethin", LEMMA: "something"}]:
{ORTH: "nothin", LEMMA: "nothing", NORM: "nothing"},
{ORTH: "nuthin", LEMMA: "nothing", NORM: "nothing"},
{ORTH: "ol", LEMMA: "old", NORM: "old"},
{ORTH: "somethin", LEMMA: "something", NORM: "something"}]:
exc_data_tc = dict(exc_data)
exc_data_tc[ORTH] = exc_data_tc[ORTH].title()
for data in [exc_data, exc_data_tc]:
@ -266,10 +266,10 @@ for exc_data in [
# Other contractions with leading apostrophe
for exc_data in [
{ORTH: "cause", LEMMA: "because"},
{ORTH: "cause", LEMMA: "because", NORM: "because"},
{ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"},
{ORTH: "ll", LEMMA: "will"},
{ORTH: "nuff", LEMMA: "enough"}]:
{ORTH: "ll", LEMMA: "will", NORM: "will"},
{ORTH: "nuff", LEMMA: "enough", NORM: "enough"}]:
exc_data_apos = dict(exc_data)
exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH]
for data in [exc_data, exc_data_apos]:
@ -282,11 +282,11 @@ for h in range(1, 12 + 1):
for period in ["a.m.", "am"]:
_exc["%d%s" % (h, period)] = [
{ORTH: "%d" % h},
{ORTH: period, LEMMA: "a.m."}]
{ORTH: period, LEMMA: "a.m.", NORM: "a.m."}]
for period in ["p.m.", "pm"]:
_exc["%d%s" % (h, period)] = [
{ORTH: "%d" % h},
{ORTH: period, LEMMA: "p.m."}]
{ORTH: period, LEMMA: "p.m.", NORM: "p.m."}]
# Rest
@ -306,56 +306,56 @@ _other_exc = {
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
"How'd'y": [
{ORTH: "How", LEMMA: "how"},
{ORTH: "How", LEMMA: "how", NORM: "how"},
{ORTH: "'d", LEMMA: "do"},
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
"not've": [
{ORTH: "not", LEMMA: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}],
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
"notve": [
{ORTH: "not", LEMMA: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}],
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
"Not've": [
{ORTH: "Not", LEMMA: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}],
{ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
"Notve": [
{ORTH: "Not", LEMMA: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}],
{ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
"cannot": [
{ORTH: "can", LEMMA: "can", TAG: "MD"},
{ORTH: "not", LEMMA: "not", TAG: "RB"}],
"Cannot": [
{ORTH: "Can", LEMMA: "can", TAG: "MD"},
{ORTH: "Can", LEMMA: "can", NORM: "can", TAG: "MD"},
{ORTH: "not", LEMMA: "not", TAG: "RB"}],
"gonna": [
{ORTH: "gon", LEMMA: "go", NORM: "going"},
{ORTH: "na", LEMMA: "to"}],
{ORTH: "na", LEMMA: "to", NORM: "to"}],
"Gonna": [
{ORTH: "Gon", LEMMA: "go", NORM: "going"},
{ORTH: "na", LEMMA: "to"}],
{ORTH: "na", LEMMA: "to", NORM: "to"}],
"gotta": [
{ORTH: "got"},
{ORTH: "ta", LEMMA: "to"}],
{ORTH: "ta", LEMMA: "to", NORM: "to"}],
"Gotta": [
{ORTH: "Got"},
{ORTH: "ta", LEMMA: "to"}],
{ORTH: "Got", NORM: "got"},
{ORTH: "ta", LEMMA: "to", NORM: "to"}],
"let's": [
{ORTH: "let"},
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}],
"Let's": [
{ORTH: "Let", LEMMA: "let"},
{ORTH: "Let", LEMMA: "let", NORM: "let"},
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}]
}
@ -363,72 +363,80 @@ _exc.update(_other_exc)
for exc_data in [
{ORTH: "'S", LEMMA: "'s"},
{ORTH: "'s", LEMMA: "'s"},
{ORTH: "\u2018S", LEMMA: "'s"},
{ORTH: "\u2018s", LEMMA: "'s"},
{ORTH: "and/or", LEMMA: "and/or", TAG: "CC"},
{ORTH: "'S", LEMMA: "'s", NORM: "'s"},
{ORTH: "'s", LEMMA: "'s", NORM: "'s"},
{ORTH: "\u2018S", LEMMA: "'s", NORM: "'s"},
{ORTH: "\u2018s", LEMMA: "'s", NORM: "'s"},
{ORTH: "and/or", LEMMA: "and/or", NORM: "and/or", TAG: "CC"},
{ORTH: "w/o", LEMMA: "without", NORM: "without"},
{ORTH: "'re", LEMMA: "be", NORM: "are"},
{ORTH: "'Cause", LEMMA: "because"},
{ORTH: "'cause", LEMMA: "because"},
{ORTH: "ma'am", LEMMA: "madam"},
{ORTH: "Ma'am", LEMMA: "madam"},
{ORTH: "o'clock", LEMMA: "o'clock"},
{ORTH: "O'clock", LEMMA: "o'clock"},
{ORTH: "'Cause", LEMMA: "because", NORM: "because"},
{ORTH: "'cause", LEMMA: "because", NORM: "because"},
{ORTH: "'cos", LEMMA: "because", NORM: "because"},
{ORTH: "'Cos", LEMMA: "because", NORM: "because"},
{ORTH: "'coz", LEMMA: "because", NORM: "because"},
{ORTH: "'Coz", LEMMA: "because", NORM: "because"},
{ORTH: "'cuz", LEMMA: "because", NORM: "because"},
{ORTH: "'Cuz", LEMMA: "because", NORM: "because"},
{ORTH: "'bout", LEMMA: "about", NORM: "about"},
{ORTH: "ma'am", LEMMA: "madam", NORM: "madam"},
{ORTH: "Ma'am", LEMMA: "madam", NORM: "madam"},
{ORTH: "o'clock", LEMMA: "o'clock", NORM: "o'clock"},
{ORTH: "O'clock", LEMMA: "o'clock", NORM: "o'clock"},
{ORTH: "Mt.", LEMMA: "Mount"},
{ORTH: "Ak.", LEMMA: "Alaska"},
{ORTH: "Ala.", LEMMA: "Alabama"},
{ORTH: "Apr.", LEMMA: "April"},
{ORTH: "Ariz.", LEMMA: "Arizona"},
{ORTH: "Ark.", LEMMA: "Arkansas"},
{ORTH: "Aug.", LEMMA: "August"},
{ORTH: "Calif.", LEMMA: "California"},
{ORTH: "Colo.", LEMMA: "Colorado"},
{ORTH: "Conn.", LEMMA: "Connecticut"},
{ORTH: "Dec.", LEMMA: "December"},
{ORTH: "Del.", LEMMA: "Delaware"},
{ORTH: "Feb.", LEMMA: "February"},
{ORTH: "Fla.", LEMMA: "Florida"},
{ORTH: "Ga.", LEMMA: "Georgia"},
{ORTH: "Ia.", LEMMA: "Iowa"},
{ORTH: "Id.", LEMMA: "Idaho"},
{ORTH: "Ill.", LEMMA: "Illinois"},
{ORTH: "Ind.", LEMMA: "Indiana"},
{ORTH: "Jan.", LEMMA: "January"},
{ORTH: "Jul.", LEMMA: "July"},
{ORTH: "Jun.", LEMMA: "June"},
{ORTH: "Kan.", LEMMA: "Kansas"},
{ORTH: "Kans.", LEMMA: "Kansas"},
{ORTH: "Ky.", LEMMA: "Kentucky"},
{ORTH: "La.", LEMMA: "Louisiana"},
{ORTH: "Mar.", LEMMA: "March"},
{ORTH: "Mass.", LEMMA: "Massachusetts"},
{ORTH: "May.", LEMMA: "May"},
{ORTH: "Mich.", LEMMA: "Michigan"},
{ORTH: "Minn.", LEMMA: "Minnesota"},
{ORTH: "Miss.", LEMMA: "Mississippi"},
{ORTH: "N.C.", LEMMA: "North Carolina"},
{ORTH: "N.D.", LEMMA: "North Dakota"},
{ORTH: "N.H.", LEMMA: "New Hampshire"},
{ORTH: "N.J.", LEMMA: "New Jersey"},
{ORTH: "N.M.", LEMMA: "New Mexico"},
{ORTH: "N.Y.", LEMMA: "New York"},
{ORTH: "Neb.", LEMMA: "Nebraska"},
{ORTH: "Nebr.", LEMMA: "Nebraska"},
{ORTH: "Nev.", LEMMA: "Nevada"},
{ORTH: "Nov.", LEMMA: "November"},
{ORTH: "Oct.", LEMMA: "October"},
{ORTH: "Okla.", LEMMA: "Oklahoma"},
{ORTH: "Ore.", LEMMA: "Oregon"},
{ORTH: "Pa.", LEMMA: "Pennsylvania"},
{ORTH: "S.C.", LEMMA: "South Carolina"},
{ORTH: "Sep.", LEMMA: "September"},
{ORTH: "Sept.", LEMMA: "September"},
{ORTH: "Tenn.", LEMMA: "Tennessee"},
{ORTH: "Va.", LEMMA: "Virginia"},
{ORTH: "Wash.", LEMMA: "Washington"},
{ORTH: "Wis.", LEMMA: "Wisconsin"}]:
{ORTH: "Mt.", LEMMA: "Mount", NORM: "Mount"},
{ORTH: "Ak.", LEMMA: "Alaska", NORM: "Alaska"},
{ORTH: "Ala.", LEMMA: "Alabama", NORM: "Alabama"},
{ORTH: "Apr.", LEMMA: "April", NORM: "April"},
{ORTH: "Ariz.", LEMMA: "Arizona", NORM: "Arizona"},
{ORTH: "Ark.", LEMMA: "Arkansas", NORM: "Arkansas"},
{ORTH: "Aug.", LEMMA: "August", NORM: "August"},
{ORTH: "Calif.", LEMMA: "California", NORM: "California"},
{ORTH: "Colo.", LEMMA: "Colorado", NORM: "Colorado"},
{ORTH: "Conn.", LEMMA: "Connecticut", NORM: "Connecticut"},
{ORTH: "Dec.", LEMMA: "December", NORM: "December"},
{ORTH: "Del.", LEMMA: "Delaware", NORM: "Delaware"},
{ORTH: "Feb.", LEMMA: "February", NORM: "February"},
{ORTH: "Fla.", LEMMA: "Florida", NORM: "Florida"},
{ORTH: "Ga.", LEMMA: "Georgia", NORM: "Georgia"},
{ORTH: "Ia.", LEMMA: "Iowa", NORM: "Iowa"},
{ORTH: "Id.", LEMMA: "Idaho", NORM: "Idaho"},
{ORTH: "Ill.", LEMMA: "Illinois", NORM: "Illinois"},
{ORTH: "Ind.", LEMMA: "Indiana", NORM: "Indiana"},
{ORTH: "Jan.", LEMMA: "January", NORM: "January"},
{ORTH: "Jul.", LEMMA: "July", NORM: "July"},
{ORTH: "Jun.", LEMMA: "June", NORM: "June"},
{ORTH: "Kan.", LEMMA: "Kansas", NORM: "Kansas"},
{ORTH: "Kans.", LEMMA: "Kansas", NORM: "Kansas"},
{ORTH: "Ky.", LEMMA: "Kentucky", NORM: "Kentucky"},
{ORTH: "La.", LEMMA: "Louisiana", NORM: "Louisiana"},
{ORTH: "Mar.", LEMMA: "March", NORM: "March"},
{ORTH: "Mass.", LEMMA: "Massachusetts", NORM: "Massachusetts"},
{ORTH: "May.", LEMMA: "May", NORM: "May"},
{ORTH: "Mich.", LEMMA: "Michigan", NORM: "Michigan"},
{ORTH: "Minn.", LEMMA: "Minnesota", NORM: "Minnesota"},
{ORTH: "Miss.", LEMMA: "Mississippi", NORM: "Mississippi"},
{ORTH: "N.C.", LEMMA: "North Carolina", NORM: "North Carolina"},
{ORTH: "N.D.", LEMMA: "North Dakota", NORM: "North Dakota"},
{ORTH: "N.H.", LEMMA: "New Hampshire", NORM: "New Hampshire"},
{ORTH: "N.J.", LEMMA: "New Jersey", NORM: "New Jersey"},
{ORTH: "N.M.", LEMMA: "New Mexico", NORM: "New Mexico"},
{ORTH: "N.Y.", LEMMA: "New York", NORM: "New York"},
{ORTH: "Neb.", LEMMA: "Nebraska", NORM: "Nebraska"},
{ORTH: "Nebr.", LEMMA: "Nebraska", NORM: "Nebraska"},
{ORTH: "Nev.", LEMMA: "Nevada", NORM: "Nevada"},
{ORTH: "Nov.", LEMMA: "November", NORM: "November"},
{ORTH: "Oct.", LEMMA: "October", NORM: "October"},
{ORTH: "Okla.", LEMMA: "Oklahoma", NORM: "Oklahoma"},
{ORTH: "Ore.", LEMMA: "Oregon", NORM: "Oregon"},
{ORTH: "Pa.", LEMMA: "Pennsylvania", NORM: "Pennsylvania"},
{ORTH: "S.C.", LEMMA: "South Carolina", NORM: "South Carolina"},
{ORTH: "Sep.", LEMMA: "September", NORM: "September"},
{ORTH: "Sept.", LEMMA: "September", NORM: "September"},
{ORTH: "Tenn.", LEMMA: "Tennessee", NORM: "Tennessee"},
{ORTH: "Va.", LEMMA: "Virginia", NORM: "Virginia"},
{ORTH: "Wash.", LEMMA: "Washington", NORM: "Washington"},
{ORTH: "Wis.", LEMMA: "Wisconsin", NORM: "Wisconsin"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)]
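
For illustration only (not part of this commit): how an entry such as "Fla." behaves once the English defaults are built from this table, assuming the exception is picked up unchanged.

from spacy.lang.en import English

nlp = English()
doc = nlp(u"She moved to Fla. last year.")
# "Fla." should stay a single token, with its NORM (and LEMMA) set to "Florida".
print([(t.text, t.norm_) for t in doc])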

View File

@ -5,21 +5,25 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .lemmatizer import LOOKUP
from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG
from ...util import update_exc
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class SpanishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'es'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = dict(TAG_MAP)
stop_words = set(STOP_WORDS)
syntax_iterators = dict(SYNTAX_ITERATORS)
@classmethod
def create_lemmatizer(cls, nlp=None):
@ -28,7 +32,7 @@ class SpanishDefaults(Language.Defaults):
class Spanish(Language):
lang = 'es'
Defaults = SpanishDefaults
__all__ = ['Spanish']

View File

@ -0,0 +1,55 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
def noun_chunks(obj):
doc = obj.doc
np_label = doc.vocab.strings['NP']
left_labels = ['det', 'fixed', 'neg'] # ['nummod', 'det', 'appos', 'fixed']
right_labels = ['flat', 'fixed', 'compound', 'neg']
stop_labels = ['punct']
np_left_deps = [doc.vocab.strings[label] for label in left_labels]
np_right_deps = [doc.vocab.strings[label] for label in right_labels]
stop_deps = [doc.vocab.strings[label] for label in stop_labels]
token = doc[0]
while token and token.i < len(doc):
if token.pos in [PROPN, NOUN, PRON]:
left, right = noun_bounds(token)
yield left.i, right.i+1, np_label
token = right
token = next_token(token)
def is_verb_token(token):
return token.pos in [VERB, AUX]
def next_token(token):
try:
return token.nbor()
except IndexError:
return None
def noun_bounds(root):
left_bound = root
for token in reversed(list(root.lefts)):
if token.dep in np_left_deps:
left_bound = token
right_bound = root
for token in root.rights:
if (token.dep in np_right_deps):
left, right = noun_bounds(token)
if list(filter(lambda t: is_verb_token(t) or t.dep in stop_deps,
doc[left_bound.i: right.i])):
break
else:
right_bound = right
return left_bound, right_bound
SYNTAX_ITERATORS = {
'noun_chunks': noun_chunks
}
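
A hedged sketch of how these iterators surface to users once registered on SpanishDefaults.syntax_iterators; it assumes a Spanish model with a dependency parser, which this commit does not ship.

import spacy

nlp = spacy.load('es')                       # hypothetical: parser-enabled Spanish model
doc = nlp(u'La casa blanca está en la colina.')
for np in doc.noun_chunks:                   # backed by the noun_chunks generator above
    print(np.text)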

View File

@ -6,37 +6,13 @@ from ...deprecated import PRON_LEMMA
_exc = {
"al": [
{ORTH: "a", LEMMA: "a", TAG: ADP},
{ORTH: "l", LEMMA: "el", TAG: DET}],
"consigo": [
{ORTH: "con", LEMMA: "con"},
{ORTH: "sigo", LEMMA: PRON_LEMMA, NORM: ""}],
"conmigo": [
{ORTH: "con", LEMMA: "con"},
{ORTH: "migo", LEMMA: PRON_LEMMA, NORM: ""}],
"contigo": [
{ORTH: "con", LEMMA: "con"},
{ORTH: "tigo", LEMMA: PRON_LEMMA, NORM: "ti"}],
"del": [
{ORTH: "de", LEMMA: "de", TAG: ADP},
{ORTH: "l", LEMMA: "el", TAG: DET}],
"pel": [
{ORTH: "pe", LEMMA: "per", TAG: ADP},
{ORTH: "l", LEMMA: "el", TAG: DET}],
"pal": [
{ORTH: "pa", LEMMA: "para"},
{ORTH: "l", LEMMA: "el"}],
{ORTH: "l", LEMMA: "el", NORM: "el"}],
"pala": [
{ORTH: "pa", LEMMA: "para"},
{ORTH: "la"}]
{ORTH: "la", LEMMA: "la", NORM: "la"}]
}

View File

@ -5,20 +5,24 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class FinnishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'fi'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
class Finnish(Language):
lang = 'fi'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'fi'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
Defaults = FinnishDefaults
__all__ = ['Finnish']

View File

@ -5,30 +5,36 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .lemmatizer import LOOKUP
from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG
from ...util import update_exc
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class FrenchDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'fr'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
infixes = tuple(TOKENIZER_INFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
token_match = TOKEN_MATCH
syntax_iterators = dict(SYNTAX_ITERATORS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
class French(Language):
lang = 'fr'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'fr'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
infixes = tuple(TOKENIZER_INFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
token_match = TOKEN_MATCH
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
Defaults = FrenchDefaults
__all__ = ['French']

File diff suppressed because it is too large

View File

@ -0,0 +1,42 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import NOUN, PROPN, PRON
def noun_chunks(obj):
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
doc = obj.doc # Ensure works on both Doc and Span.
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add('conj')
np_label = doc.vocab.strings.add('NP')
seen = set()
for i, word in enumerate(obj):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
if word.i in seen:
continue
if word.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
yield word.left_edge.i, word.right_edge.i+1, np_label
elif word.dep == conj:
head = word.head
while head.dep == conj and head.head.i < head.i:
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
yield word.left_edge.i, word.right_edge.i+1, np_label
SYNTAX_ITERATORS = {
'noun_chunks': noun_chunks
}
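
For reference, the iterator can also be exercised directly on a parsed Doc. The snippet below is illustrative only; nlp stands in for a hypothetical French pipeline with a parser.

# noun_chunks yields (start, end, label) offsets over the parsed Doc.
doc = nlp(u"La société a publié ses résultats annuels.")
for start, end, label in noun_chunks(doc):
    print(doc[start:end].text, doc.vocab.strings[label])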

View File

@ -9,15 +9,17 @@ from ...attrs import LANG
from ...util import update_exc
class HebrewDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'he'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = set(STOP_WORDS)
class Hebrew(Language):
lang = 'he'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'he'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = set(STOP_WORDS)
Defaults = HebrewDefaults
__all__ = ['Hebrew']

View File

@ -7,29 +7,33 @@ from .stop_words import STOP_WORDS
from .lemmatizer import LOOKUP
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG
from ...util import update_exc
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class HungarianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'hu'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
prefixes = tuple(TOKENIZER_PREFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
infixes = tuple(TOKENIZER_INFIXES)
token_match = TOKEN_MATCH
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
class Hungarian(Language):
lang = 'hu'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'hu'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
prefixes = tuple(TOKENIZER_PREFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
infixes = tuple(TOKENIZER_INFIXES)
token_match = TOKEN_MATCH
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
Defaults = HungarianDefaults
__all__ = ['Hungarian']

View File

@ -1,18 +1,18 @@
# coding: utf8
from __future__ import unicode_literals
from ..punctuation import TOKENIZER_INFIXES
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
from ..char_classes import QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
LIST_ICONS = [r'[\p{So}--[°]]']
_currency = r'\$|¢|£|€|¥|฿'
_quotes = QUOTES.replace("'", '')
_prefixes = ([r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
[r'[,.:](?=[{a}])'.format(a=ALPHA)])
_prefixes = ([r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES)
_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
[r'(?<=[0-9])\+',
r'(?<=°[FfCcKk])\.',
r'(?<=[0-9])(?:{})'.format(_currency),
@ -20,16 +20,14 @@ _suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
r'(?<=[{}{}{}(?:{})])\.'.format(ALPHA_LOWER, r'%²\-\)\]\+', QUOTES, _currency),
r'(?<=[{})])-e'.format(ALPHA_LOWER)])
_infixes = (LIST_ELLIPSES +
_infixes = (LIST_ELLIPSES + LIST_ICONS +
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])'.format(a=ALPHA, q=_quotes)])
TOKENIZER_PREFIXES = _prefixes
TOKENIZER_SUFFIXES = _suffixes
TOKENIZER_INFIXES = _infixes

View File

@ -5,25 +5,29 @@ from .stop_words import STOP_WORDS
from .lemmatizer import LOOKUP
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG
from ...util import update_exc
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class ItalianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'it'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = set(STOP_WORDS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
class Italian(Language):
lang = 'it'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'it'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = set(STOP_WORDS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
Defaults = ItalianDefaults
__all__ = ['Italian']

View File

@ -125,7 +125,7 @@ def word_shape(text):
LEX_ATTRS = {
attrs.LOWER: lambda string: string.lower(),
attrs.NORM: lambda string: string,
attrs.NORM: lambda string: string.lower(),
attrs.PREFIX: lambda string: string[0],
attrs.SUFFIX: lambda string: string[-3:],
attrs.CLUSTER: lambda string: 0,

View File

@ -6,20 +6,24 @@ from .stop_words import STOP_WORDS
from .morph_rules import MORPH_RULES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class NorwegianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'nb'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
class Norwegian(Language):
lang = 'nb'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'nb'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
Defaults = NorwegianDefaults
__all__ = ['Norwegian']

View File

@ -4,21 +4,24 @@ from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class DutchDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'nl'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = set(STOP_WORDS)
class Dutch(Language):
lang = 'nl'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'nl'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = set(STOP_WORDS)
Defaults = DutchDefaults
__all__ = ['Dutch']

View File

@ -0,0 +1,46 @@
# coding: utf8
from __future__ import unicode_literals
# These exceptions are used to add NORM values based on a token's ORTH value.
# Individual languages can also add their own exceptions and overwrite them -
# for example, British vs. American spelling in English.
# Norms are only set if no alternative is provided in the tokenizer exceptions.
# Note that this does not change any other token attributes. Its main purpose
# is to normalise the word representations so that equivalent tokens receive
# similar representations. For example: $ and € are very different, but they're
# both currency symbols. By normalising currency symbols to $, all symbols are
# seen as similar, no matter how common they are in the training data.
BASE_NORMS = {
"'s": "'s",
"'S": "'s",
"s": "'s",
"S": "'s",
"": "'",
"": "'",
"´": "'",
"`": "'",
"": '"',
"": '"',
"''": '"',
"``": '"',
"´´": '"',
"": '"',
"»": '"',
"«": '"',
"": "...",
"": "-",
"": "-",
"--": "-",
"---": "-",
"": "$",
"£": "$",
"¥": "$",
"฿": "$",
"US$": "$",
"C$": "$",
"A$": "$"
}
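
The comment above describes the intent; the sketch below approximates what add_lookups does with these tables. It is an illustration, not the actual spacy.util implementation.

def add_lookups(default_getter, *lookups):
    # Check each lookup table first, then fall back to the default getter.
    def getter(string):
        for lookup in lookups:
            if string in lookup:
                return lookup[string]
        return default_getter(string)
    return getter

norm = add_lookups(lambda string: string.lower(), BASE_NORMS)
assert norm(u"US$") == "$"
assert norm(u"Coffee") == "coffee"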

View File

@ -1,23 +1,28 @@
# coding: utf8
from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class PolishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'pl'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
class Polish(Language):
lang = 'pl'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'pl'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = set(STOP_WORDS)
Defaults = PolishDefaults
__all__ = ['Polish']

View File

@ -0,0 +1,23 @@
# encoding: utf8
from __future__ import unicode_literals
from ...symbols import ORTH, LEMMA, POS, ADV, NOUN, ADJ
_exc = {}
for exc_data in [
{ORTH: "m.in.", LEMMA: "między innymi", POS: ADV},
{ORTH: "inż.", LEMMA: "inżynier", POS: NOUN},
{ORTH: "mgr.", LEMMA: "magister", POS: NOUN},
{ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
{ORTH: "tj.", LEMMA: "to jest", POS: ADV},
{ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]:
_exc[exc_data[ORTH]] = [dict(exc_data)]
for orth in [
"w.", "r."]:
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = dict(_exc)

View File

@ -7,26 +7,30 @@ from .lex_attrs import LEX_ATTRS
from .lemmatizer import LOOKUP
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG
from ...util import update_exc
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class PortugueseDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'pt'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
class Portuguese(Language):
lang = 'pt'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'pt'
lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
Defaults = PortugueseDefaults
__all__ = ['Portuguese']

View File

@ -2,15 +2,16 @@
from __future__ import unicode_literals
from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
from .char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
from .char_classes import CURRENCY, UNITS
from .char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
from .char_classes import QUOTES, CURRENCY, UNITS
_prefixes = (['§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
LIST_CURRENCY)
LIST_CURRENCY + LIST_ICONS)
_suffixes = (["'s", "'S", "s", "S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
["'s", "'S", "s", "S"] +
[r'(?<=[0-9])\+',
r'(?<=°[FfCcKk])\.',
r'(?<=[0-9])(?:{})'.format(CURRENCY),
@ -19,7 +20,7 @@ _suffixes = (["'s", "'S", "s", "S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QU
r'(?<=[{a}][{a}])\.'.format(a=ALPHA_UPPER)])
_infixes = (LIST_ELLIPSES +
_infixes = (LIST_ELLIPSES + LIST_ICONS +
[r'(?<=[0-9])[+\-\*^](?=[0-9-])',
r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),

View File

@ -7,25 +7,29 @@ from .morph_rules import MORPH_RULES
from .lemmatizer import LEMMA_RULES, LOOKUP
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG
from ...util import update_exc
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class SwedishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'sv'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
class Swedish(Language):
lang = 'sv'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'sv'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
Defaults = SwedishDefaults
__all__ = ['Swedish']

28
spacy/lang/xx/__init__.py Normal file
View File

@ -0,0 +1,28 @@
# coding: utf8
from __future__ import unicode_literals
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class MultiLanguageDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'xx'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
class MultiLanguage(Language):
"""Language class to be used for models that support multiple languages.
This module allows models to specify their language ID as 'xx'.
"""
lang = 'xx'
Defaults = MultiLanguageDefaults
__all__ = ['MultiLanguage']
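
Quick illustrative usage (not from this commit): building a language-neutral pipeline object for multi-language models.

from spacy.lang.xx import MultiLanguage

nlp = MultiLanguage()
doc = nlp.make_doc(u'spaCy ist großartig!')
print(doc.vocab.lang)   # 'xx'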

View File

@ -15,6 +15,7 @@ class Chinese(Language):
raise ImportError("The Chinese tokenizer requires the Jieba library: "
"https://github.com/fxsjy/jieba")
words = list(jieba.cut(text, cut_all=True))
words=[x for x in words if x]
return Doc(self.vocab, words=words, spaces=[False]*len(words))
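
An illustrative call, assuming Jieba is installed and the class lives at spacy.lang.zh; the tokenizer simply delegates word segmentation to jieba.cut as shown above.

from spacy.lang.zh import Chinese

nlp = Chinese()                        # requires the jieba package
doc = nlp.make_doc(u'我爱自然语言处理')
print([w.text for w in doc])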

View File

@ -6,23 +6,34 @@ import dill
import numpy
from thinc.neural import Model
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.optimizers import Adam, SGD
import random
import ujson
from collections import OrderedDict
import itertools
from .tokenizer import Tokenizer
from .vocab import Vocab
from .tagger import Tagger
from .lemmatizer import Lemmatizer
from .train import Trainer
from .syntax.parser import get_templates
from .syntax.nonproj import PseudoProjectivity
from .syntax import nonproj
from .pipeline import NeuralDependencyParser, EntityRecognizer
from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer
from .compat import json_dumps
from .pipeline import NeuralLabeller
from .pipeline import SimilarityHook
from .pipeline import TextCategorizer
from . import about
from .compat import json_dumps, izip
from .attrs import IS_STOP
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .lang.tokenizer_exceptions import TOKEN_MATCH
from .lang.tag_map import TAG_MAP
from .lang.lex_attrs import LEX_ATTRS
from . import util
from .scorer import Scorer
class BaseDefaults(object):
@ -80,21 +91,35 @@ class BaseDefaults(object):
return NeuralEntityRecognizer(nlp.vocab, **cfg)
@classmethod
def create_pipeline(cls, nlp=None):
def create_pipeline(cls, nlp=None, disable=tuple()):
meta = nlp.meta if nlp is not None else {}
# Resolve strings, like "cnn", "lstm", etc
pipeline = []
for entry in cls.pipeline:
if entry in disable or getattr(entry, 'name', entry) in disable:
continue
factory = cls.Defaults.factories[entry]
pipeline.append(factory(nlp, **meta.get(entry, {})))
return pipeline
factories = {
'make_doc': create_tokenizer,
'token_vectors': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg),
'tags': lambda nlp, **cfg: NeuralTagger(nlp.vocab, **cfg),
'dependencies': lambda nlp, **cfg: NeuralDependencyParser(nlp.vocab, **cfg),
'entities': lambda nlp, **cfg: NeuralEntityRecognizer(nlp.vocab, **cfg),
'tensorizer': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)],
'tagger': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)],
'parser': lambda nlp, **cfg: [
NeuralDependencyParser(nlp.vocab, **cfg),
nonproj.deprojectivize],
'ner': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
'similarity': lambda nlp, **cfg: [SimilarityHook(nlp.vocab, **cfg)],
'textcat': lambda nlp, **cfg: [TextCategorizer(nlp.vocab, **cfg)],
# Temporary compatibility -- delete after pivot
'token_vectors': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)],
'tags': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)],
'dependencies': lambda nlp, **cfg: [
NeuralDependencyParser(nlp.vocab, **cfg),
nonproj.deprojectivize,
],
'entities': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
}
token_match = TOKEN_MATCH
@ -112,19 +137,39 @@ class BaseDefaults(object):
lemma_index = {}
morph_rules = {}
lex_attr_getters = LEX_ATTRS
syntax_iterators = {}
class Language(object):
"""
A text-processing pipeline. Usually you'll load this once per process, and
pass the instance around your program.
"""A text-processing pipeline. Usually you'll load this once per process,
and pass the instance around your application.
Defaults (class): Settings, data and factory methods for creating the `nlp`
object and processing pipeline.
lang (unicode): Two-letter language ID, i.e. ISO code.
"""
Defaults = BaseDefaults
lang = None
def __init__(self, vocab=True, make_doc=True, pipeline=None, meta={}):
self.meta = dict(meta)
def __init__(self, vocab=True, make_doc=True, pipeline=None,
meta={}, disable=tuple(), **kwargs):
"""Initialise a Language object.
vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via
`Language.Defaults.create_vocab`.
make_doc (callable): A function that takes text and returns a `Doc`
object. Usually a `Tokenizer`.
pipeline (list): A list of annotation processes or IDs of annotation,
processes, e.g. a `Tagger` object, or `'tagger'`. IDs are looked
up in `Language.Defaults.factories`.
disable (list): A list of component names to exclude from the pipeline.
The disable list has priority over the pipeline list -- if the same
string occurs in both, the component is not loaded.
meta (dict): Custom meta data for the Language class. Is written to by
models to add model meta data.
RETURNS (Language): The newly constructed object.
"""
self._meta = dict(meta)
if vocab is True:
factory = self.Defaults.create_vocab
vocab = factory(self, **meta.get('vocab', {}))
@ -132,11 +177,15 @@ class Language(object):
if make_doc is True:
factory = self.Defaults.create_tokenizer
make_doc = factory(self, **meta.get('tokenizer', {}))
self.make_doc = make_doc
self.tokenizer = make_doc
if pipeline is True:
self.pipeline = self.Defaults.create_pipeline(self)
self.pipeline = self.Defaults.create_pipeline(self, disable)
elif pipeline:
self.pipeline = list(pipeline)
# Careful not to do getattr(p, 'name', None) here
# If we had disable=[None], we'd disable everything!
self.pipeline = [p for p in pipeline
if p not in disable
and getattr(p, 'name', p) not in disable]
# Resolve strings, like "cnn", "lstm", etc
for i, entry in enumerate(self.pipeline):
if entry in self.Defaults.factories:
@ -144,82 +193,224 @@ class Language(object):
self.pipeline[i] = factory(self, **meta.get(entry, {}))
else:
self.pipeline = []
flat_list = []
for pipe in self.pipeline:
if isinstance(pipe, list):
flat_list.extend(pipe)
else:
flat_list.append(pipe)
self.pipeline = flat_list
def __call__(self, text, state=None, **disabled):
"""
Apply the pipeline to some text. The text can span multiple sentences,
and can contain arbtrary whitespace. Alignment into the original string
@property
def meta(self):
self._meta.setdefault('lang', self.vocab.lang)
self._meta.setdefault('name', '')
self._meta.setdefault('version', '0.0.0')
self._meta.setdefault('spacy_version', about.__version__)
self._meta.setdefault('description', '')
self._meta.setdefault('author', '')
self._meta.setdefault('email', '')
self._meta.setdefault('url', '')
self._meta.setdefault('license', '')
pipeline = []
for component in self.pipeline:
if hasattr(component, 'name'):
pipeline.append(component.name)
self._meta['pipeline'] = pipeline
return self._meta
@meta.setter
def meta(self, value):
self._meta = value
# Conveniences to access pipeline components
@property
def tensorizer(self):
return self.get_component('tensorizer')
@property
def tagger(self):
return self.get_component('tagger')
@property
def parser(self):
return self.get_component('parser')
@property
def entity(self):
return self.get_component('ner')
@property
def matcher(self):
return self.get_component('matcher')
def get_component(self, name):
if self.pipeline in (True, None):
return None
for proc in self.pipeline:
if hasattr(proc, 'name') and proc.name.endswith(name):
return proc
return None
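
Illustrative usage of the new accessors; assumes an installed model whose pipeline actually contains these components.

import spacy

nlp = spacy.load('en')                       # hypothetical: any model with tagger/parser/ner
print(nlp.tagger, nlp.parser, nlp.entity)    # conveniences over get_component()
assert nlp.get_component('ner') is nlp.entity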
def __call__(self, text, disable=[]):
"""'Apply the pipeline to some text. The text can span multiple sentences,
and can contain arbtrary whitespace. Alignment into the original string
is preserved.
Args:
text (unicode): The text to be processed.
state: Arbitrary
text (unicode): The text to be processed.
disable (list): Names of the pipeline components to disable.
RETURNS (Doc): A container for accessing the annotations.
Returns:
doc (Doc): A container for accessing the annotations.
Example:
>>> from spacy.en import English
>>> nlp = English()
EXAMPLE:
>>> tokens = nlp('An example sentence. Another example sentence.')
>>> tokens[0].orth_, tokens[0].head.tag_
>>> tokens[0].text, tokens[0].head.tag_
('An', 'NN')
"""
doc = self.make_doc(text)
for proc in self.pipeline:
name = getattr(proc, 'name', None)
if name in disabled and not disabled[name]:
if name in disable:
continue
state = proc(doc, state=state)
doc = proc(doc)
return doc
def update(self, docs, golds, state=None, drop=0., sgd=None):
def make_doc(self, text):
return self.tokenizer(text)
def update(self, docs, golds, drop=0., sgd=None, losses=None,
update_tensors=False):
"""Update the models in the pipeline.
docs (iterable): A batch of `Doc` objects.
golds (iterable): A batch of `GoldParse` objects.
drop (float): The dropout rate.
sgd (callable): An optimizer.
losses (dict): Optional dictionary to record losses in, keyed by component name.
EXAMPLE:
>>> optimizer = nlp.begin_training(get_gold_tuples)
>>> for docs, golds in get_batches():
>>>     nlp.update(docs, golds, drop=0.2, sgd=optimizer)
"""
if len(docs) != len(golds):
raise IndexError("Update expects same number of docs and golds "
"Got: %d, %d" % (len(docs), len(golds)))
if len(docs) == 0:
return
tok2vec = self.pipeline[0]
feats = tok2vec.doc2feats(docs)
grads = {}
def get_grads(W, dW, key=None):
grads[key] = (W, dW)
state = {} if state is None else state
for process in self.pipeline:
if hasattr(process, 'update'):
state = process.update(docs, golds,
state=state,
drop=drop,
sgd=get_grads)
else:
process(docs, state=state)
if sgd is not None:
for key, (W, dW) in grads.items():
# TODO: Unhack this when thinc improves
if isinstance(W, numpy.ndarray):
sgd.ops = NumpyOps()
else:
sgd.ops = CupyOps()
sgd(W, dW, key=key)
return state
pipes = list(self.pipeline[1:])
random.shuffle(pipes)
for proc in pipes:
if not hasattr(proc, 'update'):
continue
tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
d_tokvecses = proc.update((docs, tokvecses), golds,
drop=drop, sgd=get_grads, losses=losses)
if update_tensors and d_tokvecses is not None:
bp_tokvecses(d_tokvecses, sgd=sgd)
for key, (W, dW) in grads.items():
sgd(W, dW, key=key)
# Clear the tensor variable, to free GPU memory.
# If we don't do this, the memory leak gets pretty
# bad, because we may be holding part of a batch.
for doc in docs:
doc.tensor = None
@contextmanager
def begin_training(self, gold_tuples, **cfg):
def preprocess_gold(self, docs_golds):
"""Can be called before training to pre-process gold data. By default,
it handles nonprojectivity and adds missing tags to the tag map.
docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects.
YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects.
"""
for proc in self.pipeline:
if hasattr(proc, 'preprocess_gold'):
docs_golds = proc.preprocess_gold(docs_golds)
for doc, gold in docs_golds:
yield doc, gold
def begin_training(self, get_gold_tuples, **cfg):
"""Allocate models, pre-process training data and acquire a trainer and
optimizer. Used as a contextmanager.
gold_tuples (iterable): Gold-standard training data.
**cfg: Config parameters.
YIELDS (tuple): A trainer and an optimizer.
EXAMPLE:
>>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
>>> for epoch in trainer.epochs(gold):
>>> for docs, golds in epoch:
>>> state = nlp.update(docs, golds, sgd=optimizer)
"""
if self.parser:
self.pipeline.append(NeuralLabeller(self.vocab))
# Populate vocab
for _, annots_brackets in gold_tuples:
for _, annots_brackets in get_gold_tuples():
for annots, _ in annots_brackets:
for word in annots[1]:
_ = self.vocab[word]
# Handle crossing dependencies
gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)
contexts = []
if cfg.get('use_gpu'):
if cfg.get('device', -1) >= 0:
import cupy.cuda.device
device = cupy.cuda.device.Device(cfg['device'])
device.use()
Model.ops = CupyOps()
Model.Ops = CupyOps
print("Use GPU")
else:
device = None
for proc in self.pipeline:
if hasattr(proc, 'begin_training'):
context = proc.begin_training(gold_tuples,
context = proc.begin_training(get_gold_tuples(),
pipeline=self.pipeline)
contexts.append(context)
trainer = Trainer(self, gold_tuples, **cfg)
yield trainer, trainer.optimizer
learn_rate = util.env_opt('learn_rate', 0.001)
beta1 = util.env_opt('optimizer_B1', 0.9)
beta2 = util.env_opt('optimizer_B2', 0.999)
eps = util.env_opt('optimizer_eps', 1e-08)
L2 = util.env_opt('L2_penalty', 1e-6)
max_grad_norm = util.env_opt('grad_norm_clip', 1.)
optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
beta2=beta2, eps=eps)
optimizer.max_grad_norm = max_grad_norm
optimizer.device = device
return optimizer
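
A hedged sketch of the training flow implied by the code above, where begin_training now returns an optimizer rather than acting as a contextmanager; get_gold_tuples and make_batches are hypothetical placeholders for the caller's data loading.

optimizer = nlp.begin_training(get_gold_tuples, device=-1)    # device >= 0 would select a GPU
for epoch in range(10):
    losses = {}
    for docs, golds in make_batches(get_gold_tuples()):       # hypothetical batching helper
        nlp.update(docs, golds, drop=0.2, sgd=optimizer, losses=losses)
    print(epoch, losses)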
def evaluate(self, docs_golds):
scorer = Scorer()
docs, golds = zip(*docs_golds)
docs = list(docs)
golds = list(golds)
for pipe in self.pipeline:
if not hasattr(pipe, 'pipe'):
for doc in docs:
pipe(doc)
else:
docs = list(pipe.pipe(docs))
assert len(docs) == len(golds)
for doc, gold in zip(docs, golds):
scorer.score(doc, gold)
doc.tensor = None
return scorer
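
Illustrative scoring of held-out data; dev_data is a hypothetical list of (text, entity offsets) pairs and the printed fields are spaCy's standard Scorer metrics.

from spacy.gold import GoldParse

docs_golds = []
for text, entities in dev_data:                    # hypothetical held-out data
    doc = nlp.make_doc(text)
    docs_golds.append((doc, GoldParse(doc, entities=entities)))
scorer = nlp.evaluate(docs_golds)
print(scorer.scores)                               # token accuracy, UAS/LAS, NER P/R/F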
@contextmanager
def use_params(self, params, **cfg):
"""Replace weights of models in the pipeline with those provided in the
params dictionary. Can be used as a contextmanager, in which case,
models go back to their original weights after the block.
params (dict): A dictionary of parameters keyed by model ID.
**cfg: Config parameters.
EXAMPLE:
>>> with nlp.use_params(optimizer.averages):
>>> nlp.to_disk('/tmp/checkpoint')
"""
contexts = [pipe.use_params(params) for pipe
in self.pipeline if hasattr(pipe, 'use_params')]
# TODO: Having trouble with contextlib
@ -236,98 +427,149 @@ class Language(object):
except StopIteration:
pass
def pipe(self, texts, n_threads=2, batch_size=1000, **disabled):
"""
Process texts as a stream, and yield Doc objects in order.
def pipe(self, texts, tuples=False, n_threads=2, batch_size=1000, disable=[]):
"""Process texts as a stream, and yield `Doc` objects in order. Supports
GIL-free multi-threading.
Supports GIL-free multi-threading.
texts (iterator): A sequence of texts to process.
n_threads (int): The number of worker threads to use. If -1, OpenMP will
decide how many to use at run time. Default is 2.
batch_size (int): The number of texts to buffer.
disable (list): Names of the pipeline components to disable.
YIELDS (Doc): Documents in the order of the original text.
Arguments:
texts (iterator)
tag (bool)
parse (bool)
entity (bool)
EXAMPLE:
>>> texts = [u'One document.', u'...', u'Lots of documents']
>>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
>>> assert doc.is_parsed
"""
#stream = ((self.make_doc(text), None) for text in texts)
stream = ((doc, {}) for doc in texts)
if tuples:
text_context1, text_context2 = itertools.tee(texts)
texts = (tc[0] for tc in text_context1)
contexts = (tc[1] for tc in text_context2)
docs = self.pipe(texts, n_threads=n_threads, batch_size=batch_size,
disable=disable)
for doc, context in izip(docs, contexts):
yield (doc, context)
return
docs = (self.make_doc(text) for text in texts)
for proc in self.pipeline:
name = getattr(proc, 'name', None)
if name in disabled and not disabled[name]:
if name in disable:
continue
if hasattr(proc, 'pipe'):
stream = proc.pipe(stream, n_threads=n_threads, batch_size=batch_size)
docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size)
else:
stream = (proc(doc, state) for doc, state in stream)
for doc, state in stream:
# Apply the function, but yield the doc
docs = _pipe(proc, docs)
for doc in docs:
yield doc
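
Illustrative usage of the new tuples mode, which threads arbitrary context objects through pipe alongside each text.

data = [(u'First text.', {'id': 1}),
        (u'Second text.', {'id': 2})]
for doc, context in nlp.pipe(data, tuples=True, batch_size=50, n_threads=4):
    print(context['id'], doc[0].text)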
def to_disk(self, path, **exclude):
"""Save the current state to a directory.
def to_disk(self, path, disable=tuple()):
"""Save the current state to a directory. If a model is loaded, this
will include the model.
Args:
path: A path to a directory, which will be created if it doesn't
exist. Paths may be either strings or pathlib.Path-like
objects.
**exclude: Prevent named attributes from being saved.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects.
disable (list): Names of pipeline components to disable and prevent
from being saved.
EXAMPLE:
>>> nlp.to_disk('/path/to/models')
"""
path = util.ensure_path(path)
if not path.exists():
path.mkdir()
if not path.is_dir():
raise IOError("Output path must be a directory")
props = {}
for name, value in self.__dict__.items():
if name in exclude:
serializers = OrderedDict((
('vocab', lambda p: self.vocab.to_disk(p)),
('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)),
('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
))
for proc in self.pipeline:
if not hasattr(proc, 'name'):
continue
if hasattr(value, 'to_disk'):
value.to_disk(path / name)
else:
props[name] = value
with (path / 'props.pickle').open('wb') as file_:
dill.dump(props, file_)
if proc.name in disable:
continue
if not hasattr(proc, 'to_disk'):
continue
serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False)
util.to_disk(path, serializers, {p: False for p in disable})
def from_disk(self, path, **exclude):
"""Load the current state from a directory.
def from_disk(self, path, disable=tuple()):
"""Loads state from a directory. Modifies the object in place and
returns it. If the saved `Language` object contains a model, the
model will be loaded.
Args:
path: A path to a directory. Paths may be either strings or
pathlib.Path-like objects.
**exclude: Prevent named attributes from being saved.
path (unicode or Path): A path to a directory. Paths may be either
strings or `Path`-like objects.
disable (list): Names of the pipeline components to disable.
RETURNS (Language): The modified `Language` object.
EXAMPLE:
>>> from spacy.language import Language
>>> nlp = Language().from_disk('/path/to/models')
"""
path = util.ensure_path(path)
for name in path.iterdir():
if name not in exclude and hasattr(self, str(name)):
getattr(self, name).from_disk(path / name)
with (path / 'props.pickle').open('rb') as file_:
bytes_data = file_.read()
self.from_bytes(bytes_data, **exclude)
deserializers = OrderedDict((
('vocab', lambda p: self.vocab.from_disk(p)),
('tokenizer', lambda p: self.tokenizer.from_disk(p, vocab=False)),
('meta.json', lambda p: self.meta.update(ujson.load(p.open('r'))))
))
for proc in self.pipeline:
if not hasattr(proc, 'name'):
continue
if proc.name in disable:
continue
if not hasattr(proc, 'to_disk'):
continue
deserializers[proc.name] = lambda p, proc=proc: proc.from_disk(p, vocab=False)
exclude = {p: False for p in disable}
if not (path / 'vocab').exists():
exclude['vocab'] = True
util.from_disk(path, deserializers, exclude)
return self
def to_bytes(self, **exclude):
def to_bytes(self, disable=[]):
"""Serialize the current state to a binary string.
Args:
path: A path to a directory. Paths may be either strings or
pathlib.Path-like objects.
**exclude: Prevent named attributes from being serialized.
disable (list): Names of pipeline components to disable and prevent
from being serialized.
RETURNS (bytes): The serialized form of the `Language` object.
"""
props = dict(self.__dict__)
for key in exclude:
if key in props:
props.pop(key)
return dill.dumps(props, -1)
serializers = OrderedDict((
('vocab', lambda: self.vocab.to_bytes()),
('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)),
('meta', lambda: ujson.dumps(self.meta))
))
for i, proc in enumerate(self.pipeline):
if getattr(proc, 'name', None) in disable:
continue
if not hasattr(proc, 'to_bytes'):
continue
serializers[i] = lambda proc=proc: proc.to_bytes(vocab=False)
return util.to_bytes(serializers, {})
def from_bytes(self, bytes_data, **exclude):
def from_bytes(self, bytes_data, disable=[]):
"""Load state from a binary string.
Args:
bytes_data (bytes): The data to load from.
**exclude: Prevent named attributes from being loaded.
bytes_data (bytes): The data to load from.
disable (list): Names of the pipeline components to disable.
RETURNS (Language): The `Language` object.
"""
props = dill.loads(bytes_data)
for key, value in props.items():
if key not in exclude:
setattr(self, key, value)
deserializers = OrderedDict((
('vocab', lambda b: self.vocab.from_bytes(b)),
('tokenizer', lambda b: self.tokenizer.from_bytes(b, vocab=False)),
('meta', lambda b: self.meta.update(ujson.loads(b)))
))
for i, proc in enumerate(self.pipeline):
if getattr(proc, 'name', None) in disable:
continue
if not hasattr(proc, 'from_bytes'):
continue
deserializers[i] = lambda b, proc=proc: proc.from_bytes(b, vocab=False)
msg = util.from_bytes(bytes_data, deserializers, {})
return self
def _pipe(func, docs):
for doc in docs:
func(doc)
yield doc
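
An illustrative round trip with the new byte-level serializers; the receiving object must be created from a compatible Language subclass, and component coverage depends on what the pipeline implements.

from spacy.lang.en import English

nlp = English()
bytes_data = nlp.to_bytes()
nlp2 = English()
nlp2.from_bytes(bytes_data)
assert nlp2.meta['lang'] == 'en'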

View File

@ -27,7 +27,7 @@ cdef class Lexeme:
cdef inline SerializedLexemeC c_to_bytes(const LexemeC* lex) nogil:
cdef SerializedLexemeC lex_data
buff = <const unsigned char*>&lex.flags
end = <const unsigned char*>&lex.l2_norm + sizeof(lex.l2_norm)
end = <const unsigned char*>&lex.sentiment + sizeof(lex.sentiment)
for i in range(sizeof(lex_data.data)):
lex_data.data[i] = buff[i]
return lex_data
@ -35,7 +35,7 @@ cdef class Lexeme:
@staticmethod
cdef inline void c_from_bytes(LexemeC* lex, SerializedLexemeC lex_data) nogil:
buff = <unsigned char*>&lex.flags
end = <unsigned char*>&lex.l2_norm + sizeof(lex.l2_norm)
end = <unsigned char*>&lex.sentiment + sizeof(lex.sentiment)
for i in range(sizeof(lex_data.data)):
buff[i] = lex_data.data[i]

View File

@ -30,19 +30,16 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
cdef class Lexeme:
"""
An entry in the vocabulary. A Lexeme has no string context --- it's a
"""An entry in the vocabulary. A `Lexeme` has no string context it's a
word-type, as opposed to a word token. It therefore has no part-of-speech
tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
tag).
"""
def __init__(self, Vocab vocab, int orth):
"""
Create a Lexeme object.
def __init__(self, Vocab vocab, attr_t orth):
"""Create a Lexeme object.
Arguments:
vocab (Vocab): The parent vocabulary
orth (int): The orth id of the lexeme.
vocab (Vocab): The parent vocabulary
orth (uint64): The orth id of the lexeme.
Returns (Lexeme): The newly constructed object.
"""
self.vocab = vocab
@ -54,7 +51,7 @@ cdef class Lexeme:
if isinstance(other, Lexeme):
a = self.orth
b = other.orth
elif isinstance(other, int):
elif isinstance(other, long):
a = self.orth
b = other
elif isinstance(other, str):
@ -82,35 +79,28 @@ cdef class Lexeme:
return self.c.orth
def set_flag(self, attr_id_t flag_id, bint value):
"""
Change the value of a boolean flag.
"""Change the value of a boolean flag.
Arguments:
flag_id (int): The attribute ID of the flag to set.
value (bool): The new value of the flag.
flag_id (int): The attribute ID of the flag to set.
value (bool): The new value of the flag.
"""
Lexeme.c_set_flag(self.c, flag_id, value)
def check_flag(self, attr_id_t flag_id):
"""
Check the value of a boolean flag.
"""Check the value of a boolean flag.
Arguments:
flag_id (int): The attribute ID of the flag to query.
Returns (bool): The value of the flag.
flag_id (int): The attribute ID of the flag to query.
RETURNS (bool): The value of the flag.
"""
return True if Lexeme.c_check_flag(self.c, flag_id) else False
def similarity(self, other):
"""
Compute a semantic similarity estimate. Defaults to cosine over vectors.
"""Compute a semantic similarity estimate. Defaults to cosine over
vectors.
Arguments:
other:
The object to compare with. By default, accepts Doc, Span,
Token and Lexeme objects.
Returns:
score (float): A scalar similarity score. Higher is more similar.
other (object): The object to compare with. By default, accepts `Doc`,
`Span`, `Token` and `Lexeme` objects.
RETURNS (float): A scalar similarity score. Higher is more similar.
"""
if self.vector_norm == 0 or other.vector_norm == 0:
return 0.0
@ -119,7 +109,7 @@ cdef class Lexeme:
def to_bytes(self):
lex_data = Lexeme.c_to_bytes(self.c)
start = <const char*>&self.c.flags
end = <const char*>&self.c.l2_norm + sizeof(self.c.l2_norm)
end = <const char*>&self.c.sentiment + sizeof(self.c.sentiment)
assert (end-start) == sizeof(lex_data.data), (end-start, sizeof(lex_data.data))
byte_string = b'\0' * sizeof(lex_data.data)
byte_chars = <char*>byte_string
@ -140,22 +130,29 @@ cdef class Lexeme:
self.orth = self.c.orth
property has_vector:
"""A boolean value indicating whether a word vector is associated with
the object.
RETURNS (bool): Whether a word vector is associated with the object.
"""
def __get__(self):
cdef int i
for i in range(self.vocab.vectors_length):
if self.c.vector[i] != 0:
return True
else:
return False
return self.vocab.has_vector(self.c.orth)
property vector_norm:
def __get__(self):
return self.c.l2_norm
"""The L2 norm of the lexeme's vector representation.
def __set__(self, float value):
self.c.l2_norm = value
RETURNS (float): The L2 norm of the vector representation.
"""
def __get__(self):
vector = self.vector
return numpy.sqrt((vector**2).sum())
property vector:
"""A real-valued meaning representation.
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
representing the lexeme's semantics.
"""
def __get__(self):
cdef int length = self.vocab.vectors_length
if length == 0:
@ -165,27 +162,16 @@ cdef class Lexeme:
"model doesn't include word vectors. For more info, see "
"the documentation: \n%s\n" % about.__docs_models__
)
vector_view = <float[:length,]>self.c.vector
return numpy.asarray(vector_view)
return self.vocab.get_vector(self.c.orth)
def __set__(self, vector):
assert len(vector) == self.vocab.vectors_length
cdef float value
cdef double norm = 0.0
for i, value in enumerate(vector):
self.c.vector[i] = value
norm += value * value
self.c.l2_norm = sqrt(norm)
self.vocab.set_vector(self.c.orth, vector)
property rank:
def __get__(self):
return self.c.id
property repvec:
def __get__(self):
raise AttributeError("lex.repvec has been renamed to lex.vector")
property sentiment:
def __get__(self):
return self.c.sentiment
@ -196,33 +182,41 @@ cdef class Lexeme:
def __get__(self):
return self.vocab.strings[self.c.orth]
property text:
"""A unicode representation of the token text.
RETURNS (unicode): The original verbatim text of the token.
"""
def __get__(self):
return self.orth_
property lower:
def __get__(self): return self.c.lower
def __set__(self, int x): self.c.lower = x
def __set__(self, attr_t x): self.c.lower = x
property norm:
def __get__(self): return self.c.norm
def __set__(self, int x): self.c.norm = x
def __set__(self, attr_t x): self.c.norm = x
property shape:
def __get__(self): return self.c.shape
def __set__(self, int x): self.c.shape = x
def __set__(self, attr_t x): self.c.shape = x
property prefix:
def __get__(self): return self.c.prefix
def __set__(self, int x): self.c.prefix = x
def __set__(self, attr_t x): self.c.prefix = x
property suffix:
def __get__(self): return self.c.suffix
def __set__(self, int x): self.c.suffix = x
def __set__(self, attr_t x): self.c.suffix = x
property cluster:
def __get__(self): return self.c.cluster
def __set__(self, int x): self.c.cluster = x
def __set__(self, attr_t x): self.c.cluster = x
property lang:
def __get__(self): return self.c.lang
def __set__(self, int x): self.c.lang = x
def __set__(self, attr_t x): self.c.lang = x
property prob:
def __get__(self): return self.c.prob
@ -230,27 +224,27 @@ cdef class Lexeme:
property lower_:
def __get__(self): return self.vocab.strings[self.c.lower]
def __set__(self, unicode x): self.c.lower = self.vocab.strings[x]
def __set__(self, unicode x): self.c.lower = self.vocab.strings.add(x)
property norm_:
def __get__(self): return self.vocab.strings[self.c.norm]
def __set__(self, unicode x): self.c.norm = self.vocab.strings[x]
def __set__(self, unicode x): self.c.norm = self.vocab.strings.add(x)
property shape_:
def __get__(self): return self.vocab.strings[self.c.shape]
def __set__(self, unicode x): self.c.shape = self.vocab.strings[x]
def __set__(self, unicode x): self.c.shape = self.vocab.strings.add(x)
property prefix_:
def __get__(self): return self.vocab.strings[self.c.prefix]
def __set__(self, unicode x): self.c.prefix = self.vocab.strings[x]
def __set__(self, unicode x): self.c.prefix = self.vocab.strings.add(x)
property suffix_:
def __get__(self): return self.vocab.strings[self.c.suffix]
def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x]
def __set__(self, unicode x): self.c.suffix = self.vocab.strings.add(x)
property lang_:
def __get__(self): return self.vocab.strings[self.c.lang]
def __set__(self, unicode x): self.c.lang = self.vocab.strings[x]
def __set__(self, unicode x): self.c.lang = self.vocab.strings.add(x)
property flags:
def __get__(self): return self.c.flags
@ -258,7 +252,7 @@ cdef class Lexeme:
property is_oov:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_OOV)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_OOV, x)
def __set__(self, attr_t x): Lexeme.c_set_flag(self.c, IS_OOV, x)
property is_stop:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_STOP)
@ -308,7 +302,6 @@ cdef class Lexeme:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
property like_url:
def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_URL)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_URL, x)
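
Illustrative access to the attributes touched above; assumes a vocabulary built from the English defaults, with no statistical model loaded.

from spacy.lang.en import English
from spacy.attrs import IS_STOP

nlp = English()
lex = nlp.vocab[u'The']
print(lex.text, lex.lower_, lex.shape_)   # 'The', 'the', 'Xxx'
print(lex.check_flag(IS_STOP))            # lexical flag lookup
lex.norm_ = u'the'                        # writes through StringStore.add(), per the setters above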

View File

@ -87,7 +87,7 @@ ctypedef TokenPatternC* TokenPatternC_ptr
ctypedef pair[int, TokenPatternC_ptr] StateC
cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, attr_t label,
cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
object token_specs) except NULL:
pattern = <TokenPatternC*>mem.alloc(len(token_specs) + 1, sizeof(TokenPatternC))
cdef int i
@ -99,15 +99,21 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, attr_t label,
pattern[i].attrs[j].attr = attr
pattern[i].attrs[j].value = value
i = len(token_specs)
pattern[i].attrs = <AttrValueC*>mem.alloc(3, sizeof(AttrValueC))
pattern[i].attrs = <AttrValueC*>mem.alloc(2, sizeof(AttrValueC))
pattern[i].attrs[0].attr = ID
pattern[i].attrs[0].value = entity_id
pattern[i].attrs[1].attr = ENT_TYPE
pattern[i].attrs[1].value = label
pattern[i].nr_attr = 0
return pattern
cdef attr_t get_pattern_key(const TokenPatternC* pattern) except 0:
while pattern.nr_attr != 0:
pattern += 1
id_attr = pattern[0].attrs[0]
assert id_attr.attr == ID
return id_attr.value
cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
for attr in pattern.attrs[:pattern.nr_attr]:
if get_token_attr(token, attr.attr) != attr.value:
@ -148,7 +154,7 @@ def _convert_strings(token_specs, string_store):
if isinstance(attr, basestring):
attr = attrs.IDS.get(attr.upper())
if isinstance(value, basestring):
value = string_store[value]
value = string_store.add(value)
if isinstance(value, bool):
value = int(value)
if attr is not None:
@ -159,14 +165,14 @@ def _convert_strings(token_specs, string_store):
def merge_phrase(matcher, doc, i, matches):
'''Callback to merge a phrase on match'''
"""Callback to merge a phrase on match."""
ent_id, label, start, end = matches[i]
span = doc[start : end]
span.merge(ent_type=label, ent_id=ent_id)
cdef class Matcher:
'''Match sequences of tokens, based on pattern rules.'''
"""Match sequences of tokens, based on pattern rules."""
cdef Pool mem
cdef vector[TokenPatternC*] patterns
cdef readonly Vocab vocab
@ -175,37 +181,12 @@ cdef class Matcher:
cdef public object _callbacks
cdef public object _acceptors
@classmethod
def load(cls, path, vocab):
"""
Load the matcher and patterns from a file path.
def __init__(self, vocab):
"""Create the Matcher.
Arguments:
path (Path):
Path to a JSON-formatted patterns file.
vocab (Vocab):
The vocabulary that the documents to match over will refer to.
Returns:
Matcher: The newly constructed object.
"""
if (path / 'gazetteer.json').exists():
with (path / 'gazetteer.json').open('r', encoding='utf8') as file_:
patterns = ujson.load(file_)
else:
patterns = {}
return cls(vocab, patterns)
def __init__(self, vocab, patterns={}):
"""
Create the Matcher.
Arguments:
vocab (Vocab):
The vocabulary object, which must be shared with the documents
the matcher will operate on.
patterns (dict): Patterns to add to the matcher.
Returns:
The newly constructed object.
vocab (Vocab): The vocabulary object, which must be shared with the
documents the matcher will operate on.
RETURNS (Matcher): The newly constructed object.
"""
self._patterns = {}
self._entities = {}
@ -213,144 +194,111 @@ cdef class Matcher:
self._callbacks = {}
self.vocab = vocab
self.mem = Pool()
for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
self.add_entity(entity_key, attrs)
for spec in specs:
self.add_pattern(entity_key, spec, label=etype)
def __reduce__(self):
return (self.__class__, (self.vocab, self._patterns), None, None)
property n_patterns:
def __get__(self): return self.patterns.size()
def __len__(self):
"""Get the number of rules added to the matcher. Note that this only
returns the number of rules (identical with the number of IDs), not the
number of individual patterns.
def add_entity(self, entity_key, attrs=None, if_exists='raise',
acceptor=None, on_match=None):
RETURNS (int): The number of rules.
"""
Add an entity to the matcher.
return len(self._patterns)
Arguments:
entity_key (unicode or int):
An ID for the entity.
attrs:
Attributes to associate with the Matcher.
if_exists ('raise', 'ignore' or 'update'):
Controls what happens if the entity ID already exists. Defaults to 'raise'.
acceptor:
Callback function to filter matches of the entity.
on_match:
Callback function to act on matches of the entity.
Returns:
None
def __contains__(self, key):
"""Check whether the matcher contains rules for a match ID.
key (unicode): The match ID.
RETURNS (bool): Whether the matcher contains rules for this match ID.
"""
if if_exists not in ('raise', 'ignore', 'update'):
raise ValueError(
"Unexpected value for if_exists: %s.\n"
"Expected one of: ['raise', 'ignore', 'update']" % if_exists)
if attrs is None:
attrs = {}
entity_key = self.normalize_entity_key(entity_key)
if self.has_entity(entity_key):
if if_exists == 'raise':
raise KeyError(
"Tried to add entity %s. Entity exists, and if_exists='raise'.\n"
"Set if_exists='ignore' or if_exists='update', or check with "
"matcher.has_entity()")
elif if_exists == 'ignore':
return
self._entities[entity_key] = dict(attrs)
self._patterns.setdefault(entity_key, [])
self._acceptors[entity_key] = acceptor
self._callbacks[entity_key] = on_match
return len(self._patterns)
def add_pattern(self, entity_key, token_specs, label=""):
def add(self, key, on_match, *patterns):
"""Add a match-rule to the matcher.
A match-rule consists of: an ID key, an on_match callback, and one or
more patterns. If the key exists, the patterns are appended to the
previous ones, and the previous on_match callback is replaced. The
`on_match` callback will receive the arguments `(matcher, doc, i,
matches)`. You can also set `on_match` to `None` to not perform any
actions. A pattern consists of one or more `token_specs`, where a
`token_spec` is a dictionary mapping attribute IDs to values. Token
descriptors can also include quantifiers. There are currently important
known problems with the quantifiers; see the docs.
"""
Add a pattern to the matcher.
for pattern in patterns:
if len(pattern) == 0:
msg = ("Cannot add pattern for zero tokens to matcher.\n"
"key: {key}\n")
raise ValueError(msg.format(key=key))
key = self._normalize_key(key)
self._patterns.setdefault(key, [])
self._callbacks[key] = on_match
Arguments:
entity_key (unicode or int):
An ID for the entity.
token_specs:
Description of the pattern to be matched.
label:
Label to assign to the matched pattern. Defaults to "".
Returns:
None
for pattern in patterns:
specs = _convert_strings(pattern, self.vocab.strings)
self.patterns.push_back(init_pattern(self.mem, key, specs))
self._patterns[key].append(specs)
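# Usage sketch for the new add()/__call__ API above (an illustration, not
# part of this changeset). Assumes a loaded pipeline `nlp`; the rule name
# and pattern are made up.
import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en')
matcher = Matcher(nlp.vocab)

def on_match(matcher, doc, i, matches):
    # Called once per match; each match describes a span doc[start:end].
    print('Matched:', matches[i])

# One rule ID, one callback, one or more token-spec patterns.
matcher.add('HELLO_WORLD', on_match,
            [{'LOWER': 'hello'}, {'LOWER': 'world'}])
doc = nlp(u'Hello world!')
matches = matcher(doc)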
def remove(self, key):
"""Remove a rule from the matcher. A KeyError is raised if the key does
not exist.
key (unicode): The ID of the match rule.
"""
token_specs = list(token_specs)
if len(token_specs) == 0:
msg = ("Cannot add pattern for zero tokens to matcher.\n"
"entity_key: {entity_key}\n"
"label: {label}")
raise ValueError(msg.format(entity_key=entity_key, label=label))
entity_key = self.normalize_entity_key(entity_key)
if not self.has_entity(entity_key):
self.add_entity(entity_key)
if isinstance(label, basestring):
label = self.vocab.strings[label]
elif label is None:
label = 0
spec = _convert_strings(token_specs, self.vocab.strings)
key = self._normalize_key(key)
self._patterns.pop(key)
self._callbacks.pop(key)
cdef int i = 0
while i < self.patterns.size():
pattern_key = get_pattern_key(self.patterns.at(i))
if pattern_key == key:
self.patterns.erase(self.patterns.begin()+i)
else:
i += 1
self.patterns.push_back(init_pattern(self.mem, entity_key, label, spec))
self._patterns[entity_key].append((label, token_specs))
def has_key(self, key):
"""Check whether the matcher has a rule with a given key.
def add(self, entity_key, label, attrs, specs, acceptor=None, on_match=None):
self.add_entity(entity_key, attrs=attrs, if_exists='update',
acceptor=acceptor, on_match=on_match)
for spec in specs:
self.add_pattern(entity_key, spec, label=label)
def normalize_entity_key(self, entity_key):
if isinstance(entity_key, basestring):
return self.vocab.strings[entity_key]
else:
return entity_key
def has_entity(self, entity_key):
key (string or int): The key to check.
RETURNS (bool): Whether the matcher has the rule.
"""
Check whether the matcher has an entity.
key = self._normalize_key(key)
return key in self._patterns
Arguments:
entity_key (string or int): The entity key to check.
Returns:
bool: Whether the matcher has the entity.
"""
entity_key = self.normalize_entity_key(entity_key)
return entity_key in self._entities
def get(self, key, default=None):
"""Retrieve the pattern stored for a key.
def get_entity(self, entity_key):
key (unicode or int): The key to retrieve.
RETURNS (tuple): The rule, as an (on_match, patterns) tuple.
"""
Retrieve the attributes stored for an entity.
key = self._normalize_key(key)
if key not in self._patterns:
return default
return (self._callbacks[key], self._patterns[key])
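# Small follow-up sketch for the lookup helpers above, reusing the matcher
# from the earlier add() example.
assert 'HELLO_WORLD' in matcher
assert matcher.has_key('HELLO_WORLD')
on_match_callback, patterns = matcher.get('HELLO_WORLD')
assert len(matcher) == 1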
Arguments:
entity_key (unicode or int): The entity to retrieve.
Returns:
The entity attributes if present, otherwise None.
"""
entity_key = self.normalize_entity_key(entity_key)
if entity_key in self._entities:
return self._entities[entity_key]
else:
return None
def pipe(self, docs, batch_size=1000, n_threads=2):
"""Match a stream of documents, yielding them in turn.
def __call__(self, Doc doc, acceptor=None):
docs (iterable): A stream of documents.
batch_size (int): The number of documents to accumulate into a working set.
n_threads (int): The number of threads with which to work on the buffer
in parallel, if the `Matcher` implementation supports multi-threading.
YIELDS (Doc): Documents, in order.
"""
Find all token sequences matching the supplied patterns on the Doc.
for doc in docs:
self(doc)
yield doc
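# Streaming sketch for pipe() above; `texts` is an assumed list of unicode
# strings, and `nlp`/`matcher` come from the earlier example.
texts = [u'Hello world!', u'hello there, world']
for doc in matcher.pipe(nlp.pipe(texts), batch_size=50):
    pass  # each doc has already been run through the matcher's callbacks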
Arguments:
doc (Doc):
The document to match over.
Returns:
list
A list of (entity_key, label_id, start, end) tuples,
describing the matches. A match tuple describes a span doc[start:end].
The label_id and entity_key are both integers.
def __call__(self, Doc doc):
"""Find all token sequences matching the supplied patterns on the `Doc`.
doc (Doc): The document to match over.
RETURNS (list): A list of `(key, label_id, start, end)` tuples,
describing the matches. A match tuple describes a span
`doc[start:end]`. The `label_id` and `key` are both integers.
"""
if acceptor is not None:
raise ValueError(
"acceptor keyword argument to Matcher deprecated. Specify acceptor "
"functions when you add patterns instead.")
cdef vector[StateC] partials
cdef int n_partials = 0
cdef int q = 0
@ -388,13 +336,7 @@ cdef class Matcher:
end = token_i+1
ent_id = state.second[1].attrs[0].value
label = state.second[1].attrs[1].value
acceptor = self._acceptors.get(ent_id)
if acceptor is None:
matches.append((ent_id, label, start, end))
else:
match = acceptor(doc, ent_id, label, start, end)
if match:
matches.append(match)
matches.append((ent_id, start, end))
partials.resize(q)
# Check whether we open any new patterns on this token
for pattern in self.patterns:
@ -419,13 +361,7 @@ cdef class Matcher:
end = token_i+1
ent_id = pattern[1].attrs[0].value
label = pattern[1].attrs[1].value
acceptor = self._acceptors.get(ent_id)
if acceptor is None:
matches.append((ent_id, label, start, end))
else:
match = acceptor(doc, ent_id, label, start, end)
if match:
matches.append(match)
matches.append((ent_id, start, end))
# Look for open patterns that are actually satisfied
for state in partials:
while state.second.quantifier in (ZERO, ZERO_PLUS):
@ -435,36 +371,19 @@ cdef class Matcher:
end = len(doc)
ent_id = state.second.attrs[0].value
label = state.second.attrs[0].value
acceptor = self._acceptors.get(ent_id)
if acceptor is None:
matches.append((ent_id, label, start, end))
else:
match = acceptor(doc, ent_id, label, start, end)
if match:
matches.append(match)
for i, (ent_id, label, start, end) in enumerate(matches):
matches.append((ent_id, start, end))
for i, (ent_id, start, end) in enumerate(matches):
on_match = self._callbacks.get(ent_id)
if on_match is not None:
on_match(self, doc, i, matches)
# TODO: only return (match_id, start, end)
return matches
def pipe(self, docs, batch_size=1000, n_threads=2):
"""
Match a stream of documents, yielding them in turn.
Arguments:
docs: A stream of documents.
batch_size (int):
The number of documents to accumulate into a working set.
n_threads (int):
The number of threads with which to work on the buffer in parallel,
if the Matcher implementation supports multi-threading.
Yields:
Doc Documents, in order.
"""
for doc in docs:
self(doc)
yield doc
def _normalize_key(self, key):
if isinstance(key, basestring):
return self.vocab.strings.add(key)
else:
return key
def get_bilou(length):
@ -550,7 +469,7 @@ cdef class PhraseMatcher:
self(doc)
yield doc
def accept_match(self, Doc doc, int ent_id, int label, int start, int end):
def accept_match(self, Doc doc, attr_t ent_id, attr_t label, int start, int end):
assert (end - start) < self.max_length
cdef int i, j
for i in range(self.max_length):

View File

@ -30,6 +30,7 @@ cdef class Morphology:
cdef public object n_tags
cdef public object reverse_index
cdef public object tag_names
cdef public object exc
cdef RichTagC* rich_tags
cdef PreshMapArray _cache

View File

@ -33,36 +33,43 @@ def _normalize_props(props):
cdef class Morphology:
def __init__(self, StringStore string_store, tag_map, lemmatizer):
def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
self.mem = Pool()
self.strings = string_store
self.tag_map = {}
self.lemmatizer = lemmatizer
self.n_tags = len(tag_map) + 1
self.n_tags = len(tag_map)
self.tag_names = tuple(sorted(tag_map.keys()))
self.reverse_index = {}
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
attrs = _normalize_props(attrs)
self.tag_map[tag_str] = dict(attrs)
attrs = _normalize_props(attrs)
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
self.rich_tags[i].id = i
self.rich_tags[i].name = self.strings[tag_str]
self.rich_tags[i].name = self.strings.add(tag_str)
self.rich_tags[i].morph = 0
self.rich_tags[i].pos = attrs[POS]
self.reverse_index[self.rich_tags[i].name] = i
self._cache = PreshMapArray(self.n_tags)
self.exc = {}
if exc is not None:
for (tag_str, orth_str), attrs in exc.items():
self.add_special_case(tag_str, orth_str, attrs)
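# Hedged illustration of the new `exc` argument wired up above: exceptions
# are keyed by (tag, orth) pairs and applied via add_special_case(). The
# attribute values ({'lemma': ...}) are an assumption for illustration,
# and `nlp` is an assumed loaded English pipeline.
from spacy.morphology import Morphology

tag_exc = {('VBZ', u'goes'): {'lemma': u'go'}}
vocab = nlp.vocab
morphology = Morphology(vocab.strings, vocab.morphology.tag_map,
                        vocab.morphology.lemmatizer, exc=tag_exc)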
def __reduce__(self):
return (Morphology, (self.strings, self.tag_map, self.lemmatizer), None, None)
return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
self.exc), None, None)
cdef int assign_tag(self, TokenC* token, tag) except -1:
if isinstance(tag, basestring):
tag_id = self.reverse_index[self.strings[tag]]
else:
tag = self.strings.add(tag)
if tag in self.reverse_index:
tag_id = self.reverse_index[tag]
self.assign_tag_id(token, tag_id)
self.assign_tag_id(token, tag_id)
else:
token.tag = tag
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
if tag_id >= self.n_tags:
@ -73,7 +80,7 @@ cdef class Morphology:
# the statistical model fails.
# Related to Issue #220
if Lexeme.c_check_flag(token.lex, IS_SPACE):
tag_id = self.reverse_index[self.strings['SP']]
tag_id = self.reverse_index[self.strings.add('SP')]
rich_tag = self.rich_tags[tag_id]
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
if analysis is NULL:
@ -104,7 +111,8 @@ cdef class Morphology:
tag (unicode): The part-of-speech tag to key the exception.
orth (unicode): The word-form to key the exception.
"""
tag = self.strings[tag_str]
self.exc[(tag_str, orth_str)] = dict(attrs)
tag = self.strings.add(tag_str)
tag_id = self.reverse_index[tag]
orth = self.strings[orth_str]
cdef RichTagC rich_tag = self.rich_tags[tag_id]
@ -140,14 +148,14 @@ cdef class Morphology:
def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
cdef unicode py_string = self.strings[orth]
if self.lemmatizer is None:
return self.strings[py_string.lower()]
return self.strings.add(py_string.lower())
if univ_pos not in (NOUN, VERB, ADJ, PUNCT):
return self.strings[py_string.lower()]
return self.strings.add(py_string.lower())
cdef set lemma_strings
cdef unicode lemma_string
lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
lemma_string = sorted(lemma_strings)[0]
lemma = self.strings[lemma_string]
lemma = self.strings.add(lemma_string)
return lemma

View File

@ -9,12 +9,18 @@ import numpy
cimport numpy as np
import cytoolz
import util
from collections import OrderedDict
import ujson
import msgpack
from thinc.api import add, layerize, chain, clone, concatenate
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
from thinc.neural import Model, Maxout, Softmax, Affine
from thinc.neural._classes.hash_embed import HashEmbed
from thinc.neural.util import to_categorical
from thinc.neural.pooling import Pooling, max_pool, mean_pool
from thinc.neural._classes.difference import Siamese, CauchySimilarity
from thinc.neural._classes.convolution import ExtractWindow
from thinc.neural._classes.resnet import Residual
from thinc.neural._classes.batchnorm import BatchNorm as BN
@ -31,110 +37,243 @@ from .syntax.stateclass cimport StateClass
from .gold cimport GoldParse
from .morphology cimport Morphology
from .vocab cimport Vocab
from .syntax import nonproj
from .compat import json_dumps
from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
from ._ml import Tok2Vec, flatten, get_col, doc2feats
from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats
from ._ml import build_text_classifier, build_tagger_model
from .parts_of_speech import X
class TokenVectorEncoder(object):
'''Assign position-sensitive vectors to tokens, using a CNN or RNN.'''
name = 'tok2vec'
class BaseThincComponent(object):
name = None
@classmethod
def Model(cls, width=128, embed_size=5000, **cfg):
width = util.env_opt('token_vector_width', width)
embed_size = util.env_opt('embed_size', embed_size)
return Tok2Vec(width, embed_size, preprocess=None)
def Model(cls, *shape, **kwargs):
raise NotImplementedError
def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab
self.doc2feats = doc2feats()
self.model = model
raise NotImplementedError
def __call__(self, docs, state=None):
if isinstance(docs, Doc):
docs = [docs]
tokvecs = self.predict(docs)
self.set_annotations(docs, tokvecs)
state = {} if state is None else state
state['tokvecs'] = tokvecs
return state
def __call__(self, doc):
scores = self.predict([doc])
self.set_annotations([doc], scores)
return doc
def pipe(self, stream, batch_size=128, n_threads=-1):
for batch in cytoolz.partition_all(batch_size, stream):
docs, states = zip(*batch)
tokvecs = self.predict(docs)
self.set_annotations(docs, tokvecs)
for state in states:
state['tokvecs'] = tokvecs
yield from zip(docs, states)
for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs)
scores = self.predict(docs)
self.set_annotations(docs, scores)
yield from docs
def predict(self, docs):
feats = self.doc2feats(docs)
tokvecs = self.model(feats)
return tokvecs
raise NotImplementedError
def set_annotations(self, docs, tokvecs):
start = 0
for doc in docs:
doc.tensor = tokvecs[start : start + len(doc)]
start += len(doc)
def set_annotations(self, docs, scores):
raise NotImplementedError
def update(self, docs, golds, state=None,
drop=0., sgd=None):
if isinstance(docs, Doc):
docs = [docs]
golds = [golds]
state = {} if state is None else state
feats = self.doc2feats(docs)
tokvecs, bp_tokvecs = self.model.begin_update(feats, drop=drop)
state['feats'] = feats
state['tokvecs'] = tokvecs
state['bp_tokvecs'] = bp_tokvecs
return state
def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None):
raise NotImplementedError
def get_loss(self, docs, golds, scores):
raise NotImplementedError
def begin_training(self, gold_tuples, pipeline=None):
self.doc2feats = doc2feats()
def begin_training(self, gold_tuples=tuple(), pipeline=None):
token_vector_width = pipeline[0].model.nO
if self.model is True:
self.model = self.Model()
self.model = self.Model(1, token_vector_width)
def use_params(self, params):
with self.model.use_params(params):
yield
def to_bytes(self, **exclude):
serialize = OrderedDict((
('model', lambda: self.model.to_bytes()),
('vocab', lambda: self.vocab.to_bytes())
))
return util.to_bytes(serialize, exclude)
class NeuralTagger(object):
name = 'nn_tagger'
def __init__(self, vocab, model=True):
def from_bytes(self, bytes_data, **exclude):
if self.model is True:
self.model = self.Model()
deserialize = OrderedDict((
('model', lambda b: self.model.from_bytes(b)),
('vocab', lambda b: self.vocab.from_bytes(b))
))
util.from_bytes(bytes_data, deserialize, exclude)
return self
def to_disk(self, path, **exclude):
serialize = OrderedDict((
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
('vocab', lambda p: self.vocab.to_disk(p)),
('cfg', lambda p: p.open('w').write(json_dumps(self.cfg)))
))
util.to_disk(path, serialize, exclude)
def from_disk(self, path, **exclude):
if self.model is True:
self.model = self.Model()
deserialize = OrderedDict((
('model', lambda p: self.model.from_bytes(p.open('rb').read())),
('vocab', lambda p: self.vocab.from_disk(p)),
('cfg', lambda p: self.cfg.update(_load_cfg(p)))
))
util.from_disk(path, deserialize, exclude)
return self
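# Hedged sketch of the shared (de)serialisation API above: a component
# deriving from BaseThincComponent (e.g. the TokenVectorEncoder defined
# further down) can be round-tripped through to_bytes()/from_bytes() once
# its model is allocated. `nlp` is an assumed loaded pipeline.
tensorizer = TokenVectorEncoder(nlp.vocab)
tensorizer.model = tensorizer.Model()
data = tensorizer.to_bytes()
tensorizer2 = TokenVectorEncoder(nlp.vocab).from_bytes(data)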
def _load_cfg(path):
if path.exists():
return ujson.load(path.open())
else:
return {}
class TokenVectorEncoder(BaseThincComponent):
"""Assign position-sensitive vectors to tokens, using a CNN or RNN."""
name = 'tensorizer'
@classmethod
def Model(cls, width=128, embed_size=7500, **cfg):
"""Create a new statistical model for the class.
width (int): Output size of the model.
embed_size (int): Number of vectors in the embedding table.
**cfg: Config parameters.
RETURNS (Model): A `thinc.neural.Model` or similar instance.
"""
width = util.env_opt('token_vector_width', width)
embed_size = util.env_opt('embed_size', embed_size)
return Tok2Vec(width, embed_size, preprocess=None)
def __init__(self, vocab, model=True, **cfg):
"""Construct a new statistical model. Weights are not allocated on
initialisation.
vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab`
instance with the `Doc` objects it will process.
model (Model): A `Model` instance or `True` to allocate one later.
**cfg: Config parameters.
EXAMPLE:
>>> from spacy.pipeline import TokenVectorEncoder
>>> tok2vec = TokenVectorEncoder(nlp.vocab)
>>> tok2vec.model = tok2vec.Model(128, 5000)
"""
self.vocab = vocab
self.doc2feats = doc2feats()
self.model = model
self.cfg = dict(cfg)
def __call__(self, doc, state=None):
assert state is not None
assert 'tokvecs' in state
tokvecs = state['tokvecs']
tags = self.predict(tokvecs)
self.set_annotations([doc], tags)
return state
def __call__(self, doc):
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
model. Vectors are set to the `Doc.tensor` attribute.
doc (Doc): The `Doc` to add vectors to.
RETURNS (Doc): The processed `Doc`.
"""
tokvecses = self.predict([doc])
self.set_annotations([doc], tokvecses)
return doc
def pipe(self, stream, batch_size=128, n_threads=-1):
for batch in cytoolz.partition_all(batch_size, stream):
docs, states = zip(*batch)
tag_ids = self.predict(states[0]['tokvecs'])
self.set_annotations(docs, tag_ids)
for state in states:
state['tag_ids'] = tag_ids
yield from zip(docs, states)
"""Process `Doc` objects as a stream.
def predict(self, tokvecs):
scores = self.model(tokvecs)
stream (iterator): A sequence of `Doc` objects to process.
batch_size (int): Number of `Doc` objects to group.
n_threads (int): Number of threads.
YIELDS (iterator): A sequence of `Doc` objects, in order of input.
"""
for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs)
tokvecses = self.predict(docs)
self.set_annotations(docs, tokvecses)
yield from docs
def predict(self, docs):
"""Return a single tensor for a batch of documents.
docs (iterable): A sequence of `Doc` objects.
RETURNS (object): Vector representations for each token in the documents.
"""
feats = self.doc2feats(docs)
tokvecs = self.model(feats)
return tokvecs
def set_annotations(self, docs, tokvecses):
"""Set the tensor attribute for a batch of documents.
docs (iterable): A sequence of `Doc` objects.
tokvecses (object): Vector representations for each token in the documents.
"""
for doc, tokvecs in zip(docs, tokvecses):
assert tokvecs.shape[0] == len(doc)
doc.tensor = tokvecs
def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
"""Update the model.
docs (iterable): A batch of `Doc` objects.
golds (iterable): A batch of `GoldParse` objects.
drop (float): The dropout rate.
sgd (callable): An optimizer.
RETURNS (dict): Results from the update.
"""
if isinstance(docs, Doc):
docs = [docs]
feats = self.doc2feats(docs)
tokvecs, bp_tokvecs = self.model.begin_update(feats, drop=drop)
return tokvecs, bp_tokvecs
def get_loss(self, docs, golds, scores):
# TODO: implement
raise NotImplementedError
def begin_training(self, gold_tuples=tuple(), pipeline=None):
"""Allocate models, pre-process training data and acquire a trainer and
optimizer.
gold_tuples (iterable): Gold-standard training data.
pipeline (list): The pipeline the model is part of.
"""
self.doc2feats = doc2feats()
if self.model is True:
self.model = self.Model()
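# Hedged end-to-end sketch for the tensorizer above: allocate the default
# model via begin_training() and set Doc.tensor for one document. Assumes
# a loaded pipeline `nlp`; exact shapes depend on the model config.
tensorizer = TokenVectorEncoder(nlp.vocab)
tensorizer.begin_training()
doc = nlp.make_doc(u'This is a sentence.')
doc = tensorizer(doc)
assert doc.tensor.shape[0] == len(doc)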
class NeuralTagger(BaseThincComponent):
name = 'tagger'
def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab
self.model = model
self.cfg = dict(cfg)
def __call__(self, doc):
tags = self.predict(([doc], [doc.tensor]))
self.set_annotations([doc], tags)
return doc
def pipe(self, stream, batch_size=128, n_threads=-1):
for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs)
tokvecs = [d.tensor for d in docs]
tag_ids = self.predict((docs, tokvecs))
self.set_annotations(docs, tag_ids)
yield from docs
def predict(self, docs_tokvecs):
scores = self.model(docs_tokvecs)
scores = self.model.ops.flatten(scores)
guesses = scores.argmax(axis=1)
if not isinstance(guesses, numpy.ndarray):
guesses = guesses.get()
tokvecs = docs_tokvecs[1]
guesses = self.model.ops.unflatten(guesses,
[tv.shape[0] for tv in tokvecs])
return guesses
def set_annotations(self, docs, batch_tag_ids):
@ -142,49 +281,49 @@ class NeuralTagger(object):
docs = [docs]
cdef Doc doc
cdef int idx = 0
cdef int i, j, tag_id
cdef Vocab vocab = self.vocab
for i, doc in enumerate(docs):
doc_tag_ids = batch_tag_ids[idx:idx+len(doc)]
doc_tag_ids = batch_tag_ids[i]
for j, tag_id in enumerate(doc_tag_ids):
vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
# Don't clobber preset POS tags
if doc.c[j].tag == 0 and doc.c[j].pos == 0:
vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
idx += 1
doc.is_tagged = True
def update(self, docs, golds, state=None, drop=0., sgd=None):
state = {} if state is None else state
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
docs, tokvecs = docs_tokvecs
tokvecs = state['tokvecs']
bp_tokvecs = state['bp_tokvecs']
if self.model.nI is None:
self.model.nI = tokvecs.shape[1]
tag_scores, bp_tag_scores = self.model.begin_update(tokvecs, drop=drop)
self.model.nI = tokvecs[0].shape[1]
tag_scores, bp_tag_scores = self.model.begin_update(docs_tokvecs, drop=drop)
loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
bp_tokvecs(d_tokvecs, sgd=sgd)
state['tag_scores'] = tag_scores
state['tag_loss'] = loss
return state
return d_tokvecs
def get_loss(self, docs, golds, scores):
scores = self.model.ops.flatten(scores)
tag_index = {tag: i for i, tag in enumerate(self.vocab.morphology.tag_names)}
cdef int idx = 0
correct = numpy.zeros((scores.shape[0],), dtype='i')
guesses = scores.argmax(axis=1)
for gold in golds:
for tag in gold.tags:
correct[idx] = tag_index[tag]
if tag is None:
correct[idx] = guesses[idx]
else:
correct[idx] = tag_index[tag]
idx += 1
correct = self.model.ops.xp.array(correct, dtype='i')
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
d_scores /= d_scores.shape[0]
loss = (d_scores**2).sum()
d_scores = self.model.ops.asarray(d_scores, dtype='f')
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
return float(loss), d_scores
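# Illustrative numpy sketch (an assumption, not this file's code) of the
# gradient computed in get_loss() above: predicted scores minus a one-hot
# encoding of the gold tag indices, scaled by the batch size.
import numpy

def one_hot(indices, n_classes):
    out = numpy.zeros((len(indices), n_classes), dtype='f')
    for i, idx in enumerate(indices):
        out[i, idx] = 1.0
    return out

scores = numpy.asarray([[0.7, 0.2, 0.1]], dtype='f')   # one token, three tags
d_scores = scores - one_hot([0], scores.shape[1])      # gold tag index 0
d_scores /= d_scores.shape[0]
loss = (d_scores ** 2).sum()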
def begin_training(self, gold_tuples, pipeline=None):
def begin_training(self, gold_tuples=tuple(), pipeline=None):
orig_tag_map = dict(self.vocab.morphology.tag_map)
new_tag_map = {}
for raw_text, annots_brackets in gold_tuples:
@ -195,22 +334,277 @@ class NeuralTagger(object):
new_tag_map[tag] = orig_tag_map[tag]
else:
new_tag_map[tag] = {POS: X}
if 'SP' not in new_tag_map:
new_tag_map['SP'] = orig_tag_map.get('SP', {POS: X})
cdef Vocab vocab = self.vocab
vocab.morphology = Morphology(vocab.strings, new_tag_map,
vocab.morphology.lemmatizer)
self.model = Softmax(self.vocab.morphology.n_tags)
print("Tagging", self.model.nO, "tags")
if new_tag_map:
vocab.morphology = Morphology(vocab.strings, new_tag_map,
vocab.morphology.lemmatizer,
exc=vocab.morphology.exc)
token_vector_width = pipeline[0].model.nO
if self.model is True:
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
@classmethod
def Model(cls, n_tags, token_vector_width):
return build_tagger_model(n_tags, token_vector_width)
def use_params(self, params):
with self.model.use_params(params):
yield
def to_bytes(self, **exclude):
serialize = OrderedDict((
('model', lambda: self.model.to_bytes()),
('vocab', lambda: self.vocab.to_bytes()),
('tag_map', lambda: msgpack.dumps(self.vocab.morphology.tag_map,
use_bin_type=True,
encoding='utf8'))
))
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, **exclude):
def load_model(b):
if self.model is True:
token_vector_width = util.env_opt('token_vector_width', 128)
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
self.model.from_bytes(b)
def load_tag_map(b):
tag_map = msgpack.loads(b, encoding='utf8')
self.vocab.morphology = Morphology(
self.vocab.strings, tag_map=tag_map,
lemmatizer=self.vocab.morphology.lemmatizer,
exc=self.vocab.morphology.exc)
deserialize = OrderedDict((
('vocab', lambda b: self.vocab.from_bytes(b)),
('tag_map', load_tag_map),
('model', lambda b: load_model(b)),
))
util.from_bytes(bytes_data, deserialize, exclude)
return self
def to_disk(self, path, **exclude):
serialize = OrderedDict((
('vocab', lambda p: self.vocab.to_disk(p)),
('tag_map', lambda p: p.open('wb').write(msgpack.dumps(
self.vocab.morphology.tag_map,
use_bin_type=True,
encoding='utf8'))),
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
('cfg', lambda p: p.open('w').write(json_dumps(self.cfg)))
))
util.to_disk(path, serialize, exclude)
def from_disk(self, path, **exclude):
def load_model(p):
if self.model is True:
token_vector_width = util.env_opt('token_vector_width', 128)
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
self.model.from_bytes(p.open('rb').read())
def load_tag_map(p):
with p.open('rb') as file_:
tag_map = msgpack.loads(file_.read(), encoding='utf8')
self.vocab.morphology = Morphology(
self.vocab.strings, tag_map=tag_map,
lemmatizer=self.vocab.morphology.lemmatizer,
exc=self.vocab.morphology.exc)
deserialize = OrderedDict((
('vocab', lambda p: self.vocab.from_disk(p)),
('tag_map', load_tag_map),
('model', load_model),
('cfg', lambda p: self.cfg.update(_load_cfg(p)))
))
util.from_disk(path, deserialize, exclude)
return self
class NeuralLabeller(NeuralTagger):
name = 'nn_labeller'
def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab
self.model = model
self.cfg = dict(cfg)
@property
def labels(self):
return self.cfg.setdefault('labels', {})
@labels.setter
def labels(self, value):
self.cfg['labels'] = value
def set_annotations(self, docs, dep_ids):
pass
def begin_training(self, gold_tuples=tuple(), pipeline=None):
gold_tuples = nonproj.preprocess_training_data(gold_tuples)
for raw_text, annots_brackets in gold_tuples:
for annots, brackets in annots_brackets:
ids, words, tags, heads, deps, ents = annots
for dep in deps:
if dep not in self.labels:
self.labels[dep] = len(self.labels)
token_vector_width = pipeline[0].model.nO
if self.model is True:
self.model = self.Model(len(self.labels), token_vector_width)
@classmethod
def Model(cls, n_tags, token_vector_width):
return build_tagger_model(n_tags, token_vector_width)
def get_loss(self, docs, golds, scores):
scores = self.model.ops.flatten(scores)
cdef int idx = 0
correct = numpy.zeros((scores.shape[0],), dtype='i')
guesses = scores.argmax(axis=1)
for gold in golds:
for tag in gold.labels:
if tag is None or tag not in self.labels:
correct[idx] = guesses[idx]
else:
correct[idx] = self.labels[tag]
idx += 1
correct = self.model.ops.xp.array(correct, dtype='i')
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
d_scores /= d_scores.shape[0]
loss = (d_scores**2).sum()
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
return float(loss), d_scores
class SimilarityHook(BaseThincComponent):
"""
Experimental
A pipeline component to install a hook for supervised similarity into
Doc objects. Requires a Tensorizer to pre-process documents. The similarity
model can be any object obeying the Thinc Model interface. By default,
the model concatenates the elementwise mean and elementwise max of the two
tensors, and compares them using the Cauchy-like similarity function
from Chen (2013):
similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())
Where W is a vector of dimension weights, initialized to 1.
"""
name = 'similarity'
def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab
self.model = model
self.cfg = dict(cfg)
@classmethod
def Model(cls, length):
return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length))
def __call__(self, doc):
'''Install similarity hook'''
doc.user_hooks['similarity'] = self.predict
return doc
def pipe(self, docs, **kwargs):
for doc in docs:
yield self(doc)
def predict(self, doc1, doc2):
return self.model.predict([(doc1.tensor, doc2.tensor)])
def update(self, doc1_tensor1_doc2_tensor2, golds, sgd=None, drop=0.):
doc1s, tensor1s, doc2s, tensor2s = doc1_tensor1_doc2_tensor2
sims, bp_sims = self.model.begin_update(zip(tensor1s, tensor2s),
drop=drop)
d_tensor1s, d_tensor2s = bp_sims(golds, sgd=sgd)
return d_tensor1s, d_tensor2s
def begin_training(self, _=tuple(), pipeline=None):
"""
Allocate model, using width from tensorizer in pipeline.
gold_tuples (iterable): Gold-standard training data.
pipeline (list): The pipeline the model is part of.
"""
if self.model is True:
self.model = self.Model(pipeline[0].model.nO)
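# Illustrative numpy sketch of the Cauchy-like similarity quoted in the
# SimilarityHook docstring above; `W` is the learned weight vector, here
# simply initialised to ones as the docstring describes.
import numpy

def cauchy_similarity(vec1, vec2, W=None):
    if W is None:
        W = numpy.ones(vec1.shape[0], dtype='f')
    return 1. / (1. + (W * (vec1 - vec2) ** 2).sum())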
class TextCategorizer(BaseThincComponent):
name = 'textcat'
@classmethod
def Model(cls, nr_class=1, width=64, **cfg):
return build_text_classifier(nr_class, width, **cfg)
def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab
self.model = model
self.cfg = dict(cfg)
@property
def labels(self):
return self.cfg.get('labels', ['LABEL'])
@labels.setter
def labels(self, value):
self.cfg['labels'] = value
def __call__(self, doc):
scores = self.predict([doc])
self.set_annotations([doc], scores)
return doc
def pipe(self, stream, batch_size=128, n_threads=-1):
for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs)
scores = self.predict(docs)
self.set_annotations(docs, scores)
yield from docs
def predict(self, docs):
scores = self.model(docs)
scores = self.model.ops.asarray(scores)
return scores
def set_annotations(self, docs, scores):
for i, doc in enumerate(docs):
for j, label in enumerate(self.labels):
doc.cats[label] = float(scores[i, j])
def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None):
docs, tensors = docs_tensors
scores, bp_scores = self.model.begin_update(docs, drop=drop)
loss, d_scores = self.get_loss(docs, golds, scores)
d_tensors = bp_scores(d_scores, sgd=sgd)
if losses is not None:
losses.setdefault(self.name, 0.0)
losses[self.name] += loss
return d_tensors
def get_loss(self, docs, golds, scores):
truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
for i, gold in enumerate(golds):
for j, label in enumerate(self.labels):
truths[i, j] = label in gold.cats
truths = self.model.ops.asarray(truths)
d_scores = (scores-truths) / scores.shape[0]
mean_square_error = ((scores-truths)**2).sum(axis=1).mean()
return mean_square_error, d_scores
def begin_training(self, gold_tuples=tuple(), pipeline=None):
if pipeline:
token_vector_width = pipeline[0].model.nO
else:
token_vector_width = 64
if self.model is True:
self.model = self.Model(len(self.labels), token_vector_width)
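# Illustrative numpy sketch of the multi-label loss in get_loss() above:
# scores are compared against 0/1 indicators per label, using the mean of
# the per-document squared-error sums.
import numpy

scores = numpy.asarray([[0.8, 0.1]], dtype='f')   # one doc, two labels
truths = numpy.asarray([[1.0, 0.0]], dtype='f')   # gold: first label applies
d_scores = (scores - truths) / scores.shape[0]
mean_square_error = ((scores - truths) ** 2).sum(axis=1).mean()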
cdef class EntityRecognizer(LinearParser):
"""
Annotate named entities on Doc objects.
"""
"""Annotate named entities on Doc objects."""
TransitionSystem = BiluoPushDown
feature_templates = get_feature_templates('ner')
@ -222,9 +616,7 @@ cdef class EntityRecognizer(LinearParser):
cdef class BeamEntityRecognizer(BeamParser):
"""
Annotate named entities on Doc objects.
"""
"""Annotate named entities on Doc objects."""
TransitionSystem = BiluoPushDown
feature_templates = get_feature_templates('ner')
@ -249,32 +641,26 @@ cdef class NeuralDependencyParser(NeuralParser):
name = 'parser'
TransitionSystem = ArcEager
def __reduce__(self):
return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None)
cdef class NeuralEntityRecognizer(NeuralParser):
name = 'entity'
name = 'ner'
TransitionSystem = BiluoPushDown
nr_feature = 6
def get_token_ids(self, states):
cdef StateClass state
cdef int n_tokens = 6
ids = numpy.zeros((len(states), n_tokens), dtype='i', order='c')
for i, state in enumerate(states):
ids[i, 0] = state.c.B(0)-1
ids[i, 1] = state.c.B(0)
ids[i, 2] = state.c.B(1)
ids[i, 3] = state.c.E(0)
ids[i, 4] = state.c.E(0)-1
ids[i, 5] = state.c.E(0)+1
for j in range(6):
if ids[i, j] >= state.c.length:
ids[i, j] = -1
if ids[i, j] != -1:
ids[i, j] += state.c.offset
return ids
def predict_confidences(self, docs):
tensors = [d.tensor for d in docs]
samples = []
for i in range(10):
states = self.parse_batch(docs, tensors, drop=0.3)
for state in states:
samples.append(self._get_entities(state))
def __reduce__(self):
return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None)
cdef class BeamDependencyParser(BeamParser):

View File

@ -1,4 +1,5 @@
from libc.stdint cimport int64_t
from libcpp.vector cimport vector
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
@ -8,6 +9,9 @@ from .typedefs cimport attr_t, hash_t
cpdef hash_t hash_string(unicode string) except 0
cdef hash_t hash_utf8(char* utf8_string, int length) nogil
cdef unicode decode_Utf8Str(const Utf8Str* string)
ctypedef union Utf8Str:
@ -17,13 +21,11 @@ ctypedef union Utf8Str:
cdef class StringStore:
cdef Pool mem
cdef Utf8Str* c
cdef int64_t size
cdef bint is_frozen
cdef vector[hash_t] keys
cdef public PreshMap _map
cdef public PreshMap _oov
cdef int64_t _resize_at
cdef const Utf8Str* intern_unicode(self, unicode py_string)
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)

View File

@ -7,11 +7,16 @@ from libc.string cimport memcpy
from libc.stdint cimport uint64_t, uint32_t
from murmurhash.mrmr cimport hash64, hash32
from preshed.maps cimport map_iter, key_t
from libc.stdint cimport uint32_t
import ujson
import dill
from .symbols import IDS as SYMBOLS_BY_STR
from .symbols import NAMES as SYMBOLS_BY_INT
from .typedefs cimport hash_t
from libc.stdint cimport uint32_t
import ujson
from . import util
from .compat import json_dumps
cpdef hash_t hash_string(unicode string) except 0:
@ -27,7 +32,7 @@ cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil:
return hash32(utf8_string, length, 1)
cdef unicode _decode(const Utf8Str* string):
cdef unicode decode_Utf8Str(const Utf8Str* string):
cdef int i, length
if string.s[0] < sizeof(string.s) and string.s[0] != 0:
return string.s[1:string.s[0]+1].decode('utf8')
@ -44,10 +49,10 @@ cdef unicode _decode(const Utf8Str* string):
return string.p[i:length + i].decode('utf8')
cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *:
cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *:
cdef int n_length_bytes
cdef int i
cdef Utf8Str string
cdef Utf8Str* string = <Utf8Str*>mem.alloc(1, sizeof(Utf8Str))
cdef uint32_t ulength = length
if length < sizeof(string.s):
string.s[0] = <unsigned char>length
@ -72,129 +77,166 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) ex
cdef class StringStore:
"""
Map strings to and from integer IDs.
"""
"""Look up strings by 64-bit hashes."""
def __init__(self, strings=None, freeze=False):
"""
Create the StringStore.
"""Create the StringStore.
Arguments:
strings: A sequence of unicode strings to add to the store.
strings (iterable): A sequence of unicode strings to add to the store.
RETURNS (StringStore): The newly constructed object.
"""
self.mem = Pool()
self._map = PreshMap()
self._oov = PreshMap()
self._resize_at = 10000
self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
self.size = 1
self.is_frozen = freeze
if strings is not None:
for string in strings:
_ = self[string]
property size:
def __get__(self):
return self.size -1
def __reduce__(self):
# TODO: OOV words, for the is_frozen stuff?
if self.is_frozen:
raise NotImplementedError(
"Currently missing support for pickling StringStore when "
"is_frozen=True")
return (StringStore, (list(self),))
def __len__(self):
"""
The number of strings in the store.
Returns:
int The number of strings in the store.
"""
return self.size-1
self.add(string)
def __getitem__(self, object string_or_id):
"""
Retrieve a string from a given integer ID, or vice versa.
"""Retrieve a string from a given hash, or vice versa.
Arguments:
string_or_id (bytes or unicode or int):
The value to encode.
Returns:
unicode or int: The value to be retrieved.
string_or_id (bytes, unicode or uint64): The value to encode.
RETURNS (unicode or uint64): The value to be retrieved.
"""
if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
return 0
elif string_or_id == 0:
return u''
elif string_or_id in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string_or_id]
cdef bytes byte_string
cdef const Utf8Str* utf8str
cdef uint64_t int_id
cdef uint32_t oov_id
if isinstance(string_or_id, (int, long)):
int_id = string_or_id
oov_id = string_or_id
if int_id < <uint64_t>self.size:
return _decode(&self.c[int_id])
else:
utf8str = <Utf8Str*>self._oov.get(oov_id)
if utf8str is not NULL:
return _decode(utf8str)
else:
raise IndexError(string_or_id)
cdef hash_t key
if isinstance(string_or_id, unicode):
key = hash_string(string_or_id)
return key
elif isinstance(string_or_id, bytes):
key = hash_utf8(string_or_id, len(string_or_id))
return key
elif string_or_id < len(SYMBOLS_BY_INT):
return SYMBOLS_BY_INT[string_or_id]
else:
if isinstance(string_or_id, bytes):
byte_string = <bytes>string_or_id
elif isinstance(string_or_id, unicode):
byte_string = (<unicode>string_or_id).encode('utf8')
else:
raise TypeError(type(string_or_id))
utf8str = self._intern_utf8(byte_string, len(byte_string))
key = string_or_id
utf8str = <Utf8Str*>self._map.get(key)
if utf8str is NULL:
# TODO: We need to use 32 bit here, for compatibility with the
# vocabulary values. This makes birthday paradox probabilities
# pretty bad.
# We could also get unlucky here, and hash into a value that
# collides with the 'real' strings.
return hash32_utf8(byte_string, len(byte_string))
raise KeyError(string_or_id)
else:
return utf8str - self.c
return decode_Utf8Str(utf8str)
def __contains__(self, unicode string not None):
"""
Check whether a string is in the store.
def add(self, string):
"""Add a string to the StringStore.
Arguments:
string (unicode): The string to check.
Returns bool:
Whether the store contains the string.
string (unicode): The string to add.
RETURNS (uint64): The string's hash value.
"""
if len(string) == 0:
if isinstance(string, unicode):
if string in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string]
key = hash_string(string)
self.intern_unicode(string)
elif isinstance(string, bytes):
if string in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string]
key = hash_utf8(string, len(string))
self._intern_utf8(string, len(string))
else:
raise TypeError(
"Can only add unicode or bytes. Got type: %s" % type(string))
return key
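# Usage sketch for the hash-based StringStore API above: add() returns a
# 64-bit hash, and lookups map both ways between strings and hashes.
from spacy.strings import StringStore

stringstore = StringStore([u'apple', u'orange'])
pear_hash = stringstore.add(u'pear')          # uint64 hash of the string
assert stringstore[pear_hash] == u'pear'      # hash -> string
assert stringstore[u'pear'] == pear_hash      # string -> hash
assert u'pear' in stringstore
assert len(stringstore) == 3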
def __len__(self):
"""The number of strings in the store.
RETURNS (int): The number of strings in the store.
"""
return self.keys.size()
def __contains__(self, string not None):
"""Check whether a string is in the store.
string (unicode): The string to check.
RETURNS (bool): Whether the store contains the string.
"""
cdef hash_t key
if isinstance(string, int) or isinstance(string, long):
if string == 0:
return True
key = string
elif len(string) == 0:
return True
cdef hash_t key = hash_string(string)
return self._map.get(key) is not NULL
elif string in SYMBOLS_BY_STR:
return True
elif isinstance(string, unicode):
key = hash_string(string)
else:
string = string.encode('utf8')
key = hash_utf8(string, len(string))
if key < len(SYMBOLS_BY_INT):
return True
else:
return self._map.get(key) is not NULL
def __iter__(self):
"""
Iterate over the strings in the store, in order.
"""Iterate over the strings in the store, in order.
Yields: unicode A string in the store.
YIELDS (unicode): A string in the store.
"""
cdef int i
for i in range(self.size):
yield _decode(&self.c[i]) if i > 0 else u''
cdef hash_t key
for i in range(self.keys.size()):
key = self.keys[i]
utf8str = <Utf8Str*>self._map.get(key)
yield decode_Utf8Str(utf8str)
# TODO: Iterate OOV here?
def __reduce__(self):
strings = [""]
for i in range(1, self.size):
string = &self.c[i]
py_string = _decode(string)
strings.append(py_string)
strings = list(self)
return (StringStore, (strings,), None, None, None)
def to_disk(self, path):
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects.
"""
path = util.ensure_path(path)
strings = list(self)
with path.open('w') as file_:
file_.write(json_dumps(strings))
def from_disk(self, path):
"""Loads state from a directory. Modifies the object in place and
returns it.
path (unicode or Path): A path to a directory. Paths may be either
strings or `Path`-like objects.
RETURNS (StringStore): The modified `StringStore` object.
"""
path = util.ensure_path(path)
with path.open('r') as file_:
strings = ujson.load(file_)
self._reset_and_load(strings)
return self
def to_bytes(self, **exclude):
"""Serialize the current state to a binary string.
**exclude: Named attributes to prevent from being serialized.
RETURNS (bytes): The serialized form of the `StringStore` object.
"""
return ujson.dumps(list(self))
def from_bytes(self, bytes_data, **exclude):
"""Load state from a binary string.
bytes_data (bytes): The data to load from.
**exclude: Named attributes to prevent from being loaded.
RETURNS (StringStore): The `StringStore` object.
"""
strings = ujson.loads(bytes_data)
self._reset_and_load(strings)
return self
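# Follow-up sketch: round-tripping the store through the new to_bytes()/
# from_bytes() methods above (reusing `stringstore` from the earlier sketch).
data = stringstore.to_bytes()
stringstore2 = StringStore().from_bytes(data)
assert u'pear' in stringstore2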
def set_frozen(self, bint is_frozen):
# TODO
self.is_frozen = is_frozen
@ -202,6 +244,15 @@ cdef class StringStore:
def flush_oov(self):
self._oov = PreshMap()
def _reset_and_load(self, strings, freeze=False):
self.mem = Pool()
self._map = PreshMap()
self._oov = PreshMap()
self.keys.clear()
for string in strings:
self.add(string)
self.is_frozen = freeze
cdef const Utf8Str* intern_unicode(self, unicode py_string):
# 0 means missing, but we don't bother offsetting the index.
cdef bytes byte_string = py_string.encode('utf8')
@ -223,73 +274,11 @@ cdef class StringStore:
key32 = hash32_utf8(utf8_string, length)
# Important: Make the OOV store own the memory. That way it's trivial
# to flush them all.
value = <Utf8Str*>self._oov.mem.alloc(1, sizeof(Utf8Str))
value[0] = _allocate(self._oov.mem, <unsigned char*>utf8_string, length)
value = _allocate(self._oov.mem, <unsigned char*>utf8_string, length)
self._oov.set(key32, value)
return NULL
if self.size == self._resize_at:
self._realloc()
self.c[self.size] = _allocate(self.mem, <unsigned char*>utf8_string, length)
self._map.set(key, <void*>&self.c[self.size])
self.size += 1
return &self.c[self.size-1]
def dump(self, file_):
"""
Save the strings to a JSON file.
Arguments:
file_ (buffer): The file to save the strings.
Returns:
None
"""
string_data = ujson.dumps(list(self))
if not isinstance(string_data, unicode):
string_data = string_data.decode('utf8')
# TODO: OOV?
file_.write(string_data)
def load(self, file_):
"""
Load the strings from a JSON file.
Arguments:
file_ (buffer): The file from which to load the strings.
Returns:
None
"""
strings = ujson.load(file_)
if strings == ['']:
return None
cdef unicode string
for string in strings:
# explicit None/len check instead of simple truth testing
# (bug in Cython <= 0.23.4)
if string is not None and len(string):
self.intern_unicode(string)
def _realloc(self):
# We want to map straight to pointers, but they'll be invalidated if
# we resize our array. So, first we remap to indices, then we resize,
# then we can acquire the new pointers.
cdef Pool tmp_mem = Pool()
keys = <key_t*>tmp_mem.alloc(self.size, sizeof(key_t))
cdef key_t key
cdef void* value
cdef const Utf8Str ptr
cdef int i = 0
cdef size_t offset
while map_iter(self._map.c_map, &i, &key, &value):
# Find array index with pointer arithmetic
offset = ((<Utf8Str*>value) - self.c)
keys[offset] = key
self._resize_at *= 2
cdef size_t new_size = self._resize_at * sizeof(Utf8Str)
self.c = <Utf8Str*>self.mem.realloc(self.c, new_size)
self._map = PreshMap(self.size)
for i in range(self.size):
if keys[i]:
self._map.set(keys[i], &self.c[i])
value = _allocate(self.mem, <unsigned char*>utf8_string, length)
self._map.set(key, value)
self.keys.push_back(key)
return value

View File

@ -5,8 +5,6 @@ from .parts_of_speech cimport univ_pos_t
cdef struct LexemeC:
float* vector
flags_t flags
attr_t lang
@ -25,11 +23,10 @@ cdef struct LexemeC:
float prob
float sentiment
float l2_norm
cdef struct SerializedLexemeC:
unsigned char[4*13 + 8] data
unsigned char[8 + 8*10 + 4 + 4] data
# sizeof(flags_t) # flags
# + sizeof(attr_t) # lang
# + sizeof(attr_t) # id
@ -50,7 +47,7 @@ cdef struct Entity:
hash_t id
int start
int end
int label
attr_t label
cdef struct TokenC:
@ -58,12 +55,12 @@ cdef struct TokenC:
uint64_t morph
univ_pos_t pos
bint spacy
int tag
attr_t tag
int idx
int lemma
int sense
attr_t lemma
attr_t sense
int head
int dep
attr_t dep
bint sent_start
uint32_t l_kids
@ -72,5 +69,5 @@ cdef struct TokenC:
uint32_t r_edge
int ent_iob
int ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
hash_t ent_id

View File

@ -82,6 +82,7 @@ cpdef enum symbol_t:
ENT_IOB
ENT_TYPE
HEAD
SENT_START
SPACY
PROB

View File

@ -84,6 +84,7 @@ IDS = {
"ENT_IOB": ENT_IOB,
"ENT_TYPE": ENT_TYPE,
"HEAD": HEAD,
"SENT_START": SENT_START,
"SPACY": SPACY,
"PROB": PROB,

View File

@ -9,6 +9,7 @@ from ..structs cimport TokenC, Entity
from ..lexeme cimport Lexeme
from ..symbols cimport punct
from ..attrs cimport IS_SPACE
from ..typedefs cimport attr_t
cdef inline bint is_space_token(const TokenC* token) nogil:
@ -71,6 +72,45 @@ cdef cppclass StateC:
free(this._stack - PADDING)
free(this.shifted - PADDING)
void set_context_tokens(int* ids, int n) nogil:
if n == 13:
ids[0] = this.B(0)
ids[1] = this.B(1)
ids[2] = this.S(0)
ids[3] = this.S(1)
ids[4] = this.S(2)
ids[5] = this.L(this.S(0), 1)
ids[6] = this.L(this.S(0), 2)
ids[6] = this.R(this.S(0), 1)
ids[7] = this.L(this.B(0), 1)
ids[8] = this.R(this.S(0), 2)
ids[9] = this.L(this.S(1), 1)
ids[10] = this.L(this.S(1), 2)
ids[11] = this.R(this.S(1), 1)
ids[12] = this.R(this.S(1), 2)
elif n == 6:
if this.B(0) >= 0:
ids[0] = this.B(0)
else:
ids[0] = -1
ids[1] = this.B(0)
ids[2] = this.B(1)
ids[3] = this.E(0)
if ids[3] >= 1:
ids[4] = this.E(0)-1
else:
ids[4] = -1
if (ids[3]+1) < this.length:
ids[5] = this.E(0)+1
else:
ids[5] = -1
else:
# TODO error =/
pass
for i in range(n):
if ids[i] >= 0:
ids[i] += this.offset
int S(int i) nogil const:
if i >= this._s_i:
return -1
@ -238,7 +278,7 @@ cdef cppclass StateC:
this._s_i -= 1
this.shifted[this.B(0)] = True
void add_arc(int head, int child, int label) nogil:
void add_arc(int head, int child, attr_t label) nogil:
if this.has_head(child):
this.del_arc(this.H(child), child)
@ -282,7 +322,7 @@ cdef cppclass StateC:
h.l_edge = this.L_(h_i, 2).l_edge if h.l_kids >= 2 else h_i
h.l_kids -= 1
void open_ent(int label) nogil:
void open_ent(attr_t label) nogil:
this._ents[this._e_i].start = this.B(0)
this._ents[this._e_i].label = label
this._ents[this._e_i].end = -1
@ -294,7 +334,7 @@ cdef cppclass StateC:
this._ents[this._e_i-1].end = this.B(0)+1
this._sent[this.B(0)].ent_iob = 1
void set_ent_tag(int i, int ent_iob, int ent_type) nogil:
void set_ent_tag(int i, int ent_iob, attr_t ent_type) nogil:
if 0 <= i < this.length:
this._sent[i].ent_iob = ent_iob
this._sent[i].ent_type = ent_type
@ -305,16 +345,18 @@ cdef cppclass StateC:
this._break = this._b_i
void clone(const StateC* src) nogil:
this.length = src.length
memcpy(this._sent, src._sent, this.length * sizeof(TokenC))
memcpy(this._stack, src._stack, this.length * sizeof(int))
memcpy(this._buffer, src._buffer, this.length * sizeof(int))
memcpy(this._ents, src._ents, this.length * sizeof(Entity))
memcpy(this.shifted, src.shifted, this.length * sizeof(this.shifted[0]))
this.length = src.length
this._b_i = src._b_i
this._s_i = src._s_i
this._e_i = src._e_i
this._break = src._break
this.offset = src.offset
this._empty_token = src._empty_token
void fast_forward() nogil:
# space token attachment policy:

View File

@ -3,6 +3,7 @@ from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t
from .stateclass cimport StateClass
from ..typedefs cimport attr_t
from .transition_system cimport TransitionSystem, Transition
from ..gold cimport GoldParseC

View File

@ -9,10 +9,12 @@ import ctypes
from libc.stdint cimport uint32_t
from libc.string cimport memcpy
from cymem.cymem cimport Pool
from collections import OrderedDict
from thinc.extra.search cimport Beam
import numpy
from .stateclass cimport StateClass
from ._state cimport StateC, is_space_token
from .nonproj import PseudoProjectivity
from .nonproj import is_nonproj_tree
from .transition_system cimport do_func_t, get_cost_func_t
from .transition_system cimport move_cost_func_t, label_cost_func_t
@ -60,7 +62,7 @@ cdef weight_t push_cost(StateClass stcls, const GoldParseC* gold, int target) no
cost += 1
if gold.heads[S_i] == target and (NON_MONOTONIC or not stcls.has_head(S_i)):
cost += 1
cost += Break.is_valid(stcls.c, -1) and Break.move_cost(stcls, gold) == 0
cost += Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0
return cost
@ -73,7 +75,7 @@ cdef weight_t pop_cost(StateClass stcls, const GoldParseC* gold, int target) nog
cost += gold.heads[target] == B_i
if gold.heads[B_i] == B_i or gold.heads[B_i] < target:
break
if Break.is_valid(stcls.c, -1) and Break.move_cost(stcls, gold) == 0:
if Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0:
cost += 1
return cost
@ -84,14 +86,14 @@ cdef weight_t arc_cost(StateClass stcls, const GoldParseC* gold, int head, int c
elif stcls.H(child) == gold.heads[child]:
return 1
# Head in buffer
elif gold.heads[child] >= stcls.B(0) and stcls.B(1) != -1:
elif gold.heads[child] >= stcls.B(0) and stcls.B(1) != 0:
return 1
else:
return 0
cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil:
if gold.labels[child] == -1:
if not gold.has_dep[child]:
return True
elif gold.heads[child] == head:
return True
@ -99,10 +101,10 @@ cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil:
return False
cdef bint label_is_gold(const GoldParseC* gold, int head, int child, int label) nogil:
if gold.labels[child] == -1:
cdef bint label_is_gold(const GoldParseC* gold, int head, int child, attr_t label) nogil:
if not gold.has_dep[child]:
return True
elif label == -1:
elif label == 0:
return True
elif gold.labels[child] == label:
return True
@ -111,21 +113,20 @@ cdef bint label_is_gold(const GoldParseC* gold, int head, int child, int label)
cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil:
return gold.labels[word] == -1 or gold.heads[word] == word
return gold.heads[word] == word or not gold.has_dep[word]
cdef class Shift:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and not st.B_(0).sent_start
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.push()
st.fast_forward()
@staticmethod
cdef weight_t cost(StateClass st, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass st, const GoldParseC* gold, attr_t label) nogil:
return Shift.move_cost(st, gold) + Shift.label_cost(st, gold, label)
@staticmethod
@ -133,17 +134,17 @@ cdef class Shift:
return push_cost(s, gold, s.B(0))
@staticmethod
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return 0
cdef class Reduce:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return st.stack_depth() >= 2
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
if st.has_head(st.S(0)):
st.pop()
else:
@ -151,7 +152,7 @@ cdef class Reduce:
st.fast_forward()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return Reduce.move_cost(s, gold) + Reduce.label_cost(s, gold, label)
@staticmethod
@ -165,28 +166,28 @@ cdef class Reduce:
cost -= 1
if gold.heads[S_i] == st.S(0):
cost -= 1
if Break.is_valid(st.c, -1) and Break.move_cost(st, gold) == 0:
if Break.is_valid(st.c, 0) and Break.move_cost(st, gold) == 0:
cost -= 1
return cost
@staticmethod
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return 0
cdef class LeftArc:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return not st.B_(0).sent_start
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.add_arc(st.B(0), st.S(0), label)
st.pop()
st.fast_forward()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label)
@staticmethod
@ -204,23 +205,23 @@ cdef class LeftArc:
return cost + pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0))
@staticmethod
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return arc_is_gold(gold, s.B(0), s.S(0)) and not label_is_gold(gold, s.B(0), s.S(0), label)
cdef class RightArc:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return not st.B_(0).sent_start
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.add_arc(st.S(0), st.B(0), label)
st.push()
st.fast_forward()
@staticmethod
cdef inline weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef inline weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return RightArc.move_cost(s, gold) + RightArc.label_cost(s, gold, label)
@staticmethod
@ -233,13 +234,13 @@ cdef class RightArc:
return push_cost(s, gold, s.B(0)) + arc_cost(s, gold, s.S(0), s.B(0))
@staticmethod
cdef weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return arc_is_gold(gold, s.S(0), s.B(0)) and not label_is_gold(gold, s.S(0), s.B(0), label)
cdef class Break:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int i
if not USE_BREAK:
return False
@ -251,12 +252,12 @@ cdef class Break:
return True
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.set_break(st.B_(0).l_edge)
st.fast_forward()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return Break.move_cost(s, gold) + Break.label_cost(s, gold, label)
@staticmethod
@ -281,13 +282,13 @@ cdef class Break:
return cost + 1
@staticmethod
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return 0
cdef int _get_root(int word, const GoldParseC* gold) nogil:
while gold.heads[word] != word and gold.labels[word] != -1 and word >= 0:
while gold.heads[word] != word and not gold.has_dep[word] and word >= 0:
word = gold.heads[word]
if gold.labels[word] == -1:
if not gold.has_dep[word]:
return -1
else:
return word
@ -295,9 +296,7 @@ cdef int _get_root(int word, const GoldParseC* gold) nogil:
cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
cdef StateClass st = StateClass.init(<const TokenC*>tokens, length)
# Ensure sent_start is set to 0 throughout
for i in range(st.c.length):
st.c._sent[i].sent_start = False
st.c._sent[i].l_edge = i
st.c._sent[i].r_edge = i
st.fast_forward()
@ -313,21 +312,24 @@ cdef class ArcEager(TransitionSystem):
@classmethod
def get_actions(cls, **kwargs):
actions = kwargs.get('actions',
{
SHIFT: [''],
REDUCE: [''],
RIGHT: [],
LEFT: [],
BREAK: ['ROOT']})
OrderedDict((
(SHIFT, ['']),
(REDUCE, ['']),
(RIGHT, []),
(LEFT, []),
(BREAK, ['ROOT'])
)))
seen_actions = set()
for label in kwargs.get('left_labels', []):
if label.upper() != 'ROOT':
if (LEFT, label) not in seen_actions:
actions[LEFT].append(label)
seen_actions.add((LEFT, label))
for label in kwargs.get('right_labels', []):
if label.upper() != 'ROOT':
if (RIGHT, label) not in seen_actions:
actions[RIGHT].append(label)
seen_actions.add((RIGHT, label))
for raw_text, sents in kwargs.get('gold_parses', []):
for (ids, words, tags, heads, labels, iob), ctnts in sents:
@@ -338,29 +340,39 @@ cdef class ArcEager(TransitionSystem):
if head < child:
if (RIGHT, label) not in seen_actions:
actions[RIGHT].append(label)
seen_actions.add((RIGHT, label))
elif head > child:
if (LEFT, label) not in seen_actions:
actions[LEFT].append(label)
seen_actions.add((LEFT, label))
return actions
property action_types:
def __get__(self):
return (SHIFT, REDUCE, LEFT, RIGHT, BREAK)
cdef int preprocess_gold(self, GoldParse gold) except -1:
def has_gold(self, GoldParse gold, start=0, end=None):
end = end or len(gold.heads)
if all([tag is None for tag in gold.heads[start:end]]):
return False
else:
return True
def preprocess_gold(self, GoldParse gold):
if not self.has_gold(gold):
return None
for i in range(gold.length):
if gold.heads[i] is None: # Missing values
if gold.heads[i] is None or gold.labels[i] is None: # Missing values
gold.c.heads[i] = i
gold.c.labels[i] = -1
gold.c.has_dep[i] = False
else:
label = gold.labels[i]
gold.c.has_dep[i] = True
if label.upper() == 'ROOT':
label = 'ROOT'
gold.c.heads[i] = gold.heads[i]
gold.c.labels[i] = self.strings[label]
# Count frequencies, for use in encoder
self.freqs[HEAD][gold.c.heads[i] - i] += 1
self.freqs[DEP][gold.c.labels[i]] += 1
gold.c.labels[i] = self.strings.add(label)
return gold
cdef Transition lookup_transition(self, object name) except *:
if '-' in name:
@@ -373,15 +385,16 @@ cdef class ArcEager(TransitionSystem):
for i in range(self.n_moves):
if self.c[i].move == move and self.c[i].label == label:
return self.c[i]
return Transition(clas=0, move=MISSING, label=0)
def move_name(self, int move, int label):
def move_name(self, int move, attr_t label):
label_str = self.strings[label]
if label_str:
return MOVE_NAMES[move] + '-' + label_str
else:
return MOVE_NAMES[move]
cdef Transition init_transition(self, int clas, int move, int label) except *:
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
# TODO: Apparent Cython bug here when we try to use the Transition()
# constructor with the function pointers
cdef Transition t
@@ -414,9 +427,7 @@ cdef class ArcEager(TransitionSystem):
return t
cdef int initialize_state(self, StateC* st) nogil:
# Ensure sent_start is set to 0 throughout
for i in range(st.length):
st._sent[i].sent_start = False
st._sent[i].l_edge = i
st._sent[i].r_edge = i
st.fast_forward()
@@ -432,18 +443,19 @@ cdef class ArcEager(TransitionSystem):
cdef int set_valid(self, int* output, const StateC* st) nogil:
cdef bint[N_MOVES] is_valid
is_valid[SHIFT] = Shift.is_valid(st, -1)
is_valid[REDUCE] = Reduce.is_valid(st, -1)
is_valid[LEFT] = LeftArc.is_valid(st, -1)
is_valid[RIGHT] = RightArc.is_valid(st, -1)
is_valid[BREAK] = Break.is_valid(st, -1)
is_valid[SHIFT] = Shift.is_valid(st, 0)
is_valid[REDUCE] = Reduce.is_valid(st, 0)
is_valid[LEFT] = LeftArc.is_valid(st, 0)
is_valid[RIGHT] = RightArc.is_valid(st, 0)
is_valid[BREAK] = Break.is_valid(st, 0)
cdef int i
for i in range(self.n_moves):
output[i] = is_valid[self.c[i].move]
cdef int set_costs(self, int* is_valid, weight_t* costs,
StateClass stcls, GoldParse gold) except -1:
cdef int i, move, label
cdef int i, move
cdef attr_t label
cdef label_cost_func_t[N_MOVES] label_cost_funcs
cdef move_cost_func_t[N_MOVES] move_cost_funcs
cdef weight_t[N_MOVES] move_costs
@@ -461,7 +473,7 @@ cdef class ArcEager(TransitionSystem):
label_cost_funcs[RIGHT] = RightArc.label_cost
label_cost_funcs[BREAK] = Break.label_cost
cdef int* labels = gold.c.labels
cdef attr_t* labels = gold.c.labels
cdef int* heads = gold.c.heads
n_gold = 0
@@ -501,3 +513,23 @@ cdef class ArcEager(TransitionSystem):
"State at failure:\n"
"%s" % (self.n_moves, stcls.print_state(gold.words)))
assert n_gold >= 1
def get_beam_annot(self, Beam beam):
length = (<StateClass>beam.at(0)).c.length
heads = [{} for _ in range(length)]
deps = [{} for _ in range(length)]
probs = beam.probs
for i in range(beam.size):
stcls = <StateClass>beam.at(i)
self.finalize_state(stcls.c)
if stcls.is_final():
prob = probs[i]
for j in range(stcls.c.length):
head = j + stcls.c._sent[j].head
dep = stcls.c._sent[j].dep
heads[j].setdefault(head, 0.0)
heads[j][head] += prob
deps[j].setdefault(dep, 0.0)
deps[j][dep] += prob
return heads, deps
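The aggregation above is plain weighted voting: every finished state in the beam contributes its probability to the head index and dependency label it assigns each token. A minimal pure-Python sketch of the same bookkeeping, using made-up candidate parses instead of real StateClass objects (all names below are hypothetical):

def aggregate_beam(candidates):
    # candidates: list of (prob, heads, deps) for the same sentence, where
    # heads[i] is the predicted head index of token i and deps[i] its label.
    length = len(candidates[0][1])
    head_probs = [{} for _ in range(length)]
    dep_probs = [{} for _ in range(length)]
    for prob, heads, deps in candidates:
        for i, (head, dep) in enumerate(zip(heads, deps)):
            head_probs[i][head] = head_probs[i].get(head, 0.0) + prob
            dep_probs[i][dep] = dep_probs[i].get(dep, 0.0) + prob
    return head_probs, dep_probs

# Two candidate parses of a three-token sentence, with beam probabilities 0.7 / 0.3
candidates = [(0.7, [1, 1, 1], ['nsubj', 'ROOT', 'dobj']),
              (0.3, [1, 1, 0], ['nsubj', 'ROOT', 'nmod'])]
heads, deps = aggregate_beam(candidates)
print(heads[2])   # {1: 0.7, 0: 0.3}
print(deps[2])    # {'dobj': 0.7, 'nmod': 0.3}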

View File

@@ -1,7 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
from ..parts_of_speech cimport NOUN, PROPN, PRON
from ..parts_of_speech cimport NOUN, PROPN, PRON, VERB, AUX
def english_noun_chunks(obj):
@@ -12,9 +12,9 @@ def english_noun_chunks(obj):
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
'attr', 'ROOT']
doc = obj.doc # Ensure works on both Doc and Span.
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings['conj']
np_label = doc.vocab.strings['NP']
np_deps = [doc.vocab.strings.add(label) for label in labels]
conj = doc.vocab.strings.add('conj')
np_label = doc.vocab.strings.add('NP')
seen = set()
for i, word in enumerate(obj):
if word.pos not in (NOUN, PROPN, PRON):
@@ -48,9 +48,9 @@ def english_noun_chunks(obj):
def german_noun_chunks(obj):
labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'ROOT', 'root', 'cj', 'pd', 'og', 'app']
doc = obj.doc # Ensure works on both Doc and Span.
np_label = doc.vocab.strings['NP']
np_deps = set(doc.vocab.strings[label] for label in labels)
close_app = doc.vocab.strings['nk']
np_label = doc.vocab.strings.add('NP')
np_deps = set(doc.vocab.strings.add(label) for label in labels)
close_app = doc.vocab.strings.add('nk')
rbracket = 0
for i, word in enumerate(obj):
@@ -66,4 +66,79 @@ def german_noun_chunks(obj):
yield word.left_edge.i, rbracket, np_label
CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks}
def es_noun_chunks(obj):
doc = obj.doc
np_label = doc.vocab.strings['NP']
left_labels = ['det', 'fixed', 'neg'] #['nunmod', 'det', 'appos', 'fixed']
right_labels = ['flat', 'fixed', 'compound', 'neg']
stop_labels = ['punct']
np_left_deps = [doc.vocab.strings[label] for label in left_labels]
np_right_deps = [doc.vocab.strings[label] for label in right_labels]
stop_deps = [doc.vocab.strings[label] for label in stop_labels]
def next_token(token):
try:
return token.nbor()
except:
return None
def noun_bounds(root):
def is_verb_token(token):
return token.pos in [VERB, AUX]
left_bound = root
for token in reversed(list(root.lefts)):
if token.dep in np_left_deps:
left_bound = token
right_bound = root
for token in root.rights:
if (token.dep in np_right_deps):
left, right = noun_bounds(token)
if list(filter(lambda t: is_verb_token(t) or t.dep in stop_deps,
doc[left_bound.i: right.i])):
break
else:
right_bound = right
return left_bound, right_bound
token = doc[0]
while token and token.i < len(doc):
if token.pos in [PROPN, NOUN, PRON]:
left, right = noun_bounds(token)
yield left.i, right.i+1, np_label
token = right
token = next_token(token)
def french_noun_chunks(obj):
labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
doc = obj.doc # Ensure works on both Doc and Span.
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add('conj')
np_label = doc.vocab.strings.add('NP')
seen = set()
for i, word in enumerate(obj):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
if word.i in seen:
continue
if word.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
yield word.left_edge.i, word.right_edge.i+1, np_label
elif word.dep == conj:
head = word.head
while head.dep == conj and head.head.i < head.i:
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
yield word.left_edge.i, word.right_edge.i+1, np_label
CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks,
'es': es_noun_chunks, 'fr': french_noun_chunks}
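For context, these chunkers back the doc.noun_chunks iterator on parsed documents. A quick usage sketch, assuming an English model with a parser is installed (for example via python -m spacy download en):

import spacy

nlp = spacy.load('en')
doc = nlp(u'The quick brown fox jumped over the lazy dog.')
for chunk in doc.noun_chunks:
    # Each chunk is a Span labelled NP, e.g. "The quick brown fox", "the lazy dog"
    print(chunk.text, chunk.root.dep_)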

View File

@ -1,6 +1,7 @@
from .transition_system cimport TransitionSystem
from .transition_system cimport Transition
from ..gold cimport GoldParseC
from ..typedefs cimport attr_t
cdef class BiluoPushDown(TransitionSystem):

View File

@@ -2,6 +2,10 @@
from __future__ import unicode_literals
from thinc.typedefs cimport weight_t
from thinc.extra.search cimport Beam
from collections import OrderedDict
import numpy
from thinc.neural.ops import NumpyOps
from .stateclass cimport StateClass
from ._state cimport StateC
@@ -51,17 +55,29 @@ cdef bint _entity_is_sunk(StateClass st, Transition* golds) nogil:
cdef class BiluoPushDown(TransitionSystem):
def __init__(self, *args, **kwargs):
TransitionSystem.__init__(self, *args, **kwargs)
def __reduce__(self):
labels_by_action = OrderedDict()
cdef Transition t
for trans in self.c[:self.n_moves]:
label_str = self.strings[trans.label]
labels_by_action.setdefault(trans.move, []).append(label_str)
return (BiluoPushDown, (self.strings, labels_by_action),
None, None)
@classmethod
def get_actions(cls, **kwargs):
actions = kwargs.get('actions',
{
MISSING: [''],
BEGIN: [],
IN: [],
LAST: [],
UNIT: [],
OUT: ['']
})
OrderedDict((
(MISSING, ['']),
(BEGIN, []),
(IN, []),
(LAST, []),
(UNIT, []),
(OUT, [''])
)))
seen_entities = set()
for entity_type in kwargs.get('entity_types', []):
if entity_type in seen_entities:
@@ -87,42 +103,75 @@ cdef class BiluoPushDown(TransitionSystem):
def __get__(self):
return (BEGIN, IN, LAST, UNIT, OUT)
def move_name(self, int move, int label):
def move_name(self, int move, attr_t label):
if move == OUT:
return 'O'
elif move == 'MISSING':
elif move == MISSING:
return 'M'
else:
return MOVE_NAMES[move] + '-' + self.strings[label]
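The move names produced here mirror the BILUO tagging scheme used for the gold annotations: 'B-', 'I-', 'L-' and 'U-' prefixes plus the entity label, 'O' for tokens outside any entity, and 'M' for missing annotation. A small standalone illustration of the scheme itself (the helper below is a toy, not the library's implementation):

def biluo_tags(n_tokens, entities):
    # entities: list of (start_token, end_token, label), end exclusive
    tags = ['O'] * n_tokens
    for start, end, label in entities:
        if end - start == 1:
            tags[start] = 'U-' + label
        else:
            tags[start] = 'B-' + label
            for i in range(start + 1, end - 1):
                tags[i] = 'I-' + label
            tags[end - 1] = 'L-' + label
    return tags

# "Facebook bought WhatsApp in early 2014"
print(biluo_tags(6, [(0, 1, 'ORG'), (2, 3, 'ORG'), (4, 6, 'DATE')]))
# ['U-ORG', 'O', 'U-ORG', 'O', 'B-DATE', 'L-DATE']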
cdef int preprocess_gold(self, GoldParse gold) except -1:
def has_gold(self, GoldParse gold, start=0, end=None):
end = end or len(gold.ner)
if all([tag == '-' for tag in gold.ner[start:end]]):
return False
else:
return True
def preprocess_gold(self, GoldParse gold):
if not self.has_gold(gold):
return None
for i in range(gold.length):
gold.c.ner[i] = self.lookup_transition(gold.ner[i])
# Count frequencies, for use in encoder
if gold.c.ner[i].move in (BEGIN, UNIT):
self.freqs[ENT_IOB][3] += 1
self.freqs[ENT_TYPE][gold.c.ner[i].label] += 1
elif gold.c.ner[i].move in (IN, LAST):
self.freqs[ENT_IOB][2] += 1
self.freqs[ENT_TYPE][0] += 1
elif gold.c.ner[i].move == OUT:
self.freqs[ENT_IOB][1] += 1
self.freqs[ENT_TYPE][0] += 1
else:
self.freqs[ENT_IOB][1] += 1
self.freqs[ENT_TYPE][0] += 1
return gold
def get_beam_annot(self, Beam beam):
entities = {}
probs = beam.probs
for i in range(beam.size):
stcls = <StateClass>beam.at(i)
if stcls.is_final():
self.finalize_state(stcls.c)
prob = probs[i]
for j in range(stcls.c._e_i):
start = stcls.c._ents[j].start
end = stcls.c._ents[j].end
label = stcls.c._ents[j].label
entities.setdefault((start, end, label), 0.0)
entities[(start, end, label)] += prob
return entities
def get_beam_parses(self, Beam beam):
parses = []
probs = beam.probs
for i in range(beam.size):
stcls = <StateClass>beam.at(i)
if stcls.is_final():
self.finalize_state(stcls.c)
prob = probs[i]
parse = []
for j in range(stcls.c._e_i):
start = stcls.c._ents[j].start
end = stcls.c._ents[j].end
label = stcls.c._ents[j].label
parse.append((start, end, self.strings[label]))
parses.append((prob, parse))
return parses
cdef Transition lookup_transition(self, object name) except *:
cdef attr_t label
if name == '-' or name == None:
move_str = 'M'
label = 0
elif name == '!O':
return Transition(clas=0, move=ISNT, label=0, score=0)
elif '-' in name:
move_str, label_str = name.split('-', 1)
# Hacky way to denote 'not this entity'
if label_str.startswith('!'):
label_str = label_str[1:]
move_str = 'x'
label = self.strings[label_str]
label = self.strings.add(label_str)
else:
move_str = name
label = 0
@@ -135,7 +184,7 @@ cdef class BiluoPushDown(TransitionSystem):
else:
raise KeyError(name)
cdef Transition init_transition(self, int clas, int move, int label) except *:
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
# TODO: Apparent Cython bug here when we try to use the Transition()
# constructor with the function pointers
cdef Transition t
@@ -184,21 +233,21 @@ cdef class BiluoPushDown(TransitionSystem):
cdef class Missing:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return False
@staticmethod
cdef int transition(StateC* s, int label) nogil:
cdef int transition(StateC* s, attr_t label) nogil:
pass
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return 9000
cdef class Begin:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
# Ensure we don't clobber preset entities. If no entity preset,
# ent_iob is 0
cdef int preset_ent_iob = st.B_(0).ent_iob
@@ -222,16 +271,16 @@ cdef class Begin:
return label != 0 and not st.entity_is_open()
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.open_ent(label)
st.set_ent_tag(st.B(0), 3, label)
st.push()
st.pop()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label
cdef attr_t g_tag = gold.ner[s.B(0)].label
if g_act == MISSING:
return 0
@@ -251,7 +300,7 @@ cdef class Begin:
cdef class In:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob
if preset_ent_iob == 2:
return False
@@ -267,17 +316,17 @@ cdef class In:
return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.set_ent_tag(st.B(0), 1, label)
st.push()
st.pop()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
move = IN
cdef int next_act = gold.ner[s.B(1)].move if s.B(0) < s.c.length else OUT
cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label
cdef attr_t g_tag = gold.ner[s.B(0)].label
cdef bint is_sunk = _entity_is_sunk(s, gold.ner)
if g_act == MISSING:
@@ -297,30 +346,33 @@ cdef class In:
elif g_act == UNIT:
# I, Gold U --> True iff next tag == O
return next_act != OUT
# Support partial supervision in the form of "not this label"
elif g_act == ISNT:
return 0
else:
return 1
cdef class Last:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
if st.B_(1).ent_iob == 1:
return False
return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.close_ent()
st.set_ent_tag(st.B(0), 1, label)
st.push()
st.pop()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
move = LAST
cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label
cdef attr_t g_tag = gold.ner[s.B(0)].label
if g_act == MISSING:
return 0
@@ -339,13 +391,16 @@ cdef class Last:
elif g_act == UNIT:
# L, Gold U --> True
return 0
# Support partial supervision in the form of "not this label"
elif g_act == ISNT:
return 0
else:
return 1
cdef class Unit:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob
if preset_ent_iob == 2:
return False
@@ -358,7 +413,7 @@ cdef class Unit:
return label != 0 and not st.entity_is_open()
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.open_ent(label)
st.close_ent()
st.set_ent_tag(st.B(0), 3, label)
@@ -366,9 +421,9 @@ cdef class Unit:
st.pop()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label
cdef attr_t g_tag = gold.ner[s.B(0)].label
if g_act == MISSING:
return 0
@@ -388,7 +443,7 @@ cdef class Unit:
cdef class Out:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob
if preset_ent_iob == 3:
return False
@@ -397,17 +452,19 @@ cdef class Out:
return not st.entity_is_open()
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.set_ent_tag(st.B(0), 2, 0)
st.push()
st.pop()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label
cdef attr_t g_tag = gold.ner[s.B(0)].label
if g_act == MISSING or g_act == ISNT:
if g_act == ISNT and g_tag == 0:
return 1
elif g_act == MISSING or g_act == ISNT:
return 0
elif g_act == BEGIN:
# O, Gold B --> False

View File

@@ -5,7 +5,7 @@
# coding: utf-8
from __future__ import unicode_literals, print_function
from collections import Counter
from collections import Counter, OrderedDict
import ujson
import contextlib
@@ -18,6 +18,7 @@ import dill
import numpy.random
cimport numpy as np
from libcpp.vector cimport vector
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from cpython.exc cimport PyErr_CheckSignals
from libc.stdint cimport uint32_t, uint64_t
@@ -28,26 +29,30 @@ from thinc.linear.avgtron cimport AveragedPerceptron
from thinc.linalg cimport VecVec
from thinc.structs cimport SparseArrayC, FeatureC, ExampleC
from thinc.extra.eg cimport Example
from thinc.extra.search cimport Beam
from cymem.cymem cimport Pool, Address
from murmurhash.mrmr cimport hash64
from preshed.maps cimport MapStruct
from preshed.maps cimport map_get
from thinc.api import layerize, chain
from thinc.api import layerize, chain, noop, clone
from thinc.neural import Model, Affine, ELU, ReLu, Maxout
from thinc.neural.ops import NumpyOps
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.util import get_array_module
from .. import util
from ..util import get_async, get_cuda_stream
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
from .._ml import Tok2Vec, doc2feats
from .._ml import Tok2Vec, doc2feats, rebatch
from ..compat import json_dumps
from . import _parse_features
from ._parse_features cimport CONTEXT_SIZE
from ._parse_features cimport fill_context
from .stateclass cimport StateClass
from ._state cimport StateC
from .nonproj import PseudoProjectivity
from . import nonproj
from .transition_system import OracleError
from .transition_system cimport TransitionSystem, Transition
from ..structs cimport TokenC
@@ -104,68 +109,75 @@ cdef class precompute_hiddens:
cached = gpu_cached
self.nF = cached.shape[1]
self.nO = cached.shape[2]
self.nP = cached.shape[3]
self.nP = getattr(lower_model, 'nP', 1)
self.ops = lower_model.ops
self._features = numpy.zeros((batch_size, self.nO, self.nP), dtype='f')
self._is_synchronized = False
self._cuda_stream = cuda_stream
self._cached = cached
self._bp_hiddens = bp_features
def __call__(self, X):
return self.begin_update(X)[0]
def begin_update(self, token_ids, drop=0.):
self._features.fill(0)
cdef const float* get_feat_weights(self) except NULL:
if not self._is_synchronized \
and self._cuda_stream is not None:
self._cuda_stream.synchronize()
self._is_synchronized = True
return <float*>self._cached.data
def __call__(self, X):
return self.begin_update(X)[0]
def begin_update(self, token_ids, drop=0.):
cdef np.ndarray state_vector = numpy.zeros((token_ids.shape[0], self.nO*self.nP), dtype='f')
# This is tricky, but (assuming GPU available);
# - Input to forward on CPU
# - Output from forward on CPU
# - Input to backward on GPU!
# - Output from backward on GPU
cdef np.ndarray state_vector = self._features[:len(token_ids)]
cdef np.ndarray hiddens = self._cached
bp_hiddens = self._bp_hiddens
feat_weights = self.get_feat_weights()
cdef int[:, ::1] ids = token_ids
self._sum_features(<float*>state_vector.data,
<float*>hiddens.data, &ids[0,0],
sum_state_features(<float*>state_vector.data,
feat_weights, &ids[0,0],
token_ids.shape[0], self.nF, self.nO*self.nP)
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
output, bp_output = self._apply_nonlinearity(state_vector)
def backward(d_output, sgd=None):
def backward(d_state_vector, sgd=None):
if bp_nonlinearity is not None:
d_state_vector = bp_nonlinearity(d_state_vector, sgd)
# This will usually be on GPU
if isinstance(d_output, numpy.ndarray):
d_output = self.ops.xp.array(d_output)
d_state_vector = bp_output(d_output, sgd)
if isinstance(d_state_vector, numpy.ndarray):
d_state_vector = self.ops.xp.array(d_state_vector)
d_tokens = bp_hiddens((d_state_vector, token_ids), sgd)
return d_tokens
return output, backward
return state_vector, backward
def _apply_nonlinearity(self, X):
if self.nP < 2:
return X.reshape(X.shape[:2]), lambda dX, sgd=None: dX.reshape(X.shape)
best, which = self.ops.maxout(X)
return best, lambda dX, sgd=None: self.ops.backprop_maxout(dX, which, self.nP)
def _nonlinearity(self, state_vector):
if self.nP == 1:
return state_vector, None
state_vector = state_vector.reshape(
(state_vector.shape[0], state_vector.shape[1]//self.nP, self.nP))
best, which = self.ops.maxout(state_vector)
def backprop(d_best, sgd=None):
return self.ops.backprop_maxout(d_best, which, self.nP)
return best, backprop
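As a reminder of what the maxout nonlinearity above computes, here is a small NumPy sketch, assuming the nP pieces of each output unit are laid out contiguously as in the reshape above: the forward pass keeps only the largest of the nP candidate activations per unit, and the backward pass routes the gradient to the winning piece only. This is just the arithmetic, not thinc's implementation:

import numpy

def maxout(state_vector, nP):
    # state_vector: (batch, nO * nP) -> best: (batch, nO), which: winning piece ids
    batch, width = state_vector.shape
    pieces = state_vector.reshape((batch, width // nP, nP))
    return pieces.max(axis=-1), pieces.argmax(axis=-1)

def backprop_maxout(d_best, which, nP):
    # Scatter the gradient back onto the winning pieces; the losers get zero.
    batch, nO = d_best.shape
    d_pieces = numpy.zeros((batch, nO, nP), dtype=d_best.dtype)
    for b in range(batch):
        for o in range(nO):
            d_pieces[b, o, which[b, o]] = d_best[b, o]
    return d_pieces.reshape((batch, nO * nP))

X = numpy.random.uniform(-1, 1, (4, 8)).astype('f')     # batch=4, nO=4, nP=2
best, which = maxout(X, 2)
d_X = backprop_maxout(numpy.ones_like(best), which, 2)  # shape (4, 8)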
cdef void _sum_features(self, float* output,
const float* cached, const int* token_ids, int B, int F, int O) nogil:
cdef int idx, b, f, i
cdef const float* feature
for b in range(B):
for f in range(F):
if token_ids[f] < 0:
continue
idx = token_ids[f] * F * O + f*O
feature = &cached[idx]
for i in range(O):
output[i] += feature[i]
output += O
token_ids += F
cdef void sum_state_features(float* output,
const float* cached, const int* token_ids, int B, int F, int O) nogil:
cdef int idx, b, f, i
cdef const float* feature
for b in range(B):
for f in range(F):
if token_ids[f] < 0:
continue
idx = token_ids[f] * F * O + f*O
feature = &cached[idx]
for i in range(O):
output[i] += feature[i]
output += O
token_ids += F
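A rough NumPy equivalent of the nogil loop above may help: for each state in the batch, the rows of the precomputed table selected by its feature token ids are summed into the state vector, and a negative id means the feature slot is empty and contributes nothing. The shapes follow the indexing arithmetic above (token_id * F * O + f * O); this is a sketch of the computation, not of the real memory handling:

import numpy

def sum_state_features_py(cached, token_ids):
    # cached: (n_tokens, n_features, width) precomputed vectors
    # token_ids: (batch, n_features) token indices per state, -1 = missing
    batch, n_feats = token_ids.shape
    output = numpy.zeros((batch, cached.shape[2]), dtype='f')
    for b in range(batch):
        for f in range(n_feats):
            idx = token_ids[b, f]
            if idx >= 0:
                output[b] += cached[idx, f]
    return output

cached = numpy.random.uniform(-1, 1, (10, 13, 64)).astype('f')
ids = numpy.array([[0, 1, 2] + [-1] * 10,
                   [3, -1, -1] + [-1] * 10], dtype='i')
print(sum_state_features_py(cached, ids).shape)   # (2, 64)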
cdef void cpu_log_loss(float* d_scores,
@@ -220,25 +232,39 @@ cdef class Parser:
Base class of the DependencyParser and EntityRecognizer.
"""
@classmethod
def Model(cls, nr_class, token_vector_width=128, hidden_width=128, **cfg):
def Model(cls, nr_class, token_vector_width=128, hidden_width=128, depth=1, **cfg):
depth = util.env_opt('parser_hidden_depth', depth)
token_vector_width = util.env_opt('token_vector_width', token_vector_width)
hidden_width = util.env_opt('hidden_width', hidden_width)
maxout_pieces = util.env_opt('parser_maxout_pieces', 1)
lower = PrecomputableMaxouts(hidden_width,
nF=cls.nr_feature,
nI=token_vector_width,
pieces=maxout_pieces)
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2)
tensors = Tok2Vec(token_vector_width, 7500, preprocess=doc2feats())
if parser_maxout_pieces == 1:
lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
nF=cls.nr_feature,
nI=token_vector_width)
else:
lower = PrecomputableMaxouts(hidden_width if depth >= 1 else nr_class,
nF=cls.nr_feature,
nP=parser_maxout_pieces,
nI=token_vector_width)
with Model.use_device('cpu'):
upper = chain(
Maxout(hidden_width),
zero_init(Affine(nr_class))
)
clone(Maxout(hidden_width), (depth-1)),
zero_init(Affine(nr_class, drop_factor=0.0))
)
# TODO: This is an unfortunate hack atm!
# Used to set input dimensions in network.
lower.begin_training(lower.ops.allocate((500, token_vector_width)))
upper.begin_training(upper.ops.allocate((500, hidden_width)))
return lower, upper
cfg = {
'nr_class': nr_class,
'depth': depth,
'token_vector_width': token_vector_width,
'hidden_width': hidden_width,
'maxout_pieces': parser_maxout_pieces
}
return (tensors, lower, upper), cfg
def __init__(self, Vocab vocab, moves=True, model=True, **cfg):
"""
@@ -274,7 +300,7 @@ cdef class Parser:
def __reduce__(self):
return (Parser, (self.vocab, self.moves, self.model), None, None)
def __call__(self, Doc tokens, state=None):
def __call__(self, Doc doc, beam_width=None, beam_density=None):
"""
Apply the parser or entity recognizer, setting the annotations onto the Doc object.
@@ -283,10 +309,26 @@ cdef class Parser:
Returns:
None
"""
self.parse_batch([tokens], state['tokvecs'])
return state
if beam_width is None:
beam_width = self.cfg.get('beam_width', 1)
if beam_density is None:
beam_density = self.cfg.get('beam_density', 0.001)
cdef Beam beam
if beam_width == 1:
states = self.parse_batch([doc], [doc.tensor])
self.set_annotations([doc], states)
return doc
else:
beam = self.beam_parse([doc], [doc.tensor],
beam_width=beam_width, beam_density=beam_density)[0]
output = self.moves.get_beam_annot(beam)
state = <StateClass>beam.at(0)
self.set_annotations([doc], [state])
_cleanup(beam)
return output
def pipe(self, stream, int batch_size=1000, int n_threads=2):
def pipe(self, docs, int batch_size=1000, int n_threads=2,
beam_width=1, beam_density=0.001):
"""
Process a stream of documents.
@@ -298,99 +340,244 @@ cdef class Parser:
The number of threads with which to work on the buffer in parallel.
Yields (Doc): Documents, in order.
"""
cdef StateClass parse_state
cdef Doc doc
queue = []
for batch in cytoolz.partition_all(batch_size, stream):
batch = list(batch)
docs, states = zip(*batch)
parse_states = self.parse_batch(docs, states[0]['tokvecs'])
for docs in cytoolz.partition_all(batch_size, docs):
docs = list(docs)
tokvecs = [doc.tensor for doc in docs]
if beam_width == 1:
parse_states = self.parse_batch(docs, tokvecs)
else:
parse_states = self.beam_parse(docs, tokvecs,
beam_width=beam_width, beam_density=beam_density)
self.set_annotations(docs, parse_states)
yield from zip(docs, states)
yield from docs
def parse_batch(self, docs, tokvecses):
cdef:
precompute_hiddens state2vec
StateClass state
Pool mem
const float* feat_weights
StateC* st
vector[StateC*] next_step, this_step
int nr_class, nr_feat, nr_piece, nr_dim, nr_state
if isinstance(docs, Doc):
docs = [docs]
if isinstance(tokvecses, np.ndarray):
tokvecses = [tokvecses]
tokvecs = self.model[0].ops.flatten(tokvecses)
tokvecs += self.model[0].ops.flatten(self.model[0](docs))
nr_state = len(docs)
nr_class = self.moves.n_moves
nr_dim = tokvecs.shape[1]
nr_feat = self.nr_feature
def parse_batch(self, docs, tokvecs):
cuda_stream = get_cuda_stream()
state2vec, vec2scores = self.get_batch_model(nr_state, tokvecs,
cuda_stream, 0.0)
nr_piece = state2vec.nP
states = self.moves.init_batch(docs)
state2vec, vec2scores = self.get_batch_model(len(states), tokvecs,
cuda_stream, 0.0)
for state in states:
if not state.c.is_final():
next_step.push_back(state.c)
todo = [st for st in states if not st.is_final()]
while todo:
token_ids = self.get_token_ids(states)
vectors = state2vec(token_ids)
feat_weights = state2vec.get_feat_weights()
cdef int i
cdef np.ndarray token_ids = numpy.zeros((nr_state, nr_feat), dtype='i')
cdef np.ndarray is_valid = numpy.zeros((nr_state, nr_class), dtype='i')
cdef np.ndarray scores
c_token_ids = <int*>token_ids.data
c_is_valid = <int*>is_valid.data
while not next_step.empty():
for i in range(next_step.size()):
st = next_step[i]
st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
self.moves.set_valid(&c_is_valid[i*nr_class], st)
vectors = state2vec(token_ids[:next_step.size()])
scores = vec2scores(vectors)
self.transition_batch(states, scores)
todo = [st for st in states if not st.is_final()]
c_scores = <float*>scores.data
for i in range(next_step.size()):
st = next_step[i]
guess = arg_max_if_valid(
&c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class)
action = self.moves.c[guess]
action.do(st, action.label)
this_step, next_step = next_step, this_step
next_step.clear()
for st in this_step:
if not st.is_final():
next_step.push_back(st)
return states
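The loop above is the standard greedy transition-based decoding pattern: score every unfinished state in one batch, apply the highest-scoring valid action to each, and repeat until all states are final. The control flow in a toy pure-Python form, with a stub state and a random scorer standing in for StateClass and the neural model (every name here is hypothetical):

import random

class ToyState(object):
    """Stands in for StateClass: finishes after `length` actions."""
    def __init__(self, length):
        self.length = length
        self.history = []

    def is_final(self):
        return len(self.history) >= self.length

    def apply(self, action):
        self.history.append(action)

def greedy_decode(states, n_actions, score_fn, valid_fn):
    todo = [s for s in states if not s.is_final()]
    while todo:
        for state in todo:
            scores = score_fn(state)
            valid = valid_fn(state)
            # arg-max over valid actions only, mirroring arg_max_if_valid below
            best = max((a for a in range(n_actions) if valid[a]),
                       key=lambda a: scores[a])
            state.apply(best)
        todo = [s for s in todo if not s.is_final()]
    return states

states = greedy_decode([ToyState(3), ToyState(5)], n_actions=4,
                       score_fn=lambda s: [random.random() for _ in range(4)],
                       valid_fn=lambda s: [True, True, False, True])
print([s.history for s in states])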
def update(self, docs, golds, state=None, drop=0., sgd=None):
assert state is not None
assert 'tokvecs' in state
assert 'bp_tokvecs' in state
def beam_parse(self, docs, tokvecses, int beam_width=8, float beam_density=0.001):
cdef Beam beam
cdef np.ndarray scores
cdef Doc doc
cdef int nr_class = self.moves.n_moves
cdef StateClass stcls, output
tokvecs = self.model[0].ops.flatten(tokvecses)
tokvecs += self.model[0].ops.flatten(self.model[0](docs))
cuda_stream = get_cuda_stream()
state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs,
cuda_stream, 0.0)
beams = []
cdef int offset = 0
for doc in docs:
beam = Beam(nr_class, beam_width, min_density=beam_density)
beam.initialize(self.moves.init_beam_state, doc.length, doc.c)
for i in range(beam.width):
stcls = <StateClass>beam.at(i)
stcls.c.offset = offset
offset += len(doc)
beam.check_done(_check_final_state, NULL)
while not beam.is_done:
states = []
for i in range(beam.size):
stcls = <StateClass>beam.at(i)
states.append(stcls)
token_ids = self.get_token_ids(states)
vectors = state2vec(token_ids)
scores = vec2scores(vectors)
for i in range(beam.size):
stcls = <StateClass>beam.at(i)
if not stcls.is_final():
self.moves.set_valid(beam.is_valid[i], stcls.c)
for j in range(nr_class):
beam.scores[i][j] = scores[i, j]
beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
beam.check_done(_check_final_state, NULL)
beams.append(beam)
return beams
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
if losses is not None and self.name not in losses:
losses[self.name] = 0.
docs, tokvec_lists = docs_tokvecs
tokvecs = self.model[0].ops.flatten(tokvec_lists)
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
docs = [docs]
golds = [golds]
my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs, drop=0.)
my_tokvecs = self.model[0].ops.flatten(my_tokvecs)
tokvecs += my_tokvecs
cuda_stream = get_cuda_stream()
for gold in golds:
self.moves.preprocess_gold(gold)
tokvecs = state['tokvecs']
bp_tokvecs = state['bp_tokvecs']
states = self.moves.init_batch(docs)
states, golds, max_steps = self._init_gold_batch(docs, golds)
state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream,
drop)
todo = [(s, g) for s, g in zip(states, golds) if not s.is_final()]
0.0)
todo = [(s, g) for (s, g) in zip(states, golds)
if not s.is_final() and g is not None]
if not todo:
return None
backprops = []
d_tokvecs = state2vec.ops.allocate(tokvecs.shape)
cdef float loss = 0.
cutoff = max(1, len(todo) // 10)
while len(todo) >= cutoff:
n_steps = 0
while todo:
states, golds = zip(*todo)
token_ids = self.get_token_ids(states)
vector, bp_vector = state2vec.begin_update(token_ids, drop=drop)
vector, bp_vector = state2vec.begin_update(token_ids, drop=0.0)
if drop != 0:
mask = vec2scores.ops.get_dropout_mask(vector.shape, drop)
vector *= mask
scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
d_scores = self.get_batch_loss(states, golds, scores)
d_vector = bp_scores(d_scores, sgd=sgd)
loss += (d_scores**2).sum()
d_vector = bp_scores(d_scores / d_scores.shape[0], sgd=sgd)
if drop != 0:
d_vector *= mask
if not isinstance(tokvecs, state2vec.ops.xp.ndarray):
backprops.append((token_ids, d_vector, bp_vector))
else:
if isinstance(self.model[0].ops, CupyOps) \
and not isinstance(token_ids, state2vec.ops.xp.ndarray):
# Move token_ids and d_vector to CPU, asynchronously
backprops.append((
get_async(cuda_stream, token_ids),
get_async(cuda_stream, d_vector),
bp_vector
))
else:
backprops.append((token_ids, d_vector, bp_vector))
self.transition_batch(states, scores)
todo = [st for st in todo if not st[0].is_final()]
if losses is not None:
losses[self.name] += (d_scores**2).sum()
n_steps += 1
if n_steps >= max_steps:
break
self._make_updates(d_tokvecs,
backprops, sgd, cuda_stream)
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
#bp_my_tokvecs(d_tokvecs, sgd=sgd)
return d_tokvecs
def _init_gold_batch(self, whole_docs, whole_golds):
"""Make a square batch, of length equal to the shortest doc. A long
doc will get multiple states. Let's say we have a doc of length 2*N,
where N is the shortest doc. We'll make two states, one representing
long_doc[:N], and another representing long_doc[N:]."""
cdef:
StateClass state
Transition action
whole_states = self.moves.init_batch(whole_docs)
max_length = max(5, min(50, min([len(doc) for doc in whole_docs])))
max_moves = 0
states = []
golds = []
for doc, state, gold in zip(whole_docs, whole_states, whole_golds):
gold = self.moves.preprocess_gold(gold)
if gold is None:
continue
oracle_actions = self.moves.get_oracle_sequence(doc, gold)
start = 0
while start < len(doc):
state = state.copy()
n_moves = 0
while state.B(0) < start and not state.is_final():
action = self.moves.c[oracle_actions.pop(0)]
action.do(state.c, action.label)
n_moves += 1
has_gold = self.moves.has_gold(gold, start=start,
end=start+max_length)
if not state.is_final() and has_gold:
states.append(state)
golds.append(gold)
max_moves = max(max_moves, n_moves)
start += min(max_length, len(doc)-start)
max_moves = max(max_moves, len(oracle_actions))
return states, golds, max_moves
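The batching strategy described in the docstring is easier to see on plain numbers: every document is cut into windows of at most max_length tokens, where max_length comes from the shortest document (clamped to the range 5..50), so each training state covers a comparable stretch of text. A sketch of just that windowing arithmetic, with no parser state involved:

def gold_windows(doc_lengths):
    max_length = max(5, min(50, min(doc_lengths)))
    windows = []
    for length in doc_lengths:
        start = 0
        while start < length:
            end = start + min(max_length, length - start)
            windows.append((start, end))
            start = end
    return max_length, windows

print(gold_windows([12, 25, 60]))
# max_length = 12, so the 60-token doc alone contributes five states:
# (0, 12), (12, 24), (24, 36), (36, 48), (48, 60)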
def _make_updates(self, d_tokvecs, backprops, sgd, cuda_stream=None):
# Tells CUDA to block, so our async copies complete.
if cuda_stream is not None:
cuda_stream.synchronize()
d_tokvecs = state2vec.ops.allocate(tokvecs.shape)
xp = state2vec.ops.xp # Handle for numpy/cupy
for token_ids, d_vector, bp_vector in backprops:
xp = get_array_module(d_tokvecs)
for ids, d_vector, bp_vector in backprops:
d_state_features = bp_vector(d_vector, sgd=sgd)
active_feats = token_ids * (token_ids >= 0)
active_feats = active_feats.reshape((token_ids.shape[0], token_ids.shape[1], 1))
active_feats = ids * (ids >= 0)
active_feats = active_feats.reshape((ids.shape[0], ids.shape[1], 1))
if hasattr(xp, 'scatter_add'):
xp.scatter_add(d_tokvecs,
token_ids, d_state_features * active_feats)
ids, d_state_features * active_feats)
else:
xp.add.at(d_tokvecs,
token_ids, d_state_features * active_feats)
bp_tokvecs(d_tokvecs, sgd)
state['parser_loss'] = loss
return state
ids, d_state_features * active_feats)
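The scatter_add / add.at call above performs an indexed accumulation: each feature's gradient row is added into d_tokvecs at the token position the feature pointed to, and repeated ids must accumulate rather than overwrite, which is why plain fancy-index assignment would be wrong. A minimal NumPy illustration of that step, including the same masking trick for missing (-1) features:

import numpy

d_tokvecs = numpy.zeros((5, 4), dtype='f')        # one gradient row per token
ids = numpy.array([[0, 2, 2, -1]], dtype='i')     # feature slot -> token id, -1 = missing
d_feats = numpy.ones((1, 4, 4), dtype='f')        # gradient per (state, feature slot)

active = (ids >= 0).reshape((1, 4, 1))            # mask out missing features
numpy.add.at(d_tokvecs, ids * (ids >= 0), d_feats * active)

print(d_tokvecs[2])   # [2. 2. 2. 2.]  two feature slots pointed at token 2
print(d_tokvecs[0])   # [1. 1. 1. 1.]  the masked -1 slot added nothing to token 0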
@property
def move_names(self):
names = []
for i in range(self.moves.n_moves):
name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
names.append(name)
return names
def get_batch_model(self, batch_size, tokvecs, stream, dropout):
lower, upper = self.model
_, lower, upper = self.model
state2vec = precompute_hiddens(batch_size, tokvecs,
lower, stream, drop=dropout)
return state2vec, upper
@@ -400,9 +587,13 @@ cdef class Parser:
def get_token_ids(self, states):
cdef StateClass state
cdef int n_tokens = self.nr_feature
ids = numpy.zeros((len(states), n_tokens), dtype='i', order='C')
cdef np.ndarray ids = numpy.zeros((len(states), n_tokens),
dtype='i', order='C')
c_ids = <int*>ids.data
for i, state in enumerate(states):
state.set_context_tokens(ids[i])
if not state.is_final():
state.c.set_context_tokens(c_ids, n_tokens)
c_ids += ids.shape[1]
return ids
def transition_batch(self, states, float[:, ::1] scores):
@@ -445,7 +636,6 @@ cdef class Parser:
self.moves.finalize_doc(doc)
def add_label(self, label):
# Doesn't set label into serializer -- subclasses override it to do that.
for action in self.moves.action_types:
added = self.moves.add_action(action, label)
if added:
@@ -456,12 +646,18 @@ cdef class Parser:
def begin_training(self, gold_tuples, **cfg):
if 'model' in cfg:
self.model = cfg['model']
gold_tuples = nonproj.preprocess_training_data(gold_tuples)
actions = self.moves.get_actions(gold_parses=gold_tuples)
for action, labels in actions.items():
for label in labels:
self.moves.add_action(action, label)
if self.model is True:
self.model = self.Model(self.moves.n_moves, **cfg)
self.model, cfg = self.Model(self.moves.n_moves, **cfg)
self.cfg.update(cfg)
def preprocess_gold(self, docs_golds):
for doc, gold in docs_golds:
yield doc, gold
def use_params(self, params):
# Can't decorate cdef class :(. Workaround.
@@ -469,21 +665,85 @@ cdef class Parser:
with self.model[1].use_params(params):
yield
def to_disk(self, path):
path = util.ensure_path(path)
with (path / 'model.bin').open('wb') as file_:
dill.dump(self.model, file_)
def to_disk(self, path, **exclude):
serializers = {
'tok2vec_model': lambda p: p.open('wb').write(
self.model[0].to_bytes()),
'lower_model': lambda p: p.open('wb').write(
self.model[1].to_bytes()),
'upper_model': lambda p: p.open('wb').write(
self.model[2].to_bytes()),
'vocab': lambda p: self.vocab.to_disk(p),
'moves': lambda p: self.moves.to_disk(p, strings=False),
'cfg': lambda p: p.open('w').write(json_dumps(self.cfg))
}
util.to_disk(path, serializers, exclude)
def from_disk(self, path):
path = util.ensure_path(path)
with (path / 'model.bin').open('wb') as file_:
self.model = dill.load(file_)
def from_disk(self, path, **exclude):
deserializers = {
'vocab': lambda p: self.vocab.from_disk(p),
'moves': lambda p: self.moves.from_disk(p, strings=False),
'cfg': lambda p: self.cfg.update(ujson.load(p.open())),
'model': lambda p: None
}
util.from_disk(path, deserializers, exclude)
if 'model' not in exclude:
path = util.ensure_path(path)
if self.model is True:
self.model, cfg = self.Model(**self.cfg)
else:
cfg = {}
with (path / 'tok2vec_model').open('rb') as file_:
bytes_data = file_.read()
self.model[0].from_bytes(bytes_data)
with (path / 'lower_model').open('rb') as file_:
bytes_data = file_.read()
self.model[1].from_bytes(bytes_data)
with (path / 'upper_model').open('rb') as file_:
bytes_data = file_.read()
self.model[2].from_bytes(bytes_data)
self.cfg.update(cfg)
return self
def to_bytes(self):
pass
def to_bytes(self, **exclude):
serializers = OrderedDict((
('tok2vec_model', lambda: self.model[0].to_bytes()),
('lower_model', lambda: self.model[1].to_bytes()),
('upper_model', lambda: self.model[2].to_bytes()),
('vocab', lambda: self.vocab.to_bytes()),
('moves', lambda: self.moves.to_bytes(strings=False)),
('cfg', lambda: ujson.dumps(self.cfg))
))
if 'model' in exclude:
exclude['tok2vec_model'] = True
exclude['lower_model'] = True
exclude['upper_model'] = True
exclude.pop('model')
return util.to_bytes(serializers, exclude)
def from_bytes(self, data):
pass
def from_bytes(self, bytes_data, **exclude):
deserializers = OrderedDict((
('vocab', lambda b: self.vocab.from_bytes(b)),
('moves', lambda b: self.moves.from_bytes(b, strings=False)),
('cfg', lambda b: self.cfg.update(ujson.loads(b))),
('tok2vec_model', lambda b: None),
('lower_model', lambda b: None),
('upper_model', lambda b: None)
))
msg = util.from_bytes(bytes_data, deserializers, exclude)
if 'model' not in exclude:
if self.model is True:
self.model, cfg = self.Model(self.moves.n_moves)
else:
cfg = {}
if 'tok2vec_model' in msg:
self.model[0].from_bytes(msg['tok2vec_model'])
if 'lower_model' in msg:
self.model[1].from_bytes(msg['lower_model'])
if 'upper_model' in msg:
self.model[2].from_bytes(msg['upper_model'])
self.cfg.update(cfg)
return self
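The to_bytes/from_bytes pair above (and the disk variants) follow the convention used across the library: an ordered dict of named getter callbacks is run by util.to_bytes, a matching dict of setters is run by util.from_bytes, and individual keys can be skipped through **exclude. A rough standalone sketch of that convention, using json for brevity; this illustrates the pattern only and is not spaCy's actual util implementation:

import json
from collections import OrderedDict

def to_bytes(getters, exclude=()):
    msg = OrderedDict((name, getter()) for name, getter in getters.items()
                      if name not in exclude)
    return json.dumps(msg).encode('utf8')

def from_bytes(bytes_data, setters, exclude=()):
    msg = json.loads(bytes_data.decode('utf8'))
    for name, setter in setters.items():
        if name not in exclude and name in msg:
            setter(msg[name])
    return msg

cfg = {}
data = to_bytes(OrderedDict([('cfg', lambda: {'beam_width': 4})]))
from_bytes(data, OrderedDict([('cfg', lambda b: cfg.update(b))]))
print(cfg)   # {'beam_width': 4}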
class ParserStateError(ValueError):
@@ -521,6 +781,19 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no
return best
cdef int arg_maxout_if_valid(const weight_t* scores, const int* is_valid,
int n, int nP) nogil:
cdef int best = -1
cdef float best_score = 0
for i in range(n):
if is_valid[i] >= 1:
for j in range(nP):
if best == -1 or scores[i*nP+j] > best_score:
best = i
best_score = scores[i*nP+j]
return best
cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions,
int nr_class) except -1:
cdef weight_t score = 0
@@ -531,3 +804,30 @@ cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actio
mode = i
score = scores[i]
return mode
# These are passed as callbacks to thinc.search.Beam
cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
dest = <StateClass>_dest
src = <StateClass>_src
moves = <const Transition*>_moves
dest.clone(src)
moves[clas].do(dest.c, moves[clas].label)
cdef int _check_final_state(void* _state, void* extra_args) except -1:
return (<StateClass>_state).is_final()
def _cleanup(Beam beam):
for i in range(beam.width):
Py_XDECREF(<PyObject*>beam._states[i].content)
Py_XDECREF(<PyObject*>beam._parents[i].content)
cdef hash_t _hash_state(void* _state, void* _) except 0:
state = <StateClass>_state
if state.c.is_final():
return 1
else:
return state.c.hash()

View File

@@ -1,10 +1,17 @@
# coding: utf-8
"""
Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
for doing pseudo-projective parsing implementation uses the HEAD decoration
scheme.
"""
from __future__ import unicode_literals
from copy import copy
from ..tokens.doc cimport Doc
from ..attrs import DEP, HEAD
DELIMITER = '||'
def ancestors(tokenid, heads):
# returns all words going from the word up the path to the root
@@ -60,145 +67,124 @@ def is_nonproj_tree(heads):
return any( is_nonproj_arc(word,heads) for word in range(len(heads)) )
class PseudoProjectivity:
# implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
# for doing pseudo-projective parsing
# implementation uses the HEAD decoration scheme
delimiter = '||'
@classmethod
def decompose(cls, label):
return label.partition(cls.delimiter)[::2]
@classmethod
def is_decorated(cls, label):
return label.find(cls.delimiter) != -1
@classmethod
def preprocess_training_data(cls, gold_tuples, label_freq_cutoff=30):
preprocessed = []
freqs = {}
for raw_text, sents in gold_tuples:
prepro_sents = []
for (ids, words, tags, heads, labels, iob), ctnts in sents:
proj_heads,deco_labels = cls.projectivize(heads,labels)
# set the label to ROOT for each root dependent
deco_labels = [ 'ROOT' if head == i else deco_labels[i] for i,head in enumerate(proj_heads) ]
# count label frequencies
if label_freq_cutoff > 0:
for label in deco_labels:
if cls.is_decorated(label):
freqs[label] = freqs.get(label,0) + 1
prepro_sents.append(((ids,words,tags,proj_heads,deco_labels,iob), ctnts))
preprocessed.append((raw_text, prepro_sents))
if label_freq_cutoff > 0:
return cls._filter_labels(preprocessed,label_freq_cutoff,freqs)
return preprocessed
def decompose(label):
return label.partition(DELIMITER)[::2]
@classmethod
def projectivize(cls, heads, labels):
# use the algorithm by Nivre & Nilsson 2005
# assumes heads to be a proper tree, i.e. connected and cycle-free
# returns a new pair (heads,labels) which encode
# a projective and decorated tree
proj_heads = copy(heads)
smallest_np_arc = cls._get_smallest_nonproj_arc(proj_heads)
if smallest_np_arc == None: # this sentence is already projective
return proj_heads, copy(labels)
while smallest_np_arc != None:
cls._lift(smallest_np_arc, proj_heads)
smallest_np_arc = cls._get_smallest_nonproj_arc(proj_heads)
deco_labels = cls._decorate(heads, proj_heads, labels)
return proj_heads, deco_labels
def is_decorated(label):
return label.find(DELIMITER) != -1
@classmethod
def deprojectivize(cls, tokens):
# reattach arcs with decorated labels (following HEAD scheme)
# for each decorated arc X||Y, search top-down, left-to-right,
# breadth-first until hitting a Y then make this the new head
#parse = tokens.to_array([HEAD, DEP])
for token in tokens:
if cls.is_decorated(token.dep_):
newlabel,headlabel = cls.decompose(token.dep_)
newhead = cls._find_new_head(token,headlabel)
token.head = newhead
token.dep_ = newlabel
def preprocess_training_data(gold_tuples, label_freq_cutoff=30):
preprocessed = []
freqs = {}
for raw_text, sents in gold_tuples:
prepro_sents = []
for (ids, words, tags, heads, labels, iob), ctnts in sents:
proj_heads,deco_labels = projectivize(heads,labels)
# set the label to ROOT for each root dependent
deco_labels = [ 'ROOT' if head == i else deco_labels[i] for i,head in enumerate(proj_heads) ]
# count label frequencies
if label_freq_cutoff > 0:
for label in deco_labels:
if is_decorated(label):
freqs[label] = freqs.get(label,0) + 1
prepro_sents.append(((ids,words,tags,proj_heads,deco_labels,iob), ctnts))
preprocessed.append((raw_text, prepro_sents))
# tokens.attach(token,newhead,newlabel)
#parse[token.i,1] = tokens.vocab.strings[newlabel]
#parse[token.i,0] = newhead.i - token.i
#tokens.from_array([HEAD, DEP],parse)
if label_freq_cutoff > 0:
return _filter_labels(preprocessed,label_freq_cutoff,freqs)
return preprocessed
@classmethod
def _decorate(cls, heads, proj_heads, labels):
# uses decoration scheme HEAD from Nivre & Nilsson 2005
assert(len(heads) == len(proj_heads) == len(labels))
deco_labels = []
for tokenid,head in enumerate(heads):
if head != proj_heads[tokenid]:
deco_labels.append('%s%s%s' % (labels[tokenid],cls.delimiter,labels[head]))
else:
deco_labels.append(labels[tokenid])
return deco_labels
def projectivize(heads, labels):
# use the algorithm by Nivre & Nilsson 2005
# assumes heads to be a proper tree, i.e. connected and cycle-free
# returns a new pair (heads,labels) which encode
# a projective and decorated tree
proj_heads = copy(heads)
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
if smallest_np_arc == None: # this sentence is already projective
return proj_heads, copy(labels)
while smallest_np_arc != None:
_lift(smallest_np_arc, proj_heads)
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
deco_labels = _decorate(heads, proj_heads, labels)
return proj_heads, deco_labels
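A worked example makes the lift-and-decorate scheme concrete. In "A hearing is scheduled today on the issue", the preposition "on" attaches non-projectively to "hearing" across the "today" -> "scheduled" arc. The assertions below follow the definition of projectivize above; the module path in the import is an assumption about where this file lives:

from spacy.syntax.nonproj import projectivize

#          A  hearing  is  scheduled  today  on  the  issue
heads  =  [1,  3,       3,  3,         3,     1,  7,   5]
labels = ['det', 'nsubjpass', 'auxpass', 'ROOT', 'advmod', 'prep', 'det', 'pobj']

proj_heads, deco_labels = projectivize(heads, labels)

# The single non-projective arc (hearing -> on) is lifted to the grandparent,
# the root 'scheduled', and its new label records the original head's label:
assert proj_heads == [1, 3, 3, 3, 3, 3, 7, 5]
assert deco_labels[5] == 'prep||nsubjpass'

At parse time deprojectivize reverses this: a token whose label contains '||' searches breadth-first below its current head for the first descendant labelled with the part after the delimiter ('nsubjpass' here, i.e. 'hearing') and reattaches to it.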
@classmethod
def _get_smallest_nonproj_arc(cls, heads):
# return the smallest non-proj arc or None
# where size is defined as the distance between dep and head
# and ties are broken left to right
smallest_size = float('inf')
smallest_np_arc = None
for tokenid,head in enumerate(heads):
size = abs(tokenid-head)
if size < smallest_size and is_nonproj_arc(tokenid,heads):
smallest_size = size
smallest_np_arc = tokenid
return smallest_np_arc
def deprojectivize(tokens):
# reattach arcs with decorated labels (following HEAD scheme)
# for each decorated arc X||Y, search top-down, left-to-right,
# breadth-first until hitting a Y then make this the new head
for token in tokens:
if is_decorated(token.dep_):
newlabel,headlabel = decompose(token.dep_)
newhead = _find_new_head(token,headlabel)
token.head = newhead
token.dep_ = newlabel
return tokens
def _decorate(heads, proj_heads, labels):
# uses decoration scheme HEAD from Nivre & Nilsson 2005
assert(len(heads) == len(proj_heads) == len(labels))
deco_labels = []
for tokenid,head in enumerate(heads):
if head != proj_heads[tokenid]:
deco_labels.append('%s%s%s' % (labels[tokenid], DELIMITER, labels[head]))
else:
deco_labels.append(labels[tokenid])
return deco_labels
@classmethod
def _lift(cls, tokenid, heads):
# reattaches a word to its grandfather
head = heads[tokenid]
ghead = heads[head]
# attach to ghead if head isn't attached to root else attach to root
heads[tokenid] = ghead if head != ghead else tokenid
def _get_smallest_nonproj_arc(heads):
# return the smallest non-proj arc or None
# where size is defined as the distance between dep and head
# and ties are broken left to right
smallest_size = float('inf')
smallest_np_arc = None
for tokenid,head in enumerate(heads):
size = abs(tokenid-head)
if size < smallest_size and is_nonproj_arc(tokenid,heads):
smallest_size = size
smallest_np_arc = tokenid
return smallest_np_arc
@classmethod
def _find_new_head(cls, token, headlabel):
# search through the tree starting from the head of the given token
# returns the id of the first descendant with the given label
# if there is none, return the current head (no change)
queue = [token.head]
while queue:
next_queue = []
for qtoken in queue:
for child in qtoken.children:
if child.is_space: continue
if child == token: continue
if child.dep_ == headlabel:
return child
next_queue.append(child)
queue = next_queue
return token.head
def _lift(tokenid, heads):
# reattaches a word to its grandfather
head = heads[tokenid]
ghead = heads[head]
# attach to ghead if head isn't attached to root else attach to root
heads[tokenid] = ghead if head != ghead else tokenid
@classmethod
def _filter_labels(cls, gold_tuples, cutoff, freqs):
# throw away infrequent decorated labels
# can't learn them reliably anyway and keeps label set smaller
filtered = []
for raw_text, sents in gold_tuples:
filtered_sents = []
for (ids, words, tags, heads, labels, iob), ctnts in sents:
filtered_labels = [ cls.decompose(label)[0] if freqs.get(label,cutoff) < cutoff else label for label in labels ]
filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts))
filtered.append((raw_text, filtered_sents))
return filtered
def _find_new_head(token, headlabel):
# search through the tree starting from the head of the given token
# returns the id of the first descendant with the given label
# if there is none, return the current head (no change)
queue = [token.head]
while queue:
next_queue = []
for qtoken in queue:
for child in qtoken.children:
if child.is_space: continue
if child == token: continue
if child.dep_ == headlabel:
return child
next_queue.append(child)
queue = next_queue
return token.head
def _filter_labels(gold_tuples, cutoff, freqs):
# throw away infrequent decorated labels
# can't learn them reliably anyway and keeps label set smaller
filtered = []
for raw_text, sents in gold_tuples:
filtered_sents = []
for (ids, words, tags, heads, labels, iob), ctnts in sents:
filtered_labels = [ decompose(label)[0] if freqs.get(label,cutoff) < cutoff else label for label in labels ]
filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts))
filtered.append((raw_text, filtered_sents))
return filtered

View File

@@ -33,7 +33,6 @@ from ._parse_features cimport CONTEXT_SIZE
from ._parse_features cimport fill_context
from .stateclass cimport StateClass
from ._state cimport StateC
from .nonproj import PseudoProjectivity
from .transition_system import OracleError
from .transition_system cimport TransitionSystem, Transition
from ..structs cimport TokenC

View File

@@ -4,6 +4,7 @@ from cymem.cymem cimport Pool
cimport cython
from ..structs cimport TokenC, Entity
from ..typedefs cimport attr_t
from ..vocab cimport EMPTY_LEXEME
from ._state cimport StateC
@@ -105,19 +106,19 @@ cdef class StateClass:
cdef inline void unshift(self) nogil:
self.c.unshift()
cdef inline void add_arc(self, int head, int child, int label) nogil:
cdef inline void add_arc(self, int head, int child, attr_t label) nogil:
self.c.add_arc(head, child, label)
cdef inline void del_arc(self, int head, int child) nogil:
self.c.del_arc(head, child)
cdef inline void open_ent(self, int label) nogil:
cdef inline void open_ent(self, attr_t label) nogil:
self.c.open_ent(label)
cdef inline void close_ent(self) nogil:
self.c.close_ent()
cdef inline void set_ent_tag(self, int i, int ent_iob, int ent_type) nogil:
cdef inline void set_ent_tag(self, int i, int ent_iob, attr_t ent_type) nogil:
self.c.set_ent_tag(i, ent_iob, ent_type)
cdef inline void set_break(self, int i) nogil:

View File

@@ -41,6 +41,11 @@ cdef class StateClass:
def is_final(self):
return self.c.is_final()
def copy(self):
cdef StateClass new_state = StateClass.init(self.c._sent, self.c.length)
new_state.c.clone(self.c)
return new_state
def print_state(self, words):
words = list(words) + ['_']
top = words[self.S(0)] + '_%d' % self.S_(0).head

View File

@@ -1,6 +1,7 @@
from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t
from ..typedefs cimport attr_t
from ..structs cimport TokenC
from ..gold cimport GoldParse
from ..gold cimport GoldParseC
@@ -13,20 +14,22 @@ from ._state cimport StateC
cdef struct Transition:
int clas
int move
int label
attr_t label
weight_t score
bint (*is_valid)(const StateC* state, int label) nogil
weight_t (*get_cost)(StateClass state, const GoldParseC* gold, int label) nogil
int (*do)(StateC* state, int label) nogil
bint (*is_valid)(const StateC* state, attr_t label) nogil
weight_t (*get_cost)(StateClass state, const GoldParseC* gold, attr_t label) nogil
int (*do)(StateC* state, attr_t label) nogil
ctypedef weight_t (*get_cost_func_t)(StateClass state, const GoldParseC* gold, int label) nogil
ctypedef weight_t (*get_cost_func_t)(StateClass state, const GoldParseC* gold,
attr_t label) nogil
ctypedef weight_t (*move_cost_func_t)(StateClass state, const GoldParseC* gold) nogil
ctypedef weight_t (*label_cost_func_t)(StateClass state, const GoldParseC* gold, int label) nogil
ctypedef weight_t (*label_cost_func_t)(StateClass state, const GoldParseC*
gold, attr_t label) nogil
ctypedef int (*do_func_t)(StateC* state, int label) nogil
ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil
ctypedef void* (*init_state_t)(Pool mem, int length, void* tokens) except NULL
@@ -36,18 +39,16 @@ cdef class TransitionSystem:
cdef Transition* c
cdef readonly int n_moves
cdef int _size
cdef public int root_label
cdef public attr_t root_label
cdef public freqs
cdef init_state_t init_beam_state
cdef int initialize_state(self, StateC* state) nogil
cdef int finalize_state(self, StateC* state) nogil
cdef int preprocess_gold(self, GoldParse gold) except -1
cdef Transition lookup_transition(self, object name) except *
cdef Transition init_transition(self, int clas, int move, int label) except *
cdef Transition init_transition(self, int clas, int move, attr_t label) except *
cdef int set_valid(self, int* output, const StateC* st) nogil

View File

@@ -5,11 +5,14 @@ from __future__ import unicode_literals
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t
from collections import defaultdict
from collections import defaultdict, OrderedDict
import ujson
from .. import util
from ..structs cimport TokenC
from .stateclass cimport StateClass
from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
from ..typedefs cimport attr_t
cdef weight_t MIN_SCORE = -90000
@@ -26,7 +29,7 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
cdef class TransitionSystem:
def __init__(self, StringStore string_table, dict labels_by_action, _freqs=None):
def __init__(self, StringStore string_table, labels_by_action):
self.mem = Pool()
self.strings = string_table
self.n_moves = 0
@@ -34,28 +37,20 @@ cdef class TransitionSystem:
self.c = <Transition*>self.mem.alloc(self._size, sizeof(Transition))
for action, label_strs in sorted(labels_by_action.items()):
for action, label_strs in labels_by_action.items():
for label_str in label_strs:
self.add_action(int(action), label_str)
self.root_label = self.strings['ROOT']
self.freqs = {} if _freqs is None else _freqs
for attr in (TAG, HEAD, DEP, ENT_TYPE, ENT_IOB):
self.freqs[attr] = defaultdict(int)
self.freqs[attr][0] = 1
# Ensure we've seen heads. Need an official dependency length limit...
for i in range(10024):
self.freqs[HEAD][i] = 1
self.freqs[HEAD][-i] = 1
self.root_label = self.strings.add('ROOT')
self.init_beam_state = _init_state
def __reduce__(self):
labels_by_action = {}
labels_by_action = OrderedDict()
cdef Transition t
for trans in self.c[:self.n_moves]:
label_str = self.strings[trans.label]
labels_by_action.setdefault(trans.move, []).append(label_str)
return (self.__class__,
(self.strings, labels_by_action, self.freqs),
(self.strings, labels_by_action),
None, None)
def init_batch(self, docs):
@@ -69,6 +64,29 @@ cdef class TransitionSystem:
offset += len(doc)
return states
def get_oracle_sequence(self, doc, GoldParse gold):
cdef Pool mem = Pool()
costs = <float*>mem.alloc(self.n_moves, sizeof(float))
is_valid = <int*>mem.alloc(self.n_moves, sizeof(int))
cdef StateClass state = StateClass(doc, offset=0)
self.initialize_state(state.c)
history = []
while not state.is_final():
self.set_costs(is_valid, costs, state, gold)
for i in range(self.n_moves):
if is_valid[i] and costs[i] <= 0:
action = self.c[i]
history.append(i)
action.do(state.c, action.label)
break
else:
print(gold.words)
print(gold.ner)
print(history)
raise ValueError("Could not find gold move")
return history
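What the method above implements is a static oracle: at every state it takes the first valid action whose cost against the gold annotation is zero, and it fails loudly if no such action exists, which would mean the gold parse is unreachable for this transition system. The same control flow in a toy setting, with validity and costs supplied as plain functions (all names hypothetical):

def oracle_sequence(state, actions, is_valid, cost):
    history = []
    while not state.is_final():
        for i, action in enumerate(actions):
            if is_valid(state, i) and cost(state, i) <= 0:
                action(state)
                history.append(i)
                break
        else:
            raise ValueError("Could not find gold move")
    return history

# Toy task: move a counter from 0 to 3; 'inc' is the only zero-cost action.
class Counter(object):
    def __init__(self):
        self.value = 0
    def is_final(self):
        return self.value >= 3

inc = lambda s: setattr(s, 'value', s.value + 1)
noop = lambda s: None
print(oracle_sequence(Counter(), [noop, inc],
                      is_valid=lambda s, i: True,
                      cost=lambda s, i: 0 if i == 1 else 1))
# [1, 1, 1]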
cdef int initialize_state(self, StateC* state) nogil:
pass
@@ -78,17 +96,19 @@ cdef class TransitionSystem:
def finalize_doc(self, doc):
pass
cdef int preprocess_gold(self, GoldParse gold) except -1:
def preprocess_gold(self, GoldParse gold):
raise NotImplementedError
cdef Transition lookup_transition(self, object name) except *:
raise NotImplementedError
cdef Transition init_transition(self, int clas, int move, int label) except *:
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
raise NotImplementedError
def is_valid(self, StateClass stcls, move_name):
action = self.lookup_transition(move_name)
if action.move == 0:
return False
return action.is_valid(stcls.c, action.label)
cdef int set_valid(self, int* is_valid, const StateC* st) nogil:
@ -100,24 +120,80 @@ cdef class TransitionSystem:
StateClass stcls, GoldParse gold) except -1:
cdef int i
self.set_valid(is_valid, stcls.c)
cdef int n_gold = 0
for i in range(self.n_moves):
if is_valid[i]:
costs[i] = self.c[i].get_cost(stcls, &gold.c, self.c[i].label)
n_gold += costs[i] <= 0
else:
costs[i] = 9000
if n_gold <= 0:
print(gold.words)
print(gold.ner)
print([gold.c.ner[i].clas for i in range(gold.length)])
print([gold.c.ner[i].move for i in range(gold.length)])
print([gold.c.ner[i].label for i in range(gold.length)])
print("Self labels", [self.c[i].label for i in range(self.n_moves)])
raise ValueError(
"Could not find a gold-standard action to supervise "
"the entity recognizer\n"
"The transition system has %d actions." % (self.n_moves))
def add_action(self, int action, label):
if not isinstance(label, int):
label = self.strings[label]
def get_class_name(self, int clas):
act = self.c[clas]
return self.move_name(act.move, act.label)
def add_action(self, int action, label_name):
cdef attr_t label_id
if not isinstance(label_name, int):
label_id = self.strings.add(label_name)
else:
label_id = label_name
# Check we're not creating a move we already have, so that this is
# idempotent
for trans in self.c[:self.n_moves]:
if trans.move == action and trans.label == label:
if trans.move == action and trans.label == label_id:
return 0
if self.n_moves >= self._size:
self._size *= 2
self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0]))
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label)
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
assert self.c[self.n_moves].label == label_id
self.n_moves += 1
return 1
def to_disk(self, path, **exclude):
with path.open('wb') as file_:
file_.write(self.to_bytes(**exclude))
def from_disk(self, path, **exclude):
with path.open('rb') as file_:
byte_data = file_.read()
self.from_bytes(byte_data, **exclude)
return self
def to_bytes(self, **exclude):
transitions = []
for trans in self.c[:self.n_moves]:
transitions.append({
'clas': trans.clas,
'move': trans.move,
'label': self.strings[trans.label],
'name': self.move_name(trans.move, trans.label)
})
serializers = {
'transitions': lambda: ujson.dumps(transitions),
'strings': lambda: self.strings.to_bytes()
}
return util.to_bytes(serializers, exclude)
def from_bytes(self, bytes_data, **exclude):
transitions = []
deserializers = {
'transitions': lambda b: transitions.extend(ujson.loads(b)),
'strings': lambda b: self.strings.from_bytes(b)
}
msg = util.from_bytes(bytes_data, deserializers, exclude)
for trans in transitions:
self.add_action(trans['move'], trans['label'])
return self

View File

@ -1,7 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
import ujson
from collections import defaultdict
from cymem.cymem cimport Pool
@ -15,7 +14,6 @@ from .tokens.doc cimport Doc
from .attrs cimport TAG
from .gold cimport GoldParse
from .attrs cimport *
from . import util
cpdef enum:
@ -108,55 +106,15 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
cdef class Tagger:
"""
Annotate part-of-speech tags on Doc objects.
"""
@classmethod
def load(cls, path, vocab, require=False):
"""
Load the statistical model from the supplied path.
Arguments:
path (Path):
The path to load from.
vocab (Vocab):
The vocabulary. Must be shared by the documents to be processed.
require (bool):
Whether to raise an error if the files are not found.
Returns (Tagger):
The newly created object.
"""
# TODO: Change this to expect config.json when we don't have to
# support old data.
path = util.ensure_path(path)
if (path / 'templates.json').exists():
with (path / 'templates.json').open('r', encoding='utf8') as file_:
templates = ujson.load(file_)
elif require:
raise IOError(
"Required file %s/templates.json not found when loading Tagger" % str(path))
else:
templates = cls.feature_templates
self = cls(vocab, model=None, feature_templates=templates)
if (path / 'model').exists():
self.model.load(str(path / 'model'))
elif require:
raise IOError(
"Required file %s/model not found when loading Tagger" % str(path))
return self
"""Annotate part-of-speech tags on Doc objects."""
def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
"""
Create a Tagger.
"""Create a Tagger.
Arguments:
vocab (Vocab):
The vocabulary object. Must be shared with documents to be processed.
model (thinc.linear.AveragedPerceptron):
The statistical model.
Returns (Tagger):
The newly constructed object.
vocab (Vocab): The vocabulary object. Must be shared with documents to
be processed.
model (thinc.linear.AveragedPerceptron): The statistical model.
RETURNS (Tagger): The newly constructed object.
"""
if model is None:
model = TaggerModel(cfg.get('features', self.feature_templates),
@ -186,13 +144,9 @@ cdef class Tagger:
tokens._py_tokens = [None] * tokens.length
def __call__(self, Doc tokens):
"""
Apply the tagger, setting the POS tags onto the Doc object.
"""Apply the tagger, setting the POS tags onto the Doc object.
Arguments:
doc (Doc): The tokens to be tagged.
Returns:
None
doc (Doc): The tokens to be tagged.
"""
if tokens.length == 0:
return 0
@ -215,34 +169,25 @@ cdef class Tagger:
tokens._py_tokens = [None] * tokens.length
def pipe(self, stream, batch_size=1000, n_threads=2):
"""
Tag a stream of documents.
"""Tag a stream of documents.
Arguments:
stream: The sequence of documents to tag.
batch_size (int):
The number of documents to accumulate into a working set.
n_threads (int):
The number of threads with which to work on the buffer in parallel,
if the Matcher implementation supports multi-threading.
Yields:
Doc Documents, in order.
stream: The sequence of documents to tag.
batch_size (int): The number of documents to accumulate into a working set.
n_threads (int): The number of threads with which to work on the buffer
in parallel, if the Matcher implementation supports multi-threading.
YIELDS (Doc): Documents, in order.
"""
for doc in stream:
self(doc)
yield doc
def update(self, Doc tokens, GoldParse gold, itn=0):
"""
Update the statistical model, with tags supplied for the given document.
"""Update the statistical model, with tags supplied for the given document.
Arguments:
doc (Doc):
The document to update on.
gold (GoldParse):
Manager for the gold-standard tags.
Returns (int):
Number of tags correct.
doc (Doc): The document to update on.
gold (GoldParse): Manager for the gold-standard tags.
RETURNS (int): Number of tags predicted correctly.
"""
gold_tag_strs = gold.tags
assert len(tokens) == len(gold_tag_strs)

View File

@ -13,21 +13,32 @@ Tests for spaCy modules and classes live in their own directories of the same na
2. [Dos and don'ts](#dos-and-donts)
3. [Parameters](#parameters)
4. [Fixtures](#fixtures)
5. [Helpers and utilities](#helpers-and-utilities)
6. [Contributing to the tests](#contributing-to-the-tests)
5. [Testing models](#testing-models)
6. [Helpers and utilities](#helpers-and-utilities)
7. [Contributing to the tests](#contributing-to-the-tests)
## Running the tests
To show print statements, run the tests with `py.test -s`. To abort after the
first failure, run them with `py.test -x`.
```bash
py.test spacy # run basic tests
py.test spacy --models # run basic and model tests
py.test spacy --slow # run basic and slow tests
py.test spacy --models --slow # run all tests
py.test spacy # run basic tests
py.test spacy --models --en # run basic and English model tests
py.test spacy --models --all # run basic and all model tests
py.test spacy --slow # run basic and slow tests
py.test spacy --models --all --slow # run all tests
```
To show print statements, run the tests with `py.test -s`. To abort after the first failure, run them with `py.test -x`.
You can also run tests in a specific file or directory, or even only one
specific test:
```bash
py.test spacy/tests/tokenizer # run all tests in directory
py.test spacy/tests/tokenizer/test_exceptions.py # run all tests in file
py.test spacy/tests/tokenizer/test_exceptions.py::test_tokenizer_handles_emoji # run specific test
```
## Dos and don'ts
@ -83,14 +94,9 @@ These are the main fixtures that are currently available:
| Fixture | Description |
| --- | --- |
| `tokenizer` | Creates **all available** language tokenizers and runs the test for **each of them**. |
| `en_tokenizer` | Creates an English `Tokenizer` object. |
| `de_tokenizer` | Creates a German `Tokenizer` object. |
| `hu_tokenizer` | Creates a Hungarian `Tokenizer` object. |
| `en_vocab` | Creates an English `Vocab` object. |
| `en_entityrecognizer` | Creates an English `EntityRecognizer` object. |
| `lemmatizer` | Creates a `Lemmatizer` object from the installed language data (`None` if no data is found).
| `EN` | Creates an instance of `English`. Only use for tests that require the models. |
| `DE` | Creates an instance of `German`. Only use for tests that require the models. |
| `en_tokenizer`, `de_tokenizer`, ... | Creates an English, German etc. tokenizer. |
| `en_vocab`, `en_entityrecognizer`, ... | Creates an instance of the English `Vocab`, `EntityRecognizer` object etc. |
| `EN`, `DE`, ... | Creates a language class with a loaded model. For more info, see [Testing models](#testing-models). |
| `text_file` | Creates an instance of `StringIO` to simulate reading from and writing to files. |
| `text_file_b` | Creates an instance of `ByteIO` to simulate reading from and writing to files. |
@ -103,6 +109,48 @@ def test_module_do_something(en_tokenizer):
If all tests in a file require a specific configuration, or use the same complex example, it can be helpful to create a separate fixture. This fixture should be added at the top of each file. Make sure to use descriptive names for these fixtures and don't override any of the global fixtures listed above. **From looking at a test, it should immediately be clear which fixtures are used, and where they are coming from.**
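A minimal sketch of what such a file-level fixture could look like (the fixture name and example text are illustrative, not taken from an existing test file):

```python
import pytest


@pytest.fixture
def sample_text():
    # Shared example sentence used by several tests in this file
    return "I like New York in Autumn."


def test_module_does_something(en_tokenizer, sample_text):
    tokens = en_tokenizer(sample_text)
    assert len(tokens) > 1
```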
## Testing models
Models should only be loaded and tested **if absolutely necessary**, for example if you're specifically testing a model's performance, or if your test is related to model loading. If you only need an annotated `Doc`, you should use the `get_doc()` helper function to create it manually instead.
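For instance, a test that needs dependency annotations can build the `Doc` directly with `get_doc()`. This is only a sketch, and the relative import depth depends on where your test file lives:

```python
from ..util import get_doc  # adjust the number of dots to your test directory


def test_relative_heads(en_tokenizer):
    tokens = en_tokenizer("I like New York")
    heads = [1, 0, 1, -2]   # head offsets relative to each token
    deps = ['nsubj', 'ROOT', 'compound', 'dobj']
    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)
    assert doc[3].head.text == "like"
```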
To specify which language models a test is related to, set the language ID as an argument of `@pytest.mark.models`. This allows you to later run the tests with `--models --en`. You can then use the `EN` [fixture](#fixtures) to get a language
class with a loaded model.
```python
@pytest.mark.models('en')
def test_english_model(EN):
doc = EN(u'This is a test')
```
> ⚠️ **Important note:** In order to test models, they need to be installed as a package. The [conftest.py](conftest.py) includes a list of all available models, mapped to their IDs, e.g. `en`. Unless otherwise specified, each model that's installed in your environment will be imported and tested. If you don't have a model installed, **the test will be skipped**.
Under the hood, `pytest.importorskip` is used to import a model package and skip the test if the package is not installed. The `EN` fixture, for example, gets all available models for `en`, [parametrizes](#parameters) them to run the test for *each of them*, and uses `load_test_model()` to import the model and run the test, or skip it if the model is not installed.
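In simplified form, the helper roughly amounts to the following. This is a sketch only, the actual `load_test_model()` in the test utilities may differ:

```python
import pytest


def load_test_model(model):
    # Import the model package if it's installed, otherwise skip the calling test
    module = pytest.importorskip(model)
    return module.load()
```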
### Testing specific models
Using the `load_test_model()` helper function, you can also write tests for specific models, or combinations of them:
```python
from .util import load_test_model
@pytest.mark.models('en')
def test_en_md_only():
nlp = load_test_model('en_core_web_md')
# test something specific to en_core_web_md
@pytest.mark.models('en', 'fr')
@pytest.mark.parametrize('model', ['en_core_web_md', 'fr_depvec_web_lg'])
def test_different_models(model):
nlp = load_test_model(model)
# test something specific to the parametrized models
```
### Known issues and future improvements
Using `importorskip` on a list of model packages is not ideal and we're looking to improve this in the future. But at the moment, it's the best way to ensure that tests are performed on specific model packages only, and that you'll always be able to run the tests, even if you don't have *all available models* installed. (If the tests made a call to `spacy.load('en')` instead, this would load whichever model you've created an `en` shortcut for. This may be one of spaCy's default models, but it could just as easily be your own custom English model.)
The current setup also doesn't provide an easy way to only run tests on specific model versions. The `minversion` keyword argument on `pytest.importorskip` can take care of this, but it currently only checks for the package's `__version__` attribute. An alternative solution would be to load a model package's meta.json and skip if the model's version does not match the one specified in the test.
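A rough sketch of that alternative is shown below. The helper name and the assumption that `meta.json` ships at the package root are illustrative, not existing code:

```python
import json
from pathlib import Path

import pytest


def load_test_model_version(model, required_version):
    # Hypothetical helper: skip unless the installed package reports the required version
    module = pytest.importorskip(model)
    meta_path = Path(module.__file__).parent / 'meta.json'  # assumes meta.json at the package root
    if not meta_path.exists():
        pytest.skip("no meta.json found for %s" % model)
    with meta_path.open('r', encoding='utf8') as file_:
        meta = json.load(file_)
    if meta.get('version') != required_version:
        pytest.skip("%s==%s required, found %s" % (model, required_version, meta.get('version')))
    return module.load()
```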
## Helpers and utilities
@ -152,11 +200,11 @@ print([token.dep_ for token in doc])
**Note:** There's currently no way of setting the serializer data for the parser without loading the models. If this is relevant to your test, constructing the `Doc` via `get_doc()` won't work.
### Other utilities
| Name | Description |
| --- | --- |
| `load_test_model` | Load a model if it's installed as a package, otherwise skip test. |
| `apply_transition_sequence(parser, doc, sequence)` | Perform a series of pre-specified transitions, to put the parser in a desired state. |
| `add_vecs_to_vocab(vocab, vectors)` | Add list of vector tuples (`[("text", [1, 2, 3])]`) to given vocab. All vectors need to have the same length. |
| `get_cosine(vec1, vec2)` | Get cosine for two given vectors. |
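For example, the vector helpers might be combined like this. This is a sketch, and the exact assertions depend on what your test checks:

```python
import numpy

from .util import add_vecs_to_vocab, get_cosine  # adjust the relative import to your test directory


def test_vector_similarity(en_vocab):
    vectors = [("apple", [1, 2, 3]), ("orange", [-1, 2, 3])]
    add_vecs_to_vocab(en_vocab, vectors)
    cosine = get_cosine(numpy.asarray([1, 2, 3]), numpy.asarray([-1, 2, 3]))
    assert 0.0 < cosine < 1.0
```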

View File

@ -1,25 +1,50 @@
# coding: utf-8
from __future__ import unicode_literals
from ..tokens import Doc
from ..strings import StringStore
from ..lemmatizer import Lemmatizer
from ..attrs import ORTH, TAG, HEAD, DEP
from .. import util
from io import StringIO, BytesIO
from pathlib import Path
import pytest
from .util import load_test_model
from ..tokens import Doc
from ..strings import StringStore
from .. import util
_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'it', 'nb',
'nl', 'pl', 'pt', 'sv']
'nl', 'pl', 'pt', 'sv', 'xx']
_models = {'en': ['en_depent_web_sm', 'en_core_web_md'],
'de': ['de_core_news_md'],
'fr': ['fr_depvec_web_lg'],
'xx': ['xx_ent_web_md']}
@pytest.fixture(params=_languages)
def tokenizer(request):
lang = util.get_lang_class(request.param)
return lang.Defaults.create_tokenizer()
# only used for tests that require loading the models
# in all other cases, use specific instances
@pytest.fixture(params=_models['en'])
def EN(request):
return load_test_model(request.param)
@pytest.fixture(params=_models['de'])
def DE(request):
return load_test_model(request.param)
@pytest.fixture(params=_models['fr'])
def FR(request):
return load_test_model(request.param)
#@pytest.fixture(params=_languages)
#def tokenizer(request):
#lang = util.get_lang_class(request.param)
#return lang.Defaults.create_tokenizer()
@pytest.fixture
def tokenizer():
return util.get_lang_class('xx').Defaults.create_tokenizer()
@pytest.fixture
@ -47,7 +72,7 @@ def de_tokenizer():
return util.get_lang_class('de').Defaults.create_tokenizer()
@pytest.fixture(scope='module')
@pytest.fixture
def fr_tokenizer():
return util.get_lang_class('fr').Defaults.create_tokenizer()
@ -91,11 +116,6 @@ def en_entityrecognizer():
return util.get_lang_class('en').Defaults.create_entity()
@pytest.fixture
def lemmatizer():
return util.get_lang_class('en').Defaults.create_lemmatizer()
@pytest.fixture
def text_file():
return StringIO()
@ -105,22 +125,6 @@ def text_file_b():
return BytesIO()
# only used for tests that require loading the models
# in all other cases, use specific instances
@pytest.fixture(scope="session")
def EN():
return English()
@pytest.fixture(scope="session")
def DE():
return German()
@pytest.fixture(scope="session")
def FR():
return French()
def pytest_addoption(parser):
parser.addoption("--models", action="store_true",
help="include tests that require full models")
@ -129,8 +133,18 @@ def pytest_addoption(parser):
parser.addoption("--slow", action="store_true",
help="include slow tests")
for lang in _languages + ['all']:
parser.addoption("--%s" % lang, action="store_true", help="Use %s models" % lang)
def pytest_runtest_setup(item):
for opt in ['models', 'vectors', 'slow']:
if opt in item.keywords and not item.config.getoption("--%s" % opt):
pytest.skip("need --%s option to run" % opt)
# Check if test is marked with models and has arguments set, i.e. specific
# language. If so, skip test if flag not set.
if item.get_marker('models'):
for arg in item.get_marker('models').args:
if not item.config.getoption("--%s" % arg) and not item.config.getoption("--all"):
pytest.skip("need --%s or --all option to run" % arg)

View File

@ -102,7 +102,7 @@ def test_doc_api_getitem(en_tokenizer):
def test_doc_api_serialize(en_tokenizer, text):
tokens = en_tokenizer(text)
new_tokens = get_doc(tokens.vocab).from_bytes(tokens.to_bytes())
assert tokens.string == new_tokens.string
assert tokens.text == new_tokens.text
assert [t.text for t in tokens] == [t.text for t in new_tokens]
assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
@ -204,6 +204,7 @@ def test_doc_api_right_edge(en_tokenizer):
assert doc[6].right_edge.text == ','
@pytest.mark.xfail
@pytest.mark.parametrize('text,vectors', [
("apple orange pear", ["apple -1 -1 -1", "orange -1 -1 0", "pear -1 0 -1"])
])

View File

@ -68,6 +68,7 @@ def test_doc_token_api_is_properties(en_vocab):
assert doc[5].like_email
@pytest.mark.xfail
@pytest.mark.parametrize('text,vectors', [
("apples oranges ldskbjls", ["apples -1 -1 -1", "oranges -1 -1 0"])
])
@ -99,8 +100,8 @@ def test_doc_token_api_ancestors(en_tokenizer):
assert [t.text for t in doc[1].ancestors] == ["saw"]
assert [t.text for t in doc[2].ancestors] == []
assert doc[2].is_ancestor_of(doc[7])
assert not doc[6].is_ancestor_of(doc[2])
assert doc[2].is_ancestor(doc[7])
assert not doc[6].is_ancestor(doc[2])
def test_doc_token_api_head_setter(en_tokenizer):
@ -155,3 +156,15 @@ def test_doc_token_api_head_setter(en_tokenizer):
assert doc[3].left_edge.i == 0
assert doc[4].left_edge.i == 0
assert doc[2].left_edge.i == 0
def test_sent_start(en_tokenizer):
doc = en_tokenizer(u'This is a sentence. This is another.')
assert not doc[0].sent_start
assert not doc[5].sent_start
doc[5].sent_start = True
assert doc[5].sent_start
assert not doc[0].sent_start
doc.is_parsed = True
assert len(list(doc.sents)) == 2

View File

@ -1,72 +0,0 @@
# coding: utf-8
import pytest
import numpy
@pytest.mark.models
class TestModelSanity:
"""
This is to make sure the model works as expected. The tests make sure that
values are properly set.
Tests are not meant to evaluate the content of the output, only make sure
the output is formally okay.
"""
@pytest.fixture(scope='class', params=['en','de'])
def example(self, request, EN, DE):
assert EN.entity != None
assert DE.entity != None
if request.param == 'en':
doc = EN(u'There was a stranger standing at the big ' +
u'street talking to herself.')
elif request.param == 'de':
doc = DE(u'An der großen Straße stand eine merkwürdige ' +
u'Gestalt und führte Selbstgespräche.')
return doc
def test_tokenization(self, example):
# tokenization should split the document into tokens
assert len(example) > 1
def test_tagging(self, example):
# if tagging was done properly, pos tags shouldn't be empty
assert example.is_tagged
assert all( t.pos != 0 for t in example )
assert all( t.tag != 0 for t in example )
def test_parsing(self, example):
# if parsing was done properly
# - dependency labels shouldn't be empty
# - the head of some tokens should not be root
assert example.is_parsed
assert all( t.dep != 0 for t in example )
assert any( t.dep != i for i,t in enumerate(example) )
def test_ner(self, example):
# if ner was done properly, ent_iob shouldn't be empty
assert all([t.ent_iob != 0 for t in example])
def test_vectors(self, example):
# if vectors are available, they should differ on different words
# this isn't a perfect test since this could in principle fail
# in a sane model as well,
# but that's very unlikely and a good indicator if something is wrong
vector0 = example[0].vector
vector1 = example[1].vector
vector2 = example[2].vector
assert not numpy.array_equal(vector0,vector1)
assert not numpy.array_equal(vector0,vector2)
assert not numpy.array_equal(vector1,vector2)
def test_probs(self, example):
# if frequencies/probabilities are okay, they should differ for
# different words
# this isn't a perfect test since this could in principle fail
# in a sane model as well,
# but that's very unlikely and a good indicator if something is wrong
prob0 = example[0].prob
prob1 = example[1].prob
prob2 = example[2].prob
assert not prob0 == prob1
assert not prob0 == prob2
assert not prob1 == prob2

View File

@ -8,20 +8,33 @@ import pytest
@pytest.mark.parametrize('text', ["auf'm", "du's", "über'm", "wir's"])
def test_tokenizer_splits_contractions(de_tokenizer, text):
def test_de_tokenizer_splits_contractions(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 2
@pytest.mark.parametrize('text', ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."])
def test_tokenizer_handles_abbr(de_tokenizer, text):
def test_de_tokenizer_handles_abbr(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 1
def test_tokenizer_handles_exc_in_text(de_tokenizer):
def test_de_tokenizer_handles_exc_in_text(de_tokenizer):
text = "Ich bin z.Zt. im Urlaub."
tokens = de_tokenizer(text)
assert len(tokens) == 6
assert tokens[2].text == "z.Zt."
assert tokens[2].lemma_ == "zur Zeit"
@pytest.mark.parametrize('text,norms', [("vor'm", ["vor", "dem"]), ("du's", ["du", "es"])])
def test_de_tokenizer_norm_exceptions(de_tokenizer, text, norms):
tokens = de_tokenizer(text)
assert [token.norm_ for token in tokens] == norms
@pytest.mark.xfail
@pytest.mark.parametrize('text,norm', [("daß", "dass")])
def test_de_lex_attrs_norm_exceptions(de_tokenizer, text, norm):
tokens = de_tokenizer(text)
assert tokens[0].norm_ == norm

View File

@ -0,0 +1,77 @@
# coding: utf-8
from __future__ import unicode_literals
import numpy
import pytest
@pytest.fixture
def example(DE):
"""
These tests make sure the model works as expected and that values are properly
set. They are not meant to evaluate the content of the output, only to check
that the output is formally okay.
"""
assert DE.entity != None
return DE('An der großen Straße stand eine merkwürdige Gestalt und führte Selbstgespräche.')
@pytest.mark.models('de')
def test_de_models_tokenization(example):
# tokenization should split the document into tokens
assert len(example) > 1
@pytest.mark.xfail
@pytest.mark.models('de')
def test_de_models_tagging(example):
# if tagging was done properly, pos tags shouldn't be empty
assert example.is_tagged
assert all(t.pos != 0 for t in example)
assert all(t.tag != 0 for t in example)
@pytest.mark.models('de')
def test_de_models_parsing(example):
# if parsing was done properly
# - dependency labels shouldn't be empty
# - the head of some tokens should not be root
assert example.is_parsed
assert all(t.dep != 0 for t in example)
assert any(t.dep != i for i,t in enumerate(example))
@pytest.mark.models('de')
def test_de_models_ner(example):
# if ner was done properly, ent_iob shouldn't be empty
assert all([t.ent_iob != 0 for t in example])
@pytest.mark.models('de')
def test_de_models_vectors(example):
# if vectors are available, they should differ on different words
# this isn't a perfect test since this could in principle fail
# in a sane model as well,
# but that's very unlikely and a good indicator if something is wrong
vector0 = example[0].vector
vector1 = example[1].vector
vector2 = example[2].vector
assert not numpy.array_equal(vector0,vector1)
assert not numpy.array_equal(vector0,vector2)
assert not numpy.array_equal(vector1,vector2)
@pytest.mark.xfail
@pytest.mark.models('de')
def test_de_models_probs(example):
# if frequencies/probabilities are okay, they should differ for
# different words
# this isn't a perfect test since this could in principle fail
# in a sane model as well,
# but that's very unlikely and a good indicator if something is wrong
prob0 = example[0].prob
prob1 = example[1].prob
prob2 = example[2].prob
assert not prob0 == prob1
assert not prob0 == prob2
assert not prob1 == prob2

View File

@ -0,0 +1,35 @@
# coding: utf-8
from __future__ import unicode_literals
from ...util import get_doc
import pytest
def test_de_parser_noun_chunks_standard_de(de_tokenizer):
text = "Eine Tasse steht auf dem Tisch."
heads = [1, 1, 0, -1, 1, -2, -4]
tags = ['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', '$.']
deps = ['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'punct']
tokens = de_tokenizer(text)
doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads)
chunks = list(doc.noun_chunks)
assert len(chunks) == 2
assert chunks[0].text_with_ws == "Eine Tasse "
assert chunks[1].text_with_ws == "dem Tisch "
def test_de_extended_chunk(de_tokenizer):
text = "Die Sängerin singt mit einer Tasse Kaffee Arien."
heads = [1, 1, 0, -1, 1, -2, -1, -5, -6]
tags = ['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', 'NN', 'NN', '$.']
deps = ['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'nk', 'oa', 'punct']
tokens = de_tokenizer(text)
doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads)
chunks = list(doc.noun_chunks)
assert len(chunks) == 3
assert chunks[0].text_with_ws == "Die Sängerin "
assert chunks[1].text_with_ws == "einer Tasse Kaffee "
assert chunks[2].text_with_ws == "Arien "

View File

@ -1,87 +0,0 @@
# coding: utf-8
"""Test that tokens are created correctly for contractions."""
from __future__ import unicode_literals
import pytest
def test_tokenizer_handles_basic_contraction(en_tokenizer):
text = "don't giggle"
tokens = en_tokenizer(text)
assert len(tokens) == 3
assert tokens[1].text == "n't"
text = "i said don't!"
tokens = en_tokenizer(text)
assert len(tokens) == 5
assert tokens[4].text == "!"
@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"])
def test_tokenizer_handles_basic_contraction_punct(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text_poss,text', [("Robin's", "Robin"), ("Alexis's", "Alexis")])
def test_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text):
tokens = en_tokenizer(text_poss)
assert len(tokens) == 2
assert tokens[0].text == text
assert tokens[1].text == "'s"
@pytest.mark.parametrize('text', ["schools'", "Alexis'"])
def test_tokenizer_splits_trailing_apos(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 2
assert tokens[0].text == text.split("'")[0]
assert tokens[1].text == "'"
@pytest.mark.parametrize('text', ["'em", "nothin'", "ol'"])
def text_tokenizer_doesnt_split_apos_exc(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 1
assert tokens[0].text == text
@pytest.mark.parametrize('text', ["we'll", "You'll", "there'll"])
def test_tokenizer_handles_ll_contraction(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 2
assert tokens[0].text == text.split("'")[0]
assert tokens[1].text == "'ll"
assert tokens[1].lemma_ == "will"
@pytest.mark.parametrize('text_lower,text_title', [("can't", "Can't"), ("ain't", "Ain't")])
def test_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_title):
tokens_lower = en_tokenizer(text_lower)
tokens_title = en_tokenizer(text_title)
assert tokens_title[0].text == tokens_lower[0].text.title()
assert tokens_lower[0].text == tokens_title[0].text.lower()
assert tokens_lower[1].text == tokens_title[1].text
@pytest.mark.parametrize('pron', ["I", "You", "He", "She", "It", "We", "They"])
@pytest.mark.parametrize('contraction', ["'ll", "'d"])
def test_tokenizer_keeps_title_case(en_tokenizer, pron, contraction):
tokens = en_tokenizer(pron + contraction)
assert tokens[0].text == pron
assert tokens[1].text == contraction
@pytest.mark.parametrize('exc', ["Ill", "ill", "Hell", "hell", "Well", "well"])
def test_tokenizer_excludes_ambiguous(en_tokenizer, exc):
tokens = en_tokenizer(exc)
assert len(tokens) == 1
@pytest.mark.parametrize('wo_punct,w_punct', [("We've", "``We've"), ("couldn't", "couldn't)")])
def test_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct):
tokens = en_tokenizer(wo_punct)
assert len(tokens) == 2
tokens = en_tokenizer(w_punct)
assert len(tokens) == 3

Some files were not shown because too many files have changed in this diff.