Matthew Honnibal 2015-07-25 21:14:07 +02:00
commit 2e6a60eaec
52 changed files with 317092 additions and 195 deletions

View File

@ -11,11 +11,18 @@ python:
# install dependencies
install:
- "pip install --upgrade setuptools"
- "rm -rf spacy/"
- "pip install spacy"
- "pip install cython fabric fabtools"
- "pip install -r requirements.txt"
- "python setup.py build_ext --inplace"
- "mkdir -p corpora/en"
- "cd corpora/en"
- "wget --no-check-certificate http://wordnetcode.princeton.edu/3.0/WordNet-3.0.tar.gz"
- "tar -xzf WordNet-3.0.tar.gz"
- "mv WordNet-3.0 wordnet"
- "cd ../../"
- "export PYTHONPATH=`pwd`"
- "python bin/init_model.py lang_data/en corpora/en spacy/en/data"
# run tests
script:
- py.test tests/tokenizer/
- py.test tests/vocab/
- py.test tests/tagger/
- "py.test tests/ -x"

bin/gather_freqs.py (new file, 27 lines)
View File

@ -0,0 +1,27 @@
import plac
def main(in_loc, out_loc):
out_file = open(out_loc, 'w')
this_key = None
this_freq = 0
df = 0
for line in open(in_loc):
line = line.strip()
if not line:
continue
freq, key = line.split('\t', 1)
freq = int(freq)
if this_key is not None and key != this_key:
out_file.write('%d\t%d\t%s\n' % (this_freq, df, this_key))
this_key = key
this_freq = freq
df = 1
else:
this_freq += freq
df += 1
out_file.write('%d\t%d\t%s\n' % (this_freq, df, this_key))
out_file.close()
if __name__ == '__main__':
plac.call(main)
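The new script collapses consecutive per-document counts for the same key into a single record: the input is expected to be sorted by key, one "freq<tab>key" pair per line, and each output line is "total_freq<tab>doc_freq<tab>key". A minimal sketch of the same aggregation (an illustration, not the script itself) using itertools.groupby:

import sys
from itertools import groupby

def gather(lines):
    # Each line is "<freq>\t<key>"; lines sharing a key are assumed adjacent,
    # i.e. the input is already sorted by key, as gather_freqs.py expects.
    pairs = (line.rstrip('\n').split('\t', 1) for line in lines if line.strip())
    for key, group in groupby(pairs, key=lambda p: p[1]):
        counts = [int(freq) for freq, _ in group]
        # total frequency, document frequency, key
        yield '%d\t%d\t%s' % (sum(counts), len(counts), key)

if __name__ == '__main__':
    for row in gather(sys.stdin):
        print(row)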

View File

@ -15,6 +15,8 @@ Requires:
* clusters.txt --- Output of hierarchical clustering, e.g. Brown clusters
* vectors.tgz --- output of something like word2vec
"""
from __future__ import unicode_literals
import plac
from pathlib import Path
@ -45,7 +47,7 @@ def setup_tokenizer(lang_data_dir, tok_dir):
def _read_clusters(loc):
if not loc.exists():
print "Warning: Clusters file not found"
print("Warning: Clusters file not found")
return {}
clusters = {}
for line in codecs.open(str(loc), 'r', 'utf8'):
@ -60,7 +62,7 @@ def _read_clusters(loc):
else:
clusters[word] = '0'
# Expand clusters with re-casing
for word, cluster in clusters.items():
for word, cluster in list(clusters.items()):
if word.lower() not in clusters:
clusters[word.lower()] = cluster
if word.title() not in clusters:
@ -72,7 +74,7 @@ def _read_clusters(loc):
def _read_probs(loc):
if not loc.exists():
print "Warning: Probabilities file not found"
print("Warning: Probabilities file not found")
return {}
probs = {}
for i, line in enumerate(codecs.open(str(loc), 'r', 'utf8')):
@ -85,7 +87,7 @@ def _read_probs(loc):
def _read_senses(loc):
lexicon = defaultdict(lambda: defaultdict(list))
if not loc.exists():
print "Warning: WordNet senses not found"
print("Warning: WordNet senses not found")
return lexicon
sense_names = dict((s, i) for i, s in enumerate(spacy.senses.STRINGS))
pos_ids = {'noun': NOUN, 'verb': VERB, 'adjective': ADJ}
@ -109,13 +111,20 @@ def setup_vocab(src_dir, dst_dir):
if vectors_src.exists():
write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
else:
print "Warning: Word vectors file not found"
print("Warning: Word vectors file not found")
vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
clusters = _read_clusters(src_dir / 'clusters.txt')
probs = _read_probs(src_dir / 'words.sgt.prob')
lemmatizer = Lemmatizer(str(src_dir / 'wordnet'), NOUN, VERB, ADJ)
if not probs:
min_prob = 0.0
else:
min_prob = min(probs.values())
for word in clusters:
if word not in probs:
probs[word] = min_prob
lexicon = []
for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
entry = get_lex_props(word)
if word in clusters or float(prob) >= -17:
entry['prob'] = float(prob)
@ -144,7 +153,7 @@ def main(lang_data_dir, corpora_dir, model_dir):
setup_tokenizer(lang_data_dir, model_dir / 'tokenizer')
setup_vocab(corpora_dir, model_dir / 'vocab')
if not (model_dir / 'wordnet').exists():
copytree(str(corpora_dir / 'wordnet'), str(model_dir / 'wordnet'))
copytree(str(corpora_dir / 'wordnet' / 'dict'), str(model_dir / 'wordnet'))
if __name__ == '__main__':

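The added lines give every word from clusters.txt a probability even when it is missing from words.sgt.prob, falling back to the smallest probability seen, so clustered words still make it into the lexicon. An illustration with made-up words and log-probabilities:

# Made-up data; the real values come from words.sgt.prob and clusters.txt.
probs = {'the': -3.5, 'dog': -9.1}
clusters = {'the': '1111', 'dog': '10110', 'frobnicate': '00101'}
min_prob = min(probs.values()) if probs else 0.0
for word in clusters:
    if word not in probs:
        probs[word] = min_prob
assert probs['frobnicate'] == -9.1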
View File

@ -1,6 +1,7 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals
from __future__ import print_function
import os
from os import path
@ -107,7 +108,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
nlp = Language(data_dir=model_dir)
print "Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %"
print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
for itn in range(n_iter):
scorer = Scorer()
loss = 0
@ -138,9 +139,9 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
nlp.entity.train(tokens, gold)
nlp.tagger.train(tokens, gold.tags)
random.shuffle(gold_tuples)
print '%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
scorer.tags_acc,
scorer.token_acc)
print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
scorer.tags_acc,
scorer.token_acc))
nlp.end_training()
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
@ -219,14 +220,14 @@ def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbos
# write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width)
scorer = evaluate(English, list(read_json_file(dev_loc)),
model_dir, gold_preproc=gold_preproc, verbose=verbose)
print 'TOK', scorer.token_acc
print 'POS', scorer.tags_acc
print 'UAS', scorer.uas
print 'LAS', scorer.las
print('TOK', scorer.token_acc)
print('POS', scorer.tags_acc)
print('UAS', scorer.uas)
print('LAS', scorer.las)
print 'NER P', scorer.ents_p
print 'NER R', scorer.ents_r
print 'NER F', scorer.ents_f
print('NER P', scorer.ents_p)
print('NER R', scorer.ents_r)
print('NER F', scorer.ents_f)
if __name__ == '__main__':

corpora/en/clusters.txt (new file, 316709 lines)

File diff suppressed because it is too large

View File

@ -2,7 +2,7 @@ cython
cymem == 1.11
pathlib
preshed == 0.37
thinc == 3.2
thinc == 3.3
murmurhash == 0.24
unidecode
numpy

View File

@ -120,7 +120,7 @@ def run_setup(exts):
ext_modules=exts,
license="Dual: Commercial or AGPL",
install_requires=['numpy', 'murmurhash', 'cymem >= 1.11', 'preshed == 0.37',
'thinc == 3.2', "unidecode", 'wget', 'plac', 'six',
'thinc == 3.3', "unidecode", 'wget', 'plac', 'six',
'ujson'],
setup_requires=["headers_workaround"],
)
@ -162,6 +162,7 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
'spacy.gold', 'spacy.orth',
'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
'spacy.cfile',
'spacy.syntax.ner']

spacy/cfile.pxd (new file, 12 lines)
View File

@ -0,0 +1,12 @@
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from cymem.cymem cimport Pool
cdef class CFile:
cdef FILE* fp
cdef bint is_open
cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1
cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1
cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *

spacy/cfile.pyx (new file, 40 lines)
View File

@ -0,0 +1,40 @@
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
cdef class CFile:
def __init__(self, loc, mode):
if isinstance(mode, unicode):
mode_str = mode.encode('ascii')
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
self.fp = fopen(<char*>bytes_loc, mode_str)
if self.fp == NULL:
raise IOError("Could not open binary file %s" % bytes_loc)
self.is_open = True
def __dealloc__(self):
if self.is_open:
fclose(self.fp)
def close(self):
fclose(self.fp)
self.is_open = False
cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
st = fread(dest, elem_size, number, self.fp)
if st != number:
raise IOError
cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1:
st = fwrite(src, elem_size, number, self.fp)
if st != number:
raise IOError
cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
cdef void* dest = mem.alloc(number, elem_size)
self.read_into(dest, number, elem_size)
return dest
def write_unicode(self, unicode value):
cdef bytes py_bytes = value.encode('utf8')
cdef char* chars = <char*>py_bytes
self.write(sizeof(char), len(py_bytes), chars)

View File

@ -95,15 +95,15 @@ class English(object):
self.tokenizer = Tokenizer(self.vocab, path.join(data_dir, 'tokenizer'))
if Tagger:
if Tagger and path.exists(path.join(data_dir, 'pos')):
self.tagger = Tagger(self.vocab.strings, data_dir)
else:
self.tagger = None
if Parser:
if Parser and path.exists(path.join(data_dir, 'deps')):
self.parser = Parser(self.vocab.strings, path.join(data_dir, 'deps'))
else:
self.parser = None
if Entity:
if Entity and path.exists(path.join(data_dir, 'ner')):
self.entity = Entity(self.vocab.strings, path.join(data_dir, 'ner'))
else:
self.entity = None
@ -153,15 +153,14 @@ class English(object):
self.tagger.model.end_training()
self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt'))
packer = Packer(self.vocab, [
(TAG, self.tagger.moves.freqs[TAG].items()),
(HEAD, self.parser.moves.freqs[HEAD].items()),
(DEP, self.parser.moves.freqs[DEP].items()),
(ENT_IOB, self.entity.moves.freqs[ENT_IOB].items()),
(ENT_TYPE, self.entity.moves.freqs[ENT_TYPE].items())
])
packer.dump(path.join(data_dir, 'vocab'))
with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_:
file_.write(
json.dumps([
(TAG, self.tagger.freqs[TAG].items()),
(DEP, self.parser.moves.freqs[DEP].items()),
(ENT_IOB, self.entity.moves.freqs[ENT_IOB].items()),
(ENT_TYPE, self.entity.moves.freqs[ENT_TYPE].items()),
(HEAD, self.parser.moves.freqs[HEAD].items())]))
@property
def tags(self):

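The pipeline components are now optional: each one is only constructed if its data directory ('pos', 'deps', 'ner') actually exists, and is left as None otherwise. A usage sketch, assuming the model directory the Travis configuration builds:

import os
from spacy.en import English

# 'spacy/en/data' is the directory bin/init_model.py writes in .travis.yml;
# override it with SPACY_DATA, as the tests do.
data_dir = os.environ.get('SPACY_DATA', 'spacy/en/data')
nlp = English(data_dir=data_dir)
if nlp.parser is None:
    print("No 'deps' data found; dependency heads won't be set.")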
View File

@ -14,6 +14,9 @@ from ..attrs cimport LEMMA as _LEMMA
from ..attrs cimport POS as _POS
from ..attrs cimport TAG as _TAG
from ..attrs cimport DEP as _DEP
from ..attrs cimport HEAD as _HEAD
from ..attrs cimport ENT_IOB as _ENT_IOB
from ..attrs cimport ENT_TYPE as _ENT_TYPE
cpdef enum:

View File

@ -262,6 +262,9 @@ cdef class EnPosTagger:
'morphs.json'))))
self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)
self.freqs = {TAG: defaultdict(int)}
for tag in self.tag_names:
self.freqs[TAG][self.strings[tag]] = 1
self.freqs[TAG][0] = 1
def __call__(self, Doc tokens):
"""Apply the tagger, setting the POS tags onto the Doc object.

View File

@ -3,8 +3,6 @@ from cymem.cymem cimport Pool
from .structs cimport TokenC
from .syntax.transition_system cimport Transition
cimport numpy
cdef struct GoldParseC:
int* tags

View File

@ -1,7 +1,5 @@
import numpy
import codecs
import json
import ujson
import random
import re
import os
@ -9,6 +7,11 @@ from os import path
from libc.string cimport memset
try:
import ujson as json
except ImportError:
import json
def tags_to_entities(tags):
entities = []
@ -128,7 +131,7 @@ def read_json_file(loc, docs_filter=None):
yield from read_json_file(path.join(loc, filename))
else:
with open(loc) as file_:
docs = ujson.load(file_)
docs = json.load(file_)
for doc in docs:
if docs_filter is not None and not docs_filter(doc):
continue

View File

@ -13,7 +13,7 @@ cdef Code bit_append(Code code, bint bit) nogil
cdef class BitArray:
cdef bytes data
cdef bytearray data
cdef uchar byte
cdef uchar bit_of_byte
cdef uint32_t i

View File

@ -1,3 +1,5 @@
from __future__ import unicode_literals
from libc.string cimport memcpy
# Note that we're setting the most significant bits here first, when in practice
@ -15,7 +17,7 @@ cdef Code bit_append(Code code, bint bit) nogil:
cdef class BitArray:
def __init__(self, data=b''):
self.data = data
self.data = bytearray(data)
self.byte = 0
self.bit_of_byte = 0
self.i = 0
@ -45,7 +47,7 @@ cdef class BitArray:
start_bit = self.i % 8
if start_bit != 0 and start_byte < len(self.data):
byte = ord(self.data[start_byte])
byte = self.data[start_byte]
for i in range(start_bit, 8):
self.i += 1
yield 1 if (byte & (one << i)) else 0
@ -68,18 +70,24 @@ cdef class BitArray:
# TODO portability
cdef uchar[4] chars
chars[0] = <uchar>ord(self.data[start_byte])
chars[1] = <uchar>ord(self.data[start_byte+1])
chars[2] = <uchar>ord(self.data[start_byte+2])
chars[3] = <uchar>ord(self.data[start_byte+3])
chars[0] = self.data[start_byte]
chars[1] = self.data[start_byte+1]
chars[2] = self.data[start_byte+2]
chars[3] = self.data[start_byte+3]
cdef uint32_t output
memcpy(&output, chars, 4)
self.i += 32
return output
def as_bytes(self):
cdef unsigned char byte_char
if self.bit_of_byte != 0:
return self.data + chr(self.byte)
byte = chr(self.byte)
# Jump through some hoops for Python3
if isinstance(byte, unicode):
return self.data + <bytes>(&self.byte)[:1]
else:
return self.data + chr(self.byte)
else:
return self.data
@ -92,7 +100,7 @@ cdef class BitArray:
self.bit_of_byte += 1
self.i += 1
if self.bit_of_byte == 8:
self.data += chr(self.byte)
self.data += bytearray((self.byte,))
self.byte = 0
self.bit_of_byte = 0
@ -106,7 +114,7 @@ cdef class BitArray:
self.byte &= ~(one << self.bit_of_byte)
self.bit_of_byte += 1
if self.bit_of_byte == 8:
self.data += chr(self.byte)
self.data += <bytes>self.byte
self.byte = 0
self.bit_of_byte = 0
self.i += 1
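The switch from bytes to bytearray sidesteps a Python 2/3 difference: indexing a bytes object gives a one-character string on Python 2 but an int on Python 3, while a bytearray gives an int on both, and single byte values can be appended without chr(). A quick illustration:

buf = bytearray(b'\x05\xff')
assert buf[0] == 5             # an int on both Python 2 and Python 3
buf += bytearray((170,))       # append one byte value without chr()
assert list(buf) == [5, 255, 170]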

View File

@ -1,4 +1,5 @@
# cython: profile=True
from __future__ import unicode_literals
cimport cython
from libcpp.queue cimport priority_queue
from libcpp.pair cimport pair
@ -110,14 +111,14 @@ cdef class HuffmanCodec:
cdef int branch
cdef int n_msg = msg.shape[0]
cdef bytes bytes_ = bits.as_bytes()
cdef bytearray bytes_ = bits.as_bytes()
cdef unsigned char byte
cdef int i_msg = 0
cdef int i_byte = bits.i // 8
cdef unsigned char i_bit = 0
cdef unsigned char one = 1
while i_msg < n_msg:
byte = ord(bytes_[i_byte])
byte = bytes_[i_byte]
i_byte += 1
for i_bit in range(8):
branch = node.right if (byte & (one << i_bit)) else node.left
@ -138,11 +139,11 @@ cdef class HuffmanCodec:
def __get__(self):
output = []
cdef int i, j
cdef bytes string
cdef unicode string
cdef Code code
for i in range(self.codes.size()):
code = self.codes[i]
string = b'{0:b}'.format(code.bits).rjust(code.length, '0')
string = '{0:b}'.format(code.bits).rjust(code.length, '0')
string = string[::-1]
output.append(string)
return output
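The code strings are now unicode rather than bytes; the construction itself is unchanged: the binary form of code.bits is left-padded with zeros to code.length and then reversed. For a concrete, made-up code:

bits, length = 0b1011, 6
string = '{0:b}'.format(bits).rjust(length, '0')[::-1]
assert string == '110100'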

View File

@ -10,6 +10,7 @@ from libcpp.pair cimport pair
from cymem.cymem cimport Address, Pool
from preshed.maps cimport PreshMap
from preshed.counter cimport PreshCounter
import json
from ..attrs cimport ORTH, ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
from ..tokens.doc cimport Doc
@ -65,7 +66,7 @@ def _gen_orths(Vocab vocab):
def _gen_chars(Vocab vocab):
cdef attr_t orth
cdef size_t addr
char_weights = {chr(i): 1e-20 for i in range(256)}
char_weights = {i: 1e-20 for i in range(256)}
cdef unicode string
cdef bytes char
cdef bytes utf8_str
@ -74,9 +75,9 @@ def _gen_chars(Vocab vocab):
string = vocab.strings[lex.orth]
utf8_str = string.encode('utf8')
for char in utf8_str:
char_weights.setdefault(char, 0.0)
char_weights[char] += c_exp(lex.prob)
char_weights[b' '] += c_exp(lex.prob)
char_weights.setdefault(ord(char), 0.0)
char_weights[ord(char)] += c_exp(lex.prob)
char_weights[ord(' ')] += c_exp(lex.prob)
return char_weights.items()
@ -98,33 +99,34 @@ cdef class Packer:
self._codecs = tuple(codecs)
self.attrs = tuple(attrs)
@classmethod
def from_dir(cls, Vocab vocab, data_dir):
return cls(vocab, util.read_encoding_freqs(data_dir))
def pack(self, Doc doc):
bits = self._orth_encode(doc)
if bits is None:
bits = self._char_encode(doc)
cdef int i
if self.attrs:
array = doc.to_array(self.attrs)
for i, codec in enumerate(self._codecs):
codec.encode_int32(array[:, i], bits)
return bits
codec.encode(array[:, i], bits)
return bits.as_bytes()
def unpack(self, BitArray bits):
def unpack(self, data):
doc = Doc(self.vocab)
self.unpack_into(data, doc)
return doc
def unpack_into(self, byte_string, Doc doc):
bits = BitArray(byte_string)
bits.seek(0)
cdef int32_t length = bits.read32()
if length >= 0:
doc = self._orth_decode(bits, length)
self._orth_decode(bits, length, doc)
else:
doc = self._char_decode(bits, -length)
self._char_decode(bits, -length, doc)
array = numpy.zeros(shape=(len(doc), len(self._codecs)), dtype=numpy.int32)
for i, codec in enumerate(self._codecs):
codec.decode_int32(bits, array[:, i])
codec.decode(bits, array[:, i])
doc.from_array(self.attrs, array)
return doc
@ -141,20 +143,13 @@ cdef class Packer:
bits.append(bool(token.whitespace_))
return bits
def _orth_decode(self, BitArray bits, n):
orths = numpy.ndarray(shape=(n,), dtype=numpy.int32)
self.orth_codec.decode_int32(bits, orths)
orths_and_spaces = zip(orths, bits)
cdef Doc doc = Doc(self.vocab, orths_and_spaces)
return doc
def _char_encode(self, Doc doc):
cdef bytes utf8_str = doc.string.encode('utf8')
cdef BitArray bits = BitArray()
cdef int32_t length = len(utf8_str)
# Signal chars with negative length
bits.extend(-length, 32)
self.char_codec.encode(utf8_str, bits)
self.char_codec.encode(bytearray(utf8_str), bits)
cdef int i, j
for i in range(doc.length):
for j in range(doc.data[i].lex.length-1):
@ -164,12 +159,24 @@ cdef class Packer:
bits.append(False)
return bits
def _char_decode(self, BitArray bits, n):
def _orth_decode(self, BitArray bits, int32_t n, Doc doc):
cdef attr_t[:] orths = numpy.ndarray(shape=(n,), dtype=numpy.int32)
self.orth_codec.decode_int32(bits, orths)
cdef int i
cdef bint space
spaces = iter(bits)
for i in range(n):
orth = orths[i]
space = next(spaces)
lex = self.vocab.get_by_orth(doc.mem, orth)
doc.push_back(lex, space)
return doc
def _char_decode(self, BitArray bits, int32_t n, Doc doc):
cdef bytearray utf8_str = bytearray(n)
self.char_codec.decode(bits, utf8_str)
cdef unicode string = utf8_str.decode('utf8')
cdef Doc tokens = Doc(self.vocab)
cdef int start = 0
cdef bint is_spacy
cdef int length = len(string)
@ -178,11 +185,11 @@ cdef class Packer:
for is_end_token in bits:
if is_end_token:
span = string[start:i+1]
lex = self.vocab.get(tokens.mem, span)
lex = self.vocab.get(doc.mem, span)
is_spacy = (i+1) < length and string[i+1] == u' '
tokens.push_back(lex, is_spacy)
doc.push_back(lex, is_spacy)
start = i + 1 + is_spacy
i += 1
if i >= n:
break
return tokens
return doc
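With these changes Packer.pack() returns a plain byte string and unpack() accepts one, so a round trip through the vocab's serializer looks roughly like this (a sketch assuming a fully installed English data directory; the sample text is made up):

from spacy.en import English

nlp = English()
doc = nlp(u'This is a sentence to round-trip.')
byte_string = nlp.vocab.serializer.pack(doc)   # bytes, not a BitArray
doc2 = nlp.vocab.serializer.unpack(byte_string)
assert doc2.string == doc.string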

View File

@ -81,6 +81,7 @@ cdef class StringStore:
def __getitem__(self, object string_or_id):
cdef bytes byte_string
cdef const Utf8Str* utf8str
cdef int id_
if isinstance(string_or_id, int) or isinstance(string_or_id, long):
if string_or_id == 0:
return u''

View File

@ -1,4 +1,3 @@
# cython: profile=True
"""
Fill an array, context, with every _atomic_ value our features reference.
We then write the _actual features_ as tuples of the atoms. The machinery

View File

@ -1,4 +1,3 @@
# cython: profile=True
from __future__ import unicode_literals
import ctypes

View File

@ -85,6 +85,9 @@ cdef class BiluoPushDown(TransitionSystem):
elif gold.c.ner[i].move == OUT:
self.freqs[ENT_IOB][1] += 1
self.freqs[ENT_TYPE][0] += 1
else:
self.freqs[ENT_IOB][1] += 1
self.freqs[ENT_TYPE][0] += 1
cdef Transition lookup_transition(self, object name) except *:
if name == '-':

View File

@ -1,4 +1,3 @@
# cython: profile=True
"""
MALT-style dependency parser
"""
@ -85,18 +84,17 @@ cdef class Parser:
cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE,
self.model.n_feats, self.model.n_feats)
self.parse(stcls, eg.c)
with nogil:
self.parse(stcls, eg.c)
tokens.set_parse(stcls._sent)
cdef void parse(self, StateClass stcls, ExampleC eg) nogil:
while not stcls.is_final():
memset(eg.scores, 0, eg.nr_class * sizeof(weight_t))
self.moves.set_valid(eg.is_valid, stcls)
fill_context(eg.atoms, stcls)
self.model.set_scores(eg.scores, eg.atoms)
eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.model.n_classes)
self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label)
self.moves.finalize_state(stcls)

View File

@ -1,4 +1,3 @@
# cython: profile=True
from libc.string cimport memcpy, memset
from libc.stdint cimport uint32_t
from ..vocab cimport EMPTY_LEXEME

View File

@ -33,6 +33,11 @@ cdef class TransitionSystem:
self.freqs = {}
for attr in (TAG, HEAD, DEP, ENT_TYPE, ENT_IOB):
self.freqs[attr] = defaultdict(int)
self.freqs[attr][0] = 1
# Ensure we've seen heads. Need an official dependency length limit...
for i in range(512):
self.freqs[HEAD][i] = 1
self.freqs[HEAD][-i] = 1
cdef int initialize_state(self, StateClass state) except -1:
pass

View File

@ -71,17 +71,6 @@ cdef class Doc:
self.is_tagged = False
self.is_parsed = False
self._py_tokens = []
cdef const LexemeC* lex
cdef attr_t orth
cdef bint space
if orths_and_spaces is not None:
for orth, space in orths_and_spaces:
lex = <LexemeC*>self.vocab._by_orth.get(orth)
if lex != NULL:
assert lex.orth == orth
self.push_back(lex, space)
else:
raise Exception('Lexeme not found: %d' % orth)
def __getitem__(self, object i):
"""Get a token.
@ -122,9 +111,12 @@ cdef class Doc:
def __unicode__(self):
return u''.join([t.string for t in self])
def __str__(self):
return u''.join([t.string for t in self])
@property
def string(self):
return unicode(self)
return u''.join([t.string for t in self])
@property
def ents(self):
@ -303,12 +295,11 @@ cdef class Doc:
return self
def to_bytes(self):
bits = self.vocab.packer.pack(self)
return struct.pack('I', len(bits)) + bits.as_bytes()
byte_string = self.vocab.serializer.pack(self)
return struct.pack('I', len(byte_string)) + byte_string
def from_bytes(self, data):
bits = BitArray(data)
self.vocab.packer.unpack_into(bits, self)
self.vocab.serializer.unpack_into(data[4:], self)
return self
@staticmethod
@ -316,15 +307,14 @@ cdef class Doc:
keep_reading = True
while keep_reading:
try:
n_bits_str = file_.read(4)
if len(n_bits_str) < 4:
n_bytes_str = file_.read(4)
if len(n_bytes_str) < 4:
break
n_bits = struct.unpack('I', n_bits_str)[0]
n_bytes = n_bits // 8 + bool(n_bits % 8)
n_bytes = struct.unpack('I', n_bytes_str)[0]
data = file_.read(n_bytes)
except StopIteration:
keep_reading = False
yield data
yield n_bytes_str + data
# This function is terrible --- need to fix this.
def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,

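Doc.to_bytes() now writes a 4-byte unsigned length header in front of the packed payload, and read_bytes() yields the header together with the payload, so from_bytes() only has to skip the first four bytes. A minimal sketch of that framing, independent of spaCy:

import struct

def frame(payload):
    # 4-byte native-endian unsigned length, then the payload itself,
    # mirroring Doc.to_bytes().
    return struct.pack('I', len(payload)) + payload

def read_frames(file_):
    # Mirrors Doc.read_bytes(): yield header plus payload, so a consumer
    # that strips the first 4 bytes is left with just the payload.
    while True:
        header = file_.read(4)
        if len(header) < 4:
            break
        payload = file_.read(struct.unpack('I', header)[0])
        yield header + payload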
View File

@ -34,6 +34,9 @@ cdef class Token:
def __unicode__(self):
return self.string
def __str__(self):
return self.string
cpdef bint check_flag(self, attr_id_t flag_id) except -1:
return check_flag(self.c.lex, flag_id)

View File

@ -65,16 +65,6 @@ def read_tokenization(lang):
return entries
def read_encoding_freqs(data_dir):
tags = json.load(open(path.join(data_dir, '..', 'pos', 'tag_freqs.json')))
heads = json.load(open(path.join(data_dir, '..', 'deps', 'head_freqs.json')))
deps = json.load(open(path.join(data_dir, '..', 'deps', 'dep_freqs.json')))
iob = json.load(open(path.join(data_dir, '..', 'ner', 'iob_freqs.json')))
ne_types = json.load(open(path.join(data_dir, '..', 'ner', 'ne_freqs.json')))
return [(TAG, tags), (HEAD, heads), (DEP, deps), (ENT_IOB, iob),
(ENT_TYPE, ne_types)]
def read_detoken_rules(lang): # Deprecated?
loc = path.join(DATA_DIR, lang, 'detokenize')
entries = []

View File

@ -5,7 +5,7 @@ from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64
from .structs cimport LexemeC, TokenC
from .typedefs cimport utf8_t, hash_t
from .typedefs cimport utf8_t, attr_t, hash_t
from .strings cimport StringStore
@ -29,9 +29,12 @@ cdef class Vocab:
cpdef readonly StringStore strings
cdef readonly object pos_tags
cdef readonly int length
cdef public object packer
cdef public object _serializer
cdef public object data_dir
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
cdef PreshMap _by_hash

View File

@ -1,3 +1,6 @@
from __future__ import unicode_literals
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from libc.string cimport memset
from libc.stdint cimport int32_t
@ -6,6 +9,7 @@ import bz2
from os import path
import codecs
import math
import json
from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport set_lex_struct_props
@ -13,6 +17,7 @@ from .lexeme cimport Lexeme
from .strings cimport hash_string
from .orth cimport word_shape
from .typedefs cimport attr_t
from .cfile cimport CFile
from cymem.cymem cimport Address
from . import util
@ -54,8 +59,19 @@ cdef class Vocab:
if load_vectors and path.exists(path.join(data_dir, 'vec.bin')):
self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
#self.packer = Packer(self, util.read_encoding_freqs(data_dir))
self.packer = None
self._serializer = None
self.data_dir = data_dir
property serializer:
def __get__(self):
if self._serializer is None:
freqs = []
if self.data_dir is not None:
freqs_loc = path.join(self.data_dir, 'serializer.json')
if path.exists(freqs_loc):
freqs = json.load(open(freqs_loc))
self._serializer = Packer(self, freqs)
return self._serializer
def __len__(self):
"""The current number of lexemes stored."""
@ -82,6 +98,27 @@ cdef class Vocab:
self._add_lex_to_vocab(key, lex)
return lex
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
if necessary, using memory acquired from the given pool. If the pool
is the lexicon's own memory, the lexeme is saved in the lexicon.'''
cdef LexemeC* lex
lex = <LexemeC*>self._by_orth.get(orth)
if lex != NULL:
return lex
cdef unicode string = self.strings[orth]
cdef bint is_oov = mem is not self.mem
if len(string) < 3:
mem = self.mem
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
props = self.lexeme_props_getter(string)
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
if is_oov:
lex.id = 0
else:
self._add_lex_to_vocab(hash_string(string), lex)
return lex
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
self._by_hash.set(key, <void*>lex)
self._by_orth.set(lex.orth, <void*>lex)
@ -138,19 +175,16 @@ cdef class Vocab:
if path.exists(loc):
assert not path.isdir(loc)
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
cdef FILE* fp = fopen(<char*>bytes_loc, 'wb')
assert fp != NULL
cdef CFile fp = CFile(bytes_loc, 'wb')
cdef size_t st
cdef size_t addr
cdef hash_t key
for key, addr in self._by_hash.items():
lexeme = <LexemeC*>addr
st = fwrite(&lexeme.orth, sizeof(lexeme.orth), 1, fp)
assert st == 1
st = fwrite(lexeme, sizeof(LexemeC), 1, fp)
assert st == 1
st = fclose(fp)
assert st == 0
fp.write_from(&lexeme.orth, sizeof(lexeme.orth), 1)
fp.write_from(lexeme, sizeof(LexemeC), 1)
fp.close()
def load_lexemes(self, strings_loc, loc):
self.strings.load(strings_loc)
@ -188,7 +222,7 @@ cdef class Vocab:
fclose(fp)
def load_rep_vectors(self, loc):
file_ = _CFile(loc, b'rb')
cdef CFile file_ = CFile(loc, b'rb')
cdef int32_t word_len
cdef int32_t vec_len
cdef int32_t prev_vec_len = 0
@ -198,22 +232,20 @@ cdef class Vocab:
cdef bytes py_word
cdef vector[float*] vectors
cdef int i
cdef Pool tmp_mem = Pool()
while True:
try:
file_.read(&word_len, sizeof(word_len), 1)
file_.read_into(&word_len, sizeof(word_len), 1)
except IOError:
break
file_.read(&vec_len, sizeof(vec_len), 1)
file_.read_into(&vec_len, sizeof(vec_len), 1)
if prev_vec_len != 0 and vec_len != prev_vec_len:
raise VectorReadError.mismatched_sizes(loc, vec_len, prev_vec_len)
if 0 >= vec_len >= MAX_VEC_SIZE:
raise VectorReadError.bad_size(loc, vec_len)
mem = Address(word_len, sizeof(char))
chars = <char*>mem.ptr
vec = <float*>self.mem.alloc(vec_len, sizeof(float))
file_.read(chars, sizeof(char), word_len)
file_.read(vec, sizeof(float), vec_len)
chars = <char*>file_.alloc_read(tmp_mem, word_len, sizeof(char))
vec = <float*>file_.alloc_read(self.mem, vec_len, sizeof(float))
string_id = self.strings[chars[:word_len]]
while string_id >= vectors.size():
@ -235,7 +267,7 @@ cdef class Vocab:
def write_binary_vectors(in_loc, out_loc):
cdef _CFile out_file = _CFile(out_loc, 'wb')
cdef CFile out_file = CFile(out_loc, 'wb')
cdef Address mem
cdef int32_t word_len
cdef int32_t vec_len
@ -252,42 +284,12 @@ def write_binary_vectors(in_loc, out_loc):
word_len = len(word)
vec_len = len(pieces)
out_file.write(sizeof(word_len), 1, &word_len)
out_file.write(sizeof(vec_len), 1, &vec_len)
out_file.write_from(&word_len, 1, sizeof(word_len))
out_file.write_from(&vec_len, 1, sizeof(vec_len))
chars = <char*>word
out_file.write(sizeof(char), len(word), chars)
out_file.write(sizeof(float), vec_len, vec)
cdef class _CFile:
cdef FILE* fp
def __init__(self, loc, bytes mode):
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
self.fp = fopen(<char*>bytes_loc, mode)
if self.fp == NULL:
raise IOError
def __dealloc__(self):
fclose(self.fp)
def close(self):
fclose(self.fp)
cdef int read(self, void* dest, size_t elem_size, size_t n) except -1:
st = fread(dest, elem_size, n, self.fp)
if st != n:
raise IOError
cdef int write(self, size_t elem_size, size_t n, void* data) except -1:
st = fwrite(data, elem_size, n, self.fp)
if st != n:
raise IOError
cdef int write_unicode(self, unicode value):
cdef bytes py_bytes = value.encode('utf8')
cdef char* chars = <char*>py_bytes
self.write(sizeof(char), len(py_bytes), chars)
out_file.write_from(chars, len(word), sizeof(char))
out_file.write_from(vec, vec_len, sizeof(float))
class VectorReadError(Exception):

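write_binary_vectors() and load_rep_vectors() agree on a simple record layout: an int32 word length, an int32 vector length, the word's bytes, then vec_len float32 values. A pure-Python reader sketch for that layout (assuming native byte order and utf8-encoded words):

import struct

def read_binary_vectors(loc):
    vectors = {}
    with open(loc, 'rb') as f:
        while True:
            header = f.read(8)
            if len(header) < 8:
                break
            word_len, vec_len = struct.unpack('ii', header)
            word = f.read(word_len).decode('utf8')
            values = struct.unpack('%df' % vec_len, f.read(vec_len * 4))
            vectors[word] = values
    return vectors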
View File

@ -7,3 +7,19 @@ import os
def EN():
data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
return English(data_dir=data_dir)
def pytest_addoption(parser):
parser.addoption("--models", action="store_true",
help="include tests that require full models")
parser.addoption("--vectors", action="store_true",
help="include word vectors tests")
parser.addoption("--slow", action="store_true",
help="include slow tests")
def pytest_runtest_setup(item):
for opt in ['models', 'vectors', 'slow']:
if opt in item.keywords and not item.config.getoption("--%s" % opt):
pytest.skip("need --%s option to run" % opt)
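These hooks skip any test carrying one of the three markers unless the matching flag is passed, so a plain py.test run stays fast and 'py.test tests/ --models --vectors --slow' exercises everything. A marked test looks like this (illustrative, using the EN fixture defined above):

import pytest

@pytest.mark.models
def test_needs_full_model(EN):
    # Skipped unless py.test is invoked with --models.
    doc = EN(u'A short sentence.')
    assert len(doc) > 0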

View File

@ -1,4 +1,6 @@
import pytest
@pytest.mark.models
def test_simple_types(EN):
tokens = EN(u'Mr. Best flew to New York on Saturday morning.')
ents = list(tokens.ents)

View File

@ -1,6 +1,7 @@
import pytest
@pytest.mark.models
def test_root(EN):
tokens = EN(u"i don't have other assistance")
for t in tokens:

View File

@ -12,6 +12,7 @@ def sun_text():
return text
@pytest.mark.models
def test_consistency(EN, sun_text):
tokens = EN(sun_text)
for head in tokens:
@ -21,6 +22,7 @@ def test_consistency(EN, sun_text):
assert child.head is head
@pytest.mark.models
def test_child_consistency(EN, sun_text):
tokens = EN(sun_text)
@ -53,6 +55,7 @@ def test_child_consistency(EN, sun_text):
assert not children
@pytest.mark.models
def test_edges(EN):
sun_text = u"Chemically, about three quarters of the Sun's mass consists of hydrogen, while the rest is mostly helium."
tokens = EN(sun_text)

View File

@ -1,6 +1,8 @@
from __future__ import unicode_literals
import pytest
@pytest.mark.models
def test_subtrees(EN):
sent = EN('The four wheels on the bus turned quickly')
wheels = sent[2]

View File

@ -45,7 +45,7 @@ def test1():
codec = HuffmanCodec(list(enumerate(probs)))
py_codes = py_encode(dict(enumerate(probs)))
py_codes = py_codes.items()
py_codes = list(py_codes.items())
py_codes.sort()
assert codec.strings == [c for i, c in py_codes]
@ -60,7 +60,7 @@ def test_round_trip():
strings = list(codec.strings)
codes = {codec.leaves[i]: strings[i] for i in range(len(codec.leaves))}
bits = codec.encode(message)
string = b''.join(b'{0:b}'.format(ord(c)).rjust(8, b'0')[::-1] for c in bits.as_bytes())
string = ''.join('{0:b}'.format(c).rjust(8, '0')[::-1] for c in bits.as_bytes())
for word in message:
code = codes[word]
assert string[:len(code)] == code
@ -76,7 +76,7 @@ def test_rosetta():
symb2freq = defaultdict(int)
for ch in txt:
symb2freq[ch] += 1
by_freq = symb2freq.items()
by_freq = list(symb2freq.items())
by_freq.sort(reverse=True, key=lambda item: item[1])
symbols = [sym for sym, prob in by_freq]
@ -96,6 +96,7 @@ def test_rosetta():
assert my_exp_len == py_exp_len
@pytest.mark.slow
def test_vocab(EN):
codec = HuffmanCodec([(w.orth, numpy.exp(w.prob)) for w in EN.vocab])
expected_length = 0
@ -105,6 +106,7 @@ def test_vocab(EN):
assert 8 < expected_length < 15
@pytest.mark.slow
def test_freqs():
freqs = []
words = []

View File

@ -0,0 +1,23 @@
import pytest
from spacy.serialize.packer import Packer
from spacy.attrs import ORTH, SPACY
from spacy.tokens import Doc
import math
def test_read_write(EN):
doc1 = EN(u'This is a simple test. With a couple of sentences.')
doc2 = EN(u'This is another test document.')
with open('/tmp/spacy_docs.bin', 'wb') as file_:
file_.write(doc1.to_bytes())
file_.write(doc2.to_bytes())
with open('/tmp/spacy_docs.bin', 'rb') as file_:
bytes1, bytes2 = Doc.read_bytes(file_)
r1 = Doc(EN.vocab).from_bytes(bytes1)
r2 = Doc(EN.vocab).from_bytes(bytes2)
assert r1.string == doc1.string
assert r2.string == doc2.string

View File

@ -56,12 +56,12 @@ def test_char_packer(vocab):
bits = BitArray()
bits.seek(0)
byte_str = b'the dog jumped'
byte_str = bytearray(b'the dog jumped')
packer.char_codec.encode(byte_str, bits)
bits.seek(0)
result = [b''] * len(byte_str)
packer.char_codec.decode(bits, result)
assert b''.join(result) == byte_str
assert bytearray(result) == byte_str
def test_packer_unannotated(tokenizer):
@ -120,5 +120,3 @@ def test_packer_annotated(tokenizer):
assert [t.tag_ for t in result] == ['DT', 'NN', 'VBD']
assert [t.dep_ for t in result] == ['det', 'nsubj', 'ROOT']
assert [(t.head.i - t.i) for t in result] == [1, 1, 0]

View File

@ -1,6 +1,8 @@
from __future__ import unicode_literals
import pytest
@pytest.mark.models
def test_merge_tokens(EN):
tokens = EN(u'Los Angeles start.')
assert len(tokens) == 4
@ -12,6 +14,7 @@ def test_merge_tokens(EN):
assert tokens[0].head.orth_ == 'start'
@pytest.mark.models
def test_merge_heads(EN):
tokens = EN(u'I found a pilates class near work.')
assert len(tokens) == 8

View File

@ -9,6 +9,7 @@ def doc(EN):
return EN('This is a sentence. This is another sentence. And a third.')
@pytest.mark.models
def test_sent_spans(doc):
sents = list(doc.sents)
assert sents[0].start == 0
@ -17,6 +18,7 @@ def test_sent_spans(doc):
assert sum(len(sent) for sent in sents) == len(doc)
@pytest.mark.models
def test_root(doc):
np = doc[2:4]
assert len(np) == 2

View File

@ -3,6 +3,7 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.models
def test_am_pm(en_nlp):
numbers = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
variants = ['a.m.', 'am', 'p.m.', 'pm']
@ -14,7 +15,7 @@ def test_am_pm(en_nlp):
tokens = en_nlp(string, merge_mwes=True)
assert tokens[4].orth_ == '%s%s%s' % (num, space, var)
ents = list(tokens.ents)
assert len(ents) == 1
assert len(ents) == 1, ents
assert ents[0].label_ == 'TIME', string
if ents[0].start == 4 and ents[0].end == 5:
assert ents[0].orth_ == '%s%s%s' % (num, space, var)

View File

@ -17,6 +17,7 @@ def lemmas(tagged):
return [t.lemma_ for t in tagged]
@pytest.mark.models
def test_lemmas(lemmas, tagged):
assert lemmas[0] == 'banana'
assert lemmas[1] == 'in'

View File

@ -12,6 +12,7 @@ def morph_exc():
}
@pytest.mark.models
def test_load_exc(morph_exc):
# Do this local as we want to modify it
nlp = English()

View File

@ -1,7 +1,9 @@
from spacy.en import English
import six
import pytest
@pytest.mark.models
def test_tag_names(EN):
tokens = EN(u'I ate pizzas with anchovies.', parse=False, tag=True)
pizza = tokens[2]

View File

@ -1,8 +1,9 @@
# -*- coding: utf-8 -*-
"""Sphinx doctest is just too hard. Manually paste doctest examples here"""
from spacy.en.attrs import IS_LOWER
import pytest
@pytest.mark.models
def test_1():
import spacy.en
from spacy.parts_of_speech import ADV
@ -21,6 +22,7 @@ def test_1():
assert o == -11.07155704498291
@pytest.mark.models
def test2():
import spacy.en
from spacy.parts_of_speech import ADV
@ -41,6 +43,7 @@ def test2():
-11.07155704498291
@pytest.mark.models
def test3():
import spacy.en
from spacy.parts_of_speech import ADV

View File

@ -15,6 +15,7 @@ def test_attr_of_token(EN):
assert feats_array[0][0] != feats_array[0][1]
@pytest.mark.models
def test_tag(EN):
text = u'A nice sentence.'
tokens = EN(text)
@ -26,6 +27,7 @@ def test_tag(EN):
assert feats_array[3][1] == tokens[3].tag
@pytest.mark.models
def test_dep(EN):
text = u'A nice sentence.'
tokens = EN(text)

View File

@ -4,6 +4,7 @@ import pytest
from spacy.parts_of_speech import ADV
@pytest.mark.models
def test_prob(EN):
tokens = EN(u'Give it back', parse=False)
give = tokens[0]

View File

@ -7,6 +7,7 @@ from spacy.en.attrs import IS_STOP
import pytest
@pytest.mark.models
def test_strings(EN):
tokens = EN(u'Give it back! He pleaded.')
token = tokens[0]

View File

@ -9,6 +9,7 @@ data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
# Let this have its own instances, as we have to be careful about memory here
# that's the point, after all
@pytest.mark.models
def get_orphan_token(text, i):
nlp = English(load_vectors=False, data_dir=data_dir)
tokens = nlp(text)
@ -18,6 +19,7 @@ def get_orphan_token(text, i):
return token
@pytest.mark.models
def test_orphan():
orphan = get_orphan_token('An orphan token', 1)
gc.collect()
@ -36,6 +38,7 @@ def _orphan_from_list(toks):
return lst
@pytest.mark.models
def test_list_orphans():
# Test case from NSchrading
nlp = English(load_vectors=False, data_dir=data_dir)

View File

@ -5,7 +5,7 @@ from spacy.tokens import Doc
import pytest
def test_getitem(EN):
def mest_getitem(EN):
tokens = EN(u'Give it back! He pleaded.')
assert tokens[0].orth_ == 'Give'
assert tokens[-1].orth_ == '.'
@ -13,10 +13,19 @@ def test_getitem(EN):
tokens[len(tokens)]
def test_serialize(EN):
tokens = EN(u' Give it back! He pleaded. ')
packed = tokens.serialize()
new_tokens = Doc.deserialize(EN.vocab, packed)
def mest_serialize(EN):
tokens = EN(u'Give it back! He pleaded.')
packed = tokens.to_bytes()
new_tokens = Doc(EN.vocab).from_bytes(packed)
assert tokens.string == new_tokens.string
assert [t.orth_ for t in tokens] == [t.orth_ for t in new_tokens]
assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
def test_serialize_whitespace(EN):
tokens = EN(u' Give it back! He pleaded. ')
packed = tokens.to_bytes()
new_tokens = Doc(EN.vocab).from_bytes(packed)
assert tokens.string == new_tokens.string
assert [t.orth_ for t in tokens] == [t.orth_ for t in new_tokens]
assert [t.orth for t in tokens] == [t.orth for t in new_tokens]

View File

@ -4,13 +4,14 @@ from spacy.en import English
import pytest
@pytest.mark.vectors
def test_vec(EN):
hype = EN.vocab['hype']
assert hype.orth_ == 'hype'
assert 0.08 >= hype.repvec[0] > 0.07
@pytest.mark.vectors
def test_capitalized(EN):
hype = EN.vocab['Hype']
assert hype.orth_ == 'Hype'

View File

@ -39,7 +39,7 @@ def test_retrieve_id(sstore):
def test_med_string(sstore):
nine_char_string = sstore[b'0123456789']
assert sstore[nine_char_string] == b'0123456789'
assert sstore[nine_char_string] == u'0123456789'
dummy = sstore[b'A']
assert sstore[b'0123456789'] == nine_char_string