diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py
index 633ba48e4..933de124e 100644
--- a/spacy/en/__init__.py
+++ b/spacy/en/__init__.py
@@ -12,7 +12,10 @@ from .attrs import get_flags
 
 
 def get_lex_props(string):
-    return {'flags': get_flags(string), 'dense': 1}
+    return {'flags': get_flags(string), 'length': len(string),
+            'sic': string, 'norm1': string, 'norm2': string, 'shape': string,
+            'prefix': string[0], 'suffix': string[-3:], 'cluster': 0, 'prob': 0,
+            'sentiment': 0}
 
 
 LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')
@@ -45,7 +48,7 @@ class English(object):
     """
     def __init__(self, data_dir=LOCAL_DATA_DIR):
         self._data_dir = data_dir
-        self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab'),
+        self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,
                            get_lex_props=get_lex_props)
         tag_names = list(POS_TAGS.keys())
         tag_names.sort()
diff --git a/spacy/en/pos.pyx b/spacy/en/pos.pyx
index 1ff465442..ab0187ec3 100644
--- a/spacy/en/pos.pyx
+++ b/spacy/en/pos.pyx
@@ -283,12 +283,12 @@ cdef class EnPosTagger:
     cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1:
         if self.lemmatizer is None:
             return lex.sic
-        cdef bytes py_string = self.strings[lex.sic]
+        cdef unicode py_string = self.strings[lex.sic]
         if pos != NOUN and pos != VERB and pos != ADJ:
             return lex.sic
         cdef set lemma_strings
         cdef unicode lemma_string
-        lemma_strings = self.lemmatizer(py_string.decode('utf8'), pos)
+        lemma_strings = self.lemmatizer(py_string, pos)
         lemma_string = sorted(lemma_strings)[0]
         lemma = self.strings.intern(lemma_string.encode('utf8'), len(lemma_string)).i
         return lemma
diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index edd871bde..8686f8e6a 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -7,10 +7,8 @@ from .strings cimport StringStore
 
 
 cdef LexemeC EMPTY_LEXEME
 
-cdef LexemeC init(id_t i, unicode string, hash_t hashed, StringStore store,
-                  dict props) except *
+cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore strings) except -1
 
-
 cdef class Lexeme:
     cdef const float* vec
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 8eac4b753..dfc82d46e 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -5,27 +5,27 @@ from murmurhash.mrmr cimport hash64
 from libc.string cimport memset
 
 from .orth cimport word_shape
+from .typedefs cimport attr_t
 
 
 memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
 
 
-cdef LexemeC init(id_t i, unicode string, hash_t hashed,
-                  StringStore string_store, dict props) except *:
-    cdef LexemeC lex
-    lex.id = i
-    lex.length = len(string)
-    lex.sic = string_store[string]
-
-    lex.cluster = props.get('cluster', 0)
-    lex.prob = props.get('prob', 0)
+cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store) except -1:
-    lex.prefix = string_store[string[:1]]
-    lex.suffix = string_store[string[-3:]]
-    lex.shape = string_store[word_shape(string)]
-
-    lex.flags = props.get('flags', 0)
-    return lex
+    lex.length = props['length']
+    lex.sic = string_store[props['sic']]
+    lex.norm1 = string_store[props['norm1']]
+    lex.norm2 = string_store[props['norm2']]
+    lex.shape = string_store[props['shape']]
+    lex.prefix = string_store[props['prefix']]
+    lex.suffix = string_store[props['suffix']]
+
+    lex.cluster = props['cluster']
+    lex.prob = props['prob']
+    lex.sentiment = props['sentiment']
+
+    lex.flags = props['flags']
 
 
 cdef class Lexeme:
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 29afde45c..df9b89dc3 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -67,7 +67,7 @@ cdef class StringStore:
             if string_or_id < 1 or string_or_id >= self.size:
                 raise IndexError(string_or_id)
             utf8str = &self.strings[string_or_id]
-            return utf8str.chars[:utf8str.length]
+            return utf8str.chars[:utf8str.length].decode('utf8')
         elif isinstance(string_or_id, bytes):
             utf8str = self.intern(string_or_id, len(string_or_id))
             return utf8str.i
diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd
index 35a7c2b63..101bcad63 100644
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@@ -42,32 +42,5 @@ cdef class Tokens:
 
 
 cdef class Token:
-    cdef cvarray vec
-
-    cdef readonly flags_t flags
-
-    cdef readonly attr_t id
-    cdef readonly attr_t sic
-    cdef readonly attr_t dense
-    cdef readonly attr_t shape
-    cdef readonly attr_t prefix
-    cdef readonly attr_t suffix
-
-    cdef readonly attr_t length
-    cdef readonly attr_t cluster
-    cdef readonly attr_t pos_type
-
-    cdef readonly float prob
-    cdef readonly float sentiment
-
-    cdef readonly Morphology morph
-    cdef readonly univ_tag_t pos
-    cdef readonly int fine_pos
-    cdef readonly int idx
-    cdef readonly int lemma
-    cdef readonly int sense
-    cdef readonly int dep_tag
-
-    cdef readonly int head_offset
-    cdef readonly uint32_t l_kids
-    cdef readonly uint32_t r_kids
+    cdef readonly Tokens _seq
+    cdef readonly int i
diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx
index 85f2b4f69..61aab89b1 100644
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -85,7 +85,7 @@ cdef class Tokens:
             token (Token):
         """
         bounds_check(i, self.length, PADDING)
-        return cinit_token(&self.data[i])
+        return Token(self, i)
 
     def __iter__(self):
         """Iterate over the tokens.
@@ -174,38 +174,26 @@ cdef class Tokens:
             self.data[i].lex = &EMPTY_LEXEME
 
 
-cdef Token cinit_token(const TokenC* c_tok):
-    cdef Token py_tok = Token.__new__(Token)
-    py_tok.morph = c_tok.morph
-    py_tok.pos = c_tok.pos
-    py_tok.fine_pos = c_tok.fine_pos
-    py_tok.idx = c_tok.idx
-    py_tok.lemma = c_tok.lemma
-    py_tok.sense = c_tok.sense
-    py_tok.dep_tag = c_tok.dep_tag
-    py_tok.head_offset = c_tok.head
-    py_tok.l_kids = c_tok.l_kids
-    py_tok.r_kids = c_tok.r_kids
-    return py_tok
-
-
+@cython.freelist(64)
 cdef class Token:
     """An individual token.
-    """
-    def __init__(self):
-        pass
-        #self._seq = tokens
-        #self.i = i
-    #def __unicode__(self):
-    #    cdef const TokenC* t = &self._seq.data[self.i]
-    #    cdef int end_idx = t.idx + t.lex.length
-    #    if self.i + 1 == self._seq.length:
-    #        return self.string
-    #    if end_idx == t[1].idx:
-    #        return self.string
-    #    else:
-    #        return self.string + ' '
+
+    Internally, the Token is a tuple (i, tokens) --- it delegates to the Tokens
+    object.
+    """
+    def __init__(self, Tokens tokens, int i):
+        self._seq = tokens
+        self.i = i
+
+    def __unicode__(self):
+        cdef const TokenC* t = &self._seq.data[self.i]
+        cdef int end_idx = t.idx + t.lex.length
+        if self.i + 1 == self._seq.length:
+            return self.string
+        if end_idx == t[1].idx:
+            return self.string
+        else:
+            return self.string + ' '
 
     def __len__(self):
         """The number of unicode code-points in the original string.
@@ -213,87 +201,87 @@ cdef class Token:
         Returns:
             length (int):
         """
-        return self.length
+        return self._seq.data[self.i].lex.length
 
-    #property idx:
-    #    """The index into the original string at which the token starts.
+    property idx:
+        """The index into the original string at which the token starts.
 
-    #    The following is supposed to always be true:
-    #
-    #    >>> original_string[token.idx:token.idx len(token) == token.string
-    #    """
-    #    def __get__(self):
-    #        return self._seq.data[self.i].idx
+        The following is supposed to always be true:
+
+        >>> original_string[token.idx:token.idx len(token) == token.string
+        """
+        def __get__(self):
+            return self._seq.data[self.i].idx
 
-    #property cluster:
-    #    """The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering
-    #
-    #    Similar words have better-than-chance likelihood of having similar cluster
-    #    IDs, although the clustering is quite noisy.  Cluster IDs make good features,
-    #    and help to make models slightly more robust to domain variation.
+    property cluster:
+        """The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering
+
+        Similar words have better-than-chance likelihood of having similar cluster
+        IDs, although the clustering is quite noisy.  Cluster IDs make good features,
+        and help to make models slightly more robust to domain variation.
 
-    #    A common trick is to use only the first N bits of a cluster ID in a feature,
-    #    as the more general part of the hierarchical clustering is often more accurate
-    #    than the lower categories.
+        A common trick is to use only the first N bits of a cluster ID in a feature,
+        as the more general part of the hierarchical clustering is often more accurate
+        than the lower categories.
 
-    #    To assist in this, I encode the cluster IDs little-endian, to allow a simple
-    #    bit-mask:
+        To assist in this, I encode the cluster IDs little-endian, to allow a simple
+        bit-mask:
 
-    #    >>> six_bits = cluster & (2**6 - 1)
-    #    """
-    #    def __get__(self):
-    #        return self._seq.data[self.i].lex.cluster
+        >>> six_bits = cluster & (2**6 - 1)
+        """
+        def __get__(self):
+            return self._seq.data[self.i].lex.cluster
 
-    #property string:
-    #    """The unicode string of the word, with no whitespace padding."""
-    #    def __get__(self):
-    #        cdef const TokenC* t = &self._seq.data[self.i]
-    #        if t.lex.sic == 0:
-    #            return ''
-    #        cdef bytes utf8string = self._seq.vocab.strings[t.lex.sic]
-    #        return utf8string.decode('utf8')
+    property string:
+        """The unicode string of the word, with no whitespace padding."""
+        def __get__(self):
+            cdef const TokenC* t = &self._seq.data[self.i]
+            if t.lex.sic == 0:
+                return ''
+            cdef unicode py_ustr = self._seq.vocab.strings[t.lex.sic]
+            return py_ustr
 
-    #property lemma:
-    #    """The unicode string of the word's lemma.  If no part-of-speech tag is
-    #    assigned, the most common part-of-speech tag of the word is used.
-    #    """
-    #    def __get__(self):
-    #        cdef const TokenC* t = &self._seq.data[self.i]
-    #        if t.lemma == 0:
-    #            return self.string
-    #        cdef bytes utf8string = self._seq.vocab.strings[t.lemma]
-    #        return utf8string.decode('utf8')
+    property lemma:
+        """The unicode string of the word's lemma.  If no part-of-speech tag is
+        assigned, the most common part-of-speech tag of the word is used.
+        """
+        def __get__(self):
+            cdef const TokenC* t = &self._seq.data[self.i]
+            if t.lemma == 0:
+                return self.string
+            cdef unicode py_ustr = self._seq.vocab.strings[t.lemma]
+            return py_ustr
 
-    #property dep_tag:
-    #    """The ID integer of the word's dependency label.  If no parse has been
-    #    assigned, defaults to 0.
-    #    """
-    #    def __get__(self):
-    #        return self._seq.data[self.i].dep_tag
+    property dep_tag:
+        """The ID integer of the word's dependency label.  If no parse has been
+        assigned, defaults to 0.
+        """
+        def __get__(self):
+            return self._seq.data[self.i].dep_tag
 
-    #property pos:
-    #    """The ID integer of the word's part-of-speech tag, from the 13-tag
-    #    Google Universal Tag Set.  Constants for this tag set are available in
-    #    spacy.typedefs.
-    #    """
-    #    def __get__(self):
-    #        return self._seq.data[self.i].pos
+    property pos:
+        """The ID integer of the word's part-of-speech tag, from the 13-tag
+        Google Universal Tag Set.  Constants for this tag set are available in
+        spacy.typedefs.
+        """
+        def __get__(self):
+            return self._seq.data[self.i].pos
 
-    #property fine_pos:
-    #    """The ID integer of the word's fine-grained part-of-speech tag, as assigned
-    #    by the tagger model.  Fine-grained tags include morphological information,
-    #    and other distinctions, and allow a more accurate tagger to be trained.
-    #    """
+    property fine_pos:
+        """The ID integer of the word's fine-grained part-of-speech tag, as assigned
+        by the tagger model.  Fine-grained tags include morphological information,
+        and other distinctions, and allow a more accurate tagger to be trained.
+        """
 
-    #    def __get__(self):
-    #        return self._seq.data[self.i].fine_pos
+        def __get__(self):
+            return self._seq.data[self.i].fine_pos
 
-    #property sic:
-    #    def __get__(self):
-    #        return self._seq.data[self.i].lex.sic
+    property sic:
+        def __get__(self):
+            return self._seq.vocab.strings[self._seq.data[self.i].lex.sic]
 
-    #property head:
-    #    """The token predicted by the parser to be the head of the current token."""
-    #    def __get__(self):
-    #        cdef const TokenC* t = &self._seq.data[self.i]
-    #        return Token(self._seq, self.i + t.head)
+    property head:
+        """The token predicted by the parser to be the head of the current token."""
+        def __get__(self):
+            cdef const TokenC* t = &self._seq.data[self.i]
+            return Token(self._seq, self.i + t.head)
diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index 203d3c7a5..25d62cffe 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -24,12 +24,13 @@ cdef struct _Cached:
 
 
 cdef class Vocab:
-    cpdef public get_lex_props
+    cpdef public lexeme_props_getter
    cdef Pool mem
    cpdef readonly StringStore strings
-    cdef vector[LexemeC*] lexemes
+    cdef vector[const LexemeC*] lexemes
 
    cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL
+    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
 
    cdef PreshMap _map
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 04943ac33..4043b14e0 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -5,7 +5,7 @@ from os import path
 import codecs
 
 from .lexeme cimport EMPTY_LEXEME
-from .lexeme cimport init as lexeme_init
+from .lexeme cimport set_lex_struct_props
 from .lexeme cimport Lexeme_cinit
 from .strings cimport slice_unicode
 from .strings cimport hash_string
@@ -21,24 +21,6 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
 EMPTY_LEXEME.vec = EMPTY_VEC
 
 
-cdef LexemeC init_lexeme(id_t i, unicode string, hash_t hashed,
-                         StringStore string_store, dict props) except *:
-    cdef LexemeC lex
-    lex.id = i
-    lex.length = len(string)
-    lex.sic = string_store[string]
-
-    lex.cluster = props.get('cluster', 0)
-    lex.prob = props.get('prob', 0)
-
-    lex.prefix = string_store[string[:1]]
-    lex.suffix = string_store[string[-3:]]
-    lex.shape = string_store[word_shape(string)]
-
-    lex.flags = props.get('flags', 0)
-    return lex
-
-
 cdef class Vocab:
     '''A map container for a language's LexemeC structs.
     '''
@@ -47,7 +29,7 @@ cdef class Vocab:
         self._map = PreshMap(2 ** 20)
         self.strings = StringStore()
         self.lexemes.push_back(&EMPTY_LEXEME)
-        self.get_lex_props = get_lex_props
+        self.lexeme_props_getter = get_lex_props
 
         if data_dir is not None:
             if not path.exists(data_dir):
@@ -63,32 +45,36 @@ cdef class Vocab:
         """The current number of lexemes stored."""
         return self.lexemes.size()
 
-    cdef const LexemeC* get(self, Pool mem, UniStr* string) except NULL:
+    cdef const LexemeC* get(self, Pool mem, UniStr* c_str) except NULL:
         '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
         if necessary, using memory acquired from the given pool.  If the pool
         is the lexicon's own memory, the lexeme is saved in the lexicon.'''
         cdef LexemeC* lex
-        lex = self._map.get(string.key)
+        lex = self._map.get(c_str.key)
         if lex != NULL:
             return lex
 
-        if string.n < 3:
+        if c_str.n < 3:
             mem = self.mem
-        cdef unicode py_string = string.chars[:string.n]
+        cdef unicode py_str = c_str.chars[:c_str.n]
         lex = mem.alloc(sizeof(LexemeC), 1)
-        lex[0] = init_lexeme(self.lexemes.size(), py_string, string.key, self.strings,
-                             self.get_lex_props(py_string))
+        props = self.lexeme_props_getter(py_str)
+        set_lex_struct_props(lex, props, self.strings)
         if mem is self.mem:
-            self._map.set(string.key, lex)
-            while self.lexemes.size() < (lex.id + 1):
-                self.lexemes.push_back(&EMPTY_LEXEME)
-            self.lexemes[lex.id] = lex
+            lex.id = self.lexemes.size()
+            self._add_lex_to_vocab(c_str.key, lex)
         else:
-            lex[0].id = 1
+            lex.id = 1
         return lex
 
+    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
+        self._map.set(key, lex)
+        while self.lexemes.size() < (lex.id + 1):
+            self.lexemes.push_back(&EMPTY_LEXEME)
+        self.lexemes[lex.id] = lex
+
     def __getitem__(self, id_or_string):
         '''Retrieve a lexeme, given an int ID or a unicode string.  If a previously
-        unseen unicode string is given, a new LexemeC is created and stored.
+        unseen unicode string is given, a new lexeme is created and stored.
 
         Args:
            id_or_string (int or unicode): The integer ID of a word, or its unicode
@@ -100,24 +86,28 @@ cdef class Vocab:
            lexeme (Lexeme): An instance of the Lexeme Python class, with data
                copied on instantiation.
        '''
-        cdef UniStr string
+        cdef UniStr c_str
         cdef const LexemeC* lexeme
         if type(id_or_string) == int:
             if id_or_string >= self.lexemes.size():
                 raise IndexError
             lexeme = self.lexemes.at(id_or_string)
         else:
-            slice_unicode(&string, id_or_string, 0, len(id_or_string))
-            lexeme = self.get(self.mem, &string)
+            slice_unicode(&c_str, id_or_string, 0, len(id_or_string))
+            lexeme = self.get(self.mem, &c_str)
         return Lexeme_cinit(lexeme, self.strings)
 
-    def __setitem__(self, unicode uni_string, dict props):
-        cdef UniStr s
-        slice_unicode(&s, uni_string, 0, len(uni_string))
-        # Cast through the const here, since we're allowed to change our own
-        # LexemeCs.
-        lex = self.get(self.mem, &s)
-        lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props)
+    def __setitem__(self, unicode py_str, dict props):
+        cdef UniStr c_str
+        slice_unicode(&c_str, py_str, 0, len(py_str))
+        cdef LexemeC* lex
+        lex = self._map.get(c_str.key)
+        if lex == NULL:
+            lex = self.mem.alloc(sizeof(LexemeC), 1)
+            lex.id = self.lexemes.size()
+            self._add_lex_to_vocab(c_str.key, lex)
+        set_lex_struct_props(lex, props, self.strings)
+        assert lex.sic < 1000000
 
     def dump(self, loc):
         if path.exists(loc):
@@ -154,6 +144,7 @@ cdef class Vocab:
             if st != 1:
                 break
             lexeme = self.mem.alloc(sizeof(LexemeC), 1)
+            lexeme.vec = EMPTY_VEC
             st = fread(lexeme, sizeof(LexemeC), 1, fp)
             if st != 1:
                 break
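A note on the get_lex_props / set_lex_struct_props split: lexicon entries are now populated from a plain dict of properties instead of the removed init()/init_lexeme() helpers, and set_lex_struct_props() indexes that dict with props['...'] rather than props.get(), so every key becomes required for any custom props getter passed to Vocab. A rough sketch of the dict the patched spacy/en/__init__.py would produce; the 'flags' value really comes from get_flags() and is shown as 0 only for illustration:

    # Sketch of the props dict built by the patched get_lex_props() for u'Apples'.
    string = u'Apples'
    props = {
        'flags': 0,                  # placeholder; the patch uses get_flags(string)
        'length': len(string),       # 6
        'sic': string,
        'norm1': string, 'norm2': string, 'shape': string,
        'prefix': string[0],         # u'A'
        'suffix': string[-3:],       # u'les'
        'cluster': 0, 'prob': 0, 'sentiment': 0,
    }
    # Vocab.get() hands this dict to set_lex_struct_props(), which interns the
    # string-valued fields through the StringStore and copies the numeric ones
    # straight onto the LexemeC struct.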
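A note on the strings.pyx change: StringStore now decodes on the way out, so looking a string up by integer ID returns unicode rather than UTF-8 bytes, which is why EnPosTagger.lemmatize and the Token.string / Token.lemma properties drop their .decode('utf8') calls. A minimal pure-Python stand-in for the intended round trip (FakeStringStore is illustrative only, not the real Cython class, and it skips interning/deduplication):

    class FakeStringStore:
        """Bytes in -> integer id; integer in -> unicode out."""
        def __init__(self):
            self._strings = [b'']     # id 0 stays reserved, as in the bounds check

        def __getitem__(self, string_or_id):
            if isinstance(string_or_id, bytes):
                self._strings.append(string_or_id)
                return len(self._strings) - 1
            return self._strings[string_or_id].decode('utf8')

    store = FakeStringStore()
    i = store[u'pizza'.encode('utf8')]
    assert store[i] == u'pizza'       # callers no longer decode themselves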
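A note on the tokens.pyx refactor: a Token is no longer a field-by-field copy made by cinit_token(); it is a lightweight view that stores only its parent sequence and an index, and every property dereferences self._seq.data[self.i] on access (the @cython.freelist(64) decorator keeps these small objects cheap to allocate). A minimal pure-Python sketch of the same delegation pattern, with a plain list of dicts standing in for the TokenC array:

    class TokenView:
        """Illustrative stand-in for the refactored Token: a (sequence, index) pair."""
        def __init__(self, seq, i):
            self._seq = seq    # parent sequence (Tokens in the patch)
            self.i = i         # this token's position within it

        @property
        def string(self):
            # Attributes are read through the parent on each access, so creating
            # a view never copies per-token data.
            return self._seq[self.i]['string']

        @property
        def head(self):
            # 'head' is stored as a relative offset, mirroring
            # `return Token(self._seq, self.i + t.head)` in the patch.
            return TokenView(self._seq, self.i + self._seq[self.i]['head'])

    data = [{'string': u'eat', 'head': 0}, {'string': u'apples', 'head': -1}]
    assert TokenView(data, 1).head.string == u'eat'

One consequence of the view design: a Token keeps its Tokens object alive and reflects any attributes (tags, lemmas, heads) assigned after the Token was created, instead of freezing a copy at construction time.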