diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py
index a04b615da..3d433e497 100644
--- a/spacy/en/__init__.py
+++ b/spacy/en/__init__.py
@@ -41,6 +41,8 @@ def get_lex_props(string, oov_prob=-30, is_oov=False):
         'sentiment': 0
     }
 
+get_lex_attr = {}
+
 if_model_present = -1
 LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')
 
diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 321f7c616..510840b2b 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -4,6 +4,7 @@ from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTE
 from .structs cimport LexemeC
 from .strings cimport StringStore
+from .vocab cimport Vocab
 
 from numpy cimport ndarray
 
@@ -15,7 +16,8 @@ cdef class Lexeme:
     cdef readonly Vocab vocab
     cdef readonly attr_t orth
 
-    cdef int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1:
+    @staticmethod
+    cdef inline int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1:
         lex.length = props['length']
         lex.orth = vocab.strings[props['orth']]
         lex.lower = vocab.strings[props['lower']]
@@ -29,7 +31,6 @@ cdef class Lexeme:
         lex.sentiment = props['sentiment']
         lex.flags = props['flags']
 
-        lex.repvec = empty_vec
 
     @staticmethod
     cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
@@ -55,6 +56,34 @@ cdef class Lexeme:
             return lex.cluster
         else:
             return 0
+
+    @staticmethod
+    cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil:
+        if name < (sizeof(flags_t) * 8):
+            Lexeme.set_flag(lex, name, value)
+        elif name == ID:
+            lex.id = value
+        elif name == LOWER:
+            lex.lower = value
+        elif name == NORM:
+            lex.norm = value
+        elif name == SHAPE:
+            lex.shape = value
+        elif name == PREFIX:
+            lex.prefix = value
+        elif name == SUFFIX:
+            lex.suffix = value
+        elif name == CLUSTER:
+            lex.cluster = value
+
     @staticmethod
     cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
         return lexeme.flags & (1 << flag_id)
+
+    @staticmethod
+    cdef inline bint set_flag(LexemeC* lex, attr_id_t flag_id, int value) nogil:
+        cdef flags_t one = 1
+        if value:
+            lex.flags |= one << flag_id
+        else:
+            lex.flags &= ~(one << flag_id)
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index f0b3303f1..4deec60c1 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -26,12 +26,9 @@ cdef class Lexeme:
     def __init__(self, Vocab vocab, int orth):
         self.vocab = vocab
         self.orth = orth
-        self.c = vocab.get_by_orth(orth)
+        self.c = vocab.get_by_orth(vocab.mem, orth)
+        assert self.c.orth == orth
 
-    property orth:
-        def __get__(self):
-            return self.c.orth
-
     property lower:
         def __get__(self): return self.c.lower
         def __set__(self, int x): self.c.lower = x
@@ -113,7 +110,7 @@ cdef class Lexeme:
         def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_URL, x)
 
     property like_num:
-        def __get__(self): return Lexeme.like_num(self.c, IKE_NUM)
+        def __get__(self): return Lexeme.check_flag(self.c, LIKE_NUM)
         def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_NUM, x)
 
     property like_email:
diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index 72473b073..9d1220648 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -103,20 +103,21 @@ cdef class Matcher:
 
     def __init__(self, vocab, patterns):
         self.mem = Pool()
+        self.vocab = vocab
         for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
             self.add(entity_key, etype, attrs, specs)
 
     def add(self, entity_key, etype, attrs, specs):
         if isinstance(entity_key, basestring):
-            entity_key = vocab.strings[entity_key]
+            entity_key = self.vocab.strings[entity_key]
         if isinstance(etype, basestring):
-            etype = vocab.strings[etype]
+            etype = self.vocab.strings[etype]
         elif etype is None:
             etype = -1
         # TODO: Do something more clever about multiple patterns for single
         # entity
         for spec in specs:
-            spec = _convert_strings(spec, vocab.strings)
+            spec = _convert_strings(spec, self.vocab.strings)
             self.patterns.push_back(init_pattern(self.mem, spec, etype))
 
     @classmethod
diff --git a/spacy/orth.pyx b/spacy/orth.pyx
index ca4bbd9ba..df4e2dc32 100644
--- a/spacy/orth.pyx
+++ b/spacy/orth.pyx
@@ -92,6 +92,7 @@ cpdef bint like_url(unicode string):
     return False
 
 
+# TODO: This should live in the language.orth
 NUM_WORDS = set('zero one two three four five six seven eight nine ten'
                 'eleven twelve thirteen fourteen fifteen sixteen seventeen'
                 'eighteen nineteen twenty thirty forty fifty sixty seventy'
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index c187a6aa6..a4a470158 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -142,6 +142,8 @@ cdef class StringStore:
     def load(self, loc):
         with codecs.open(loc, 'r', 'utf8') as file_:
             strings = file_.read().split(SEPARATOR)
+        if strings == ['']:
+            return None
         cdef unicode string
         cdef bytes byte_string
         for string in strings:
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 7994c97c3..0fa562dfb 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -12,8 +12,7 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUST
 from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
 from ..parts_of_speech import UNIV_POS_NAMES
 from ..parts_of_speech cimport CONJ, PUNCT, NOUN
-from ..lexeme cimport check_flag
-from ..lexeme cimport get_attr as get_lex_attr
+from ..lexeme cimport Lexeme
 from .spans cimport Span
 from .token cimport Token
 from ..serialize.bits cimport BitArray
@@ -47,7 +46,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
     elif feat_name == ENT_TYPE:
         return token.ent_type
     else:
-        return get_lex_attr(token.lex, feat_name)
+        return Lexeme.get_struct_attr(token.lex, feat_name)
 
 
 cdef class Doc:
@@ -218,6 +217,7 @@ cdef class Doc:
             t.idx = 0
         else:
             t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy
+        assert t.lex.orth != 0
         t.spacy = has_space
         self.length += 1
         self._py_tokens.append(None)
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index f1f2696cb..04945ecd1 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -1,6 +1,5 @@
 from libc.string cimport memcpy
 from cpython.mem cimport PyMem_Malloc, PyMem_Free
-from ..lexeme cimport check_flag
 # Compiler crashes on memory view coercion without this. Should report bug.
 from cython.view cimport array as cvarray
 cimport numpy as np
@@ -20,6 +19,8 @@ from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
 from ..attrs cimport IS_OOV
 
+from ..lexeme cimport Lexeme
+
 
 cdef class Token:
     """An individual token --- i.e. a word, a punctuation symbol, etc. Created
@@ -42,7 +43,7 @@ cdef class Token:
         return self.string
 
     cpdef bint check_flag(self, attr_id_t flag_id) except -1:
-        return check_flag(self.c.lex, flag_id)
+        return Lexeme.check_flag(self.c.lex, flag_id)
 
     def nbor(self, int i=1):
         return self.doc[self.i+i]
@@ -286,37 +287,37 @@ cdef class Token:
             return self.vocab.strings[self.c.dep]
 
     property is_oov:
-        def __get__(self): return check_flag(self.c.lex, IS_OOV)
+        def __get__(self): return Lexeme.check_flag(self.c.lex, IS_OOV)
 
     property is_alpha:
-        def __get__(self): return check_flag(self.c.lex, IS_ALPHA)
+        def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ALPHA)
 
     property is_ascii:
-        def __get__(self): return check_flag(self.c.lex, IS_ASCII)
+        def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ASCII)
 
     property is_digit:
-        def __get__(self): return check_flag(self.c.lex, IS_DIGIT)
+        def __get__(self): return Lexeme.check_flag(self.c.lex, IS_DIGIT)
 
     property is_lower:
-        def __get__(self): return check_flag(self.c.lex, IS_LOWER)
+        def __get__(self): return Lexeme.check_flag(self.c.lex, IS_LOWER)
 
     property is_title:
-        def __get__(self): return check_flag(self.c.lex, IS_TITLE)
+        def __get__(self): return Lexeme.check_flag(self.c.lex, IS_TITLE)
 
     property is_punct:
-        def __get__(self): return check_flag(self.c.lex, IS_PUNCT)
+        def __get__(self): return Lexeme.check_flag(self.c.lex, IS_PUNCT)
 
     property is_space:
-        def __get__(self): return check_flag(self.c.lex, IS_SPACE)
+        def __get__(self): return Lexeme.check_flag(self.c.lex, IS_SPACE)
 
     property like_url:
-        def __get__(self): return check_flag(self.c.lex, LIKE_URL)
+        def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_URL)
 
     property like_num:
-        def __get__(self): return check_flag(self.c.lex, LIKE_NUM)
+        def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_NUM)
 
     property like_email:
-        def __get__(self): return check_flag(self.c.lex, LIKE_EMAIL)
+        def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_EMAIL)
 
     _pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}
diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index 2503cdcee..cf7a46388 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -27,15 +27,16 @@ cdef class Vocab:
     cpdef public lexeme_props_getter
     cdef Pool mem
     cpdef readonly StringStore strings
-    cdef readonly object pos_tags
     cdef readonly int length
 
     cdef public object _serializer
    cdef public object data_dir
-    cdef public float oov_prob
+    cdef public object get_lex_attr
+    cdef public object pos_tags
 
     cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
     cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
+    cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
 
     cdef PreshMap _by_hash
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index dcb7d575c..4c35ea41c 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -12,7 +12,6 @@ import math
 import json
 
 from .lexeme cimport EMPTY_LEXEME
-from .lexeme cimport set_lex_struct_props
 from .lexeme cimport Lexeme
 from .strings cimport hash_string
 from .orth cimport word_shape
@@ -36,17 +35,15 @@ EMPTY_LEXEME.repvec = EMPTY_VEC
 cdef class Vocab:
     '''A map container for a language's LexemeC structs.
     '''
-    def __init__(self, data_dir=None, get_lex_attr=None):
+    def __init__(self, data_dir=None, get_lex_attr=None, load_vectors=True, pos_tags=None):
         self.mem = Pool()
         self._by_hash = PreshMap()
         self._by_orth = PreshMap()
         self.strings = StringStore()
-        self.pos_tags = pos_tags if pos_tags is not None else {}
-
         self.get_lex_attr = get_lex_attr
         self.repvec_length = 0
-        self.length = 0
-        self._add_lex_to_vocab(0, &EMPTY_LEXEME)
+        self.length = 1
+        self.pos_tags = pos_tags
         if data_dir is not None:
             if not path.exists(data_dir):
                 raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
@@ -84,7 +81,10 @@ cdef class Vocab:
         cdef LexemeC* lex
         cdef hash_t key = hash_string(string)
         lex = <LexemeC*>self._by_hash.get(key)
+        cdef size_t addr
         if lex != NULL:
+            print string, lex.orth, self.strings[string]
+            assert lex.orth == self.strings[string]
             return lex
         else:
             return self._new_lexeme(mem, string)
@@ -103,15 +103,24 @@ cdef class Vocab:
         return self._new_lexeme(mem, self.strings[orth])
 
     cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
+        cdef hash_t key
         cdef bint is_oov = mem is not self.mem
-        if len(string) < 3:
-            mem = self.mem
+        mem = self.mem
+        #if len(string) < 3:
+        #    mem = self.mem
         lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
-        for attr, func in self.lex_attr_getters.items():
-            Lexeme.set_struct_attr(lex, attr, func(string))
+        lex.orth = self.strings[string]
+        lex.id = self.length
+        if self.get_lex_attr is not None:
+            for attr, func in self.get_lex_attr.items():
+                value = func(string)
+                if isinstance(value, unicode):
+                    value = self.strings[value]
+                Lexeme.set_struct_attr(lex, attr, value)
         if is_oov:
             lex.id = 0
         else:
+            key = hash_string(string)
             self._add_lex_to_vocab(key, lex)
         assert lex != NULL, string
         return lex
@@ -119,13 +128,14 @@ cdef class Vocab:
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
         self._by_hash.set(key, lex)
         self._by_orth.set(lex.orth, lex)
+        print "Add lex", key, lex.orth, self.strings[lex.orth]
         self.length += 1
 
     def __iter__(self):
         cdef attr_t orth
         cdef size_t addr
         for orth, addr in self._by_orth.items():
-            yield Lexeme.from_ptr(addr, self.strings, self.repvec_length)
+            yield Lexeme(self, orth)
 
     def __getitem__(self, id_or_string):
         '''Retrieve a lexeme, given an int ID or a unicode string. If a previously
@@ -142,22 +152,12 @@ cdef class Vocab:
             An instance of the Lexeme Python class, with data copied on
             instantiation.
         '''
-        cdef const LexemeC* lexeme
         cdef attr_t orth
-        if type(id_or_string) == int:
-            orth = id_or_string
-            lexeme = <LexemeC*>self._by_orth.get(orth)
-            if lexeme == NULL:
-                raise KeyError(id_or_string)
-            assert lexeme.orth == orth, ('%d vs %d' % (lexeme.orth, orth))
-        elif type(id_or_string) == unicode:
-            lexeme = self.get(self.mem, id_or_string)
-            assert lexeme.orth == self.strings[id_or_string]
+        if type(id_or_string) == unicode:
+            orth = self.strings[id_or_string]
         else:
-            raise ValueError("Vocab unable to map type: "
-                             "%s. Maps unicode --> Lexeme or "
-                             "int --> Lexeme" % str(type(id_or_string)))
-        return Lexeme.from_ptr(lexeme, self.strings, self.repvec_length)
+            orth = id_or_string
+        return Lexeme(self, orth)
 
     def dump(self, loc):
         if path.exists(loc):
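
Reviewer note (not part of the patch): the new Lexeme.set_flag / Lexeme.check_flag helpers in lexeme.pxd pack every boolean lexical attribute into the single flags bitfield on LexemeC, one bit per flag ID, and Lexeme.set_struct_attr routes any attribute ID smaller than sizeof(flags_t) * 8 through set_flag. A minimal pure-Python sketch of that bit logic follows; the flag ID 5 is arbitrary here, standing in for a real constant such as IS_ALPHA.

def set_flag(flags, flag_id, value):
    # Mirrors Lexeme.set_flag: set or clear one bit of the bitfield.
    one = 1
    if value:
        return flags | (one << flag_id)
    else:
        return flags & ~(one << flag_id)

def check_flag(flags, flag_id):
    # Mirrors Lexeme.check_flag: test one bit of the bitfield.
    return bool(flags & (1 << flag_id))

flags = 0
flags = set_flag(flags, 5, True)
assert check_flag(flags, 5)
flags = set_flag(flags, 5, False)
assert not check_flag(flags, 5)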