From b15619e170f9940f155f9af8bbbab5f4d5ae23e3 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <matthew@honnibal.com>
Date: Thu, 25 Sep 2014 18:22:52 +0200
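Subject: [PATCH] * Use PointerMap from trustyc.maps instead of locally provided _hashing module

Also read prefix and suffix patterns from the language data and match them
with compiled regular expressions, replacing the hard-coded character checks
in Language._find_prefix and Language._find_suffix.
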
---
 setup.py       |   1 -
 spacy/en.pyx   |   2 -
 spacy/lang.pxd |  11 +++--
 spacy/lang.pyx | 108 +++++++++----------------------------------------
 4 files changed, 26 insertions(+), 96 deletions(-)

diff --git a/setup.py b/setup.py
index 7b2a4db0f..bab596367 100644
--- a/setup.py
+++ b/setup.py
@@ -46,7 +46,6 @@ else:
 
 exts = [
     Extension("spacy.lang", ["spacy/lang.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy._hashing", ["spacy/_hashing.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.word", ["spacy/word.pyx"], language="c++",
               include_dirs=includes),
     Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++",
diff --git a/spacy/en.pyx b/spacy/en.pyx
index 57dc4bbcf..a51349116 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -42,8 +42,6 @@ from libc.stdint cimport uint64_t
 cimport lang
 from spacy.lexeme cimport lexeme_check_flag
 from spacy.lexeme cimport lexeme_string_view
-from spacy._hashing cimport PointerHash
-
 
 from spacy import orth
 
diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index 652c9ff2f..3f414708d 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -3,7 +3,7 @@ from libc.stdint cimport uint64_t
 from spacy.word cimport Lexeme
 from spacy.tokens cimport Tokens
 from spacy.lexeme cimport LexemeC
-from spacy._hashing cimport PointerHash
+from trustyc.maps cimport PointerMap
 
 from cymem.cymem cimport Pool
 
@@ -30,7 +30,7 @@ cdef class Lexicon:
     cpdef Lexeme lookup(self, unicode string)
     cdef LexemeC* get(self, String* s) except NULL
     
-    cdef PointerHash _dict
+    cdef PointerMap _dict
     
     cdef list _string_features
     cdef list _flag_features
@@ -39,10 +39,13 @@ cdef class Lexicon:
 cdef class Language:
     cdef Pool _mem
     cdef unicode name
-    cdef PointerHash cache
-    cdef PointerHash specials
+    cdef PointerMap cache
+    cdef PointerMap specials
     cpdef readonly Lexicon lexicon
 
+    cdef object prefix_re
+    cdef object suffix_re
+
     cpdef Tokens tokenize(self, unicode text)
     cpdef Lexeme lookup(self, unicode text)
 
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index 26a836d3b..e7e330b68 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -11,6 +11,7 @@ from __future__ import unicode_literals
 import json
 import random
 from os import path
+import re
 
 from .util import read_lang_data
 from spacy.tokens import Tokens
@@ -25,7 +26,7 @@ from cython.operator cimport preincrement as preinc
 from cython.operator cimport dereference as deref
 
 
-from spacy._hashing cimport PointerHash
+from trustyc.maps cimport PointerMap
 from spacy import orth
 from spacy import util
 
@@ -129,10 +130,12 @@ cdef class Language:
     def __cinit__(self, name, user_string_features, user_flag_features):
         self.name = name
         self._mem = Pool()
-        self.cache = PointerHash(2 ** 25)
-        self.specials = PointerHash(2 ** 16)
+        self.cache = PointerMap(2 ** 25)
+        self.specials = PointerMap(2 ** 16)
         lang_data = util.read_lang_data(name)
-        rules, words, probs, clusters, case_stats, tag_stats = lang_data
+        rules, prefix, suffix, words, probs, clusters, case_stats, tag_stats = lang_data
+        self.prefix_re = re.compile(prefix)
+        self.suffix_re = re.compile(suffix)
         self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
                                STRING_VIEW_FUNCS + user_string_features,
                                FLAG_FUNCS + user_flag_features)
@@ -302,93 +305,20 @@ cdef class Language:
         self.cache.set(key, lexemes)
     
     cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
-        cdef Py_UNICODE c0 = chars[0]
-        cdef Py_UNICODE c1 = chars[1]
-        if c0 == ",":
-            return 1
-        elif c0 == '"':
-            return 1
-        elif c0 == "(":
-            return 1
-        elif c0 == "[":
-            return 1
-        elif c0 == "{":
-            return 1
-        elif c0 == "*":
-            return 1
-        elif c0 == "<":
-            return 1
-        elif c0 == "$":
-            return 1
-        elif c0 == "£":
-            return 1
-        elif c0 == "€":
-            return 1
-        elif c0 == "\u201c":
-            return 1
-        elif c0 == "'":
-            return 1
-        elif c0 == "`":
-            if c1 == "`":
-                return 2
-            else:
-                return 1
-        else:
+        cdef unicode string = chars[:length]
+        match = self.prefix_re.search(string)
+        if match is None:
             return 0
- 
+        else:
+            return match.end() - match.start()
+
     cdef int _find_suffix(self, Py_UNICODE* chars, size_t length):
-        cdef Py_UNICODE c0 = chars[length - 1]
-        cdef Py_UNICODE c1 = chars[length - 2] if length >= 2 else 0
-        cdef Py_UNICODE c2 = chars[length - 3] if length >= 3 else 0
- 
-        if c0 == ",":
-            return 1
-        elif c0 == '"':
-            return 1
-        elif c0 == ')':
-            return 1
-        elif c0 == ']':
-            return 1
-        elif c0 == '}':
-            return 1
-        elif c0 == '*':
-            return 1
-        elif c0 == '!':
-            return 1
-        elif c0 == '?':
-            return 1
-        elif c0 == '%':
-            return 1
-        elif c0 == '$':
-            return 1
-        elif c0 == '>':
-            return 1
-        elif c0 == ':':
-            return 1
-        elif c0 == "'":
-            return 1
-        elif c0 == u'\u201d':
-            return 1
-        elif c0 == "s":
-            if c1 == "'":
-                return 2
-            else:
-                return 0
-        elif c0 == "S":
-            if c1 == "'":
-                return 2
-            else:
-                return 0
-        elif c0 == ".":
-            if c1 == ".":
-                if c2 == ".":
-                    return 3
-                else:
-                    return 2
-            else:
-                return 1
-        else:
+        cdef unicode string = chars[:length]
+        match = self.suffix_re.search(string)
+        if match is None:
             return 0
+        else:
+            return match.end() - match.start()
 
     def _load_special_tokenization(self, token_rules):
         '''Load special-case tokenization rules.
@@ -422,7 +352,7 @@ cdef class Lexicon:
         self._mem = Pool()
         self._flag_features = flag_features
         self._string_features = string_features
-        self._dict = PointerHash(2 ** 20)
+        self._dict = PointerMap(2 ** 20)
         self.size = 0
         cdef String string
         for uni_string in words: