mirror of https://github.com/explosion/spaCy.git
* Fixed contraction tests. Need to correct a problem with the way case stats and tag stats are supposed to work.
commit fd4e61e58b
parent fdaf24604a
@@ -20,7 +20,7 @@ cdef class Language:
         self.name = name
         self.cache = {}
         self.lexicon = Lexicon()
-        #self.load_special_tokenization(util.read_tokenization(name))
+        self.load_special_tokenization(util.read_tokenization(name))

     cpdef list tokenize(self, unicode string):
         """Tokenize a string.
@@ -49,6 +49,7 @@ cdef class Language:
             i += 1
         if start < i:
             tokens.extend(self._tokenize(string[start:]))
+        assert tokens
         return tokens

     cdef list _tokenize(self, unicode string):
@@ -101,7 +102,7 @@ cdef class Language:
         for string, substrings in token_rules:
             lexemes = []
             for i, substring in enumerate(substrings):
-                lexemes.append(self.lookup(substring))
+                lexemes.append(self.lexicon.lookup(substring))
             self.cache[string] = lexemes


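The loop above is what caches the special-case expansions the contraction tests below rely on: each (string, substrings) rule read by util.read_tokenization(name) is turned into a list of Lexemes and stored in self.cache. The rule data itself is not part of this diff; the pairs below are hypothetical illustrations, consistent only with what the updated tests assert.

    # Hypothetical (string, substrings) rules of the shape the loop consumes;
    # the real rules come from util.read_tokenization(name), not shown here.
    token_rules = [
        ("ain't", ["are", "not"]),   # test_aint expects "are", "not"
        ("we'll", ["we", "will"]),   # test_LL expects "we", "will"
    ]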
@@ -143,13 +144,15 @@ cdef class Lexicon:
         cdef Lexeme word
         flag_id = len(self.flag_checkers)
         for string, word in self.lexicon.items():
-            if flag_checker(string, word.prob, {}):
+            if flag_checker(string, word.prob, {}, {}):
                 word.set_flag(flag_id)
         self.flag_checkers.append(flag_checker)
         return flag_id

     def add_transform(self, string_transform):
         self.string_transformers.append(string_transform)
         for string, word in self.lexicon.items():
             word.add_view(string_transform(string, word.prob, {}, {}))
         return len(self.string_transformers) - 1

     def load_probs(self, location):
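Both callbacks in the Lexicon hunk now receive two extra dict arguments; given the commit message, these are presumably placeholders for the case-stats and tag-stats data that is not wired up yet. A minimal sketch of a flag checker written against that assumed four-argument signature (the parameter names and the registration call are illustrative, not taken from this diff):

    # Sketch only: assumes the checker is called as in the diff,
    #   flag_checker(string, word.prob, {}, {})
    # where the two dicts are taken to be case stats and tag stats.
    def is_usually_titlecased(string, prob, case_stats, tag_stats):
        # With the empty dicts currently passed, this simply returns False.
        return case_stats.get(string.title(), 0) > case_stats.get(string, 0)

    # Assumed registration against the method shown above, which appends the
    # checker to self.flag_checkers and returns its index as flag_id:
    # flag_id = lexicon.add_flag(is_usually_titlecased)   # method name assumed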
@@ -1,41 +1,41 @@
 from __future__ import unicode_literals

-from spacy.en import tokenize, lookup, unhash
+from spacy.en import EN


 def test_possess():
-    tokens = tokenize("Mike's")
-    assert unhash(tokens[0].lex) == "Mike"
-    assert unhash(tokens[1].lex) == "'s"
+    tokens = EN.tokenize("Mike's")
+    assert tokens[0].string == "Mike"
+    assert tokens[1].string == "'s"
     assert len(tokens) == 2


 def test_apostrophe():
-    tokens = tokenize("schools'")
+    tokens = EN.tokenize("schools'")
     assert len(tokens) == 2
-    assert unhash(tokens[1].lex) == "'"
-    assert unhash(tokens[0].lex) == "schools"
+    assert tokens[1].string == "'"
+    assert tokens[0].string == "schools"


 def test_LL():
-    tokens = tokenize("we'll")
+    tokens = EN.tokenize("we'll")
     assert len(tokens) == 2
-    assert unhash(tokens[1].lex) == "will"
-    assert unhash(tokens[0].lex) == "we"
+    assert tokens[1].string == "will"
+    assert tokens[0].string == "we"


 def test_aint():
-    tokens = tokenize("ain't")
+    tokens = EN.tokenize("ain't")
     assert len(tokens) == 2
-    assert unhash(tokens[0].lex) == "are"
-    assert unhash(tokens[1].lex) == "not"
+    assert tokens[0].string == "are"
+    assert tokens[1].string == "not"


 def test_capitalized():
-    tokens = tokenize("can't")
+    tokens = EN.tokenize("can't")
     assert len(tokens) == 2
-    tokens = tokenize("Can't")
+    tokens = EN.tokenize("Can't")
     assert len(tokens) == 2
-    tokens = tokenize("Ain't")
+    tokens = EN.tokenize("Ain't")
     assert len(tokens) == 2
-    assert unhash(tokens[0].lex) == "Are"
+    assert tokens[0].string == "Are"
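For quick reference, the updated tests exercise the new surface like this: the EN object replaces the module-level tokenize/lookup/unhash helpers, and token text is read directly from the .string attribute instead of unhashing .lex.

    from spacy.en import EN

    tokens = EN.tokenize("ain't")
    assert len(tokens) == 2
    assert tokens[0].string == "are"   # contraction expanded by the special-case rules
    assert tokens[1].string == "not"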