* Adding PTB3 tokenizer back in, so we can understand how much boilerplate is in the docs for multiple tokenizers

2014-08-29 02:30:27 +02:00 · 2014-08-29 02:30:27 +02:00 · 5233f110c4
parent 45a22d6b2c
commit 5233f110c4
2 changed files with 79 additions and 24 deletions
--- a/spacy/ptb3.pxd
+++ b/spacy/ptb3.pxd
@ -1,14 +1,5 @@
-from spacy.spacy cimport Language
-from spacy.lexeme cimport StringHash
-from spacy.word cimport Word
+from spacy.lang cimport Language


 cdef class PennTreebank3(Language):
-    cpdef list find_substrings(self, unicode word)
-    
-
-cdef PennTreebank3 PTB3
-
-cpdef Word lookup(unicode word)
-cpdef list tokenize(unicode string)
-cpdef unicode unhash(StringHash hash_value)
+    cdef list _split(self, unicode split)  # NOTE(review): ptb3.pyx defines this method with the parameter named `chunk`; consider aligning the names
--- a/spacy/ptb3.pyx
+++ b/spacy/ptb3.pyx
@ -13,6 +13,43 @@ cimport spacy

 import re

+from spacy import orth
+
+TAG_THRESH = 0.5  # second argument to every orth.can_tag(...) call in PennTreebank3.__cinit__
+UPPER_THRESH = 0.2  # second argument to orth.oft_case('upper', ...)
+LOWER_THRESH = 0.5  # second argument to orth.oft_case('lower', ...)
+TITLE_THRESH = 0.7  # second argument to orth.oft_case('title', ...)
+
+NR_FLAGS = 0  # running counter; its final value sizes the flag_funcs list in __cinit__
+
+OFT_UPPER = NR_FLAGS; NR_FLAGS += 1  # each flag takes the current counter value as its index, then bumps the counter
+OFT_LOWER = NR_FLAGS; NR_FLAGS += 1
+OFT_TITLE = NR_FLAGS; NR_FLAGS += 1
+
+IS_ALPHA = NR_FLAGS; NR_FLAGS += 1
+IS_DIGIT = NR_FLAGS; NR_FLAGS += 1
+IS_PUNCT = NR_FLAGS; NR_FLAGS += 1
+IS_SPACE = NR_FLAGS; NR_FLAGS += 1
+IS_ASCII = NR_FLAGS; NR_FLAGS += 1  # NOTE(review): PennTreebank3.__cinit__ never assigns flag_funcs[IS_ASCII]
+IS_TITLE = NR_FLAGS; NR_FLAGS += 1
+IS_LOWER = NR_FLAGS; NR_FLAGS += 1
+IS_UPPER = NR_FLAGS; NR_FLAGS += 1
+
+CAN_PUNCT = NR_FLAGS; NR_FLAGS += 1
+CAN_CONJ = NR_FLAGS; NR_FLAGS += 1
+CAN_NUM = NR_FLAGS; NR_FLAGS += 1
+CAN_DET = NR_FLAGS; NR_FLAGS += 1
+CAN_ADP = NR_FLAGS; NR_FLAGS += 1
+CAN_ADJ = NR_FLAGS; NR_FLAGS += 1
+CAN_ADV = NR_FLAGS; NR_FLAGS += 1  # NOTE(review): PennTreebank3.__cinit__ never assigns flag_funcs[CAN_ADV]
+CAN_VERB = NR_FLAGS; NR_FLAGS += 1
+CAN_NOUN = NR_FLAGS; NR_FLAGS += 1
+CAN_PDT = NR_FLAGS; NR_FLAGS += 1
+CAN_POS = NR_FLAGS; NR_FLAGS += 1
+CAN_PRON = NR_FLAGS; NR_FLAGS += 1  # NOTE(review): PennTreebank3.__cinit__ never assigns flag_funcs[CAN_PRON]
+CAN_PRT = NR_FLAGS; NR_FLAGS += 1
+
+
 # List of contractions adapted from Robert MacIntyre's tokenizer.
 CONTRACTIONS2 = [re.compile(r"(?i)\b(can)(not)\b"),
                 re.compile(r"(?i)\b(d)('ye)\b"),
@ -75,7 +112,45 @@ def nltk_regex_tokenize(text):


 cdef class PennTreebank3(Language):
-    cpdef list find_substrings(self, unicode chunk):
+    """Fully PTB compatible English tokenizer, tightly coupled to lexicon.
+
+    Attributes:
+        name (unicode): Name forwarded to the Language constructor ('ptb3' for the module-level PTB3 instance).
+        lexicon (Lexicon): The lexicon. Exposes the lookup method.
+    """
+
+
+    def __cinit__(self, name):  # builds the per-flag function table, then hands it to Language.__init__
+        flag_funcs = [0 for _ in range(NR_FLAGS)]
+        # NOTE(review): IS_ASCII, CAN_ADV and CAN_PRON are never assigned below, so their slots keep the placeholder 0 - confirm intended
+        flag_funcs[OFT_UPPER] = orth.oft_case('upper', UPPER_THRESH)
+        flag_funcs[OFT_LOWER] = orth.oft_case('lower', LOWER_THRESH)
+        flag_funcs[OFT_TITLE] = orth.oft_case('title', TITLE_THRESH)
+        # IS_* slots store the orth predicates themselves; nothing is called here
+        flag_funcs[IS_ALPHA] = orth.is_alpha
+        flag_funcs[IS_DIGIT] = orth.is_digit
+        flag_funcs[IS_PUNCT] = orth.is_punct
+        flag_funcs[IS_SPACE] = orth.is_space
+        flag_funcs[IS_TITLE] = orth.is_title
+        flag_funcs[IS_LOWER] = orth.is_lower
+        flag_funcs[IS_UPPER] = orth.is_upper
+        # CAN_* slots store whatever orth.can_tag(tag_name, TAG_THRESH) returns
+        flag_funcs[CAN_PUNCT] = orth.can_tag('PUNCT', TAG_THRESH)
+        flag_funcs[CAN_CONJ] = orth.can_tag('CONJ', TAG_THRESH)
+        flag_funcs[CAN_NUM] = orth.can_tag('NUM', TAG_THRESH)
+        flag_funcs[CAN_DET] = orth.can_tag('DET', TAG_THRESH)
+        flag_funcs[CAN_ADP] = orth.can_tag('ADP', TAG_THRESH)
+        flag_funcs[CAN_ADJ] = orth.can_tag('ADJ', TAG_THRESH)
+        flag_funcs[CAN_VERB] = orth.can_tag('VERB', TAG_THRESH)
+        flag_funcs[CAN_NOUN] = orth.can_tag('NOUN', TAG_THRESH)
+        flag_funcs[CAN_PDT] = orth.can_tag('PDT', TAG_THRESH)
+        flag_funcs[CAN_POS] = orth.can_tag('POS', TAG_THRESH)
+        flag_funcs[CAN_PRT] = orth.can_tag('PRT', TAG_THRESH)
+        # NOTE(review): explicit base __init__ call from inside __cinit__; Python also invokes __init__ with the constructor args after __cinit__ returns - confirm Language.__init__ tolerates that
+        Language.__init__(self, name, flag_funcs)
+
+
+    cdef list _split(self, unicode chunk):
        strings = nltk_regex_tokenize(chunk)
        if strings[-1] == '.':
            strings.pop()
@ -84,15 +159,4 @@ cdef class PennTreebank3(Language):
        return strings
    

-cdef PennTreebank3 PTB3 = PennTreebank3('ptb3')
-
-cpdef list tokenize(unicode string):
-    return PTB3.tokenize(string)
-
-
-cpdef Word lookup(unicode string):
-    return PTB3.lookup(string)
-
-
-cpdef unicode unhash(StringHash hash_value):
-    return PTB3.unhash(hash_value)
+PTB3 = PennTreebank3('ptb3')  # module-level instance, constructed when the module is imported