* Adding the PTB3 tokenizer back in, so we can understand how much boilerplate is in the docs for multiple tokenizers

2014-08-29 02:30:27 +02:00 · 2014-08-29 02:30:27 +02:00 · 5233f110c4
parent 45a22d6b2c
commit 5233f110c4
2 changed files with 79 additions and 24 deletions
--- a/spacy/ptb3.pxd
+++ b/spacy/ptb3.pxd
@ -1,14 +1,5 @@
-from spacy.spacy cimport Language
+from spacy.lang cimport Language
 from spacy.lexeme cimport StringHash
 from spacy.word cimport Word
 cdef class PennTreebank3(Language):
-    cpdef list find_substrings(self, unicode word)
+    cdef list _split(self, unicode split)
 cdef PennTreebank3 PTB3
 cpdef Word lookup(unicode word)
 cpdef list tokenize(unicode string)
 cpdef unicode unhash(StringHash hash_value)
--- a/spacy/ptb3.pyx
+++ b/spacy/ptb3.pyx
@ -13,6 +13,43 @@ cimport spacy
 import re
 from spacy import orth
 # Thresholds passed to the orth.can_tag / orth.oft_case factories when the
 # flag functions are built in PennTreebank3.__cinit__.
 TAG_THRESH = 0.5
 UPPER_THRESH = 0.2
 LOWER_THRESH = 0.5
 TITLE_THRESH = 0.7
 # Flag indices. Each name takes the current value of NR_FLAGS, which is then
 # incremented on the same line, so the indices run consecutively from 0 and
 # NR_FLAGS ends up equal to the number of flags declared. That final value
 # sizes the flag-function list built in PennTreebank3.__cinit__.
 NR_FLAGS = 0
 # OFT_*: indices whose functions come from orth.oft_case.
 OFT_UPPER = NR_FLAGS; NR_FLAGS += 1
 OFT_LOWER = NR_FLAGS; NR_FLAGS += 1
 OFT_TITLE = NR_FLAGS; NR_FLAGS += 1
 # IS_*: indices intended for the orth.is_* string predicates.
 IS_ALPHA = NR_FLAGS; NR_FLAGS += 1
 IS_DIGIT = NR_FLAGS; NR_FLAGS += 1
 IS_PUNCT = NR_FLAGS; NR_FLAGS += 1
 IS_SPACE = NR_FLAGS; NR_FLAGS += 1
 IS_ASCII = NR_FLAGS; NR_FLAGS += 1
 IS_TITLE = NR_FLAGS; NR_FLAGS += 1
 IS_LOWER = NR_FLAGS; NR_FLAGS += 1
 IS_UPPER = NR_FLAGS; NR_FLAGS += 1
 # CAN_*: indices intended for orth.can_tag(<tag>, TAG_THRESH) functions.
 CAN_PUNCT = NR_FLAGS; NR_FLAGS += 1
 CAN_CONJ = NR_FLAGS; NR_FLAGS += 1
 CAN_NUM = NR_FLAGS; NR_FLAGS += 1
 CAN_DET = NR_FLAGS; NR_FLAGS += 1
 CAN_ADP = NR_FLAGS; NR_FLAGS += 1
 CAN_ADJ = NR_FLAGS; NR_FLAGS += 1
 CAN_ADV = NR_FLAGS; NR_FLAGS += 1
 CAN_VERB = NR_FLAGS; NR_FLAGS += 1
 CAN_NOUN = NR_FLAGS; NR_FLAGS += 1
 CAN_PDT = NR_FLAGS; NR_FLAGS += 1
 CAN_POS = NR_FLAGS; NR_FLAGS += 1
 CAN_PRON = NR_FLAGS; NR_FLAGS += 1
 CAN_PRT = NR_FLAGS; NR_FLAGS += 1
 # List of contractions adapted from Robert MacIntyre's tokenizer.
 CONTRACTIONS2 = [re.compile(r"(?i)\b(can)(not)\b"),
                 re.compile(r"(?i)\b(d)('ye)\b"),
@ -75,7 +112,45 @@ def nltk_regex_tokenize(text):
 cdef class PennTreebank3(Language):
-    cpdef list find_substrings(self, unicode chunk):
+    """Fully PTB compatible English tokenizer, tightly coupled to lexicon.
    Attributes:
        name (unicode): Identifier for this tokenizer, forwarded to Language (the module-level instance uses 'ptb3').
        lexicon (Lexicon): The lexicon. Exposes the lookup method.
    """
    def __cinit__(self, name):
        """Build the flag-function table and initialise the base Language.

        Creates a list with one slot per flag index (NR_FLAGS entries), fills
        the slots with functions from the orth module, and passes the list,
        together with `name`, to Language.__init__.

        Args:
            name (unicode): Forwarded unchanged to Language.__init__.
        """
        # Every slot starts as the placeholder 0 and is overwritten below.
        flag_funcs = [0 for _ in range(NR_FLAGS)]

        # Case-frequency flags, each with its own threshold.
        flag_funcs[OFT_UPPER] = orth.oft_case('upper', UPPER_THRESH)
        flag_funcs[OFT_LOWER] = orth.oft_case('lower', LOWER_THRESH)
        flag_funcs[OFT_TITLE] = orth.oft_case('title', TITLE_THRESH)

        # String-shape predicates.
        # NOTE(review): IS_ASCII is declared as a flag index but no function is
        # assigned to it here, so its slot keeps the placeholder 0. Confirm
        # whether that is intentional and which orth predicate should fill it.
        flag_funcs[IS_ALPHA] = orth.is_alpha
        flag_funcs[IS_DIGIT] = orth.is_digit
        flag_funcs[IS_PUNCT] = orth.is_punct
        flag_funcs[IS_SPACE] = orth.is_space
        flag_funcs[IS_TITLE] = orth.is_title
        flag_funcs[IS_LOWER] = orth.is_lower
        flag_funcs[IS_UPPER] = orth.is_upper

        # Tag flags, all sharing TAG_THRESH. CAN_ADV and CAN_PRON were declared
        # alongside the others but previously left at the placeholder; they are
        # now wired the same way as their siblings.
        flag_funcs[CAN_PUNCT] = orth.can_tag('PUNCT', TAG_THRESH)
        flag_funcs[CAN_CONJ] = orth.can_tag('CONJ', TAG_THRESH)
        flag_funcs[CAN_NUM] = orth.can_tag('NUM', TAG_THRESH)
        flag_funcs[CAN_DET] = orth.can_tag('DET', TAG_THRESH)
        flag_funcs[CAN_ADP] = orth.can_tag('ADP', TAG_THRESH)
        flag_funcs[CAN_ADJ] = orth.can_tag('ADJ', TAG_THRESH)
        flag_funcs[CAN_ADV] = orth.can_tag('ADV', TAG_THRESH)
        flag_funcs[CAN_VERB] = orth.can_tag('VERB', TAG_THRESH)
        flag_funcs[CAN_NOUN] = orth.can_tag('NOUN', TAG_THRESH)
        flag_funcs[CAN_PDT] = orth.can_tag('PDT', TAG_THRESH)
        flag_funcs[CAN_POS] = orth.can_tag('POS', TAG_THRESH)
        flag_funcs[CAN_PRON] = orth.can_tag('PRON', TAG_THRESH)
        flag_funcs[CAN_PRT] = orth.can_tag('PRT', TAG_THRESH)

        Language.__init__(self, name, flag_funcs)
    cdef list _split(self, unicode chunk):
        strings = nltk_regex_tokenize(chunk)
        if strings[-1] == '.':
            strings.pop()
@ -84,15 +159,4 @@ cdef class PennTreebank3(Language):
        return strings
-cdef PennTreebank3 PTB3 = PennTreebank3('ptb3')
+PTB3 = PennTreebank3('ptb3')
 cpdef list tokenize(unicode string):
    """Tokenize `string` by delegating to the module-level PTB3 instance."""
    return PTB3.tokenize(string)
 cpdef Word lookup(unicode string):
    """Return the Word that the module-level PTB3 instance yields for `string`."""
    return PTB3.lookup(string)
 cpdef unicode unhash(StringHash hash_value):
    """Delegate to PTB3.unhash for `hash_value` and return its unicode result.

    Presumably this recovers the string that produced the hash; confirm
    against Language.unhash.
    """
    return PTB3.unhash(hash_value)