diff --git a/spacy/ptb3.pxd b/spacy/ptb3.pxd index 54b8ad12a..f39c18c81 100644 --- a/spacy/ptb3.pxd +++ b/spacy/ptb3.pxd @@ -1,14 +1,5 @@ -from spacy.spacy cimport Language -from spacy.lexeme cimport StringHash -from spacy.word cimport Word +from spacy.lang cimport Language cdef class PennTreebank3(Language): - cpdef list find_substrings(self, unicode word) - - -cdef PennTreebank3 PTB3 - -cpdef Word lookup(unicode word) -cpdef list tokenize(unicode string) -cpdef unicode unhash(StringHash hash_value) + cdef list _split(self, unicode split) diff --git a/spacy/ptb3.pyx b/spacy/ptb3.pyx index 80efac36a..0d3828920 100644 --- a/spacy/ptb3.pyx +++ b/spacy/ptb3.pyx @@ -13,6 +13,43 @@ cimport spacy import re +from spacy import orth + +TAG_THRESH = 0.5 +UPPER_THRESH = 0.2 +LOWER_THRESH = 0.5 +TITLE_THRESH = 0.7 + +NR_FLAGS = 0 + +OFT_UPPER = NR_FLAGS; NR_FLAGS += 1 +OFT_LOWER = NR_FLAGS; NR_FLAGS += 1 +OFT_TITLE = NR_FLAGS; NR_FLAGS += 1 + +IS_ALPHA = NR_FLAGS; NR_FLAGS += 1 +IS_DIGIT = NR_FLAGS; NR_FLAGS += 1 +IS_PUNCT = NR_FLAGS; NR_FLAGS += 1 +IS_SPACE = NR_FLAGS; NR_FLAGS += 1 +IS_ASCII = NR_FLAGS; NR_FLAGS += 1 +IS_TITLE = NR_FLAGS; NR_FLAGS += 1 +IS_LOWER = NR_FLAGS; NR_FLAGS += 1 +IS_UPPER = NR_FLAGS; NR_FLAGS += 1 + +CAN_PUNCT = NR_FLAGS; NR_FLAGS += 1 +CAN_CONJ = NR_FLAGS; NR_FLAGS += 1 +CAN_NUM = NR_FLAGS; NR_FLAGS += 1 +CAN_DET = NR_FLAGS; NR_FLAGS += 1 +CAN_ADP = NR_FLAGS; NR_FLAGS += 1 +CAN_ADJ = NR_FLAGS; NR_FLAGS += 1 +CAN_ADV = NR_FLAGS; NR_FLAGS += 1 +CAN_VERB = NR_FLAGS; NR_FLAGS += 1 +CAN_NOUN = NR_FLAGS; NR_FLAGS += 1 +CAN_PDT = NR_FLAGS; NR_FLAGS += 1 +CAN_POS = NR_FLAGS; NR_FLAGS += 1 +CAN_PRON = NR_FLAGS; NR_FLAGS += 1 +CAN_PRT = NR_FLAGS; NR_FLAGS += 1 + + # List of contractions adapted from Robert MacIntyre's tokenizer. CONTRACTIONS2 = [re.compile(r"(?i)\b(can)(not)\b"), re.compile(r"(?i)\b(d)('ye)\b"), @@ -75,7 +112,45 @@ def nltk_regex_tokenize(text): cdef class PennTreebank3(Language): - cpdef list find_substrings(self, unicode chunk): + """Fully PTB compatible English tokenizer, tightly coupled to lexicon. + + Attributes: + name (unicode): The two letter code used by Wikipedia for the language. + lexicon (Lexicon): The lexicon. Exposes the lookup method. + """ + + + def __cinit__(self, name): + flag_funcs = [0 for _ in range(NR_FLAGS)] + + flag_funcs[OFT_UPPER] = orth.oft_case('upper', UPPER_THRESH) + flag_funcs[OFT_LOWER] = orth.oft_case('lower', LOWER_THRESH) + flag_funcs[OFT_TITLE] = orth.oft_case('title', TITLE_THRESH) + + flag_funcs[IS_ALPHA] = orth.is_alpha + flag_funcs[IS_DIGIT] = orth.is_digit + flag_funcs[IS_PUNCT] = orth.is_punct + flag_funcs[IS_SPACE] = orth.is_space + flag_funcs[IS_TITLE] = orth.is_title + flag_funcs[IS_LOWER] = orth.is_lower + flag_funcs[IS_UPPER] = orth.is_upper + + flag_funcs[CAN_PUNCT] = orth.can_tag('PUNCT', TAG_THRESH) + flag_funcs[CAN_CONJ] = orth.can_tag('CONJ', TAG_THRESH) + flag_funcs[CAN_NUM] = orth.can_tag('NUM', TAG_THRESH) + flag_funcs[CAN_DET] = orth.can_tag('DET', TAG_THRESH) + flag_funcs[CAN_ADP] = orth.can_tag('ADP', TAG_THRESH) + flag_funcs[CAN_ADJ] = orth.can_tag('ADJ', TAG_THRESH) + flag_funcs[CAN_VERB] = orth.can_tag('VERB', TAG_THRESH) + flag_funcs[CAN_NOUN] = orth.can_tag('NOUN', TAG_THRESH) + flag_funcs[CAN_PDT] = orth.can_tag('PDT', TAG_THRESH) + flag_funcs[CAN_POS] = orth.can_tag('POS', TAG_THRESH) + flag_funcs[CAN_PRT] = orth.can_tag('PRT', TAG_THRESH) + + Language.__init__(self, name, flag_funcs) + + + cdef list _split(self, unicode chunk): strings = nltk_regex_tokenize(chunk) if strings[-1] == '.': strings.pop() @@ -84,15 +159,4 @@ cdef class PennTreebank3(Language): return strings -cdef PennTreebank3 PTB3 = PennTreebank3('ptb3') - -cpdef list tokenize(unicode string): - return PTB3.tokenize(string) - - -cpdef Word lookup(unicode string): - return PTB3.lookup(string) - - -cpdef unicode unhash(StringHash hash_value): - return PTB3.unhash(hash_value) +PTB3 = PennTreebank3('ptb3')