* Adding PTB3 tokenizer back in, so we can understand how much boilerplate is in the docs for multiple tokenizers

This commit is contained in:
Matthew Honnibal 2014-08-29 02:30:27 +02:00
parent 45a22d6b2c
commit 5233f110c4
2 changed files with 79 additions and 24 deletions

View File

@ -1,14 +1,5 @@
from spacy.spacy cimport Language
from spacy.lexeme cimport StringHash
from spacy.word cimport Word
from spacy.lang cimport Language
cdef class PennTreebank3(Language):
cpdef list find_substrings(self, unicode word)
cdef PennTreebank3 PTB3
cpdef Word lookup(unicode word)
cpdef list tokenize(unicode string)
cpdef unicode unhash(StringHash hash_value)
cdef list _split(self, unicode split)

View File

@ -13,6 +13,43 @@ cimport spacy
import re
from spacy import orth
# Thresholds handed to the orth factories in PennTreebank3.__cinit__:
# TAG_THRESH goes to orth.can_tag, the *_THRESH case values to orth.oft_case.
TAG_THRESH = 0.5
UPPER_THRESH = 0.2
LOWER_THRESH = 0.5
TITLE_THRESH = 0.7

# Total number of flags. Each flag name below is an integer index into the
# flag_funcs list built in PennTreebank3.__cinit__, numbered 0..NR_FLAGS-1 in
# the order listed. The unpacking raises ValueError if a name is added or
# removed without updating NR_FLAGS.
NR_FLAGS = 24
(
    # Case-frequency flags (filled from orth.oft_case).
    OFT_UPPER, OFT_LOWER, OFT_TITLE,
    # String-shape flags (filled from the orth.is_* predicates).
    IS_ALPHA, IS_DIGIT, IS_PUNCT, IS_SPACE, IS_ASCII,
    IS_TITLE, IS_LOWER, IS_UPPER,
    # Tag flags (filled from orth.can_tag).
    CAN_PUNCT, CAN_CONJ, CAN_NUM, CAN_DET, CAN_ADP, CAN_ADJ, CAN_ADV,
    CAN_VERB, CAN_NOUN, CAN_PDT, CAN_POS, CAN_PRON, CAN_PRT,
) = range(NR_FLAGS)
# List of contractions adapted from Robert MacIntyre's tokenizer.
CONTRACTIONS2 = [re.compile(r"(?i)\b(can)(not)\b"),
re.compile(r"(?i)\b(d)('ye)\b"),
@ -75,7 +112,45 @@ def nltk_regex_tokenize(text):
cdef class PennTreebank3(Language):
cpdef list find_substrings(self, unicode chunk):
"""Fully PTB compatible English tokenizer, tightly coupled to lexicon.
Attributes:
name (unicode): The two letter code used by Wikipedia for the language.
lexicon (Lexicon): The lexicon. Exposes the lookup method.
"""
def __cinit__(self, name):
    # Build the table of flag functions (one slot per flag index, NR_FLAGS in
    # total) and hand it, together with `name`, to Language.__init__.
    #
    # name: passed through unchanged to Language.__init__.
    #
    # Slots start as the placeholder 0 and are overwritten below.
    flag_funcs = [0 for _ in range(NR_FLAGS)]

    # Case-frequency flags, each built with its own threshold.
    flag_funcs[OFT_UPPER] = orth.oft_case('upper', UPPER_THRESH)
    flag_funcs[OFT_LOWER] = orth.oft_case('lower', LOWER_THRESH)
    flag_funcs[OFT_TITLE] = orth.oft_case('title', TITLE_THRESH)

    # String-shape predicates taken directly from orth.
    flag_funcs[IS_ALPHA] = orth.is_alpha
    flag_funcs[IS_DIGIT] = orth.is_digit
    flag_funcs[IS_PUNCT] = orth.is_punct
    flag_funcs[IS_SPACE] = orth.is_space
    # NOTE(review): IS_ASCII is declared as a flag but no function is assigned
    # here, so its slot keeps the placeholder 0. Confirm whether orth provides
    # a matching predicate and how Language treats a 0 entry.
    flag_funcs[IS_TITLE] = orth.is_title
    flag_funcs[IS_LOWER] = orth.is_lower
    flag_funcs[IS_UPPER] = orth.is_upper

    # Tag flags, all sharing TAG_THRESH. CAN_ADV and CAN_PRON were previously
    # left at the placeholder even though their indices are declared; they are
    # now populated like every other CAN_* flag.
    flag_funcs[CAN_PUNCT] = orth.can_tag('PUNCT', TAG_THRESH)
    flag_funcs[CAN_CONJ] = orth.can_tag('CONJ', TAG_THRESH)
    flag_funcs[CAN_NUM] = orth.can_tag('NUM', TAG_THRESH)
    flag_funcs[CAN_DET] = orth.can_tag('DET', TAG_THRESH)
    flag_funcs[CAN_ADP] = orth.can_tag('ADP', TAG_THRESH)
    flag_funcs[CAN_ADJ] = orth.can_tag('ADJ', TAG_THRESH)
    flag_funcs[CAN_ADV] = orth.can_tag('ADV', TAG_THRESH)
    flag_funcs[CAN_VERB] = orth.can_tag('VERB', TAG_THRESH)
    flag_funcs[CAN_NOUN] = orth.can_tag('NOUN', TAG_THRESH)
    flag_funcs[CAN_PDT] = orth.can_tag('PDT', TAG_THRESH)
    flag_funcs[CAN_POS] = orth.can_tag('POS', TAG_THRESH)
    flag_funcs[CAN_PRON] = orth.can_tag('PRON', TAG_THRESH)
    flag_funcs[CAN_PRT] = orth.can_tag('PRT', TAG_THRESH)

    Language.__init__(self, name, flag_funcs)
cdef list _split(self, unicode chunk):
strings = nltk_regex_tokenize(chunk)
if strings[-1] == '.':
strings.pop()
@ -84,15 +159,4 @@ cdef class PennTreebank3(Language):
return strings
cdef PennTreebank3 PTB3 = PennTreebank3('ptb3')
# Module-level convenience wrapper: forwards `string` to the tokenize method
# of the module's PTB3 instance and returns the resulting list.
cpdef list tokenize(unicode string):
return PTB3.tokenize(string)
# Module-level convenience wrapper: forwards `string` to the lookup method of
# the module's PTB3 instance and returns the resulting Word.
cpdef Word lookup(unicode string):
return PTB3.lookup(string)
# Module-level convenience wrapper: forwards `hash_value` to the unhash method
# of the module's PTB3 instance and returns the resulting unicode string.
cpdef unicode unhash(StringHash hash_value):
return PTB3.unhash(hash_value)
PTB3 = PennTreebank3('ptb3')