2014-07-07 05:36:43 +00:00
|
|
|
# cython: profile=True
|
2014-08-20 11:39:39 +00:00
|
|
|
# cython: embedsignature=True
|
2014-08-21 16:42:47 +00:00
|
|
|
'''Tokenize English text, using a scheme that differs from the Penn Treebank 3
|
|
|
|
scheme in several important respects:
|
|
|
|
|
2014-08-22 14:35:48 +00:00
|
|
|
* Whitespace is added as tokens, except for single spaces. e.g.,
|
2014-08-21 16:42:47 +00:00
|
|
|
|
2014-08-28 23:59:23 +00:00
|
|
|
>>> [w.string for w in EN.tokenize(u'\\nHello \\tThere')]
|
2014-08-21 16:42:47 +00:00
|
|
|
[u'\\n', u'Hello', u' ', u'\\t', u'There']
|
|
|
|
|
|
|
|
* Contractions are normalized, e.g.
|
|
|
|
|
2014-08-28 23:59:23 +00:00
|
|
|
>>> [w.string for w in EN.tokenize(u"isn't ain't won't he's")]
|
2014-08-21 16:42:47 +00:00
|
|
|
[u'is', u'not', u'are', u'not', u'will', u'not', u'he', u"__s"]
|
|
|
|
|
|
|
|
* Hyphenated words are split, with the hyphen preserved, e.g.:
|
|
|
|
|
2014-08-28 23:59:23 +00:00
|
|
|
>>> [w.string for w in EN.tokenize(u'New York-based')]
|
2014-08-21 16:42:47 +00:00
|
|
|
[u'New', u'York', u'-', u'based']
|
|
|
|
|
2014-08-22 14:35:48 +00:00
|
|
|
Other improvements:
|
|
|
|
|
2014-08-21 16:42:47 +00:00
|
|
|
* Email addresses, URLs, European-formatted dates and other numeric entities not
|
|
|
|
found in the PTB are tokenized correctly
|
|
|
|
* Heuristic handling of word-final periods (PTB expects sentence boundary detection
|
|
|
|
as a pre-process before tokenization.)
|
|
|
|
|
2014-08-22 14:35:48 +00:00
|
|
|
Take care to ensure your training and run-time data is tokenized according to the
|
2014-08-21 16:42:47 +00:00
|
|
|
same scheme. Tokenization problems are a major cause of poor performance for
|
2014-08-21 21:49:14 +00:00
|
|
|
NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module
|
|
|
|
provides a fully Penn Treebank 3-compliant tokenizer.
|
2014-07-05 18:51:42 +00:00
|
|
|
'''
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
2014-12-12 03:33:51 +00:00
|
|
|
from murmurhash.mrmr cimport hash64
|
|
|
|
|
2014-08-27 15:15:39 +00:00
|
|
|
cimport lang
|
2014-12-12 03:33:51 +00:00
|
|
|
from .typedefs cimport hash_t, id_t, flags_t
|
2014-12-03 00:04:00 +00:00
|
|
|
import orth
|
2014-12-09 10:16:17 +00:00
|
|
|
from .morphology cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
|
|
|
|
from .morphology cimport X, PUNCT, EOL
|
2014-12-08 10:12:15 +00:00
|
|
|
|
2014-12-09 03:48:01 +00:00
|
|
|
from .tokens cimport Morphology
|
|
|
|
|
2014-12-08 10:12:15 +00:00
|
|
|
|
2014-12-12 03:33:51 +00:00
|
|
|
# Compile-time constant (Cython DEF). English.set_pos tests it before looking
# up a token's tag in the POS cache.
DEF USE_POS_CACHE = True
|
|
|
|
|
|
|
|
|
2014-12-08 10:12:15 +00:00
|
|
|
# Maps each fine-grained tag string to a pair:
#   (coarse part-of-speech constant, dict of morphological attributes).
# Attribute keys used below: 'misc', 'tenspect', 'number', 'case', 'person'.
POS_TAGS = {
    # Special / bookkeeping tags.
    'NULL': (NO_TAG, {}),
    'EOL': (EOL, {}),

    # Conjunctions, numbers, determiners, adpositions.
    'CC': (CONJ, {}),
    'CD': (NUM, {}),
    'DT': (DET, {}),
    'EX': (DET, {}),
    'FW': (X, {}),
    'IN': (ADP, {}),

    # Adjectives.
    'JJ': (ADJ, {}),
    'JJR': (ADJ, {'misc': COMPARATIVE}),
    'JJS': (ADJ, {'misc': SUPERLATIVE}),

    'LS': (X, {}),
    'MD': (VERB, {'tenspect': MODAL}),

    # Nouns.
    'NN': (NOUN, {}),
    'NNS': (NOUN, {'number': PLURAL}),
    'NNP': (NOUN, {'misc': NAME}),
    'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}),

    'PDT': (DET, {}),
    'POS': (PRT, {'case': GENITIVE}),

    # NOTE(review): personal pronouns are mapped to NOUN here, although PRON is
    # cimported and used for WP / WP$ below -- confirm this is intentional.
    'PRP': (NOUN, {}),
    'PRP$': (NOUN, {'case': GENITIVE}),

    # Adverbs.
    'RB': (ADV, {}),
    'RBR': (ADV, {'misc': COMPARATIVE}),
    'RBS': (ADV, {'misc': SUPERLATIVE}),

    # Particles, symbols, interjections.
    'RP': (PRT, {}),
    'SYM': (X, {}),
    'TO': (PRT, {}),
    'UH': (X, {}),

    # Verbs.
    'VB': (VERB, {}),
    'VBD': (VERB, {'tenspect': PAST}),
    'VBG': (VERB, {'tenspect': ING}),
    'VBN': (VERB, {'tenspect': PASSIVE}),
    'VBP': (VERB, {'tenspect': PRESENT}),
    'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),

    # Wh-words.
    'WDT': (DET, {'misc': RELATIVE}),
    'WP': (PRON, {'misc': RELATIVE}),
    'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}),
    'WRB': (ADV, {'misc': RELATIVE}),

    # Punctuation and symbol tags.
    '!': (PUNCT, {}),
    '#': (PUNCT, {}),
    '$': (PUNCT, {}),
    "''": (PUNCT, {}),
    '(': (PUNCT, {}),
    ')': (PUNCT, {}),
    '-LRB-': (PUNCT, {}),
    '-RRB-': (PUNCT, {}),
    '.': (PUNCT, {}),
    ',': (PUNCT, {}),
    '``': (PUNCT, {}),
    ':': (PUNCT, {}),
    '?': (PUNCT, {}),
}
|
2014-07-07 10:47:21 +00:00
|
|
|
|
2014-09-10 16:11:13 +00:00
|
|
|
|
2014-12-07 12:52:41 +00:00
|
|
|
# Feature templates for the POS tagger. Each inner tuple names the context
# slots that are combined into one feature. Slot prefixes follow the layout
# written by fill_pos_context: P2/P1 = previous tokens, W = current token,
# N1/N2 = following tokens.
POS_TEMPLATES = (
    # Word identity and lemma/tag of the left context.
    (W_sic,),
    (P1_lemma, P1_pos),
    (P2_lemma, P2_pos),
    (N1_sic,),
    (N2_sic,),

    # Affixes of the current word.
    (W_suffix,),
    (W_prefix,),

    # Previously assigned tags and neighbouring suffixes.
    (P1_pos,),
    (P2_pos,),
    (P1_pos, P2_pos),
    (P1_pos, W_sic),
    (P1_suffix,),
    (N1_suffix,),

    # Shape and cluster features.
    (W_shape,),
    (W_cluster,),
    (N1_cluster,),
    (N2_cluster,),
    (P1_cluster,),
    (P2_cluster,),

    # pos_type features.
    (W_pos_type,),
    (N1_pos_type,),
    # NOTE(review): this template repeats the previous one; possibly meant to
    # be a different slot (e.g. N2 or P1). Kept as-is because changing the
    # template list would change the feature set -- confirm before editing.
    (N1_pos_type,),
    (P1_pos, W_pos_type, N1_pos_type),
)
|
|
|
|
|
|
|
|
|
2014-08-27 15:15:39 +00:00
|
|
|
cdef class English(Language):
    """English tokenizer, tightly coupled to lexicon.

    Attributes:
        name (unicode): The two letter code used by Wikipedia for the language.
        lexicon (Lexicon): The lexicon. Exposes the lookup method.
    """
    def load_pos_cache(self, loc):
        """Populate the POS cache from a whitespace-delimited text file.

        Every non-blank line must have at least three fields: field 1 is read
        as the integer cache key and field 2 as the integer POS value (field 0
        is ignored). At most 500,000 entries are loaded; the rest of the file
        is skipped. Blank lines are ignored.

        Args:
            loc: Path of the cache file, passed to ``open``.

        Raises:
            IndexError, ValueError: If a non-blank line has fewer than three
                fields or a non-integer key/POS field.
        """
        cdef int n_loaded = 0
        cdef hash_t key
        cdef int pos
        with open(loc) as file_:
            for line in file_:
                pieces = line.split()
                if not pieces:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                if n_loaded >= 500000:
                    break
                n_loaded += 1
                key = int(pieces[1])
                pos = int(pieces[2])
                # Store the tag in the pointer slot itself. Going through
                # size_t avoids an int -> pointer cast of mismatched width and
                # mirrors the <int><size_t> read-back in set_pos.
                self._pos_cache.set(key, <void*><size_t>pos)

    def get_props(self, unicode string):
        """Return a dict with the 'flags' bit-field and 'dense' word shape of string."""
        return {'flags': self.set_flags(string), 'dense': orth.word_shape(string)}

    def set_flags(self, unicode string):
        """Compute the orthographic flag bit-field for string.

        Each ``orth`` predicate result is shifted into the bit position named
        by the corresponding flag constant and OR-ed into the result.

        Returns:
            The combined flags value.
        """
        cdef flags_t flags = 0
        flags |= orth.is_alpha(string) << IS_ALPHA
        flags |= orth.is_ascii(string) << IS_ASCII
        flags |= orth.is_digit(string) << IS_DIGIT
        flags |= orth.is_lower(string) << IS_LOWER
        flags |= orth.is_punct(string) << IS_PUNCT
        flags |= orth.is_space(string) << IS_SPACE
        flags |= orth.is_title(string) << IS_TITLE
        flags |= orth.is_upper(string) << IS_UPPER

        flags |= orth.like_url(string) << LIKE_URL
        flags |= orth.like_number(string) << LIKE_NUMBER
        return flags

    def set_pos(self, Tokens tokens):
        """Assign a POS tag to every token in place, then set its morphology.

        When USE_POS_CACHE is enabled, the tag is first looked up in the POS
        cache under the hash of the (current word, previous word) id pair; on
        a miss the tagger predicts from the surrounding context.

        Args:
            tokens (Tokens): The tokens to tag; ``tokens.data`` is modified.
        """
        cdef int i
        cdef atom_t[N_CONTEXT_FIELDS] context
        cdef TokenC* t = tokens.data
        cdef id_t[2] bigram
        cdef hash_t cache_key
        cdef void* cached = NULL
        assert self.morphologizer is not None
        for i in range(tokens.length):
            if USE_POS_CACHE:
                bigram[0] = tokens.data[i].lex.sic
                # NOTE(review): at i == 0 this reads tokens.data[-1]; assumes
                # the token array is padded before index 0 -- TODO confirm.
                bigram[1] = tokens.data[i-1].lex.sic
                cache_key = hash64(bigram, sizeof(id_t) * 2, 0)
                cached = self._pos_cache.get(cache_key)
            # A NULL result is treated as a cache miss, so a stored tag of 0
            # cannot be distinguished from "not cached".
            if cached != NULL:
                t[i].pos = <int><size_t>cached
            else:
                fill_pos_context(context, i, t)
                t[i].pos = self.pos_tagger.predict(context)
            self.morphologizer.set_morph(i, t)

    def train_pos(self, Tokens tokens, golds):
        """Run the tagger over tokens with gold tags supplied, and count hits.

        For each token the context is filled, ``pos_tagger.predict`` is called
        with the gold tag (presumably updating the model -- verify against the
        tagger implementation), and the morphology is set.

        Args:
            tokens (Tokens): The tokens to tag; ``tokens.data`` is modified.
            golds: Indexable sequence with one gold tag per token.

        Returns:
            The number of tokens whose predicted tag equals the gold tag.
        """
        cdef int i
        cdef atom_t[N_CONTEXT_FIELDS] context
        cdef TokenC* t = tokens.data
        c = 0
        for i in range(tokens.length):
            fill_pos_context(context, i, t)
            t[i].pos = self.pos_tagger.predict(context, [golds[i]])
            self.morphologizer.set_morph(i, t)
            c += t[i].pos == golds[i]
        return c

    cdef int is_base_np_end(self, const TokenC* token) except -1:
        # Placeholder: no logic implemented yet.
        pass

    cdef int is_outside_base_np(self, const TokenC* token) except -1:
        # Placeholder: no logic implemented yet.
        pass
|
|
|
|
|
|
|
|
|
2014-07-07 05:36:43 +00:00
|
|
|
|
2014-12-09 14:02:04 +00:00
|
|
|
cdef int fill_pos_context(atom_t* context, const int i, const TokenC* tokens) except -1:
    """Fill context with the features of tokens[i-2] .. tokens[i+2].

    Each of the five tokens is written to its own section of context, starting
    at the P2_sic, P1_sic, W_sic, N1_sic and N2_sic slots respectively.
    """
    # NOTE(review): reads two tokens either side of i with no bounds check;
    # assumes the caller's token array is padded at both ends -- TODO confirm.
    cdef const TokenC* focus = tokens + i
    _fill_from_token(&context[P2_sic], focus - 2)
    _fill_from_token(&context[P1_sic], focus - 1)
    _fill_from_token(&context[W_sic], focus)
    _fill_from_token(&context[N1_sic], focus + 1)
    _fill_from_token(&context[N2_sic], focus + 2)
    return 0
|
|
|
|
|
|
|
|
|
|
|
|
cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
    # Write the eight features of token t into context[0..7].
    # context points at the first slot of that token's section, so the order
    # here has to agree with the slot constants used by the callers.

    # Lexeme-level attributes.
    context[0] = t.lex.sic
    context[1] = t.lex.cluster
    context[2] = t.lex.shape
    context[3] = t.lex.prefix
    context[4] = t.lex.suffix
    # Token-level attributes (set during tagging / morphological analysis).
    context[5] = t.pos
    context[6] = t.lemma
    # Back to the lexeme for the final slot.
    context[7] = t.lex.pos_type
|
2014-12-09 03:48:01 +00:00
|
|
|
|
2014-12-08 10:12:15 +00:00
|
|
|
|
2014-10-30 07:14:42 +00:00
|
|
|
# Module-level English instance, constructed with the language code 'en'.
EN = English('en')
|