* Work on morphological processing

Matthew Honnibal 2014-12-08 21:12:15 +11:00
parent 7b68f911cf
commit 99bbbb6feb
10 changed files with 261 additions and 21 deletions

View File

@@ -5,6 +5,57 @@ from .tokens cimport Tokens
from .tokens cimport TokenC

cpdef enum en_person_t:
    NO_PERSON
    FIRST
    SECOND
    THIRD

cpdef enum en_number_t:
    NO_NUMBER
    SINGULAR
    PLURAL
    MASS
    CARDINAL
    ORDINAL

cpdef enum en_gender_t:
    NO_GENDER
    MASCULINE
    FEMININE

cpdef enum en_tenspect_t:
    NO_TENSE
    BASE_VERB
    PRESENT
    PAST
    PASSIVE
    ING
    MODAL

cpdef enum en_case_t:
    NO_CASE
    NOMINATIVE
    ACCUSATIVE
    GENITIVE
    DEMONYM

cpdef enum misc_t:
    NO_MISC
    COMPARATIVE
    SUPERLATIVE
    RELATIVE
    NAME
    URL
    EMAIL
    EMOTICON


# Flags
cpdef enum FlagID:
    IS_ALPHA
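Each enum gives one axis of the morphological analysis; a token's morphology is a coarse POS plus one value per axis, with the zero member (NO_PERSON, NO_NUMBER, ...) meaning unmarked. A rough pure-Python sketch of the idea, with integers standing in for the enum values declared above:

# Illustrative sketch only: integer codes mirror the declaration order of the enums.
NO_PERSON, FIRST, SECOND, THIRD = range(4)
NO_TENSE, BASE_VERB, PRESENT, PAST, PASSIVE, ING, MODAL = range(7)

# "runs" tagged VBZ: a present-tense, third-person verb, everything else unmarked.
analysis = {'pos': 'VERB', 'tenspect': PRESENT, 'person': THIRD}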

View File

@@ -35,6 +35,63 @@ from __future__ import unicode_literals
cimport lang
from .typedefs cimport flags_t
import orth
from .tagger cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
from .tagger cimport X, PUNCT, EOL


POS_TAGS = {
    'NULL': (NO_TAG, {}),
    'EOL': (EOL, {}),
    'CC': (CONJ, {}),
    'CD': (NUM, {}),
    'DT': (DET, {}),
    'EX': (DET, {}),
    'FW': (X, {}),
    'IN': (ADP, {}),
    'JJ': (ADJ, {}),
    'JJR': (ADJ, {'misc': COMPARATIVE}),
    'JJS': (ADJ, {'misc': SUPERLATIVE}),
    'LS': (X, {}),
    'MD': (VERB, {'tenspect': MODAL}),
    'NN': (NOUN, {}),
    'NNS': (NOUN, {'number': PLURAL}),
    'NNP': (NOUN, {'misc': NAME}),
    'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}),
    'PDT': (DET, {}),
    'POS': (PRT, {'case': GENITIVE}),
    'PRP': (NOUN, {}),
    'PRP$': (NOUN, {'case': GENITIVE}),
    'RB': (ADV, {}),
    'RBR': (ADV, {'misc': COMPARATIVE}),
    'RBS': (ADV, {'misc': SUPERLATIVE}),
    'RP': (PRT, {}),
    'SYM': (X, {}),
    'TO': (PRT, {}),
    'UH': (X, {}),
    'VB': (VERB, {}),
    'VBD': (VERB, {'tenspect': PAST}),
    'VBG': (VERB, {'tenspect': ING}),
    'VBN': (VERB, {'tenspect': PASSIVE}),
    'VBP': (VERB, {'tenspect': PRESENT}),
    'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),
    'WDT': (DET, {'misc': RELATIVE}),
    'WP': (PRON, {'misc': RELATIVE}),
    'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}),
    'WRB': (ADV, {'misc': RELATIVE}),
    '!': (PUNCT, {}),
    '#': (PUNCT, {}),
    '$': (PUNCT, {}),
    "''": (PUNCT, {}),
    "(": (PUNCT, {}),
    ")": (PUNCT, {}),
    "-LRB-": (PUNCT, {}),
    "-RRB-": (PUNCT, {}),
    ".": (PUNCT, {}),
    ",": (PUNCT, {}),
    "``": (PUNCT, {}),
    ":": (PUNCT, {}),
    "?": (PUNCT, {}),
}
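POS_TAGS maps each fine-grained Penn Treebank tag to a Google universal tag plus the morphological features it implies; features left out of the dict fall back to the zero ("unmarked") enum member when the Tagger loads them (see the props.get(..., 0) defaults in the Tagger constructor later in this commit). A hypothetical lookup, for illustration only:

# Hypothetical illustration (not part of the commit):
pos, props = POS_TAGS['VBZ']        # (VERB, {'tenspect': PRESENT, 'person': THIRD})
number = props.get('number', 0)     # unspecified axes default to 0, i.e. NO_NUMBER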
POS_TEMPLATES = (
@@ -91,19 +148,25 @@ cdef class English(Language):
    def set_pos(self, Tokens tokens):
        cdef int i
        cdef atom_t[N_CONTEXT_FIELDS] context
        cdef TokenC* t = tokens.data
        for i in range(tokens.length):
-            fill_pos_context(context, i, tokens.data)
-            tokens.data[i].pos = self.pos_tagger.predict(context)
            fill_pos_context(context, i, t)
            t[i].pos = self.pos_tagger.predict(context)
            #self.morphalyser.set_token(&t[i])

    def train_pos(self, Tokens tokens, golds):
        cdef int i
        cdef atom_t[N_CONTEXT_FIELDS] context
        c = 0
        cdef TokenC* t = tokens.data
        for i in range(tokens.length):
-            fill_pos_context(context, i, tokens.data)
-            tokens.data[i].pos = self.pos_tagger.predict(context, [golds[i]])
-            c += tokens.data[i].pos == golds[i]
            fill_pos_context(context, i, t)
            t[i].pos = self.pos_tagger.predict(context, [golds[i]])
            t[i].morph = self.pos_tagger.tags[t[i].pos].morph
            #self.analyse_morph(&t[i].lemma, &t[i].morph, t[i].pos, t[i].lex)
            c += t[i].pos == golds[i]
        return c


EN = English('en')
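train_pos tags greedily left to right; passing the gold tag into predict lets the model update its weights on the spot, and the return value counts tokens that were already tagged correctly. An illustrative outline in plain Python, where extract_context is a stand-in for fill_pos_context:

# Illustrative outline of the training loop above; names are stand-ins.
def train_pos(tokens, golds, tagger):
    correct = 0
    for i in range(len(tokens)):
        context = extract_context(tokens, i)          # fill_pos_context analogue
        guess = tagger.predict(context, [golds[i]])   # predict, then update weights
        tokens[i].pos = guess
        tokens[i].morph = tagger.tags[guess].morph    # copy the tag's cached morphology
        correct += guess == golds[i]
    return correct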

View File

@@ -2,20 +2,20 @@ from libcpp.vector cimport vector
from cpython cimport Py_UNICODE_ISSPACE, Py_UNICODE_ISALPHA, Py_UNICODE_ISUPPER

-from preshed.maps cimport PreshMap
from preshed.maps cimport PreshMap, PreshMapArray
from cymem.cymem cimport Pool

from .typedefs cimport hash_t
from .tokens cimport Tokens, TokenC
from .lexeme cimport Lexeme
from .tagger cimport Tagger
from .tagger cimport PosTag
from .utf8string cimport StringStore, UniStr


cdef class Lexicon:
    cpdef public get_lex_props
    cdef Pool mem
-    cpdef readonly size_t size
    cpdef readonly StringStore strings

    cdef vector[Lexeme*] lexemes
@@ -29,13 +29,17 @@ cdef class Language:
    cdef readonly unicode name
    cdef PreshMap _cache
    cdef PreshMap _specials
    cdef PreshMapArray _lemmas
    cpdef readonly Lexicon lexicon
    cpdef readonly Tagger pos_tagger
    cpdef readonly object lemmatizer

    cdef object _prefix_re
    cdef object _suffix_re
    cdef object _infix_re

    cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1

    cpdef Tokens tokens_from_list(self, list strings)
    cpdef Tokens tokenize(self, unicode text)

View File

@@ -14,6 +14,7 @@ from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64
from preshed.maps cimport PreshMap

from .lemmatizer import Lemmatizer
from .lexeme cimport Lexeme
from .lexeme cimport EMPTY_LEXEME
@@ -26,6 +27,8 @@ from . import util
from .util import read_lang_data
from .tokens import Tokens
from .tagger cimport NOUN, VERB, ADJ, N_UNIV_TAGS


cdef class Language:
    def __init__(self, name):
@@ -39,14 +42,40 @@ cdef class Language:
        self._infix_re = re.compile(infix)
        self.lexicon = Lexicon(self.get_props)
        self._load_special_tokenization(rules)
        self._lemmas = PreshMapArray(N_UNIV_TAGS)
        self.pos_tagger = None
        self.lemmatizer = None

    def load(self):
        self.lemmatizer = Lemmatizer(path.join(util.DATA_DIR, 'wordnet'))
        self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes'))
        self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings'))
        if path.exists(path.join(util.DATA_DIR, self.name, 'pos')):
            self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos'))

    cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1:
        if self.lemmatizer is None:
            return lex.sic
        if pos.pos != NOUN and pos.pos != VERB and pos.pos != ADJ:
            return lex.sic
        cdef int lemma = <int><size_t>self._lemmas.get(pos.pos, lex.sic)
        if lemma != 0:
            return lemma
        cdef bytes py_string = self.lexicon.strings[lex.sic]
        cdef set lemma_strings
        cdef bytes lemma_string
        if pos.pos == NOUN:
            lemma_strings = self.lemmatizer.noun(py_string)
        elif pos.pos == VERB:
            lemma_strings = self.lemmatizer.verb(py_string)
        else:
            assert pos.pos == ADJ
            lemma_strings = self.lemmatizer.adj(py_string)
        lemma_string = sorted(lemma_strings)[0]
        lemma = self.lexicon.strings.intern(lemma_string, len(lemma_string)).i
        self._lemmas.set(pos.pos, lex.sic, <void*>lemma)
        return lemma

    cpdef Tokens tokens_from_list(self, list strings):
        cdef int length = sum([len(s) for s in strings])
        cdef Tokens tokens = Tokens(self.lexicon.strings, length)
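Language.lemmatize memoizes per (universal POS, orth ID): _lemmas is a PreshMapArray with one map per coarse tag, so each word form hits the WordNet lemmatizer at most once per POS, and only the open classes (NOUN, VERB, ADJ) are lemmatized at all. The same caching pattern as a dict-based Python sketch, with hypothetical helper names (intern stands in for StringStore.intern, wn_lookup for the Lemmatizer methods):

# Dict-based sketch of the _lemmas cache; helper names are stand-ins.
_lemmas = {}

def lemmatize(pos, orth_id, py_string):
    if pos not in ('NOUN', 'VERB', 'ADJ'):
        return orth_id                               # closed classes keep their form
    key = (pos, orth_id)
    if key not in _lemmas:
        candidates = wn_lookup(pos, py_string)       # a set of possible lemmas
        _lemmas[key] = intern(sorted(candidates)[0]) # deterministic choice
    return _lemmas[key]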
@@ -254,9 +283,11 @@ cdef class Lexicon:
        self._map = PreshMap(2 ** 20)
        self.strings = StringStore()
        self.lexemes.push_back(&EMPTY_LEXEME)
-        self.size = 2
        self.get_lex_props = get_props

    def __len__(self):
        return self.lexemes.size()

    cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL:
        '''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme
        if necessary, using memory acquired from the given pool. If the pool
@@ -269,14 +300,13 @@
            mem = self.mem
        cdef unicode py_string = string.chars[:string.n]
        lex = <Lexeme*>mem.alloc(sizeof(Lexeme), 1)
-        lex[0] = lexeme_init(self.size, py_string, string.key, self.strings,
        lex[0] = lexeme_init(self.lexemes.size(), py_string, string.key, self.strings,
                             self.get_lex_props(py_string))
        if mem is self.mem:
            self._map.set(string.key, lex)
            while self.lexemes.size() < (lex.id + 1):
                self.lexemes.push_back(&EMPTY_LEXEME)
            self.lexemes[lex.id] = lex
-            self.size += 1
        else:
            lex[0].id = 1
        return lex
@@ -302,6 +332,8 @@
        a dict if the operator is called from Python.
        '''
        if type(id_or_string) == int:
            if id_or_string >= self.lexemes.size():
                raise IndexError
            return self.lexemes.at(id_or_string)[0]
        cdef UniStr string
        slice_unicode(&string, id_or_string, 0, len(id_or_string))
@@ -359,5 +391,4 @@
            self.lexemes.push_back(&EMPTY_LEXEME)
            self.lexemes[lexeme.id] = lexeme
            i += 1
-            self.size += 1
        fclose(fp)

View File

@@ -53,6 +53,7 @@ class Lemmatizer(object):


def lemmatize(string, index, exceptions, rules):
    string = string.lower()
    forms = []
    if string in index:
        forms.append(string)
@@ -62,6 +63,8 @@ def lemmatize(string, index, exceptions, rules):
        form = string[:len(string) - len(old)] + new
        if form in index:
            forms.append(form)
    if not forms:
        forms.append(string)
    return set(forms)
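With the new fallback in place, lemmatize is total: if neither the index nor the suffix rules yield a known form, the input comes back unchanged. A hypothetical run with WordNet-morphy-style noun rules:

# Hypothetical data in the morphy style; not taken from the commit.
index = {'pony', 'mouse'}
exceptions = {'mice': ('mouse',)}
rules = [('ies', 'y'), ('s', '')]

lemmatize('ponies', index, exceptions, rules)   # -> set(['pony'])
lemmatize('xyzzys', index, exceptions, rules)   # -> set(['xyzzys']), via the fallback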

View File

@@ -147,6 +147,7 @@ Y PRT
Z NOUN
^ NOUN
~ X
-`` .""".strip().split('\n'))
`` .
EOL EOL""".strip().split('\n'))
    return mapping[tag]

View File

@@ -1,11 +1,40 @@
from libc.stdint cimport uint8_t

from cymem.cymem cimport Pool
from thinc.learner cimport LinearModel
from thinc.features cimport Extractor
from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
from preshed.maps cimport PreshMapArray

from .typedefs cimport hash_t
-from .tokens cimport Tokens
from .tokens cimport Tokens, Morphology


# Google universal tag set
cdef enum univ_tag_t:
    NO_TAG
    ADJ
    ADV
    ADP
    CONJ
    DET
    NOUN
    NUM
    PRON
    PRT
    VERB
    X
    PUNCT
    EOL
    N_UNIV_TAGS


cdef struct PosTag:
    Morphology morph
    int id
    univ_tag_t pos


cdef class Tagger:
@@ -16,4 +45,5 @@ cdef class Tagger:
    cpdef readonly LinearModel model
    cpdef readonly list tag_names
    cdef PosTag* tags
    cdef dict tagdict

View File

@@ -12,13 +12,14 @@ import cython

from thinc.features cimport Feature, count_feats


-def setup_model_dir(tag_names, tag_counts, templates, model_dir):
def setup_model_dir(tag_names, tag_map, tag_counts, templates, model_dir):
    if path.exists(model_dir):
        shutil.rmtree(model_dir)
    os.mkdir(model_dir)
    config = {
        'templates': templates,
        'tag_names': tag_names,
        'tag_map': tag_map,
        'tag_counts': tag_counts,
    }
    with open(path.join(model_dir, 'config.json'), 'w') as file_:
@@ -33,16 +34,31 @@ cdef class Tagger:
        self.mem = Pool()
        cfg = json.load(open(path.join(model_dir, 'config.json')))
        templates = cfg['templates']
        tag_map = cfg['tag_map']
        univ_counts = {}
        cdef unicode tag
        cdef unicode univ_tag
        self.tag_names = cfg['tag_names']
        self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag))
        for i, tag in enumerate(self.tag_names):
            pos, props = tag_map[tag]
            self.tags[i].id = i
            self.tags[i].pos = pos
            self.tags[i].morph.number = props.get('number', 0)
            self.tags[i].morph.tenspect = props.get('tenspect', 0)
            self.tags[i].morph.mood = props.get('mood', 0)
            self.tags[i].morph.gender = props.get('gender', 0)
            self.tags[i].morph.person = props.get('person', 0)
            self.tags[i].morph.case = props.get('case', 0)
            self.tags[i].morph.misc = props.get('misc', 0)
        self.tagdict = _make_tag_dict(cfg['tag_counts'])
        self.extractor = Extractor(templates)
        self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2)
        if path.exists(path.join(model_dir, 'model')):
            self.model.load(path.join(model_dir, 'model'))

-    cdef class_t predict(self, const atom_t* context, object golds=None) except *:
-        """Predict the tag of tokens[i]. The tagger remembers the features and
-        prediction, in case you later call tell_answer.
    cdef class_t predict(self, atom_t* context, object golds=None) except *:
        """Predict the tag of tokens[i].

        >>> tokens = EN.tokenize(u'An example sentence.')
        >>> tag = EN.pos_tagger.predict(0, tokens)
@@ -69,6 +85,24 @@
        return tag_id


UNIV_TAGS = {
    'NULL': NO_TAG,
    'ADJ': ADJ,
    'ADV': ADV,
    'ADP': ADP,
    'CONJ': CONJ,
    'DET': DET,
    'NOUN': NOUN,
    'NUM': NUM,
    'PRON': PRON,
    'PRT': PRT,
    'VERB': VERB,
    'X': X,
    '.': PUNCT,
    'EOL': EOL
}


def _make_tag_dict(counts):
    freq_thresh = 50
    ambiguity_thresh = 0.98
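Only the two thresholds of _make_tag_dict survive this hunk, but they point at the standard tag-dictionary trick: a word that is frequent enough, and whose counts are dominated by a single tag, is tagged from the dictionary and bypasses the model. A hedged reconstruction of what such a function typically looks like (only the two threshold values are from the commit):

# Hedged reconstruction; structure is assumed, not shown in the diff.
def make_tag_dict(counts, freq_thresh=50, ambiguity_thresh=0.98):
    tagdict = {}
    for word, tag_freqs in counts.items():
        tag, mode = max(tag_freqs.items(), key=lambda kv: kv[1])
        n = sum(tag_freqs.values())
        if n >= freq_thresh and float(mode) / n >= ambiguity_thresh:
            tagdict[word] = tag      # frequent, unambiguous word: trust the dict
    return tagdict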

View File

@@ -5,14 +5,29 @@ from cymem.cymem cimport Pool
from thinc.typedefs cimport atom_t

from .lexeme cimport Lexeme
from .typedefs cimport flags_t
from .utf8string cimport StringStore
from libc.stdint cimport uint8_t, uint16_t


cdef struct Morphology:
    uint8_t number
    uint8_t tenspect   # Tense/aspect/voice
    uint8_t mood
    uint8_t gender
    uint8_t person
    uint8_t case
    uint8_t misc


cdef struct TokenC:
    const Lexeme* lex
    Morphology morph
    int idx
    int pos
    int lemma
    int sense
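Morphology packs all seven feature axes into single bytes, so a full analysis costs 7 bytes per token inside TokenC. A quick ctypes sketch of that layout (the C compiler may still pad the struct where it is embedded):

# Sketch: seven uint8_t fields, 7 bytes total, alignment 1.
import ctypes

class Morphology(ctypes.Structure):
    _fields_ = [(name, ctypes.c_uint8) for name in
                ('number', 'tenspect', 'mood', 'gender', 'person', 'case', 'misc')]

assert ctypes.sizeof(Morphology) == 7
m = Morphology(number=2, person=3)   # e.g. PLURAL, THIRD in the enum coding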
@@ -37,7 +52,7 @@ cdef class Token:
    cdef public int i
    cdef public int idx
    cdef public int pos
-    cdef public int ner
    cdef int lemma

    cdef public atom_t id
    cdef public atom_t cluster

View File

@@ -51,7 +51,7 @@ cdef class Tokens:
    def __getitem__(self, i):
        bounds_check(i, self.length, PADDING)
        return Token(self._string_store, i, self.data[i].idx, self.data[i].pos,
-                     self.data[i].sense, self.data[i].lex[0])
                     self.data[i].lemma, self.data[i].lex[0])

    def __iter__(self):
        for i in range(self.length):
@@ -128,14 +128,15 @@ cdef class Tokens:

@cython.freelist(64)
cdef class Token:
-    def __init__(self, StringStore string_store, int i, int idx, int pos, int ner,
    def __init__(self, StringStore string_store, int i, int idx, int pos, int lemma,
                 dict lex):
        self._string_store = string_store
        self.idx = idx
        self.pos = pos
-        self.ner = ner
        self.i = i
        self.id = lex['id']
        self.lemma = lemma
        self.cluster = lex['cluster']
        self.length = lex['length']
@@ -156,3 +157,10 @@ cdef class Token:
            return ''
        cdef bytes utf8string = self._string_store[self.sic]
        return utf8string.decode('utf8')

    property lemma:
        def __get__(self):
            if self.lemma == 0:
                return self.string
            cdef bytes utf8string = self._string_store[self.lemma]
            return utf8string.decode('utf8')
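The property decodes the interned lemma through the StringStore, falling back to the surface form while the lemma ID is still 0. Hypothetical usage, with EN as the module-level English instance created earlier in this commit:

# Hypothetical usage; output hedged. Only nouns, verbs and adjectives
# are rewritten by the lemmatizer, so 'The' keeps its surface form.
tokens = EN.tokenize(u'The ponies were running')
lemmas = [t.lemma for t in tokens]   # roughly [u'The', u'pony', u'be', u'run']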