* Move morphological analysis into its own module, morphology.pyx

This commit is contained in:
Matthew Honnibal 2014-12-09 21:16:17 +11:00
parent b962fe73d7
commit 6b34a2f34b
7 changed files with 135 additions and 97 deletions

View File

@ -35,8 +35,8 @@ from __future__ import unicode_literals
cimport lang
from .typedefs cimport flags_t
import orth
from .tagger cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
from .tagger cimport X, PUNCT, EOL
from .morphology cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
from .morphology cimport X, PUNCT, EOL
from .tokens cimport Morphology
@ -154,8 +154,8 @@ cdef class English(Language):
for i in range(tokens.length):
fill_pos_context(context, i, t)
t[i].pos = self.pos_tagger.predict(context)
_merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph)
t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex)
if self.morphologizer:
self.morphologizer.set_morph(i, t)
def train_pos(self, Tokens tokens, golds):
cdef int i
@ -165,8 +165,8 @@ cdef class English(Language):
for i in range(tokens.length):
fill_pos_context(context, i, t)
t[i].pos = self.pos_tagger.predict(context, [golds[i]])
_merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph)
t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex)
if self.morphologizer:
self.morphologizer.set_morph(i, t)
c += t[i].pos == golds[i]
return c

View File

@ -2,15 +2,15 @@ from libcpp.vector cimport vector
from cpython cimport Py_UNICODE_ISSPACE, Py_UNICODE_ISALPHA, Py_UNICODE_ISUPPER
from preshed.maps cimport PreshMap, PreshMapArray
from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool
from .typedefs cimport hash_t
from .tokens cimport Tokens, TokenC
from .lexeme cimport Lexeme
from .tagger cimport Tagger
from .tagger cimport univ_tag_t
from .utf8string cimport StringStore, UniStr
from .morphology cimport Morphologizer
cdef union LexemesOrTokens:
@ -40,17 +40,14 @@ cdef class Language:
cdef readonly unicode name
cdef PreshMap _cache
cdef PreshMap _specials
cdef PreshMapArray _lemmas
cpdef readonly Lexicon lexicon
cpdef readonly Tagger pos_tagger
cpdef readonly object lemmatizer
cpdef readonly Morphologizer morphologizer
cdef object _prefix_re
cdef object _suffix_re
cdef object _infix_re
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
cpdef Tokens tokens_from_list(self, list strings)
cpdef Tokens tokenize(self, unicode text)

View File

@ -14,7 +14,6 @@ from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64
from preshed.maps cimport PreshMap
from .lemmatizer import Lemmatizer
from .lexeme cimport Lexeme
from .lexeme cimport EMPTY_LEXEME
@ -26,8 +25,6 @@ from .utf8string cimport slice_unicode
from . import util
from .util import read_lang_data
from .tokens import Tokens
from .tagger cimport NOUN, VERB, ADJ, N_UNIV_TAGS
from .tokens cimport Morphology
@ -43,39 +40,16 @@ cdef class Language:
self._infix_re = re.compile(infix)
self.lexicon = Lexicon(self.get_props)
self._load_special_tokenization(rules)
self._lemmas = PreshMapArray(N_UNIV_TAGS)
self.pos_tagger = None
self.lemmatizer = None
self.morphologizer = None
def load(self):
self.lemmatizer = Lemmatizer(path.join(util.DATA_DIR, 'wordnet'))
self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes'))
self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings'))
if path.exists(path.join(util.DATA_DIR, self.name, 'pos')):
self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos'))
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
if self.lemmatizer is None:
return lex.sic
if pos != NOUN and pos != VERB and pos != ADJ:
return lex.sic
cdef int lemma = <int><size_t>self._lemmas.get(pos, lex.sic)
if lemma != 0:
return lemma
cdef bytes py_string = self.lexicon.strings[lex.sic]
cdef set lemma_strings
cdef bytes lemma_string
if pos == NOUN:
lemma_strings = self.lemmatizer.noun(py_string)
elif pos == VERB:
lemma_strings = self.lemmatizer.verb(py_string)
else:
assert pos == ADJ
lemma_strings = self.lemmatizer.adj(py_string)
lemma_string = sorted(lemma_strings)[0]
lemma = self.lexicon.strings.intern(lemma_string, len(lemma_string)).i
self._lemmas.set(pos, lex.sic, <void*>lemma)
return lemma
self.morphologizer = Morphologizer(self.lexicon.strings,
path.join(util.DATA_DIR, self.name))
cpdef Tokens tokens_from_list(self, list strings):
cdef int length = sum([len(s) for s in strings])

42
spacy/morphology.pxd Normal file
View File

@ -0,0 +1,42 @@
from .tokens cimport TokenC, Morphology
from .lexeme cimport Lexeme
from .utf8string cimport StringStore
from preshed.maps cimport PreshMapArray
from cymem.cymem cimport Pool
# Google universal tag set.
#
# Members are numbered implicitly in declaration order, so the order below
# must not change. N_UNIV_TAGS is deliberately the final member: it serves as
# the count of the tags declared before it.
cpdef enum univ_tag_t:
    NO_TAG
    ADJ
    ADV
    ADP
    CONJ
    DET
    NOUN
    NUM
    PRON
    PRT
    VERB
    X
    PUNCT
    EOL
    N_UNIV_TAGS
# One entry of a tag table: pairs a tag id with its universal POS tag and
# the morphological features associated with that tag.
cdef struct PosTag:
    Morphology morph    # morphological features recorded for this tag
    int id              # index of this tag within the tag table
    univ_tag_t pos      # universal POS tag this tag maps to
# C-level declaration of the Morphologizer extension type (implemented in
# morphology.pyx). Attribute order is kept as declared.
cdef class Morphologizer:
    cdef Pool mem                  # memory pool used to allocate `tags`
    cdef StringStore strings       # string table used to read/intern lemmas
    cdef object lemmatizer         # Python-level lemmatizer object
    cdef PosTag* tags              # table of PosTag entries, indexed by tag id
    cdef PreshMapArray _morph      # morphology cache, one map per tag id
    cdef PreshMapArray _lemmas     # lemma cache, one map per universal tag

    # Return the lemma id for `lex` given its universal POS tag.
    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1

    # Fill in the lemma and morphology of tokens[i] from its predicted tag.
    cdef int set_morph(self, const int i, TokenC* tokens) except -1

81
spacy/morphology.pyx Normal file
View File

@ -0,0 +1,81 @@
from os import path
import json
from .lemmatizer import Lemmatizer
# Maps the string name of each universal POS tag to its univ_tag_t value.
# Note the two names that differ from the enum member: 'NULL' -> NO_TAG and
# '.' -> PUNCT.
UNIV_TAGS = {
    'NULL': NO_TAG,
    'ADJ': ADJ,
    'ADV': ADV,
    'ADP': ADP,
    'CONJ': CONJ,
    'DET': DET,
    'NOUN': NOUN,
    'NUM': NUM,
    'PRON': PRON,
    'PRT': PRT,
    'VERB': VERB,
    'X': X,
    '.': PUNCT,
    'EOL': EOL
}
cdef class Morphologizer:
    """Given a POS tag and a Lexeme, find its lemma and morphological analysis.

    The tag table is built from the ``tag_map`` and ``tag_names`` entries of
    ``<data_dir>/pos/config.json``. Lemmas and morphological analyses are
    cached after they are first computed.
    """
    def __init__(self, StringStore strings, data_dir):
        """Load the tag configuration and set up the lemma/morphology caches.

        strings: the StringStore used to look up word strings and intern lemmas.
        data_dir: directory containing ``pos/config.json``; the lemmatizer data
            is read from ``<data_dir>/../wordnet``.
        """
        self.mem = Pool()
        self.strings = strings
        # Read the config inside a context manager so the file handle is
        # closed as soon as parsing finishes, instead of being leaked.
        with open(path.join(data_dir, 'pos', 'config.json')) as config_file:
            cfg = json.load(config_file)
        tag_map = cfg['tag_map']
        tag_names = cfg['tag_names']
        self.lemmatizer = Lemmatizer(path.join(data_dir, '..', 'wordnet'))
        # One lemma cache per universal tag, one morphology cache per tag id.
        self._lemmas = PreshMapArray(N_UNIV_TAGS)
        self._morph = PreshMapArray(len(tag_names))
        self.tags = <PosTag*>self.mem.alloc(len(tag_names), sizeof(PosTag))
        for i, tag in enumerate(tag_names):
            pos, props = tag_map[tag]
            self.tags[i].id = i
            self.tags[i].pos = pos
            # Any feature missing from the config defaults to 0.
            self.tags[i].morph.number = props.get('number', 0)
            self.tags[i].morph.tenspect = props.get('tenspect', 0)
            self.tags[i].morph.mood = props.get('mood', 0)
            self.tags[i].morph.gender = props.get('gender', 0)
            self.tags[i].morph.person = props.get('person', 0)
            self.tags[i].morph.case = props.get('case', 0)
            self.tags[i].morph.misc = props.get('misc', 0)

    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
        """Return the lemma id for ``lex`` when tagged as ``pos``.

        Returns ``lex.sic`` unchanged when no lemmatizer is set or when ``pos``
        is not NOUN, VERB or ADJ. Otherwise the result is looked up in (and
        stored to) the per-tag lemma cache.
        """
        if self.lemmatizer is None:
            return lex.sic
        if pos != NOUN and pos != VERB and pos != ADJ:
            return lex.sic
        # A stored value of 0 is indistinguishable from "not cached yet".
        cdef int lemma = <int><size_t>self._lemmas.get(pos, lex.sic)
        if lemma != 0:
            return lemma
        cdef bytes py_string = self.strings[lex.sic]
        cdef set lemma_strings
        cdef bytes lemma_string
        if pos == NOUN:
            lemma_strings = self.lemmatizer.noun(py_string)
        elif pos == VERB:
            lemma_strings = self.lemmatizer.verb(py_string)
        else:
            assert pos == ADJ
            lemma_strings = self.lemmatizer.adj(py_string)
        # Pick the first candidate in sorted order so the choice is
        # deterministic.
        # NOTE(review): this raises IndexError if the lemmatizer returns an
        # empty set -- confirm that it always yields at least one candidate.
        lemma_string = sorted(lemma_strings)[0]
        lemma = self.strings.intern(lemma_string, len(lemma_string)).i
        self._lemmas.set(pos, lex.sic, <void*>lemma)
        return lemma

    cdef int set_morph(self, const int i, TokenC* tokens) except -1:
        """Set the lemma and morphology of ``tokens[i]`` from its POS tag.

        ``tokens[i].pos`` is used as an index into the tag table.
        """
        cdef const PosTag* tag = &self.tags[tokens[i].pos]
        tokens[i].lemma = self.lemmatize(tag.pos, tokens[i].lex)
        # The morphology cache is keyed by (tag id, lemma).
        morph = <Morphology*>self._morph.get(tag.id, tokens[i].lemma)
        if morph is NULL:
            # First time this (tag, lemma) pair is seen: register the tag's
            # own morphology as the cached entry and copy it onto the token.
            self._morph.set(tag.id, tokens[i].lemma, <void*>&tag.morph)
            tokens[i].morph = tag.morph
        else:
            tokens[i].morph = morph[0]

View File

@ -12,31 +12,6 @@ from .typedefs cimport hash_t
from .tokens cimport Tokens, Morphology
# Google universal tag set
cdef enum univ_tag_t:
NO_TAG
ADJ
ADV
ADP
CONJ
DET
NOUN
NUM
PRON
PRT
VERB
X
PUNCT
EOL
N_UNIV_TAGS
cdef struct PosTag:
Morphology morph
int id
univ_tag_t pos
cdef class Tagger:
cdef class_t predict(self, const atom_t* context, object golds=*) except *
@ -45,5 +20,4 @@ cdef class Tagger:
cpdef readonly LinearModel model
cpdef readonly list tag_names
cdef PosTag* tags
cdef dict tagdict

View File

@ -34,23 +34,10 @@ cdef class Tagger:
self.mem = Pool()
cfg = json.load(open(path.join(model_dir, 'config.json')))
templates = cfg['templates']
tag_map = cfg['tag_map']
univ_counts = {}
cdef unicode tag
cdef unicode univ_tag
self.tag_names = cfg['tag_names']
self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag))
for i, tag in enumerate(self.tag_names):
pos, props = tag_map[tag]
self.tags[i].id = i
self.tags[i].pos = pos
self.tags[i].morph.number = props.get('number', 0)
self.tags[i].morph.tenspect = props.get('tenspect', 0)
self.tags[i].morph.mood = props.get('mood', 0)
self.tags[i].morph.gender = props.get('gender', 0)
self.tags[i].morph.person = props.get('person', 0)
self.tags[i].morph.case = props.get('case', 0)
self.tags[i].morph.misc = props.get('misc', 0)
self.tagdict = _make_tag_dict(cfg['tag_counts'])
self.extractor = Extractor(templates)
self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2)
@ -85,23 +72,6 @@ cdef class Tagger:
return tag_id
UNIV_TAGS = {
'NULL': NO_TAG,
'ADJ': ADJ,
'ADV': ADV,
'ADP': ADP,
'CONJ': CONJ,
'DET': DET,
'NOUN': NOUN,
'NUM': NUM,
'PRON': PRON,
'PRT': PRT,
'VERB': VERB,
'X': X,
'.': PUNCT,
'EOL': EOL
}
def _make_tag_dict(counts):
freq_thresh = 50