* Refactor morphology.pyx

This commit is contained in:
Matthew Honnibal 2014-12-20 07:27:28 +11:00
parent 4c6ce7ee84
commit 4e30195c6d
2 changed files with 14 additions and 40 deletions

View File

@ -1,36 +1,9 @@
from .tokens cimport TokenC
from .lexeme cimport Lexeme
from .utf8string cimport StringStore
from .typedefs cimport id_t, Morphology
from preshed.maps cimport PreshMapArray
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMapArray
# Google universal tag set
#
# Coarse-grained part-of-speech categories. The enum is declared `cpdef`,
# so its members are usable from C-level code and are also exposed to Python.
# No explicit values are assigned: members are numbered sequentially in
# declaration order, starting from NO_TAG = 0.
cpdef enum univ_tag_t:
NO_TAG
ADJ
ADV
ADP
CONJ
DET
NOUN
NUM
PRON
PRT
VERB
X
PUNCT
EOL
# Kept last on purpose: its value equals the number of members declared
# above it, so it can be used as a count. New tags go before this line.
N_UNIV_TAGS
# C-level record describing one part-of-speech tag: its morphological
# features, an integer id, and the universal tag category it belongs to.
cdef struct PosTag:
# Morphological feature bundle (the Morphology type is cimported above).
Morphology morph
# Integer identifier of the tag.
# NOTE(review): presumably the tag's index in the tagger's tag list - confirm.
int id
# Coarse universal POS category (a univ_tag_t member).
univ_tag_t pos
from .structs cimport TokenC, Lexeme, Morphology, PosTag
from .strings cimport StringStore
from .typedefs cimport id_t, univ_tag_t
cdef class Morphologizer:

View File

@ -4,7 +4,9 @@ from os import path
import json
from .lemmatizer import Lemmatizer
from .typedefs cimport id_t
from .typedefs cimport id_t, univ_tag_t
from .typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT
from .typedefs cimport VERB, X, PUNCT, EOL
from . import util
@ -34,13 +36,12 @@ cdef struct _Cached:
cdef class Morphologizer:
"""Given a POS tag and a Lexeme, find its lemma and morphological analysis.
"""
def __init__(self, StringStore strings, data_dir):
def __init__(self, StringStore strings, object lemmatizer, **kwargs):
self.mem = Pool()
self.strings = strings
cfg = json.load(open(path.join(data_dir, 'config.json')))
tag_map = cfg['tag_map']
self.tag_names = cfg['tag_names']
self.lemmatizer = Lemmatizer(path.join(util.DATA_DIR, 'wordnet'))
tag_map = kwargs['tag_map']
self.tag_names = kwargs['tag_names']
self.lemmatizer = lemmatizer
self._cache = PreshMapArray(len(self.tag_names))
self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag))
for i, tag in enumerate(self.tag_names):
@ -54,9 +55,9 @@ cdef class Morphologizer:
self.tags[i].morph.person = props.get('person', 0)
self.tags[i].morph.case = props.get('case', 0)
self.tags[i].morph.misc = props.get('misc', 0)
if path.exists(path.join(data_dir, 'morphs.json')):
with open(path.join(data_dir, 'morphs.json')) as file_:
self.load_exceptions(json.load(file_))
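# TODO: exception loading is disabled because __init__ no longer receives
# data_dir. Re-enable it once the caller passes in the morphs.json path
# (or its already-parsed contents).
#if path.exists(path.join(data_dir, 'morphs.json')):
# with open(path.join(data_dir, 'morphs.json')) as file_:
# self.load_exceptions(json.load(file_))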
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
if self.lemmatizer is None: