From 4e30195c6d61a2d6ad8aa59e44e0792046807877 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Dec 2014 07:27:28 +1100 Subject: [PATCH] * Refactor morphology.pyx --- spacy/morphology.pxd | 35 ++++------------------------------- spacy/morphology.pyx | 19 ++++++++++--------- 2 files changed, 14 insertions(+), 40 deletions(-) diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 9c5d342e9..a6f020159 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -1,36 +1,9 @@ - -from .tokens cimport TokenC -from .lexeme cimport Lexeme -from .utf8string cimport StringStore -from .typedefs cimport id_t, Morphology - -from preshed.maps cimport PreshMapArray from cymem.cymem cimport Pool +from preshed.maps cimport PreshMapArray - -# Google universal tag set -cpdef enum univ_tag_t: - NO_TAG - ADJ - ADV - ADP - CONJ - DET - NOUN - NUM - PRON - PRT - VERB - X - PUNCT - EOL - N_UNIV_TAGS - - -cdef struct PosTag: - Morphology morph - int id - univ_tag_t pos +from .structs cimport TokenC, Lexeme, Morphology, PosTag +from .strings cimport StringStore +from .typedefs cimport id_t, univ_tag_t cdef class Morphologizer: diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 4d3600f8b..30e4aef4c 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -4,7 +4,9 @@ from os import path import json from .lemmatizer import Lemmatizer -from .typedefs cimport id_t +from .typedefs cimport id_t, univ_tag_t +from .typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT +from .typedefs cimport VERB, X, PUNCT, EOL from . import util @@ -34,13 +36,12 @@ cdef struct _Cached: cdef class Morphologizer: """Given a POS tag and a Lexeme, find its lemma and morphological analysis. """ - def __init__(self, StringStore strings, data_dir): + def __init__(self, StringStore strings, object lemmatizer, **kwargs): self.mem = Pool() self.strings = strings - cfg = json.load(open(path.join(data_dir, 'config.json'))) - tag_map = cfg['tag_map'] - self.tag_names = cfg['tag_names'] - self.lemmatizer = Lemmatizer(path.join(util.DATA_DIR, 'wordnet')) + tag_map = kwargs['tag_map'] + self.tag_names = kwargs['tag_names'] + self.lemmatizer = lemmatizer self._cache = PreshMapArray(len(self.tag_names)) self.tags = self.mem.alloc(len(self.tag_names), sizeof(PosTag)) for i, tag in enumerate(self.tag_names): @@ -54,9 +55,9 @@ cdef class Morphologizer: self.tags[i].morph.person = props.get('person', 0) self.tags[i].morph.case = props.get('case', 0) self.tags[i].morph.misc = props.get('misc', 0) - if path.exists(path.join(data_dir, 'morphs.json')): - with open(path.join(data_dir, 'morphs.json')) as file_: - self.load_exceptions(json.load(file_)) + #if path.exists(path.join(data_dir, 'morphs.json')): + # with open(path.join(data_dir, 'morphs.json')) as file_: + # self.load_exceptions(json.load(file_)) cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1: if self.lemmatizer is None: