mirror of https://github.com/explosion/spaCy.git
* Move morphological analysis into its own module, morphology.pyx
parent b962fe73d7
commit 6b34a2f34b
spacy/en.pyx | 12
spacy/en.pyx
@@ -35,8 +35,8 @@ from __future__ import unicode_literals
 cimport lang
 from .typedefs cimport flags_t
 import orth
-from .tagger cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
-from .tagger cimport X, PUNCT, EOL
+from .morphology cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
+from .morphology cimport X, PUNCT, EOL

 from .tokens cimport Morphology

@@ -154,8 +154,8 @@ cdef class English(Language):
         for i in range(tokens.length):
             fill_pos_context(context, i, t)
             t[i].pos = self.pos_tagger.predict(context)
-            _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph)
-            t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex)
+            if self.morphologizer:
+                self.morphologizer.set_morph(i, t)

     def train_pos(self, Tokens tokens, golds):
         cdef int i
@@ -165,8 +165,8 @@ cdef class English(Language):
         for i in range(tokens.length):
             fill_pos_context(context, i, t)
             t[i].pos = self.pos_tagger.predict(context, [golds[i]])
-            _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph)
-            t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex)
+            if self.morphologizer:
+                self.morphologizer.set_morph(i, t)
             c += t[i].pos == golds[i]
         return c

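In both loops above, the inline _merge_morph and lemmatize calls are replaced by one optional Morphologizer.set_morph call per token. A minimal pure-Python sketch of the resulting control flow (Token and the function name are hypothetical stand-ins for the Cython types in this diff, not the real spaCy API):

    # Sketch of the new tagging loop: predict a tag, then optionally
    # hand the token to the morphologizer for lemma + features.
    class Token:
        def __init__(self, text):
            self.text = text
            self.pos = 0
            self.lemma = None
            self.morph = {}

    def tag_and_morphologize(tokens, pos_tagger, morphologizer=None):
        for i, token in enumerate(tokens):
            token.pos = pos_tagger.predict(token)      # coarse POS prediction
            if morphologizer is not None:              # morphology is optional:
                morphologizer.set_morph(i, tokens)     # it fills lemma + features
        return tokens

Keeping the morphologizer optional means tagging still works when no morphology data has been loaded.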
spacy/lang.pxd
@@ -2,15 +2,15 @@ from libcpp.vector cimport vector

 from cpython cimport Py_UNICODE_ISSPACE, Py_UNICODE_ISALPHA, Py_UNICODE_ISUPPER

-from preshed.maps cimport PreshMap, PreshMapArray
+from preshed.maps cimport PreshMap
 from cymem.cymem cimport Pool

 from .typedefs cimport hash_t
 from .tokens cimport Tokens, TokenC
 from .lexeme cimport Lexeme
 from .tagger cimport Tagger
-from .tagger cimport univ_tag_t
 from .utf8string cimport StringStore, UniStr
+from .morphology cimport Morphologizer


 cdef union LexemesOrTokens:
@@ -40,17 +40,14 @@ cdef class Language:
     cdef readonly unicode name
     cdef PreshMap _cache
     cdef PreshMap _specials
-    cdef PreshMapArray _lemmas
     cpdef readonly Lexicon lexicon
     cpdef readonly Tagger pos_tagger
-    cpdef readonly object lemmatizer
+    cpdef readonly Morphologizer morphologizer

     cdef object _prefix_re
     cdef object _suffix_re
     cdef object _infix_re

-    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
-
     cpdef Tokens tokens_from_list(self, list strings)
     cpdef Tokens tokenize(self, unicode text)

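This declaration change is the core of the refactor: Language drops its _lemmas cache and lemmatize method, and the bare lemmatizer attribute gives way to a Morphologizer that owns both. A rough pure-Python picture of the new composition (hypothetical, simplified; not the real Cython API):

    class Language:
        def __init__(self, name, lexicon):
            self.name = name
            self.lexicon = lexicon
            self.pos_tagger = None      # set lazily by load()
            self.morphologizer = None   # now owns the lemmatizer and lemma cache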
spacy/lang.pyx
@@ -14,7 +14,6 @@ from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
 from cymem.cymem cimport Pool
 from murmurhash.mrmr cimport hash64
 from preshed.maps cimport PreshMap
-from .lemmatizer import Lemmatizer

 from .lexeme cimport Lexeme
 from .lexeme cimport EMPTY_LEXEME
@@ -26,8 +25,6 @@ from .utf8string cimport slice_unicode
 from . import util
 from .util import read_lang_data
 from .tokens import Tokens

-from .tagger cimport NOUN, VERB, ADJ, N_UNIV_TAGS
-from .tokens cimport Morphology


@@ -43,39 +40,16 @@ cdef class Language:
         self._infix_re = re.compile(infix)
         self.lexicon = Lexicon(self.get_props)
         self._load_special_tokenization(rules)
-        self._lemmas = PreshMapArray(N_UNIV_TAGS)
         self.pos_tagger = None
-        self.lemmatizer = None
+        self.morphologizer = None

     def load(self):
-        self.lemmatizer = Lemmatizer(path.join(util.DATA_DIR, 'wordnet'))
         self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes'))
         self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings'))
         if path.exists(path.join(util.DATA_DIR, self.name, 'pos')):
             self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos'))
-
-    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
-        if self.lemmatizer is None:
-            return lex.sic
-        if pos != NOUN and pos != VERB and pos != ADJ:
-            return lex.sic
-        cdef int lemma = <int><size_t>self._lemmas.get(pos, lex.sic)
-        if lemma != 0:
-            return lemma
-        cdef bytes py_string = self.lexicon.strings[lex.sic]
-        cdef set lemma_strings
-        cdef bytes lemma_string
-        if pos == NOUN:
-            lemma_strings = self.lemmatizer.noun(py_string)
-        elif pos == VERB:
-            lemma_strings = self.lemmatizer.verb(py_string)
-        else:
-            assert pos == ADJ
-            lemma_strings = self.lemmatizer.adj(py_string)
-        lemma_string = sorted(lemma_strings)[0]
-        lemma = self.lexicon.strings.intern(lemma_string, len(lemma_string)).i
-        self._lemmas.set(pos, lex.sic, <void*>lemma)
-        return lemma
+            self.morphologizer = Morphologizer(self.lexicon.strings,
+                                               path.join(util.DATA_DIR, self.name))

     cpdef Tokens tokens_from_list(self, list strings):
         cdef int length = sum([len(s) for s in strings])

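With lemmatize gone, load() wires the Morphologizer up next to the Tagger, so morphology data is only touched when a POS model directory exists, and Language no longer builds a Lemmatizer at all. A pure-Python sketch of that wiring, assuming the data layout used in this diff (DATA_DIR/<name>/pos); the function and parameter names are stand-ins:

    import os

    def load_language(lang, data_dir, tagger_cls, morphologizer_cls):
        # Lexicon data loads unconditionally, following the diff's paths.
        lang.lexicon.load(os.path.join(data_dir, lang.name, 'lexemes'))
        lang.lexicon.strings.load(os.path.join(data_dir, lang.name, 'strings'))
        pos_dir = os.path.join(data_dir, lang.name, 'pos')
        if os.path.exists(pos_dir):
            lang.pos_tagger = tagger_cls(pos_dir)
            # The Morphologizer now constructs its own Lemmatizer internally.
            lang.morphologizer = morphologizer_cls(lang.lexicon.strings,
                                                   os.path.join(data_dir, lang.name))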
spacy/morphology.pxd
@@ -0,0 +1,42 @@
+from .tokens cimport TokenC, Morphology
+from .lexeme cimport Lexeme
+from .utf8string cimport StringStore
+
+from preshed.maps cimport PreshMapArray
+from cymem.cymem cimport Pool
+
+# Google universal tag set
+cpdef enum univ_tag_t:
+    NO_TAG
+    ADJ
+    ADV
+    ADP
+    CONJ
+    DET
+    NOUN
+    NUM
+    PRON
+    PRT
+    VERB
+    X
+    PUNCT
+    EOL
+    N_UNIV_TAGS
+
+
+cdef struct PosTag:
+    Morphology morph
+    int id
+    univ_tag_t pos
+
+
+cdef class Morphologizer:
+    cdef Pool mem
+    cdef StringStore strings
+    cdef object lemmatizer
+    cdef PosTag* tags
+
+    cdef PreshMapArray _morph
+    cdef PreshMapArray _lemmas
+    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
+    cdef int set_morph(self, const int i, TokenC* tokens) except -1
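The new header centralises the universal tag set and the PosTag struct, which pairs a tag id with its universal POS and a fixed bundle of morphological features. A hypothetical example of how one tag_map entry from config.json would populate a PosTag; the entries and feature values here are invented for illustration, and univ_ids plays the role of the UNIV_TAGS table in morphology.pyx below:

    # Hypothetical tag_map entries: fine-grained tag -> (universal POS, features).
    tag_map = {
        'NNS': ('NOUN', {'number': 2}),      # e.g. plural noun
        'VBD': ('VERB', {'tenspect': 2}),    # e.g. past tense
        'JJ':  ('ADJ',  {}),
    }

    def make_pos_tag(i, name, tag_map, univ_ids):
        pos_name, props = tag_map[name]
        features = ('number', 'tenspect', 'mood', 'gender',
                    'person', 'case', 'misc')
        return {
            'id': i,                         # PosTag.id: index into the tag list
            'pos': univ_ids[pos_name],       # PosTag.pos: univ_tag_t value
            'morph': {k: props.get(k, 0) for k in features},  # PosTag.morph
        }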
spacy/morphology.pyx
@@ -0,0 +1,81 @@
+from os import path
+import json
+
+from .lemmatizer import Lemmatizer
+
+
+UNIV_TAGS = {
+    'NULL': NO_TAG,
+    'ADJ': ADJ,
+    'ADV': ADV,
+    'ADP': ADP,
+    'CONJ': CONJ,
+    'DET': DET,
+    'NOUN': NOUN,
+    'NUM': NUM,
+    'PRON': PRON,
+    'PRT': PRT,
+    'VERB': VERB,
+    'X': X,
+    '.': PUNCT,
+    'EOL': EOL
+}
+
+
+cdef class Morphologizer:
+    """Given a POS tag and a Lexeme, find its lemma and morphological analysis.
+    """
+    def __init__(self, StringStore strings, data_dir):
+        self.mem = Pool()
+        self.strings = strings
+        cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
+        tag_map = cfg['tag_map']
+        tag_names = cfg['tag_names']
+        self.lemmatizer = Lemmatizer(path.join(data_dir, '..', 'wordnet'))
+        self._lemmas = PreshMapArray(N_UNIV_TAGS)
+        self._morph = PreshMapArray(len(tag_names))
+        self.tags = <PosTag*>self.mem.alloc(len(tag_names), sizeof(PosTag))
+        for i, tag in enumerate(tag_names):
+            pos, props = tag_map[tag]
+            self.tags[i].id = i
+            self.tags[i].pos = pos
+            self.tags[i].morph.number = props.get('number', 0)
+            self.tags[i].morph.tenspect = props.get('tenspect', 0)
+            self.tags[i].morph.mood = props.get('mood', 0)
+            self.tags[i].morph.gender = props.get('gender', 0)
+            self.tags[i].morph.person = props.get('person', 0)
+            self.tags[i].morph.case = props.get('case', 0)
+            self.tags[i].morph.misc = props.get('misc', 0)
+
+    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
+        if self.lemmatizer is None:
+            return lex.sic
+        if pos != NOUN and pos != VERB and pos != ADJ:
+            return lex.sic
+        cdef int lemma = <int><size_t>self._lemmas.get(pos, lex.sic)
+        if lemma != 0:
+            return lemma
+        cdef bytes py_string = self.strings[lex.sic]
+        cdef set lemma_strings
+        cdef bytes lemma_string
+        if pos == NOUN:
+            lemma_strings = self.lemmatizer.noun(py_string)
+        elif pos == VERB:
+            lemma_strings = self.lemmatizer.verb(py_string)
+        else:
+            assert pos == ADJ
+            lemma_strings = self.lemmatizer.adj(py_string)
+        lemma_string = sorted(lemma_strings)[0]
+        lemma = self.strings.intern(lemma_string, len(lemma_string)).i
+        self._lemmas.set(pos, lex.sic, <void*>lemma)
+        return lemma
+
+    cdef int set_morph(self, const int i, TokenC* tokens) except -1:
+        cdef const PosTag* tag = &self.tags[tokens[i].pos]
+        tokens[i].lemma = self.lemmatize(tag.pos, tokens[i].lex)
+        morph = <Morphology*>self._morph.get(tag.id, tokens[i].lemma)
+        if morph is NULL:
+            self._morph.set(tag.id, tokens[i].lemma, <void*>&tag.morph)
+            tokens[i].morph = tag.morph
+        else:
+            tokens[i].morph = morph[0]
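Morphologizer.lemmatize memoizes lemmas per (universal POS, lexeme id) in _lemmas, consults the WordNet lemmatizer only for nouns, verbs and adjectives, and picks the alphabetically first candidate. The same logic as a pure-Python sketch (class and attribute names are hypothetical stand-ins):

    class LemmaCache:
        """Sketch of the lemma handling above, not the real Cython API."""
        def __init__(self, strings, lemmatizer):
            self.strings = strings        # StringStore-like id <-> string table
            self.lemmatizer = lemmatizer  # has .noun/.verb/.adj -> set of strings
            self._lemmas = {}             # (pos, word id) -> lemma id

        def lemmatize(self, pos, sic):
            if pos not in ('noun', 'verb', 'adj'):
                return sic                # other tags: the form is its own lemma
            key = (pos, sic)
            if key in self._lemmas:       # cache hit, like _lemmas.get in the diff
                return self._lemmas[key]
            candidates = getattr(self.lemmatizer, pos)(self.strings[sic])
            lemma_string = sorted(candidates)[0]   # deterministic pick, as above
            lemma = self.strings.intern(lemma_string)
            self._lemmas[key] = lemma
            return lemma

The sorted()[0] choice mirrors the diff: WordNet can return several lemma candidates, and taking the lexicographically smallest keeps the result stable across runs.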
spacy/tagger.pxd
@@ -12,31 +12,6 @@ from .typedefs cimport hash_t
 from .tokens cimport Tokens, Morphology
-
-
-# Google universal tag set
-cdef enum univ_tag_t:
-    NO_TAG
-    ADJ
-    ADV
-    ADP
-    CONJ
-    DET
-    NOUN
-    NUM
-    PRON
-    PRT
-    VERB
-    X
-    PUNCT
-    EOL
-    N_UNIV_TAGS
-
-
-cdef struct PosTag:
-    Morphology morph
-    int id
-    univ_tag_t pos


 cdef class Tagger:
     cdef class_t predict(self, const atom_t* context, object golds=*) except *

@@ -45,5 +20,4 @@ cdef class Tagger:
     cpdef readonly LinearModel model

     cpdef readonly list tag_names
-    cdef PosTag* tags
     cdef dict tagdict
spacy/tagger.pyx
@@ -34,23 +34,10 @@ cdef class Tagger:
         self.mem = Pool()
         cfg = json.load(open(path.join(model_dir, 'config.json')))
         templates = cfg['templates']
-        tag_map = cfg['tag_map']
-        univ_counts = {}
-        cdef unicode tag
-        cdef unicode univ_tag
         self.tag_names = cfg['tag_names']
-        self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag))
-        for i, tag in enumerate(self.tag_names):
-            pos, props = tag_map[tag]
-            self.tags[i].id = i
-            self.tags[i].pos = pos
-            self.tags[i].morph.number = props.get('number', 0)
-            self.tags[i].morph.tenspect = props.get('tenspect', 0)
-            self.tags[i].morph.mood = props.get('mood', 0)
-            self.tags[i].morph.gender = props.get('gender', 0)
-            self.tags[i].morph.person = props.get('person', 0)
-            self.tags[i].morph.case = props.get('case', 0)
-            self.tags[i].morph.misc = props.get('misc', 0)
         self.tagdict = _make_tag_dict(cfg['tag_counts'])
         self.extractor = Extractor(templates)
         self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2)
@@ -85,23 +72,6 @@ cdef class Tagger:
         return tag_id


-UNIV_TAGS = {
-    'NULL': NO_TAG,
-    'ADJ': ADJ,
-    'ADV': ADV,
-    'ADP': ADP,
-    'CONJ': CONJ,
-    'DET': DET,
-    'NOUN': NOUN,
-    'NUM': NUM,
-    'PRON': PRON,
-    'PRT': PRT,
-    'VERB': VERB,
-    'X': X,
-    '.': PUNCT,
-    'EOL': EOL
-}
-

 def _make_tag_dict(counts):
     freq_thresh = 50
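The visible tail of tagger.pyx keeps _make_tag_dict, whose freq_thresh = 50 suggests a frequency-gated tag dictionary: frequent, nearly unambiguous words bypass the model and get their majority tag directly. A sketch of that pattern, an assumption based only on the threshold shown here (the ambiguity cutoff and all names are invented), not on code in this diff:

    def make_tag_dict(counts, freq_thresh=50, ambiguity_thresh=0.98):
        # counts: word -> {tag: count}. Keep only words seen often enough
        # whose counts are dominated by a single tag.
        tagdict = {}
        for word, tag_counts in counts.items():
            total = sum(tag_counts.values())
            tag, n = max(tag_counts.items(), key=lambda kv: kv[1])
            if total >= freq_thresh and n / total >= ambiguity_thresh:
                tagdict[word] = tag
        return tagdict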