mirror of https://github.com/explosion/spaCy.git
* Work on morphological processing
This commit is contained in:
parent 7b68f911cf
commit 99bbbb6feb

spacy/en.pxd (51 lines changed)
@@ -5,6 +5,57 @@ from .tokens cimport Tokens
from .tokens cimport TokenC


cpdef enum en_person_t:
    NO_PERSON
    FIRST
    SECOND
    THIRD


cpdef enum en_number_t:
    NO_NUMBER
    SINGULAR
    PLURAL
    MASS
    CARDINAL
    ORDINAL


cpdef enum en_gender_t:
    NO_GENDER
    MASCULINE
    FEMININE


cpdef enum en_tenspect_t:
    NO_TENSE
    BASE_VERB
    PRESENT
    PAST
    PASSIVE
    ING
    MODAL


cpdef enum en_case_t:
    NO_CASE
    NOMINATIVE
    ACCUSATIVE
    GENITIVE
    DEMONYM


cpdef enum misc_t:
    NO_MISC
    COMPARATIVE
    SUPERLATIVE
    RELATIVE
    NAME
    URL
    EMAIL
    EMOTICON


# Flags
cpdef enum FlagID:
    IS_ALPHA
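The new enums carve English morphology into small closed sets of integer codes, one dimension per enum, with the NO_* member listed first so that 0 always means "unmarked". A minimal Python sketch of how such codes combine into a feature bundle (illustrative only; the real values are C-level enum members assigned in declaration order):

    # Rough Python mirrors of two of the cpdef enums above (an illustration, not the real API).
    NO_PERSON, FIRST, SECOND, THIRD = range(4)
    NO_NUMBER, SINGULAR, PLURAL, MASS, CARDINAL, ORDINAL = range(6)

    features = {'person': THIRD, 'number': SINGULAR}
    print(features.get('gender', 0))   # 0 -> no gender marked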
spacy/en.pyx (73 lines changed)

@@ -35,6 +35,63 @@ from __future__ import unicode_literals
cimport lang
from .typedefs cimport flags_t
import orth
from .tagger cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
from .tagger cimport X, PUNCT, EOL


POS_TAGS = {
    'NULL': (NO_TAG, {}),
    'EOL': (EOL, {}),
    'CC': (CONJ, {}),
    'CD': (NUM, {}),
    'DT': (DET, {}),
    'EX': (DET, {}),
    'FW': (X, {}),
    'IN': (ADP, {}),
    'JJ': (ADJ, {}),
    'JJR': (ADJ, {'misc': COMPARATIVE}),
    'JJS': (ADJ, {'misc': SUPERLATIVE}),
    'LS': (X, {}),
    'MD': (VERB, {'tenspect': MODAL}),
    'NN': (NOUN, {}),
    'NNS': (NOUN, {'number': PLURAL}),
    'NNP': (NOUN, {'misc': NAME}),
    'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}),
    'PDT': (DET, {}),
    'POS': (PRT, {'case': GENITIVE}),
    'PRP': (NOUN, {}),
    'PRP$': (NOUN, {'case': GENITIVE}),
    'RB': (ADV, {}),
    'RBR': (ADV, {'misc': COMPARATIVE}),
    'RBS': (ADV, {'misc': SUPERLATIVE}),
    'RP': (PRT, {}),
    'SYM': (X, {}),
    'TO': (PRT, {}),
    'UH': (X, {}),
    'VB': (VERB, {}),
    'VBD': (VERB, {'tenspect': PAST}),
    'VBG': (VERB, {'tenspect': ING}),
    'VBN': (VERB, {'tenspect': PASSIVE}),
    'VBP': (VERB, {'tenspect': PRESENT}),
    'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),
    'WDT': (DET, {'misc': RELATIVE}),
    'WP': (PRON, {'misc': RELATIVE}),
    'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}),
    'WRB': (ADV, {'misc': RELATIVE}),
    '!': (PUNCT, {}),
    '#': (PUNCT, {}),
    '$': (PUNCT, {}),
    "''": (PUNCT, {}),
    "(": (PUNCT, {}),
    ")": (PUNCT, {}),
    "-LRB-": (PUNCT, {}),
    "-RRB-": (PUNCT, {}),
    ".": (PUNCT, {}),
    ",": (PUNCT, {}),
    "``": (PUNCT, {}),
    ":": (PUNCT, {}),
    "?": (PUNCT, {}),
}


POS_TEMPLATES = (
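POS_TAGS maps each Penn Treebank tag to a pair of (Google universal tag, morphological features implied by the fine-grained tag); features a tag does not specify are simply absent and later default to 0. A small Python sketch of the lookup, with made-up integer stand-ins for the cimported constants:

    VERB, PRESENT, THIRD = 10, 2, 3   # stand-ins for the cimported enum constants
    POS_TAGS = {
        'VB': (VERB, {}),
        'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),
    }
    pos, props = POS_TAGS['VBZ']
    assert pos == VERB                                # coarse universal tag
    assert props.get('person', 0) == THIRD            # fine-grained feature
    assert POS_TAGS['VB'][1].get('person', 0) == 0    # unspecified features stay 0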
@@ -91,19 +148,25 @@ cdef class English(Language):
    def set_pos(self, Tokens tokens):
        cdef int i
        cdef atom_t[N_CONTEXT_FIELDS] context
        cdef TokenC* t = tokens.data
        for i in range(tokens.length):
-           fill_pos_context(context, i, tokens.data)
-           tokens.data[i].pos = self.pos_tagger.predict(context)
+           fill_pos_context(context, i, t)
+           t[i].pos = self.pos_tagger.predict(context)
+           #self.morphalyser.set_token(&t[i])

    def train_pos(self, Tokens tokens, golds):
        cdef int i
        cdef atom_t[N_CONTEXT_FIELDS] context
        c = 0
        cdef TokenC* t = tokens.data
        for i in range(tokens.length):
-           fill_pos_context(context, i, tokens.data)
-           tokens.data[i].pos = self.pos_tagger.predict(context, [golds[i]])
-           c += tokens.data[i].pos == golds[i]
+           fill_pos_context(context, i, t)
+           t[i].pos = self.pos_tagger.predict(context, [golds[i]])
+           t[i].morph = self.pos_tagger.tags[t[i].pos].morph
+           #self.analyse_morph(&t[i].lemma, &t[i].morph, t[i].pos, t[i].lex)
+           c += t[i].pos == golds[i]
        return c


EN = English('en')
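train_pos now writes both the predicted tag and that tag's morphology onto each token, and returns the number of correct predictions. A hedged usage sketch of a driver loop (the training data and its shape are placeholders, and EN.load() is assumed to have been called so that EN.pos_tagger exists):

    # `sents` is hypothetical data: a list of (words, gold_tag_ids) pairs.
    n_correct = 0
    n_total = 0
    for words, gold_tags in sents:
        tokens = EN.tokens_from_list(words)
        n_correct += EN.train_pos(tokens, gold_tags)   # count of correctly tagged tokens
        n_total += len(gold_tags)
    print('tag accuracy: %.3f' % (n_correct / float(n_total)))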
@@ -2,20 +2,20 @@ from libcpp.vector cimport vector
from cpython cimport Py_UNICODE_ISSPACE, Py_UNICODE_ISALPHA, Py_UNICODE_ISUPPER

-from preshed.maps cimport PreshMap
+from preshed.maps cimport PreshMap, PreshMapArray
from cymem.cymem cimport Pool

from .typedefs cimport hash_t
from .tokens cimport Tokens, TokenC
from .lexeme cimport Lexeme
from .tagger cimport Tagger
from .tagger cimport PosTag
from .utf8string cimport StringStore, UniStr


cdef class Lexicon:
    cpdef public get_lex_props
    cdef Pool mem
    cpdef readonly size_t size
    cpdef readonly StringStore strings
    cdef vector[Lexeme*] lexemes


@@ -29,13 +29,17 @@ cdef class Language:
    cdef readonly unicode name
    cdef PreshMap _cache
    cdef PreshMap _specials
    cdef PreshMapArray _lemmas
    cpdef readonly Lexicon lexicon
    cpdef readonly Tagger pos_tagger
    cpdef readonly object lemmatizer

    cdef object _prefix_re
    cdef object _suffix_re
    cdef object _infix_re

    cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1

    cpdef Tokens tokens_from_list(self, list strings)
    cpdef Tokens tokenize(self, unicode text)
@@ -14,6 +14,7 @@ from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64
from preshed.maps cimport PreshMap
from .lemmatizer import Lemmatizer

from .lexeme cimport Lexeme
from .lexeme cimport EMPTY_LEXEME

@@ -26,6 +27,8 @@ from . import util
from .util import read_lang_data
from .tokens import Tokens

from .tagger cimport NOUN, VERB, ADJ, N_UNIV_TAGS


cdef class Language:
    def __init__(self, name):

@@ -39,14 +42,40 @@ cdef class Language:
        self._infix_re = re.compile(infix)
        self.lexicon = Lexicon(self.get_props)
        self._load_special_tokenization(rules)
        self._lemmas = PreshMapArray(N_UNIV_TAGS)
        self.pos_tagger = None
        self.lemmatizer = None

    def load(self):
        self.lemmatizer = Lemmatizer(path.join(util.DATA_DIR, 'wordnet'))
        self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes'))
        self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings'))
        if path.exists(path.join(util.DATA_DIR, self.name, 'pos')):
            self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos'))

    cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1:
        if self.lemmatizer is None:
            return lex.sic
        if pos.pos != NOUN and pos.pos != VERB and pos.pos != ADJ:
            return lex.sic
        cdef int lemma = <int><size_t>self._lemmas.get(pos.pos, lex.sic)
        if lemma != 0:
            return lemma
        cdef bytes py_string = self.lexicon.strings[lex.sic]
        cdef set lemma_strings
        cdef bytes lemma_string
        if pos.pos == NOUN:
            lemma_strings = self.lemmatizer.noun(py_string)
        elif pos.pos == VERB:
            lemma_strings = self.lemmatizer.verb(py_string)
        else:
            assert pos.pos == ADJ
            lemma_strings = self.lemmatizer.adj(py_string)
        lemma_string = sorted(lemma_strings)[0]
        lemma = self.lexicon.strings.intern(lemma_string, len(lemma_string)).i
        self._lemmas.set(pos.pos, lex.sic, <void*>lemma)
        return lemma

    cpdef Tokens tokens_from_list(self, list strings):
        cdef int length = sum([len(s) for s in strings])
        cdef Tokens tokens = Tokens(self.lexicon.strings, length)
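The new Language.lemmatize only lemmatizes nouns, verbs and adjectives, caches results in the per-POS _lemmas array keyed by the token's string id, and interns the chosen lemma into the string store. A plain-Python sketch of the same control flow (the WordNet-backed lemmatizer and the string store are stubbed, which is an assumption; a dict stands in for the PreshMapArray):

    NOUN, VERB, ADJ = 1, 2, 3   # placeholder universal tag ids

    class LemmaCacheSketch(object):
        """Dict-based stand-in for the (pos, string id) -> lemma id cache."""
        def __init__(self, lemmatizer, strings):
            self.lemmatizer = lemmatizer   # has .noun/.verb/.adj -> set of candidate strings
            self.strings = strings         # maps id -> string; .intern(string) -> id
            self._lemmas = {}
        def lemmatize(self, pos, sic):
            if pos not in (NOUN, VERB, ADJ):
                return sic                 # other parts of speech keep their surface form
            key = (pos, sic)
            if key in self._lemmas:
                return self._lemmas[key]
            word = self.strings[sic]
            forms = {NOUN: self.lemmatizer.noun,
                     VERB: self.lemmatizer.verb,
                     ADJ: self.lemmatizer.adj}[pos](word)
            lemma = self.strings.intern(sorted(forms)[0])   # deterministic choice
            self._lemmas[key] = lemma
            return lemma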
@@ -254,9 +283,11 @@ cdef class Lexicon:
        self._map = PreshMap(2 ** 20)
        self.strings = StringStore()
        self.lexemes.push_back(&EMPTY_LEXEME)
        self.size = 2
        self.get_lex_props = get_props

    def __len__(self):
        return self.lexemes.size()

    cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL:
        '''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme
        if necessary, using memory acquired from the given pool. If the pool

@@ -269,14 +300,13 @@ cdef class Lexicon:
            mem = self.mem
        cdef unicode py_string = string.chars[:string.n]
        lex = <Lexeme*>mem.alloc(sizeof(Lexeme), 1)
-       lex[0] = lexeme_init(self.size, py_string, string.key, self.strings,
+       lex[0] = lexeme_init(self.lexemes.size(), py_string, string.key, self.strings,
                             self.get_lex_props(py_string))
        if mem is self.mem:
            self._map.set(string.key, lex)
            while self.lexemes.size() < (lex.id + 1):
                self.lexemes.push_back(&EMPTY_LEXEME)
            self.lexemes[lex.id] = lex
            self.size += 1
        else:
            lex[0].id = 1
        return lex

@@ -302,6 +332,8 @@ cdef class Lexicon:
        a dict if the operator is called from Python.
        '''
        if type(id_or_string) == int:
            if id_or_string >= self.lexemes.size():
                raise IndexError
            return self.lexemes.at(id_or_string)[0]
        cdef UniStr string
        slice_unicode(&string, id_or_string, 0, len(id_or_string))

@@ -359,5 +391,4 @@ cdef class Lexicon:
            self.lexemes.push_back(&EMPTY_LEXEME)
        self.lexemes[lexeme.id] = lexeme
        i += 1
        self.size += 1
        fclose(fp)
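Lexicon.get now takes new ids from lexemes.size() and pads the dense vector with EMPTY_LEXEME until the new id fits, so the id doubles as the index into the vector. A small Python sketch of that growth pattern (a list and a None sentinel stand in for the C++ vector and EMPTY_LEXEME):

    EMPTY = None                 # stand-in for &EMPTY_LEXEME

    lexemes = [EMPTY]            # slot 0 stays empty, mirroring the constructor
    def add_lexeme(lex_id, lex):
        # Pad until index lex_id exists, then fill it, so lexemes[lex_id]
        # is always a valid lookup afterwards.
        while len(lexemes) < lex_id + 1:
            lexemes.append(EMPTY)
        lexemes[lex_id] = lex

    add_lexeme(3, {'string': 'example'})
    assert lexemes[3]['string'] == 'example' and lexemes[2] is EMPTY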
@@ -53,6 +53,7 @@ class Lemmatizer(object):


def lemmatize(string, index, exceptions, rules):
    string = string.lower()
    forms = []
    if string in index:
        forms.append(string)

@@ -62,6 +63,8 @@ def lemmatize(string, index, exceptions, rules):
            form = string[:len(string) - len(old)] + new
            if form in index:
                forms.append(form)
    if not forms:
        forms.append(string)
    return set(forms)
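lemmatize gathers candidates from the index of known lemmas, the exceptions table, and suffix-rewrite rules, and falls back to the lowercased input when nothing matches, so it always yields at least one form. A toy, self-contained sketch of the rule step (the exceptions table is omitted, the endswith check is one plausible reading of the rule loop not shown in this hunk, and the index/rules below are made-up data, not spaCy's WordNet tables):

    def lemmatize_sketch(string, index, rules):
        string = string.lower()
        forms = []
        if string in index:
            forms.append(string)
        for old, new in rules:                        # suffix rewrite rules
            if string.endswith(old):
                form = string[:len(string) - len(old)] + new
                if form in index:                     # only keep known words
                    forms.append(form)
        if not forms:
            forms.append(string)                      # fall back to the input
        return set(forms)

    print(lemmatize_sketch('Strips', {'strip', 'stripe'}, [('s', '')]))
    # prints the single candidate 'strip'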
@@ -147,6 +147,7 @@ Y PRT
Z NOUN
^ NOUN
~ X
-`` .""".strip().split('\n'))
+`` .
+EOL EOL""".strip().split('\n'))
    return mapping[tag]
@@ -1,11 +1,40 @@
from libc.stdint cimport uint8_t

from cymem.cymem cimport Pool

from thinc.learner cimport LinearModel
from thinc.features cimport Extractor
from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t

from preshed.maps cimport PreshMapArray

from .typedefs cimport hash_t
-from .tokens cimport Tokens
+from .tokens cimport Tokens, Morphology


# Google universal tag set
cdef enum univ_tag_t:
    NO_TAG
    ADJ
    ADV
    ADP
    CONJ
    DET
    NOUN
    NUM
    PRON
    PRT
    VERB
    X
    PUNCT
    EOL
    N_UNIV_TAGS


cdef struct PosTag:
    Morphology morph
    int id
    univ_tag_t pos


cdef class Tagger:

@@ -16,4 +45,5 @@ cdef class Tagger:
    cpdef readonly LinearModel model

    cpdef readonly list tag_names
    cdef PosTag* tags
    cdef dict tagdict
@@ -12,13 +12,14 @@ import cython
from thinc.features cimport Feature, count_feats


-def setup_model_dir(tag_names, tag_counts, templates, model_dir):
+def setup_model_dir(tag_names, tag_map, tag_counts, templates, model_dir):
    if path.exists(model_dir):
        shutil.rmtree(model_dir)
    os.mkdir(model_dir)
    config = {
        'templates': templates,
        'tag_names': tag_names,
        'tag_map': tag_map,
        'tag_counts': tag_counts,
    }
    with open(path.join(model_dir, 'config.json'), 'w') as file_:

@@ -33,16 +34,31 @@ cdef class Tagger:
        self.mem = Pool()
        cfg = json.load(open(path.join(model_dir, 'config.json')))
        templates = cfg['templates']
        tag_map = cfg['tag_map']
        univ_counts = {}
        cdef unicode tag
        cdef unicode univ_tag
        self.tag_names = cfg['tag_names']
        self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag))
        for i, tag in enumerate(self.tag_names):
            pos, props = tag_map[tag]
            self.tags[i].id = i
            self.tags[i].pos = pos
            self.tags[i].morph.number = props.get('number', 0)
            self.tags[i].morph.tenspect = props.get('tenspect', 0)
            self.tags[i].morph.mood = props.get('mood', 0)
            self.tags[i].morph.gender = props.get('gender', 0)
            self.tags[i].morph.person = props.get('person', 0)
            self.tags[i].morph.case = props.get('case', 0)
            self.tags[i].morph.misc = props.get('misc', 0)
        self.tagdict = _make_tag_dict(cfg['tag_counts'])
        self.extractor = Extractor(templates)
        self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2)
        if path.exists(path.join(model_dir, 'model')):
            self.model.load(path.join(model_dir, 'model'))

-   cdef class_t predict(self, const atom_t* context, object golds=None) except *:
-       """Predict the tag of tokens[i]. The tagger remembers the features and
-       prediction, in case you later call tell_answer.
+   cdef class_t predict(self, atom_t* context, object golds=None) except *:
+       """Predict the tag of tokens[i].

        >>> tokens = EN.tokenize(u'An example sentence.')
        >>> tag = EN.pos_tagger.predict(0, tokens)
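setup_model_dir now stores the tag map next to the tag names, counts and feature templates, and Tagger.__init__ reads it back to fill the PosTag array. A sketch of the kind of dict that ends up in config.json (every concrete value below is invented for illustration, and the exact shape of tag_counts is an assumption):

    config = {
        'templates': [(1,), (2,), (1, 2)],                 # feature template ids (made up)
        'tag_names': ['NULL', 'EOL', 'CC', 'DT', 'VBZ'],
        'tag_map': {'DT': (5, {}),                         # (universal tag code, morph features)
                    'VBZ': (10, {'tenspect': 2, 'person': 3})},
        'tag_counts': {'the': {'DT': 12345}},              # per-word tag frequencies (assumed)
    }
    # json.dump(config, file_) would then round-trip this through config.json.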
@@ -69,6 +85,24 @@ cdef class Tagger:
        return tag_id


UNIV_TAGS = {
    'NULL': NO_TAG,
    'ADJ': ADJ,
    'ADV': ADV,
    'ADP': ADP,
    'CONJ': CONJ,
    'DET': DET,
    'NOUN': NOUN,
    'NUM': NUM,
    'PRON': PRON,
    'PRT': PRT,
    'VERB': VERB,
    'X': X,
    '.': PUNCT,
    'EOL': EOL
}


def _make_tag_dict(counts):
    freq_thresh = 50
    ambiguity_thresh = 0.98
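_make_tag_dict is only partially visible here, but the two thresholds suggest the usual trick of pre-tagging unambiguous words: a word gets a fixed tag only if it was seen at least 50 times and one tag accounts for at least 98% of its occurrences. A hedged sketch of that reading (the body is not shown in this hunk, and the counts format, word -> {tag: count}, is an assumption):

    def make_tag_dict_sketch(counts, freq_thresh=50, ambiguity_thresh=0.98):
        tagdict = {}
        for word, tag_freqs in counts.items():
            tag, mode = max(tag_freqs.items(), key=lambda kv: kv[1])
            n = sum(tag_freqs.values())
            # Only trust words that are frequent and (almost) unambiguous.
            if n >= freq_thresh and float(mode) / n >= ambiguity_thresh:
                tagdict[word] = tag
        return tagdict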
@@ -5,14 +5,29 @@ from cymem.cymem cimport Pool
from thinc.typedefs cimport atom_t

from .lexeme cimport Lexeme

from .typedefs cimport flags_t
from .utf8string cimport StringStore
from libc.stdint cimport uint8_t, uint16_t


cdef struct Morphology:
    uint8_t number
    uint8_t tenspect    # Tense/aspect/voice
    uint8_t mood
    uint8_t gender
    uint8_t person
    uint8_t case
    uint8_t misc


cdef struct TokenC:
    const Lexeme* lex
    Morphology morph
    int idx
    int pos
    int lemma
    int sense


@@ -37,7 +52,7 @@ cdef class Token:
    cdef public int i
    cdef public int idx
    cdef public int pos
-   cdef public int ner
+   cdef int lemma

    cdef public atom_t id
    cdef public atom_t cluster
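Each token now carries a fixed-size Morphology record of single-byte fields, filled from the tag map when the tagger assigns a part of speech. A Python sketch of the same record and how a 'VBZ' tag would populate it (the numeric codes are assumed to follow the declaration order of the en.pxd enums, where 0 always means "no value"):

    from collections import namedtuple

    # Mirror of the C struct: seven uint8 slots, one per morphological dimension.
    Morphology = namedtuple('Morphology',
                            'number tenspect mood gender person case misc')

    PRESENT, THIRD = 2, 3   # assumed codes from en_tenspect_t / en_person_t
    props = {'tenspect': PRESENT, 'person': THIRD}   # from POS_TAGS['VBZ'] in en.pyx
    morph = Morphology(*[props.get(f, 0) for f in Morphology._fields])
    print(morph.person)   # 3; unset fields stay 0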
@@ -51,7 +51,7 @@ cdef class Tokens:
    def __getitem__(self, i):
        bounds_check(i, self.length, PADDING)
        return Token(self._string_store, i, self.data[i].idx, self.data[i].pos,
-                    self.data[i].sense, self.data[i].lex[0])
+                    self.data[i].lemma, self.data[i].lex[0])

    def __iter__(self):
        for i in range(self.length):

@@ -128,14 +128,15 @@ cdef class Tokens:

@cython.freelist(64)
cdef class Token:
-   def __init__(self, StringStore string_store, int i, int idx, int pos, int ner,
+   def __init__(self, StringStore string_store, int i, int idx, int pos, int lemma,
                 dict lex):
        self._string_store = string_store
        self.idx = idx
        self.pos = pos
-       self.ner = ner
        self.i = i
        self.id = lex['id']
+       self.lemma = lemma

        self.cluster = lex['cluster']
        self.length = lex['length']

@@ -156,3 +157,10 @@ cdef class Token:
            return ''
        cdef bytes utf8string = self._string_store[self.sic]
        return utf8string.decode('utf8')

    property lemma:
        def __get__(self):
            if self.lemma == 0:
                return self.string
            cdef bytes utf8string = self._string_store[self.lemma]
            return utf8string.decode('utf8')
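Token now exposes the interned lemma as a unicode property, falling back to the surface string when no lemma id was assigned (id 0). A hedged usage sketch (assumes the English pipeline has been loaded as in en.pyx):

    # Assumes EN.load() has populated the lexicon, tagger and lemmatizer.
    tokens = EN.tokenize(u'An example sentence.')
    EN.set_pos(tokens)
    for token in tokens:
        # In this commit set_pos does not yet write lemmas (the analyse_morph
        # call is commented out), so token.lemma falls back to token.string.
        print(u'%s -> %s' % (token.string, token.lemma))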