mirror of https://github.com/explosion/spaCy.git
* More work on language-generic parsing
This commit is contained in:
parent
86c4a8e3e2
commit
c2307fa9ee
|
@ -0,0 +1,11 @@
|
||||||
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
|
from os import path
|
||||||
|
|
||||||
|
from ..language import Language
|
||||||
|
|
||||||
|
|
||||||
|
class Finnish(Language):
    """Finnish language class; it only overrides where default data lives."""

    @classmethod
    def default_data_dir(cls):
        """Return the path of the ``data`` directory beside this module."""
        module_dir = path.dirname(__file__)
        return path.join(module_dir, 'data')
|
|
@ -148,13 +148,10 @@ class Language(object):
|
||||||
vectors = cls.default_vectors(data_dir)
|
vectors = cls.default_vectors(data_dir)
|
||||||
if get_lex_attr is None:
|
if get_lex_attr is None:
|
||||||
get_lex_attr = cls.default_lex_attrs(data_dir)
|
get_lex_attr = cls.default_lex_attrs(data_dir)
|
||||||
if morphology is None:
|
|
||||||
morphology = cls.default_morphology(path.join(data_dir, 'vocab'))
|
|
||||||
return Vocab.from_dir(
|
return Vocab.from_dir(
|
||||||
path.join(data_dir, 'vocab'),
|
path.join(data_dir, 'vocab'),
|
||||||
get_lex_attr=get_lex_attr,
|
get_lex_attr=get_lex_attr,
|
||||||
vectors=vectors,
|
vectors=vectors)
|
||||||
morphology=morphology)
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def default_tokenizer(cls, vocab, data_dir):
|
def default_tokenizer(cls, vocab, data_dir):
|
||||||
|
|
|
@ -1,18 +1,41 @@
|
||||||
|
from cymem.cymem cimport Pool
|
||||||
|
from preshed.maps cimport PreshMapArray
|
||||||
|
from libc.stdint cimport uint64_t
|
||||||
|
|
||||||
from .structs cimport TokenC
|
from .structs cimport TokenC
|
||||||
from .strings cimport StringStore
|
from .strings cimport StringStore
|
||||||
|
from .typedefs cimport attr_t
|
||||||
|
from .parts_of_speech cimport univ_pos_t
|
||||||
|
|
||||||
|
|
||||||
|
# One row of Morphology.rich_tags: the per-tag data that assign_tag copies
# onto a token.
cdef struct RichTagC:
|
||||||
|
# Morphological feature bits; initialised to 0 by Morphology.__init__.
uint64_t morph
|
||||||
|
# Position of this tag in the sorted tag table.
int id
|
||||||
|
# Universal part-of-speech; copied to TokenC.pos by assign_tag.
univ_pos_t pos
|
||||||
|
# StringStore id of the tag name; copied to TokenC.tag by assign_tag.
attr_t name
|
||||||
|
|
||||||
|
|
||||||
|
# Result of analysing one word form under one tag; Morphology keeps these in
# its _cache.
cdef struct MorphAnalysisC:
|
||||||
|
# Copy (by value) of the rich tag the analysis was made under.
RichTagC tag
|
||||||
|
# StringStore id of the lemma.
attr_t lemma
|
||||||
|
|
||||||
|
|
||||||
cdef class Morphology:
|
cdef class Morphology:
|
||||||
|
cdef readonly Pool mem
|
||||||
cdef readonly object strings
|
cdef readonly object strings
|
||||||
cdef public object lemmatizer
|
cdef public object lemmatizer
|
||||||
cdef public object tag_map
|
cdef public object n_tags
|
||||||
|
cdef public object reverse_index
|
||||||
cdef public object tag_names
|
cdef public object tag_names
|
||||||
cdef public object tag_ids
|
|
||||||
cdef public int n_tags
|
|
||||||
|
|
||||||
cdef int assign_tag(self, StringStore strings, TokenC* token, int tag) except -1
|
cdef RichTagC* rich_tags
|
||||||
|
cdef PreshMapArray _cache
|
||||||
|
|
||||||
|
cdef int assign_tag(self, TokenC* token, tag) except -1
|
||||||
|
|
||||||
|
cdef int assign_feature(self, uint64_t* morph, feature, value) except -1
|
||||||
|
|
||||||
|
|
||||||
cdef int assign_from_dict(self, TokenC* token, props) except -1
|
|
||||||
|
|
||||||
#
|
#
|
||||||
#cpdef enum Feature_t:
|
#cpdef enum Feature_t:
|
||||||
|
|
|
@ -6,15 +6,10 @@ try:
|
||||||
except ImportError:
|
except ImportError:
|
||||||
import json
|
import json
|
||||||
|
|
||||||
from spacy.parts_of_speech import UNIV_POS_NAMES
|
from .parts_of_speech import UNIV_POS_NAMES
|
||||||
|
from .parts_of_speech cimport ADJ, VERB, NOUN
|
||||||
|
|
||||||
|
|
||||||
cdef struct MorphAnalysisC:
|
|
||||||
uint64_t[4] features
|
|
||||||
attr_t lemma
|
|
||||||
attr_t pos
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Morphology:
|
cdef class Morphology:
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_dir(cls, data_dir, lemmatizer=None):
|
def from_dir(cls, data_dir, lemmatizer=None):
|
||||||
|
@ -23,32 +18,37 @@ cdef class Morphology:
|
||||||
lemmatizer = Lemmatizer.from_dir(data_dir)
|
lemmatizer = Lemmatizer.from_dir(data_dir)
|
||||||
return cls(tag_map, {}, lemmatizer)
|
return cls(tag_map, {}, lemmatizer)
|
||||||
|
|
||||||
def __init__(self, string_store, tag_map, lemmatizer):
    """Build the tag table used to annotate tokens.

    string_store: StringStore in which the tag names are interned.
    tag_map: dict keyed by tag name. Only the keys are consumed here.
        None is accepted and treated as an empty map, because
        Vocab.__init__ forwards its tag_map=None default unchanged.
    lemmatizer: object stored on the instance for lemmatize(); may be None.
    """
    cdef int i
    if tag_map is None:
        tag_map = {}
    self.mem = Pool()
    self.strings = string_store
    self.lemmatizer = lemmatizer
    self.n_tags = len(tag_map)
    self.tag_names = tuple(sorted(tag_map.keys()))
    self.reverse_index = {}
    # rich_tags is a raw C pointer, so it must be backed by memory before
    # it is indexed; writing through it unallocated is undefined behaviour.
    # At least one slot is requested so an empty tag map never asks the
    # pool for a zero-byte block.
    self.rich_tags = <RichTagC*>self.mem.alloc(max(self.n_tags, 1), sizeof(RichTagC))
    # tag_names is already sorted, so index i is the tag's stable id.
    for i, tag_str in enumerate(self.tag_names):
        self.rich_tags[i].id = i
        self.rich_tags[i].name = self.strings[tag_str]
        self.rich_tags[i].morph = 0
        # NOTE(review): .pos is not assigned here; the values of tag_map
        # are never read. Confirm the tag_map value schema and populate
        # .pos from it, otherwise assign_tag copies an unset pos.
        # Maps the interned tag name back to its row in rich_tags.
        self.reverse_index[self.rich_tags[i].name] = i
    self._cache = PreshMapArray(self.n_tags)
|
||||||
|
|
||||||
cdef int assign_tag(self, TokenC* token, tag) except -1:
    """Set lemma, pos, tag and morph on `token` for the given tag.

    token: the token to annotate; token.lex must already be set.
    tag: either a tag name (string) or the integer row of the tag in
        rich_tags.

    Raises KeyError for an unknown tag name and IndexError for an integer
    outside the tag table. Returns 0 on success.
    """
    cdef int tag_id
    cdef MorphAnalysisC* analysis
    if isinstance(tag, basestring):
        # self.strings[tag] is a StringStore id, not a row number;
        # reverse_index translates it to the row in rich_tags.
        tag_id = self.reverse_index[self.strings[tag]]
    else:
        tag_id = tag
    if tag_id < 0 or tag_id >= self.n_tags:
        raise IndexError("Tag index %d out of range for %d tags" % (tag_id, self.n_tags))
    analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
    if analysis is NULL:
        analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
        analysis.tag = self.rich_tags[tag_id]
        # lemmatize() takes the universal POS, not the fine-grained tag.
        analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth)
        # Store the result so the next lookup for this (tag, word) pair
        # hits the cache instead of allocating again.
        self._cache.set(tag_id, token.lex.orth, <void*>analysis)
    token.lemma = analysis.lemma
    token.pos = analysis.tag.pos
    token.tag = analysis.tag.name
    token.morph = analysis.tag.morph
    return 0
|
||||||
|
|
||||||
cdef int assign_feature(self, TokenC* token, feature, value) except -1:
|
cdef int assign_feature(self, uint64_t* morph, feature, value) except -1:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def load_morph_exceptions(self, dict exc):
|
def load_morph_exceptions(self, dict exc):
|
||||||
# Map (form, pos) to (lemma, inflection)
|
# Map (form, pos) to (lemma, rich tag)
|
||||||
cdef unicode pos_str
|
cdef unicode pos_str
|
||||||
cdef unicode form_str
|
cdef unicode form_str
|
||||||
cdef unicode lemma_str
|
cdef unicode lemma_str
|
||||||
|
@ -57,121 +57,30 @@ cdef class Morphology:
|
||||||
cdef int lemma
|
cdef int lemma
|
||||||
cdef attr_t orth
|
cdef attr_t orth
|
||||||
cdef int pos
|
cdef int pos
|
||||||
for pos_str, entries in exc.items():
|
for tag_str, entries in exc.items():
|
||||||
pos = self.tag_names.index(pos_str)
|
tag = self.strings[tag_str]
|
||||||
|
rich_tag = self.rich_tags[self.reverse_index[tag]]
|
||||||
for form_str, props in entries.items():
|
for form_str, props in entries.items():
|
||||||
lemma_str = props.get('L', form_str)
|
|
||||||
orth = self.strings[form_str]
|
|
||||||
cached = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
|
cached = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
|
||||||
cached.lemma = self.strings[lemma_str]
|
orth = self.strings[form_str]
|
||||||
self.set_features(cached, props)
|
for name_str, value_str in props.items():
|
||||||
self._cache.set(pos, orth, <void*>cached)
|
if name_str == 'L':
|
||||||
|
cached.lemma = self.strings[value_str]
|
||||||
|
else:
|
||||||
|
self.assign_feature(&cached.tag.morph, name_str, value_str)
|
||||||
|
if cached.lemma == 0:
|
||||||
|
cached.lemma = self.lemmatize(rich_tag.pos, orth)
|
||||||
|
self._cache.set(rich_tag.pos, orth, <void*>cached)
|
||||||
|
|
||||||
def _load_special_tokenization(self, special_cases):
|
def lemmatize(self, const univ_pos_t pos, attr_t orth):
|
||||||
'''Add a special-case tokenization rule.
|
if self.lemmatizer is None:
|
||||||
'''
|
return orth
|
||||||
cdef int i
|
cdef unicode py_string = self.strings[orth]
|
||||||
cdef list substrings
|
if pos != NOUN and pos != VERB and pos != ADJ:
|
||||||
cdef unicode chunk
|
return orth
|
||||||
cdef unicode form
|
cdef set lemma_strings
|
||||||
cdef unicode lemma
|
cdef unicode lemma_string
|
||||||
cdef dict props
|
lemma_strings = self.lemmatizer(py_string, pos)
|
||||||
cdef LexemeC** lexemes
|
lemma_string = sorted(lemma_strings)[0]
|
||||||
cdef hash_t hashed
|
lemma = self.strings[lemma_string]
|
||||||
for chunk, substrings in sorted(special_cases.items()):
|
return lemma
|
||||||
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
|
|
||||||
for i, props in enumerate(substrings):
|
|
||||||
# Set the special tokens up to have morphology and lemmas if
|
|
||||||
# specified, otherwise use the part-of-speech tag (if specified)
|
|
||||||
form = props['F']
|
|
||||||
tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, form)
|
|
||||||
morphology = self.vocab.morphology.decode_dict(props)
|
|
||||||
tokens[i].lemma = morph_analysis.lemma
|
|
||||||
tokens[i].pos = morph_analysis.pos
|
|
||||||
tokens[i].tag = morph_analysis.tag
|
|
||||||
tokens[i].morph = morph_analysis.morph
|
|
||||||
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
|
|
||||||
cached.length = len(substrings)
|
|
||||||
cached.is_lex = False
|
|
||||||
cached.data.tokens = tokens
|
|
||||||
hashed = hash_string(chunk)
|
|
||||||
self._specials.set(hashed, cached)
|
|
||||||
self._cache.set(hashed, cached)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
|
|
||||||
# morph.number = props.get('number', 0)
|
|
||||||
# morph.tenspect = props.get('tenspect', 0)
|
|
||||||
# morph.mood = props.get('mood', 0)
|
|
||||||
# morph.gender = props.get('gender', 0)
|
|
||||||
# morph.person = props.get('person', 0)
|
|
||||||
# morph.case = props.get('case', 0)
|
|
||||||
# morph.misc = props.get('misc', 0)
|
|
||||||
#
|
|
||||||
#
|
|
||||||
#cdef class Morphology:
|
|
||||||
# cdef Pool mem
|
|
||||||
# cdef PreshMap table
|
|
||||||
#
|
|
||||||
# def __init__(self, tags, exceptions):
|
|
||||||
# pass
|
|
||||||
#
|
|
||||||
# def __getitem__(self, hash_t id_):
|
|
||||||
# pass
|
|
||||||
#
|
|
||||||
# cdef const InflectionC* get(self, hash_t key) except NULL:
|
|
||||||
# pass
|
|
||||||
#
|
|
||||||
# cdef MorphAnalysis analyse(const TokenC* token) except -1:
|
|
||||||
# cdef struct MorphAnalysis morphology
|
|
||||||
# tokens[i].pos = tag.pos
|
|
||||||
# cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth)
|
|
||||||
# if cached is NULL:
|
|
||||||
# cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
|
|
||||||
# cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
|
|
||||||
# cached.morph = tag.morph
|
|
||||||
# self._morph_cache.set(tag.id, tokens[i].lex.orth, <void*>cached)
|
|
||||||
# tokens[i].lemma = cached.lemma
|
|
||||||
# tokens[i].morph = cached.morph
|
|
||||||
#
|
|
||||||
# cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1:
|
|
||||||
# if self.lemmatizer is None:
|
|
||||||
# return lex.orth
|
|
||||||
# cdef unicode py_string = self.strings[lex.orth]
|
|
||||||
# if pos != NOUN and pos != VERB and pos != ADJ:
|
|
||||||
# return lex.orth
|
|
||||||
# cdef set lemma_strings
|
|
||||||
# cdef unicode lemma_string
|
|
||||||
# lemma_strings = self.lemmatizer(py_string, pos)
|
|
||||||
# lemma_string = sorted(lemma_strings)[0]
|
|
||||||
# lemma = self.strings[lemma_string]
|
|
||||||
# return lemma
|
|
||||||
#
|
|
||||||
#
|
|
||||||
#cdef class Inflection:
|
|
||||||
# cdef InflectionC* c
|
|
||||||
#
|
|
||||||
# def __init__(self, container, id_):
|
|
||||||
# self.c = container[id_]
|
|
||||||
# self.container = container
|
|
||||||
#
|
|
||||||
# for i, feat_id in enumerate(feat_ids):
|
|
||||||
# feature, value = parse_id(feat_id)
|
|
||||||
# self.add_value(feature, value, True)
|
|
||||||
#
|
|
||||||
# def has(self, Value_t feat_value_id):
|
|
||||||
# part = feat_value_id % 64
|
|
||||||
# bit = feat_value_id / 64
|
|
||||||
# if self.value_set[part] & bit:
|
|
||||||
# return True
|
|
||||||
# else:
|
|
||||||
# return False
|
|
||||||
#
|
|
||||||
# property pos: def __get__(self): return self.c.pos
|
|
||||||
#
|
|
||||||
# property id: def __get__(self): return self.c.id
|
|
||||||
#
|
|
||||||
# property features:
|
|
||||||
# pass
|
|
||||||
|
|
|
@ -25,17 +25,6 @@ cdef struct LexemeC:
|
||||||
float sentiment
|
float sentiment
|
||||||
float l2_norm
|
float l2_norm
|
||||||
|
|
||||||
cdef struct MorphFeatC:
|
|
||||||
int name
|
|
||||||
int value
|
|
||||||
|
|
||||||
|
|
||||||
cdef struct MorphologyC:
|
|
||||||
uint64_t[4] feature_set
|
|
||||||
MorphFeatC* features
|
|
||||||
univ_pos_t pos
|
|
||||||
int n
|
|
||||||
|
|
||||||
|
|
||||||
cdef struct Entity:
|
cdef struct Entity:
|
||||||
int start
|
int start
|
||||||
|
@ -54,8 +43,8 @@ cdef struct Constituent:
|
||||||
|
|
||||||
cdef struct TokenC:
|
cdef struct TokenC:
|
||||||
const LexemeC* lex
|
const LexemeC* lex
|
||||||
const MorphologyC* morph
|
|
||||||
const Constituent* ctnt
|
const Constituent* ctnt
|
||||||
|
uint64_t morph
|
||||||
univ_pos_t pos
|
univ_pos_t pos
|
||||||
bint spacy
|
bint spacy
|
||||||
int tag
|
int tag
|
||||||
|
|
|
@ -104,7 +104,7 @@ cdef class Tagger:
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def blank(cls, vocab, templates):
|
def blank(cls, vocab, templates):
|
||||||
model = Model(vocab.morphology.n_tags, templates, model_loc=None)
|
model = Model(vocab.n_tags, templates, model_loc=None)
|
||||||
return cls(vocab, model)
|
return cls(vocab, model)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
@ -113,7 +113,7 @@ cdef class Tagger:
|
||||||
templates = json.loads(open(path.join(data_dir, 'templates.json')))
|
templates = json.loads(open(path.join(data_dir, 'templates.json')))
|
||||||
else:
|
else:
|
||||||
templates = cls.default_templates()
|
templates = cls.default_templates()
|
||||||
model = Model(vocab.morphology.n_tags, templates, data_dir)
|
model = Model(vocab.n_tags, templates, data_dir)
|
||||||
return cls(vocab, model)
|
return cls(vocab, model)
|
||||||
|
|
||||||
def __init__(self, Vocab vocab, model):
|
def __init__(self, Vocab vocab, model):
|
||||||
|
@ -128,7 +128,7 @@ cdef class Tagger:
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def tag_names(self):
|
def tag_names(self):
|
||||||
return self.vocab.morphology.tag_names
|
return self.vocab.tag_names
|
||||||
|
|
||||||
def __call__(self, Doc tokens):
|
def __call__(self, Doc tokens):
|
||||||
"""Apply the tagger, setting the POS tags onto the Doc object.
|
"""Apply the tagger, setting the POS tags onto the Doc object.
|
||||||
|
@ -143,14 +143,15 @@ cdef class Tagger:
|
||||||
for i in range(tokens.length):
|
for i in range(tokens.length):
|
||||||
if tokens.data[i].pos == 0:
|
if tokens.data[i].pos == 0:
|
||||||
guess = self.predict(i, tokens.data)
|
guess = self.predict(i, tokens.data)
|
||||||
self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], guess)
|
self.vocab.morphology.assign_tag(&tokens.data[i], guess)
|
||||||
|
|
||||||
tokens.is_tagged = True
|
tokens.is_tagged = True
|
||||||
tokens._py_tokens = [None] * tokens.length
|
tokens._py_tokens = [None] * tokens.length
|
||||||
|
|
||||||
def tag_from_strings(self, Doc tokens, object tag_strs):
|
def tag_from_strings(self, Doc tokens, object tag_strs):
|
||||||
cdef int i
|
cdef int i
|
||||||
for i in range(tokens.length):
|
for i in range(tokens.length):
|
||||||
self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], tag_strs[i])
|
self.vocab.morphology.assign_tag(&tokens.data[i], tag_strs[i])
|
||||||
tokens.is_tagged = True
|
tokens.is_tagged = True
|
||||||
tokens._py_tokens = [None] * tokens.length
|
tokens._py_tokens = [None] * tokens.length
|
||||||
|
|
||||||
|
@ -168,7 +169,9 @@ cdef class Tagger:
|
||||||
for i in range(tokens.length):
|
for i in range(tokens.length):
|
||||||
guess = self.update(i, tokens.data, golds[i])
|
guess = self.update(i, tokens.data, golds[i])
|
||||||
loss = golds[i] != -1 and guess != golds[i]
|
loss = golds[i] != -1 and guess != golds[i]
|
||||||
self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], guess)
|
|
||||||
|
self.vocab.morphology.assign_tag(&tokens.data[i], guess)
|
||||||
|
|
||||||
correct += loss == 0
|
correct += loss == 0
|
||||||
self.freqs[TAG][tokens.data[i].tag] += 1
|
self.freqs[TAG][tokens.data[i].tag] += 1
|
||||||
return correct
|
return correct
|
||||||
|
|
|
@ -7,12 +7,7 @@ from .typedefs cimport hash_t
|
||||||
from .structs cimport LexemeC, TokenC
|
from .structs cimport LexemeC, TokenC
|
||||||
from .strings cimport StringStore
|
from .strings cimport StringStore
|
||||||
from .tokens.doc cimport Doc
|
from .tokens.doc cimport Doc
|
||||||
from .vocab cimport Vocab, _Cached
|
from .vocab cimport Vocab, LexemesOrTokens, _Cached
|
||||||
|
|
||||||
|
|
||||||
cdef union LexemesOrTokens:
|
|
||||||
const LexemeC* const* lexemes
|
|
||||||
TokenC* tokens
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Tokenizer:
|
cdef class Tokenizer:
|
||||||
|
|
|
@ -192,9 +192,7 @@ cdef class Tokenizer:
|
||||||
tokens.push_back(prefixes[0][i], False)
|
tokens.push_back(prefixes[0][i], False)
|
||||||
if string:
|
if string:
|
||||||
cache_hit = self._try_cache(hash_string(string), tokens)
|
cache_hit = self._try_cache(hash_string(string), tokens)
|
||||||
if cache_hit:
|
if not cache_hit:
|
||||||
pass
|
|
||||||
else:
|
|
||||||
match = self.find_infix(string)
|
match = self.find_infix(string)
|
||||||
if match is None:
|
if match is None:
|
||||||
tokens.push_back(self.vocab.get(tokens.mem, string), False)
|
tokens.push_back(self.vocab.get(tokens.mem, string), False)
|
||||||
|
@ -253,38 +251,10 @@ cdef class Tokenizer:
|
||||||
cdef LexemeC** lexemes
|
cdef LexemeC** lexemes
|
||||||
cdef hash_t hashed
|
cdef hash_t hashed
|
||||||
for chunk, substrings in sorted(special_cases.items()):
|
for chunk, substrings in sorted(special_cases.items()):
|
||||||
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
|
|
||||||
for i, props in enumerate(substrings):
|
|
||||||
form = props['F']
|
|
||||||
tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, form)
|
|
||||||
lemma = props.get('L', form)
|
|
||||||
tokens[i].lemma = self.vocab.strings[lemma]
|
|
||||||
#TODO
|
|
||||||
#self.vocab.morphology.assign_from_dict(&tokens[i], props)
|
|
||||||
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
|
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
|
||||||
cached.length = len(substrings)
|
cached.length = len(substrings)
|
||||||
cached.is_lex = False
|
cached.is_lex = False
|
||||||
cached.data.tokens = tokens
|
cached.data.tokens = self.vocab.make_fused_token(substrings)
|
||||||
hashed = hash_string(chunk)
|
key = hash_string(chunk)
|
||||||
self._specials.set(hashed, cached)
|
self._specials.set(key, cached)
|
||||||
self._cache.set(hashed, cached)
|
self._cache.set(key, cached)
|
||||||
|
|
||||||
|
|
||||||
#if lemma is not None:
|
|
||||||
# tokens[i].lemma = self.vocab.strings[lemma]
|
|
||||||
#else:
|
|
||||||
# tokens[i].lemma = 0
|
|
||||||
#if 'pos' in props:
|
|
||||||
# inflection = self.vocab.morphology.get(props['pos'])
|
|
||||||
# inflection.assign(&tokens[i])
|
|
||||||
# # These are defaults, which can be over-ridden by the
|
|
||||||
# # token-specific props.
|
|
||||||
# #pos, morph_features = self.vocab.morphology.tag_map[props['pos']]
|
|
||||||
# #tokens[i].pos = pos
|
|
||||||
# ## These are defaults, which can be over-ridden by the
|
|
||||||
# ## token-specific props.
|
|
||||||
# #set_morph_from_dict(&tokens[i].morph, morph_features)
|
|
||||||
# #if tokens[i].lemma == 0:
|
|
||||||
# # tokens[i].lemma = tokens[i].lex.orth
|
|
||||||
##set_morph_from_dict(&tokens[i].morph, props)
|
|
||||||
|
|
||||||
|
|
|
@ -12,11 +12,11 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil
|
||||||
|
|
||||||
|
|
||||||
ctypedef const LexemeC* const_Lexeme_ptr
|
ctypedef const LexemeC* const_Lexeme_ptr
|
||||||
ctypedef TokenC* TokenC_ptr
|
ctypedef const TokenC* const_TokenC_ptr
|
||||||
|
|
||||||
ctypedef fused LexemeOrToken:
|
ctypedef fused LexemeOrToken:
|
||||||
const_Lexeme_ptr
|
const_Lexeme_ptr
|
||||||
TokenC_ptr
|
const_TokenC_ptr
|
||||||
|
|
||||||
|
|
||||||
cdef class Doc:
|
cdef class Doc:
|
||||||
|
|
|
@ -209,7 +209,7 @@ cdef class Doc:
|
||||||
if self.length == self.max_length:
|
if self.length == self.max_length:
|
||||||
self._realloc(self.length * 2)
|
self._realloc(self.length * 2)
|
||||||
cdef TokenC* t = &self.data[self.length]
|
cdef TokenC* t = &self.data[self.length]
|
||||||
if LexemeOrToken is TokenC_ptr:
|
if LexemeOrToken is const_TokenC_ptr:
|
||||||
t[0] = lex_or_tok[0]
|
t[0] = lex_or_tok[0]
|
||||||
else:
|
else:
|
||||||
t.lex = lex_or_tok
|
t.lex = lex_or_tok
|
||||||
|
|
|
@ -15,7 +15,7 @@ cdef LexemeC EMPTY_LEXEME
|
||||||
|
|
||||||
cdef union LexemesOrTokens:
|
cdef union LexemesOrTokens:
|
||||||
const LexemeC* const* lexemes
|
const LexemeC* const* lexemes
|
||||||
TokenC* tokens
|
const TokenC* tokens
|
||||||
|
|
||||||
|
|
||||||
cdef struct _Cached:
|
cdef struct _Cached:
|
||||||
|
@ -37,6 +37,7 @@ cdef class Vocab:
|
||||||
|
|
||||||
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
|
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
|
||||||
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
|
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
|
||||||
|
cdef const TokenC* make_fused_token(self, substrings) except NULL
|
||||||
|
|
||||||
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
|
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
|
||||||
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
|
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
|
||||||
|
|
|
@ -17,6 +17,7 @@ from .strings cimport hash_string
|
||||||
from .orth cimport word_shape
|
from .orth cimport word_shape
|
||||||
from .typedefs cimport attr_t
|
from .typedefs cimport attr_t
|
||||||
from .cfile cimport CFile
|
from .cfile cimport CFile
|
||||||
|
from .lemmatizer import Lemmatizer
|
||||||
|
|
||||||
from cymem.cymem cimport Address
|
from cymem.cymem cimport Address
|
||||||
from . import util
|
from . import util
|
||||||
|
@ -36,20 +37,13 @@ EMPTY_LEXEME.repvec = EMPTY_VEC
|
||||||
cdef class Vocab:
|
cdef class Vocab:
|
||||||
'''A map container for a language's LexemeC structs.
|
'''A map container for a language's LexemeC structs.
|
||||||
'''
|
'''
|
||||||
@classmethod
|
def __init__(self, get_lex_attr=None, tag_map=None, vectors=None):
|
||||||
def default_morphology(cls):
|
|
||||||
return Morphology({'VBZ': ['VERB', {}]}, [], None)
|
|
||||||
|
|
||||||
def __init__(self, get_lex_attr=None, morphology=None, vectors=None):
|
|
||||||
self.get_lex_attr = get_lex_attr
|
|
||||||
if morphology is None:
|
|
||||||
morphology = self.default_morphology()
|
|
||||||
self.morphology = morphology
|
|
||||||
|
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self._by_hash = PreshMap()
|
self._by_hash = PreshMap()
|
||||||
self._by_orth = PreshMap()
|
self._by_orth = PreshMap()
|
||||||
self.strings = StringStore()
|
self.strings = StringStore()
|
||||||
|
self.get_lex_attr = get_lex_attr
|
||||||
|
self.morphology = Morphology(self.strings, tag_map, Lemmatizer({}, {}, {}))
|
||||||
|
|
||||||
self.length = 1
|
self.length = 1
|
||||||
self._serializer = None
|
self._serializer = None
|
||||||
|
@ -60,10 +54,9 @@ cdef class Vocab:
|
||||||
raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
|
raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
|
||||||
if not path.isdir(data_dir):
|
if not path.isdir(data_dir):
|
||||||
raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
|
raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
|
||||||
cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors,
|
tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
|
||||||
morphology=morphology)
|
cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors, tag_map=tag_map)
|
||||||
self.load_lexemes(path.join(data_dir, 'strings.txt'),
|
self.load_lexemes(path.join(data_dir, 'strings.txt'), path.join(data_dir, 'lexemes.bin'))
|
||||||
path.join(data_dir, 'lexemes.bin'))
|
|
||||||
if vectors is None and path.exists(path.join(data_dir, 'vec.bin')):
|
if vectors is None and path.exists(path.join(data_dir, 'vec.bin')):
|
||||||
self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
|
self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
|
||||||
return self
|
return self
|
||||||
|
@ -172,6 +165,22 @@ cdef class Vocab:
|
||||||
orth = id_or_string
|
orth = id_or_string
|
||||||
return Lexeme(self, orth)
|
return Lexeme(self, orth)
|
||||||
|
|
||||||
|
cdef const TokenC* make_fused_token(self, substrings) except NULL:
    """Build the token array for a special-case (fused) tokenization.

    substrings: sequence of dicts, one per output token. Each must have
        'F' (the form); 'pos', 'L' (lemma) and 'morph' (dict of
        feature -> value) are optional.

    Returns a pointer to an array allocated from self.mem, holding
    len(substrings) + 1 entries; the final entry is never written here.
    """
    cdef int idx
    cdef TokenC* fused = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
    cdef TokenC* current
    for idx, spec in enumerate(substrings):
        current = &fused[idx]
        current.lex = <LexemeC*>self.get(self.mem, spec['F'])
        # The tag, when given, supplies lemma/pos/morph values first...
        if 'pos' in spec:
            self.morphology.assign_tag(current, spec['pos'])
        # ...and an explicit lemma or explicit features are applied after it.
        if 'L' in spec:
            current.lemma = self.strings[spec['L']]
        extra_features = spec.get('morph', {})
        for feature, value in extra_features.items():
            self.morphology.assign_feature(&current.morph, feature, value)
    return fused
|
||||||
|
|
||||||
def dump(self, loc):
|
def dump(self, loc):
|
||||||
if path.exists(loc):
|
if path.exists(loc):
|
||||||
assert not path.isdir(loc)
|
assert not path.isdir(loc)
|
||||||
|
|
Loading…
Reference in New Issue