mirror of https://github.com/explosion/spaCy.git
* Serialization round trip now working with decent API, but with rough spots in the organisation and requiring vocabulary to be fixed ahead of time.
This commit is contained in:
parent
0973e2f107
commit
317cbbc015
|
@ -141,11 +141,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
|
|||
print '%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
|
||||
scorer.tags_acc,
|
||||
scorer.token_acc)
|
||||
nlp.parser.model.end_training()
|
||||
nlp.entity.model.end_training()
|
||||
nlp.tagger.model.end_training()
|
||||
nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))
|
||||
|
||||
nlp.end_training()
|
||||
|
||||
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
|
||||
beam_width=None):
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
from __future__ import unicode_literals
|
||||
from os import path
|
||||
import re
|
||||
import struct
|
||||
import json
|
||||
|
||||
from .. import orth
|
||||
from ..vocab import Vocab
|
||||
|
@ -8,6 +10,7 @@ from ..tokenizer import Tokenizer
|
|||
from ..syntax.arc_eager import ArcEager
|
||||
from ..syntax.ner import BiluoPushDown
|
||||
from ..syntax.parser import ParserFactory
|
||||
from ..serialize.bits import BitArray
|
||||
|
||||
from ..tokens import Doc
|
||||
from ..multi_words import RegexMerger
|
||||
|
@ -19,6 +22,8 @@ from . import regexes
|
|||
|
||||
from ..util import read_lang_data
|
||||
|
||||
from ..attrs import TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
|
||||
|
||||
|
||||
def get_lex_props(string):
|
||||
return {
|
||||
|
@ -74,7 +79,7 @@ class English(object):
|
|||
load_vectors=True
|
||||
):
|
||||
|
||||
self._data_dir = data_dir
|
||||
self.data_dir = data_dir
|
||||
|
||||
self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,
|
||||
get_lex_props=get_lex_props, load_vectors=load_vectors,
|
||||
|
@ -140,6 +145,29 @@ class English(object):
|
|||
self.mwe_merger(tokens)
|
||||
return tokens
|
||||
|
||||
def end_training(self, data_dir=None):
|
||||
if data_dir is None:
|
||||
data_dir = self.data_dir
|
||||
self.parser.model.end_training()
|
||||
self.entity.model.end_training()
|
||||
self.tagger.model.end_training()
|
||||
self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt'))
|
||||
|
||||
with open(path.join(data_dir, 'pos', 'tag_freqs.json'), 'w') as file_:
|
||||
json.dump(list(self.tagger.freqs[TAG].items()), file_)
|
||||
|
||||
with open(path.join(data_dir, 'deps', 'head_freqs.json'), 'w') as file_:
|
||||
json.dump(list(self.parser.moves.freqs[HEAD].items()), file_)
|
||||
|
||||
with open(path.join(data_dir, 'deps', 'dep_freqs.json'), 'w') as file_:
|
||||
json.dump(list(self.parser.moves.freqs[DEP].items()), file_)
|
||||
|
||||
with open(path.join(data_dir, 'ner', 'iob_freqs.json'), 'w') as file_:
|
||||
json.dump(list(self.entity.moves.freqs[ENT_IOB].items()), file_)
|
||||
|
||||
with open(path.join(data_dir, 'ner', 'ne_freqs.json'), 'w') as file_:
|
||||
json.dump(list(self.entity.moves.freqs[ENT_TYPE].items()), file_)
|
||||
|
||||
@property
|
||||
def tags(self):
|
||||
"""List of part-of-speech tag names."""
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
from preshed.maps cimport PreshMapArray
|
||||
from preshed.counter cimport PreshCounter
|
||||
from cymem.cymem cimport Pool
|
||||
|
||||
from .._ml cimport Model
|
||||
|
@ -14,6 +15,7 @@ cdef class EnPosTagger:
|
|||
cdef readonly Model model
|
||||
cdef public object lemmatizer
|
||||
cdef PreshMapArray _morph_cache
|
||||
cdef public dict freqs
|
||||
|
||||
cdef PosTag* tags
|
||||
cdef readonly object tag_names
|
||||
|
|
|
@ -7,6 +7,7 @@ from libc.string cimport memset
|
|||
|
||||
from cymem.cymem cimport Address
|
||||
from thinc.typedefs cimport atom_t, weight_t
|
||||
from collections import defaultdict
|
||||
|
||||
from ..parts_of_speech cimport univ_pos_t
|
||||
from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
|
||||
|
@ -17,7 +18,7 @@ from ..tokens.doc cimport Doc
|
|||
from ..morphology cimport set_morph_from_dict
|
||||
from .._ml cimport arg_max
|
||||
|
||||
from .attrs cimport IS_ALPHA, IS_PUNCT, LIKE_NUM, LIKE_URL
|
||||
from .attrs cimport TAG, IS_ALPHA, IS_PUNCT, LIKE_NUM, LIKE_URL
|
||||
from ..typedefs cimport attr_t
|
||||
|
||||
from .lemmatizer import Lemmatizer
|
||||
|
@ -260,6 +261,7 @@ cdef class EnPosTagger:
|
|||
self.load_morph_exceptions(json.load(open(path.join(data_dir, 'tokenizer',
|
||||
'morphs.json'))))
|
||||
self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)
|
||||
self.freqs = {TAG: defaultdict(int)}
|
||||
|
||||
def __call__(self, Doc tokens):
|
||||
"""Apply the tagger, setting the POS tags onto the Doc object.
|
||||
|
@ -309,6 +311,7 @@ cdef class EnPosTagger:
|
|||
tokens.data[i].tag = self.strings[self.tag_names[guess]]
|
||||
self.set_morph(i, &self.tags[guess], tokens.data)
|
||||
correct += loss == 0
|
||||
self.freqs[TAG][tokens.data[i].tag] += 1
|
||||
return correct
|
||||
|
||||
cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1:
|
||||
|
|
|
@ -14,8 +14,8 @@ cdef Code bit_append(Code code, bint bit) nogil:
|
|||
|
||||
|
||||
cdef class BitArray:
|
||||
def __init__(self):
|
||||
self.data = b''
|
||||
def __init__(self, data=b''):
|
||||
self.data = data
|
||||
self.byte = 0
|
||||
self.bit_of_byte = 0
|
||||
self.i = 0
|
||||
|
|
|
@ -4,4 +4,5 @@ from ..vocab cimport Vocab
|
|||
cdef class Packer:
|
||||
cdef readonly tuple attrs
|
||||
cdef readonly tuple _codecs
|
||||
cdef readonly object lex_codec
|
||||
cdef readonly Vocab vocab
|
||||
|
|
|
@ -8,15 +8,17 @@ from libcpp.pair cimport pair
|
|||
from cymem.cymem cimport Address, Pool
|
||||
from preshed.maps cimport PreshMap
|
||||
|
||||
from ..attrs cimport ORTH, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
|
||||
from ..attrs cimport ORTH, ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
|
||||
from ..tokens.doc cimport Doc
|
||||
from ..vocab cimport Vocab
|
||||
from ..structs cimport LexemeC
|
||||
from ..typedefs cimport attr_t
|
||||
from .bits cimport BitArray
|
||||
from .huffman cimport HuffmanCodec
|
||||
|
||||
from os import path
|
||||
import numpy
|
||||
from .. import util
|
||||
|
||||
cimport cython
|
||||
|
||||
|
@ -67,8 +69,8 @@ cdef class _AttributeCodec:
|
|||
item.first = count
|
||||
item.second = key
|
||||
items.push(item)
|
||||
weights = numpy.ndarray(shape=(len(freqs),), dtype=numpy.float32)
|
||||
self._keys = <attr_t*>self.mem.alloc(len(freqs), sizeof(attr_t))
|
||||
weights = numpy.ndarray(shape=(items.size(),), dtype=numpy.float32)
|
||||
self._keys = <attr_t*>self.mem.alloc(items.size(), sizeof(attr_t))
|
||||
self._map = {}
|
||||
cdef int i = 0
|
||||
while not items.empty():
|
||||
|
@ -94,21 +96,33 @@ cdef class _AttributeCodec:
|
|||
dest[i] = <attr_t>self._keys[dest[i]]
|
||||
|
||||
|
||||
cdef class Packer:
|
||||
def __init__(self, Vocab vocab, list_of_attr_freqs):
|
||||
self.vocab = vocab
|
||||
codecs = []
|
||||
attrs = []
|
||||
def _gen_orths(Vocab vocab):
|
||||
cdef attr_t orth
|
||||
cdef size_t addr
|
||||
for orth, addr in vocab._by_orth.items():
|
||||
lex = <LexemeC*>addr
|
||||
yield orth, c_exp(lex.prob)
|
||||
|
||||
for attr, freqs in list_of_attr_freqs:
|
||||
if attr == SPACY:
|
||||
codecs.append(_BinaryCodec())
|
||||
else:
|
||||
codecs.append(_AttributeCodec(freqs))
|
||||
|
||||
cdef class Packer:
|
||||
def __init__(self, Vocab vocab, attr_freqs):
|
||||
self.vocab = vocab
|
||||
self.lex_codec = _AttributeCodec(_gen_orths(vocab))
|
||||
|
||||
codecs = [_AttributeCodec(_gen_orths(vocab)), _BinaryCodec()]
|
||||
attrs = [ORTH, SPACY]
|
||||
for attr, freqs in sorted(attr_freqs):
|
||||
if attr in (ORTH, ID, SPACY):
|
||||
continue
|
||||
codecs.append(_AttributeCodec(freqs))
|
||||
attrs.append(attr)
|
||||
self._codecs = tuple(codecs)
|
||||
self.attrs = tuple(attrs)
|
||||
|
||||
@classmethod
|
||||
def from_dir(cls, Vocab vocab, data_dir):
|
||||
return cls(vocab, util.read_encoding_freqs(data_dir))
|
||||
|
||||
def pack(self, Doc doc):
|
||||
array = doc.to_array(self.attrs)
|
||||
cdef BitArray bits = BitArray()
|
||||
|
@ -124,6 +138,4 @@ cdef class Packer:
|
|||
array = numpy.zeros(shape=(length, len(self._codecs)), dtype=numpy.int32)
|
||||
for i, codec in enumerate(self._codecs):
|
||||
codec.decode(bits, array[:, i])
|
||||
doc = Doc.from_ids(self.vocab, array[:, 0], array[:, 1])
|
||||
doc.from_array(self.attrs, array)
|
||||
return doc
|
||||
return array
|
||||
|
|
|
@ -10,6 +10,7 @@ from .transition_system cimport do_func_t, get_cost_func_t
|
|||
from .transition_system cimport move_cost_func_t, label_cost_func_t
|
||||
from ..gold cimport GoldParse
|
||||
from ..gold cimport GoldParseC
|
||||
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
|
||||
|
||||
from libc.stdint cimport uint32_t
|
||||
from libc.string cimport memcpy
|
||||
|
@ -309,6 +310,9 @@ cdef class ArcEager(TransitionSystem):
|
|||
label = 'ROOT'
|
||||
gold.c.heads[i] = gold.heads[i]
|
||||
gold.c.labels[i] = self.strings[label]
|
||||
# Count frequencies, for use in encoder
|
||||
self.freqs[HEAD][gold.c.heads[i] - i] += 1
|
||||
self.freqs[DEP][gold.c.labels[i]] += 1
|
||||
for end, brackets in gold.brackets.items():
|
||||
for start, label_strs in brackets.items():
|
||||
gold.c.brackets[start][end] = 1
|
||||
|
|
|
@ -8,6 +8,7 @@ from ..structs cimport TokenC, Entity
|
|||
from thinc.typedefs cimport weight_t
|
||||
from ..gold cimport GoldParseC
|
||||
from ..gold cimport GoldParse
|
||||
from ..attrs cimport ENT_TYPE, ENT_IOB
|
||||
|
||||
from .stateclass cimport StateClass
|
||||
|
||||
|
@ -74,6 +75,16 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
cdef int preprocess_gold(self, GoldParse gold) except -1:
|
||||
for i in range(gold.length):
|
||||
gold.c.ner[i] = self.lookup_transition(gold.ner[i])
|
||||
# Count frequencies, for use in encoder
|
||||
if gold.c.ner[i].move in (BEGIN, UNIT):
|
||||
self.freqs[ENT_IOB][3] += 1
|
||||
self.freqs[ENT_TYPE][gold.c.ner[i].label] += 1
|
||||
elif gold.c.ner[i].move in (IN, LAST):
|
||||
self.freqs[ENT_IOB][2] += 1
|
||||
self.freqs[ENT_TYPE][0] += 1
|
||||
elif gold.c.ner[i].move == OUT:
|
||||
self.freqs[ENT_IOB][1] += 1
|
||||
self.freqs[ENT_TYPE][0] += 1
|
||||
|
||||
cdef Transition lookup_transition(self, object name) except *:
|
||||
if name == '-':
|
||||
|
|
|
@ -35,6 +35,7 @@ cdef class TransitionSystem:
|
|||
cdef bint* _is_valid
|
||||
cdef readonly int n_moves
|
||||
cdef public int root_label
|
||||
cdef public freqs
|
||||
|
||||
cdef int initialize_state(self, StateClass state) except -1
|
||||
cdef int finalize_state(self, StateClass state) nogil
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
from cymem.cymem cimport Pool
|
||||
from ..structs cimport TokenC
|
||||
from thinc.typedefs cimport weight_t
|
||||
from collections import defaultdict
|
||||
|
||||
from ..structs cimport TokenC
|
||||
from .stateclass cimport StateClass
|
||||
from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
|
||||
|
||||
|
||||
cdef weight_t MIN_SCORE = -90000
|
||||
|
@ -28,6 +30,9 @@ cdef class TransitionSystem:
|
|||
i += 1
|
||||
self.c = moves
|
||||
self.root_label = self.strings['ROOT']
|
||||
self.freqs = {}
|
||||
for attr in (TAG, HEAD, DEP, ENT_TYPE, ENT_IOB):
|
||||
self.freqs[attr] = defaultdict(int)
|
||||
|
||||
cdef int initialize_state(self, StateClass state) except -1:
|
||||
pass
|
||||
|
|
|
@ -2,6 +2,7 @@ cimport cython
|
|||
from libc.string cimport memcpy, memset
|
||||
|
||||
import numpy
|
||||
import struct
|
||||
|
||||
from ..lexeme cimport EMPTY_LEXEME
|
||||
from ..strings cimport slice_unicode
|
||||
|
@ -16,6 +17,7 @@ from ..lexeme cimport get_attr as get_lex_attr
|
|||
from .spans import Span
|
||||
from ..structs cimport UniStr
|
||||
from .token cimport Token
|
||||
from ..serialize.bits cimport BitArray
|
||||
|
||||
|
||||
DEF PADDING = 5
|
||||
|
@ -54,7 +56,7 @@ cdef class Doc:
|
|||
Container class for annotated text. Constructed via English.__call__ or
|
||||
Tokenizer.__call__.
|
||||
"""
|
||||
def __init__(self, Vocab vocab):
|
||||
def __init__(self, Vocab vocab, orths_and_spaces=None):
|
||||
self.vocab = vocab
|
||||
size = 20
|
||||
self.mem = Pool()
|
||||
|
@ -71,24 +73,17 @@ cdef class Doc:
|
|||
self.is_tagged = False
|
||||
self.is_parsed = False
|
||||
self._py_tokens = []
|
||||
|
||||
@classmethod
|
||||
def from_ids(cls, Vocab vocab, orths, spaces):
|
||||
cdef int i
|
||||
cdef const LexemeC* lex
|
||||
cdef Doc self = cls(vocab)
|
||||
cdef bint space = 0
|
||||
cdef attr_t orth
|
||||
for i in range(len(orths)):
|
||||
orth = orths[i]
|
||||
lex = <LexemeC*>self.vocab._by_orth.get(orth)
|
||||
if lex != NULL:
|
||||
assert lex.orth == orth
|
||||
space = spaces[i]
|
||||
self.push_back(lex, space)
|
||||
else:
|
||||
raise Exception('Lexeme not found: %d' % orth)
|
||||
return self
|
||||
cdef bint space
|
||||
if orths_and_spaces is not None:
|
||||
for orth, space in orths_and_spaces:
|
||||
lex = <LexemeC*>self.vocab._by_orth.get(orth)
|
||||
if lex != NULL:
|
||||
assert lex.orth == orth
|
||||
self.push_back(lex, space)
|
||||
else:
|
||||
raise Exception('Lexeme not found: %d' % orth)
|
||||
|
||||
def __getitem__(self, object i):
|
||||
"""Get a token.
|
||||
|
@ -389,3 +384,26 @@ cdef class Doc:
|
|||
elif attr_id == ENT_TYPE:
|
||||
for i in range(length):
|
||||
tokens[i].ent_type = values[i]
|
||||
|
||||
def to_bytes(self):
|
||||
bits = self.vocab.packer.pack(self)
|
||||
return struct.pack('I', len(bits)) + bits.as_bytes()
|
||||
|
||||
@staticmethod
|
||||
def from_bytes(Vocab vocab, file_):
|
||||
keep_reading = True
|
||||
while keep_reading:
|
||||
try:
|
||||
n_bits_str = file_.read(4)
|
||||
if len(n_bits_str) < 4:
|
||||
break
|
||||
n_bits = struct.unpack('I', n_bits_str)[0]
|
||||
n_bytes = n_bits // 8 + bool(n_bits % 8)
|
||||
data = file_.read(n_bytes)
|
||||
except StopIteration:
|
||||
keep_reading = False
|
||||
bits = BitArray(data)
|
||||
array = vocab.packer.unpack(bits)
|
||||
doc = Doc(vocab, array[:, :2])
|
||||
doc.from_array(vocab.packer.attrs, array)
|
||||
yield doc
|
||||
|
|
|
@ -2,6 +2,7 @@ from os import path
|
|||
import codecs
|
||||
import json
|
||||
import re
|
||||
from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
|
||||
|
||||
DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
|
||||
|
||||
|
@ -64,7 +65,17 @@ def read_tokenization(lang):
|
|||
return entries
|
||||
|
||||
|
||||
def read_detoken_rules(lang):
|
||||
def read_encoding_freqs(data_dir):
|
||||
tags = json.load(open(path.join(data_dir, '..', 'pos', 'tag_freqs.json')))
|
||||
heads = json.load(open(path.join(data_dir, '..', 'deps', 'head_freqs.json')))
|
||||
deps = json.load(open(path.join(data_dir, '..', 'deps', 'dep_freqs.json')))
|
||||
iob = json.load(open(path.join(data_dir, '..', 'ner', 'iob_freqs.json')))
|
||||
ne_types = json.load(open(path.join(data_dir, '..', 'ner', 'ne_freqs.json')))
|
||||
return [(TAG, tags), (HEAD, heads), (DEP, deps), (ENT_IOB, iob),
|
||||
(ENT_TYPE, ne_types)]
|
||||
|
||||
|
||||
def read_detoken_rules(lang): # Deprecated?
|
||||
loc = path.join(DATA_DIR, lang, 'detokenize')
|
||||
entries = []
|
||||
with utf8open(loc) as file_:
|
||||
|
@ -73,7 +84,7 @@ def read_detoken_rules(lang):
|
|||
return entries
|
||||
|
||||
|
||||
def align_tokens(ref, indices):
|
||||
def align_tokens(ref, indices): # Deprecated, surely?
|
||||
start = 0
|
||||
queue = list(indices)
|
||||
for token in ref:
|
||||
|
@ -86,7 +97,7 @@ def align_tokens(ref, indices):
|
|||
assert not queue
|
||||
|
||||
|
||||
def detokenize(token_rules, words):
|
||||
def detokenize(token_rules, words): # Deprecated?
|
||||
"""To align with treebanks, return a list of "chunks", where a chunk is a
|
||||
sequence of tokens that are separated by whitespace in actual strings. Each
|
||||
chunk should be a tuple of token indices, e.g.
|
||||
|
|
|
@ -29,6 +29,7 @@ cdef class Vocab:
|
|||
cpdef readonly StringStore strings
|
||||
cdef readonly object pos_tags
|
||||
cdef readonly int length
|
||||
cdef public object packer
|
||||
|
||||
cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL
|
||||
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
|
||||
|
|
|
@ -16,6 +16,8 @@ from .orth cimport word_shape
|
|||
from .typedefs cimport attr_t
|
||||
|
||||
from cymem.cymem cimport Address
|
||||
from . import util
|
||||
from .serialize.packer cimport Packer
|
||||
|
||||
|
||||
DEF MAX_VEC_SIZE = 100000
|
||||
|
@ -53,6 +55,8 @@ cdef class Vocab:
|
|||
if load_vectors and path.exists(path.join(data_dir, 'vec.bin')):
|
||||
self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
|
||||
|
||||
self.packer = Packer(self, util.read_encoding_freqs(data_dir))
|
||||
|
||||
def __len__(self):
|
||||
"""The current number of lexemes stored."""
|
||||
return self.length
|
||||
|
|
|
@ -5,7 +5,6 @@ import numpy
|
|||
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.serialize.packer import _BinaryCodec
|
||||
from spacy.serialize.packer import make_vocab_codec
|
||||
from spacy.serialize.packer import _AttributeCodec
|
||||
from spacy.serialize.bits import BitArray
|
||||
|
||||
|
|
Loading…
Reference in New Issue