diff --git a/bin/parser/train.py b/bin/parser/train.py
index abd5eb16e..497ecd6b1 100755
--- a/bin/parser/train.py
+++ b/bin/parser/train.py
@@ -85,12 +85,16 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
           use_orig_arc_eager=False):
     dep_model_dir = path.join(model_dir, 'deps')
     ner_model_dir = path.join(model_dir, 'ner')
+    pos_model_dir = path.join(model_dir, 'pos')
     if path.exists(dep_model_dir):
         shutil.rmtree(dep_model_dir)
     if path.exists(ner_model_dir):
         shutil.rmtree(ner_model_dir)
+    if path.exists(pos_model_dir):
+        shutil.rmtree(pos_model_dir)
     os.mkdir(dep_model_dir)
     os.mkdir(ner_model_dir)
+    os.mkdir(pos_model_dir)
 
     Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                  labels=ArcEager.get_labels(gold_tuples),
@@ -140,7 +144,9 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
         print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
                                                    scorer.ents_f, scorer.tags_acc,
                                                    scorer.token_acc))
+    print('end training')
     nlp.end_training(model_dir)
+    print('done')
 
 def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
              beam_width=None):
diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index 5e08e80a4..dc8c2b03b 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -16,9 +16,9 @@ class Lemmatizer(object):
         index = {}
         exc = {}
         for pos in ['adj', 'adv', 'noun', 'verb']:
-            index[pos] = read_index(path.join(data_dir, 'index.%s' % pos))
-            exc[pos] = read_exc(path.join(data_dir, '%s.exc' % pos))
-        rules = json.load(open(path.join(data_dir, 'lemma_rules.json')))
+            index[pos] = read_index(path.join(data_dir, 'wordnet', 'index.%s' % pos))
+            exc[pos] = read_exc(path.join(data_dir, 'wordnet', '%s.exc' % pos))
+        rules = json.load(open(path.join(data_dir, 'vocab', 'lemma_rules.json')))
         return cls(index, exc, rules)
 
     def __init__(self, index, exceptions, rules):
@@ -33,10 +33,8 @@ class Lemmatizer(object):
             pos = 'verb'
         elif pos == ADJ:
             pos = 'adj'
-        else:
-            return string
         lemmas = lemmatize(string, self.index[pos], self.exc[pos], self.rules.get(pos, []))
-        return min(lemmas)
+        return lemmas
 
     def noun(self, string):
         return self(string, 'noun')
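Note on the lemmatizer hunks: `from_dir` now resolves the WordNet tables under a `wordnet/` subdirectory and the rule file under `vocab/`, and `__call__` returns the full candidate set where it previously returned `min(lemmas)`. With the `else: return string` branch gone, a part-of-speech outside noun/verb/adj also appears to fall through to `self.index[pos]` and raise `KeyError` rather than echoing the input back. A minimal usage sketch of the revised API, assuming `lemmatize` returns a set of candidate strings (implied by the diff, not shown in it):

    from spacy.lemmatizer import Lemmatizer
    from spacy.en import LOCAL_DATA_DIR

    lemmatizer = Lemmatizer.from_dir(LOCAL_DATA_DIR)
    # Callers that relied on a single string now get every candidate:
    lemmas = lemmatizer.noun('dogs')   # e.g. set(['dog'])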
diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index b8a45d469..88a4f9ba2 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -54,8 +54,6 @@ cdef int match(const Pattern* pattern, const TokenC* token) except -1:
     cdef int i
     for i in range(pattern.length):
         if get_token_attr(token, pattern.spec[i].attr) != pattern.spec[i].value:
-            print "Pattern fail", pattern.spec[i].attr, pattern.spec[i].value
-            print get_token_attr(token, pattern.spec[i].attr)
             return False
     return True
 
@@ -82,7 +80,6 @@ def _convert_strings(token_specs, string_store):
             if isinstance(value, bool):
                 value = int(value)
             converted[-1].append((attr, value))
-    print "Converted", converted[-1]
     return converted
 
 
@@ -175,13 +172,11 @@ cdef class Matcher:
         cdef Pattern* state
         matches = []
         for token_i in range(doc.length):
-            print 'check', doc[token_i].orth_
             token = &doc.data[token_i]
             q = 0
             for i in range(partials.size()):
                 state = partials.at(i)
                 if match(state, token):
-                    print 'match!'
                     if is_final(state):
                         matches.append(get_entity(state, token, token_i))
                     else:
@@ -191,7 +186,6 @@
             for i in range(self.n_patterns):
                 state = self.patterns[i]
                 if match(state, token):
-                    print 'match!'
                     if is_final(state):
                         matches.append(get_entity(state, token, token_i))
                     else:
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 2ce484d7b..2cc5552f9 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -43,6 +43,7 @@ cdef class Morphology:
             analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
             analysis.tag = self.rich_tags[tag_id]
             analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth)
+            self._cache.set(tag_id, token.lex.orth, analysis)
         token.lemma = analysis.lemma
         token.pos = analysis.tag.pos
         token.tag = analysis.tag.name
diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx
index 6f7951987..81c31be97 100644
--- a/spacy/syntax/stateclass.pyx
+++ b/spacy/syntax/stateclass.pyx
@@ -16,12 +16,11 @@ cdef class StateClass:
         cdef int i
         for i in range(length + (PADDING * 2)):
             self._ents[i].end = -1
+            self._sent[i].l_edge = i
+            self._sent[i].r_edge = i
         for i in range(length, length + (PADDING * 2)):
             self._sent[i].lex = &EMPTY_LEXEME
         self._sent += PADDING
-        for i in range(length):
-            self._sent[i].l_edge = i
-            self._sent[i].r_edge = i
         self._ents += PADDING
         self._buffer += PADDING
         self._stack += PADDING
@@ -162,11 +161,11 @@ cdef class StateClass:
         cdef int dist = h_i - c_i
         cdef TokenC* h = &self._sent[h_i]
         if c_i > h_i:
+            h.r_edge = self.R_(h_i, 2).r_edge if h.r_kids >= 2 else h_i
             h.r_kids -= 1
-            h.r_edge = self.R_(h_i, 2).r_edge if h.r_kids >= 1 else h_i
         else:
+            h.l_edge = self.L_(h_i, 2).l_edge if h.l_kids >= 2 else h_i
             h.l_kids -= 1
-            h.l_edge = self.L_(h_i, 2).l_edge if h.l_kids >= 1 else h_i
 
     cdef void open_ent(self, int label) nogil:
         self._ents[self._e_i].start = self.B(0)
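Note on the `del_arc` hunks: the edge update now runs *before* the kid counter is decremented, so `R_(h_i, 2)`/`L_(h_i, 2)` presumably look up the second-outermost child while the departing child is still counted, and the guard changes from `>= 1` to `>= 2` to match the not-yet-decremented count. A toy model of the right-edge fallback in plain Python, with illustrative names (the real code mutates `TokenC` structs in place):

    def del_rightmost_arc(head, right_children):
        # right_children: the head's right children, innermost first
        right_children.pop()  # the deleted arc went to the rightmost child
        if right_children:
            # the new rightmost child's own right edge takes over
            head['r_edge'] = right_children[-1]['r_edge']
        else:
            head['r_edge'] = head['i']  # no right children left

    head = {'i': 3, 'r_edge': 7}
    kids = [{'i': 5, 'r_edge': 5}, {'i': 6, 'r_edge': 7}]
    del_rightmost_arc(head, kids)
    assert head['r_edge'] == 5

The initialisation hunk likewise seeds `l_edge`/`r_edge` to each token's own index across the whole padded array, matching the `l_edge <= i <= r_edge` asserts added to `Doc.set_parse` below.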
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 1f687b046..16f0d2f46 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -67,6 +67,8 @@ cdef class Doc:
         cdef int i
         for i in range(size + (PADDING*2)):
             data_start[i].lex = &EMPTY_LEXEME
+            data_start[i].l_edge = i
+            data_start[i].r_edge = i
         self.data = data_start + PADDING
         self.max_length = size
         self.length = 0
@@ -219,6 +221,8 @@ cdef class Doc:
             t.idx = 0
         else:
             t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy
+        t.l_edge = self.length
+        t.r_edge = self.length
         assert t.lex.orth != 0
         t.spacy = has_space
         self.length += 1
@@ -310,6 +314,8 @@ cdef class Doc:
         self.is_parsed = True
         for i in range(self.length):
             self.data[i] = parsed[i]
+            assert self.data[i].l_edge <= i
+            assert self.data[i].r_edge >= i
 
     def from_array(self, attrs, array):
         cdef int i, col
@@ -396,7 +402,7 @@ cdef class Doc:
         cdef TokenC* token = &self.data[start]
         # Update fields
         token.lex = lex
-        token.spacy = self.data[end].spacy
+        token.spacy = self.data[end-1].spacy
         # What to do about morphology??
         # TODO: token.morph = ???
         token.tag = self.vocab.strings[tag]
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 5307f0fe8..939ea9db3 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -117,7 +117,9 @@ cdef class Vocab:
 
     cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
         cdef hash_t key
-        cdef bint is_oov = mem is not self.mem
+        #cdef bint is_oov = mem is not self.mem
+        # TODO
+        is_oov = False
         mem = self.mem
         if len(string) < 3:
             mem = self.mem
@@ -224,19 +226,17 @@
             raise IOError('LexemeCs file not found at %s' % loc)
         fp = CFile(loc, 'rb')
         cdef LexemeC* lexeme
-        cdef attr_t orth
         cdef hash_t key
         cdef unicode py_str
-        cdef uint64_t bad_bytes
+        cdef attr_t orth
+        assert sizeof(orth) == sizeof(lexeme.orth)
         i = 0
         while True:
-            lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
             try:
                 fp.read_into(&orth, 1, sizeof(orth))
             except IOError:
                 break
-            # This 64 bit chunk is there for backwards compatibility. Remove on next release.
-            fp.read_into(&bad_bytes, 1, sizeof(bad_bytes))
+            lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
             # Copy data from the file into the lexeme
             fp.read_into(&lexeme.flags, 1, sizeof(lexeme.flags))
             fp.read_into(&lexeme.id, 1, sizeof(lexeme.id))
@@ -253,10 +253,8 @@
             fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm))
             lexeme.repvec = EMPTY_VEC
 
-            if orth != lexeme.orth:
-                # TODO: Improve this error message, pending resolution to Issue #64
-                raise IOError('Error reading from lexemes.bin. Integrity check fails.')
-            py_str = self.strings[orth]
+            py_str = self.strings[lexeme.orth]
+            assert py_str[-3:] == self.strings[lexeme.suffix], "%s (%d) suffix %s (%d)" % (repr(py_str), lexeme.orth, repr(self.strings[lexeme.suffix]), lexeme.suffix)
             key = hash_string(py_str)
             self._by_hash.set(key, lexeme)
             self._by_orth.set(lexeme.orth, lexeme)
diff --git a/tests/tagger/test_lemmatizer.py b/tests/tagger/test_lemmatizer.py
index 5a6a8fc62..8461a854e 100644
--- a/tests/tagger/test_lemmatizer.py
+++ b/tests/tagger/test_lemmatizer.py
@@ -1,6 +1,6 @@
 from __future__ import unicode_literals
 
-from spacy.en.lemmatizer import Lemmatizer, read_index, read_exc
+from spacy.lemmatizer import Lemmatizer, read_index, read_exc
 from spacy.en import LOCAL_DATA_DIR
 from os import path
 
@@ -23,7 +23,7 @@ def test_read_exc():
 
 @pytest.fixture
 def lemmatizer():
-    return Lemmatizer(path.join(LOCAL_DATA_DIR, 'wordnet'), 0, 0, 0)
+    return Lemmatizer.from_dir(path.join(LOCAL_DATA_DIR))
 
 
 def test_noun_lemmas(lemmatizer):
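One easy-to-miss fix above: `Doc.merge` now takes the trailing-whitespace flag from `self.data[end-1]`. Since `end` appears to be an exclusive bound in this hunk, `data[end]` is the first token *after* the span, so the merged token inherited the wrong `spacy` flag. A hypothetical list-of-dicts model of the difference:

    tokens = [{'text': 'New',  'spacy': True},   # "New " has a trailing space
              {'text': 'York', 'spacy': True},   # "York " has a trailing space
              {'text': 'is',   'spacy': False}]
    start, end = 0, 2                      # merge tokens[start:end] -> "New York"
    merged = tokens[end - 1]['spacy']      # True: the span keeps York's flag
    # the old tokens[end]['spacy'] read the flag of "is" instead -> False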