From b2e82e55f699af434823ddb825e12d2b78150ee3 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 8 Sep 2015 15:36:23 +0200
Subject: [PATCH 01/11] * Create POS model dir in training script

---
 bin/parser/train.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/bin/parser/train.py b/bin/parser/train.py
index abd5eb16e..497ecd6b1 100755
--- a/bin/parser/train.py
+++ b/bin/parser/train.py
@@ -85,12 +85,16 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
           use_orig_arc_eager=False):
     dep_model_dir = path.join(model_dir, 'deps')
     ner_model_dir = path.join(model_dir, 'ner')
+    pos_model_dir = path.join(model_dir, 'pos')
     if path.exists(dep_model_dir):
         shutil.rmtree(dep_model_dir)
     if path.exists(ner_model_dir):
         shutil.rmtree(ner_model_dir)
+    if path.exists(pos_model_dir):
+        shutil.rmtree(pos_model_dir)
     os.mkdir(dep_model_dir)
     os.mkdir(ner_model_dir)
+    os.mkdir(pos_model_dir)
     Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                  labels=ArcEager.get_labels(gold_tuples),
@@ -140,7 +144,9 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
         print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
                                                    scorer.tags_acc, scorer.token_acc))
+    print('end training')
     nlp.end_training(model_dir)
+    print('done')
 
 
 def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
              beam_width=None):

From 64d71f8893eb008d3661c3abda515939323f55a9 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 8 Sep 2015 15:38:03 +0200
Subject: [PATCH 02/11] * Fix lemmatizer

---
 spacy/lemmatizer.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index 5e08e80a4..dc8c2b03b 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -16,9 +16,9 @@ class Lemmatizer(object):
         index = {}
         exc = {}
         for pos in ['adj', 'adv', 'noun', 'verb']:
-            index[pos] = read_index(path.join(data_dir, 'index.%s' % pos))
-            exc[pos] = read_exc(path.join(data_dir, '%s.exc' % pos))
-        rules = json.load(open(path.join(data_dir, 'lemma_rules.json')))
+            index[pos] = read_index(path.join(data_dir, 'wordnet', 'index.%s' % pos))
+            exc[pos] = read_exc(path.join(data_dir, 'wordnet', '%s.exc' % pos))
+        rules = json.load(open(path.join(data_dir, 'vocab', 'lemma_rules.json')))
         return cls(index, exc, rules)
 
     def __init__(self, index, exceptions, rules):
@@ -33,10 +33,8 @@ class Lemmatizer(object):
             pos = 'verb'
         elif pos == ADJ:
             pos = 'adj'
-        else:
-            return string
         lemmas = lemmatize(string, self.index[pos], self.exc[pos], self.rules.get(pos, []))
-        return min(lemmas)
+        return lemmas
 
     def noun(self, string):
         return self(string, 'noun')

From 1def5a6cbe3298767a7a7e85d242dde9a8aa480d Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 8 Sep 2015 15:38:19 +0200
Subject: [PATCH 03/11] * Fix print statements in matcher

---
 spacy/matcher.pyx | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index b8a45d469..88a4f9ba2 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -54,8 +54,6 @@ cdef int match(const Pattern* pattern, const TokenC* token) except -1:
     cdef int i
     for i in range(pattern.length):
         if get_token_attr(token, pattern.spec[i].attr) != pattern.spec[i].value:
-            print "Pattern fail", pattern.spec[i].attr, pattern.spec[i].value
-            print get_token_attr(token, pattern.spec[i].attr)
             return False
     return True
 
@@ -82,7 +80,6 @@ def _convert_strings(token_specs, string_store):
             if isinstance(value, bool):
                 value = int(value)
             converted[-1].append((attr, value))
-        print "Converted", converted[-1]
     return converted
 
 
@@ -175,13 +172,11 @@ cdef class Matcher:
         cdef Pattern* state
         matches = []
         for token_i in range(doc.length):
-            print 'check', doc[token_i].orth_
             token = &doc.data[token_i]
             q = 0
             for i in range(partials.size()):
                 state = partials.at(i)
                 if match(state, token):
-                    print 'match!'
                     if is_final(state):
                         matches.append(get_entity(state, token, token_i))
                     else:
@@ -191,7 +186,6 @@ cdef class Matcher:
             for i in range(self.n_patterns):
                 state = self.patterns[i]
                 if match(state, token):
-                    print 'match!'
                     if is_final(state):
                         matches.append(get_entity(state, token, token_i))
                     else:

From 2be362033348a6c46a1f23e6d48e25c78cfc60eb Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 8 Sep 2015 15:39:24 +0200
Subject: [PATCH 04/11] * Save morphological analyses in a cache

---
 spacy/morphology.pyx | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index fc6a4936b..77d7bd2df 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -42,6 +42,7 @@ cdef class Morphology:
             analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
             analysis.tag = self.rich_tags[tag_id]
             analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth)
+            self._cache.set(tag_id, token.lex.orth, analysis)
         token.lemma = analysis.lemma
         token.pos = analysis.tag.pos
         token.tag = analysis.tag.name

From 83d1a1e51249d80d188af624ab6c3c8f3501e24d Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 8 Sep 2015 15:39:43 +0200
Subject: [PATCH 05/11] * Fix lemmatizer tests

---
 tests/tagger/test_lemmatizer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/tagger/test_lemmatizer.py b/tests/tagger/test_lemmatizer.py
index 5a6a8fc62..8461a854e 100644
--- a/tests/tagger/test_lemmatizer.py
+++ b/tests/tagger/test_lemmatizer.py
@@ -1,6 +1,6 @@
 from __future__ import unicode_literals
 
-from spacy.en.lemmatizer import Lemmatizer, read_index, read_exc
+from spacy.lemmatizer import Lemmatizer, read_index, read_exc
 from spacy.en import LOCAL_DATA_DIR
 from os import path
 
@@ -23,7 +23,7 @@ def test_read_exc():
 
 @pytest.fixture
 def lemmatizer():
-    return Lemmatizer(path.join(LOCAL_DATA_DIR, 'wordnet'), 0, 0, 0)
+    return Lemmatizer.from_dir(path.join(LOCAL_DATA_DIR))
 
 
 def test_noun_lemmas(lemmatizer):

From 0e24d099a15dae3f96210a6bb8b93ddb9b2ea51d Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 9 Sep 2015 03:39:46 +0200
Subject: [PATCH 06/11] * Fix L/R edge bug, by ensuring l_edge and r_edge are
 pre-set, and fixing the way the edges update in del_arc. Bugs keep arising
 here because the edges are absolute positions, whereas everything else is
 relative. I'm also not 100% convinced that del_arc is handled correctly. Do
 we need to update the parents?
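
[Editor's note: this commit message is the one place the series explains its
reasoning, so here is a minimal Python sketch of the invariant involved. The
helper names are illustrative only, not spaCy's API; in spaCy, l_edge and
r_edge live on each TokenC struct as absolute token indices.]

    def init_edges(n):
        # Every token starts out as its own leftmost/rightmost descendant,
        # stored as an absolute position. This is what the initialisation
        # hunks below guarantee for StateClass and Doc.
        return [[i, i] for i in range(n)]  # [l_edge, r_edge] per token

    def add_arc(edges, head, child):
        # Attaching a child can only widen the head's span.
        edges[head][0] = min(edges[head][0], edges[child][0])
        edges[head][1] = max(edges[head][1], edges[child][1])

    # Deleting an arc is the awkward case: the head's edge must be recomputed
    # from the children that remain. Hence the reordering in the del_arc hunk
    # below: read the second-outermost child *before* decrementing
    # l_kids/r_kids, so the count being tested agrees with the child slots
    # being inspected.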
---
 spacy/syntax/stateclass.pyx | 9 ++++-----
 spacy/tokens/doc.pyx        | 6 ++++++
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx
index 6f7951987..81c31be97 100644
--- a/spacy/syntax/stateclass.pyx
+++ b/spacy/syntax/stateclass.pyx
@@ -16,12 +16,11 @@ cdef class StateClass:
         cdef int i
         for i in range(length + (PADDING * 2)):
             self._ents[i].end = -1
+            self._sent[i].l_edge = i
+            self._sent[i].r_edge = i
         for i in range(length, length + (PADDING * 2)):
             self._sent[i].lex = &EMPTY_LEXEME
         self._sent += PADDING
-        for i in range(length):
-            self._sent[i].l_edge = i
-            self._sent[i].r_edge = i
         self._ents += PADDING
         self._buffer += PADDING
         self._stack += PADDING
@@ -162,11 +161,11 @@ cdef class StateClass:
         cdef int dist = h_i - c_i
         cdef TokenC* h = &self._sent[h_i]
         if c_i > h_i:
+            h.r_edge = self.R_(h_i, 2).r_edge if h.r_kids >= 2 else h_i
             h.r_kids -= 1
-            h.r_edge = self.R_(h_i, 2).r_edge if h.r_kids >= 1 else h_i
         else:
+            h.l_edge = self.L_(h_i, 2).l_edge if h.l_kids >= 2 else h_i
             h.l_kids -= 1
-            h.l_edge = self.L_(h_i, 2).l_edge if h.l_kids >= 1 else h_i
 
     cdef void open_ent(self, int label) nogil:
         self._ents[self._e_i].start = self.B(0)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 41d24d8ac..ccde5d599 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -67,6 +67,8 @@ cdef class Doc:
         cdef int i
         for i in range(size + (PADDING*2)):
             data_start[i].lex = &EMPTY_LEXEME
+            data_start[i].l_edge = i
+            data_start[i].r_edge = i
         self.data = data_start + PADDING
         self.max_length = size
         self.length = 0
@@ -219,6 +221,8 @@ cdef class Doc:
             t.idx = 0
         else:
             t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy
+        t.l_edge = self.length
+        t.r_edge = self.length
         assert t.lex.orth != 0
         t.spacy = has_space
         self.length += 1
@@ -310,6 +314,8 @@ cdef class Doc:
         self.is_parsed = True
         for i in range(self.length):
             self.data[i] = parsed[i]
+            assert self.data[i].l_edge <= i
+            assert self.data[i].r_edge >= i
 
     def from_array(self, attrs, array):
         cdef int i, col

From d6561988cf7e71f680eb2843a4ab6efb3cd49b79 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 9 Sep 2015 11:49:51 +0200
Subject: [PATCH 07/11] * Fix lexemes.bin

---
 spacy/vocab.pyx | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index de0557c95..5da29439b 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -222,7 +222,7 @@ cdef class Vocab:
         cdef attr_t orth
         cdef hash_t key
         cdef unicode py_str
-        cdef uint64_t bad_bytes
+        assert sizeof(orth) == sizeof(lexeme.orth)
         i = 0
         while True:
             lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
             try:
                 fp.read_into(&orth, 1, sizeof(orth))
             except IOError:
                 break
-            # This 64 bit chunk is there for backwards compatibility. Remove on next release.
-            fp.read_into(&bad_bytes, 1, sizeof(bad_bytes))
             # Copy data from the file into the lexeme
             fp.read_into(&lexeme.flags, 1, sizeof(lexeme.flags))
             fp.read_into(&lexeme.id, 1, sizeof(lexeme.id))

From a7f4b26c8ca9cc4c5bd7a03aa57e06b053f34ce9 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 9 Sep 2015 14:33:26 +0200
Subject: [PATCH 08/11] * Tmp

---
 spacy/vocab.pyx | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 5da29439b..571a37da9 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -112,7 +112,9 @@ cdef class Vocab:
     cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
         cdef hash_t key
-        cdef bint is_oov = mem is not self.mem
+        #cdef bint is_oov = mem is not self.mem
+        # TODO
+        is_oov = False
         mem = self.mem
         if len(string) < 3:
             mem = self.mem
@@ -197,7 +199,6 @@ cdef class Vocab:
         cdef hash_t key
         for key, addr in self._by_hash.items():
             lexeme = <LexemeC*>addr
-            fp.write_from(&lexeme.orth, sizeof(lexeme.orth), 1)
             fp.write_from(&lexeme.flags, sizeof(lexeme.flags), 1)
             fp.write_from(&lexeme.id, sizeof(lexeme.flags), 1)
             fp.write_from(&lexeme.length, sizeof(lexeme.length), 1)
             fp.write_from(&lexeme.orth, sizeof(lexeme.orth), 1)
             fp.write_from(&lexeme.lower, sizeof(lexeme.lower), 1)
@@ -219,17 +220,17 @@
             raise IOError('LexemeCs file not found at %s' % loc)
         fp = CFile(loc, 'rb')
         cdef LexemeC* lexeme
-        cdef attr_t orth
         cdef hash_t key
         cdef unicode py_str
+        cdef attr_t orth
         assert sizeof(orth) == sizeof(lexeme.orth)
         i = 0
         while True:
-            lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
             try:
                 fp.read_into(&orth, 1, sizeof(orth))
             except IOError:
                 break
+            lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
             # Copy data from the file into the lexeme
             fp.read_into(&lexeme.flags, 1, sizeof(lexeme.flags))
             fp.read_into(&lexeme.id, 1, sizeof(lexeme.id))
@@ -246,10 +247,8 @@ cdef class Vocab:
             fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm))
             lexeme.repvec = EMPTY_VEC
-            if orth != lexeme.orth:
-                # TODO: Improve this error message, pending resolution to Issue #64
-                raise IOError('Error reading from lexemes.bin. Integrity check fails.')
-            py_str = self.strings[orth]
+            py_str = self.strings[lexeme.orth]
+            assert py_str[-3:] == self.strings[lexeme.suffix], "%s (%d) suffix %s (%d)" % (repr(py_str), lexeme.orth, repr(self.strings[lexeme.suffix]), lexeme.suffix)
             key = hash_string(py_str)
             self._by_hash.set(key, lexeme)
             self._by_orth.set(lexeme.orth, lexeme)

From f634191e2795ac1952b6bb9e32399efe8b99faa8 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 10 Sep 2015 14:44:38 +0200
Subject: [PATCH 09/11] * Fix vocab read/write

---
 spacy/vocab.pyx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 571a37da9..bac55e320 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -199,8 +199,9 @@ cdef class Vocab:
         cdef hash_t key
         for key, addr in self._by_hash.items():
             lexeme = <LexemeC*>addr
+            fp.write_from(&lexeme.orth, sizeof(lexeme.orth), 1)
             fp.write_from(&lexeme.flags, sizeof(lexeme.flags), 1)
-            fp.write_from(&lexeme.id, sizeof(lexeme.flags), 1)
+            fp.write_from(&lexeme.id, sizeof(lexeme.id), 1)
             fp.write_from(&lexeme.length, sizeof(lexeme.length), 1)
             fp.write_from(&lexeme.orth, sizeof(lexeme.orth), 1)
             fp.write_from(&lexeme.lower, sizeof(lexeme.lower), 1)

From 9e7bfe84493bb6a8e0c64f7a144844f25f3765e6 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 10 Sep 2015 14:45:17 +0200
Subject: [PATCH 10/11] * Fix space at end of merged token

---
 spacy/tokens/doc.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index d878f97eb..16f0d2f46 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -402,7 +402,7 @@ cdef class Doc:
         cdef TokenC* token = &self.data[start]
         # Update fields
         token.lex = lex
-        token.spacy = self.data[end].spacy
+        token.spacy = self.data[end-1].spacy
         # What to do about morphology??
         # TODO: token.morph = ???
         token.tag = self.vocab.strings[tag]

From e7e529edf422a82f2d3154d74e4966e4f522120c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 10 Sep 2015 14:45:43 +0200
Subject: [PATCH 11/11] * Fix Lexeme.check_flag

---
 spacy/lexeme.pxd | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 63280155c..75cac871c 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -72,7 +72,8 @@ cdef class Lexeme:
     @staticmethod
    cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
-        return lexeme.flags & (1 << flag_id)
+        cdef flags_t one = 1
+        return lexeme.flags & (one << flag_id)
 
     @staticmethod
     cdef inline bint set_flag(LexemeC* lex, attr_id_t flag_id, int value) nogil:
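
[Editor's note: PATCH 07-09 all chase corruption in lexemes.bin, which is a
flat stream of fixed-width binary fields with the orth value doubling as a
record marker. A minimal Python sketch of the failure mode PATCH 09 fixes;
the field names and widths here are illustrative, not spaCy's actual layout:]

    import struct

    def write_lexeme(fp, orth, flags, length):
        fp.write(struct.pack('<I', orth))    # 4-byte marker/key
        fp.write(struct.pack('<Q', flags))   # 8-byte flag bits
        fp.write(struct.pack('<I', length))  # 4 bytes: must match the reader

    def read_lexeme(fp):
        # The reader consumes exactly the widths the writer produced. Writing
        # lexeme.id with sizeof(lexeme.flags), as the pre-PATCH-09 code did,
        # desynchronises every field and every record that follows it.
        orth, = struct.unpack('<I', fp.read(4))
        flags, = struct.unpack('<Q', fp.read(8))
        length, = struct.unpack('<I', fp.read(4))
        return orth, flags, length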
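
[Editor's note: the check_flag fix in PATCH 11 is a classic shift-width bug.
In C, and hence in Cython, the literal 1 is a plain int, so `1 << flag_id` is
undefined once flag_id reaches the width of int (usually 32 bits); the patch
widens the shifted value to the flags type first. A pure-Python illustration,
simulating the two widths with masks and assuming flags_t is a 64-bit type:]

    flag_id = 40

    # 32-bit shift: the bit falls off the top, so the flag never tests true.
    bit_32 = (1 << flag_id) & 0xFFFFFFFF
    # 64-bit shift, as after the patch: the bit survives.
    bit_64 = (1 << flag_id) & 0xFFFFFFFFFFFFFFFF

    assert bit_32 == 0
    assert bit_64 == 2 ** 40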