From 37919eac828bdd65102a28fd75148a707fdf5346 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 13 Oct 2015 18:23:24 +1100 Subject: [PATCH 1/3] * Fix whitespace attachment in simpler way. Leaves problem with setting left/right children. --- spacy/syntax/arc_eager.pyx | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 30726974a..561308928 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -384,10 +384,7 @@ cdef class ArcEager(TransitionSystem): for i in range(st.length): # Always attach spaces to the previous word if Lexeme.c_check_flag(st._sent[i].lex, IS_SPACE): - if i >= 1: - st.add_arc(i-1, i, st._sent[i].dep) - else: - st.add_arc(i+1, i, st._sent[i].dep) + st._sent[i].head = -1 if (i >= 1) else 1 if st._sent[i].sent_start and st._sent[i].head == -1: st._sent[i].sent_start = False # If we had this space token as the start of a sentence, From 329ae57520d2f6ea7430fda0f6e8afd3975f23d3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 13 Oct 2015 09:46:38 +0200 Subject: [PATCH 2/3] * Fix whitespace attachment thing --- spacy/syntax/arc_eager.pyx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 561308928..a7ed8874c 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -384,13 +384,16 @@ cdef class ArcEager(TransitionSystem): for i in range(st.length): # Always attach spaces to the previous word if Lexeme.c_check_flag(st._sent[i].lex, IS_SPACE): - st._sent[i].head = -1 if (i >= 1) else 1 if st._sent[i].sent_start and st._sent[i].head == -1: st._sent[i].sent_start = False # If we had this space token as the start of a sentence, # move that sentence start forward one if (i + 1) < st.length and not st._sent[i+1].sent_start: st._sent[i+1].sent_start = True + if i >= 1: + st.add_arc(i-1, i, st._sent[i].dep) + else: + st.add_arc(i+1, i, st._sent[i].dep) elif st._sent[i].head == 0 and st._sent[i].dep == 0: st._sent[i].dep = self.root_label # If we're not using the Break transition, we segment via root-labelled From 7673e3a32cc06b668a56053ac18361c4a1e94671 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 13 Oct 2015 18:50:39 +1100 Subject: [PATCH 3/3] * Fix test that was failing on travis --- tests/test_basic_load.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/tests/test_basic_load.py b/tests/test_basic_load.py index eb7adbe97..233ddd848 100644 --- a/tests/test_basic_load.py +++ b/tests/test_basic_load.py @@ -25,28 +25,34 @@ from thinc.learner import LinearModel class TestLoadVocab(unittest.TestCase): def test_load(self): - vocab = Vocab.from_dir(path.join(English.default_data_dir(), 'vocab')) + if path.exists(path.join(English.default_data_dir(), 'vocab')): + vocab = Vocab.from_dir(path.join(English.default_data_dir(), 'vocab')) class TestLoadTokenizer(unittest.TestCase): def test_load(self): data_dir = English.default_data_dir() - vocab = Vocab.from_dir(path.join(data_dir, 'vocab')) - tokenizer = Tokenizer.from_dir(vocab, path.join(data_dir, 'tokenizer')) + if path.exists(path.join(data_dir, 'vocab')): + vocab = Vocab.from_dir(path.join(data_dir, 'vocab')) + tokenizer = Tokenizer.from_dir(vocab, path.join(data_dir, 'tokenizer')) class TestLoadTagger(unittest.TestCase): def test_load(self): data_dir = English.default_data_dir() - vocab = Vocab.from_dir(path.join(data_dir, 'vocab')) - tagger = Tagger.from_dir(path.join(data_dir, 'tagger'), vocab) + + if path.exists(path.join(data_dir, 'vocab')): + vocab = Vocab.from_dir(path.join(data_dir, 'vocab')) + tagger = Tagger.from_dir(path.join(data_dir, 'tagger'), vocab) class TestLoadParser(unittest.TestCase): def test_load(self): data_dir = English.default_data_dir() - vocab = Vocab.from_dir(path.join(data_dir, 'vocab')) - parser = Parser.from_dir(path.join(data_dir, 'deps'), vocab.strings, ArcEager) + if path.exists(path.join(data_dir, 'vocab')): + vocab = Vocab.from_dir(path.join(data_dir, 'vocab')) + if path.exists(path.join(data_dir, 'deps')): + parser = Parser.from_dir(path.join(data_dir, 'deps'), vocab.strings, ArcEager) def test_load_careful(self): config_data = {"labels": {"0": {"": True}, "1": {"": True}, "2": {"cc": True, "agent": True, "ccomp": True, "prt": True, "meta": True, "nsubjpass": True, "csubj": True, "conj": True, "dobj": True, "neg": True, "csubjpass": True, "mark": True, "auxpass": True, "advcl": True, "aux": True, "ROOT": True, "prep": True, "parataxis": True, "xcomp": True, "nsubj": True, "nummod": True, "advmod": True, "punct": True, "relcl": True, "quantmod": True, "acomp": True, "compound": True, "pcomp": True, "intj": True, "poss": True, "npadvmod": True, "case": True, "attr": True, "dep": True, "appos": True, "det": True, "nmod": True, "amod": True, "dative": True, "pobj": True, "expl": True, "predet": True, "preconj": True, "oprd": True, "acl": True}, "3": {"cc": True, "agent": True, "ccomp": True, "prt": True, "meta": True, "nsubjpass": True, "csubj": True, "conj": True, "acl": True, "poss": True, "neg": True, "mark": True, "auxpass": True, "advcl": True, "aux": True, "amod": True, "ROOT": True, "prep": True, "parataxis": True, "xcomp": True, "nsubj": True, "nummod": True, "advmod": True, "punct": True, "quantmod": True, "acomp": True, "pcomp": True, "intj": True, "relcl": True, "npadvmod": True, "case": True, "attr": True, "dep": True, "appos": True, "det": True, "nmod": True, "dobj": True, "dative": True, "pobj": True, "iobj": True, "expl": True, "predet": True, "preconj": True, "oprd": True}, "4": {"ROOT": True}}, "seed": 0, "features": "basic", "beam_width": 1} @@ -67,8 +73,9 @@ class TestLoadParser(unittest.TestCase): # n classes. moves.n_moves above # n features. len(templates) + 1 above - model = LinearModel(92, 116) - model.load(model_loc) + if path.exists(model_loc): + model = LinearModel(92, 116) + model.load(model_loc) if __name__ == '__main__':