From a676d668070cc3a23de30db99e55ee5f7593f515 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 2 Feb 2016 22:29:34 +0100
Subject: [PATCH 1/9] * Update the CoNLL train script, to get working on other
 languages

---
 bin/parser/conll_train.py | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/bin/parser/conll_train.py b/bin/parser/conll_train.py
index 40c7b71f8..da9bb807a 100755
--- a/bin/parser/conll_train.py
+++ b/bin/parser/conll_train.py
@@ -5,7 +5,7 @@ from __future__ import unicode_literals
 import os
 from os import path
 import shutil
-import codecs
+import io
 import random
 import time
 import gzip
@@ -56,12 +56,20 @@ def _parse_line(line):
     if len(pieces) == 4:
         word, pos, head_idx, label = pieces
         head_idx = int(head_idx)
+    elif len(pieces) == 15:
+        id_ = int(pieces[0].split('_')[-1])
+        word = pieces[1]
+        pos = pieces[4]
+        head_idx = int(pieces[8])-1
+        label = pieces[10]
     else:
-        id_ = int(pieces[0])
+        id_ = int(pieces[0].split('_')[-1])
         word = pieces[1]
         pos = pieces[4]
         head_idx = int(pieces[6])-1
         label = pieces[7]
+    if head_idx == 0:
+        label = 'ROOT'
     return word, pos, head_idx, label
 
 
@@ -69,8 +77,8 @@ def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
     tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
     nlp.tagger(tokens)
     nlp.parser(tokens)
-    gold = GoldParse(tokens, annot_tuples)
-    scorer.score(tokens, gold, verbose=verbose)
+    gold = GoldParse(tokens, annot_tuples, make_projective=False)
+    scorer.score(tokens, gold, verbose=verbose, punct_labels=('--', 'p', 'punct'))
 
 
 def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0,
@@ -122,11 +130,11 @@
 
 
 def main(train_loc, dev_loc, model_dir):
-    with codecs.open(train_loc, 'r', 'utf8') as file_:
+    with io.open(train_loc, 'r', encoding='utf8') as file_:
         train_sents = read_conll(file_)
-    train(English, train_sents, model_dir)
+    #train(English, train_sents, model_dir)
     nlp = English(data_dir=model_dir)
-    dev_sents = read_conll(open(dev_loc))
+    dev_sents = read_conll(io.open(dev_loc, 'r', encoding='utf8'))
     scorer = Scorer()
     for _, sents in dev_sents:
         for annot_tuples, _ in sents:
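
Note on [PATCH 1/9]: the new "elif len(pieces) == 15" branch handles CoNLL-09
formatted data, which puts the gold head in column 8 and the dependency label
in column 10; the indices 6 and 7 in the else-branch correspond to the older
CoNLL-X column layout. A rough sketch of what the new branch extracts, using a
made-up example line rather than real treebank data:

    # CoNLL-09 columns: ID FORM LEMMA PLEMMA POS PPOS FEAT PFEAT HEAD PHEAD
    #                    DEPREL PDEPREL FILLPRED PRED APREDs
    line = u'1\tDer\tder\tder\tART\tART\t_\t_\t2\t2\tNK\tNK\t_\t_\t_'
    pieces = line.split()
    word, pos = pieces[1], pieces[4]
    head_idx = int(pieces[8]) - 1   # convert the 1-based head to a 0-based index
    label = pieces[10]
    assert (word, pos, head_idx, label) == (u'Der', u'ART', 1, u'NK')
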
From bf5a7cc598964f90d4f2e50ef915b87e22e27f87 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 2 Feb 2016 22:30:00 +0100
Subject: [PATCH 2/9] * Update train_pos_tagger example

---
 examples/train_pos_tagger.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/examples/train_pos_tagger.py b/examples/train_pos_tagger.py
index b1750ea64..43bd607c7 100644
--- a/examples/train_pos_tagger.py
+++ b/examples/train_pos_tagger.py
@@ -62,8 +62,10 @@
             tokens = tokenizer.tokens_from_list(words)
             tagger.train(tokens, tags)
         random.shuffle(DATA)
-    tagger.model.end_training(path.join(output_dir, 'pos', 'model'))
-    vocab.strings.dump(path.join(output_dir, 'vocab', 'strings.txt'))
+    tagger.model.end_training()
+    tagger.model.dump(path.join(output_dir, 'pos', 'model'))
+    with io.open(path.join(output_dir, 'vocab', 'strings.json'), 'w', encoding='utf8') as file_:
+        tagger.vocab.strings.dump(file_)
 
 
 if __name__ == '__main__':

From d0f06c5cc424887cd72b945324a8c58499113617 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 2 Feb 2016 22:30:22 +0100
Subject: [PATCH 3/9] * Add missing tags to the German tag map

---
 lang_data/de/tag_map.json | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/lang_data/de/tag_map.json b/lang_data/de/tag_map.json
index ee1bb1b81..29da20a39 100644
--- a/lang_data/de/tag_map.json
+++ b/lang_data/de/tag_map.json
@@ -18,8 +18,10 @@
 "KOUI": {"pos": "SCONJ"},
 "KOUS": {"pos": "SCONJ"},
 "NE": {"pos": "PROPN"},
+"NNE": {"pos": "PROPN"},
 "NN": {"pos": "NOUN"},
 "PAV": {"pos": "ADV", "PronType": "Dem"},
+"PROAV": {"pos": "ADV", "PronType": "Dem"},
 "PDAT": {"pos": "DET", "PronType": "Dem"},
 "PDS": {"pos": "PRON", "PronType": "Dem"},
 "PIAT": {"pos": "DET", "PronType": "Ind,Neg,Tot"},
@@ -52,5 +54,6 @@
 "VVINF": {"pos": "VERB", "VerbForm": "Inf"},
 "VVIZU": {"pos": "VERB", "VerbForm": "Inf"},
 "VVPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part"},
-"XY": {"pos": "X"}
+"XY": {"pos": "X"},
+"SP": {"pos": "SPACE"}
 }

From 7cbff48ace76807a1d84da10f317267ece767f53 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 2 Feb 2016 22:30:51 +0100
Subject: [PATCH 4/9] * Set the German lemma rules to be an empty JSON object

---
 lang_data/de/lemma_rules.json | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lang_data/de/lemma_rules.json b/lang_data/de/lemma_rules.json
index e69de29bb..0967ef424 100644
--- a/lang_data/de/lemma_rules.json
+++ b/lang_data/de/lemma_rules.json
@@ -0,0 +1 @@
+{}
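
Note on [PATCH 3/9] and [PATCH 4/9]: each key in tag_map.json is a fine-grained
treebank tag (here from the German STTS/TIGER tag set), and its value supplies
the coarse universal part-of-speech plus any morphological features. NNE, PROAV
and SP were previously absent, so German gold tags using them could not be
mapped. A minimal sketch of what the updated map contains, assuming the file is
read as plain JSON:

    import json

    with open('lang_data/de/tag_map.json') as file_:
        tag_map = json.load(file_)

    # the entries this patch adds
    assert tag_map['NNE'] == {'pos': 'PROPN'}
    assert tag_map['PROAV'] == {'pos': 'ADV', 'PronType': 'Dem'}
    assert tag_map['SP'] == {'pos': 'SPACE'}
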
From 59123443e2a4e92be1ff4f1cbd5f37cc2b0ddfce Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 2 Feb 2016 22:49:55 +0100
Subject: [PATCH 5/9] * Check for presence/absence of the different models in
 Language.end_training

---
 spacy/language.py | 43 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 32 insertions(+), 11 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 69abf16b3..980af0734 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -272,22 +272,43 @@ class Language(object):
     def end_training(self, data_dir=None):
         if data_dir is None:
             data_dir = self.data_dir
-        self.parser.model.end_training()
-        self.parser.model.dump(path.join(data_dir, 'deps', 'model'))
-        self.entity.model.end_training()
-        self.entity.model.dump(path.join(data_dir, 'ner', 'model'))
-        self.tagger.model.end_training()
-        self.tagger.model.dump(path.join(data_dir, 'pos', 'model'))
+        if self.parser:
+            self.parser.model.end_training()
+            self.parser.model.dump(path.join(data_dir, 'deps', 'model'))
+        if self.entity:
+            self.entity.model.end_training()
+            self.entity.model.dump(path.join(data_dir, 'ner', 'model'))
+        if self.tagger:
+            self.tagger.model.end_training()
+            self.tagger.model.dump(path.join(data_dir, 'pos', 'model'))
         strings_loc = path.join(data_dir, 'vocab', 'strings.json')
         with io.open(strings_loc, 'w', encoding='utf8') as file_:
             self.vocab.strings.dump(file_)
+        self.vocab.dump(path.join(data_dir, 'vocab', 'lexemes.bin'))
+        if self.tagger:
+            tagger_freqs = list(self.tagger.freqs[TAG].items())
+        else:
+            tagger_freqs = []
+        if self.parser:
+            dep_freqs = list(self.parser.moves.freqs[DEP].items())
+            head_freqs = list(self.parser.moves.freqs[HEAD].items())
+        else:
+            dep_freqs = []
+            head_freqs = []
+        if self.entity:
+            entity_iob_freqs = list(self.entity.moves.freqs[ENT_IOB].items())
+            entity_type_freqs = list(self.entity.moves.freqs[ENT_TYPE].items())
+        else:
+            entity_iob_freqs = []
+            entity_type_freqs = []
         with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_:
             file_.write(
                 json.dumps([
-                    (TAG, list(self.tagger.freqs[TAG].items())),
-                    (DEP, list(self.parser.moves.freqs[DEP].items())),
-                    (ENT_IOB, list(self.entity.moves.freqs[ENT_IOB].items())),
-                    (ENT_TYPE, list(self.entity.moves.freqs[ENT_TYPE].items())),
-                    (HEAD, list(self.parser.moves.freqs[HEAD].items()))]))
+                    (TAG, tagger_freqs),
+                    (DEP, dep_freqs),
+                    (ENT_IOB, entity_iob_freqs),
+                    (ENT_TYPE, entity_type_freqs),
+                    (HEAD, head_freqs)
+                ]))

From e2ed6251d761c115b5849d404f2e33a94044eda5 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 2 Feb 2016 22:58:06 +0100
Subject: [PATCH 6/9] * Fancy up the CLI for the conll train script

---
 bin/parser/conll_train.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/bin/parser/conll_train.py b/bin/parser/conll_train.py
index da9bb807a..8075dcd8a 100755
--- a/bin/parser/conll_train.py
+++ b/bin/parser/conll_train.py
@@ -129,10 +129,18 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0
     print('done')
 
 
-def main(train_loc, dev_loc, model_dir):
+@plac.annotations(
+    train_loc=("Location of CoNLL 09 formatted training file"),
+    dev_loc=("Location of CoNLL 09 formatted development file"),
+    model_dir=("Location of output model directory"),
+    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
+    n_iter=("Number of training iterations", "option", "i", int),
+)
+def main(train_loc, dev_loc, model_dir, n_iter=15, eval_only=False):
     with io.open(train_loc, 'r', encoding='utf8') as file_:
         train_sents = read_conll(file_)
-    #train(English, train_sents, model_dir)
+    if not eval_only:
+        train(English, train_sents, model_dir, n_iter=n_iter)
     nlp = English(data_dir=model_dir)
     dev_sents = read_conll(io.open(dev_loc, 'r', encoding='utf8'))
     scorer = Scorer()

From 99b8906100f5139469203a718f4f8b84805a9af6 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 2 Feb 2016 22:59:06 +0100
Subject: [PATCH 7/9] * Accept punct_labels as an argument to the scorer

---
 spacy/scorer.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/spacy/scorer.py b/spacy/scorer.py
index 8ec3617c6..043cf5b2c 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -70,7 +70,7 @@
     def ents_f(self):
         return self.ner.fscore * 100
 
-    def score(self, tokens, gold, verbose=False):
+    def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')):
         assert len(tokens) == len(gold)
 
         gold_deps = set()
@@ -78,7 +78,7 @@
         gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot]))
         for id_, word, tag, head, dep, ner in gold.orig_annot:
             gold_tags.add((id_, tag))
-            if dep.lower() not in ('p', 'punct'):
+            if dep.lower() not in punct_labels:
                 gold_deps.add((id_, head, dep.lower()))
         cand_deps = set()
         cand_tags = set()
@@ -87,12 +87,12 @@ if gold_i is None:
                 continue
             gold_i = gold.cand_to_gold[token.i]
             if gold_i is None:
-                if token.dep_.lower() not in ('p', 'punct'):
+                if token.dep_.lower() not in punct_labels:
                     self.tokens.fp += 1
             else:
                 self.tokens.tp += 1
                 cand_tags.add((gold_i, token.tag_))
-            if token.dep_.lower() not in ('p', 'punct') and token.orth_.strip():
+            if token.dep_.lower() not in punct_labels and token.orth_.strip():
                 gold_head = gold.cand_to_gold[token.head.i]
                 # None is indistinct, so we can't just add it to the set
                 # Multiple (None, None) deps are possible
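
Note on [PATCH 7/9]: punct_labels makes the dependency labels that count as
punctuation, and are therefore excluded from the attachment scores, configurable
per treebank; the updated score_model call in [PATCH 1/9] relies on this when it
passes ('--', 'p', 'punct'). A hedged usage sketch, assuming tokens and gold are
prepared as in bin/parser/conll_train.py:

    from spacy.scorer import Scorer

    scorer = Scorer()
    # arcs whose label is '--', 'p' or 'punct' are left out of UAS/LAS
    scorer.score(tokens, gold, punct_labels=('--', 'p', 'punct'))
    print(scorer.uas, scorer.las)
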
From f204daf27b9e96aacb64abe964a99298a798afc4 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 2 Feb 2016 22:59:59 +0100
Subject: [PATCH 8/9] * Add error warning that a gold tag is unrecognised

---
 spacy/tagger.pyx | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx
index 493cc4f99..97ec0eff6 100644
--- a/spacy/tagger.pyx
+++ b/spacy/tagger.pyx
@@ -216,6 +216,11 @@ cdef class Tagger:
 
     def train(self, Doc tokens, object gold_tag_strs):
         assert len(tokens) == len(gold_tag_strs)
+        for tag in gold_tag_strs:
+            if tag not in self.tag_names:
+                msg = ("Unrecognized gold tag: %s. tag_map.json must contain all "
+                       "gold tags, to maintain coarse-grained mapping.")
+                raise ValueError(msg % tag)
         golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs]
         cdef int correct = 0
         cdef Pool mem = Pool()

From 1a2ee73e9887fe1304389d987402b020a9432a13 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 2 Feb 2016 23:00:53 +0100
Subject: [PATCH 9/9] * Add missing pos and tag attributes to API

---
 website/src/jade/docs/_api.jade | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/website/src/jade/docs/_api.jade b/website/src/jade/docs/_api.jade
index 4db74cc9b..bceee0000 100644
--- a/website/src/jade/docs/_api.jade
+++ b/website/src/jade/docs/_api.jade
@@ -145,6 +145,9 @@ mixin LexemeDistributional
   +Define("vector")
     | A “word embedding” representation: a dense real-valued vector that supports similarity queries between words. By default, spaCy currently loads vectors produced by the Levy and Goldberg (2014) dependency-based word2vec model.
 
+  +Define("has_vector")
+    | A boolean value indicating whether a word vector is associated with the word.
+
 
 mixin Func(type1, type2)
   #{"λ " + type1 + ", " + type2}
@@ -373,6 +376,17 @@ mixin Func(type1, type2)
       +Define("whitespace_")
         | The number of immediate syntactic children following the word in the string.
 
+  details(open=true)
+    summary: h4 Part-of-Speech Tags
+
+    ul
+      +Define("pos / pos_")
+        | A coarse-grained, less detailed tag that represents the word-class of the token. The set of #[code .pos] tags is consistent across languages. The available tags are ADJ, ADP, ADV, AUX, CONJ, DET, INTJ, NOUN, NUM, PART, PRON, PROPN, PUNCT, SCONJ, SYM, VERB, X, EOL, SPACE.
+
+    ul
+      +Define("tag / tag_")
+        | A fine-grained, more detailed tag that represents the word-class and some basic morphological information for the token. These tags are primarily designed to be good features for subsequent models, particularly the syntactic parser. They are language and treebank dependent. The tagger is trained to predict these fine-grained tags, and then a mapping table is used to reduce them to the coarse-grained #[code .pos] tags.
+
 
   details(open=true)
     summary: h4 Navigating the Parse Tree
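
Finally, a short usage sketch of the two attributes the new docs section in
[PATCH 9/9] describes; the printed values are illustrative and depend on the
loaded model data:

    from spacy.en import English

    nlp = English()
    doc = nlp(u'They told us to duck.')
    for token in doc:
        # e.g. 'duck' -> coarse-grained pos_ VERB, fine-grained tag_ VB,
        # with tag_ reduced to pos_ via the tag map described above
        print(token.orth_, token.pos_, token.tag_)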