diff --git a/examples/training/conllu.py b/examples/training/conllu.py
index e1fbecfe6..fd2a91222 100644
--- a/examples/training/conllu.py
+++ b/examples/training/conllu.py
@@ -13,6 +13,7 @@ from spacy.gold import GoldParse, minibatch
 from spacy.syntax.nonproj import projectivize
 from collections import Counter
 from timeit import default_timer as timer
+from spacy.matcher import Matcher
 
 import random
 import numpy.random
@@ -22,42 +23,6 @@ from spacy._align import align
 random.seed(0)
 numpy.random.seed(0)
 
-def prevent_bad_sentences(doc):
-    '''This is an example pipeline component for fixing sentence segmentation
-    mistakes. The component sets is_sent_start to False, which means the
-    parser will be prevented from making a sentence boundary there. The
-    rules here aren't necessarily a good idea.'''
-    for token in doc[1:]:
-        if token.nbor(-1).text == ',':
-            token.is_sent_start = False
-        elif not token.nbor(-1).whitespace_:
-            token.is_sent_start = False
-        elif not token.nbor(-1).is_punct:
-            token.is_sent_start = False
-        elif token.nbor(-1).is_left_punct:
-            token.is_sent_start = False
-    return doc
-
-
-def load_model(lang):
-    '''This shows how to adjust the tokenization rules, to special-case
-    for ways the CoNLLU tokenization differs. We need to get the tokenizer
-    accuracy high on the various treebanks in order to do well. If we don't
-    align on a content word, all dependencies to and from that word will
-    be marked as incorrect.
-    '''
-    English = spacy.util.get_lang_class(lang)
-    English.Defaults.token_match = re.compile(r'=+|!+|\?+|\*+|_+').match
-    nlp = English()
-    nlp.tokenizer.add_special_case('***', [{'ORTH': '***'}])
-    nlp.tokenizer.add_special_case("):", [{'ORTH': ")"}, {"ORTH": ":"}])
-    nlp.tokenizer.add_special_case("and/or", [{'ORTH': "and"}, {"ORTH": "/"}, {"ORTH": "or"}])
-    nlp.tokenizer.add_special_case("non-Microsoft", [{'ORTH': "non-Microsoft"}])
-    nlp.tokenizer.add_special_case("mis-matches", [{'ORTH': "mis-matches"}])
-    nlp.tokenizer.add_special_case("X.", [{'ORTH': "X"}, {"ORTH": "."}])
-    nlp.tokenizer.add_special_case("b/c", [{'ORTH': "b/c"}])
-    return nlp
-
 
 def get_token_acc(docs, golds):
     '''Quick function to evaluate tokenization accuracy.'''
@@ -229,8 +194,16 @@ def print_progress(itn, losses, scorer):
         ))
     print(tpl.format(itn, **scores))
 
+
 def print_conllu(docs, file_):
+    merger = Matcher(docs[0].vocab)
+    merger.add('SUBTOK', None, [{'DEP': 'subtok', 'op': '+'}])
     for i, doc in enumerate(docs):
+        matches = merger(doc)
+        spans = [(doc[start].idx, doc[end+1].idx+len(doc[end+1]))
+                 for (_, start, end) in matches if end < (len(doc)-1)]
+        for start_char, end_char in spans:
+            doc.merge(start_char, end_char)
         file_.write("# newdoc id = {i}\n".format(i=i))
         for j, sent in enumerate(doc.sents):
             file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
@@ -246,13 +219,15 @@ def print_conllu(docs, file_):
         file_.write('\n')
 
 
-def main(spacy_model, conllu_train_loc, text_train_loc, conllu_dev_loc, text_dev_loc,
+def main(lang, conllu_train_loc, text_train_loc, conllu_dev_loc, text_dev_loc,
          output_loc):
-    nlp = load_model(spacy_model)
-    vec_nlp = spacy.util.load_model('spacy/data/en_core_web_lg/en_core_web_lg-2.0.0')
-    nlp.vocab.vectors = vec_nlp.vocab.vectors
-    for lex in vec_nlp.vocab:
-        _ = nlp.vocab[lex.orth_]
+    nlp = spacy.blank(lang)
+    if lang == 'en':
+        vec_nlp = spacy.util.load_model('spacy/data/en_core_web_lg/en_core_web_lg-2.0.0')
+        nlp.vocab.vectors = vec_nlp.vocab.vectors
+        for lex in vec_nlp.vocab:
+            _ = nlp.vocab[lex.orth_]
+        vec_nlp = None
     with open(conllu_train_loc) as conllu_file:
         with open(text_train_loc) as text_file:
             docs, golds = read_data(nlp, conllu_file, text_file,
@@ -262,6 +237,7 @@ def main(spacy_model, conllu_train_loc, text_train_loc, conllu_dev_loc, text_dev
     nlp.add_pipe(nlp.create_pipe('parser'))
     nlp.parser.add_multitask_objective('tag')
     nlp.parser.add_multitask_objective('sent_start')
+    nlp.parser.moves.add_action(2, 'subtok')
     nlp.add_pipe(nlp.create_pipe('tagger'))
     for gold in golds:
         for tag in gold.tags:
@@ -281,7 +257,7 @@ def main(spacy_model, conllu_train_loc, text_train_loc, conllu_dev_loc, text_dev
     # Batch size starts at 1 and grows, so that we make updates quickly
     # at the beginning of training.
     batch_sizes = spacy.util.compounding(spacy.util.env_opt('batch_from', 1),
-                                         spacy.util.env_opt('batch_to', 8),
+                                         spacy.util.env_opt('batch_to', 2),
                                          spacy.util.env_opt('batch_compound', 1.001))
     for i in range(30):
         docs = refresh_docs(docs)
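
Background on the subtok changes in this patch: nlp.parser.moves.add_action(2, 'subtok') registers 'subtok' as a new label for transition 2 (LEFT-ARC in spaCy's arc-eager scheme), so the parser can learn to flag tokens the tokenizer over-split, and print_conllu then uses a Matcher pattern to find each chain of 'subtok' tokens and merge it, together with the head it attaches to, back into one token before writing CoNLL-U. The sketch below shows that merge step in isolation. It is a minimal illustration rather than the patch's exact code: it uses the current spaCy v3 API (list-of-patterns Matcher.add, Doc.retokenize, and spacy.util.filter_spans in place of the char-offset Doc.merge above, since overlapping '+' matches are easier to handle that way), and the three-token Doc with hand-set dependency labels is invented for the example.

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Doc
from spacy.util import filter_spans

nlp = spacy.blank('en')

# Hand-built stand-in for parser output: pretend "mis-matches" was
# over-split into three tokens, and the parser labelled the first two
# 'subtok', i.e. "this token and the next are really one word".
doc = Doc(nlp.vocab, words=['mis', '-', 'matches'], spaces=[False, False, True],
          heads=[1, 2, 2], deps=['subtok', 'subtok', 'ROOT'])

merger = Matcher(nlp.vocab)
# 'OP': '+' matches one or more consecutive tokens whose DEP is 'subtok'.
merger.add('SUBTOK', [[{'DEP': 'subtok', 'OP': '+'}]])

matches = merger(doc)
# Matches are (match_id, start, end) with `end` exclusive, so doc[end] is
# the head the subtok chain attaches to; extend each span by one token to
# include it, then keep only the longest of any overlapping candidates.
spans = [doc[start:end + 1] for _, start, end in matches if end < len(doc)]
with doc.retokenize() as retokenizer:
    for span in filter_spans(spans):
        retokenizer.merge(span)

print([t.text for t in doc])  # ['mis-matches']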