spaCy/examples/training/conllu.py

'''Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
.conllu format for development data, allowing the official scorer to be used.
'''
from __future__ import unicode_literals
import plac
import tqdm
import re
import spacy
import spacy.util
from spacy.gold import GoldParse, minibatch
from spacy.syntax.nonproj import projectivize
from collections import Counter
from timeit import default_timer as timer

from spacy._align import align

def prevent_bad_sentences(doc):
    '''This is an example pipeline component for fixing sentence segmentation
    mistakes. The component sets is_sent_start to False, which means the
    parser will be prevented from making a sentence boundary there. The
    rules here aren't necessarily a good idea.'''
    for token in doc[1:]:
        if token.nbor(-1).text == ',':
            token.is_sent_start = False
        elif not token.nbor(-1).whitespace_:
            token.is_sent_start = False
        elif not token.nbor(-1).is_punct:
            token.is_sent_start = False
    return doc


def load_model(lang):
    '''This shows how to adjust the tokenization rules, to special-case
    for ways the CoNLLU tokenization differs. We need to get the tokenizer
    accuracy high on the various treebanks in order to do well. If we don't
    align on a content word, all dependencies to and from that word will
    be marked as incorrect.
    '''
    English = spacy.util.get_lang_class(lang)
    English.Defaults.infixes += ('(?<=[^-\d])[+\-\*^](?=[^-\d])',)
    English.Defaults.infixes += ('(?<=[^-])[+\-\*^](?=[^-\d])',)
    English.Defaults.infixes += ('(?<=[^-\d])[+\-\*^](?=[^-])',)
    English.Defaults.token_match = re.compile(r'=+').match
    nlp = English()
    nlp.tokenizer.add_special_case('***', [{'ORTH': '***'}])
    nlp.tokenizer.add_special_case("):", [{'ORTH': ")"}, {"ORTH": ":"}])
    nlp.tokenizer.add_special_case("and/or", [{'ORTH': "and"}, {"ORTH": "/"}, {"ORTH": "or"}])
    nlp.tokenizer.add_special_case("non-Microsoft", [{'ORTH': "non-Microsoft"}])
    nlp.tokenizer.add_special_case("mis-matches", [{'ORTH': "mis-matches"}])
    nlp.tokenizer.add_special_case("X.", [{'ORTH': "X"}, {"ORTH": "."}])
    nlp.tokenizer.add_special_case("b/c", [{'ORTH': "b/c"}])
    return nlp
    

def get_token_acc(docs, golds):
    '''Quick function to evaluate tokenization accuracy.'''
    miss = 0
    hit = 0
    for doc, gold in zip(docs, golds):
        for i in range(len(doc)):
            token = doc[i]
            align = gold.words[i]
            if align == None:
                miss += 1
            else:
                hit += 1
    return miss, hit


def golds_to_gold_tuples(docs, golds):
    '''Get out the annoying 'tuples' format used by begin_training, given the
    GoldParse objects.'''
    tuples = []
    for doc, gold in zip(docs, golds):
        text = doc.text
        ids, words, tags, heads, labels, iob = zip(*gold.orig_annot)
        sents = [((ids, words, tags, heads, labels, iob), [])]
        tuples.append((text, sents))
    return tuples


def split_text(text):
    paragraphs = text.split('\n\n')
    paragraphs = [par.strip().replace('\n', ' ') for par in paragraphs]
    return paragraphs


def read_conllu(file_):
    docs = []
    doc = []
    sent = []
    for line in file_:
        if line.startswith('# newdoc'):
            if doc:
                docs.append(doc)
            doc = []
        elif line.startswith('#'):
            continue
        elif not line.strip():
            if sent:
                doc.append(sent)
            sent = []
        else:
            sent.append(line.strip().split())
    if sent:
        doc.append(sent)
    if doc:
        docs.append(doc)
    return docs


def get_docs(nlp, text):
    paragraphs = split_text(text)
    docs = [nlp.make_doc(par) for par in paragraphs]
    return docs


def get_golds(docs, conllu):
    # sd is spacy doc; cd is conllu doc
    # cs is conllu sent, ct is conllu token
    golds = []
    for sd, cd in zip(docs, conllu):
        words = []
        tags = []
        heads = []
        deps = []
        for cs in cd:
            for id_, word, lemma, pos, tag, morph, head, dep, _1, _2 in cs:
                if '.' in id_:
                    continue
                i = len(words)
                id_ = int(id_)-1
                head = int(head)-1 if head != '0' else id_
                head_dist = head - id_
                words.append(word)
                tags.append(tag)
                heads.append(i+head_dist)
                deps.append('ROOT' if dep == 'root' else dep)
            heads, deps = projectivize(heads, deps)
        entities = ['-'] * len(words)
        gold = GoldParse(sd, words=words, tags=tags, heads=heads, deps=deps,
                         entities=entities)
        golds.append(gold)
    return golds

def parse_dev_data(nlp, text_loc, conllu_loc):
    with open(text_loc) as file_:
        docs = get_docs(nlp, file_.read())
    with open(conllu_loc) as file_:
        conllu_dev = read_conllu(file_)
    golds = list(get_golds(docs, conllu_dev))
    scorer = nlp.evaluate(zip(docs, golds))
    return docs, scorer


def print_progress(itn, losses, scorer):
    scores = {}
    for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
                'ents_p', 'ents_r', 'ents_f', 'cpu_wps', 'gpu_wps']:
        scores[col] = 0.0
    scores['dep_loss'] = losses.get('parser', 0.0)
    scores['ner_loss'] = losses.get('ner', 0.0)
    scores['tag_loss'] = losses.get('tagger', 0.0)
    scores.update(scorer.scores)
    tpl = '\t'.join((
        '{:d}',
        '{dep_loss:.3f}',
        '{ner_loss:.3f}',
        '{uas:.3f}',
        '{ents_p:.3f}',
        '{ents_r:.3f}',
        '{ents_f:.3f}',
        '{tags_acc:.3f}',
        '{token_acc:.3f}',
    ))
    print(tpl.format(itn, **scores))

def print_conllu(docs, file_):
    for i, doc in enumerate(docs):
        file_.write("# newdoc id = {i}\n".format(i=i))
        for j, sent in enumerate(doc.sents):
            file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
            file_.write("# text = {text}\n".format(text=sent.text))
            for k, t in enumerate(sent):
                if t.head.i == t.i:
                    head = 0
                else:
                    head = k + (t.head.i - t.i) + 1
                fields = [str(k+1), t.text, t.lemma_, t.pos_, t.tag_, '_', str(head), t.dep_, '_', '_']
                file_.write('\t'.join(fields) + '\n')
            file_.write('\n')


def main(spacy_model, conllu_train_loc, text_train_loc, conllu_dev_loc, text_dev_loc,
         output_loc):
    with open(conllu_train_loc) as file_:
        conllu_train = read_conllu(file_)
    nlp = load_model(spacy_model)
    print("Get docs")
    with open(text_train_loc) as file_:
        docs = get_docs(nlp, file_.read())
    golds = list(get_golds(docs, conllu_train))
    print("Create parser")
    nlp.add_pipe(nlp.create_pipe('parser'))
    nlp.add_pipe(nlp.create_pipe('tagger'))
    for gold in golds:
        for tag in gold.tags:
            if tag is not None:
                nlp.tagger.add_label(tag)
    optimizer = nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds))
    # Replace labels that didn't make the frequency cutoff
    actions = set(nlp.parser.labels)
    label_set = set([act.split('-')[1] for act in actions if '-' in act])
    for gold in golds:
        for i, label in enumerate(gold.labels):
            if label is not None and label not in label_set:
                gold.labels[i] = label.split('||')[0]
    n_train_words = sum(len(doc) for doc in docs)
    print(n_train_words)
    print("Begin training")
    for i in range(10):
        with open(text_train_loc) as file_:
            docs = get_docs(nlp, file_.read())
        docs = docs[:len(golds)]
        with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
            losses = {}
            for batch in minibatch(list(zip(docs, golds)), size=1):
                if not batch:
                    continue
                batch_docs, batch_gold = zip(*batch)

                nlp.update(batch_docs, batch_gold, sgd=optimizer,
                           drop=0.2, losses=losses)
                pbar.update(sum(len(doc) for doc in batch_docs))
        
        with nlp.use_params(optimizer.averages):
            dev_docs, scorer = parse_dev_data(nlp, text_dev_loc, conllu_dev_loc)
            print_progress(i, losses, scorer)
            with open(output_loc, 'w') as file_:
                print_conllu(dev_docs, file_)

if __name__ == '__main__':
    plac.call(main)
Add script to do conllu training 2018-02-21 12:53:59 +00:00			`'''Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes`
			`.conllu format for development data, allowing the official scorer to be used.`
			`'''`
			`from __future__ import unicode_literals`
			`import plac`
			`import tqdm`
			`import re`
			`import spacy`
			`import spacy.util`
			`from spacy.gold import GoldParse, minibatch`
			`from spacy.syntax.nonproj import projectivize`
			`from collections import Counter`
			`from timeit import default_timer as timer`

			`from spacy._align import align`

			`def prevent_bad_sentences(doc):`
			`'''This is an example pipeline component for fixing sentence segmentation`
			`mistakes. The component sets is_sent_start to False, which means the`
			`parser will be prevented from making a sentence boundary there. The`
			`rules here aren't necessarily a good idea.'''`
			`for token in doc[1:]:`
			`if token.nbor(-1).text == ',':`
			`token.is_sent_start = False`
			`elif not token.nbor(-1).whitespace_:`
			`token.is_sent_start = False`
			`elif not token.nbor(-1).is_punct:`
			`token.is_sent_start = False`
			`return doc`


			`def load_model(lang):`
			`'''This shows how to adjust the tokenization rules, to special-case`
			`for ways the CoNLLU tokenization differs. We need to get the tokenizer`
			`accuracy high on the various treebanks in order to do well. If we don't`
			`align on a content word, all dependencies to and from that word will`
			`be marked as incorrect.`
			`'''`
			`English = spacy.util.get_lang_class(lang)`
			`English.Defaults.infixes += ('(?<=[^-\d])[+\-\*^](?=[^-\d])',)`
			`English.Defaults.infixes += ('(?<=[^-])[+\-\*^](?=[^-\d])',)`
			`English.Defaults.infixes += ('(?<=[^-\d])[+\-\*^](?=[^-])',)`
			`English.Defaults.token_match = re.compile(r'=+').match`
			`nlp = English()`
			`nlp.tokenizer.add_special_case('*', [{'ORTH': '*'}])`
			`nlp.tokenizer.add_special_case("):", [{'ORTH': ")"}, {"ORTH": ":"}])`
			`nlp.tokenizer.add_special_case("and/or", [{'ORTH': "and"}, {"ORTH": "/"}, {"ORTH": "or"}])`
			`nlp.tokenizer.add_special_case("non-Microsoft", [{'ORTH': "non-Microsoft"}])`
			`nlp.tokenizer.add_special_case("mis-matches", [{'ORTH': "mis-matches"}])`
			`nlp.tokenizer.add_special_case("X.", [{'ORTH': "X"}, {"ORTH": "."}])`
			`nlp.tokenizer.add_special_case("b/c", [{'ORTH': "b/c"}])`
			`return nlp`


			`def get_token_acc(docs, golds):`
			`'''Quick function to evaluate tokenization accuracy.'''`
			`miss = 0`
			`hit = 0`
			`for doc, gold in zip(docs, golds):`
			`for i in range(len(doc)):`
			`token = doc[i]`
			`align = gold.words[i]`
			`if align == None:`
			`miss += 1`
			`else:`
			`hit += 1`
			`return miss, hit`


			`def golds_to_gold_tuples(docs, golds):`
			`'''Get out the annoying 'tuples' format used by begin_training, given the`
			`GoldParse objects.'''`
			`tuples = []`
			`for doc, gold in zip(docs, golds):`
			`text = doc.text`
			`ids, words, tags, heads, labels, iob = zip(*gold.orig_annot)`
			`sents = [((ids, words, tags, heads, labels, iob), [])]`
			`tuples.append((text, sents))`
			`return tuples`


			`def split_text(text):`
			`paragraphs = text.split('\n\n')`
			`paragraphs = [par.strip().replace('\n', ' ') for par in paragraphs]`
			`return paragraphs`


			`def read_conllu(file_):`
			`docs = []`
			`doc = []`
			`sent = []`
			`for line in file_:`
			`if line.startswith('# newdoc'):`
			`if doc:`
			`docs.append(doc)`
			`doc = []`
			`elif line.startswith('#'):`
			`continue`
			`elif not line.strip():`
			`if sent:`
			`doc.append(sent)`
			`sent = []`
			`else:`
			`sent.append(line.strip().split())`
			`if sent:`
			`doc.append(sent)`
			`if doc:`
			`docs.append(doc)`
			`return docs`


			`def get_docs(nlp, text):`
			`paragraphs = split_text(text)`
			`docs = [nlp.make_doc(par) for par in paragraphs]`
			`return docs`


			`def get_golds(docs, conllu):`
			`# sd is spacy doc; cd is conllu doc`
			`# cs is conllu sent, ct is conllu token`
			`golds = []`
			`for sd, cd in zip(docs, conllu):`
			`words = []`
			`tags = []`
			`heads = []`
			`deps = []`
			`for cs in cd:`
			`for id_, word, lemma, pos, tag, morph, head, dep, _1, _2 in cs:`
			`if '.' in id_:`
			`continue`
			`i = len(words)`
			`id_ = int(id_)-1`
			`head = int(head)-1 if head != '0' else id_`
			`head_dist = head - id_`
			`words.append(word)`
			`tags.append(tag)`
			`heads.append(i+head_dist)`
			`deps.append('ROOT' if dep == 'root' else dep)`
			`heads, deps = projectivize(heads, deps)`
			`entities = ['-'] * len(words)`
			`gold = GoldParse(sd, words=words, tags=tags, heads=heads, deps=deps,`
			`entities=entities)`
			`golds.append(gold)`
			`return golds`

			`def parse_dev_data(nlp, text_loc, conllu_loc):`
			`with open(text_loc) as file_:`
			`docs = get_docs(nlp, file_.read())`
			`with open(conllu_loc) as file_:`
			`conllu_dev = read_conllu(file_)`
			`golds = list(get_golds(docs, conllu_dev))`
			`scorer = nlp.evaluate(zip(docs, golds))`
			`return docs, scorer`


Fix conllu script 2018-02-21 13:46:54 +00:00			`def print_progress(itn, losses, scorer):`
Add script to do conllu training 2018-02-21 12:53:59 +00:00			`scores = {}`
			`for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',`
			`'ents_p', 'ents_r', 'ents_f', 'cpu_wps', 'gpu_wps']:`
			`scores[col] = 0.0`
			`scores['dep_loss'] = losses.get('parser', 0.0)`
			`scores['ner_loss'] = losses.get('ner', 0.0)`
			`scores['tag_loss'] = losses.get('tagger', 0.0)`
			`scores.update(scorer.scores)`
			`tpl = '\t'.join((`
			`'{:d}',`
			`'{dep_loss:.3f}',`
			`'{ner_loss:.3f}',`
			`'{uas:.3f}',`
			`'{ents_p:.3f}',`
			`'{ents_r:.3f}',`
			`'{ents_f:.3f}',`
			`'{tags_acc:.3f}',`
			`'{token_acc:.3f}',`
			`))`
			`print(tpl.format(itn, **scores))`

			`def print_conllu(docs, file_):`
			`for i, doc in enumerate(docs):`
			`file_.write("# newdoc id = {i}\n".format(i=i))`
			`for j, sent in enumerate(doc.sents):`
			`file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))`
			`file_.write("# text = {text}\n".format(text=sent.text))`
			`for k, t in enumerate(sent):`
			`if t.head.i == t.i:`
			`head = 0`
			`else:`
			`head = k + (t.head.i - t.i) + 1`
			`fields = [str(k+1), t.text, t.lemma_, t.pos_, t.tag_, '_', str(head), t.dep_, '_', '_']`
			`file_.write('\t'.join(fields) + '\n')`
			`file_.write('\n')`


			`def main(spacy_model, conllu_train_loc, text_train_loc, conllu_dev_loc, text_dev_loc,`
			`output_loc):`
			`with open(conllu_train_loc) as file_:`
			`conllu_train = read_conllu(file_)`
			`nlp = load_model(spacy_model)`
			`print("Get docs")`
			`with open(text_train_loc) as file_:`
			`docs = get_docs(nlp, file_.read())`
			`golds = list(get_golds(docs, conllu_train))`
			`print("Create parser")`
			`nlp.add_pipe(nlp.create_pipe('parser'))`
			`nlp.add_pipe(nlp.create_pipe('tagger'))`
			`for gold in golds:`
			`for tag in gold.tags:`
			`if tag is not None:`
			`nlp.tagger.add_label(tag)`
			`optimizer = nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds))`
Replace labels that didn't make freq cutoff 2018-02-21 14:59:22 +00:00			`# Replace labels that didn't make the frequency cutoff`
			`actions = set(nlp.parser.labels)`
			`label_set = set([act.split('-')[1] for act in actions if '-' in act])`
			`for gold in golds:`
			`for i, label in enumerate(gold.labels):`
			`if label is not None and label not in label_set:`
			`gold.labels[i] = label.split('\|\|')[0]`
Add script to do conllu training 2018-02-21 12:53:59 +00:00			`n_train_words = sum(len(doc) for doc in docs)`
			`print(n_train_words)`
			`print("Begin training")`
			`for i in range(10):`
			`with open(text_train_loc) as file_:`
			`docs = get_docs(nlp, file_.read())`
			`docs = docs[:len(golds)]`
			`with tqdm.tqdm(total=n_train_words, leave=False) as pbar:`
			`losses = {}`
			`for batch in minibatch(list(zip(docs, golds)), size=1):`
			`if not batch:`
			`continue`
			`batch_docs, batch_gold = zip(*batch)`

			`nlp.update(batch_docs, batch_gold, sgd=optimizer,`
			`drop=0.2, losses=losses)`
			`pbar.update(sum(len(doc) for doc in batch_docs))`

			`with nlp.use_params(optimizer.averages):`
			`dev_docs, scorer = parse_dev_data(nlp, text_dev_loc, conllu_dev_loc)`
Fix conllu script 2018-02-21 13:46:54 +00:00			`print_progress(i, losses, scorer)`
Add script to do conllu training 2018-02-21 12:53:59 +00:00			`with open(output_loc, 'w') as file_:`
			`print_conllu(dev_docs, file_)`

			`if __name__ == '__main__':`
			`plac.call(main)`